HCE Project Python language Distributed Tasks Manager Application, Distributed Crawler Application and client API bindings.  2.0.0-chaika
Hierarchical Cluster Engine Python language binding
ftest_LinkResolverModule.py
Go to the documentation of this file.
1 #!/usr/bin/python
2 # coding: utf-8
3 
4 import json
5 import base64
6 import ConfigParser
7 import logging
8 from dc.EventObjects import URL
9 from dc.EventObjects import Batch
10 from dc.EventObjects import BatchItem
11 from dc.EventObjects import URLContentResponse
12 from dc_postprocessor.LinkResolver import LinkResolver
13 from dc_postprocessor.PostProcessingApplicationClass import PostProcessingApplicationClass
14 import app.Consts as APP_CONSTS
15 from app.Utils import varDump
16 
def getLogger():
    """Return the application logger configured for DEBUG console output.

    The logger is looked up by ``APP_CONSTS.LOGGER_NAME``. Because
    ``logging.getLogger`` hands back the same object for the same name,
    the console handler is attached only on the first call; further calls
    return the already configured logger instead of stacking duplicate
    handlers (which would print every record several times).

    Returns:
        The ``logging.Logger`` instance with level DEBUG and one console
        handler installed by this function.
    """
    logger = logging.getLogger(APP_CONSTS.LOGGER_NAME)
    logger.setLevel(logging.DEBUG)

    # Handlers installed by this function carry a private marker, so a
    # repeated call can recognise its own handler without being confused
    # by handlers that other code may have attached to the same logger.
    alreadyAttached = any(getattr(handler, '_ftestConsoleHandler', False)
                          for handler in logger.handlers)

    if not alreadyAttached:
        # Console handler emitting everything from DEBUG upwards.
        ch = logging.StreamHandler()
        ch.setLevel(logging.DEBUG)
        ch.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
        ch._ftestConsoleHandler = True
        logger.addHandler(ch)

    return logger
36 
37 
if __name__ == '__main__':
    # Functional test: build a one-item batch by hand, run it through
    # LinkResolver and log the batch before and after processing.

    logger = getLogger()

    # Both paths are relative, so the script has to be started from a
    # directory where '../../ini/' resolves to the ini folder.
    configName = '../../ini/postprocessor_task.ini'
    headerFileName = '../../ini/crawler-task_headers.txt'

    postProcessingApplicationClass = PostProcessingApplicationClass()
    postProcessingApplicationClass.configParser = ConfigParser.ConfigParser()
    # Keep option names case-sensitive (the default optionxform lowercases them).
    postProcessingApplicationClass.configParser.optionxform = str
    readOk = postProcessingApplicationClass.configParser.read(configName)
    logger.debug("Read config: %s", str(readOk))

    # read() returns the list of files it managed to parse and stays silent
    # about missing ones. Stop here with a clear message instead of failing
    # later in set() with an unrelated-looking NoSectionError.
    if not readOk:
        raise IOError("Config file '%s' was not found or could not be read" % configName)

    postProcessingApplicationClass.configParser.set('LinkResolver', 'headers_file', headerFileName)

    siteId = 12345
    # The test value holds two URLs joined by a comma.
    url = 'http://127.0.0.1/test.html,https://retrip.jp/external-link/?article_content_id=482406'
    urlObj = URL(siteId, url)

    # Processed content is stored as base64-encoded JSON.
    processedContent = {'link':url}
    processedContents = [base64.b64encode(json.dumps(processedContent))]
    urlContentResponse = URLContentResponse(url=url, processedContents=processedContents)

    batchItem = BatchItem(siteId=siteId, urlId=urlObj.urlMd5, urlObj=urlObj, urlContentResponse=urlContentResponse)
    # NOTE(review): presumably maps a URL pattern to the HTTP method the
    # resolver should use for it - confirm against LinkResolver.
    batchItem.properties = {"LINK_RESOLVE":{"method":{"retrip.jp/external-link":"GET"}}}
    batch = Batch(1, [batchItem])

    logger.debug("Input batch: %s", varDump(batch))

    linkResolver = LinkResolver(logger, postProcessingApplicationClass.getConfigOption)
    linkResolver.init()
    # Replace every item with the object returned by the resolver.
    for i, item in enumerate(batch.items):
        batch.items[i] = linkResolver.processBatchItem(item)

    logger.debug("Output batch: %s", varDump(batch))

    # Decode the first processed content back from base64/JSON to show the link.
    resolvedContent = json.loads(base64.b64decode(batch.items[0].urlContentResponse.processedContents[0]))
    logger.debug("Resolved url: %s", str(resolvedContent['link']))
75 
def varDump(obj, stringify=True, strTypeMaxLen=256, strTypeCutSuffix='...', stringifyType=1, ignoreErrors=False, objectsHash=None, depth=0, indent=2, ensure_ascii=False, maxDepth=10)
Definition: Utils.py:410