HCE Project: Python-language Distributed Tasks Manager Application, Distributed Crawler Application, and client API bindings (version 2.0.0-chaika)
Hierarchical Cluster Engine Python language binding
ftest_SocialModule.py
Go to the documentation of this file.
1 #!/usr/bin/python
2 # coding: utf-8
3 
4 import os
5 import json
6 import base64
7 import logging
8 import ConfigParser
9 from dc.EventObjects import URL
10 from dc.EventObjects import Batch
11 from dc.EventObjects import BatchItem
12 from dc.EventObjects import URLContentResponse
13 from dc_postprocessor.SocialModule import SocialModule
14 from dc_postprocessor.PostProcessingApplicationClass import PostProcessingApplicationClass
15 import app.Consts as APP_CONSTS
16 from app.Utils import varDump
17 import app.Utils as Utils
18 
def getLogger():
  """Return the application logger with DEBUG-level console output attached.

  The logger is looked up by ``APP_CONSTS.LOGGER_NAME`` and its level is set
  to DEBUG. A ``logging.StreamHandler`` (also at DEBUG) with the format
  ``'%(asctime)s - %(name)s - %(levelname)s - %(message)s'`` is attached.

  ``logging.getLogger`` hands back the same logger object for the same name,
  so the console handler is tagged with a name and attached only once;
  calling this function repeatedly does not duplicate emitted records.

  Returns:
    logging.Logger: the configured logger.
  """
  handlerName = 'ftest_console_handler'

  # create logger
  log = logging.getLogger(APP_CONSTS.LOGGER_NAME)
  log.setLevel(logging.DEBUG)

  # Attach the console handler only if an earlier call has not done so already.
  if not any(handler.get_name() == handlerName for handler in log.handlers):
    # create console handler and set level to debug
    ch = logging.StreamHandler()
    ch.set_name(handlerName)
    ch.setLevel(logging.DEBUG)

    # create formatter and add it to the handler
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    ch.setFormatter(formatter)

    # add the handler to the logger
    log.addHandler(ch)

  return log
38 
39 
41 
42  configFileName = '../ini/postprocessor_task_log-rt.ini'
43 
44  retval = os.getcwd()
45  os.chdir('..')
46  # read config
47  logging.config.fileConfig(configFileName)
48 
49  # create logger
50  log = Utils.MPLogger().getLogger()
51  # log = logging.getLogger(APP_CONSTS.LOGGER_NAME)
52  os.chdir(retval)
53 
54  return log
55 
56 
if __name__ == '__main__':
  # Functional test driver: builds a one-item batch by hand, passes it through
  # SocialModule.processBatch() and dumps the batch before and after to the log.

  logger = getLogger()

  configName = '../ini/postprocessor_task.ini'

  # Create the application object and give it a config parser directly.
  postProcessingApplicationClass = PostProcessingApplicationClass()
  postProcessingApplicationClass.configParser = ConfigParser.ConfigParser()
  # Keep option names case-sensitive (ConfigParser lower-cases them by default).
  postProcessingApplicationClass.configParser.optionxform = str
  readOk = postProcessingApplicationClass.configParser.read(configName)
  logger.debug("Read config: %s", str(readOk))
  if not readOk:
    # read() silently skips files it cannot open and returns only the names it
    # parsed, so an empty result means no configuration was loaded at all.
    logger.error("Config file was not read: %s", configName)

  # postProcessingApplicationClass.configParser.set('SocialModule', 'cmd', '')

  siteId = 12345
  url = 'https://www.theguardian.com/us-news/2016/jan/05/obama-gun-control-executive-action-background-checks-licenses-gun-shows-mental-health-funding'
  urlObj = URL(siteId, url)

  # One processed-content record; it is JSON-encoded and then base64-encoded
  # before being placed into the URLContentResponse.
  processedContent = [{"title":"Tearful Obama tightens gun control and tells inactive Congress: 'We can't wait'",
                       "source_url": url}]

  processedContents = [base64.b64encode(json.dumps(processedContent))]
  urlContentResponse = URLContentResponse(url=url, processedContents=processedContents)

  batchItem = BatchItem(siteId=siteId, urlId=urlObj.urlMd5, urlObj=urlObj, urlContentResponse=urlContentResponse)

  # Alternative SOCIAL_RATE property values kept for manual testing (disabled):
  # batchItem.properties = {"SOCIAL_RATE":json.dumps({"retries":2, "retries_delay":5, "retries_type":1, "interval":10, "lang":"en", "sentiment":1, "debug":1, "timeout":400, "social_list":{"tw":["https://www.twitter.com", "window.IFRAME_QUERY_URL=\"https://twitter.com/search?f=tweets&vertical=default&q=%25QUERY_STRING%25&src=typd\",window.IFRAME_CSCROLL_COUNT=10;window.IFRAME_MAX_TIME=350;window.IFRAME_SFIELD='source_url';", {"name":"tests", "sets":[{"name":"set1", "items":["1", "%MACRO_DATA%", "http://127.0.0.1/social.js", "!5:76:return window.IFRAME_DATA_READY;", "return window.MACRO_COLLECT;"], "repeat":1, "delay":0}], "result_type":0, "result_content_type":"text/json"}]} })}
  # batchItem.properties = {"SOCIAL_RATE": "{\"retries\":2, \"retries_delay\":10, \"retries_type\":1, \"interval\":10,\"lang\":\"en\", \"sentiment\":1, \"debug\":1, \"timeout\":400, \"social_list\":{\"tw\":[\"https:\/\/www.twitter.com\/\",\"window.IFRAME_QUERY_URL=\\\"https:\/\/twitter.com\/search?f=tweets&vertical=default&q=%25QUERY_STRING%25&src=typd\\\",window.IFRAME_CSCROLL_COUNT=10;window.IFRAME_MAX_TIME=360;window.IFRAME_SFIELD='source_url';\",{\"name\":\"tests\", \"sets\":[{\"name\":\"set1\", \"items\":[\"3\", \"%MACRO_DATA%\", \"http:\/\/127.0.0.1\/social.js\", \"!5:76:return window.IFRAME_DATA_READY;\", \"return window.MACRO_COLLECT;\"], \"repeat\":1, \"delay\":0}], \"result_type\":0, \"result_content_type\":\"text\/json\"}]} }"}

  # Active configuration: SOCIAL_RATE is a JSON string whose "social_list" has
  # an "fb" and a "tw" entry.
  batchItem.properties = {"SOCIAL_RATE": "{\"retries\":3, \"retries_delay\":5, \"retries_type\":1, \"interval\":10,\"lang\":\"en\", \"sentiment\":1, \"debug\":1, \"timeout\":400, \"social_list\":{\"fb\":[\"https:\/\/www.facebook.com\",\"window.IFRAME_QUERY_URL=\\\"https:\/\/www.facebook.com\/search\/top\/?q=%25QUERY_STRING%25\\\";window.IFRAME_CSCROLL_COUNT=100;window.IFRAME_MAX_TIME=350;window.IFRAME_SFIELD='title';\",{\"name\":\"tests\", \"sets\":[{\"name\":\"set1\", \"items\":[\"1\", \"%MACRO_DATA%\", \"http:\/\/127.0.0.1\/social.js\", \"!5:76:return window.IFRAME_DATA_READY;\", \"return window.MACRO_COLLECT;\"], \"repeat\":1, \"delay\":0}], \"result_type\":0, \"result_content_type\":\"text\/json\"}],\"tw\":[\"https:\/\/www.twitter.com\",\"window.IFRAME_QUERY_URL=\\\"https:\/\/twitter.com\/search?f=tweets&vertical=default&q=%25QUERY_STRING%25&src=typd\\\",window.IFRAME_CSCROLL_COUNT=100;window.IFRAME_MAX_TIME=350;window.IFRAME_SFIELD='source_url';\",{\"name\":\"tests\", \"sets\":[{\"name\":\"set1\", \"items\":[\"1\", \"%MACRO_DATA%\", \"http:\/\/127.0.0.1\/social.js\", \"!5:76:return window.IFRAME_DATA_READY;\", \"return window.MACRO_COLLECT;\"], \"repeat\":1, \"delay\":0}], \"result_type\":0, \"result_content_type\":\"text\/json\"}]} }"}
  batch = Batch(77777, [batchItem])

  logger.debug("Input batch: %s", varDump(batch))
  # The module is given the application's config-option getter and its logger.
  socialModule = SocialModule(postProcessingApplicationClass.getConfigOption, postProcessingApplicationClass.logger)
  socialModule.init()
  batch = socialModule.processBatch(batch)
  logger.debug("Output batch: %s", varDump(batch))

  # logger.debug("Resolved url: %s", str(json.loads(base64.b64decode(batch.items[0].urlContentResponse.processedContents[0]))['link']))
95 
96 
97 
98 
def varDump(obj, stringify=True, strTypeMaxLen=256, strTypeCutSuffix='...', stringifyType=1, ignoreErrors=False, objectsHash=None, depth=0, indent=2, ensure_ascii=False, maxDepth=10)
Definition: Utils.py:410