"""
HCE project, Python bindings, Crawler application.
Dynamic fetcher module tests of FB scraping.
Account: hcegroup2016@gmail.com/testtest2016

@file ftest_dc_dynamic_fetcher.py
@author bgv <bgv.hce@gmail.com>
@link: http://hierarchical-cluster-engine.com/
@copyright: Copyright © 2016 IOIX Ukraine
@license: http://hierarchical-cluster-engine.com/license/
"""

import os
import json
import logging

import jsonpickle

# Project-internal imports; the module paths below are assumed from the HCE
# source tree layout and may differ in your checkout.
from dc_crawler.Fetcher import BaseFetcher
from app.Utils import varDump

logger = logging.getLogger('ftest_dc_dynamic_fetcher')
logger.setLevel(logging.DEBUG)

ch = logging.StreamHandler()
ch.setLevel(logging.DEBUG)

formatter = logging.Formatter('%(asctime)s - %(thread)ld - %(threadName)s - %(module)s - %(funcName)s - %(levelname)s - %(message)s')
ch.setFormatter(formatter)
# Attach the console handler (the original listing elides this standard step).
logger.addHandler(ch)

if __name__ == "__main__":
  print "CurDir:\n", os.path.dirname(os.path.realpath(__file__))
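
  # The following parameters are defined in parts of the original test that are
  # elided from this listing; the values here are placeholder assumptions so
  # the excerpt stays self-contained and runnable.
  httpTimeout = 30000        # request timeout in milliseconds (assumed)
  allowRedirects = True      # assumed
  proxies = None             # assumed
  authorization = None       # assumed
  postData = None            # assumed
  maxHttpRedirects = 3       # assumed
  localFilters = None        # assumed
  urlObjDepth = 0            # assumed
  userDataDir = None         # assumed: path to a browser user-data dir, or None
  userCacheDir = None        # assumed
  userProfileZip = None      # assumed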

  headersDict = {}
  with open("../../ini/crawler-task_headers.txt", 'r') as f:
    hdrs = ''.join(f.readlines())
  for header in hdrs.splitlines():
    try:
      # Split each "Name: value" line on the first colon.
      key, value = header[:header.index(':')].strip(), header[header.index(':') + len(':'):].strip()
    except ValueError:
      print "header error:%s" % header
      continue
    headersDict[key] = value
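  # The headers file is expected to hold one "Name: value" pair per line; a
  # hypothetical example of its contents (not the shipped file):
  #   User-Agent: Mozilla/5.0 (X11; Linux x86_64)
  #   Accept: text/html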

  process_content_types = ["text/html"]
  maxResourceSize = 1024 * 1024  # 1 MiB

  fetchType = BaseFetcher.TYP_NORMAL

  inlineMacro = "window.IFRAME_QUERY_URL=\"https://plus.google.com/s/%25QUERY_STRING%25\";" + \
                "window.IFRAME_CSCROLL_COUNT=100;" + \
                "window.IFRAME_MAX_TIME=350;" + \
                "window.IFRAME_SFIELD=\"title\";"
  url = 'https://plus.google.com/' + '###' + inlineMacro
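  # Appending the script after a '###' delimiter appears to be this fetcher's
  # convention for passing inline page-initialization JS along with the URL;
  # the window.IFRAME_* globals parameterize the macro scripts loaded below.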

  tm = int(httpTimeout) / 1000.0
  if isinstance(httpTimeout, float):
    tm += float('0' + str(httpTimeout).strip()[str(httpTimeout).strip().find('.'):])
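  # Worked example of the conversion above (illustrative value only): for
  # httpTimeout = 1500.5, tm = int(1500.5) / 1000.0 = 1.5, and the fractional
  # part '.5' is re-added as seconds via float('0' + '.5'), giving tm == 2.0.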
  headersDict.update({'--disable-web-security': '', '--allow-running-insecure-content': ''})
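  # Keys that start with '--' are not real HTTP headers; the dynamic fetcher
  # seems to treat them as command-line switches for the underlying browser
  # (compare the --user-data-dir / --disk-cache-dir entries set further down).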

  macroCode = {"name": "tests",
               "sets": [{"name": "set1", "items": ['', '', '', '', ''], "repeat": 1, "delay": 0}],
               "result_content_type": "text/json",
               "result_fetcher_type": 1}
  macroCode['sets'][0]['items'][0] = '5'
  macroCode['sets'][0]['items'][1] = "file:///tmp/ftest_dc_fetchers_gplus_social_macro_data.js"
  macroCode['sets'][0]['items'][2] = 'file:///tmp/social.js'
  macroCode['sets'][0]['items'][3] = '!5:20:return window.IFRAME_DATA_READY;'
  macroCode['sets'][0]['items'][4] = "return window.MACRO_COLLECT;"

  if userDataDir is not None:
    headersDict['--user-data-dir'] = userDataDir
  if userCacheDir is not None:
    headersDict['--disk-cache-dir'] = userCacheDir
  if userProfileZip is not None:
    headersDict['--user-data-dir-zip'] = userProfileZip
  headersDict['--log-chrome-debug-log'] = ''

  # Switch from the plain HTTP fetcher to the dynamic (browser-based) one.
  fetchType = BaseFetcher.TYP_DYNAMIC

  os.chdir("../../bin/")
133 print "headersDict:\n",
varDump(headersDict)
134 f = BaseFetcher.get_fetcher(fetchType)
  if userDataDir is not None:
    pass  # user-data-dir housekeeping elided in the original listing

  res = None
  try:
    res = f.open(url, timeout=tm, headers=headersDict, allow_redirects=allowRedirects, proxies=proxies,
                 auth=authorization, data=postData, log=logger, allowed_content_types=process_content_types,
                 max_resource_size=maxResourceSize, max_redirects=maxHttpRedirects, filters=localFilters,
                 depth=urlObjDepth, macro=macroCode)
  except Exception, err:
    print "Exception:\n", varDump(err)

  # Round-trip through jsonpickle and json to pretty-print the result object.
  rd = json.dumps(json.loads(jsonpickle.encode(res)), indent=2)
  print "Result:\n", rd
# For reference, the signature of the varDump() helper used above:
#   varDump(obj, stringify=True, strTypeMaxLen=256, strTypeCutSuffix='...', stringifyType=1,
#           ignoreErrors=False, objectsHash=None, depth=0, indent=2, ensure_ascii=False, maxDepth=10)