# -*- coding: utf-8 -*-
"""
HCE project, Python bindings, Crawler application.
Dynamic fetcher module tests.

@file ftest_dc_dynamic_fetcher.py
@author bgv <bgv.hce@gmail.com>
@link: http://hierarchical-cluster-engine.com/
@copyright: Copyright © 2016 IOIX Ukraine
@license: http://hierarchical-cluster-engine.com/license/
"""

import json
import logging
import os

# NOTE(review): the import section of this script is not visible in the
# listing this file was recovered from. `jsonpickle`, `BaseFetcher` and
# `varDump` are used below and still have to be imported from the modules
# that provide them in the project.

# Console logger handed to the fetcher through the `log` argument.
logger = logging.getLogger('ftest_dc_dynamic_fetcher')
logger.setLevel(logging.DEBUG)

ch = logging.StreamHandler()
ch.setLevel(logging.DEBUG)

formatter = logging.Formatter(
    '%(asctime)s - %(thread)ld - %(threadName)s - %(module)s - %(funcName)s - %(levelname)s - %(message)s')
ch.setFormatter(formatter)
# Attach the handler: without this call the handler configured above is
# never used by `logger`.
logger.addHandler(ch)


if __name__ == "__main__":
  # Every print below passes one pre-formatted string in parentheses, which
  # produces the same output under the Python 2 print statement and the
  # Python 3 print function.
  print("CurDir:\n%s" % (os.path.dirname(os.path.realpath(__file__)),))

  # Parse "Name: value" lines from the headers file into a dict.
  headersDict = {}
  with open("../../ini/crawler-task_headers.txt", 'r') as f:
    hdrs = f.read()
  for header in hdrs.splitlines():
    key, sep, value = header.partition(':')
    if not sep:
      # No ':' separator on this line: report it and skip it instead of
      # storing a stale or undefined key/value pair.
      print("header error:%s" % (header,))
      continue
    headersDict[key.strip()] = value.strip()

  print("headersDict:\n%s" % (varDump(headersDict),))

  # NOTE(review): the listing has gaps around this point. The names
  # httpTimeout, allowRedirects, proxies, authorization, postData,
  # maxHttpRedirects, localFilters and urlObjDepth are used below but their
  # definitions are not visible; restore them from the original script.
  process_content_types = ["text/html"]
  maxResourceSize = 1024 * 1024

  fetchType = BaseFetcher.TYP_NORMAL

  url = 'https://www.google.co.jp/search?q=&gws_rd=cr' + \
        '###window.IFRAME_KWSRC="http://127.0.0.1/keywords_big.txt";window.IFRAME_KWSRC_FROM=0;window.IFRAME_KWSRC_N=2;'

  # Integer part of httpTimeout is divided by 1000.0.
  # NOTE(review): for a float httpTimeout its fractional part is then added
  # to `tm` unscaled - confirm this is intended.
  tm = int(httpTimeout) / 1000.0
  if isinstance(httpTimeout, float):
    timeoutText = str(httpTimeout).strip()
    tm += float('0' + timeoutText[timeoutText.find('.'):])

  # This assignment replaces the headers parsed from the file above.
  headersDict = {'--disable-web-security': '', '--allow-running-insecure-content': ''}

  # NOTE(review): listing line 81 (between "sets" and "result_content_type")
  # is not visible; an entry of this dict may be missing here.
  macroCode = {
      "name": "tests",
      "sets": [{"name": "set1", "items": ['', '', '', ''], "repeat": 1, "delay": 0}],
      "result_content_type": "text/json",
      "result_fetcher_type": 1,
  }

  macroItems = macroCode['sets'][0]['items']
  macroItems[0] = '1'
  # NOTE(review): the string delimiters and the whitespace between the
  # JavaScript statements are not visible in the listing; the statements
  # themselves are reproduced as shown.
  macroItems[1] = (
      "var s=window.document.createElement('script');"
      "s.src='http://127.0.0.1/google-search1.js';"
      "s.type='text/javascript';"
      "window.document.head.appendChild(s);"
  )
  macroItems[2] = '!5:4:return window.IFRAME_DATA_READY;'
  # NOTE(review): listing lines 99-100 and 102-103 of this macro are not
  # visible, so the JavaScript below is INCOMPLETE (the body of the
  # `undefined` branch and the closing of the `if` are missing). Only the
  # visible fragments are reproduced; restore the rest from the original.
  macroItems[3] = (
      "if(window.MACRO_COLLECT===undefined){"
      "/*return [window.jQuery===undefined, window.MACRO_COLLECT===undefined];*/"
      "return window.MACRO_COLLECT();"
  )

  fetchType = BaseFetcher.TYP_DYNAMIC

  # Relative paths used from here on resolve against the bin directory.
  os.chdir("../../bin/")

  # Pre-set the result so the dump below still works if the fetch raises.
  res = None
  try:
    res = BaseFetcher.get_fetcher(fetchType).open(url, timeout=tm, headers=headersDict,
                                                  allow_redirects=allowRedirects, proxies=proxies,
                                                  auth=authorization, data=postData, log=logger,
                                                  allowed_content_types=process_content_types,
                                                  max_resource_size=maxResourceSize,
                                                  max_redirects=maxHttpRedirects,
                                                  filters=localFilters, depth=urlObjDepth, macro=macroCode)
  except Exception as err:
    # Top-level boundary of a test script: report the failure and continue
    # to the result dump.
    print("Exception:\n%s" % (varDump(err),))

  # Round-trip through jsonpickle/json to get an indented dump of the result.
  rd = json.dumps(json.loads(jsonpickle.encode(res)), indent=2)

  print("Result:\n%s" % (rd,))
def varDump(obj, stringify=True, strTypeMaxLen=256, strTypeCutSuffix='...', stringifyType=1, ignoreErrors=False, objectsHash=None, depth=0, indent=2, ensure_ascii=False, maxDepth=10)