"""
HCE project, Python bindings, Crawler application.

@file ftest_dc_fetchers.py
@author bgv <bgv.hce@gmail.com>
@link: http://hierarchical-cluster-engine.com/
@copyright: Copyright © 2015 IOIX Ukraine
@license: http://hierarchical-cluster-engine.com/license/
"""

# Module-level logger for this functional test; DEBUG so every fetcher
# diagnostic reaches the console while the test runs.
logger = logging.getLogger('ftest_dc_fetchers')
logger.setLevel(logging.DEBUG)

# Console handler, also at DEBUG (no filtering at the handler level).
ch = logging.StreamHandler()
ch.setLevel(logging.DEBUG)

# Verbose record format: timestamp, thread id/name, module, function,
# level and message — useful when the fetcher spawns worker threads.
formatter = logging.Formatter(
    '%(asctime)s - %(thread)ld - %(threadName)s - %(module)s - %(funcName)s - %(levelname)s - %(message)s')
ch.setFormatter(formatter)
# NOTE(review): logger.addHandler(ch) is not visible in this chunk —
# presumably attached in the elided lines (original lines 33-36); confirm.
# --- Script entry point (garbled extraction: the original file's line
# numbers such as "37", "40" are fused into the code text below, and the
# try/except structure around the header parsing is in elided lines) ---
37 if __name__ ==
"__main__":
# Print the script's directory for debugging the relative paths used below.
40 print "CurDir:\n", os.path.dirname(os.path.realpath(__file__))
# Read the whole crawler-task headers file into a single string.
46 with open(
"../../ini/crawler-task_headers.txt",
'r') as f: 47 hdrs = ''.
join(f.readlines())
# Parse each "Key: Value" line of the headers file.
48 for header
in hdrs.splitlines():
# Split on the first ':'; both the key and the value are stripped.
# str.index raises ValueError when no ':' is present — the handler for
# that (presumably a try/except around this statement) is elided here.
52 key, value = header[:header.index(
':')].strip(), header[header.index(
':') + len(
':'):].strip()
# NOTE(review): this looks like the except-branch reporting an unparsable
# header line; also note the ","-style print leaves "%s" unformatted.
54 print "header error:%s", header
# NOTE(review): headersDict is assigned here but its initialization
# (presumably headersDict = {}) is in elided lines — confirm.
56 headersDict[key] = value
# Dump the collected headers (varDump is a project pretty-printer, see
# its signature at the end of this chunk).
58 print "headersDict:\n",
varDump(headersDict)
# Target URL; the fragment after '###' carries a JS snippet that sets
# window.IFRAME_KWSRC for the dynamic fetcher's macro machinery.
url = 'https://www.google.co.jp/search?q=&gws_rd=cr###window.IFRAME_KWSRC="http://127.0.0.1/keywords_big.txt";'

# Convert the millisecond timeout to float seconds.  When httpTimeout is a
# float, re-attach its fractional part via string surgery so sub-millisecond
# digits survive the int() truncation above.
# NOTE(review): httpTimeout is defined in lines elided from this chunk —
# confirm its type and units against the full file.
tm = int(httpTimeout) / 1000.0
if isinstance(httpTimeout, float):
  tm += float('0' + str(httpTimeout).strip()[str(httpTimeout).strip().find('.'):])

# Only HTML responses are processed; resources are capped at 1 MiB.
process_content_types = ["text/html"]
maxResourceSize = 1024 * 1024

# Default to the plain (non-JS) fetcher; later re-assigned to
# BaseFetcher.TYP_DYNAMIC before the dynamic-macro test run.
fetchType = BaseFetcher.TYP_NORMAL
# --- Dynamic-fetcher configuration (garbled extraction continues: fused
# original line numbers, and the quoting of the JavaScript macro string
# literals below was lost, so those lines are not valid Python as shown) ---
# headersDict is re-purposed here: for the dynamic fetcher these are
# Chromium-style command-line switches, not HTTP headers.
81 headersDict = {
'--disable-web-security':
'',
'--allow-running-insecure-content':
''}
# macroCode describes a scripted browser session: one set of four JS
# "items" executed in order, no repeats, no inter-step delay, with the
# collected result serialized as JSON.
82 macroCode = {
"name":
"tests",
83 "sets":[{
"name":
"set1",
"items":[
'',
'',
'',
''],
"repeat":1,
"delay":0}],
85 "result_content_type":
"text/json",
86 "result_fetcher_type":1}
# Item 0: '1' — presumably an initial delay/step marker; confirm semantics.
88 macroCode[
'sets'][0][
'items'][0] =
'1' 89 macroCode[
'sets'][0][
'items'][1] = \
# Item 1: JS that injects http://127.0.0.1/macro_test4.js into the page and
# reports whether jQuery / MACRO_PREPARE / MACRO_COLLECT are still undefined.
# NOTE(review): the enclosing Python string quotes were lost in extraction.
91 var s=window.document.createElement('script');\ 92 s.src='http://127.0.0.1/macro_test4.js';\ 93 s.type='text/javascript';\ 94 window.document.head.appendChild(s);\ 95 return [window.jQuery===undefined, window.MACRO_PREPARE===undefined, window.MACRO_COLLECT===undefined];\ 97 macroCode[
'sets'][0][
'items'][2] =
# Item 2: '50' — presumably a wait (ms or ticks) for the script to load.
'50' 98 macroCode[
'sets'][0][
'items'][3] = \
# Item 3: JS that calls MACRO_COLLECT with the iframe name/URLs if the
# injected script loaded, else returns the undefined-flags for debugging.
100 if(window.MACRO_COLLECT===undefined){\ 101 return [window.jQuery===undefined, window.MACRO_COLLECT===undefined];\ 103 return [window.jQuery===undefined, window.MACRO_COLLECT([window.IFRAME_NAME, window.IFRAME_URLS])];\ 106 fetchType = BaseFetcher.TYP_DYNAMIC
# --- Execute the fetch and report the result (Python 2 syntax; the `try:`
# opening the except below is in lines elided from this extraction) ---
# The dynamic fetcher apparently resolves its binaries relative to cwd.
108 os.chdir(
"../../bin/")
# Single fetch: all knobs collected above plus several variables
# (allowRedirects, proxies, authorization, postData, maxHttpRedirects,
# localFilters, urlObjDepth) that are defined in elided lines — confirm.
112 res = BaseFetcher.get_fetcher(fetchType).open(url, timeout=tm, headers=headersDict,
113 allow_redirects=allowRedirects, proxies=proxies,
114 auth=authorization, data=postData, log=logger,
115 allowed_content_types=process_content_types,
116 max_resource_size=maxResourceSize,
117 max_redirects=maxHttpRedirects,
118 filters=localFilters, depth=urlObjDepth, macro=macroCode)
# Python 2 except syntax; any failure is dumped rather than re-raised,
# which is acceptable for a manual functional test.
120 except Exception, err:
122 print "Exception:\n",
varDump(err)
# Round-trip through jsonpickle + json purely to pretty-print the result
# object with 2-space indentation.
125 rd = json.dumps(json.loads(jsonpickle.encode(res)), indent=2)
127 print "Result:\n", rd
def varDump(obj, stringify=True, strTypeMaxLen=256, strTypeCutSuffix='...', stringifyType=1, ignoreErrors=False, objectsHash=None, depth=0, indent=2, ensure_ascii=False, maxDepth=10)