"""
HCE project, Python bindings, Crawler application.

@file ftest_dc_fetchers.py
@author bgv <bgv.hce@gmail.com>
@link: http://hierarchical-cluster-engine.com/
@copyright: Copyright © 2015 IOIX Ukraine
@license: http://hierarchical-cluster-engine.com/license/
"""

# Console logger for the functional test; DEBUG level so every fetch
# step is visible on stderr while the test runs.
logger = logging.getLogger('ftest_dc_fetchers')
logger.setLevel(logging.DEBUG)

# Stream handler at DEBUG as well (the handler is attached to the logger
# in code outside this chunk).
ch = logging.StreamHandler()
ch.setLevel(logging.DEBUG)

# One-line record layout: timestamp, thread id/name, module, function,
# level, message.
formatter = logging.Formatter('%(asctime)s - %(thread)ld - %(threadName)s - %(module)s '
                              '- %(funcName)s - %(levelname)s - %(message)s')
ch.setFormatter(formatter)
38 if __name__ ==
"__main__":
41 print "CurDir:\n", os.path.dirname(os.path.realpath(__file__))
46 with open(
"../../ini/crawler-task_headers.txt",
'r') as f: 47 hdrs = ''.
join(f.readlines()) +
"\n" 48 for header
in hdrs.splitlines():
50 if not header
or header[0] ==
'#':
53 key, value = header[:header.index(
':')].strip(), header[header.index(
':') + len(
':'):].strip()
55 print "header error:%s", header
57 headersDict[key] = value
59 print "headersDict:\n",
varDump(headersDict)
68 url =
'https://www.google.co.jp/search?q=&gws_rd=cr###window.IFRAME_KWSRC="http://127.0.0.1/keywords.txt";' 70 tm = int(httpTimeout) / 1000.0
71 if isinstance(httpTimeout, float):
72 tm += float(
'0' + str(httpTimeout).strip()[str(httpTimeout).strip().find(
'.'):])
77 process_content_types = [
"text/html"]
78 maxResourceSize = 1024 * 1024
80 fetchType = BaseFetcher.TYP_NORMAL
86 macroCode = {
"name":
"tests",
87 "sets":[{
"name":
"set1",
"items":[
'',
'',
'',
''],
"repeat":1,
"delay":0}],
89 "result_content_type":
"text/json",
90 "result_fetcher_type":1}
92 macroCode[
'sets'][0][
'items'][0] =
'1' 93 macroCode[
'sets'][0][
'items'][1] = \
95 var s=window.document.createElement('script');\ 96 s.src='http://127.0.0.1/macro_test4.js';\ 97 s.type='text/javascript';\ 98 window.document.head.appendChild(s);\ 99 return [window.jQuery===undefined, window.MACRO_PREPARE===undefined, window.MACRO_COLLECT===undefined];\ 101 macroCode[
'sets'][0][
'items'][2] =
'30' 102 macroCode[
'sets'][0][
'items'][3] = \
104 if(window.MACRO_COLLECT===undefined){\ 105 return [window.jQuery===undefined, window.MACRO_COLLECT===undefined];\ 107 return [window.jQuery===undefined, window.MACRO_COLLECT([window.IFRAME_NAME, window.IFRAME_URLS])];\ 110 fetchType = BaseFetcher.TYP_DYNAMIC
112 os.chdir(
"../../bin/")
116 res = BaseFetcher.get_fetcher(fetchType).open(url, timeout=tm, headers=headersDict,
117 allow_redirects=allowRedirects, proxies=proxies,
118 auth=authorization, data=postData, log=logger,
119 allowed_content_types=process_content_types,
120 max_resource_size=maxResourceSize,
121 max_redirects=maxHttpRedirects,
122 filters=localFilters, depth=urlObjDepth, macro=macroCode)
124 except Exception, err:
126 print "Exception:\n",
varDump(err)
129 rd = json.dumps(json.loads(jsonpickle.encode(res)), indent=2)
131 print "Result:\n", rd
def varDump(obj, stringify=True, strTypeMaxLen=256, strTypeCutSuffix='...', stringifyType=1, ignoreErrors=False, objectsHash=None, depth=0, indent=2, ensure_ascii=False, maxDepth=10)