"""
HCE project, Python bindings, Crawler application.

@file ftest_dc_fetchers.py
@author bgv <bgv.hce@gmail.com>
@link: http://hierarchical-cluster-engine.com/
@copyright: Copyright © 2015 IOIX Ukraine
@license: http://hierarchical-cluster-engine.com/license/
"""

# Module-level logger for this functional test; DEBUG level so every
# fetcher message reaches the console during a manual run.
logger = logging.getLogger('ftest_dc_fetchers')
logger.setLevel(logging.DEBUG)

# Console (stderr) handler -- this test is run interactively.
ch = logging.StreamHandler()
ch.setLevel(logging.DEBUG)

formatter = logging.Formatter(
    '%(asctime)s - %(thread)ld - %(threadName)s - %(module)s ' + \
    '- %(funcName)s - %(levelname)s - %(message)s')
ch.setFormatter(formatter)
# FIX: the handler was configured but never attached to the logger, so no
# output was ever emitted through it. (NOTE(review): if the original file
# already attached it in a line outside this view, drop this duplicate.)
logger.addHandler(ch)
39 if __name__ ==
"__main__":
42 print "CurDir:\n", os.path.dirname(os.path.realpath(__file__))
47 with open(
"../../ini/crawler-task_headers.txt",
'r') as f: 48 hdrs = ''.
join(f.readlines()) +
"\n" 49 for header
in hdrs.splitlines():
51 if not header
or header[0] ==
'#':
54 key, value = header[:header.index(
':')].strip(), header[header.index(
':') + len(
':'):].strip()
56 print "header error:%s", header
58 headersDict[key] = value
60 print "headersDict:\n",
varDump(headersDict)
70 url =
'https://twitter.com/search?f=tweets&vertical=default&q=%QUERY_STRING%&src=typd' 71 urls =
'http://www.bbc.com/news/world-us-canada-38141686' 72 url = url.replace(
'%QUERY_STRING%', urllib.quote(urls))
74 tm = int(httpTimeout) / 1000.0
75 if isinstance(httpTimeout, float):
76 tm += float(
'0' + str(httpTimeout).strip()[str(httpTimeout).strip().find(
'.'):])
81 process_content_types = [
"text/html"]
82 maxResourceSize = 1024 * 1024
84 fetchType = BaseFetcher.TYP_NORMAL
90 macroCode = {
"name":
"tests",
91 "sets":[{
"name":
"set1",
"items":[
'',
'',
'',
''],
"repeat":1,
"delay":0}],
93 "result_content_type":
"text/json",
94 "result_fetcher_type":1}
96 macroCode[
'sets'][0][
'items'][0] =
'3' 97 macroCode[
'sets'][0][
'items'][1] =
'http://127.0.0.1/macro_test_tw.js' 98 macroCode[
'sets'][0][
'items'][2] =
'10' 99 macroCode[
'sets'][0][
'items'][3] =
"return window.IFRAME_ITEMS;" 100 fetchType = BaseFetcher.TYP_DYNAMIC
102 os.chdir(
"../../bin/")
106 res = BaseFetcher.get_fetcher(fetchType).open(url, timeout=tm, headers=headersDict,
107 allow_redirects=allowRedirects, proxies=proxies,
108 auth=authorization, data=postData, log=logger,
109 allowed_content_types=process_content_types,
110 max_resource_size=maxResourceSize,
111 max_redirects=maxHttpRedirects,
112 filters=localFilters, depth=urlObjDepth, macro=macroCode)
114 except Exception, err:
116 print "Exception:\n",
varDump(err)
119 rd = json.dumps(json.loads(jsonpickle.encode(res)), indent=2)
121 print "Result:\n", rd
def varDump(obj, stringify=True, strTypeMaxLen=256, strTypeCutSuffix='...', stringifyType=1, ignoreErrors=False, objectsHash=None, depth=0, indent=2, ensure_ascii=False, maxDepth=10)