HCE Project Python language Distributed Tasks Manager Application, Distributed Crawler Application and client API bindings.  2.0.0-chaika
Hierarchical Cluster Engine Python language binding
ftest_dc_dynamic_fetcher.py
Go to the documentation of this file.
1 """
2 HCE project, Python bindings, Crawler application.
3 Dynamic fetcher module tests.
4 
5 @package: dc
6 @file ftest_dc_dynamic_fetcher.py
7 @author bgv <bgv.hce@gmail.com>
8 @link: http://hierarchical-cluster-engine.com/
9 @copyright: Copyright &copy; 2016 IOIX Ukraine
10 @license: http://hierarchical-cluster-engine.com/license/
11 @since: 1.4.3
12 """
13 
14 import logging
15 import os
16 import json
17 import jsonpickle
18 
19 from app.Utils import varDump
20 from dc_crawler.Fetcher import BaseFetcher
21 
22 
# Module-level logger for this functional test. It is handed to the fetcher
# via the ``log=`` argument, so everything the fetcher logs goes to the console.
logger = logging.getLogger('ftest_dc_dynamic_fetcher')
logger.setLevel(logging.DEBUG)

# Console handler; DEBUG on both the logger and the handler means nothing
# is filtered out.
ch = logging.StreamHandler()
ch.setLevel(logging.DEBUG)

# Each record carries a timestamp, thread id and name, module, function and level.
formatter = logging.Formatter('%(asctime)s - %(thread)ld - %(threadName)s - %(module)s - %(funcName)s - %(levelname)s - %(message)s')
ch.setFormatter(formatter)

logger.addHandler(ch)
35 
36 
if __name__ == "__main__":
    # Functional test of the dynamic fetcher: open one URL through
    # BaseFetcher.get_fetcher(BaseFetcher.TYP_DYNAMIC) with a JavaScript macro
    # attached and print whatever comes back as indented JSON.
    #
    # The print() calls below take a single pre-formatted string so that the
    # output is the same under the Python 2 print statement and the Python 3
    # print function.
    res = None

    print("CurDir:\n%s" % (os.path.dirname(os.path.realpath(__file__)),))
    loadHeaders = False
    headersDict = {}

    if loadHeaders:
        # Optional: load "Name: value" request headers, one per line.
        # The path is relative to the current working directory.
        with open("../../ini/crawler-task_headers.txt", 'r') as f:
            hdrs = f.read()
        for header in hdrs.splitlines():
            if not header:
                continue
            # Split on the first ':' only; a line without ':' is a fatal error.
            key, sep, value = header.partition(':')
            if not sep:
                print("header error:%s" % (header,))
                os.abort()
            headersDict[key.strip()] = value.strip()

    print("headersDict:\n%s" % (varDump(headersDict),))

    allowRedirects = 1
    proxies = None
    authorization = None
    postData = None
    process_content_types = ["text/html"]
    maxResourceSize = 1024 * 1024
    maxHttpRedirects = 3
    localFilters = None
    urlObjDepth = 0
    # Everything after '###' is JavaScript assignments appended to the URL.
    # NOTE(review): presumably consumed by the dynamic fetcher / injected page
    # script rather than sent to the server -- confirm against the fetcher.
    url = ('https://www.google.co.jp/search?q=&gws_rd=cr'
           '###window.IFRAME_KWSRC="http://127.0.0.1/keywords_big.txt";window.IFRAME_KWSRC_FROM=0;window.IFRAME_KWSRC_N=2;')

    # Timeout is configured in milliseconds and passed to the fetcher in seconds.
    httpTimeout = 60000
    tm = int(httpTimeout) / 1000.0
    if isinstance(httpTimeout, float):
        # NOTE(review): adds the fractional part of the millisecond value to the
        # seconds value as-is; unreachable here because httpTimeout is an int.
        tm += float('0' + str(httpTimeout).strip()[str(httpTimeout).strip().find('.'):])

    # Dynamic fetcher test.
    # The headers collected above are replaced on purpose.
    # NOTE(review): these keys look like browser command-line switches -- confirm
    # how the dynamic fetcher interprets them.
    headersDict = {'--disable-web-security': '', '--allow-running-insecure-content': ''}

    # Macro item 1: inject an external script into the page.
    injectScriptJs = (
        " var s=window.document.createElement('script');"
        " s.src='http://127.0.0.1/google-search1.js';"
        " s.type='text/javascript';"
        " window.document.head.appendChild(s);"
        " "
    )
    # Macro item 3: return MACRO_COLLECT() if the page defines it, else [].
    collectResultJs = (
        " if(window.MACRO_COLLECT===undefined){"
        " /*return [window.jQuery===undefined, window.MACRO_COLLECT===undefined];*/"
        " return [];"
        " }else{"
        " return window.MACRO_COLLECT();"
        " }"
        " "
    )
    macroCode = {
        "name": "tests",
        "sets": [{
            "name": "set1",
            "items": [
                '1',
                injectScriptJs,
                # Alternative value previously used for this item: '20'
                '!5:4:return window.IFRAME_DATA_READY;',
                collectResultJs,
            ],
            "repeat": 1,
            "delay": 0,
        }],
        "result_type": 2,
        "result_content_type": "text/json",
        "result_fetcher_type": 1,
    }

    fetchType = BaseFetcher.TYP_DYNAMIC
    # Change current dir for webdriver executable run with path ./
    os.chdir("../../bin/")

    try:
        # Test of the DYNAMIC fetcher with the macro defined above.
        res = BaseFetcher.get_fetcher(fetchType).open(url, timeout=tm, headers=headersDict,
                                                      allow_redirects=allowRedirects, proxies=proxies,
                                                      auth=authorization, data=postData, log=logger,
                                                      allowed_content_types=process_content_types,
                                                      max_resource_size=maxResourceSize,
                                                      max_redirects=maxHttpRedirects,
                                                      filters=localFilters, depth=urlObjDepth, macro=macroCode)
    except Exception as err:
        # Top-level boundary of a test script: report the error and still dump
        # the (possibly None) result below.
        print("Exception:\n%s" % (varDump(err),))

    # Round-trip through jsonpickle and json to get an indented dump of the result.
    rd = json.dumps(json.loads(jsonpickle.encode(res)), indent=2)
    print("Result:\n%s" % (rd,))
126 
127 
def varDump(obj, stringify=True, strTypeMaxLen=256, strTypeCutSuffix='...', stringifyType=1, ignoreErrors=False, objectsHash=None, depth=0, indent=2, ensure_ascii=False, maxDepth=10)
Definition: Utils.py:410
Definition: join.py:1