HCE Project Python language Distributed Tasks Manager Application, Distributed Crawler Application and client API bindings.  2.0.0-chaika
Hierarchical Cluster Engine Python language binding
ftest_dc_fetchers_twitter1.py
"""
HCE project, Python bindings, Crawler application.
Fetcher module tests.

@package: dc
@file ftest_dc_fetchers_twitter1.py
@author bgv <bgv.hce@gmail.com>
@link: http://hierarchical-cluster-engine.com/
@copyright: Copyright &copy; 2015 IOIX Ukraine
@license: http://hierarchical-cluster-engine.com/license/
@since: 1.4.3
"""

import logging
import os
import json
import jsonpickle
import urllib

from app.Utils import varDump
from dc_crawler.Fetcher import BaseFetcher
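# Note (added for clarity; inferred from how it is used below, not from the
# module's documentation): BaseFetcher.get_fetcher(<type>) returns the concrete
# fetcher implementation for a type constant such as BaseFetcher.TYP_NORMAL
# (requests based) or BaseFetcher.TYP_DYNAMIC (webdriver based), and its open()
# method performs the actual fetch.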


# create logger
logger = logging.getLogger('ftest_dc_fetchers')
logger.setLevel(logging.DEBUG)
# create console handler and set level to debug
ch = logging.StreamHandler()
ch.setLevel(logging.DEBUG)
# create formatter
formatter = logging.Formatter('%(asctime)s - %(thread)ld - %(threadName)s - %(module)s ' + \
                              '- %(funcName)s - %(levelname)s - %(message)s')
# add formatter to ch
ch.setFormatter(formatter)
# add ch to logger
logger.addHandler(ch)


if __name__ == "__main__":
  res = None

  print "CurDir:\n", os.path.dirname(os.path.realpath(__file__))
  loadHeaders = True
  headersDict = {}
  if loadHeaders:
    hdrs = None
    with open("../../ini/crawler-task_headers.txt", 'r') as f:
      hdrs = ''.join(f.readlines()) + "\n"
    for header in hdrs.splitlines():
      print header
      if not header or header[0] == '#':
        continue
      try:
        key, value = header[:header.index(':')].strip(), header[header.index(':') + len(':'):].strip()
      except Exception:
        print "header error: %s" % header
        os.abort()
      headersDict[key] = value

  print "headersDict:\n", varDump(headersDict)
  #import sys
  #sys.exit()
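  # Note (illustrative, not from the original file): the parsing loop above
  # expects plain "Key: Value" lines in crawler-task_headers.txt; blank lines
  # and lines starting with '#' are skipped, so the file could look like:
  #   # default crawler request headers
  #   User-Agent: Mozilla/5.0 (X11; Linux x86_64)
  #   Accept: text/html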

  #url = 'http://127.0.0.1/'
  #url = 'http://127.0.0.1/index0.html'
  #url = 'about:blank'
  #url = 'https://www.google.co.jp/search?q=&gws_rd=cr###window.IFRAME_KWSRC="http://127.0.0.1/keywords_big.txt";'
  #url = 'https://www.google.com/search?q=test&gws_rd=cr'
  #url = 'https://www.google.co.jp/search?q=&gws_rd=cr###window.IFRAME_KWSRC="http://127.0.0.1/keywords.txt";'
  # Twitter search URL template; the BBC article URL below is URL-encoded and
  # substituted as the search query
  url = 'https://twitter.com/search?f=tweets&vertical=default&q=%QUERY_STRING%&src=typd'
  urls = 'http://www.bbc.com/news/world-us-canada-38141686'
  url = url.replace('%QUERY_STRING%', urllib.quote(urls))
  httpTimeout = 70000  # milliseconds
  tm = int(httpTimeout) / 1000.0  # fetch timeout in seconds
  if isinstance(httpTimeout, float):
    tm += float('0' + str(httpTimeout).strip()[str(httpTimeout).strip().find('.'):])
  allowRedirects = 1
  proxies = None
  authorization = None
  postData = None
  process_content_types = ["text/html"]
  maxResourceSize = 1024 * 1024  # 1 MiB limit on fetched resource size
  maxHttpRedirects = 3
  fetchType = BaseFetcher.TYP_NORMAL
  localFilters = None
  urlObjDepth = 0
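  # Rough equivalent, shown only as a sketch (that the NORMAL fetcher wraps the
  # requests library is taken from the comment in the try block below): with
  # fetchType == BaseFetcher.TYP_NORMAL the open() call roughly corresponds to
  #   import requests
  #   r = requests.get(url, timeout=tm, headers=headersDict,
  #                    allow_redirects=bool(allowRedirects), proxies=proxies,
  #                    auth=authorization, data=postData)
  # with the max_resource_size/max_redirects/allowed_content_types limits
  # enforced by the fetcher itself.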

  #Dynamic fetcher test
  #headersDict = {'--disable-web-security':'', '--allow-running-insecure-content':''}
  macroCode = {"name":"tests",
               "sets":[{"name":"set1", "items":['', '', '', ''], "repeat":1, "delay":0}],
               "result_type":2,
               "result_content_type":"text/json",
               "result_fetcher_type":1}

  macroCode['sets'][0]['items'][0] = '3'
  macroCode['sets'][0]['items'][1] = 'http://127.0.0.1/macro_test_tw.js'
  macroCode['sets'][0]['items'][2] = '10'
  macroCode['sets'][0]['items'][3] = "return window.IFRAME_ITEMS;"
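  # The positional 'items' values are interpreted by the dynamic fetcher's macro
  # handler; their exact semantics are not documented here. Judging only from
  # the values used (an assumption): two numeric parameters, the URL of a helper
  # JS file served locally, and the JS statement whose value becomes the macro
  # result.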
  fetchType = BaseFetcher.TYP_DYNAMIC
  #change current dir for webdriver executable run with path ./
  os.chdir("../../bin/")

  try:
    # Fetch using the selected fetcher type (TYP_DYNAMIC here; leave fetchType
    # as BaseFetcher.TYP_NORMAL above to test the requests lib based fetcher)
    res = BaseFetcher.get_fetcher(fetchType).open(url, timeout=tm, headers=headersDict,
                                                  allow_redirects=allowRedirects, proxies=proxies,
                                                  auth=authorization, data=postData, log=logger,
                                                  allowed_content_types=process_content_types,
                                                  max_resource_size=maxResourceSize,
                                                  max_redirects=maxHttpRedirects,
                                                  filters=localFilters, depth=urlObjDepth, macro=macroCode)

  except Exception, err:
    #logger.debug("Exception:\n%s", varDump(err))
    print "Exception:\n", varDump(err)

  #rd = varDump(res)
  rd = json.dumps(json.loads(jsonpickle.encode(res)), indent=2)
  #logger.debug("Result:\n%s", varDump(res))
  print "Result:\n", rd
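  # Optional, illustrative only: the serialized result could also be written to
  # a file for offline inspection, e.g. (hypothetical file name):
  #   with open('fetch_result.json', 'w') as out:
  #     out.write(rd)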