HCE Project Python language Distributed Tasks Manager Application, Distributed Crawler Application and client API bindings.  2.0.0-chaika
Hierarchical Cluster Engine Python language binding
ftest_dc_fetchers_gplus.py
1 """
2 HCE project, Python bindings, Crawler application.
3 Dynamic fetcher module tests of FB scrapping. Account: hcegroup2016@gmail.com/testtest2016
4 
5 @package: dc
6 @file ftest_dc_dynamic_fetcher.py
7 @author bgv <bgv.hce@gmail.com>
8 @link: http://hierarchical-cluster-engine.com/
9 @copyright: Copyright &copy; 2016 IOIX Ukraine
10 @license: http://hierarchical-cluster-engine.com/license/
11 @since: 1.4.3
12 """

import logging
import os
import json
import jsonpickle

from app.Utils import varDump
from dc_crawler.Fetcher import BaseFetcher


# create logger
logger = logging.getLogger('ftest_dc_dynamic_fetcher')
logger.setLevel(logging.DEBUG)
# create console handler and set level to debug
ch = logging.StreamHandler()
ch.setLevel(logging.DEBUG)
# create formatter
formatter = logging.Formatter('%(asctime)s - %(thread)ld - %(threadName)s - %(module)s - %(funcName)s - %(levelname)s - %(message)s')
# add formatter to ch
ch.setFormatter(formatter)
# add ch to logger
logger.addHandler(ch)


if __name__ == "__main__":
  res = None

  print "CurDir:\n", os.path.dirname(os.path.realpath(__file__))
  loadHeaders = False
  headersDict = {}
  userDataDir = None
  userCacheDir = None
  userProfileZip = None
  # userDataDir = '/tmp/custom_profile_fb'
  # userDataDir = '/tmp/custom_profile_fb_noimg'
  # userDataDir = '/tmp/custom_profile'
  # userCacheDir = '/tmp/custom_profile_cache'

  # userProfileZip = '/tmp/custom_profile_fb_noimg50.zip'
  # userProfileZip = '/tmp/custom_profile_fb_noimg50b.zip'
  # userProfileZip = '/tmp/custom_profile_fb_noimg50c.zip'
  # userProfileZip = '/tmp/custom_profile_fb_noimg50d.zip'
  # userProfileZip = '/tmp/custom_profile_fb_noimg50e.zip'
  # userProfileZip = '/tmp/custom_profile_fb_noimg50f.zip'
  # userProfileZip = '/tmp/custom_profile_fb_noimg50g.zip'
  # userProfileZip = '/tmp/custom_profile_fb_noimg50h.zip'
  # userProfileZip = '/tmp/custom_profile_fb_noimg50i.zip'
  # userProfileZip = '/tmp/custom_profile_fb_noimg50j.zip'
  # userProfileZip = '/tmp/custom_profile_fb_noimg50k.zip'
  # userProfileZip = '/tmp/custom_profile_fb_noimg50l.zip'
  # userProfileZip = '/tmp/custom_profile_fb_noimg50m.zip'
  # userProfileZip = '/tmp/custom_profile_fb_noimg50.zip,/tmp/custom_profile_fb_noimg50b.zip,/tmp/custom_profile_fb_noimg50c.zip'
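  # NOTE: judging by the last example above and the '--user-data-dir-zip-rotation'
  # option set further below, userProfileZip seems to accept either a single profile
  # zip or a comma-separated list of zips, one of which is picked per run according
  # to the rotation type (0 - random, 1 - cyclic).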

  if loadHeaders:
    # headers file format: one "Header-Name: value" pair per line
    hdrs = None
    with open("../../ini/crawler-task_headers.txt", 'r') as f:
      hdrs = ''.join(f.readlines())
    for header in hdrs.splitlines():
      if not header:
        continue
      try:
        key, value = header[:header.index(':')].strip(), header[header.index(':') + len(':'):].strip()
      except Exception:
        print "header error: %s" % header
        os.abort()
      headersDict[key] = value

  allowRedirects = 1
  proxies = None
  # proxies = (proxy_type, proxy_host, proxy_port, proxy_user, proxy_passwd)
  # proxies = ('http', 'dev.hce-project.com', '3129', None, None)
  authorization = None
  postData = None
  process_content_types = ["text/html"]
  maxResourceSize = 1024 * 1024
  maxHttpRedirects = 3
  fetchType = BaseFetcher.TYP_NORMAL
  localFilters = None
  urlObjDepth = 0
  inlineMacro = "window.IFRAME_QUERY_URL=\"https://plus.google.com/s/%25QUERY_STRING%25\";" + \
                "window.IFRAME_CSCROLL_COUNT=100;" + \
                "window.IFRAME_MAX_TIME=350;" + \
                "window.IFRAME_SFIELD=\"title\";"
  url = 'https://plus.google.com/' + '###' + inlineMacro
  httpTimeout = 120000
  # httpTimeout is given in milliseconds; the fetcher timeout is passed in seconds
  tm = int(httpTimeout) / 1000.0
  if isinstance(httpTimeout, float):
    tm += float('0' + str(httpTimeout).strip()[str(httpTimeout).strip().find('.'):])
  headersDict.update({'--disable-web-security':'', '--allow-running-insecure-content':''})
  macroCode = {"name":"tests",
               "sets":[{"name":"set1", "items":['', '', '', '', ''], "repeat":1, "delay":0}],
               "result_type":2,
               "result_content_type":"text/json",
               "result_fetcher_type":1}
  macroCode['sets'][0]['items'][0] = '5'
  macroCode['sets'][0]['items'][1] = "file:///tmp/ftest_dc_fetchers_gplus_social_macro_data.js"
  macroCode['sets'][0]['items'][2] = 'file:///tmp/social.js'
  macroCode['sets'][0]['items'][3] = '!5:20:return window.IFRAME_DATA_READY;'
  macroCode['sets'][0]['items'][4] = "return window.MACRO_COLLECT;"
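  # The five macro items are not documented here; judging by their values they seem to
  # be processed in order by the dynamic fetcher:
  #  [0] - '5': a plain delay (presumably seconds) before the macro runs;
  #  [1] - a JS helper injected from a file:// URL (test-specific macro data);
  #  [2] - a second injected JS helper (the social.js scraper);
  #  [3] - a '!'-prefixed wait condition ('!5:20:<expr>'), apparently polled until
  #        window.IFRAME_DATA_READY becomes true;
  #  [4] - the final expression whose return value (window.MACRO_COLLECT) becomes the
  #        macro result, returned as "text/json" per result_content_type above.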

  if userDataDir is not None:
    headersDict['--user-data-dir'] = userDataDir
  if userCacheDir is not None:
    headersDict['--disk-cache-dir'] = userCacheDir
  if userProfileZip is not None:
    headersDict['--user-data-dir-zip'] = userProfileZip
    # Rotation type 0 - random, 1 - cyclic, 2 - next not used now
    # headersDict['--user-data-dir-zip-rotation'] = 0
  headersDict['--log-chrome-debug-log'] = ''
  # headersDict['--proxy-http'] = 'http://proxy1.dev.hce-project.com%3A3180'
  # headersDict['--proxy-http'] = 'http://proxy2.dev.hce-project.com%3A3280'
  # headersDict['--proxy-http'] = 'http://proxy3.dev.hce-project.com%3A3380'
  # headersDict['--proxy-http'] = 'http://proxy4.dev.hce-project.com%3A3480'
  # headersDict['--proxy-http-domains'] = 'www.facebook.com'
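  # NOTE: judging by these commented-out examples, '--proxy-http' takes a full proxy URL
  # with the host:port colon percent-encoded as %3A, and '--proxy-http-domains' appears
  # to restrict proxying to the listed domain(s) only.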

  fetchType = BaseFetcher.TYP_DYNAMIC
  # change current dir for webdriver executable run with path ./
  os.chdir("../../bin/")

  try:
    print "headersDict:\n", varDump(headersDict)
    f = BaseFetcher.get_fetcher(fetchType)
    if userDataDir is not None:
      f.tmpDir = ''
    res = f.open(url, timeout=tm, headers=headersDict, allow_redirects=allowRedirects, proxies=proxies,
                 auth=authorization, data=postData, log=logger, allowed_content_types=process_content_types,
                 max_resource_size=maxResourceSize, max_redirects=maxHttpRedirects, filters=localFilters,
                 depth=urlObjDepth, macro=macroCode)

  except Exception, err:
    # logger.debug("Exception:\n%s", varDump(err))
    print "Exception:\n", varDump(err)

  # rd = varDump(res)
  rd = json.dumps(json.loads(jsonpickle.encode(res)), indent=2)
  # logger.debug("Result:\n%s", varDump(res))
  print "Result:\n", rd