HCE Project Python language Distributed Tasks Manager Application, Distributed Crawler Application and client API bindings.  2.0.0-chaika
Hierarchical Cluster Engine Python language binding
Fetcher.py
Go to the documentation of this file.
1 """
2 HCE project, Python bindings, Distributed Tasks Manager application.
3 web page fetchers.
4 
5 @package: dc
6 @file Fetcher.py
7 @author madk, bgv <developers.hce@gmail.com>
8 @link: http://hierarchical-cluster-engine.com/
9 @copyright: Copyright &copy; 2013-2014 IOIX Ukraine
10 @license: http://hierarchical-cluster-engine.com/license/
11 @since: 0.1
12 """
13 
14 # #The BaseFetcher class defines the interface of fetchers
15 # #this class doesn't implement any fetching itself;
16 # concrete fetcher classes should extend this class
17 #
18 
19 
20 import re
21 import time
22 import ctypes
23 import os
24 import json
25 import logging
26 import base64
27 import shutil
28 from random import randint
29 from urlparse import urlsplit
30 import psutil
31 import requests # pylint: disable=W0611
32 import requests.exceptions
33 from requests.auth import HTTPBasicAuth
34 
35 import dc_crawler.Constants as CONSTS
36 from dc_crawler.Exceptions import InternalCrawlerException
37 from dc_crawler.Exceptions import CrawlerFilterException
38 from dc_crawler.RequestsRedirectWrapper import RequestsRedirectWrapper
39 import app.Consts as APP_CONSTS
40 import app.Utils as Utils
41 from app.Utils import varDump
42 from app.Exceptions import SeleniumFetcherException
43 
44 logger = logging.getLogger(APP_CONSTS.LOGGER_NAME)
45 
46 # max content size for use chardet to detect charset
47 MAX_CONTENT_SIZE_FOR_CHARDET = 5000000
48 
49 
50 class BaseFetcher(object):
51 
52  fetchers = None
53 
54  TYP_NORMAL = 1
55  TYP_DYNAMIC = 2
56  TYP_URLLIB = 5
57  TYP_CONTENT = 6
58  TYP_AUTO = 7
59 
60  CONNECTION_TIMEOUT = 1.0
61 
62  # #constructor
63  # initialize fields
64  #
65  def __init__(self):
67  self.logger = None
68 
69 
70  @staticmethod
71  def init(dbWrapper=None, siteId=None):
72  # enumerate content_types we don't want to fetch
73  BaseFetcher.prohibited_conten_types = ["audio/mpeg", "application/pdf"]
74 
75  BaseFetcher.fetchers = {
76  BaseFetcher.TYP_NORMAL : RequestsFetcher(dbWrapper, siteId),
77  BaseFetcher.TYP_DYNAMIC: SeleniumFetcher(),
78  BaseFetcher.TYP_URLLIB: URLLibFetcher(),
79  BaseFetcher.TYP_CONTENT: ContentFetcher()
80  }
81 
82  # #fetch a url, and return the response
83  #
84  # @param url, the url to fetch
85  # @param method, fetch HTTP method
86  # @param headers, request headers dict
87  # @param timeout, request timeout(seconds)
88  # @param allow_redirects, should follow redirect
89  # @param proxies, proxy setting, tuple of proxy_type, host, port, username, password
90  # @param auth, basic auth setting, tuple of username and password
91  # @param data, post data, used only when method is post
92  # @return Response object
93  def open(self,
94  url,
95  method='get',
96  headers=None,
97  timeout=100,
98  allow_redirects=True,
99  proxies=None,
100  auth=None,
101  data=None,
102  log=None,
103  allowed_content_types=None,
104  max_resource_size=None,
105  max_redirects=CONSTS.MAX_HTTP_REDIRECTS_LIMIT,
106  filters=None,
107  executable_path=None,
108  depth=None,
109  macro=None):
110  if headers is None:
111  headers = {}
112  del url, method, headers, timeout, allow_redirects, proxies, auth, data, log, allowed_content_types, \
113  max_resource_size, max_redirects, filters, executable_path, depth, macro
114 
115 
116  # #get fetched by fetch type
117  #
118  # @param typ, the fetch type
119  # @return fetcher
120  @staticmethod
121  def get_fetcher(typ, dbWrapper=None, siteId=None):
122  if not BaseFetcher.fetchers:
123  BaseFetcher.init(dbWrapper, siteId)
124  if typ in BaseFetcher.fetchers:
125  return BaseFetcher.fetchers[typ]
126  else:
127  raise BaseException("unsupported fetch type:%s" % (typ,))
128 
129 
130  # #check whether the fetcher have meta resource
131  #
132  # @return whether the fetcher have meta resource
134 
135  return False
136 
137  # Get domain name from URL string
138  #
139  # @param url string
140  # @param default value in case of some URL parsing problem
141  # @return domain name string or empty if error
142  def getDomainNameFromURL(self, url, default=''):
143  ret = default
144 
145  urlParts = urlsplit(url)
146  if len(urlParts) > 1:
147  ret = urlParts[1]
148 
149  return ret
150 
151 
152 # # Check redirects hook
153 #
154 #
155 def checkRedirectsHook(r, *args, **kwargs):
156  logger.debug('r.url = ' + str(r.url))
157  logger.debug('args = ' + str(args))
158  logger.debug('kwargs = ' + str(kwargs))
159  logger.debug('type(r): %s, r = %s', str(type(r)), varDump(r))
160 
161 
162 # # Fetcher base on the requests module, cann't execute javascript
163 #
164 #
166 
167  def __init__(self, dbWrapper=None, siteId=None):
168  BaseFetcher.__init__(self)
169 
170  self.dbWrapper = dbWrapper
171  self.siteId = siteId
172 
173 
174  # #fetch a url, and return the response
175  #
176  # @param url, the url to fetch
177  # @param method, fetch HTTP method
178  # @param headers, request headers dict
179  # @param timeout, request timeout(seconds)
180  # @param allow_redirects, should follow redirect
181  # @param proxies - proxy setting tuple
182  # @param auth, basic auth setting, tuple of name and password
183  # @param data, post data, used only when method is post
184  # @return Response object
185  def open(self,
186  url,
187  method='get',
188  headers=None,
189  timeout=100,
190  allow_redirects=True,
191  proxies=None,
192  auth=None,
193  data=None,
194  log=None,
195  allowed_content_types=None,
196  max_resource_size=None,
197  max_redirects=CONSTS.MAX_HTTP_REDIRECTS_LIMIT,
198  filters=None,
199  executable_path=None,
200  depth=None,
201  macro=None):
202 
203  # set logger
204  log = logger if log is None else log
205 
206  headers1 = {}
207  for key in headers.keys():
208  if not key.startswith('--'):
209  headers1[key] = headers[key]
210  headers = headers1
211 
212  if not isinstance(timeout, tuple):
213  if hasattr(self, 'connectionTimeout'):
214  timeout = (self.connectionTimeout, timeout)
215  else:
216  timeout = (self.CONNECTION_TIMEOUT, timeout)
217 
218  if auth:
219  auth = HTTPBasicAuth(auth[0], auth[1])
220 
221  proxy_setting = None
222  if proxies is not None:
223  proxy_type, proxy_host, proxy_port, proxy_user, proxy_passwd = proxies
224  if proxy_type is None:
225  proxy_type = "http"
226  if proxy_user is not None:
227  proxies = "%s://%s:%s@%s:%s" % (proxy_type, proxy_user, proxy_passwd, proxy_host, proxy_port)
228  else:
229  proxies = "%s://%s:%s" % (proxy_type, proxy_host, proxy_port)
230  proxy_setting = {"http" : proxies}
231 
232  # # save location value
233  location = url
234  res = Response()
235  try:
236  requestsRedirect = RequestsRedirectWrapper(self.dbWrapper, self.siteId)
237  impl_res = requestsRedirect.request(url=url,
238  method=method,
239  timeout=timeout,
240  headers=headers,
241  allowRedirects=allow_redirects,
242  proxySetting=proxy_setting,
243  auth=auth,
244  data=data,
245  maxRedirects=max_redirects,
246  filters=filters)
247 
248  log.debug("!!! impl_res.headers: %s", varDump(impl_res.headers))
249  log.debug("!!! impl_res.url: %s", str(impl_res.url))
250 
251  location = impl_res.url
252  headers = dict(impl_res.headers.lower_items())
253 
254  # try to prevent huge content fetching
255  if "content-length" in impl_res.headers and \
256  max_resource_size != CONSTS.MAX_HTTP_SIZE_UNLIMIT and \
257  int(impl_res.headers['content-length']) > max_resource_size:
258  log.debug("Content size overshooted. content-length: %s, max_resource_size: %s" % \
259  (str(impl_res.headers['content-length']), str(max_resource_size)))
260  res.content_size = int(impl_res.headers['content-length'])
261  else:
262  ct = impl_res.headers.get('content-type', '').lower()
263  # don't detect charset for binary content type or BIG response
264  if ct.startswith('application') or ct.startswith('audio') or \
265  len(impl_res.content) >= MAX_CONTENT_SIZE_FOR_CHARDET:
266  if "xml" in ct:
267  encoding = SimpleCharsetDetector().detect(impl_res.content, contentType='xml')
268  log.debug("encoding3=%s", str(encoding))
269  if encoding is not None:
270  impl_res.encoding = encoding
271  else:
272  detected_encoding = impl_res.encoding
273  log.debug("Headers contains 'application' or 'audio' content-type: %s",
274  impl_res.headers.get('content-type', ''))
275  else:
276  # use chardet to improve encoding detect
277 # ct = impl_res.headers.get('content-type', '').lower()
278  log.debug("impl_res.encoding1=%s, content-type=%s", impl_res.encoding, ct)
279  # Try simple way of charset detection for an html
280  encoding = None
281  if "html" in ct:
282  log.debug("Using the SimpleCharsetDetector()")
283  encoding = SimpleCharsetDetector().detect(impl_res.content)
284  log.debug("encoding=%s", str(encoding))
285  if encoding is not None:
286  impl_res.encoding = encoding
287 
288  elif "xml" in ct:
289  encoding = SimpleCharsetDetector().detect(impl_res.content, contentType='xml')
290  log.debug("encoding3=%s", str(encoding))
291  if encoding is not None:
292  impl_res.encoding = encoding
293 
294 
295  if (impl_res.encoding is None) or ((encoding is None) and (impl_res.encoding not in ct and "xml" not in ct)):
296  log.debug("Using the charset to improve encoding detect")
297  detected_encoding = impl_res.apparent_encoding
298  if detected_encoding != 'ascii' and detected_encoding != 'ISO-8859-2':
299  impl_res.encoding = detected_encoding
300  log.debug("impl_res.encoding2=%s", impl_res.encoding)
301  # Fix for pages that has xml document tag but no html structure inside
302  text_buffer = self.fixWrongXMLHeader(impl_res.content)
303  if impl_res.headers.get('content-type', '').startswith('application'):
304  res.unicode_content = impl_res.content
305  else:
306  res.unicode_content = text_buffer
307  res.str_content = impl_res.content
308  if impl_res.headers.get('content-type', '').startswith('application'):
309  res.rendered_unicode_content = impl_res.content
310  else:
311  res.rendered_unicode_content = text_buffer
312  # res.content_size = impl_res.raw.tell()
313  if res.rendered_unicode_content is None:
314  res.content_size = 0
315  else:
316  res.content_size = len(res.rendered_unicode_content)
317 
318  res.headers = impl_res.headers
319  res.redirects = impl_res.history
320  res.status_code = impl_res.status_code
321  res.url = impl_res.url
322  res.encoding = impl_res.encoding
323  res.request = impl_res.request
324  res.cookies = requests.utils.dict_from_cookiejar(impl_res.cookies)
325 
326  # update location value
327  res.headers.update({'Location':location})
328 
329  except (requests.exceptions.Timeout, requests.exceptions.ReadTimeout, requests.exceptions.ConnectTimeout), err:
330  res.error_mask = APP_CONSTS.ERROR_REQUEST_TIMEOUT
331  msg = "Requests fetcher has thrown '%s' exception: " % str(type(err))
332  if isinstance(err, requests.exceptions.Timeout):
333  msg += "The request timed out."
334  elif isinstance(err, requests.exceptions.ReadTimeout):
335  msg += "The server did not send any data in the allotted amount of time."
336  elif isinstance(err, requests.exceptions.ConnectTimeout):
337  msg += "The request timed out while trying to connect to the remote server."
338 
339  log.debug(str(msg))
340  raise err
341  except requests.exceptions.ConnectionError, err:
342  res.error_mask = APP_CONSTS.ERROR_FETCH_CONNECTION_ERROR
343  log.debug(">>> Requests fetcher has thrown ConnectionError exception: " + str(err))
344  raise err
345  except requests.exceptions.HTTPError, err:
346  res.error_mask = APP_CONSTS.ERROR_FETCH_HTTP_ERROR
347  log.debug(">>> Requests fetcher has thrown HTTPError exception: " + str(err))
348  raise err
349  except requests.exceptions.URLRequired, err:
350  res.error_mask = APP_CONSTS.ERROR_FETCH_INVALID_URL
351  log.debug(">>> Requests fetcher has thrown URLRequired exception: " + str(err))
352  raise err
353  except requests.exceptions.TooManyRedirects, err:
354  res.error_mask = APP_CONSTS.ERROR_FETCH_TOO_MANY_REDIRECTS
355  log.debug(">>> Requests fetcher has thrown TooManyRedirects exception: " + str(err))
356  raise err
357  except requests.exceptions.RequestException, err:
358  res.error_mask = APP_CONSTS.ERROR_FETCH_AMBIGUOUS_REQUEST
359  log.debug(">>> Requests fetcher has thrown RequestException exception: " + str(err))
360  raise err
361  except CrawlerFilterException, err:
362  res.error_mask = APP_CONSTS.ERROR_CRAWLER_FILTERS_BREAK
363  log.debug("Crawler has not allowed filter: " + str(err))
364  raise err
365  except Exception, err:
366  res.error_mask = APP_CONSTS.ERROR_FETCHER_INTERNAL
367  log.debug(">>> Requests fetcher has thrown exception" + \
368  " type: " + str(type(err)) + "\n" + Utils.getTracebackInfo())
369  raise InternalCrawlerException("Requests fetcher has thrown exception")
370 
371  return res
372 
373 
374  # #Fix the wrong html document structure in case of XML header is defined first
375  #
376  # @param content input content
377  # @return fixed contentStr
378  def fixWrongXMLHeader(self, contentStr):
379  # text_file = open("/tmp/fetcher_log.txt", "w")
380  # text_file.write("Fetcher: start\n")
381 
382  if contentStr.startswith('<?xml ') and '<html' in contentStr and '<head' in contentStr:
383  # text_file.write("Fetcher: xml detected!\n")
384  p = re.compile(r'<\?xml .*\?>')
385  contentStr = p.sub('', contentStr, count=1)
386  # text_file.write(contentStr)
387 
388  # text_file.close()
389 
390  return contentStr
391 
392 
393 
394 # # Fetcher based on the selenium project web-driver
395 #
396 #
398 
399  DEFAUIL_TIMEOUT = 5
400  CONTENT_TYPE_JSON = 'text/json'
401  CONTENT_TYPE_HTML = 'text/html'
402  DELAY_TERMINATE_AND_QUIT = 0.5
403 
404  ERROR_FATAL = 1
405  ERROR_GENERAL = 2
406  ERROR_CONTENT_OR_COOKIE = 3
407  ERROR_NAME_NOT_RESOLVED = 400
408  ERROR_TOO_MANY_REDIRECTS = 11
409  ERROR_MACRO_RETURN_VALUE = 12
410  ERROR_PROXY_CONNECTION_FAILED = 504
411  ERROR_CONNECTION_TIMED_OUT = 505
412  ERROR_TUNNEL_CONNECTION_FAILED = 403
413  ERROR_SERVICE_UNAVAILABLE = 503
414  ERROR_CONFLICT = 409
415  ERROR_EMPTY_RESPONSE = 13
416 
417  LOG_MESSAGE_RENDERRER_TIMEOUT = 'Timed out receiving message from renderer'
418  LOG_MESSAGE_SERVER_RESPONSE_503 = 'server responded with a status of 503'
419  LOG_MESSAGE_SERVER_RESPONSE_409 = 'server responded with a status of 409 (Conflict)'
420 
421  CHROME_PROCESS_NAMES = ['chrome', 'BrowserBlocking']
422  CHROME_DIRS_TEMPLATE = '.google.Chrome.'
423  CHROME_DEBUG_LOG_NAME = 'chrome_debug.log'
424 
425  MACRO_RESULT_TYPE_DEFAULT = 0
426  MACRO_RESULT_TYPE_URLS_LIST = 1
427  MACRO_RESULT_TYPE_CONTENT = 2
428  MACRO_RESULT_TYPE_AUTO = 3
429 
430  TMP_DIR_TYPE_OPEN = 0
431  TMP_DIR_TYPE_INSTANTIATE = 1
432 
433 
434 
435  # initialize
436  #
437  def __init__(self, tmpDirOptions=None, log=None):
438  super(SeleniumFetcher, self).__init__()
439 
440  if log is not None:
441  self.logger = log
442 
443  if self.logger:
444  self.logger.debug("Initialization of instance, tmpDirOptions: %s", str(tmpDirOptions))
445 
446  self.tmpDirPath = '/tmp'
447  self.tmpDirPrefix = 'dfetcher_tmp_%PID%'
448  self.tmpDirSuffix = ''
451  if tmpDirOptions is not None:
452  if 'path' in tmpDirOptions:
453  self.tmpDirPath = tmpDirOptions['path']
454  if 'prefix' in tmpDirOptions:
455  self.tmpDirPrefix = tmpDirOptions['prefix']
456  if 'suffix' in tmpDirOptions:
457  self.tmpDirSuffix = tmpDirOptions['suffix']
458  if 'type' in tmpDirOptions:
459  self.tmpDirType = int(tmpDirOptions['type'])
460  if 'remove_before_create' in tmpDirOptions:
461  self.tmpDirRemoveBeforeCreate = bool(int(tmpDirOptions['remove_before_create']))
462  pid = str(os.getpid()).strip()
463 
464  if self.tmpDirPath == '' and self.tmpDirPrefix == '' and self.tmpDirSuffix == '':
465  self.tmpDir = ''
466  else:
467  self.tmpDir = self.tmpDirPath + '/' + self.tmpDirPrefix.replace('%PID%', pid) + \
468  self.tmpDirSuffix.replace('%PID%', pid)
469  if self.tmpDirType == self.TMP_DIR_TYPE_INSTANTIATE:
470  if not self.initializeTmpDirs(None):
471  msg = 'Temporary directory type INSTANTIATE `%s` initialization error!', self.tmpDir
472  if self.logger is not None:
473  self.logger.error(msg)
474  raise SeleniumFetcherException(msg)
475  else:
476  if self.logger is not None:
477  self.logger.debug("Temporary directory type INSTANTIATE `%s` initialized!", self.tmpDir)
478  self.driver = None
479  self.driverPid = 0
481  self.sessionId = '--sessionId=' + str(pid)
482  self.userDataDirUsed = ''
483 
484 
485  # delete
486  #
487  def __del__(self):
488  if self.logger:
489  self.logger.debug("Delete instance, temporary dir type: %s", str(self.tmpDirType))
490 
491  if self.tmpDirType == self.TMP_DIR_TYPE_INSTANTIATE:
492  self.removeTmpDirs()
493 
494 
495  # #fetch a url, and return the response
496  #
497  # @param url, the url to fetch
498  # @param method, fetch HTTP method
499  # @param headers, request headers dict
500  # @param timeout, request timeout(seconds)
501  # @param allow_redirects, should follow redirect
502  # @param proxies, proxy setting
503  # @param auth, basic auth setting
504  # @param data, post data, used only when method is post
505  # @param logger
506  # @param allowed_content_types
507  # @param max_resource_size
508  # @param max_redirects
509  # @param executable_path path and file name of the driver binary executable
510  # @return Response object
511  def open(self,
512  url,
513  method='get',
514  headers=None,
515  timeout=DEFAUIL_TIMEOUT,
516  allow_redirects=True,
517  proxies=None,
518  auth=None,
519  data=None,
520  log=None,
521  allowed_content_types=None,
522  max_resource_size=None,
523  max_redirects=1,
524  filters=None,
525  executable_path=None,
526  depth=None,
527  macro=None):
528 
529  if log is not None:
530  self.logger = log
531 
532  if self.logger is not None:
533  self.logger.debug("Dynamic fetcher call:\nurl:" + str(url) + \
534  "\nmethod:" + str(method) + "\nheaders:" + str(headers) + "\ntimeout:" + str(timeout) + \
535  "\nallow_redirects:" + str(allow_redirects) + "\nproxies:" + str(proxies) + "\nauth:" + \
536  str(auth) + "\ndata:" + str(data) + "\nlogger:" + str(self.logger) + \
537  "\nallowed_content_types:" + str(allowed_content_types) + "\nmax_resource_size:" + \
538  str(max_resource_size) + "\nmax_redirects:" + str(max_redirects) + "\nexecutable_path:" + \
539  str(executable_path) + "\ncur_dir:" + str(os.getcwd()) + "\nmacro:" + str(macro))
540 
541  t1 = 0
542  if isinstance(timeout, tuple):
543  t = int(timeout[0])
544  if isinstance(timeout[0], float):
545  t1 = int(str(timeout[0]).strip()[str(timeout[0]).strip().find('.') + 1:])
546  else:
547  t = int(timeout)
548  if isinstance(timeout, float):
549  t1 = int(str(timeout).strip()[str(timeout).strip().find('.') + 1:])
550  if self.logger is not None:
551  self.logger.debug("Execution timeout: %s, damping timeout: %s", str(t), str(t1))
552  if t1 >= t:
553  msg = "Execution timeout: %s less or equal than damping timeout: %s, aborted" % (str(t), str(t1))
554  if self.logger is not None:
555  self.logger.error(msg)
556  raise SeleniumFetcherException(msg)
557 
558  if self.tmpDirType == self.TMP_DIR_TYPE_OPEN:
559  if not self.initializeTmpDirs(headers):
560  msg = 'Temporary directory type OPEN `%s` initialization error!' % self.tmpDir
561  if self.logger is not None:
562  self.logger.error(msg)
563  raise SeleniumFetcherException(msg)
564  else:
565  if self.logger is not None:
566  self.logger.debug('Temporary directory type OPEN `%s` initialized', self.tmpDir)
567 
568  from app.Utils import executeWithTimeout
569  try:
570  ret = executeWithTimeout(func=self.openT, args=(url, headers, t1, proxies, executable_path, macro,),
571  timeout=t, log=self.logger)
572  if ret is None:
573  if self.logger is not None:
574  msg = 'Execution timeout: ' + str(t) + ' reached!'
575  self.logger.error(msg)
576  raise SeleniumFetcherException(msg, APP_CONSTS.ERROR_FETCH_TIMEOUT)
577  except SeleniumFetcherException, err:
578  if self.logger is not None:
579  self.logger.error("Error SeleniumFetcherException: %s", str(err))
580  self.cleanup(1, headers)
581  raise err
582  except Exception, err:
583  if self.logger is not None:
584  msg = 'Execution with timeout error:' + str(err)
585  self.logger.error(msg)
586  self.cleanup(1, headers)
587  raise SeleniumFetcherException(msg)
588  finally:
589  self.cleanup(0, headers)
590 
591  if self.logger is not None:
592  self.logger.debug("Dynamic fetcher call finished normally.")
593 
594  return ret
595 
596 
597 
598  # #Called by open() method with timeout of execution
599  #
600  # @param url
601  # @param headers
602  # @param timeout
603  # @param proxies
604  # @param executable_path
605  # @macro
606  # @return Response object
607  def openT(self, url, headers, timeout, proxies, executable_path, macro):
608  startTime = time.time()
609  inlineMacro = ''
610 
611  try:
612  # Prepare inline macro
613  if self.inlineURLMacroDelimiter in url:
614  t = url.split(self.inlineURLMacroDelimiter)
615  url = t[0]
616  inlineMacro = t[1]
617  # Dependent import
618  try:
619  from selenium import webdriver
620  import selenium.webdriver.support.ui # pylint: disable=W0611
621  except Exception, err:
622  msg = 'Selenium module import error: ' + str(err)
623  if self.logger is not None:
624  self.logger.error(msg)
625  raise SeleniumFetcherException(msg)
626 
627  if self.logger is not None:
628  # One way
629  from selenium.webdriver.remote.remote_connection import LOGGER as seleniumLogger
630  seleniumLogger.setLevel(self.logger.getEffectiveLevel())
631  # Second way
632  selenium_logger = logging.getLogger('selenium.webdriver.remote.remote_connection')
633  # Only display possible problems
634  # selenium_logger.setLevel(logging.WARNING)
635  selenium_logger.setLevel(self.logger.getEffectiveLevel())
636 
637  # Initialize defaults
638  exec_path = "./"
639  driver_name = "chromedriver"
640  error_msg = ""
641  error_code = 0
642  error_code_macro = 0
643  page_source_macro = None
644  content_type_macro = None
645  result_type_macro = self.MACRO_RESULT_TYPE_DEFAULT
646  fatalErrors = [self.ERROR_FATAL, self.ERROR_GENERAL, self.ERROR_NAME_NOT_RESOLVED, self.ERROR_TOO_MANY_REDIRECTS,
649 
650  # Check environment
651  # TODO: add dependecy argument pass, now reduced and hardcoded
652  checkEnv = True
653  if checkEnv:
654  # envVars = {"DISPLAY": "", "LC_ALL":"en_US.UTF-8", "LANG":"en_US.UTF-8", "LANGUAGE":"en_US.UTF-8"}
655  envVars = {"DISPLAY": "", "LANG":"en_US.UTF-8"}
656  for varName in envVars:
657  v = os.getenv(varName, "")
658  if varName == "DISPLAY":
659  if v == "":
660  raise SeleniumFetcherException("Environment variable 'DISPLAY' is not set!")
661  else:
662  if v != envVars[varName]:
663  raise SeleniumFetcherException("Environment variable '" + varName + "' value expected:'" + \
664  envVars[varName] + "', got from os: '" + v + "'; all env: " + \
665  str(os.environ))
666 
667  # Create driver instance
668  try:
669  # get chrome options
670  chrome_option = self.getOptions(webdriver, headers, proxies, url)
671 
672  # The platform-dependent path to the driver executable
673  if executable_path is None:
674  path = exec_path + driver_name + str(ctypes.sizeof(ctypes.c_voidp) * 8)
675  else:
676  path = executable_path
677  if self.logger is not None:
678  self.logger.debug("Chrome driver executable path: %s, options: %s", str(path), str(chrome_option.arguments))
679  from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
680  # enable browser logging
681  d = DesiredCapabilities.CHROME
682  d['loggingPrefs'] = {'browser':'ALL'}
683  # Get driver
684  self.driver = webdriver.Chrome(executable_path=path, chrome_options=chrome_option, desired_capabilities=d)
685  self.driverPid = self.driver.service.process.pid
686  if self.logger:
687  self.logger.debug("Driver pid: " + str(self.driverPid))
688  except Exception, err:
689  error_msg = 'Driver initialization error: ' + str(err)
690  error_code = self.ERROR_FATAL
691  except: # pylint: disable=W0702
692  error_msg = 'General driver initialization!'
693  error_code = self.ERROR_GENERAL
694 
695  if error_code > 0:
696  if self.logger is not None:
697  self.logger.error('Fatal error: ' + error_msg)
698  raise SeleniumFetcherException(error_msg)
699 
700  # Make request
701  try:
702  # driver.set_page_load_timeout(timeout * 10)
703  # driver.set_script_timeout(timeout * 10)
704  # driver.implicitly_wait(timeout * 10)
705 
706  if self.logger is not None:
707  self.logger.debug("Chrome driver get url: `%s`", str(url))
708  self.driver.get(url)
709  # Get logs
710  log_types = self.driver.log_types
711  if 'browser' in log_types:
712  log_list = self.driver.get_log('browser')
713  if self.logger is not None:
714  self.logger.debug("Driver logs: " + str(log_list))
715  for item_dict in log_list:
716  if self.logger is not None:
717  self.logger.debug("Driver message: `%s`", str(item_dict["message"]))
718  if "message" in item_dict and ((url + ' ') in item_dict["message"] or (url + '/ ') in item_dict["message"]):
719  error_msg += item_dict["message"] + " | "
720  else:
721  if self.logger is not None:
722  self.logger.error("No driver logs!")
723  if error_msg != "":
724  entrances = [
725  (r"(.*)net::ERR_NAME_NOT_RESOLVED(.*)", self.ERROR_NAME_NOT_RESOLVED),
726  (r"(.*)net::ERR_TOO_MANY_REDIRECTS(.*)", self.ERROR_TOO_MANY_REDIRECTS),
727  (r"(.*)ERR_PROXY_CONNECTION_FAILED(.*)", self.ERROR_PROXY_CONNECTION_FAILED),
728  (r"(.*)net::ERR_CONNECTION_TIMED_OUT(.*)", self.ERROR_CONNECTION_TIMED_OUT),
729  (r"(.*)net::ERR_TUNNEL_CONNECTION_FAILED(.*)", self.ERROR_TUNNEL_CONNECTION_FAILED),
730  (r"(.*)net::ERR_CONNECTION_RESET(.*)", self.ERROR_TUNNEL_CONNECTION_FAILED),
731  (r"(.*)net::ERR_INVALID_URL(.*)", self.ERROR_TUNNEL_CONNECTION_FAILED),
732  (r"(.*)net::ERR_EMPTY_RESPONSE(.*)", self.ERROR_EMPTY_RESPONSE),
733  (r"(.*)" + self.LOG_MESSAGE_RENDERRER_TIMEOUT + r"(.*)", self.ERROR_CONNECTION_TIMED_OUT),
734  (r"(.*)" + self.LOG_MESSAGE_SERVER_RESPONSE_503 + r"(.*)", self.ERROR_SERVICE_UNAVAILABLE),
735  (r"(.*)" + self.LOG_MESSAGE_SERVER_RESPONSE_409 + r"(.*)", self.ERROR_CONFLICT),
736  (r"(.*)403 \(Forbidden\)(.*)", 403),
737  (r"(.*)404 \(Not Found\)(.*)", 404),
738  (r"(.*)500 \(Internal Server Error\)(.*)", 500),
739  (r"(.*)net::(.*)", 520)]
740  for item in entrances:
741  regex = re.compile(item[0])
742  r = regex.search(error_msg)
743  if r:
744  error_code = item[1]
745  if self.logger is not None:
746  self.logger.debug("Page error: " + error_msg)
747  break
748  if error_code not in fatalErrors and inlineMacro != '':
749  if self.logger is not None:
750  self.logger.debug("Execute inline macro: %s", str(inlineMacro))
751  macroResults, errorCode, errorMsg = self.execMacroSimple([inlineMacro])
752  if error_code not in fatalErrors and macro is not None:
753  if self.logger is not None:
754  self.logger.debug("Execute macro: %s", str(macro))
755  if isinstance(macro, list):
756  macroResults, errorCode, errorMsg = self.execMacroSimple(macro)
757  else:
758  macroResults, errorCode, errorMsg, content_type_macro, result_type_macro = self.execMacroExtended(macro)
759  if errorCode > 0:
760  error_code_macro |= APP_CONSTS.ERROR_MACRO
761  error_msg = errorMsg
762  if len(macroResults) > 0:
763  if result_type_macro == self.MACRO_RESULT_TYPE_CONTENT:
764  page_source_macro = macroResults
765  else:
766  page_source_macro = json.dumps(macroResults, ensure_ascii=False) # pylint: disable=R0204
767  except Exception, err:
768  error_msg = 'Driver error: ' + str(err) + '; logs: ' + self.getAllLogsAsString()
769  error_code = self.ERROR_FATAL
770  except: # pylint: disable=W0702
771  error_msg = "General driver usage error!"
772  error_code = self.ERROR_GENERAL
773 
774  if error_code == 0:
775  if timeout > 0:
776  if self.logger is not None:
777  self.logger.debug("Wait on damping timeout to load all dynamic parts of the page: %s sec", str(timeout))
778  # Wait fixed time to load all dynamic parts of the page
779  time.sleep(timeout)
780  elif error_code in fatalErrors:
781  if self.logger is not None:
782  self.logger.debug("Fatal error, code: %s, msg: %s", str(error_code), error_msg)
783  if error_code == self.ERROR_NAME_NOT_RESOLVED:
784  code = APP_CONSTS.ERROR_FETCH_INVALID_URL
785  elif error_code == self.ERROR_TOO_MANY_REDIRECTS:
786  code = APP_CONSTS.ERROR_FETCH_TOO_MANY_REDIRECTS
787  elif error_code == self.ERROR_PROXY_CONNECTION_FAILED:
788  code = APP_CONSTS.ERROR_FETCH_CONNECTION_ERROR
789  elif error_code == self.ERROR_CONNECTION_TIMED_OUT:
790  code = APP_CONSTS.ERROR_FETCH_CONNECTION_TIMEOUT
791  elif error_code == self.ERROR_TUNNEL_CONNECTION_FAILED:
792  code = APP_CONSTS.ERROR_FETCH_FORBIDDEN
793  elif error_code == self.ERROR_EMPTY_RESPONSE:
794  code = APP_CONSTS.ERROR_EMPTY_RESPONSE
795  elif error_code == self.ERROR_SERVICE_UNAVAILABLE:
796  code = APP_CONSTS.ERROR_FETCH_FORBIDDEN
797  elif error_code == self.ERROR_CONFLICT:
798  code = APP_CONSTS.ERROR_FETCH_HTTP_ERROR
799  else:
800  code = APP_CONSTS.ERROR_FETCHER_INTERNAL
801  # self.cleanup(driver)
802  raise SeleniumFetcherException(error_msg, code)
803 
804  page_source = ""
805  cookies = {}
806  try:
807  page_source = self.driver.page_source
808  cookies = self.driver.get_cookies()
809  except Exception, err:
810  error_msg = str(err)
811  error_code = self.ERROR_CONTENT_OR_COOKIE
812  except: # pylint: disable=W0702
813  error_msg = "Content and cookies get error!"
814  error_code = self.ERROR_CONTENT_OR_COOKIE
815 
816  content_type = None
817  charset = None
818  try:
819  attr = self.driver.find_element_by_xpath(".//meta[translate(@http-equiv,'ABCDEFGHIJKLMNOPQRSTUVWXYZ','abcdefghijklmnopqrstuvwxyz')='content-type']").get_attribute("content") # pylint: disable=C0301
820  regex = re.compile(r"(.*); charset=(.*)", re.IGNORECASE)
821  items = regex.search(attr)
822  if items is not None:
823  items = items.groups()
824  if len(items) > 1:
825  content_type = items[0]
826  charset = items[1]
827  except Exception, err:
828  pass
829  if content_type is None:
830  try:
831  attr = self.driver.find_element_by_xpath('//html')
832  content_type = self.CONTENT_TYPE_HTML
833  except Exception, err:
834  pass
835  if content_type is not None and charset is None:
836  try:
837  charset = self.driver.find_element_by_xpath('//meta[@charset]').get_attribute("charset")
838  except Exception, err:
839  pass
840  if charset is None:
841  try:
842  charset = self.driver.execute_script("return document.characterSet;")
843  except Exception, err:
844  if self.logger is not None:
845  self.logger.debug("Charset detection error: %s", str(err))
846 
847  try:
848  current_url = self.driver.current_url
849  except Exception, err:
850  current_url = url
851  if self.logger is not None:
852  self.logger.debug("Get 'current_url' error: %s, input url assumed: %s", str(err), str(url))
853 
854  # if self.LOG_MESSAGE_RENDERRER_TIMEOUT in error_msg:
855  # self.cleanup(driver)
856  # raise SeleniumFetcherException(error_msg, APP_CONSTS.ERROR_FETCH_CONNECTION_TIMEOUT)
857 
858  try:
859  res = Response()
860  res.url = current_url
861  if error_code > 100 or error_code == self.ERROR_FATAL:
862  res.status_code = error_code
863  else:
864  res.status_code = 200
865  res.redirects = []
866  if page_source_macro is None:
867  res.unicode_content = page_source
868  else:
869  res.unicode_content = page_source_macro
870  res.str_content = res.unicode_content
871  res.rendered_unicode_content = res.unicode_content
872  res.content_size = len(res.unicode_content)
873  res.encoding = charset
874  res.headers = {'content-length': res.content_size}
875  if page_source_macro is not None:
876  if content_type_macro is not None:
877  content_type = content_type_macro
878  else:
879  content_type = self.CONTENT_TYPE_JSON
880  if content_type is not None:
881  res.headers['content-type'] = content_type
882  if current_url != url:
883  res.headers['location'] = current_url
884  res.meta_res = res.unicode_content
885  res.cookies = cookies
886  res.dynamic_fetcher_type = driver_name
887  res.dynamic_fetcher_result_type = result_type_macro
888  if error_code_macro != APP_CONSTS.ERROR_OK:
889  res.error_mask |= error_code_macro
890  res.time = time.time() - startTime
891  res.request = {'headers':headers} # # alexv
892  res.error_msg = error_msg
893  except Exception, err:
894  msg = 'Response fill error: ' + str(err)
895  if self.logger is not None:
896  self.logger.error(msg)
897  raise SeleniumFetcherException(msg)
898 
899  if self.logger is not None and error_msg != "":
900  self.logger.debug("Dynamic fetcher none fatal error: " + error_msg)
901 
902  return res
903 
904  except Exception, err:
905  msg = 'Unrecognized dynamic fetcher error: ' + str(err)
906  if self.logger is not None:
907  self.logger.error(msg)
908  raise SeleniumFetcherException(msg)
909 
910 
911  # #Finish and close all dependencies, quit driver, remove temporary directory
912  #
913  # @param state: 0 - normal finish, 1 - after error
914  # @param headers
915  def cleanup(self, state=0, headers=None):
916  if self.logger is not None and '--log-chrome-debug-log' in headers:
917  logFile = self.userDataDirUsed + '/' + self.CHROME_DEBUG_LOG_NAME
918  try:
919  with open(logFile, 'r') as f:
920  logData = f.read()
921  self.logger.debug("Chrome debug log file `%s`:\n%s", logFile, logData)
922  except Exception, err:
923  self.logger.debug("Error read chrome debug log file `%s`: %s", logFile, str(err))
924 
925  if self.logger is not None:
926  self.logger.debug("Cleanup type: %s, driver: %s", str(state), str(self.driver))
927 
928  try:
929  if self.driver is not None:
930  self.driver.quit()
931  time.sleep(self.DELAY_TERMINATE_AND_QUIT)
932  except Exception:
933  pass
934 
935  if state == 1:
936  time.sleep(self.DELAY_TERMINATE_AND_QUIT)
937  try:
938  if self.driver is not None:
939  self.driver.quit()
940  except Exception:
941  pass
942  time.sleep(self.DELAY_TERMINATE_AND_QUIT)
943  try:
944  if self.logger:
945  self.logger.debug("Driver pid: " + str(self.driverPid))
946  self.killProcess(self.driverPid)
947  except Exception:
948  if self.logger:
949  self.logger.debug("Error kill driver pid: %s", str(self.driverPid))
950 
951  self.chromeProcessesCleanup(headers)
952 
953  if self.tmpDirType == self.TMP_DIR_TYPE_OPEN:
955 
956 
957  # #Chrome processes cleanup
958  #
959  # @param headers
  def chromeProcessesCleanup(self, headers):
    """Find and clean up chrome processes that belong to this fetcher instance.

    A process is considered ours when its name contains one of
    self.CHROME_PROCESS_NAMES and its command line contains the instance
    marker (session id, tmp dir, or a directory option from headers).

    @param headers: request headers dict, consulted for the directory options
    """
    if self.logger:
      self.logger.debug("Chrome processes cleanup started")

    # Build the command-line marker that identifies this instance's chrome
    # processes: session id wins, then the tmp dir, then directory headers.
    if self.sessionId != '':
      key = self.sessionId
    else:
      if self.tmpDir == '':
        # NOTE(review): if headers is None or contains none of these keys,
        # `key` stays unassigned and the loop below raises NameError --
        # confirm callers always provide one of them in this configuration.
        if '--disk-cache-dir' in headers:
          key = '--disk-cache-dir=' + headers['--disk-cache-dir']
        if '--profile-directory' in headers:
          key = '--profile-directory=' + headers['--profile-directory']
        if '--user-data-dir' in headers:
          key = '--user-data-dir=' + headers['--user-data-dir']
      else:
        key = self.tmpDir

    try:
      for proc in psutil.process_iter():
        try:
          # if self.logger:
          #   self.logger.debug("Candidate, pid:%s, name: %s cmdline: %s", str(proc.pid), str(proc.name()),
          #                     str(proc.cmdline()))
          for name in self.CHROME_PROCESS_NAMES:
            if name in proc.name():
              found = False
              for item in proc.cmdline():
                if key in item:
                  found = True
                  break
              if found:
                if self.logger:
                  self.logger.debug("Chrome process killing, pid:%s, cmdline: %s", str(proc.pid), str(proc.cmdline()))
                # NOTE(review): the log message says "killing" but no kill call
                # is present here; a `proc.kill()` statement appears to have
                # been lost -- confirm against VCS history.
        except Exception, err:
          # Per-process errors (e.g. the process exited meanwhile) are logged
          # and the scan continues with the next process.
          if self.logger:
            self.logger.debug("Chrome process kill error: %s", str(err))
    except Exception, err:
      if self.logger:
        self.logger.debug("Chrome process kill error: %s", str(err))
1000 
1001 
1002  def killProcess(self, pid, dirsTemplate=CHROME_DIRS_TEMPLATE, dirDeleteBeforeTimeout=DELAY_TERMINATE_AND_QUIT):
1003  del dirsTemplate, dirDeleteBeforeTimeout
1004  try:
1005  if self.logger:
1006  self.logger.debug("Try to Kill process pid: %s", str(pid))
1007  process = psutil.Process(pid)
1008  for proc in process.children(recursive=True):
1009  if self.logger:
1010  self.logger.debug("Killing child process pid: %s", str(proc.pid))
1011  try:
1012  # dirs = self.getProcessDirs(proc, dirsTemplate)
1013  proc.kill()
1014  # if self.logger:
1015  # self.logger.debug("Dirs to remove: %s", str(dirs))
1016  # for d in dirs:
1017  # time.sleep(dirDeleteBeforeTimeout)
1018  # self.removeTmpDirs(d)
1019  # os.kill(pid, signal.SIGKILL)
1020  except Exception, err:
1021  if self.logger:
1022  self.logger.debug("Child process pid: %s kill error: ", str(pid), str(err))
1023  if self.logger:
1024  self.logger.debug("Killing main process pid: %s", str(process.pid))
1025  # dirs = self.getProcessDirs(process, dirsTemplate)
1026  process.kill()
1027  # if self.logger:
1028  # self.logger.debug("Dirs to remove: %s", str(dirs))
1029  # for d in dirs:
1030  # time.sleep(dirDeleteBeforeTimeout)
1031  # self.removeTmpDirs(d)
1032  # os.kill(pid, signal.SIGKILL)
1033  except Exception, err:
1034  if self.logger:
1035  self.logger.debug("Process pid: %s kill error: %s", str(pid), str(err))
1036 
1037 
1038  # #check whether the fetcher have meta resource
1039  #
1040  # @return whether the fetcher have meta resource
1042  return True
1043 
1044 
1045  # #Get list of directories of files opened by process
1046  #
1047  # @param process - from the psutil.Process() or process.children()
1048  # @param dirsTemplate - template string for a path item
1049  # @return options object
1050  def getProcessDirs(self, process, dirsTemplate):
1051  ret = []
1052 
1053  for f in process.open_files():
1054  # if self.logger:
1055  # self.logger.debug("Path candidate: %s", str(f.path))
1056  fp = f.path.split('/')
1057  fpr = ''
1058  templateFound = False
1059  for item in fp:
1060  fpr += '/' + item
1061  if dirsTemplate is not None and dirsTemplate != '' and dirsTemplate in item:
1062  templateFound = True
1063  break
1064  if templateFound or dirsTemplate is None or dirsTemplate == '':
1065  ret.append(fpr)
1066 
1067  return ret
1068 
1069 
1070  # #Create and returns the options object
1071  #
1072  # @param headers dictionary
1073  # @param proxies dictionary
1074  # @param url - requested URL
1075  # @return options object
1076  def getOptions(self, webdriver, headers, proxies, url):
1077  chrome_option = webdriver.ChromeOptions()
1078 
1079  arg_disable_http_cache = "--disable-http-cache"
1080  arg_clear_data_reduction_proxy_data_savings = '--clear-data-reduction-proxy-data-savings'
1081  arg_host_resolver_retry_attempts = '--host-resolver-retry-attempts=0'
1082  arg_start_maximized = '--start-maximized'
1083  if headers is not None and '--use-mobile-user-agent' in headers:
1084  use_mobile_user_agent = '--use-mobile-user-agent'
1085  else:
1086  use_mobile_user_agent = None
1087 
1088  if headers is not None and '--disable-web-security' in headers:
1089  disable_web_security = '--disable-web-security'
1090  else:
1091  disable_web_security = None
1092  # disable_web_security = '--disable-web-security'
1093 
1094  if headers is not None and '--allow-running-insecure-content' in headers:
1095  allow_running_insecure_content = '--allow-running-insecure-content'
1096  else:
1097  allow_running_insecure_content = None
1098  # allow_running_insecure_content = '--allow-running-insecure-content'
1099 
1100  if headers is not None and '--allow-file-access-from-files' in headers:
1101  allow_file_access_from_files = '--allow-file-access-from-files'
1102  else:
1103  allow_file_access_from_files = None
1104  # allow_file_access_from_files = '--allow-file-access-from-files'
1105 
1106  if headers is not None and '--proxy-bypass-list' in headers:
1107  arg_proxy_bypass_list = '--proxy-bypass-list=' + headers['--proxy-bypass-list']
1108  else:
1109  arg_proxy_bypass_list = None
1110 
1111  # if headers is not None and 'User-Agent' in headers and '--user-agent' in headers and\
1112  if headers is not None and 'User-Agent' in headers and\
1113  '--use-mobile-user-agent' not in headers:
1114  arg_user_agent = '--user-agent=' + headers['User-Agent']
1115  else:
1116  arg_user_agent = None
1117  if '--disk-cache-dir' in headers:
1118  if os.path.isdir(headers['--disk-cache-dir']):
1119  arg_disk_cache_dir = '--disk-cache-dir=' + headers['--disk-cache-dir']
1120  else:
1121  if self.logger:
1122  self.logger.debug("Header `--disk-cache-dir` directory: `%s` not found!", headers['--disk-cache-dir'])
1123  else:
1124  arg_disk_cache_dir = None
1125  if '--profile-directory' in headers:
1126  if os.path.isdir(headers['--profile-directory']):
1127  arg_profile_directory = '--profile-directory=' + headers['--profile-directory']
1128  else:
1129  if self.logger:
1130  self.logger.debug("Header `--profile-directory` directory: `%s` not found!", headers['--profile-directory'])
1131  else:
1132  arg_profile_directory = None
1133  if '--user-data-dir' in headers:
1134  if os.path.isdir(headers['--user-data-dir']):
1135  arg_user_data_dir = '--user-data-dir=' + headers['--user-data-dir']
1136  self.userDataDirUsed = headers['--user-data-dir']
1137  else:
1138  if self.logger:
1139  self.logger.debug("Header `--user-data-dir` directory: `%s` not found!", headers['--user-data-dir'])
1140  else:
1141  if self.tmpDir != '':
1142  arg_user_data_dir = '--user-data-dir=' + self.tmpDir
1143  self.userDataDirUsed = self.tmpDir
1144  else:
1145  arg_user_data_dir = None
1146  if self.logger:
1147  self.logger.error("Empty tmp dir configured!")
1148 
1149  if self.userDataDirUsed != '' and not os.path.isdir(self.userDataDirUsed):
1150  if self.logger:
1151  self.logger.debug("Profile archive user data dir `%s` not found, trying to create...",
1152  str(self.userDataDirUsed))
1153  try:
1154  os.makedirs(self.userDataDirUsed)
1155  except Exception, err:
1156  if self.logger:
1157  self.logger.debug("Profile archive user data dir creation error: %s", str(err))
1158  if os.path.isdir(self.userDataDirUsed):
1159  if self.logger:
1160  self.logger.debug("Profile archive user data dir `%s` created", str(self.userDataDirUsed))
1161 
1162  if '--user-data-dir-zip' in headers and self.userDataDirUsed != '' and os.path.isdir(self.userDataDirUsed):
1163  try:
1164  profiles = [p.strip() for p in headers['--user-data-dir-zip'].split(',') if p.strip() != '']
1165  if '--user-data-dir-zip-rotation' in headers and headers['--user-data-dir-zip-rotation'] is not None and\
1166  headers['--user-data-dir-zip-rotation'] != '':
1167  rotationType = int(headers['--user-data-dir-zip'])
1168  else:
1169  rotationType = 0
1170  profileIndex = 0
1171  if len(profiles) > 1:
1172  if rotationType == 0:
1173  r = [randint(0, len(profiles) - 1) for p in range(0, len(profiles) - 1)]
1174  profileIndex = r[0]
1175  elif rotationType == 1:
1176  pass
1177  elif rotationType == 2:
1178  pass
1179 # os.system('unzip -qq ' + profiles[profileIndex] + ' -d ' + self.userDataDirUsed)
1180 # os.system('mv ' + self.userDataDirUsed + '/' + \
1181 # os.path.splitext(os.path.basename(profiles[profileIndex]))[0] + \
1182 # '/* ' + self.userDataDirUsed)
1183 
1184  res = Utils.executeCommand('unzip -qq ' + profiles[profileIndex] + ' -d ' + self.userDataDirUsed)
1185  if res.exitCode != APP_CONSTS.EXIT_SUCCESS:
1186  raise Exception(str(res.stderr))
1187 
1188  res = Utils.executeCommand('mv ' + self.userDataDirUsed + '/' + \
1189  os.path.splitext(os.path.basename(profiles[profileIndex]))[0] + \
1190  '/* ' + self.userDataDirUsed)
1191  if res.exitCode != APP_CONSTS.EXIT_SUCCESS:
1192  raise Exception(str(res.stderr))
1193 
1194  if self.logger:
1195  self.logger.debug("Profile archive `%s` extracted to `%s` directory, rotation: %s",
1196  profiles[profileIndex], self.userDataDirUsed, str(rotationType))
1197  except Exception, err:
1198 # if self.logger:
1199 # self.logger.error("Profile archive extraction error: %s", str(err))
1200  raise Exception("Profile archive extraction error: %s" % str(err))
1201 
1202  else:
1203  d = {'--user-data-dir-zip in headers':str('--user-data-dir-zip' in headers),
1204  'self.userDataDirUsed':self.userDataDirUsed,
1205  'os.path.isdir(self.userDataDirUsed)':str(os.path.isdir(self.userDataDirUsed))}
1206  if self.logger:
1207  self.logger.debug("Profile archive not used, condition data:\n%s", str(d))
1208 
1209  arg_dns_prefetch_disable = '--dns-prefetch-disable'
1210  # --disk-cache-size=1
1211  # --media-cache-size=1
1212  # --safe-plugins
1213 
1214  if headers is not None and '--disable-setuid-sandbox' in headers:
1215  chrome_option.add_argument('--disable-setuid-sandbox')
1216 
1217  if headers is not None and '--no-sandbox' in headers:
1218  chrome_option.add_argument('--no-sandbox')
1219 
1220  if headers is not None and '--incognito' in headers:
1221  chrome_option.add_argument('--incognito')
1222 
1223  # chrome_option.add_argument('--enable-logging')
1224  # chrome_option.add_argument('--v=1')
1225  # chrome_option.add_argument('--log-level=0')
1226 
1227  if arg_user_agent is not None:
1228  chrome_option.add_argument(arg_user_agent)
1229  if use_mobile_user_agent is not None:
1230  chrome_option.add_argument(use_mobile_user_agent)
1231  if disable_web_security is not None:
1232  chrome_option.add_argument(disable_web_security)
1233  if allow_running_insecure_content is not None:
1234  chrome_option.add_argument(allow_running_insecure_content)
1235  if allow_file_access_from_files is not None:
1236  chrome_option.add_argument(allow_file_access_from_files)
1237  if arg_proxy_bypass_list is not None:
1238  chrome_option.add_argument(arg_proxy_bypass_list)
1239  chrome_option.add_argument(arg_disable_http_cache)
1240  chrome_option.add_argument(arg_clear_data_reduction_proxy_data_savings)
1241  chrome_option.add_argument(arg_host_resolver_retry_attempts)
1242  chrome_option.add_argument(arg_start_maximized)
1243  if arg_disk_cache_dir is not None and arg_disk_cache_dir != '':
1244  chrome_option.add_argument(arg_disk_cache_dir)
1245  if arg_profile_directory is not None and arg_profile_directory != '':
1246  chrome_option.add_argument(arg_profile_directory)
1247  if arg_user_data_dir is not None and arg_user_data_dir != '':
1248  chrome_option.add_argument(arg_user_data_dir)
1249  chrome_option.add_argument(arg_dns_prefetch_disable)
1250  # chrome_option.add_argument(arg_incognito)
1251 
1252  if self.sessionId != '':
1253  chrome_option.add_argument(self.sessionId)
1254 
1255  # Proxy options
1256  if proxies is not None:
1257  proxy_type, proxy_host, proxy_port, proxy_user, proxy_passwd = proxies
1258  if self.logger:
1259  self.logger.debug("Proxy used from argument tuple: %s", str(proxies))
1260  if proxy_user:
1261  proxies = proxy_type + "://%s:%s@%s:%s" % (proxy_user, proxy_passwd, proxy_host, proxy_port)
1262  else:
1263  proxies = proxy_type + "://%s:%s" % (proxy_host, proxy_port)
1264  chrome_option.add_argument("--proxy-server=" + proxies)
1265  else:
1266  if '--proxy-http' in headers and headers['--proxy-http'] is not None and headers['--proxy-http'] != '':
1267  if '--proxy-http-domains' in headers and headers['--proxy-http-domains'] is not None and\
1268  headers['--proxy-http-domains'] != '':
1269  dn = self.getDomainNameFromURL(url)
1270  domain = bool(dn in headers['--proxy-http-domains'].split(','))
1271  if self.logger and domain is False:
1272  self.logger.debug("Proxy not used because domain `%s` not listed in `--proxy-http-domains`", str(dn))
1273  else:
1274  domain = True
1275  if domain:
1276  p = headers['--proxy-http'].replace('%3A', ':')
1277  if self.logger:
1278  self.logger.debug("Proxy used from header: %s", str(p))
1279  chrome_option.add_argument("--proxy-server=" + p)
1280 
1281  return chrome_option
1282 
1283 
1284  # #Initializes tmp dir for browser data from headers or default /tmp/dfetcher_tmp_<pid>
1285  #
1286  # @param macro structure object
1287  # @param driver object
1288  # @return True if directories are created or False if error
1289  def initializeTmpDirs(self, headers):
1290  ret = True
1291 
1292  if self.tmpDir != '':
1293  try:
1294  if headers is not None and 'tmp-dir' in headers:
1295  self.tmpDir = headers['tmp-dir']
1296  if self.tmpDirRemoveBeforeCreate:
1297  self.removeTmpDirs()
1298  if not os.path.isdir(self.tmpDir):
1299  if logger is not None:
1300  self.logger.debug("Create temporary directory: %s", str(self.tmpDir))
1301  os.makedirs(self.tmpDir)
1302  except Exception, err:
1303  if self.logger is not None:
1304  ret = False
1305  if logger is not None:
1306  self.logger.debug("Error temporary directories initialization: %s", str(err))
1307 
1308  if os.path.isdir(self.tmpDir):
1309  ret = True
1310 
1311  return ret
1312 
1313 
1314  # #Remove tmp dir
1315  #
1316  def removeTmpDirs(self, delay=DELAY_TERMINATE_AND_QUIT, tries=3):
1317  if self.tmpDir != '':
1318  for i in xrange(1, tries):
1319  try:
1320  time.sleep(delay)
1321  if os.path.isdir(self.tmpDir):
1322  if self.logger is not None:
1323  self.logger.debug("Removing tmp dir: %s, try: %s", self.tmpDir, str(i))
1324  shutil.rmtree(self.tmpDir)
1325  else:
1326  break
1327  except Exception, err:
1328  if self.logger is not None:
1329  self.logger.debug("Remove tmp dir: %s, try: %s, error: %s", self.tmpDir, str(i), str(err))
1330 
1331 
1332  # #Execute macro with simple object structure
1333  #
1334  # @param macro structure object
1335  # @return result dict()
  def execMacroSimple(self, macro):
    """Execute a flat list of macro items against self.driver.

    Item forms, as parsed below:
      * digits-only string - sleep that many seconds;
      * '!'-prefixed item - blocking iterative macro '[delay[:maxIter]]:code',
        re-executed up to maxIter times until it returns True;
      * code starting with 'http(s)://' or 'file://' - the script body is
        loaded from that location first ('%PID%' replaced by os.getpid());
      * anything else - JavaScript executed via driver.execute_script().

    @param macro: list of macro item strings
    @return: tuple (macroResults, error_code, error_msg)
    """
    macroResults = []
    error_code = 0
    error_msg = ''
    macroCounter = 0
    # Limit for macro text echoed into the debug log.
    maxLenToLog = 512

    for m in macro:
      if self.logger is not None:
        self.logger.debug("Macro #%s in set of %s items:\n%s...",
                          str(macroCounter), str(len(macro)), str(m)[:maxLenToLog])
      macroCounter += 1
      if m.isdigit():
        # A digits-only item is a plain sleep, not executable code.
        if self.logger is not None:
          self.logger.debug("Macro sleep: %s sec", str(m))
        time.sleep(int(m))
      else:
        # iType 0 - regular macro, executed once; iType 1 - blocking iterative.
        iType = 0
        iDelay = 0
        iMaxIterations = 1
        if m.startswith('!'):
          # Blocking iterative form: strip '!' and split 'delay:maxIter:code'.
          m = m[1:]
          iType = 1
          params = m.split(':')
          if len(params) > 2:
            iDelay = int(params[0])
            iMaxIterations = int(params[1])
            m = params[2]
          elif len(params) > 1:
            iDelay = int(params[0])
            m = params[1]
          elif len(params) == 1:
            m = params[0]
          if self.logger is not None:
            self.logger.debug("Macro blocking iterative, delay: %s, max ierations: %s",
                              str(iDelay), str(iMaxIterations))
        for i in xrange(0, iMaxIterations):
          if iType == 1:
            if self.logger is not None:
              self.logger.debug("Macro blocking iteration: %s of: %s", str(i + 1), str(iMaxIterations))
          if m.startswith('http://') or m.startswith('https://') or m.startswith('file://'):
            # The item is a location of the script body - load it, replacing
            # the '%PID%' placeholder with the current process id.
            try:
              if m.startswith('file://'):
                with open(m[7:].replace('%PID%', str(os.getpid())), 'r') as f:
                  m = f.read()
              else:
                r = requests.get(m.replace('%PID%', str(os.getpid())))
                m = r.text
              if self.logger is not None:
                self.logger.debug("Macro %s bytes loaded:\n%s...", str(len(str(m))), str(m)[:maxLenToLog])
            except Exception, err:
              error_msg = 'Error load macro code, URL: `' + str(m) + '` : ' + str(err)
              error_code = self.ERROR_MACRO_RETURN_VALUE
              if self.logger is not None:
                self.logger.debug(error_msg)
              r = None
              break
          try:
            # Execute the (possibly just loaded) JavaScript in the browser.
            r = self.driver.execute_script(m)
            if self.logger is not None:
              self.logger.debug("Macro returned: %s", json.dumps(r))
          except Exception, err:
            error_msg = 'Error macro execution: ' + str(err) + '; logs: ' + self.getAllLogsAsString()
            error_code = self.ERROR_MACRO_RETURN_VALUE
            if self.logger is not None:
              self.logger.debug(error_msg)
            r = None
            break
          if iType == 0 and r is not None:
            # Regular macro: only string/list/dict results are accepted.
            if isinstance(r, (basestring, list, dict)):
              macroResults.append(r)
              if isinstance(r, (list, dict)):
                if self.logger is not None:
                  self.logger.debug("Macro items returned: %s", str(len(r)))
            else:
              error_msg = 'Error macro result value, type is: ' + str(type(r))
              error_code = self.ERROR_MACRO_RETURN_VALUE
              if self.logger is not None:
                self.logger.debug(error_msg)
              break
          elif iType == 1:
            # Blocking macro: stop iterating as soon as it returns True,
            # otherwise sleep iDelay seconds before the next attempt.
            if r is True:
              if self.logger is not None:
                self.logger.debug("Macro blocking got `True` on iteration: %s, sleeped: %s sec",
                                  str(i + 1), str(int(iDelay) * i))
              break
            elif r is not True and iDelay > 0:
              if self.logger is not None:
                self.logger.debug("Macro blocking iteration: %s sleep on: %s sec", str(i + 1), str(iDelay))
              time.sleep(int(iDelay))
        if iType == 1 and r is not True:
          if self.logger is not None:
            self.logger.debug("Macro blocking finished, but no `True` value returned!")
      # Abort the whole macro list on the first item error.
      if error_code > 0:
        break

    return macroResults, error_code, error_msg
1433 
1434 
1435  # #Get all kind of logs as string
1436  #
1437  # @return string of logs lists
1439  return 'browser: ' + str(self.driver.get_log('browser')) + '; driver: ' + str(self.driver.get_log('driver'))
1440 
1441 
1442  # #Execute macro with extended object structure
1443  #
1444  # @param macro structure object
1445  # @return result dict()
1446  def execMacroExtended(self, macro):
1447  macroResults = []
1448  error_code = 0
1449  error_msg = ''
1450  content_type = None
1451  result_type = self.MACRO_RESULT_TYPE_DEFAULT
1452 
1453  for mset in macro['sets']:
1454  if 'name' not in mset:
1455  mset['name'] = ''
1456  if 'repeat' not in mset:
1457  mset['repeat'] = '1'
1458  if 'delay' not in mset:
1459  mset['delay'] = '0'
1460  if self.logger is not None:
1461  self.logger.debug("Set:\n%s", str(mset))
1462  for i in xrange(0, int(mset['repeat'])):
1463  if int(mset['delay']) > 0:
1464  time.sleep(int(mset['delay']))
1465  if self.logger is not None:
1466  self.logger.debug("Macro %s in set", str(i))
1467  r, error_code, error_msg = self.execMacroSimple(mset['items'])
1468  if error_code > 0:
1469  break
1470  macroResults += r
1471  if error_code > 0:
1472  break
1473 
1474  if 'result_type' in macro:
1475  result_type = int(macro['result_type'])
1476  self.logger.debug("Macro results type: %s", str(result_type))
1477  if 'result_content_type' in macro:
1478  content_type = str(macro['result_content_type'])
1479  self.logger.debug("Macro results content type: %s", str(content_type))
1480 
1481  if result_type == self.MACRO_RESULT_TYPE_AUTO:
1482  self.logger.debug("Macro results before autodetect type: %s", str(macroResults))
1483  for r in macroResults:
1484  if isinstance(r, basestring):
1485  result_type = self.MACRO_RESULT_TYPE_CONTENT
1486  self.logger.debug("Macro results type autodetected as string content")
1487  break
1488  elif isinstance(r, list):
1489  for ri in r:
1490  if isinstance(ri, basestring):
1491  result_type = self.MACRO_RESULT_TYPE_URLS_LIST
1492  self.logger.debug("Macro results type autodetected as URLs list")
1493  break
1494  if result_type == self.MACRO_RESULT_TYPE_CONTENT:
1495  macroResults = ''.join(macroResults) # pylint: disable=R0204
1496  if result_type == self.MACRO_RESULT_TYPE_URLS_LIST:
1497  macroResults = [item for sublist in macroResults for item in sublist]
1498  self.logger.debug("Macro results after autodetect type: %s", str(macroResults))
1499 
1500  return macroResults, error_code, error_msg, content_type, result_type
1501 
1502 
1503 
1504 # # external Fetcher
1505 #
1506 #
1507 
1508 # # urllib Fetcher
1509 #
1510 #
1512  # #fetch a url, and return the response
1513  #
1514  # @param url, the url to fetch
1515  # @param method, fetch HTTP method
1516  # @param headers, request headers dict
1517  # @param timeout, request timeout(seconds)
1518  # @param allow_redirects, should follow redirect
1519  # @param proxies, proxy setting
1520  # @param auth, basic auth setting
1521  # @param data, post data, used only when method is post
1522  # @return Response object
1523  def open(self, url, **kwargs):
1524  import urllib2
1525 
1526  if 'logger' in kwargs['logger']:
1527  log = kwargs['logger']
1528  else:
1529  log = logger
1530  allowed_content_types = kwargs['allowed_content_types']
1531  # max_resource_size = kwargs["max_resource_size"]
1532 
1533  res = Response()
1534  log.debug("url: <%s>", url)
1535  response = None
1536  try:
1537  response = urllib2.urlopen(url)
1538  headers_info = response.info()
1539  if headers_info is not None:
1540  if headers_info.type in allowed_content_types:
1541  if response is not None:
1542  # res.encoding = impl_res.encoding
1543  # res.cookies = requests.utils.dict_from_cookiejar(impl_res.cookies)
1544  res.url = response.geturl()
1545  res.status_code = response.getcode()
1546  content_response = response.read()
1547  res.unicode_content = content_response
1548  res.str_content = content_response
1549  res.rendered_unicode_content = content_response
1550  res.content_size = len(content_response)
1551  headers = {}
1552  headers["content-length"] = res.content_size
1553  headers["content-type"] = headers_info.type
1554  res.headers = headers
1555  history = []
1556  res.redirects = history
1557  else:
1558  log.debug("URLLib return empty response.")
1559  else:
1560  log.debug("Content-Type not allowed. headers_info.type: %s", str(headers_info.type))
1561  else:
1562  log.debug("URLLib info is empty.")
1563  except urllib2.HTTPError, err:
1564  # except Exception, err:
1565  log.debug("Exception <%s>", str(err.code))
1566 
1567  return res
1568 
1569 
1570 
1571 # # external Fetcher
1572 #
1573 #
1575  # #fetch a url, and return the response
1576  #
1577  # @param url, the url to fetch
1578  # @param method, fetch HTTP method
1579  # @param headers, request headers dict
1580  # @param timeout, request timeout(seconds)
1581  # @param allow_redirects, should follow redirect
1582  # @param proxies, proxy setting
1583  # @param auth, basic auth setting
1584  # @param data, post data, used only when method is post
1585  # @return Response object
1586  def open(self, url, **kwargs):
1587  try:
1588  localBuf = base64.b64decode(kwargs["inputContent"])
1589  except TypeError:
1590  localBuf = kwargs["inputContent"]
1591  res = Response()
1592  res.content_size = len(localBuf)
1593  res.headers = {}
1594  res.redirects = []
1595  res.status_code = 200
1596  res.url = url
1597  res.encoding = SimpleCharsetDetector().detect(localBuf)
1598  if res.encoding is None:
1599  res.encoding = "utf-8"
1600  res.unicode_content = localBuf
1601  res.str_content = localBuf
1602  res.rendered_unicode_content = localBuf
1603 
1604  return res
1605 
1606 
1607 
1608 # #The Response class
1609 # represents an web page response
class Response(object):
  # Plain data container describing the result of one fetch; fields are filled
  # in attribute-by-attribute by the fetcher `open()` implementations.
  def __init__(self):
    # final url
    self.url = None
    # http status code
    self.status_code = 0
    # redirect lists
    self.redirects = None
    # unicode content
    self.unicode_content = None
    # str content
    self.str_content = None
    # rendered(by command line browser) unicode content
    # NOTE(review): no assignment follows this comment although fetchers set
    # `rendered_unicode_content` on instances; the attribute only exists after
    # a fetcher assigns it -- confirm against VCS whether a line was lost here.
    # http response content size
    self.content_size = None
    # content encoding
    self.encoding = None
    # headers
    self.headers = None
    # meta resource
    self.meta_res = None
    # cookies
    self.cookies = None
    # dynamic fetcher type
    # NOTE(review): no assignment follows this comment; the dynamic fetcher
    # sets `dynamic_fetcher_type` on instances -- confirm as above.
    # dynamic fetcher result type, see the macro definition specification
    # NOTE(review): same for `dynamic_fetcher_result_type`.
    # error mask from fetcher
    self.error_mask = APP_CONSTS.ERROR_OK
    # request
    self.request = None
    # execution time
    self.time = 0
    # human-readable error description from the fetcher
    self.error_msg = ''
1645 
1646 
1647 
1648 # #The Response class
1649 # represents an web page response
1651 
1652 
  def __init__(self, content=None):
    # content - optional default payload; detect() falls back to this value
    # when it is called without an explicit content argument
    self.content = content
1656 
1657  def detect(self, content=None, contentType="html"):
1658  ret = None
1659 
1660  if content is None:
1661  cnt = self.content
1662  else:
1663  cnt = content
1664 
1665  try:
1666  if contentType == 'html':
1667  pattern = r'<meta(?!\s*(?:name|value)\s*=)(?:[^>]*?content\s*=[\s"\']*)?([^>]*?)[\s"\';]*charset\s*=[\s"\']*([^\s"\'/>]*)' # pylint: disable=C0301
1668  matchObj = re.search(pattern, cnt, re.I | re.M | re.S)
1669  if matchObj:
1670  ret = matchObj.group(2)
1671  elif contentType == 'xml':
1672  ret = self.xmlCharsetDetector(None, cnt)
1673 
1674  except Exception, err:
1675  logger.error("Exception: %s", str(err))
1676 
1677  if ret is not None and ret in CONSTS.charsetDetectorMap:
1678  logger.debug("Extracted wrong encoding '%s' from page replace to correct '%s'", ret,
1679  CONSTS.charsetDetectorMap[ret])
1680  ret = CONSTS.charsetDetectorMap[ret]
1681 
1682  return ret
1683 
1684 
  def xmlCharsetDetector(self, fp, buff=None):
    """ Attempts to detect the character encoding of the xml file
    given by a file object fp. fp must not be a codec wrapped file
    object!

    The return value can be:
    - if detection of the BOM succeeds, the codec name of the
    corresponding unicode charset is returned

    - if BOM detection fails, the xml declaration is searched for
    the encoding attribute and its value returned. the "<"
    character has to be the very first in the file then (it's xml
    standard after all).

    - if BOM and xml declaration fail, None is returned. According
    to xml 1.0 it should be utf_8 then, but it wasn't detected by
    the means offered here. at least one can be pretty sure that a
    character coding including most of ASCII is used :-/

    When fp is None, BOM detection is skipped and only the xml
    declaration in *buff* is examined.
    """
    # ## detection using BOM

    # # the BOMs we know, by their pattern
    bomDict = { # bytepattern : name
               (0x00, 0x00, 0xFE, 0xFF) : "utf_32_be",
               (0xFF, 0xFE, 0x00, 0x00) : "utf_32_le",
               (0xFE, 0xFF, None, None) : "utf_16_be",
               (0xFF, 0xFE, None, None) : "utf_16_le",
               (0xEF, 0xBB, 0xBF, None) : "utf_8",
              }

    if fp is not None:
      # # go to beginning of file and get the first 4 bytes
      # (Python 2 str bytes; map(ord, ...) yields their integer values)
      oldFP = fp.tell()
      fp.seek(0)
      (byte1, byte2, byte3, byte4) = tuple(map(ord, fp.read(4)))

      # # try bom detection using 4 bytes, 3 bytes, or 2 bytes
      bomDetection = bomDict.get((byte1, byte2, byte3, byte4))
      if not bomDetection :
        bomDetection = bomDict.get((byte1, byte2, byte3, None))
        if not bomDetection :
          bomDetection = bomDict.get((byte1, byte2, None, None))

      # # if BOM detected, we're done :-)
      if bomDetection :
        # restore the caller's file position before returning
        fp.seek(oldFP)
        return bomDetection

      # # still here? BOM detection failed.
      # # now that BOM detection has failed we assume one byte character
      # # encoding behaving ASCII - of course one could think of nice
      # # algorithms further investigating on that matter, but I won't for now.

      # # assume xml declaration fits into the first 2 KB (*cough*)
      fp.seek(0)
      buff = fp.read(2048)

    # # set up regular expression
    xmlDeclPattern = r"""
    ^<\?xml # w/o BOM, xmldecl starts with <?xml at the first byte
    .+? # some chars (version info), matched minimal
    encoding= # encoding attribute begins
    ["'] # attribute start delimiter
    (?P<encstr> # what's matched in the brackets will be named encstr
    [^"']+ # every character not delimiter (not overly exact!)
    ) # closes the brackets pair for the named group
    ["'] # attribute end delimiter
    .*? # some chars optionally (standalone decl or whitespace)
    \?> # xmldecl end
    """

    xmlDeclRE = re.compile(xmlDeclPattern, re.VERBOSE)

    # # search and extract encoding string
    match = xmlDeclRE.search(buff)
    if fp is not None:
      fp.seek(oldFP)
    if match :
      return match.group("encstr")
    else :
      return None
def initializeTmpDirs(self, headers)
Definition: Fetcher.py:1289
def getProcessDirs(self, process, dirsTemplate)
Definition: Fetcher.py:1050
def chromeProcessesCleanup(self, headers)
Definition: Fetcher.py:960
def __init__(self, dbWrapper=None, siteId=None)
Definition: Fetcher.py:167
def __init__(self, tmpDirOptions=None, log=None)
Definition: Fetcher.py:437
def init(dbWrapper=None, siteId=None)
Definition: Fetcher.py:71
def execMacroExtended(self, macro)
Definition: Fetcher.py:1446
def openT(self, url, headers, timeout, proxies, executable_path, macro)
Definition: Fetcher.py:607
def open(self, url, kwargs)
Definition: Fetcher.py:1586
def getOptions(self, webdriver, headers, proxies, url)
Definition: Fetcher.py:1076
def killProcess(self, pid, dirsTemplate=CHROME_DIRS_TEMPLATE, dirDeleteBeforeTimeout=DELAY_TERMINATE_AND_QUIT)
Definition: Fetcher.py:1002
def cleanup(self, state=0, headers=None)
Definition: Fetcher.py:915
def executeWithTimeout(func, args=None, kwargs=None, timeout=1, default=None, log=None)
Definition: Utils.py:1544
def detect(self, content=None, contentType="html")
Definition: Fetcher.py:1657
def fixWrongXMLHeader(self, contentStr)
Definition: Fetcher.py:378
def open(self, url, method='get', headers=None, timeout=DEFAUIL_TIMEOUT, allow_redirects=True, proxies=None, auth=None, data=None, log=None, allowed_content_types=None, max_resource_size=None, max_redirects=1, filters=None, executable_path=None, depth=None, macro=None)
Definition: Fetcher.py:527
def open(self, url, method='get', headers=None, timeout=100, allow_redirects=True, proxies=None, auth=None, data=None, log=None, allowed_content_types=None, max_resource_size=None, max_redirects=CONSTS.MAX_HTTP_REDIRECTS_LIMIT, filters=None, executable_path=None, depth=None, macro=None)
Definition: Fetcher.py:109
def execMacroSimple(self, macro)
Definition: Fetcher.py:1336
def varDump(obj, stringify=True, strTypeMaxLen=256, strTypeCutSuffix='...', stringifyType=1, ignoreErrors=False, objectsHash=None, depth=0, indent=2, ensure_ascii=False, maxDepth=10)
Definition: Utils.py:410
def get_fetcher(typ, dbWrapper=None, siteId=None)
Definition: Fetcher.py:121
-mask-info
def xmlCharsetDetector(self, fp, buff=None)
Definition: Fetcher.py:1685
def removeTmpDirs(self, delay=DELAY_TERMINATE_AND_QUIT, tries=3)
Definition: Fetcher.py:1316
def __init__(self, content=None)
Definition: Fetcher.py:1653
def checkRedirectsHook(r, args, kwargs)
Definition: Fetcher.py:155
Definition: join.py:1
def getDomainNameFromURL(self, url, default='')
Definition: Fetcher.py:142
def open(self, url, method='get', headers=None, timeout=100, allow_redirects=True, proxies=None, auth=None, data=None, log=None, allowed_content_types=None, max_resource_size=None, max_redirects=CONSTS.MAX_HTTP_REDIRECTS_LIMIT, filters=None, executable_path=None, depth=None, macro=None)
Definition: Fetcher.py:201
def open(self, url, kwargs)
Definition: Fetcher.py:1523