HCE Project Python language Distributed Tasks Manager Application, Distributed Crawler Application and client API bindings. 2.0.0-chaika
Hierarchical Cluster Engine Python language binding
dc_crawler.Fetcher.RequestsFetcher Class Reference
Inheritance diagram for dc_crawler.Fetcher.RequestsFetcher:
Collaboration diagram for dc_crawler.Fetcher.RequestsFetcher:

Public Member Functions

def __init__ (self, dbWrapper=None, siteId=None)
 
def open (self, url, method='get', headers=None, timeout=100, allow_redirects=True, proxies=None, auth=None, data=None, log=None, allowed_content_types=None, max_resource_size=None, max_redirects=CONSTS.MAX_HTTP_REDIRECTS_LIMIT, filters=None, executable_path=None, depth=None, macro=None)
 
def fixWrongXMLHeader (self, contentStr)
 
- Public Member Functions inherited from dc_crawler.Fetcher.BaseFetcher
def __init__ (self)
 
def open (self, url, method='get', headers=None, timeout=100, allow_redirects=True, proxies=None, auth=None, data=None, log=None, allowed_content_types=None, max_resource_size=None, max_redirects=CONSTS.MAX_HTTP_REDIRECTS_LIMIT, filters=None, executable_path=None, depth=None, macro=None)
 
def should_have_meta_res (self)
 
def getDomainNameFromURL (self, url, default='')
 

Public Attributes

 dbWrapper
 
 siteId
 
- Public Attributes inherited from dc_crawler.Fetcher.BaseFetcher
 connectionTimeout
 
 logger
 

Additional Inherited Members

- Static Public Member Functions inherited from dc_crawler.Fetcher.BaseFetcher
def init (dbWrapper=None, siteId=None)
 
def get_fetcher (typ, dbWrapper=None, siteId=None)
 
- Static Public Attributes inherited from dc_crawler.Fetcher.BaseFetcher
 fetchers = None
 
int TYP_NORMAL = 1
 
int TYP_DYNAMIC = 2
 
int TYP_URLLIB = 5
 
int TYP_CONTENT = 6
 
int TYP_AUTO = 7
 
float CONNECTION_TIMEOUT = 1.0
 

Detailed Description

Definition at line 165 of file Fetcher.py.
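RequestsFetcher is the requests-library based implementation of BaseFetcher. A minimal usage sketch follows; it assumes (this page does not state it) that BaseFetcher.init() populates the static fetchers registry and that TYP_NORMAL selects this class:

from dc_crawler.Fetcher import BaseFetcher, RequestsFetcher

# Assumption: init() fills the static 'fetchers' registry and TYP_NORMAL (== 1)
# maps to the requests-based fetcher documented on this page.
BaseFetcher.init(dbWrapper=None, siteId=None)
fetcher = BaseFetcher.get_fetcher(BaseFetcher.TYP_NORMAL)

# Direct construction is also possible; both arguments default to None.
fetcher = RequestsFetcher(dbWrapper=None, siteId=None)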

Constructor & Destructor Documentation

◆ __init__()

def dc_crawler.Fetcher.RequestsFetcher.__init__ (   self,
  dbWrapper = None,
  siteId = None 
)

Definition at line 167 of file Fetcher.py.

167  def __init__(self, dbWrapper=None, siteId=None):
168  BaseFetcher.__init__(self)
169 
170  self.dbWrapper = dbWrapper
171  self.siteId = siteId
172 
173 

Member Function Documentation

◆ fixWrongXMLHeader()

def dc_crawler.Fetcher.RequestsFetcher.fixWrongXMLHeader (   self,
  contentStr 
)

Definition at line 378 of file Fetcher.py.

378  def fixWrongXMLHeader(self, contentStr):
379  # text_file = open("/tmp/fetcher_log.txt", "w")
380  # text_file.write("Fetcher: start\n")
381 
382  if contentStr.startswith('<?xml ') and '<html' in contentStr and '<head' in contentStr:
383  # text_file.write("Fetcher: xml detected!\n")
384  p = re.compile(r'<\?xml .*\?>')
385  contentStr = p.sub('', contentStr, count=1)
386  # text_file.write(contentStr)
387 
388  # text_file.close()
389 
390  return contentStr
391 
392 
393 
394 # # Fetcher based on the selenium project web-driver
395 #
396 #
Here is the caller graph for this function:
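A short illustrative sketch of the method's effect; the sample strings are hypothetical. A leading XML declaration is removed only when the content also contains '<html' and '<head', otherwise the input is returned unchanged:

from dc_crawler.Fetcher import RequestsFetcher

fetcher = RequestsFetcher(dbWrapper=None, siteId=None)

page = '<?xml version="1.0" encoding="utf-8"?><html><head></head><body>ok</body></html>'
# The XML declaration is stripped so downstream HTML parsers are not confused:
# '<html><head></head><body>ok</body></html>'
cleaned = fetcher.fixWrongXMLHeader(page)

plain_xml = '<?xml version="1.0"?><feed></feed>'
# No '<html'/'<head' markers, so the content is returned as-is.
unchanged = fetcher.fixWrongXMLHeader(plain_xml)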

◆ open()

def dc_crawler.Fetcher.RequestsFetcher.open (   self,
  url,
  method = 'get',
  headers = None,
  timeout = 100,
  allow_redirects = True,
  proxies = None,
  auth = None,
  data = None,
  log = None,
  allowed_content_types = None,
  max_resource_size = None,
  max_redirects = CONSTS.MAX_HTTP_REDIRECTS_LIMIT,
  filters = None,
  executable_path = None,
  depth = None,
  macro = None 
)

Definition at line 201 of file Fetcher.py.

201  macro=None):
202 
203  # set logger
204  log = logger if log is None else log
205 
206  headers1 = {}
207  for key in headers.keys():
208  if not key.startswith('--'):
209  headers1[key] = headers[key]
210  headers = headers1
211 
212  if not isinstance(timeout, tuple):
213  if hasattr(self, 'connectionTimeout'):
214  timeout = (self.connectionTimeout, timeout)
215  else:
216  timeout = (self.CONNECTION_TIMEOUT, timeout)
217 
218  if auth:
219  auth = HTTPBasicAuth(auth[0], auth[1])
220 
221  proxy_setting = None
222  if proxies is not None:
223  proxy_type, proxy_host, proxy_port, proxy_user, proxy_passwd = proxies
224  if proxy_type is None:
225  proxy_type = "http"
226  if proxy_user is not None:
227  proxies = "%s://%s:%s@%s:%s" % (proxy_type, proxy_user, proxy_passwd, proxy_host, proxy_port)
228  else:
229  proxies = "%s://%s:%s" % (proxy_type, proxy_host, proxy_port)
230  proxy_setting = {"http" : proxies}
231 
232  # # save location value
233  location = url
234  res = Response()
235  try:
236  requestsRedirect = RequestsRedirectWrapper(self.dbWrapper, self.siteId)
237  impl_res = requestsRedirect.request(url=url,
238  method=method,
239  timeout=timeout,
240  headers=headers,
241  allowRedirects=allow_redirects,
242  proxySetting=proxy_setting,
243  auth=auth,
244  data=data,
245  maxRedirects=max_redirects,
246  filters=filters)
247 
248  log.debug("!!! impl_res.headers: %s", varDump(impl_res.headers))
249  log.debug("!!! impl_res.url: %s", str(impl_res.url))
250 
251  location = impl_res.url
252  headers = dict(impl_res.headers.lower_items())
253 
254  # try to prevent huge content fetching
255  if "content-length" in impl_res.headers and \
256  max_resource_size != CONSTS.MAX_HTTP_SIZE_UNLIMIT and \
257  int(impl_res.headers['content-length']) > max_resource_size:
258  log.debug("Content size overshooted. content-length: %s, max_resource_size: %s" % \
259  (str(impl_res.headers['content-length']), str(max_resource_size)))
260  res.content_size = int(impl_res.headers['content-length'])
261  else:
262  ct = impl_res.headers.get('content-type', '').lower()
263  # don't detect charset for binary content type or BIG response
264  if ct.startswith('application') or ct.startswith('audio') or \
265  len(impl_res.content) >= MAX_CONTENT_SIZE_FOR_CHARDET:
266  if "xml" in ct:
267  encoding = SimpleCharsetDetector().detect(impl_res.content, contentType='xml')
268  log.debug("encoding3=%s", str(encoding))
269  if encoding is not None:
270  impl_res.encoding = encoding
271  else:
272  detected_encoding = impl_res.encoding
273  log.debug("Headers contains 'application' or 'audio' content-type: %s",
274  impl_res.headers.get('content-type', ''))
275  else:
276  # use chardet to improve encoding detect
277 # ct = impl_res.headers.get('content-type', '').lower()
278  log.debug("impl_res.encoding1=%s, content-type=%s", impl_res.encoding, ct)
279  # Try simple way of charset detection for an html
280  encoding = None
281  if "html" in ct:
282  log.debug("Using the SimpleCharsetDetector()")
283  encoding = SimpleCharsetDetector().detect(impl_res.content)
284  log.debug("encoding=%s", str(encoding))
285  if encoding is not None:
286  impl_res.encoding = encoding
287 
288  elif "xml" in ct:
289  encoding = SimpleCharsetDetector().detect(impl_res.content, contentType='xml')
290  log.debug("encoding3=%s", str(encoding))
291  if encoding is not None:
292  impl_res.encoding = encoding
293 
294 
295  if (impl_res.encoding is None) or ((encoding is None) and (impl_res.encoding not in ct and "xml" not in ct)):
296  log.debug("Using the charset to improve encoding detect")
297  detected_encoding = impl_res.apparent_encoding
298  if detected_encoding != 'ascii' and detected_encoding != 'ISO-8859-2':
299  impl_res.encoding = detected_encoding
300  log.debug("impl_res.encoding2=%s", impl_res.encoding)
301  # Fix for pages that has xml document tag but no html structure inside
302  text_buffer = self.fixWrongXMLHeader(impl_res.content)
303  if impl_res.headers.get('content-type', '').startswith('application'):
304  res.unicode_content = impl_res.content
305  else:
306  res.unicode_content = text_buffer
307  res.str_content = impl_res.content
308  if impl_res.headers.get('content-type', '').startswith('application'):
309  res.rendered_unicode_content = impl_res.content
310  else:
311  res.rendered_unicode_content = text_buffer
312  # res.content_size = impl_res.raw.tell()
313  if res.rendered_unicode_content is None:
314  res.content_size = 0
315  else:
316  res.content_size = len(res.rendered_unicode_content)
317 
318  res.headers = impl_res.headers
319  res.redirects = impl_res.history
320  res.status_code = impl_res.status_code
321  res.url = impl_res.url
322  res.encoding = impl_res.encoding
323  res.request = impl_res.request
324  res.cookies = requests.utils.dict_from_cookiejar(impl_res.cookies)
325 
326  # update location value
327  res.headers.update({'Location':location})
328 
329  except (requests.exceptions.Timeout, requests.exceptions.ReadTimeout, requests.exceptions.ConnectTimeout), err:
330  res.error_mask = APP_CONSTS.ERROR_REQUEST_TIMEOUT
331  msg = "Requests fetcher has thrown '%s' exception: " % str(type(err))
332  if isinstance(err, requests.exceptions.Timeout):
333  msg += "The request timed out."
334  elif isinstance(err, requests.exceptions.ReadTimeout):
335  msg += "The server did not send any data in the allotted amount of time."
336  elif isinstance(err, requests.exceptions.ConnectTimeout):
337  msg += "The request timed out while trying to connect to the remote server."
338 
339  log.debug(str(msg))
340  raise err
341  except requests.exceptions.ConnectionError, err:
342  res.error_mask = APP_CONSTS.ERROR_FETCH_CONNECTION_ERROR
343  log.debug(">>> Requests fetcher has thrown ConnectionError exception: " + str(err))
344  raise err
345  except requests.exceptions.HTTPError, err:
346  res.error_mask = APP_CONSTS.ERROR_FETCH_HTTP_ERROR
347  log.debug(">>> Requests fetcher has thrown HTTPError exception: " + str(err))
348  raise err
349  except requests.exceptions.URLRequired, err:
350  res.error_mask = APP_CONSTS.ERROR_FETCH_INVALID_URL
351  log.debug(">>> Requests fetcher has thrown URLRequired exception: " + str(err))
352  raise err
353  except requests.exceptions.TooManyRedirects, err:
354  res.error_mask = APP_CONSTS.ERROR_FETCH_TOO_MANY_REDIRECTS
355  log.debug(">>> Requests fetcher has thrown TooManyRedirects exception: " + str(err))
356  raise err
357  except requests.exceptions.RequestException, err:
358  res.error_mask = APP_CONSTS.ERROR_FETCH_AMBIGUOUS_REQUEST
359  log.debug(">>> Requests fetcher has thrown RequestException exception: " + str(err))
360  raise err
361  except CrawlerFilterException, err:
362  res.error_mask = APP_CONSTS.ERROR_CRAWLER_FILTERS_BREAK
363  log.debug("Crawler has not allowed filter: " + str(err))
364  raise err
365  except Exception, err:
366  res.error_mask = APP_CONSTS.ERROR_FETCHER_INTERNAL
367  log.debug(">>> Requests fetcher has thrown exception" + \
368  " type: " + str(type(err)) + "\n" + Utils.getTracebackInfo())
369  raise InternalCrawlerException("Requests fetcher has thrown exception")
370 
371  return res
372 
373 
Here is the call graph for this function:
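A hedged usage sketch; the URL, header values, proxy host and credentials below are placeholders, and it is assumed that dbWrapper=None is acceptable to the underlying RequestsRedirectWrapper:

from dc_crawler.Fetcher import RequestsFetcher

fetcher = RequestsFetcher(dbWrapper=None, siteId=None)

# Header keys starting with '--' are stripped before the request is sent.
headers = {"User-Agent": "hce-crawler/2.0", "--internal-option": "dropped"}

# proxies is a 5-tuple: (proxy_type, proxy_host, proxy_port, proxy_user, proxy_passwd);
# with proxy_user None the proxy URL is built without credentials.
proxies = ("http", "proxy.example.com", 3128, None, None)

# auth is a (user, password) pair wrapped into requests' HTTPBasicAuth.
auth = ("user", "secret")

try:
    res = fetcher.open("http://example.com/", method='get', headers=headers,
                       timeout=30, proxies=proxies, auth=auth)
    # res is the crawler's Response object; status_code, headers, encoding,
    # rendered_unicode_content, redirects and cookies are filled in as shown above.
    status = res.status_code
except Exception as err:
    # On failure open() sets res.error_mask and re-raises the requests exception
    # (or raises InternalCrawlerException for unexpected errors).
    pass

A scalar timeout is expanded to a (connection timeout, read timeout) tuple using the inherited connectionTimeout, so passing a single number is sufficient for simple cases.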

Member Data Documentation

◆ dbWrapper

dc_crawler.Fetcher.RequestsFetcher.dbWrapper

Definition at line 170 of file Fetcher.py.

◆ siteId

dc_crawler.Fetcher.RequestsFetcher.siteId

Definition at line 171 of file Fetcher.py.


The documentation for this class was generated from the following file:
Fetcher.py