HCE Project Python language Distributed Tasks Manager Application, Distributed Crawler Application and client API bindings. 2.0.0-chaika
Hierarchical Cluster Engine Python language binding
dc_crawler.Fetcher.RequestsFetcher Class Reference
Inheritance diagram for dc_crawler.Fetcher.RequestsFetcher:
Collaboration diagram for dc_crawler.Fetcher.RequestsFetcher:

Public Member Functions

def __init__ (self, dbWrapper=None, siteId=None)
 
def open (self, url, method='get', headers=None, timeout=100, allow_redirects=True, proxies=None, auth=None, data=None, log=None, allowed_content_types=None, max_resource_size=None, max_redirects=CONSTS.MAX_HTTP_REDIRECTS_LIMIT, filters=None, executable_path=None, depth=None, macro=None)
 
def fixWrongXMLHeader (self, contentStr)
 
- Public Member Functions inherited from dc_crawler.Fetcher.BaseFetcher
def __init__ (self)
 
def open (self, url, method='get', headers=None, timeout=100, allow_redirects=True, proxies=None, auth=None, data=None, log=None, allowed_content_types=None, max_resource_size=None, max_redirects=CONSTS.MAX_HTTP_REDIRECTS_LIMIT, filters=None, executable_path=None, depth=None, macro=None)
 
def should_have_meta_res (self)
 
def getDomainNameFromURL (self, url, default='')
 

Public Attributes

 dbWrapper
 
 siteId
 
- Public Attributes inherited from dc_crawler.Fetcher.BaseFetcher
 connectionTimeout
 
 logger
 

Additional Inherited Members

- Static Public Member Functions inherited from dc_crawler.Fetcher.BaseFetcher
def init (dbWrapper=None, siteId=None)
 
def get_fetcher (typ, dbWrapper=None, siteId=None)
 
- Static Public Attributes inherited from dc_crawler.Fetcher.BaseFetcher
 fetchers = None
 
int TYP_NORMAL = 1
 
int TYP_DYNAMIC = 2
 
int TYP_URLLIB = 5
 
int TYP_CONTENT = 6
 
int TYP_AUTO = 7
 
float CONNECTION_TIMEOUT = 1.0
 

Detailed Description

Definition at line 165 of file Fetcher.py.
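RequestsFetcher is the requests-library based implementation of BaseFetcher. A minimal usage sketch follows; it assumes (this page does not state it) that BaseFetcher.init() populates the static fetchers registry and that TYP_NORMAL selects this class:

from dc_crawler.Fetcher import BaseFetcher, RequestsFetcher

# Assumption: init() fills the static 'fetchers' registry and TYP_NORMAL (== 1)
# maps to the requests-based fetcher documented on this page.
BaseFetcher.init(dbWrapper=None, siteId=None)
fetcher = BaseFetcher.get_fetcher(BaseFetcher.TYP_NORMAL)

# Direct construction is also possible; both arguments default to None.
fetcher = RequestsFetcher(dbWrapper=None, siteId=None)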

Constructor & Destructor Documentation

◆ __init__()

def dc_crawler.Fetcher.RequestsFetcher.__init__ (   self,
  dbWrapper = None,
  siteId = None 
)

Definition at line 167 of file Fetcher.py.

167  def __init__(self, dbWrapper=None, siteId=None):
168  BaseFetcher.__init__(self)
169 
170  self.dbWrapper = dbWrapper
171  self.siteId = siteId
172 
173 

Member Function Documentation

◆ fixWrongXMLHeader()

def dc_crawler.Fetcher.RequestsFetcher.fixWrongXMLHeader (   self,
  contentStr 
)

Definition at line 378 of file Fetcher.py.

378  def fixWrongXMLHeader(self, contentStr):
379  # text_file = open("/tmp/fetcher_log.txt", "w")
380  # text_file.write("Fetcher: start\n")
381 
382  if contentStr.startswith('<?xml ') and '<html' in contentStr and '<head' in contentStr:
383  # text_file.write("Fetcher: xml detected!\n")
384  p = re.compile(r'<\?xml .*\?>')
385  contentStr = p.sub('', contentStr, count=1)
386  # text_file.write(contentStr)
387 
388  # text_file.close()
389 
390  return contentStr
391 
392 
393 
394 # # Fetcher based on the selenium project web-driver
395 #
396 #
Here is the caller graph for this function:
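A short illustrative sketch of the method's effect; the sample strings are hypothetical. A leading XML declaration is removed only when the content also contains '<html' and '<head', otherwise the input is returned unchanged:

from dc_crawler.Fetcher import RequestsFetcher

fetcher = RequestsFetcher(dbWrapper=None, siteId=None)

page = '<?xml version="1.0" encoding="utf-8"?><html><head></head><body>ok</body></html>'
# The XML declaration is stripped so downstream HTML parsers are not confused:
# '<html><head></head><body>ok</body></html>'
cleaned = fetcher.fixWrongXMLHeader(page)

plain_xml = '<?xml version="1.0"?><feed></feed>'
# No '<html'/'<head' markers, so the content is returned as-is.
unchanged = fetcher.fixWrongXMLHeader(plain_xml)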

◆ open()

def dc_crawler.Fetcher.RequestsFetcher.open (   self,
  url,
  method = 'get',
  headers = None,
  timeout = 100,
  allow_redirects = True,
  proxies = None,
  auth = None,
  data = None,
  log = None,
  allowed_content_types = None,
  max_resource_size = None,
  max_redirects = CONSTS.MAX_HTTP_REDIRECTS_LIMIT,
  filters = None,
  executable_path = None,
  depth = None,
  macro = None 
)

Definition at line 201 of file Fetcher.py.

201  macro=None):
202 
203  # set logger
204  log = logger if log is None else log
205 
206  headers1 = {}
207  for key in headers.keys():
208  if not key.startswith('--'):
209  headers1[key] = headers[key]
210  headers = headers1
211 
212  if not isinstance(timeout, tuple):
213  if hasattr(self, 'connectionTimeout'):
214  timeout = (self.connectionTimeout, timeout)
215  else:
216  timeout = (self.CONNECTION_TIMEOUT, timeout)
217 
218  if auth:
219  auth = HTTPBasicAuth(auth[0], auth[1])
220 
221  proxy_setting = None
222  if proxies is not None:
223  proxy_type, proxy_host, proxy_port, proxy_user, proxy_passwd = proxies
224  if proxy_type is None:
225  proxy_type = "http"
226  if proxy_user is not None:
227  proxies = "%s://%s:%s@%s:%s" % (proxy_type, proxy_user, proxy_passwd, proxy_host, proxy_port)
228  else:
229  proxies = "%s://%s:%s" % (proxy_type, proxy_host, proxy_port)
230  proxy_setting = {"http" : proxies}
231 
232  # # save location value
233  location = url
234  res = Response()
235  try:
236  requestsRedirect = RequestsRedirectWrapper(self.dbWrapper, self.siteId)
237  impl_res = requestsRedirect.request(url=url,
238  method=method,
239  timeout=timeout,
240  headers=headers,
241  allowRedirects=allow_redirects,
242  proxySetting=proxy_setting,
243  auth=auth,
244  data=data,
245  maxRedirects=max_redirects,
246  filters=filters)
247 
248  log.debug("!!! impl_res.headers: %s", varDump(impl_res.headers))
249  log.debug("!!! impl_res.url: %s", str(impl_res.url))
250 
251  location = impl_res.url
252  headers = dict(impl_res.headers.lower_items())
253 
254  # try to prevent huge content fetching
255  if "content-length" in impl_res.headers and \
256  max_resource_size != CONSTS.MAX_HTTP_SIZE_UNLIMIT and \
257  int(impl_res.headers['content-length']) > max_resource_size:
258  log.debug("Content size overshooted. content-length: %s, max_resource_size: %s" % \
259  (str(impl_res.headers['content-length']), str(max_resource_size)))
260  res.content_size = int(impl_res.headers['content-length'])
261  else:
262  ct = impl_res.headers.get('content-type', '').lower()
263  # don't detect charset for binary content type or BIG response
264  if ct.startswith('application') or ct.startswith('audio') or \
265  len(impl_res.content) >= MAX_CONTENT_SIZE_FOR_CHARDET:
266  if "xml" in ct:
267  encoding = SimpleCharsetDetector().detect(impl_res.content, contentType='xml')
268  log.debug("encoding3=%s", str(encoding))
269  if encoding is not None:
270  impl_res.encoding = encoding
271  else:
272  detected_encoding = impl_res.encoding
273  log.debug("Headers contains 'application' or 'audio' content-type: %s",
274  impl_res.headers.get('content-type', ''))
275  else:
276  # use chardet to improve encoding detect
277 # ct = impl_res.headers.get('content-type', '').lower()
278  log.debug("impl_res.encoding1=%s, content-type=%s", impl_res.encoding, ct)
279  # Try simple way of charset detection for an html
280  encoding = None
281  if "html" in ct:
282  log.debug("Using the SimpleCharsetDetector()")
283  encoding = SimpleCharsetDetector().detect(impl_res.content)
284  log.debug("encoding=%s", str(encoding))
285  if encoding is not None:
286  impl_res.encoding = encoding
287 
288  elif "xml" in ct:
289  encoding = SimpleCharsetDetector().detect(impl_res.content, contentType='xml')
290  log.debug("encoding3=%s", str(encoding))
291  if encoding is not None:
292  impl_res.encoding = encoding
293 
294 
295  if (impl_res.encoding is None) or ((encoding is None) and (impl_res.encoding not in ct and "xml" not in ct)):
296  log.debug("Using the charset to improve encoding detect")
297  detected_encoding = impl_res.apparent_encoding
298  if detected_encoding != 'ascii' and detected_encoding != 'ISO-8859-2':
299  impl_res.encoding = detected_encoding
300  log.debug("impl_res.encoding2=%s", impl_res.encoding)
301  # Fix for pages that has xml document tag but no html structure inside
302  text_buffer = self.fixWrongXMLHeader(impl_res.content)
303  if impl_res.headers.get('content-type', '').startswith('application'):
304  res.unicode_content = impl_res.content
305  else:
306  res.unicode_content = text_buffer
307  res.str_content = impl_res.content
308  if impl_res.headers.get('content-type', '').startswith('application'):
309  res.rendered_unicode_content = impl_res.content
310  else:
311  res.rendered_unicode_content = text_buffer
312  # res.content_size = impl_res.raw.tell()
313  if res.rendered_unicode_content is None:
314  res.content_size = 0
315  else:
316  res.content_size = len(res.rendered_unicode_content)
317 
318  res.headers = impl_res.headers
319  res.redirects = impl_res.history
320  res.status_code = impl_res.status_code
321  res.url = impl_res.url
322  res.encoding = impl_res.encoding
323  res.request = impl_res.request
324  res.cookies = requests.utils.dict_from_cookiejar(impl_res.cookies)
325 
326  # update location value
327  res.headers.update({'Location':location})
328 
329  except (requests.exceptions.Timeout, requests.exceptions.ReadTimeout, requests.exceptions.ConnectTimeout), err:
330  res.error_mask = APP_CONSTS.ERROR_REQUEST_TIMEOUT
331  msg = "Requests fetcher has thrown '%s' exception: " % str(type(err))
332  if isinstance(err, requests.exceptions.Timeout):
333  msg += "The request timed out."
334  elif isinstance(err, requests.exceptions.ReadTimeout):
335  msg += "The server did not send any data in the allotted amount of time."
336  elif isinstance(err, requests.exceptions.ConnectTimeout):
337  msg += "The request timed out while trying to connect to the remote server."
338 
339  log.debug(str(msg))
340  raise err
341  except requests.exceptions.ConnectionError, err:
342  res.error_mask = APP_CONSTS.ERROR_FETCH_CONNECTION_ERROR
343  log.debug(">>> Requests fetcher has thrown ConnectionError exception: " + str(err))
344  raise err
345  except requests.exceptions.HTTPError, err:
346  res.error_mask = APP_CONSTS.ERROR_FETCH_HTTP_ERROR
347  log.debug(">>> Requests fetcher has thrown HTTPError exception: " + str(err))
348  raise err
349  except requests.exceptions.URLRequired, err:
350  res.error_mask = APP_CONSTS.ERROR_FETCH_INVALID_URL
351  log.debug(">>> Requests fetcher has thrown URLRequired exception: " + str(err))
352  raise err
353  except requests.exceptions.TooManyRedirects, err:
354  res.error_mask = APP_CONSTS.ERROR_FETCH_TOO_MANY_REDIRECTS
355  log.debug(">>> Requests fetcher has thrown TooManyRedirects exception: " + str(err))
356  raise err
357  except requests.exceptions.RequestException, err:
358  res.error_mask = APP_CONSTS.ERROR_FETCH_AMBIGUOUS_REQUEST
359  log.debug(">>> Requests fetcher has thrown RequestException exception: " + str(err))
360  raise err
361  except CrawlerFilterException, err:
362  res.error_mask = APP_CONSTS.ERROR_CRAWLER_FILTERS_BREAK
363  log.debug("Crawler has not allowed filter: " + str(err))
364  raise err
365  except Exception, err:
366  res.error_mask = APP_CONSTS.ERROR_FETCHER_INTERNAL
367  log.debug(">>> Requests fetcher has thrown exception" + \
368  " type: " + str(type(err)) + "\n" + Utils.getTracebackInfo())
369  raise InternalCrawlerException("Requests fetcher has thrown exception")
370 
371  return res
372 
373 
Here is the call graph for this function:
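A hedged usage sketch; the URL, header values, proxy host and credentials below are placeholders, and it is assumed that dbWrapper=None is acceptable to the underlying RequestsRedirectWrapper:

from dc_crawler.Fetcher import RequestsFetcher

fetcher = RequestsFetcher(dbWrapper=None, siteId=None)

# Header keys starting with '--' are stripped before the request is sent.
headers = {"User-Agent": "hce-crawler/2.0", "--internal-option": "dropped"}

# proxies is a 5-tuple: (proxy_type, proxy_host, proxy_port, proxy_user, proxy_passwd);
# with proxy_user None the proxy URL is built without credentials.
proxies = ("http", "proxy.example.com", 3128, None, None)

# auth is a (user, password) pair wrapped into requests' HTTPBasicAuth.
auth = ("user", "secret")

try:
    res = fetcher.open("http://example.com/", method='get', headers=headers,
                       timeout=30, proxies=proxies, auth=auth)
    # res is the crawler's Response object; status_code, headers, encoding,
    # rendered_unicode_content, redirects and cookies are filled in as shown above.
    status = res.status_code
except Exception as err:
    # On failure open() sets res.error_mask and re-raises the requests exception
    # (or raises InternalCrawlerException for unexpected errors).
    pass

A scalar timeout is expanded to a (connection timeout, read timeout) tuple using the inherited connectionTimeout, so passing a single number is sufficient for simple cases.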

Member Data Documentation

◆ dbWrapper

dc_crawler.Fetcher.RequestsFetcher.dbWrapper

Definition at line 170 of file Fetcher.py.

◆ siteId

dc_crawler.Fetcher.RequestsFetcher.siteId

Definition at line 171 of file Fetcher.py.


The documentation for this class was generated from the following file:
Fetcher.py