204 log = logger
if log
is None else log
207 for key
in headers.keys():
208 if not key.startswith(
'--'):
209 headers1[key] = headers[key]
212 if not isinstance(timeout, tuple):
213 if hasattr(self,
'connectionTimeout'):
214 timeout = (self.connectionTimeout, timeout)
216 timeout = (self.CONNECTION_TIMEOUT, timeout)
219 auth = HTTPBasicAuth(auth[0], auth[1])
222 if proxies
is not None:
223 proxy_type, proxy_host, proxy_port, proxy_user, proxy_passwd = proxies
224 if proxy_type
is None:
226 if proxy_user
is not None:
227 proxies =
"%s://%s:%s@%s:%s" % (proxy_type, proxy_user, proxy_passwd, proxy_host, proxy_port)
229 proxies =
"%s://%s:%s" % (proxy_type, proxy_host, proxy_port)
230 proxy_setting = {
"http" : proxies}
236 requestsRedirect = RequestsRedirectWrapper(self.dbWrapper, self.siteId)
237 impl_res = requestsRedirect.request(url=url,
241 allowRedirects=allow_redirects,
242 proxySetting=proxy_setting,
245 maxRedirects=max_redirects,
248 log.debug(
"!!! impl_res.headers: %s",
varDump(impl_res.headers))
249 log.debug(
"!!! impl_res.url: %s", str(impl_res.url))
251 location = impl_res.url
252 headers = dict(impl_res.headers.lower_items())
255 if "content-length" in impl_res.headers
and \
256 max_resource_size != CONSTS.MAX_HTTP_SIZE_UNLIMIT
and \
257 int(impl_res.headers[
'content-length']) > max_resource_size:
258 log.debug(
"Content size overshooted. content-length: %s, max_resource_size: %s" % \
259 (str(impl_res.headers[
'content-length']), str(max_resource_size)))
260 res.content_size = int(impl_res.headers[
'content-length'])
262 ct = impl_res.headers.get(
'content-type',
'').lower()
264 if ct.startswith(
'application')
or ct.startswith(
'audio')
or \
265 len(impl_res.content) >= MAX_CONTENT_SIZE_FOR_CHARDET:
267 encoding = SimpleCharsetDetector().detect(impl_res.content, contentType=
'xml')
268 log.debug(
"encoding3=%s", str(encoding))
269 if encoding
is not None:
270 impl_res.encoding = encoding
272 detected_encoding = impl_res.encoding
273 log.debug(
"Headers contains 'application' or 'audio' content-type: %s",
274 impl_res.headers.get(
'content-type',
''))
278 log.debug(
"impl_res.encoding1=%s, content-type=%s", impl_res.encoding, ct)
282 log.debug(
"Using the SimpleCharsetDetector()")
283 encoding = SimpleCharsetDetector().detect(impl_res.content)
284 log.debug(
"encoding=%s", str(encoding))
285 if encoding
is not None:
286 impl_res.encoding = encoding
289 encoding = SimpleCharsetDetector().detect(impl_res.content, contentType=
'xml')
290 log.debug(
"encoding3=%s", str(encoding))
291 if encoding
is not None:
292 impl_res.encoding = encoding
295 if (impl_res.encoding
is None)
or ((encoding
is None)
and (impl_res.encoding
not in ct
and "xml" not in ct)):
296 log.debug(
"Using the charset to improve encoding detect")
297 detected_encoding = impl_res.apparent_encoding
298 if detected_encoding !=
'ascii' and detected_encoding !=
'ISO-8859-2':
299 impl_res.encoding = detected_encoding
300 log.debug(
"impl_res.encoding2=%s", impl_res.encoding)
302 text_buffer = self.fixWrongXMLHeader(impl_res.content)
303 if impl_res.headers.get(
'content-type',
'').startswith(
'application'):
304 res.unicode_content = impl_res.content
306 res.unicode_content = text_buffer
307 res.str_content = impl_res.content
308 if impl_res.headers.get(
'content-type',
'').startswith(
'application'):
309 res.rendered_unicode_content = impl_res.content
311 res.rendered_unicode_content = text_buffer
313 if res.rendered_unicode_content
is None:
316 res.content_size = len(res.rendered_unicode_content)
318 res.headers = impl_res.headers
319 res.redirects = impl_res.history
320 res.status_code = impl_res.status_code
321 res.url = impl_res.url
322 res.encoding = impl_res.encoding
323 res.request = impl_res.request
324 res.cookies = requests.utils.dict_from_cookiejar(impl_res.cookies)
327 res.headers.update({
'Location':location})
329 except (requests.exceptions.Timeout, requests.exceptions.ReadTimeout, requests.exceptions.ConnectTimeout), err:
330 res.error_mask = APP_CONSTS.ERROR_REQUEST_TIMEOUT
331 msg =
"Requests fetcher has thrown '%s' exception: " % str(
type(err))
332 if isinstance(err, requests.exceptions.Timeout):
333 msg +=
"The request timed out." 334 elif isinstance(err, requests.exceptions.ReadTimeout):
335 msg +=
"The server did not send any data in the allotted amount of time." 336 elif isinstance(err, requests.exceptions.ConnectTimeout):
337 msg +=
"The request timed out while trying to connect to the remote server." 341 except requests.exceptions.ConnectionError, err:
342 res.error_mask = APP_CONSTS.ERROR_FETCH_CONNECTION_ERROR
343 log.debug(
">>> Requests fetcher has thrown ConnectionError exception: " + str(err))
345 except requests.exceptions.HTTPError, err:
346 res.error_mask = APP_CONSTS.ERROR_FETCH_HTTP_ERROR
347 log.debug(
">>> Requests fetcher has thrown HTTPError exception: " + str(err))
349 except requests.exceptions.URLRequired, err:
350 res.error_mask = APP_CONSTS.ERROR_FETCH_INVALID_URL
351 log.debug(
">>> Requests fetcher has thrown URLRequired exception: " + str(err))
353 except requests.exceptions.TooManyRedirects, err:
354 res.error_mask = APP_CONSTS.ERROR_FETCH_TOO_MANY_REDIRECTS
355 log.debug(
">>> Requests fetcher has thrown TooManyRedirects exception: " + str(err))
357 except requests.exceptions.RequestException, err:
358 res.error_mask = APP_CONSTS.ERROR_FETCH_AMBIGUOUS_REQUEST
359 log.debug(
">>> Requests fetcher has thrown RequestException exception: " + str(err))
361 except CrawlerFilterException, err:
362 res.error_mask = APP_CONSTS.ERROR_CRAWLER_FILTERS_BREAK
363 log.debug(
"Crawler has not allowed filter: " + str(err))
365 except Exception, err:
366 res.error_mask = APP_CONSTS.ERROR_FETCHER_INTERNAL
367 log.debug(
">>> Requests fetcher has thrown exception" + \
368 " type: " + str(
type(err)) +
"\n" + Utils.getTracebackInfo())
369 raise InternalCrawlerException(
"Requests fetcher has thrown exception")
def varDump(obj, stringify=True, strTypeMaxLen=256, strTypeCutSuffix='...', stringifyType=1, ignoreErrors=False, objectsHash=None, depth=0, indent=2, ensure_ascii=False, maxDepth=10)