3 HCE project, Python bindings, Distributed Tasks Manager application. 4 The RequestsRedirectWrapper class contains the main functionality for resolving redirects. 7 @file RequestsRedirectWrapper.py 8 @author Alexander Vybornyh <alexander.hce.cluster@gmail.com> 9 @link: http://hierarchical-cluster-engine.com/ 10 @copyright: Copyright © 2013-2016 IOIX Ukraine 11 @license: http://hierarchical-cluster-engine.com/license/ 15 import requests.exceptions
16 import requests.packages
17 from requests.packages.urllib3.exceptions
import InsecureRequestWarning
19 requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
34 REQUEST_COOKIE_HEADER_NAME =
'Cookie' 35 RESPONSE_COOKIE_HEADER_NAME =
'set-cookie' 36 REFERER_HEADER_NAME =
'Referer' 39 USAGE_ALGORITHM_BASE = 0
40 USAGE_ALGORITHM_CUSTOM = 1
41 DEFAULT_USAGE_ALGORITHM = USAGE_ALGORITHM_BASE
44 ERROR_BAD_STATUS_CODE_VALUE =
"Not allowed status code '%s'. Allowed list: %s" 47 def __init__(self, dbWrapper=None, siteId=None, usageAlgorithm=DEFAULT_USAGE_ALGORITHM, redirectCodes=None):
52 self.
redirectCodes = CONSTS.REDIRECT_HTTP_CODES
if not isinstance(redirectCodes, list)
else redirectCodes
66 def request(self, url, method, timeout, headers, allowRedirects, proxySetting, auth, data, maxRedirects, filters):
72 implRes = self.
requestBase(url, method, timeout, headers, allowRedirects, proxySetting,
73 auth, data, maxRedirects, filters)
74 except CrawlerFilterException, err:
77 logger.debug(
"!!! Hard case. Don't worry. We will try using more complexity way...")
78 implRes = self.
requestBase(url,
'head', timeout, headers, allowRedirects, proxySetting,
79 auth, data, maxRedirects, filters)
80 if implRes
is not None:
81 logger.debug(
"!!! implRes.headers: %s",
varDump(implRes.headers))
82 implRes = self.
requestBase(url, method, timeout, implRes.headers, allowRedirects, proxySetting,
83 auth, data, maxRedirects, filters)
86 implRes = self.
requestCustom(url, method, timeout, headers, allowRedirects, proxySetting, auth, data,
87 maxRedirects, filters)
90 raise Exception(
"Try using not support algorithm usage of 'requests' = %s" % (str(self.
usageAlgorithm)))
106 def requestBase(self, url, method, timeout, headers, allowRedirects, proxySetting, auth, data, maxRedirects, filters):
110 req = requests.Request(method=method,
115 hooks={
'response':[RequestsRedirectWrapper.checkRedirect]})
117 logger.debug(
"!!! headers: %s, type: %s",
varDump(headers), str(
type(headers)))
124 rSession = requests.Session()
125 rSession.max_redirects = int(maxRedirects)
126 rSession.stream =
True 127 rSession.verify =
False 128 rSession.proxies = proxySetting
129 res = rSession.send(request=reqv, allow_redirects=allowRedirects, timeout=timeout)
133 logger.debug(
"!!! res.cookies: %s, type: %s",
varDump(res.cookies), str(
type(res.cookies)))
134 cookies = requests.utils.dict_from_cookiejar(res.cookies)
135 logger.debug(
"!!! cookies: %s, type: %s",
varDump(cookies), str(
type(cookies)))
137 cookiesList = [key +
'=' + value
for key, value
in cookies.items()]
142 logger.debug(
"!!! headers updated by 'cookies': %s",
varDump(headers))
144 except requests.exceptions.TooManyRedirects, err:
146 except CrawlerFilterException, err:
148 except Exception, err:
149 logger.debug(
"!!! We have a problem: %s", str(err))
168 logger.debug(
"!!! res.url: %s",
varDump(res.url))
172 for history
in res.history:
173 logger.debug(
"!!! history.url: %s",
varDump(history.url))
174 logger.debug(
"!!! history.status_code: %s",
varDump(history.status_code))
179 for history
in res.history:
196 def requestCustom(self, url, method, timeout, headers, allowRedirects, proxySetting, auth, data, maxRedirects,
200 applyHeaders = copy.deepcopy(headers)
206 while redirectsCount < int(maxRedirects):
207 implRes, localUrl = self.
__sendRequest(url, method, timeout, applyHeaders, proxySetting, \
208 auth, data, maxRedirects)
210 logger.debug(
"!!! implRes.status_code = %s", str(implRes.status_code))
219 if redirectsCount > 0:
224 logger.debug(
"!!! redirectsCount = %s, maxRedirects = %s", str(redirectsCount), str(maxRedirects))
228 logger.debug(
"!!!>>> applyHeaders: %s",
varDump(applyHeaders))
233 if implRes.status_code
not in self.
redirectCodes or not allowRedirects:
234 logger.debug(
"!!! break !!!")
238 raise requests.exceptions.TooManyRedirects(
'Exceeded %s redirects.' % str(maxRedirects))
254 def __sendRequest(self, url, method, timeout, headers, proxySetting, auth, data, maxRedirects):
256 logger.debug(
"!!! request arguments: " + str((url, timeout, headers, proxySetting, auth, data)))
257 logger.debug(
"!!! send request to url: %s", str(url))
259 rSession = requests.Session()
260 rSession.max_redirects = int(maxRedirects)
261 methodFunc = rSession.__getattribute__(method)
263 implRes = methodFunc(url,
266 allow_redirects=
False,
267 proxies=proxySetting,
273 hooks={
'response':[RequestsRedirectWrapper.checkRedirect]})
277 for redirect
in rSession.resolve_redirects(implRes, implRes.request):
278 redirectUrl = redirect.url
281 if redirect
is not None:
284 implRes.url = redirectUrl
285 logger.debug(
"!!! redirect.url: %s", str(redirectUrl))
287 return implRes, redirectUrl
296 cid = requests.structures.CaseInsensitiveDict(headers)
297 for name
in CONSTS.REDIRECT_HEADER_FIELDS_FOR_REMOVE:
298 for key, value
in cid.lower_items():
300 if key.lower() == name.lower():
302 headers = dict(cid.lower_items())
316 if self.
dbWrapper is not None and self.
siteId is not None and inputFilters
is not None:
339 filters = copy.deepcopy(inputFilters)
340 if filters
is not None:
341 for inputFilter
in filters:
342 if inputFilter.stage == Filters.STAGE_ALL
or inputFilter.stage == Filters.STAGE_REDIRECT_URL:
343 inputFilter.stage = Filters.STAGE_COLLECT_URLS
345 ret = CollectURLs.filtersApply(filters, url, 0,
None, 0,
None, Filters.OC_RE, Filters.STAGE_COLLECT_URLS)
378 logger.debug(
"!!! cookies: '%s'", str(cookies))
379 if cookies
is not None:
380 cookieResolver.addCookie(url, cookies)
393 logger.debug(
'!!! Headers before update by cookies:\n' + str(headers))
394 cookies = cookieResolver.getCookie(url, stage)
395 if cookies
is not None and isinstance(headers, dict):
396 headers[RequestsRedirectWrapper.REQUEST_COOKIE_HEADER_NAME] = cookies
397 logger.debug(
'!!! Cookies was updated ...Use headers:\n' + str(headers))
398 except Exception, err:
399 logger.error(
"!!! Error: %s", str(err))
412 logger.debug(
'handler: %s',
varDump(handler))
413 logger.debug(
'args = ' + str(args))
414 logger.debug(
'kwargs = ' + str(kwargs))
416 if handler
is not None:
417 handler.redirectCount += 1
418 if handler.redirectCount > handler.maxRedirects:
419 raise requests.exceptions.TooManyRedirects(
'Exceeded %s redirects.' % str(handler.maxRedirects))
428 logger.debug(
'args = ' + str(args))
429 logger.debug(
'kwargs = ' + str(kwargs))
430 logger.debug(
'r.url: %s', str(r.url))
431 logger.debug(
'r.status_code = %s', str(r.status_code))
def updateHeaderFields(headers)
def updateHeadersByCookies(headers, url, cookieResolver, stage=HTTPCookieResolver.STAGE_DEFAULT)
def requestCustom(self, url, method, timeout, headers, allowRedirects, proxySetting, auth, data, maxRedirects, filters)
def request(self, url, method, timeout, headers, allowRedirects, proxySetting, auth, data, maxRedirects, filters)
string REFERER_HEADER_NAME
def __sendRequest(self, url, method, timeout, headers, proxySetting, auth, data, maxRedirects)
string RESPONSE_COOKIE_HEADER_NAME
def requestBase(self, url, method, timeout, headers, allowRedirects, proxySetting, auth, data, maxRedirects, filters)
def __isAllowedUrl(self, url, inputFilters=None)
def __init__(self, dbWrapper=None, siteId=None, usageAlgorithm=DEFAULT_USAGE_ALGORITHM, redirectCodes=None)
def checkRedirect(r, args, kwargs)
string ERROR_BAD_STATUS_CODE_VALUE
def __checkResponse(self, res, filters)
def varDump(obj, stringify=True, strTypeMaxLen=256, strTypeCutSuffix='...', stringifyType=1, ignoreErrors=False, objectsHash=None, depth=0, indent=2, ensure_ascii=False, maxDepth=10)
def checkRedirectMax(handler, args, kwargs)
string REQUEST_COOKIE_HEADER_NAME
def __saveCookies(self, url, res, cookieResolver)
int USAGE_ALGORITHM_CUSTOM
def getTracebackInfo(linesNumberMax=None)