HCE Project Python language Distributed Tasks Manager Application, Distributed Crawler Application and client API bindings.  2.0.0-chaika
Hierarchical Cluster Engine Python language binding
RequestsRedirectWrapper.py
Go to the documentation of this file.
1 # coding: utf-8
2 """
3 HCE project, Python bindings, Distributed Tasks Manager application.
4 The RequestsRedirectWrapper class contains the main functionality for resolving redirects.
5 
6 @package: dc_crawler
7 @file RequestsRedirectWrapper.py
8 @author Alexander Vybornyh <alexander.hce.cluster@gmail.com>
9 @link: http://hierarchical-cluster-engine.com/
10 @copyright: Copyright &copy; 2013-2016 IOIX Ukraine
11 @license: http://hierarchical-cluster-engine.com/license/
12 @since: 0.1
13 """
14 import copy
15 import requests.exceptions
16 import requests.packages
17 from requests.packages.urllib3.exceptions import InsecureRequestWarning
18 
19 requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
20 
21 from app.Filters import Filters
22 from app.Utils import varDump
23 from app.Utils import getTracebackInfo
24 import app.Utils as Utils # pylint: disable=F0401
25 import dc_crawler.Constants as CONSTS
26 from dc_crawler.Exceptions import CrawlerFilterException
27 from dc_crawler.HTTPCookieResolver import HTTPCookieResolver
28 
29 logger = Utils.MPLogger().getLogger()
30 
31 
# Header names used by the wrapper when it reads or rewrites request/response headers
REQUEST_COOKIE_HEADER_NAME = 'Cookie'
RESPONSE_COOKIE_HEADER_NAME = 'set-cookie'
REFERER_HEADER_NAME = 'Referer'

# Identifiers of the supported ways of driving the 'requests' library
USAGE_ALGORITHM_BASE, USAGE_ALGORITHM_CUSTOM = 0, 1
DEFAULT_USAGE_ALGORITHM = USAGE_ALGORITHM_BASE

# Error message template: first '%s' is the offending status code, second is the allowed list
ERROR_BAD_STATUS_CODE_VALUE = "Not allowed status code '%s'. Allowed list: %s"
45 
46  # Constructor
def __init__(self, dbWrapper=None, siteId=None, usageAlgorithm=DEFAULT_USAGE_ALGORITHM, redirectCodes=None):
    """Remember the settings used later when requests are sent.

    @param dbWrapper - kept as self.dbWrapper
    @param siteId - kept as self.siteId
    @param usageAlgorithm - kept as self.usageAlgorithm, defaults to DEFAULT_USAGE_ALGORITHM
    @param redirectCodes - list of status codes kept as self.redirectCodes; any value that is
                           not a list is replaced by CONSTS.REDIRECT_HTTP_CODES
    """
    object.__init__(self)
    self.dbWrapper = dbWrapper
    self.siteId = siteId
    self.usageAlgorithm = usageAlgorithm
    if isinstance(redirectCodes, list):
        self.redirectCodes = redirectCodes
    else:
        self.redirectCodes = CONSTS.REDIRECT_HTTP_CODES
53 
54 
55  # @param url - the url to fetch
56  # @param method - fetch by HTTP method
57  # @param timeout - request timeout(seconds)
58  # @param headers - request headers dict
59  # @param allowRedirects - boolean flag allowed redirects
60  # @param proxySetting - proxy setting
61  # @param auth - basic auth setting, tuple of name and password
62  # @param data - post data, used only when method is post
63  # @param maxRedirects - max allowed redirects count
64  # @param filters - filters dict
65  # @return Response object
def request(self, url, method, timeout, headers, allowRedirects, proxySetting, auth, data, maxRedirects, filters):
    """Fetch `url` with the algorithm selected by self.usageAlgorithm.

    USAGE_ALGORITHM_BASE delegates to requestBase(). If that raises anything other than
    CrawlerFilterException, a 'head' request is sent first and the original method is then
    repeated using the headers of the 'head' response as request headers.
    USAGE_ALGORITHM_CUSTOM delegates to requestCustom().

    @param url - the url to fetch
    @param method - fetch by HTTP method
    @param timeout - request timeout(seconds)
    @param headers - request headers dict
    @param allowRedirects - boolean flag allowed redirects
    @param proxySetting - proxy setting
    @param auth - basic auth setting, tuple of name and password
    @param data - post data, used only when method is post
    @param maxRedirects - max allowed redirects count
    @param filters - filters dict
    @return Response object
    @raise CrawlerFilterException - propagated unchanged from requestBase()
    @raise Exception - when self.usageAlgorithm is not a supported value
    """
    implRes = None

    if self.usageAlgorithm == self.USAGE_ALGORITHM_BASE:
        try:
            implRes = self.requestBase(url, method, timeout, headers, allowRedirects, proxySetting,
                                       auth, data, maxRedirects, filters)
        except CrawlerFilterException:
            # A filter rejection must not trigger the fallback below; the bare raise keeps
            # the original traceback.
            raise
        except Exception:
            logger.debug("!!! Hard case. Don't worry. We will try using more complexity way...")
            implRes = self.requestBase(url, 'head', timeout, headers, allowRedirects, proxySetting,
                                       auth, data, maxRedirects, filters)
            if implRes is not None:
                logger.debug("!!! implRes.headers: %s", varDump(implRes.headers))
                implRes = self.requestBase(url, method, timeout, implRes.headers, allowRedirects, proxySetting,
                                           auth, data, maxRedirects, filters)

    elif self.usageAlgorithm == self.USAGE_ALGORITHM_CUSTOM:
        implRes = self.requestCustom(url, method, timeout, headers, allowRedirects, proxySetting, auth, data,
                                     maxRedirects, filters)

    else:
        raise Exception("Try using not support algorithm usage of 'requests' = %s" % (str(self.usageAlgorithm)))

    return implRes
93 
94 
95  # @param url - the url to fetch
96  # @param method - fetch by HTTP method
97  # @param timeout - request timeout(seconds)
98  # @param headers - request headers dict
99  # @param allowRedirects - boolean flag allowed redirects
100  # @param proxySetting - proxy setting
101  # @param auth - basic auth setting, tuple of name and password
102  # @param data - post data, used only when method is post
103  # @param maxRedirects - max allowed redirects count
104  # @param filters - filters dict
105  # @return Response object
def requestBase(self, url, method, timeout, headers, allowRedirects, proxySetting, auth, data, maxRedirects, filters):
    """Send one request through a fresh requests.Session and validate the response.

    The Referer header is removed from `headers` before sending. After a successful
    response, cookies from the response are appended to the 'Cookie' entry of `headers`,
    so the caller's mapping is modified in place.

    @param url - the url to fetch
    @param method - fetch by HTTP method
    @param timeout - request timeout(seconds)
    @param headers - request headers dict (modified in place), may be None
    @param allowRedirects - boolean flag allowed redirects
    @param proxySetting - proxy setting
    @param auth - basic auth setting, tuple of name and password
    @param data - post data, used only when method is post
    @param maxRedirects - max allowed redirects count
    @param filters - filters dict
    @return Response object
    @raise requests.exceptions.TooManyRedirects, CrawlerFilterException - re-raised without logging
    @raise Exception - any other error is logged and re-raised
    """
    res = None
    try:
        logger.debug("!!! headers: %s, type: %s", varDump(headers), str(type(headers)))

        # Drop the referer before the request object is built
        if headers is not None and self.REFERER_HEADER_NAME in headers:
            del headers[self.REFERER_HEADER_NAME]

        req = requests.Request(method=method,
                               url=url,
                               headers=headers,
                               auth=auth,
                               data=data,
                               hooks={'response': [RequestsRedirectWrapper.checkRedirect]})
        reqv = req.prepare()

        rSession = requests.Session()
        rSession.max_redirects = int(maxRedirects)
        rSession.stream = True
        rSession.verify = False  # don't verify ssl
        rSession.proxies = proxySetting
        res = rSession.send(request=reqv, allow_redirects=allowRedirects, timeout=timeout)

        self.__checkResponse(res, filters)

        logger.debug("!!! res.cookies: %s, type: %s", varDump(res.cookies), str(type(res.cookies)))
        cookies = requests.utils.dict_from_cookiejar(res.cookies)
        logger.debug("!!! cookies: %s, type: %s", varDump(cookies), str(type(cookies)))
        if len(cookies) > 0 and headers is not None:
            cookiesList = [key + '=' + value for key, value in cookies.items()]
            if self.REQUEST_COOKIE_HEADER_NAME in headers:
                # Keep the cookies already present, separated from the new ones. Trailing
                # ';' and blanks are stripped first so no empty pair is produced.
                existing = headers[self.REQUEST_COOKIE_HEADER_NAME].rstrip().rstrip(';')
                if existing:
                    cookiesList.insert(0, existing)
            headers[self.REQUEST_COOKIE_HEADER_NAME] = ';'.join(cookiesList)
            logger.debug("!!! headers updated by 'cookies': %s", varDump(headers))

    except (requests.exceptions.TooManyRedirects, CrawlerFilterException):
        # Expected outcomes: pass through with the original traceback and without extra logging
        raise
    except Exception as err:
        logger.debug("!!! We have a problem: %s", str(err))
        logger.info(getTracebackInfo())
        raise

    return res
158 
159 
160  # #Check allowed Response object
161  # # if check not passed will be raise exception
162  #
163  # @param res - Response object
164  # @param filters - filters dict
165  # @return - None
def __checkResponse(self, res, filters):
    """Validate a response and its redirect history; raise when a check fails.

    @param res - Response object
    @param filters - filters dict
    @return - None
    @raise CrawlerFilterException - the final url or a url from the history is rejected by the filters
    @raise requests.exceptions.TooManyRedirects - a history entry has a status code outside
           self.redirectCodes (checked only when self.redirectCodes is a list)
    """
    logger.debug("!!! res.url: %s", varDump(res.url))
    if not self.__isAllowedUrl(res.url, filters):
        raise CrawlerFilterException("Url %s not passed filter" % str(res.url))

    # First pass: every intermediate url has to pass the filters
    for hop in res.history:
        logger.debug("!!! history.url: %s", varDump(hop.url))
        logger.debug("!!! history.status_code: %s", varDump(hop.status_code))
        if not self.__isAllowedUrl(hop.url, filters):
            raise CrawlerFilterException("Url %s not passed filter" % str(hop.url))

    if not isinstance(self.redirectCodes, list):
        return

    # Second pass: every intermediate response has to carry an allowed status code
    for hop in res.history:
        if hop.status_code in self.redirectCodes:
            continue
        raise requests.exceptions.TooManyRedirects(
            self.ERROR_BAD_STATUS_CODE_VALUE % (str(hop.status_code), str(self.redirectCodes)))
183 
184 
185  # @param url - the url to fetch
186  # @param method - fetch by HTTP method
187  # @param timeout - request timeout(seconds)
188  # @param headers - request headers dict
189  # @param allowRedirects - boolean flag allowed redirects
190  # @param proxySetting - proxy setting
191  # @param auth - basic auth setting, tuple of name and password
192  # @param data - post data, used only when method is post
193  # @param maxRedirects - max allowed redirects count
194  # @param filters - filters dict
195  # @return Response object
def requestCustom(self, url, method, timeout, headers, allowRedirects, proxySetting, auth, data, maxRedirects,
                  filters):
    """Follow redirects step by step, tracking cookies with an HTTPCookieResolver.

    Each iteration calls __sendRequest(), stores the response cookies, rebuilds the headers
    for the next step and continues from the url returned by __sendRequest(). The url filter
    is applied from the second iteration on.

    @param url - the url to fetch
    @param method - fetch by HTTP method
    @param timeout - request timeout(seconds)
    @param headers - request headers dict (a deep copy is used, the argument is not modified)
    @param allowRedirects - boolean flag allowed redirects
    @param proxySetting - proxy setting
    @param auth - basic auth setting, tuple of name and password
    @param data - post data, used only when method is post
    @param maxRedirects - max allowed redirects count
    @param filters - filters dict
    @return Response object, or None when maxRedirects allows no request at all
    @raise CrawlerFilterException - a followed url is rejected by the filters
    @raise requests.exceptions.TooManyRedirects - the last response still has a redirect status
    """
    implRes = None
    applyHeaders = copy.deepcopy(headers)

    cookieResolver = HTTPCookieResolver()
    redirectsLimit = int(maxRedirects)
    redirectsCount = 0

    while redirectsCount < redirectsLimit:
        implRes, localUrl = self.__sendRequest(url, method, timeout, applyHeaders, proxySetting,
                                               auth, data, maxRedirects)

        logger.debug("!!! implRes.status_code = %s", str(implRes.status_code))

        self.__saveCookies(url, implRes, cookieResolver)

        if redirectsCount > 0 and not self.__isAllowedUrl(localUrl, filters):
            raise CrawlerFilterException("Url %s not passed filter" % str(localUrl))

        redirectsCount += 1
        logger.debug("!!! redirectsCount = %s, maxRedirects = %s", str(redirectsCount), str(maxRedirects))

        # remove referer and other fields from header
        applyHeaders = self.updateHeaderFields(applyHeaders)
        logger.debug("!!!>>> applyHeaders: %s", varDump(applyHeaders))

        applyHeaders = self.updateHeadersByCookies(applyHeaders, localUrl, cookieResolver)
        url = localUrl

        if implRes.status_code not in self.redirectCodes or not allowRedirects:
            logger.debug("!!! break !!!")
            break

    # implRes stays None when the loop body never ran (maxRedirects <= 0); guard the attribute access.
    # NOTE(review): this check also fires when the loop stopped because allowRedirects is false
    # and the last response is a redirect - confirm that this is intended.
    if implRes is not None and implRes.status_code in self.redirectCodes:
        raise requests.exceptions.TooManyRedirects('Exceeded %s redirects.' % str(maxRedirects))

    return implRes
241 
242 
243  # # Send request
244  #
245  # @param url - the next url to fetch
246  # @param method - fetch by HTTP method
247  # @param timeout - request timeout(seconds)
248  # @param headers - request headers dict
249  # @param proxySetting - proxy setting
250  # @param auth - basic auth setting, tuple of name and password
251  # @param data - post data, used only when method is post
252  # @param maxRedirects - max allowed redirects count
253  # @return tuple of the response object and the url it was finally fetched from
def __sendRequest(self, url, method, timeout, headers, proxySetting, auth, data, maxRedirects):
    """Send one request without automatic redirects and follow at most one redirect hop.

    The first request is sent with allow_redirects=False. If the session's redirect
    resolver yields a response, that response replaces the first one. The followed hop is
    sent with the same proxy, timeout, streaming and ssl settings as the first request.

    @param url - the next url to fetch
    @param method - HTTP method name, matched case-insensitively against the session methods
    @param timeout - request timeout(seconds)
    @param headers - request headers dict
    @param proxySetting - proxy setting
    @param auth - basic auth setting, tuple of name and password
    @param data - post data, used only when method is post
    @param maxRedirects - max allowed redirects count
    @return tuple (response object, url of the returned response)
    """
    logger.debug("!!! request arguments: " + str((url, timeout, headers, proxySetting, auth, data)))
    logger.debug("!!! send request to url: %s", str(url))

    rSession = requests.Session()
    rSession.max_redirects = int(maxRedirects)
    # Session exposes lower-case method helpers (get, head, post, ...)
    methodFunc = getattr(rSession, method.lower())

    implRes = methodFunc(url,
                         timeout=timeout,
                         headers=headers,
                         allow_redirects=False,
                         proxies=proxySetting,
                         auth=auth,
                         data=data,
                         stream=True,
                         verify=False,  # don't verify ssl
                         hooks={'response': [RequestsRedirectWrapper.checkRedirect]})

    # Take only the first hop; the caller drives the remaining redirects itself
    redirect = next(rSession.resolve_redirects(implRes, implRes.request,
                                               stream=True,
                                               timeout=timeout,
                                               verify=False,
                                               proxies=proxySetting), None)

    redirectUrl = url
    if redirect is not None:
        redirectUrl = redirect.url
        implRes = redirect

    implRes.url = redirectUrl
    logger.debug("!!! redirect.url: %s", str(redirectUrl))

    return implRes, redirectUrl
288 
289 
290  # # update headers fields
291  #
292  # @param headers - request headers dict
293  # @return headers - result headers already updated
@staticmethod
def updateHeaderFields(headers):
    """Return a copy of `headers` without the fields listed in CONSTS.REDIRECT_HEADER_FIELDS_FOR_REMOVE.

    Field names are matched case-insensitively. The result is a new plain dict built from
    CaseInsensitiveDict.lower_items(), so its keys are lower-cased; the argument is not modified.

    @param headers - request headers dict
    @return headers - new dict with the listed fields removed
    """
    cid = requests.structures.CaseInsensitiveDict(headers)
    for name in CONSTS.REDIRECT_HEADER_FIELDS_FOR_REMOVE:
        # pop() is case-insensitive here and avoids deleting while iterating over the mapping
        cid.pop(name, None)

    return dict(cid.lower_items())
305 
306 
307  # # check allowed url by filter
308  #
309  # @param url - url for check
310  # @param inputFilters - filters dict
311  # @return - True if allowed by filter or otherwise False
def __isAllowedUrl(self, url, inputFilters=None):
    """Tell whether `url` passes the redirect filters.

    The check is skipped (result True) when self.dbWrapper, self.siteId or `inputFilters`
    is None. Otherwise a deep copy of the filters is made, entries with stage STAGE_ALL or
    STAGE_REDIRECT_URL are switched to STAGE_COLLECT_URLS, and CollectURLs.filtersApply()
    gives the result.

    @param url - url for check
    @param inputFilters - filters dict
    @return - True if allowed by filter or otherwise False
    """
    if self.dbWrapper is None or self.siteId is None or inputFilters is None:
        return True

    # Imported here rather than at module level, as in the original code
    from dc_crawler.CollectURLs import CollectURLs

    filters = copy.deepcopy(inputFilters)
    if filters is not None:
        for item in filters:
            if item.stage == Filters.STAGE_ALL or item.stage == Filters.STAGE_REDIRECT_URL:
                item.stage = Filters.STAGE_COLLECT_URLS

    return CollectURLs.filtersApply(filters, url, 0, None, 0, None, Filters.OC_RE, Filters.STAGE_COLLECT_URLS)
348 
349 
350 # # # save cookies
351 # #
352 # # @param url - url string
353 # # @param headers - request headers dict
354 # # @param cookieResolver - cookie resolver instance
355 # # @return - None
356 # def __saveCookies(self, url, headers, cookieResolver):
357 #
358 # if self.REQUEST_COOKIE_HEADER_NAME in headers:
359 # cookies = headers[self.REQUEST_COOKIE_HEADER_NAME]
360 # logger.debug("!!! cookies: '%s'", str(cookies))
361 # cookieResolver.addCookie(url, cookies)
362 #
363 # if self.RESPONSE_COOKIE_HEADER_NAME in headers:
364 # cookies = headers[self.RESPONSE_COOKIE_HEADER_NAME]
365 # logger.debug("!!! cookies: '%s'", str(cookies))
366 # cookieResolver.addCookie(url, cookies)
367 
368  # # save cookies
369  #
370  # @param url - url string
371  # @param res - response object
372  # @param cookieResolver - cookie resolver instance
373  # @return - None
def __saveCookies(self, url, res, cookieResolver):
    """Pass the 'set-cookie' header of a response to the cookie resolver.

    @param url - url string the cookies are registered for
    @param res - response object
    @param cookieResolver - cookie resolver instance
    @return - None
    """
    headerName = self.RESPONSE_COOKIE_HEADER_NAME
    if headerName not in res.headers:
        return

    cookies = res.headers[headerName]
    logger.debug("!!! cookies: '%s'", str(cookies))
    if cookies is None:
        return

    cookieResolver.addCookie(url, cookies)
381 
382 
383  # # Update headers by cached cookies
384  #
385  # @param headers - headers values dict
386  # @param url - url string
387  # @param cookieResolver - cookie resolver instance
388  # @param stage - allowed stage of usage (support of different stages use bitmask)
389  # @return updated headers object
@staticmethod
def updateHeadersByCookies(headers, url, cookieResolver, stage=HTTPCookieResolver.STAGE_DEFAULT):
    """Set the 'Cookie' header from the cookies cached for `url`.

    Nothing is changed when the resolver returns None or `headers` is not a dict. Errors are
    logged and swallowed, so the headers are always returned.

    @param headers - headers values dict (modified in place)
    @param url - url string
    @param cookieResolver - cookie resolver instance
    @param stage - allowed stage of usage (support of different stages use bitmask)
    @return updated headers object
    """
    try:
        logger.debug('!!! Headers before update by cookies:\n' + str(headers))
        cookies = cookieResolver.getCookie(url, stage)
        if cookies is not None and isinstance(headers, dict):
            cookieName = RequestsRedirectWrapper.REQUEST_COOKIE_HEADER_NAME
            # updateHeaderFields() returns lower-cased keys, so a 'cookie' entry may already
            # exist; remove other spellings to avoid sending two conflicting cookie headers.
            staleKeys = [key for key in headers
                         if key != cookieName and hasattr(key, 'lower') and key.lower() == cookieName.lower()]
            for key in staleKeys:
                del headers[key]
            headers[cookieName] = cookies
            logger.debug('!!! Cookies was updated ...Use headers:\n' + str(headers))
    except Exception as err:
        # Best effort: a cookie lookup failure must not break the request flow
        logger.error("!!! Error: %s", str(err))

    return headers
402 
403 
404  # # The next two callback functions are not used yet, but may be used in the future.
405  # For now they show different samples of hook usage.
406 
407  # # check redirect max allowed count
408  #
409  # @param handler - RequestsRedirectWrapper instance reference
@staticmethod
def checkRedirectMax(handler, *args, **kwargs):
    """Hook sample: count a redirect on `handler` and fail once its limit is exceeded.

    Does nothing except logging when `handler` is None.

    @param handler - object with `redirectCount` and `maxRedirects` attributes
    @raise requests.exceptions.TooManyRedirects - handler.redirectCount exceeds handler.maxRedirects
    """
    logger.debug('handler: %s', varDump(handler))
    logger.debug('args = ' + str(args))
    logger.debug('kwargs = ' + str(kwargs))

    if handler is None:
        return

    handler.redirectCount += 1
    if handler.redirectCount > handler.maxRedirects:
        raise requests.exceptions.TooManyRedirects('Exceeded %s redirects.' % str(handler.maxRedirects))
420 
421 
422  # # check redirect
423  #
424  # @param r - result object instance
@staticmethod
def checkRedirect(r, *args, **kwargs):
    """Response hook that only writes the hook arguments, url and status code to the debug log.

    @param r - result object instance
    """
    for message in ('args = ' + str(args), 'kwargs = ' + str(kwargs)):
        logger.debug(message)
    logger.debug('r.url: %s', str(r.url))
    logger.debug('r.status_code = %s', str(r.status_code))
def updateHeadersByCookies(headers, url, cookieResolver, stage=HTTPCookieResolver.STAGE_DEFAULT)
def requestCustom(self, url, method, timeout, headers, allowRedirects, proxySetting, auth, data, maxRedirects, filters)
def request(self, url, method, timeout, headers, allowRedirects, proxySetting, auth, data, maxRedirects, filters)
def __sendRequest(self, url, method, timeout, headers, proxySetting, auth, data, maxRedirects)
def requestBase(self, url, method, timeout, headers, allowRedirects, proxySetting, auth, data, maxRedirects, filters)
def __init__(self, dbWrapper=None, siteId=None, usageAlgorithm=DEFAULT_USAGE_ALGORITHM, redirectCodes=None)
def varDump(obj, stringify=True, strTypeMaxLen=256, strTypeCutSuffix='...', stringifyType=1, ignoreErrors=False, objectsHash=None, depth=0, indent=2, ensure_ascii=False, maxDepth=10)
Definition: Utils.py:410
Definition: join.py:1
def getTracebackInfo(linesNumberMax=None)
Definition: Utils.py:218