HCE Project Python language Distributed Tasks Manager Application, Distributed Crawler Application and client API bindings.  2.0.0-chaika
Hierarchical Cluster Engine Python language binding
dc_crawler.RequestsRedirectWrapper.RequestsRedirectWrapper Class Reference
Inheritance diagram for dc_crawler.RequestsRedirectWrapper.RequestsRedirectWrapper:
Collaboration diagram for dc_crawler.RequestsRedirectWrapper.RequestsRedirectWrapper:

Public Member Functions

def __init__ (self, dbWrapper=None, siteId=None, usageAlgorithm=DEFAULT_USAGE_ALGORITHM, redirectCodes=None)
 
def request (self, url, method, timeout, headers, allowRedirects, proxySetting, auth, data, maxRedirects, filters)
 
def requestBase (self, url, method, timeout, headers, allowRedirects, proxySetting, auth, data, maxRedirects, filters)
 
def requestCustom (self, url, method, timeout, headers, allowRedirects, proxySetting, auth, data, maxRedirects, filters)
 

Static Public Member Functions

def updateHeaderFields (headers)
 
def updateHeadersByCookies (headers, url, cookieResolver, stage=HTTPCookieResolver.STAGE_DEFAULT)
 
def checkRedirectMax (handler, args, kwargs)
 
def checkRedirect (r, args, kwargs)
 

Public Attributes

 dbWrapper
 
 siteId
 
 usageAlgorithm
 
 redirectCodes
 

Static Public Attributes

string REQUEST_COOKIE_HEADER_NAME = 'Cookie'
 
string RESPONSE_COOKIE_HEADER_NAME = 'set-cookie'
 
string REFERER_HEADER_NAME = 'Referer'
 
int USAGE_ALGORITHM_BASE = 0
 
int USAGE_ALGORITHM_CUSTOM = 1
 
int DEFAULT_USAGE_ALGORITHM = USAGE_ALGORITHM_BASE
 
string ERROR_BAD_STATUS_CODE_VALUE = "Not allowed status code '%s'. Allowed list: %s"
 

Private Member Functions

def __checkResponse (self, res, filters)
 
def __sendRequest (self, url, method, timeout, headers, proxySetting, auth, data, maxRedirects)
 
def __isAllowedUrl (self, url, inputFilters=None)
 
def __saveCookies (self, url, res, cookieResolver)
 

Detailed Description

Definition at line 32 of file RequestsRedirectWrapper.py.

Constructor & Destructor Documentation

◆ __init__()

def dc_crawler.RequestsRedirectWrapper.RequestsRedirectWrapper.__init__ (   self,
  dbWrapper = None,
  siteId = None,
  usageAlgorithm = DEFAULT_USAGE_ALGORITHM,
  redirectCodes = None 
)

Definition at line 47 of file RequestsRedirectWrapper.py.

def __init__(self, dbWrapper=None, siteId=None, usageAlgorithm=DEFAULT_USAGE_ALGORITHM, redirectCodes=None):
    """Initialize the redirect wrapper.

    @param dbWrapper - DB wrapper instance used by the URL filters (may be None)
    @param siteId - site identifier used by the URL filters (may be None)
    @param usageAlgorithm - USAGE_ALGORITHM_BASE or USAGE_ALGORITHM_CUSTOM selector
    @param redirectCodes - explicit list of allowed redirect HTTP codes, or None
    """
    object.__init__(self)
    self.dbWrapper = dbWrapper
    self.siteId = siteId
    self.usageAlgorithm = usageAlgorithm
    # Any non-list value (including None) falls back to the project-wide code list
    if isinstance(redirectCodes, list):
        self.redirectCodes = redirectCodes
    else:
        self.redirectCodes = CONSTS.REDIRECT_HTTP_CODES
def __init__(self)
constructor
Definition: UIDGenerator.py:19

Member Function Documentation

◆ __checkResponse()

def dc_crawler.RequestsRedirectWrapper.RequestsRedirectWrapper.__checkResponse (   self,
  res,
  filters 
)
private

Definition at line 166 of file RequestsRedirectWrapper.py.

def __checkResponse(self, res, filters):
    """Validate the final URL and every redirect hop of *res*.

    @param res - requests response object (with .url and .history)
    @param filters - filters passed through to __isAllowedUrl
    Raises CrawlerFilterException when a URL fails the filters, and
    requests.exceptions.TooManyRedirects when a hop used a status code
    outside self.redirectCodes.
    """
    logger.debug("!!! res.url: %s", varDump(res.url))
    if not self.__isAllowedUrl(res.url, filters):
        raise CrawlerFilterException("Url %s not passed filter" % str(res.url))

    # First pass: every intermediate hop URL must pass the filters
    for hop in res.history:
        logger.debug("!!! history.url: %s", varDump(hop.url))
        logger.debug("!!! history.status_code: %s", varDump(hop.status_code))
        if not self.__isAllowedUrl(hop.url, filters):
            raise CrawlerFilterException("Url %s not passed filter" % str(hop.url))

    # Second pass: every hop must have used an allowed redirect status code
    if isinstance(self.redirectCodes, list):
        for hop in res.history:
            if hop.status_code not in self.redirectCodes:
                raise requests.exceptions.TooManyRedirects(
                    self.ERROR_BAD_STATUS_CODE_VALUE % (str(hop.status_code), str(self.redirectCodes)))
def varDump(obj, stringify=True, strTypeMaxLen=256, strTypeCutSuffix='...', stringifyType=1, ignoreErrors=False, objectsHash=None, depth=0, indent=2, ensure_ascii=False, maxDepth=10)
Definition: Utils.py:410
Here is the call graph for this function:
Here is the caller graph for this function:

◆ __isAllowedUrl()

def dc_crawler.RequestsRedirectWrapper.RequestsRedirectWrapper.__isAllowedUrl (   self,
  url,
  inputFilters = None 
)
private

Definition at line 312 of file RequestsRedirectWrapper.py.

def __isAllowedUrl(self, url, inputFilters=None):
    """Check whether *url* passes the redirect-stage regular-expression filters.

    @param url - url string to check
    @param inputFilters - iterable of filter objects (with a .stage attribute), or None
    @return True when the url is allowed; also True when dbWrapper, siteId or
            inputFilters is missing (no filtering possible in that case).
    """
    # variable for result
    ret = True

    if self.dbWrapper is not None and self.siteId is not None and inputFilters is not None:
        # Check redirect url use regular expression.
        # Local import — presumably avoids a circular import with CollectURLs; TODO confirm.
        from dc_crawler.CollectURLs import CollectURLs

        # Work on a deep copy so the caller's filter objects are never mutated:
        # remap STAGE_ALL / STAGE_REDIRECT_URL filters onto STAGE_COLLECT_URLS,
        # the stage that CollectURLs.filtersApply evaluates below.
        filters = copy.deepcopy(inputFilters)
        if filters is not None:
            for inputFilter in filters:
                if inputFilter.stage in (Filters.STAGE_ALL, Filters.STAGE_REDIRECT_URL):
                    inputFilter.stage = Filters.STAGE_COLLECT_URLS

        ret = CollectURLs.filtersApply(filters, url, 0, None, 0, None, Filters.OC_RE, Filters.STAGE_COLLECT_URLS)

    return ret
350 # # # save cookies
351 # #
352 # # @param url - url string
353 # # @param headers - request headers dict
354 # # @param cookieResolver - cookie resolver instance
355 # # @return - None
356 # def __saveCookies(self, url, headers, cookieResolver):
357 #
358 # if self.REQUEST_COOKIE_HEADER_NAME in headers:
359 # cookies = headers[self.REQUEST_COOKIE_HEADER_NAME]
360 # logger.debug("!!! cookies: '%s'", str(cookies))
361 # cookieResolver.addCookie(url, cookies)
362 #
363 # if self.RESPONSE_COOKIE_HEADER_NAME in headers:
364 # cookies = headers[self.RESPONSE_COOKIE_HEADER_NAME]
365 # logger.debug("!!! cookies: '%s'", str(cookies))
366 # cookieResolver.addCookie(url, cookies)
367 
Here is the caller graph for this function:

◆ __saveCookies()

def dc_crawler.RequestsRedirectWrapper.RequestsRedirectWrapper.__saveCookies (   self,
  url,
  res,
  cookieResolver 
)
private

Definition at line 374 of file RequestsRedirectWrapper.py.

def __saveCookies(self, url, res, cookieResolver):
    """Persist the response 'set-cookie' header (when present) into the resolver.

    @param url - url string the cookies belong to
    @param res - requests response object
    @param cookieResolver - cookie resolver instance receiving the cookies
    @return None
    """
    responseHeaders = res.headers
    if self.RESPONSE_COOKIE_HEADER_NAME in responseHeaders:
        cookies = responseHeaders[self.RESPONSE_COOKIE_HEADER_NAME]
        logger.debug("!!! cookies: '%s'", str(cookies))
        if cookies is not None:
            cookieResolver.addCookie(url, cookies)
Here is the caller graph for this function:

◆ __sendRequest()

def dc_crawler.RequestsRedirectWrapper.RequestsRedirectWrapper.__sendRequest (   self,
  url,
  method,
  timeout,
  headers,
  proxySetting,
  auth,
  data,
  maxRedirects 
)
private

Definition at line 254 of file RequestsRedirectWrapper.py.

def __sendRequest(self, url, method, timeout, headers, proxySetting, auth, data, maxRedirects):
    """Send one HTTP request with automatic redirects disabled and resolve at
    most one redirect hop manually.

    @param url - request url string
    @param method - requests session method name ('get', 'post', 'head', ...)
    @param timeout, headers, proxySetting, auth, data - passed through to requests
    @param maxRedirects - upper bound set on the session
    @return tuple (response, finalUrl); response.url is overwritten with finalUrl
    """
    logger.debug("!!! request arguments: " + str((url, timeout, headers, proxySetting, auth, data)))
    logger.debug("!!! send request to url: %s", str(url))

    rSession = requests.Session()
    rSession.max_redirects = int(maxRedirects)
    # getattr is the idiomatic way to pick the bound session method;
    # __getattribute__ should not be called directly
    methodFunc = getattr(rSession, method)

    implRes = methodFunc(url,
                         timeout=timeout,
                         headers=headers,
                         allow_redirects=False,
                         proxies=proxySetting,
                         auth=auth,
                         data=data,
                         stream=True,
                         verify=False,  # don't verify ssl
                         # hooks={'response':[RequestsRedirectWrapper.checkRedirectMax(handler=self)]}) # reserved
                         hooks={'response': [RequestsRedirectWrapper.checkRedirect]})

    # resolve_redirects is a generator; take only the first hop, if any
    redirect = None
    redirectUrl = url
    for redirect in rSession.resolve_redirects(implRes, implRes.request):
        redirectUrl = redirect.url
        break

    if redirect is not None:
        implRes = redirect

    implRes.url = redirectUrl
    logger.debug("!!! redirect.url: %s", str(redirectUrl))

    return implRes, redirectUrl
Here is the caller graph for this function:

◆ checkRedirect()

def dc_crawler.RequestsRedirectWrapper.RequestsRedirectWrapper.checkRedirect (   r,
  args,
  kwargs 
)
static

Definition at line 426 of file RequestsRedirectWrapper.py.

def checkRedirect(r, *args, **kwargs):
    """'response' hook handed to requests: log each response's URL and status code.

    Purely diagnostic — it never modifies or short-circuits the response.

    @param r - requests response object delivered by the hook machinery
    @param args, kwargs - extra hook arguments (logged only)
    """
    # logger.debug('r: %s', varDump(r))
    logger.debug('args = ' + str(args))
    logger.debug('kwargs = ' + str(kwargs))
    logger.debug('r.url: %s', str(r.url))
    logger.debug('r.status_code = %s', str(r.status_code))

◆ checkRedirectMax()

def dc_crawler.RequestsRedirectWrapper.RequestsRedirectWrapper.checkRedirectMax (   handler,
  args,
  kwargs 
)
static

Definition at line 411 of file RequestsRedirectWrapper.py.

def checkRedirectMax(handler, *args, **kwargs):
    """Reserved 'response' hook: count redirects on *handler* and raise once the limit is exceeded.

    @param handler - object carrying redirectCount / maxRedirects counters, or None
    @param args, kwargs - extra hook arguments (logged only)
    Raises requests.exceptions.TooManyRedirects when the counter passes the limit.
    """
    logger.debug('handler: %s', varDump(handler))
    logger.debug('args = ' + str(args))
    logger.debug('kwargs = ' + str(kwargs))

    if handler is None:
        return

    handler.redirectCount += 1
    if handler.redirectCount > handler.maxRedirects:
        raise requests.exceptions.TooManyRedirects('Exceeded %s redirects.' % str(handler.maxRedirects))
def varDump(obj, stringify=True, strTypeMaxLen=256, strTypeCutSuffix='...', stringifyType=1, ignoreErrors=False, objectsHash=None, depth=0, indent=2, ensure_ascii=False, maxDepth=10)
Definition: Utils.py:410
Here is the call graph for this function:

◆ request()

def dc_crawler.RequestsRedirectWrapper.RequestsRedirectWrapper.request (   self,
  url,
  method,
  timeout,
  headers,
  allowRedirects,
  proxySetting,
  auth,
  data,
  maxRedirects,
  filters 
)

Definition at line 66 of file RequestsRedirectWrapper.py.

def request(self, url, method, timeout, headers, allowRedirects, proxySetting, auth, data, maxRedirects, filters):
    """Perform an HTTP request using the algorithm selected by self.usageAlgorithm.

    For USAGE_ALGORITHM_BASE a failed request is retried the "hard way":
    a HEAD probe first, then the real request replayed with the probed headers.

    @param url, method, timeout, headers, allowRedirects, proxySetting, auth,
           data, maxRedirects, filters - forwarded to requestBase/requestCustom
    @return response object, or None
    Raises CrawlerFilterException from the filters, or Exception for an
    unsupported usageAlgorithm value.
    """
    # variable for return
    implRes = None

    if self.usageAlgorithm == self.USAGE_ALGORITHM_BASE:
        try:
            implRes = self.requestBase(url, method, timeout, headers, allowRedirects, proxySetting,
                                       auth, data, maxRedirects, filters)
        except CrawlerFilterException:
            # Contractual error for the caller: bare raise keeps the original traceback
            # (the former 'raise err' form discarded it)
            raise
        except Exception:
            logger.debug("!!! Hard case. Don't worry. We will try using more complexity way...")
            # Probe with HEAD, then replay the real request with the probed headers
            implRes = self.requestBase(url, 'head', timeout, headers, allowRedirects, proxySetting,
                                       auth, data, maxRedirects, filters)
            if implRes is not None:
                logger.debug("!!! implRes.headers: %s", varDump(implRes.headers))
                implRes = self.requestBase(url, method, timeout, implRes.headers, allowRedirects, proxySetting,
                                           auth, data, maxRedirects, filters)

    elif self.usageAlgorithm == self.USAGE_ALGORITHM_CUSTOM:
        implRes = self.requestCustom(url, method, timeout, headers, allowRedirects, proxySetting, auth, data,
                                     maxRedirects, filters)

    else:
        raise Exception("Try using not support algorithm usage of 'requests' = %s" % (str(self.usageAlgorithm)))

    return implRes
def varDump(obj, stringify=True, strTypeMaxLen=256, strTypeCutSuffix='...', stringifyType=1, ignoreErrors=False, objectsHash=None, depth=0, indent=2, ensure_ascii=False, maxDepth=10)
Definition: Utils.py:410
Here is the call graph for this function:

◆ requestBase()

def dc_crawler.RequestsRedirectWrapper.RequestsRedirectWrapper.requestBase (   self,
  url,
  method,
  timeout,
  headers,
  allowRedirects,
  proxySetting,
  auth,
  data,
  maxRedirects,
  filters 
)

Definition at line 106 of file RequestsRedirectWrapper.py.

def requestBase(self, url, method, timeout, headers, allowRedirects, proxySetting, auth, data, maxRedirects, filters):
    """Execute the request via a prepared requests.Session, validate the response
    with __checkResponse and merge response cookies back into *headers*.

    @param url, method, timeout, headers, allowRedirects, proxySetting, auth,
           data, maxRedirects - request parameters; *headers* is mutated in place
           (Referer removed, Cookie appended)
    @param filters - passed to __checkResponse for URL filtering
    @return response object
    Raises requests.exceptions.TooManyRedirects, CrawlerFilterException, or the
    underlying request exception after logging it.
    """
    # variable for return
    res = None
    try:
        req = requests.Request(method=method,
                               url=url,
                               headers=headers,
                               auth=auth,
                               data=data,
                               hooks={'response': [RequestsRedirectWrapper.checkRedirect]})

        logger.debug("!!! headers: %s, type: %s", varDump(headers), str(type(headers)))

        # Do not leak the previous hop's referer into this request
        if self.REFERER_HEADER_NAME in headers:
            del headers[self.REFERER_HEADER_NAME]

        reqv = req.prepare()

        rSession = requests.Session()
        rSession.max_redirects = int(maxRedirects)
        rSession.stream = True
        rSession.verify = False  # don't verify ssl
        rSession.proxies = proxySetting
        res = rSession.send(request=reqv, allow_redirects=allowRedirects, timeout=timeout)

        self.__checkResponse(res, filters)

        logger.debug("!!! res.cookies: %s, type: %s", varDump(res.cookies), str(type(res.cookies)))
        cookies = requests.utils.dict_from_cookiejar(res.cookies)
        logger.debug("!!! cookies: %s, type: %s", varDump(cookies), str(type(cookies)))
        if len(cookies) > 0:
            cookiesList = [key + '=' + value for key, value in cookies.items()]
            cookie = ''
            if self.REQUEST_COOKIE_HEADER_NAME in headers:
                cookie = headers[self.REQUEST_COOKIE_HEADER_NAME]
            headers[self.REQUEST_COOKIE_HEADER_NAME] = cookie + (';'.join(cookiesList))
            logger.debug("!!! headers updated by 'cookies': %s", varDump(headers))

    # Both are contractual errors for the caller; a bare raise keeps the original
    # traceback, which the former 'raise err' form discarded
    except (requests.exceptions.TooManyRedirects, CrawlerFilterException):
        raise
    # 'as' form is valid on Python 2.6+ and 3, unlike the old 'except X, err' syntax
    except Exception as err:
        logger.debug("!!! We have a problem: %s", str(err))
        logger.info(getTracebackInfo())
        raise

    return res
def varDump(obj, stringify=True, strTypeMaxLen=256, strTypeCutSuffix='...', stringifyType=1, ignoreErrors=False, objectsHash=None, depth=0, indent=2, ensure_ascii=False, maxDepth=10)
Definition: Utils.py:410
Definition: join.py:1
def getTracebackInfo(linesNumberMax=None)
Definition: Utils.py:218
Here is the call graph for this function:
Here is the caller graph for this function:

◆ requestCustom()

def dc_crawler.RequestsRedirectWrapper.RequestsRedirectWrapper.requestCustom (   self,
  url,
  method,
  timeout,
  headers,
  allowRedirects,
  proxySetting,
  auth,
  data,
  maxRedirects,
  filters 
)

Definition at line 197 of file RequestsRedirectWrapper.py.

197  filters):
198  # variable for return
199  implRes = None
200  applyHeaders = copy.deepcopy(headers)
201  # logger.debug("!!! request enter ... applyHeaders: %s", varDump(applyHeaders))
202 
203  cookieResolver = HTTPCookieResolver()
204  redirectsCount = 0
205 
206  while redirectsCount < int(maxRedirects):
207  implRes, localUrl = self.__sendRequest(url, method, timeout, applyHeaders, proxySetting, \
208  auth, data, maxRedirects)
209 
210  logger.debug("!!! implRes.status_code = %s", str(implRes.status_code))
211  # logger.debug("!!! implRes.headers: %s", str(implRes.headers))
212  # logger.debug("!!! implRes.cookies: %s, type: %s", str(implRes.cookies), str(type(implRes.cookies)))
213  # logger.debug("!!! implRes: %s, type: %s", varDump(implRes, maxDepth=10), str(type(implRes)))
214 
215  self.__saveCookies(url, implRes, cookieResolver)
216 
217  # logger.debug("!!! cookieResolver: %s", varDump(cookieResolver))
218 
219  if redirectsCount > 0:
220  if not self.__isAllowedUrl(localUrl, filters):
221  raise CrawlerFilterException("Url %s not passed filter" % str(localUrl))
222 
223  redirectsCount += 1
224  logger.debug("!!! redirectsCount = %s, maxRedirects = %s", str(redirectsCount), str(maxRedirects))
225 
226  # remove referer and other fields from header
227  applyHeaders = self.updateHeaderFields(applyHeaders)
228  logger.debug("!!!>>> applyHeaders: %s", varDump(applyHeaders))
229 
230  applyHeaders = self.updateHeadersByCookies(applyHeaders, localUrl, cookieResolver)
231  url = localUrl
232 
233  if implRes.status_code not in self.redirectCodes or not allowRedirects:
234  logger.debug("!!! break !!!")
235  break
236 
237  if implRes.status_code in self.redirectCodes:
238  raise requests.exceptions.TooManyRedirects('Exceeded %s redirects.' % str(maxRedirects))
239 
240  return implRes
241 
242 
def varDump(obj, stringify=True, strTypeMaxLen=256, strTypeCutSuffix='...', stringifyType=1, ignoreErrors=False, objectsHash=None, depth=0, indent=2, ensure_ascii=False, maxDepth=10)
Definition: Utils.py:410
Here is the call graph for this function:
Here is the caller graph for this function:

◆ updateHeaderFields()

def dc_crawler.RequestsRedirectWrapper.RequestsRedirectWrapper.updateHeaderFields (   headers)
static

Definition at line 295 of file RequestsRedirectWrapper.py.

def updateHeaderFields(headers):
    """Return *headers* with lower-cased keys and the redirect-forbidden fields removed.

    @param headers - request headers mapping
    @return plain dict with lower-cased keys, minus the fields listed in
            CONSTS.REDIRECT_HEADER_FIELDS_FOR_REMOVE
    """
    cid = requests.structures.CaseInsensitiveDict(headers)
    for name in CONSTS.REDIRECT_HEADER_FIELDS_FOR_REMOVE:
        # Fix: the original deleted from 'cid' while iterating cid.lower_items(),
        # which raises RuntimeError ("dictionary changed size during iteration")
        # as soon as a field matches. CaseInsensitiveDict membership is already
        # case-insensitive, so a direct test-and-delete is both safe and O(1).
        if name in cid:
            del cid[name]
    headers = dict(cid.lower_items())

    return headers
Here is the caller graph for this function:

◆ updateHeadersByCookies()

def dc_crawler.RequestsRedirectWrapper.RequestsRedirectWrapper.updateHeadersByCookies (   headers,
  url,
  cookieResolver,
  stage = HTTPCookieResolver.STAGE_DEFAULT 
)
static

Definition at line 391 of file RequestsRedirectWrapper.py.

def updateHeadersByCookies(headers, url, cookieResolver, stage=HTTPCookieResolver.STAGE_DEFAULT):
    """Insert the cookies resolved for *url* into the request headers dict.

    Best-effort: any failure is logged and the (possibly unchanged) headers
    are returned, so a cookie problem never aborts the request.

    @param headers - request headers; mutated in place only when it is a dict
    @param url - url string the cookies are resolved for
    @param cookieResolver - cookie resolver instance
    @param stage - cookie resolver stage selector
    @return the headers object
    """
    try:
        logger.debug('!!! Headers before update by cookies:\n' + str(headers))
        cookies = cookieResolver.getCookie(url, stage)
        if cookies is not None and isinstance(headers, dict):
            headers[RequestsRedirectWrapper.REQUEST_COOKIE_HEADER_NAME] = cookies
            logger.debug('!!! Cookies was updated ...Use headers:\n' + str(headers))
    # 'as' form is valid on Python 2.6+ and 3, unlike the old 'except X, err' syntax
    except Exception as err:
        logger.error("!!! Error: %s", str(err))

    return headers
Here is the caller graph for this function:

Member Data Documentation

◆ dbWrapper

dc_crawler.RequestsRedirectWrapper.RequestsRedirectWrapper.dbWrapper

Definition at line 49 of file RequestsRedirectWrapper.py.

◆ DEFAULT_USAGE_ALGORITHM

int dc_crawler.RequestsRedirectWrapper.RequestsRedirectWrapper.DEFAULT_USAGE_ALGORITHM = USAGE_ALGORITHM_BASE
static

Definition at line 41 of file RequestsRedirectWrapper.py.

◆ ERROR_BAD_STATUS_CODE_VALUE

string dc_crawler.RequestsRedirectWrapper.RequestsRedirectWrapper.ERROR_BAD_STATUS_CODE_VALUE = "Not allowed status code '%s'. Allowed list: %s"
static

Definition at line 44 of file RequestsRedirectWrapper.py.

◆ redirectCodes

dc_crawler.RequestsRedirectWrapper.RequestsRedirectWrapper.redirectCodes

Definition at line 52 of file RequestsRedirectWrapper.py.

◆ REFERER_HEADER_NAME

string dc_crawler.RequestsRedirectWrapper.RequestsRedirectWrapper.REFERER_HEADER_NAME = 'Referer'
static

Definition at line 36 of file RequestsRedirectWrapper.py.

◆ REQUEST_COOKIE_HEADER_NAME

string dc_crawler.RequestsRedirectWrapper.RequestsRedirectWrapper.REQUEST_COOKIE_HEADER_NAME = 'Cookie'
static

Definition at line 34 of file RequestsRedirectWrapper.py.

◆ RESPONSE_COOKIE_HEADER_NAME

string dc_crawler.RequestsRedirectWrapper.RequestsRedirectWrapper.RESPONSE_COOKIE_HEADER_NAME = 'set-cookie'
static

Definition at line 35 of file RequestsRedirectWrapper.py.

◆ siteId

dc_crawler.RequestsRedirectWrapper.RequestsRedirectWrapper.siteId

Definition at line 50 of file RequestsRedirectWrapper.py.

◆ USAGE_ALGORITHM_BASE

int dc_crawler.RequestsRedirectWrapper.RequestsRedirectWrapper.USAGE_ALGORITHM_BASE = 0
static

Definition at line 39 of file RequestsRedirectWrapper.py.

◆ USAGE_ALGORITHM_CUSTOM

int dc_crawler.RequestsRedirectWrapper.RequestsRedirectWrapper.USAGE_ALGORITHM_CUSTOM = 1
static

Definition at line 40 of file RequestsRedirectWrapper.py.

◆ usageAlgorithm

dc_crawler.RequestsRedirectWrapper.RequestsRedirectWrapper.usageAlgorithm

Definition at line 51 of file RequestsRedirectWrapper.py.


The documentation for this class was generated from the following file: