HCE Project: Python-language Distributed Tasks Manager Application, Distributed Crawler Application, and client API bindings. Version 2.0.0-chaika
Hierarchical Cluster Engine Python language binding
dc_crawler.HTTPRedirectResolver.HTTPRedirectResolver Class Reference
Inheritance diagram for dc_crawler.HTTPRedirectResolver.HTTPRedirectResolver:
Collaboration diagram for dc_crawler.HTTPRedirectResolver.HTTPRedirectResolver:

Classes

class  RedirectProperty
 

Public Member Functions

def __init__ (self, propertyString=None, fetchType=BaseFetcher.TYP_NORMAL, dbWrapper=None, siteId=None, connectionTimeout=CONSTS.CONNECTION_TIMEOUT)
 
def getRedirectProperty (self, propertyDict)
 
def resolveRedirectUrl (self, url, headers, timeout=None, allowRedirects=True, proxies=None, auth=None, postData=None, maxRedirects=RedirectProperty.DEFAULT_VALUE_MAX_REDIRECTS, filters=None)
 

Static Public Member Functions

def isAllowedUrl (url, patterns)
 

Public Attributes

 fetchType
 
 dbWrapper
 
 siteId
 
 connectionTimeout
 

Static Public Attributes

string ERROR_INITIALIZATION = "Initialization class '%s' was failed. Error: %s"
 
string ERROR_BAD_TYPE_PROPERTY_VALUE = "Wrong type (%s) of property value: %s"
 
string ERROR_BAD_TYPE_HEADERS_VALUE = "Wrong type (%s) of headers: %s"
 
string ERROR_BAD_PROPERTY_VALUE = "Not support value '%s' for property '%s'"
 
string ERROR_BAD_STATUS_CODE_VALUE = "Not allowed status code '%s'. Allowed list: %s"
 
 redirectProperty = HTTPRedirectResolver.RedirectProperty()
 
 methodName
 
 urlPatternList
 
 maxRedirects
 
 typesList
 

Private Member Functions

def __loadProperty (self, propertyString)
 
def __repairHeaders (self, headers)
 
def __resolveRedirect (self, url, headers, method, timeout=None, allowRedirects=True, proxies=None, auth=None, postData=None, maxRedirects=RedirectProperty.DEFAULT_VALUE_MAX_REDIRECTS, filters=None, redirectProperty=None)
 
def __fetch (self, url, headers, method, timeout=None, allowRedirects=True, proxies=None, auth=None, postData=None, maxRedirects=RedirectProperty.DEFAULT_VALUE_MAX_REDIRECTS, filters=None, fetchType=BaseFetcher.TYP_NORMAL)
 

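Detailed Description

Resolves the URL that a request ends up at: if the input URL matches the configured URL pattern list, it is fetched with the method selected by the redirect property (HEAD, or GET through the static or the dynamic fetcher), and the URL of the fetcher's response is returned; otherwise the result is None.

Definition at line 32 of file HTTPRedirectResolver.py.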

Constructor & Destructor Documentation

◆ __init__()

def dc_crawler.HTTPRedirectResolver.HTTPRedirectResolver.__init__ (   self,
  propertyString = None,
  fetchType = BaseFetcher.TYP_NORMAL,
  dbWrapper = None,
  siteId = None,
  connectionTimeout = CONSTS.CONNECTION_TIMEOUT 
)

Definition at line 78 of file HTTPRedirectResolver.py.

78  connectionTimeout=CONSTS.CONNECTION_TIMEOUT):
79  self.redirectProperty = self.__loadProperty(propertyString)
80  self.fetchType = fetchType
81  self.dbWrapper = dbWrapper
82  self.siteId = siteId
83  self.connectionTimeout = connectionTimeout
84 
85 
Here is the call graph for this function:

Member Function Documentation

◆ __fetch()

def dc_crawler.HTTPRedirectResolver.HTTPRedirectResolver.__fetch (   self,
  url,
  headers,
  method,
  timeout = None,
  allowRedirects = True,
  proxies = None,
  auth = None,
  postData = None,
  maxRedirects = RedirectProperty.DEFAULT_VALUE_MAX_REDIRECTS,
  filters = None,
  fetchType = BaseFetcher.TYP_NORMAL 
)
private

Definition at line 325 of file HTTPRedirectResolver.py.

325  fetchType=BaseFetcher.TYP_NORMAL):
326  # variable for result
327  ret = None
328  fetcher = BaseFetcher.get_fetcher(fetchType, self.dbWrapper, self.siteId)
329  fetcher.connectionTimeout = self.connectionTimeout
330 
331  res = fetcher.open(url=url, method=method, headers=headers, timeout=timeout,
332  allow_redirects=allowRedirects, proxies=proxies, auth=auth, data=postData, log=logger,
333  max_redirects=maxRedirects, filters=filters)
334 
335  if res.url is not None:
336  ret = res.url
337 
338  return ret
339 
Here is the caller graph for this function:

◆ __loadProperty()

def dc_crawler.HTTPRedirectResolver.HTTPRedirectResolver.__loadProperty (   self,
  propertyString 
)
private

Definition at line 147 of file HTTPRedirectResolver.py.

147  def __loadProperty(self, propertyString):
148  # variable for result
149  redirectProperty = HTTPRedirectResolver.RedirectProperty()
150 
151  if propertyString is not None:
152  try:
153  if not isinstance(propertyString, basestring) or propertyString == "":
154  raise Exception(self.ERROR_BAD_TYPE_PROPERTY_VALUE % (str(type(propertyString)), varDump(propertyString)))
155 
156  propertyDict = json.loads(propertyString)
157  redirectProperty = self.getRedirectProperty(propertyDict)
158 
159  except Exception, err:
160  logger.error(self.ERROR_INITIALIZATION, self.__class__.__name__, str(err))
161  logger.info(getTracebackInfo())
162 
163  return redirectProperty
164 
165 
Helpers referenced in the listing above:
def varDump(obj, stringify=True, strTypeMaxLen=256, strTypeCutSuffix='...', stringifyType=1, ignoreErrors=False, objectsHash=None, depth=0, indent=2, ensure_ascii=False, maxDepth=10) — Definition: Utils.py:410
def getTracebackInfo(linesNumberMax=None) — Definition: Utils.py:218
Here is the call graph for this function:
Here is the caller graph for this function:

◆ __repairHeaders()

def dc_crawler.HTTPRedirectResolver.HTTPRedirectResolver.__repairHeaders (   self,
  headers 
)
private

Definition at line 195 of file HTTPRedirectResolver.py.

195  def __repairHeaders(self, headers):
196  if isinstance(headers, dict):
197  for key, value in headers.items():
198  headers[key] = ';'.join(value.split())
199 
200 
Definition: join.py:1
Here is the call graph for this function:

◆ __resolveRedirect()

def dc_crawler.HTTPRedirectResolver.HTTPRedirectResolver.__resolveRedirect (   self,
  url,
  headers,
  method,
  timeout = None,
  allowRedirects = True,
  proxies = None,
  auth = None,
  postData = None,
  maxRedirects = RedirectProperty.DEFAULT_VALUE_MAX_REDIRECTS,
  filters = None,
  redirectProperty = None 
)
private

Definition at line 274 of file HTTPRedirectResolver.py.

274  redirectProperty=None):
275  # variable for result
276  ret = None
277  logger.debug("type(redirectProperty) = %s", str(type(redirectProperty)))
278  if isinstance(redirectProperty, HTTPRedirectResolver.RedirectProperty):
279  logger.debug("type is GOOD!!!")
280 
281  # check is allowed url for processing by pattern list
282  if HTTPRedirectResolver.isAllowedUrl(url, redirectProperty.urlPatternList):
283 
284  if redirectProperty.methodName == HTTPRedirectResolver.RedirectProperty.METHOD_NAME_HEAD:
285  # method 'HEAD' execution
286  ret = self.__fetch(url=url, method=HTTPRedirectResolver.RedirectProperty.METHOD_NAME_HEAD,
287  headers=headers, timeout=timeout, allowRedirects=allowRedirects, proxies=proxies,
288  auth=auth, postData=postData, maxRedirects=maxRedirects, filters=filters,
289  fetchType=self.fetchType)
290 
291  elif redirectProperty.methodName == HTTPRedirectResolver.RedirectProperty.METHOD_NAME_SGET:
292  # method 'GET' for static fetcher type execution
293  ret = self.__fetch(url=url, method=HTTPRedirectResolver.RedirectProperty.METHOD_NAME_GET,
294  headers=headers, timeout=timeout, allowRedirects=allowRedirects, proxies=proxies,
295  auth=auth, postData=postData, maxRedirects=maxRedirects, filters=filters,
296  fetchType=BaseFetcher.TYP_NORMAL)
297 
298  elif self.redirectProperty.methodName == HTTPRedirectResolver.RedirectProperty.METHOD_NAME_DGET:
299  # method 'GET' for dynamic fetcher type execution
300  ret = self.__fetch(url=url, method=HTTPRedirectResolver.RedirectProperty.METHOD_NAME_GET,
301  headers=headers, timeout=timeout, allowRedirects=allowRedirects, proxies=proxies,
302  auth=auth, postData=postData, maxRedirects=maxRedirects, filters=filters,
303  fetchType=BaseFetcher.TYP_DYNAMIC)
304 
305  return ret
306 
307 
Here is the call graph for this function:
Here is the caller graph for this function:

◆ getRedirectProperty()

def dc_crawler.HTTPRedirectResolver.HTTPRedirectResolver.getRedirectProperty (   self,
  propertyDict 
)

Definition at line 90 of file HTTPRedirectResolver.py.

90  def getRedirectProperty(self, propertyDict):
Here is the caller graph for this function:

◆ isAllowedUrl()

def dc_crawler.HTTPRedirectResolver.HTTPRedirectResolver.isAllowedUrl (   url,
  patterns 
)
static

Definition at line 172 of file HTTPRedirectResolver.py.

172  def isAllowedUrl(url, patterns):
173  # variable for result
174  ret = True
175  try:
176  if isinstance(patterns, list):
177  for pattern in patterns:
178  if isinstance(pattern, basestring):
179  ret = False
180  if re.search(pattern, url, re.UNICODE + re.IGNORECASE) is not None:
181  logger.debug("pattern: '%s' allowed for '%s'", str(pattern), str(url))
182  ret = True
183  break
184 
185  except Exception, err:
186  logger.error(str(err))
187 
188  return ret
189 
190 

◆ resolveRedirectUrl()

def dc_crawler.HTTPRedirectResolver.HTTPRedirectResolver.resolveRedirectUrl (   self,
  url,
  headers,
  timeout = None,
  allowRedirects = True,
  proxies = None,
  auth = None,
  postData = None,
  maxRedirects = RedirectProperty.DEFAULT_VALUE_MAX_REDIRECTS,
  filters = None 
)

Definition at line 216 of file HTTPRedirectResolver.py.

216  maxRedirects=RedirectProperty.DEFAULT_VALUE_MAX_REDIRECTS, filters=None):
217 
218  # variable for result
219  ret = None
220 
221  logger.debug("Input url: %s \nheaders: %s", str(url), varDump(headers))
222 # logger.debug("method name: %s, max redirects = %s, redirect codes: %s",
223 # str(self.redirectProperty.methodName), str(self.redirectProperty.maxRedirects),
224 # str(self.redirectProperty.typesList))
225 
226 # self.__repairHeaders(headers) # remove in future because it's wrong logic
227 # logger.debug("headers: %s", varDump(headers))
228 
229  try:
230 
231  ret = self.__resolveRedirect(url=url, method=self.redirectProperty.methodName, headers=headers, timeout=timeout,
232  allowRedirects=allowRedirects, proxies=proxies, auth=auth, postData=postData,
233  maxRedirects=maxRedirects, filters=filters,
234  redirectProperty=self.redirectProperty)
235 
236  for urlPatternElem in self.redirectProperty.urlPatternList:
237  logger.debug("urlPatternElem: %s", varDump(urlPatternElem))
238  logger.debug("type(urlPatternElem) = %s", str(type(urlPatternElem)))
239 
240  if isinstance(urlPatternElem, HTTPRedirectResolver.RedirectProperty):
241  res = self.__resolveRedirect(url=url, method=self.redirectProperty.methodName, headers=headers,
242  timeout=timeout, allowRedirects=allowRedirects, proxies=proxies, auth=auth,
243  postData=postData, maxRedirects=maxRedirects, filters=filters,
244  redirectProperty=urlPatternElem)
245  if res is not None:
246  ret = res
247 
248  except CrawlerFilterException:
249  logger.debug("Url '%s' should be skipped.", str(url))
250  except (requests.exceptions.RequestException, Exception), err:
251  logger.debug("Resolve redirect url failed: %s", str(err))
252  logger.info(Utils.getTracebackInfo())
253 
254  return ret
255 
256 
Helper referenced in the listing above:
def varDump(obj, stringify=True, strTypeMaxLen=256, strTypeCutSuffix='...', stringifyType=1, ignoreErrors=False, objectsHash=None, depth=0, indent=2, ensure_ascii=False, maxDepth=10) — Definition: Utils.py:410
Here is the call graph for this function:
Here is the caller graph for this function:

Member Data Documentation

◆ connectionTimeout

dc_crawler.HTTPRedirectResolver.HTTPRedirectResolver.connectionTimeout

Definition at line 83 of file HTTPRedirectResolver.py.

◆ dbWrapper

dc_crawler.HTTPRedirectResolver.HTTPRedirectResolver.dbWrapper

Definition at line 81 of file HTTPRedirectResolver.py.

◆ ERROR_BAD_PROPERTY_VALUE

string dc_crawler.HTTPRedirectResolver.HTTPRedirectResolver.ERROR_BAD_PROPERTY_VALUE = "Not support value '%s' for property '%s'"
static

Definition at line 39 of file HTTPRedirectResolver.py.

◆ ERROR_BAD_STATUS_CODE_VALUE

string dc_crawler.HTTPRedirectResolver.HTTPRedirectResolver.ERROR_BAD_STATUS_CODE_VALUE = "Not allowed status code '%s'. Allowed list: %s"
static

Definition at line 40 of file HTTPRedirectResolver.py.

◆ ERROR_BAD_TYPE_HEADERS_VALUE

string dc_crawler.HTTPRedirectResolver.HTTPRedirectResolver.ERROR_BAD_TYPE_HEADERS_VALUE = "Wrong type (%s) of headers: %s"
static

Definition at line 38 of file HTTPRedirectResolver.py.

◆ ERROR_BAD_TYPE_PROPERTY_VALUE

string dc_crawler.HTTPRedirectResolver.HTTPRedirectResolver.ERROR_BAD_TYPE_PROPERTY_VALUE = "Wrong type (%s) of property value: %s"
static

Definition at line 37 of file HTTPRedirectResolver.py.

◆ ERROR_INITIALIZATION

string dc_crawler.HTTPRedirectResolver.HTTPRedirectResolver.ERROR_INITIALIZATION = "Initialization class '%s' was failed. Error: %s"
static

Definition at line 36 of file HTTPRedirectResolver.py.

◆ fetchType

dc_crawler.HTTPRedirectResolver.HTTPRedirectResolver.fetchType

Definition at line 80 of file HTTPRedirectResolver.py.

◆ maxRedirects

dc_crawler.HTTPRedirectResolver.HTTPRedirectResolver.maxRedirects
static

Definition at line 124 of file HTTPRedirectResolver.py.

◆ methodName

dc_crawler.HTTPRedirectResolver.HTTPRedirectResolver.methodName
static

Definition at line 101 of file HTTPRedirectResolver.py.

◆ redirectProperty

dc_crawler.HTTPRedirectResolver.HTTPRedirectResolver.redirectProperty = HTTPRedirectResolver.RedirectProperty()
static

Definition at line 92 of file HTTPRedirectResolver.py.

◆ siteId

dc_crawler.HTTPRedirectResolver.HTTPRedirectResolver.siteId

Definition at line 82 of file HTTPRedirectResolver.py.

◆ typesList

dc_crawler.HTTPRedirectResolver.HTTPRedirectResolver.typesList
static

Definition at line 130 of file HTTPRedirectResolver.py.

◆ urlPatternList

dc_crawler.HTTPRedirectResolver.HTTPRedirectResolver.urlPatternList
static

Definition at line 110 of file HTTPRedirectResolver.py.


The documentation for this class was generated from the following file: