3 HCE project, Python bindings, Distributed Tasks Manager application. 4 The HTTPRedirectResolver class contains the main functionality for resolving redirects. 7 @file HTTPRedirectResolver.py 8 @author Alexander Vybornyh <alexander.hce.cluster@gmail.com> 9 @link: http://hierarchical-cluster-engine.com/ 10 @copyright: Copyright © 2013-2017 IOIX Ukraine 11 @license: http://hierarchical-cluster-engine.com/license/ 18 import requests.exceptions
36 ERROR_INITIALIZATION =
"Initialization class '%s' was failed. Error: %s" 37 ERROR_BAD_TYPE_PROPERTY_VALUE =
"Wrong type (%s) of property value: %s" 38 ERROR_BAD_TYPE_HEADERS_VALUE =
"Wrong type (%s) of headers: %s" 39 ERROR_BAD_PROPERTY_VALUE =
"Not support value '%s' for property '%s'" 40 ERROR_BAD_STATUS_CODE_VALUE =
"Not allowed status code '%s'. Allowed list: %s" 46 PROPERTY_METHOD_NAME =
"METHOD" 47 PROPERTY_URL_PATTERN_LIST =
"URL" 48 PROPERTY_MAX_REDIRECTS =
"MAX" 49 PROPERTY_TYPES_LIST =
"TYPES" 51 METHOD_NAME_HEAD =
'HEAD' 52 METHOD_NAME_SGET =
'SGET' 53 METHOD_NAME_DGET =
'DGET' 55 METHOD_NAME_GET =
'GET' 57 DEFAULT_VALUE_METHOD_NAME = METHOD_NAME_HEAD
58 DEFAULT_VALUE_URL_PATTERN_LIST = [
'.*']
59 DEFAULT_VALUE_MAX_REDIRECTS = 10
60 DEFAULT_VALUE_TYPES_LIST = [301, 302, 303, 304]
62 SUPPORT_METHOD_NAMES = [METHOD_NAME_HEAD, METHOD_NAME_SGET, METHOD_NAME_DGET]
64 def __init__(self, methodName=None, urlPatternList=None, maxRedirects=None, typesList=None):
77 def __init__(self, propertyString=None, fetchType=BaseFetcher.TYP_NORMAL, dbWrapper=None, siteId=None,
78 connectionTimeout=CONSTS.CONNECTION_TIMEOUT):
94 if not isinstance(propertyDict, dict):
98 if HTTPRedirectResolver.RedirectProperty.PROPERTY_METHOD_NAME
in propertyDict:
99 if propertyDict[HTTPRedirectResolver.RedirectProperty.PROPERTY_METHOD_NAME]
in \
100 HTTPRedirectResolver.RedirectProperty.SUPPORT_METHOD_NAMES:
101 redirectProperty.methodName = propertyDict[HTTPRedirectResolver.RedirectProperty.PROPERTY_METHOD_NAME]
104 (
varDump(propertyDict[HTTPRedirectResolver.RedirectProperty.PROPERTY_METHOD_NAME]),
105 str(HTTPRedirectResolver.RedirectProperty.PROPERTY_METHOD_NAME)))
108 if HTTPRedirectResolver.RedirectProperty.PROPERTY_URL_PATTERN_LIST
in propertyDict:
109 if isinstance(propertyDict[HTTPRedirectResolver.RedirectProperty.PROPERTY_URL_PATTERN_LIST], list):
110 redirectProperty.urlPatternList = \
111 propertyDict[HTTPRedirectResolver.RedirectProperty.PROPERTY_URL_PATTERN_LIST]
113 for i
in xrange(len(redirectProperty.urlPatternList)):
114 if isinstance(redirectProperty.urlPatternList[i], dict):
115 redirectProperty.urlPatternList[i] = self.
getRedirectProperty(redirectProperty.urlPatternList[i])
119 (
varDump(propertyDict[HTTPRedirectResolver.RedirectProperty.PROPERTY_URL_PATTERN_LIST]),
120 str(HTTPRedirectResolver.RedirectProperty.PROPERTY_URL_PATTERN_LIST)))
123 if HTTPRedirectResolver.RedirectProperty.PROPERTY_MAX_REDIRECTS
in propertyDict:
124 redirectProperty.maxRedirects = \
125 int(propertyDict[HTTPRedirectResolver.RedirectProperty.PROPERTY_MAX_REDIRECTS])
128 if HTTPRedirectResolver.RedirectProperty.PROPERTY_TYPES_LIST
in propertyDict:
129 if isinstance(propertyDict[HTTPRedirectResolver.RedirectProperty.PROPERTY_TYPES_LIST], list):
130 redirectProperty.typesList = propertyDict[HTTPRedirectResolver.RedirectProperty.PROPERTY_TYPES_LIST]
133 (
varDump(propertyDict[HTTPRedirectResolver.RedirectProperty.PROPERTY_TYPES_LIST]),
134 str(HTTPRedirectResolver.RedirectProperty.PROPERTY_TYPES_LIST)))
136 except Exception, err:
140 return redirectProperty
151 if propertyString
is not None:
153 if not isinstance(propertyString, basestring)
or propertyString ==
"":
156 propertyDict = json.loads(propertyString)
159 except Exception, err:
163 return redirectProperty
176 if isinstance(patterns, list):
177 for pattern
in patterns:
178 if isinstance(pattern, basestring):
180 if re.search(pattern, url, re.UNICODE + re.IGNORECASE)
is not None:
181 logger.debug(
"pattern: '%s' allowed for '%s'", str(pattern), str(url))
185 except Exception, err:
186 logger.error(str(err))
196 if isinstance(headers, dict):
197 for key, value
in headers.items():
198 headers[key] =
';'.
join(value.split())
215 allowRedirects=True, proxies=None, auth=None, postData=None,
216 maxRedirects=RedirectProperty.DEFAULT_VALUE_MAX_REDIRECTS, filters=None):
221 logger.debug(
"Input url: %s \nheaders: %s", str(url),
varDump(headers))
232 allowRedirects=allowRedirects, proxies=proxies, auth=auth, postData=postData,
233 maxRedirects=maxRedirects, filters=filters,
237 logger.debug(
"urlPatternElem: %s",
varDump(urlPatternElem))
238 logger.debug(
"type(urlPatternElem) = %s", str(
type(urlPatternElem)))
242 timeout=timeout, allowRedirects=allowRedirects, proxies=proxies, auth=auth,
243 postData=postData, maxRedirects=maxRedirects, filters=filters,
244 redirectProperty=urlPatternElem)
248 except CrawlerFilterException:
249 logger.debug(
"Url '%s' should be skipped.", str(url))
250 except (requests.exceptions.RequestException, Exception), err:
251 logger.debug(
"Resolve redirect url failed: %s", str(err))
252 logger.info(Utils.getTracebackInfo())
272 allowRedirects=True, proxies=None, auth=None, postData=None,
273 maxRedirects=RedirectProperty.DEFAULT_VALUE_MAX_REDIRECTS, filters=None,
274 redirectProperty=None):
277 logger.debug(
"type(redirectProperty) = %s", str(
type(redirectProperty)))
279 logger.debug(
"type is GOOD!!!")
282 if HTTPRedirectResolver.isAllowedUrl(url, redirectProperty.urlPatternList):
284 if redirectProperty.methodName == HTTPRedirectResolver.RedirectProperty.METHOD_NAME_HEAD:
286 ret = self.
__fetch(url=url, method=HTTPRedirectResolver.RedirectProperty.METHOD_NAME_HEAD,
287 headers=headers, timeout=timeout, allowRedirects=allowRedirects, proxies=proxies,
288 auth=auth, postData=postData, maxRedirects=maxRedirects, filters=filters,
291 elif redirectProperty.methodName == HTTPRedirectResolver.RedirectProperty.METHOD_NAME_SGET:
293 ret = self.
__fetch(url=url, method=HTTPRedirectResolver.RedirectProperty.METHOD_NAME_GET,
294 headers=headers, timeout=timeout, allowRedirects=allowRedirects, proxies=proxies,
295 auth=auth, postData=postData, maxRedirects=maxRedirects, filters=filters,
296 fetchType=BaseFetcher.TYP_NORMAL)
298 elif self.
redirectProperty.methodName == HTTPRedirectResolver.RedirectProperty.METHOD_NAME_DGET:
300 ret = self.
__fetch(url=url, method=HTTPRedirectResolver.RedirectProperty.METHOD_NAME_GET,
301 headers=headers, timeout=timeout, allowRedirects=allowRedirects, proxies=proxies,
302 auth=auth, postData=postData, maxRedirects=maxRedirects, filters=filters,
303 fetchType=BaseFetcher.TYP_DYNAMIC)
322 def __fetch(self, url, headers, method, timeout=None,
323 allowRedirects=True, proxies=None, auth=None, postData=None,
324 maxRedirects=RedirectProperty.DEFAULT_VALUE_MAX_REDIRECTS, filters=None,
325 fetchType=BaseFetcher.TYP_NORMAL):
328 fetcher = BaseFetcher.get_fetcher(fetchType, self.
dbWrapper, self.
siteId)
331 res = fetcher.open(url=url, method=method, headers=headers, timeout=timeout,
332 allow_redirects=allowRedirects, proxies=proxies, auth=auth, data=postData, log=logger,
333 max_redirects=maxRedirects, filters=filters)
335 if res.url
is not None:
def getRedirectProperty(self, propertyDict)
string ERROR_INITIALIZATION
int DEFAULT_VALUE_MAX_REDIRECTS
def __resolveRedirect(self, url, headers, method, timeout=None, allowRedirects=True, proxies=None, auth=None, postData=None, maxRedirects=RedirectProperty.DEFAULT_VALUE_MAX_REDIRECTS, filters=None, redirectProperty=None)
def __loadProperty(self, propertyString)
list DEFAULT_VALUE_URL_PATTERN_LIST
def __fetch(self, url, headers, method, timeout=None, allowRedirects=True, proxies=None, auth=None, postData=None, maxRedirects=RedirectProperty.DEFAULT_VALUE_MAX_REDIRECTS, filters=None, fetchType=BaseFetcher.TYP_NORMAL)
list DEFAULT_VALUE_TYPES_LIST
def resolveRedirectUrl(self, url, headers, timeout=None, allowRedirects=True, proxies=None, auth=None, postData=None, maxRedirects=RedirectProperty.DEFAULT_VALUE_MAX_REDIRECTS, filters=None)
string ERROR_BAD_PROPERTY_VALUE
string ERROR_BAD_TYPE_PROPERTY_VALUE
def varDump(obj, stringify=True, strTypeMaxLen=256, strTypeCutSuffix='...', stringifyType=1, ignoreErrors=False, objectsHash=None, depth=0, indent=2, ensure_ascii=False, maxDepth=10)
string DEFAULT_VALUE_METHOD_NAME
def __init__(self, methodName=None, urlPatternList=None, maxRedirects=None, typesList=None)
def isAllowedUrl(url, patterns)
def __repairHeaders(self, headers)
def getTracebackInfo(linesNumberMax=None)