HCE Project Python language Distributed Tasks Manager Application, Distributed Crawler Application and client API bindings.  2.0.0-chaika
Hierarchical Cluster Engine Python language binding
HTTPRedirectResolver.py
Go to the documentation of this file.
1 # coding: utf-8
2 """
3 HCE project, Python bindings, Distributed Tasks Manager application.
4 The HTTPRedirectResolver class contains the main functionality for resolving HTTP redirects.
5 
6 @package: dc_crawler
7 @file HTTPRedirectResolver.py
8 @author Alexander Vybornyh <alexander.hce.cluster@gmail.com>
9 @link: http://hierarchical-cluster-engine.com/
10 @copyright: Copyright &copy; 2013-2017 IOIX Ukraine
11 @license: http://hierarchical-cluster-engine.com/license/
12 @since: 0.1
13 """
14 
15 import re
16 import json
17 # import requests
18 import requests.exceptions
19 # from requests.packages.urllib3.exceptions import InsecureRequestWarning
20 
21 # requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
22 
23 import app.Utils as Utils
24 from app.Utils import varDump
25 from app.Utils import getTracebackInfo
26 from dc_crawler.Fetcher import BaseFetcher
27 import dc_crawler.Constants as CONSTS
28 from dc_crawler.Exceptions import CrawlerFilterException
29 
30 logger = Utils.MPLogger().getLogger()
31 
# # Resolves the final URL of a request by fetching it with a configurable
# method (HEAD, static GET or dynamic GET) and following redirects.
#
# The behaviour is configured with a JSON property string, for example:
#   {"METHOD": "SGET", "URL": [".*"], "MAX": 10, "TYPES": [301, 302]}
# Elements of the "URL" list may be regular expression strings or nested
# dictionaries of the same format; nested dictionaries are converted to
# nested RedirectProperty instances and applied after the main property.
class HTTPRedirectResolver(object):
    # # Constants

    # # Constants error message
    ERROR_INITIALIZATION = "Initialization class '%s' was failed. Error: %s"
    ERROR_BAD_TYPE_PROPERTY_VALUE = "Wrong type (%s) of property value: %s"
    ERROR_BAD_TYPE_HEADERS_VALUE = "Wrong type (%s) of headers: %s"
    ERROR_BAD_PROPERTY_VALUE = "Not support value '%s' for property '%s'"
    ERROR_BAD_STATUS_CODE_VALUE = "Not allowed status code '%s'. Allowed list: %s"


    # # Internal class for redirect property
    class RedirectProperty(object):
        # # Names of the keys accepted in the property dictionary
        PROPERTY_METHOD_NAME = "METHOD"
        PROPERTY_URL_PATTERN_LIST = "URL"
        PROPERTY_MAX_REDIRECTS = "MAX"
        PROPERTY_TYPES_LIST = "TYPES"

        # # Method names accepted as the value of the "METHOD" key
        METHOD_NAME_HEAD = 'HEAD'
        METHOD_NAME_SGET = 'SGET'
        METHOD_NAME_DGET = 'DGET'

        # # HTTP method name passed to the fetcher for both SGET and DGET
        METHOD_NAME_GET = 'GET'

        # # Default values used when a property is not specified
        DEFAULT_VALUE_METHOD_NAME = METHOD_NAME_HEAD
        DEFAULT_VALUE_URL_PATTERN_LIST = ['.*']
        DEFAULT_VALUE_MAX_REDIRECTS = 10
        DEFAULT_VALUE_TYPES_LIST = [301, 302, 303, 304]

        SUPPORT_METHOD_NAMES = [METHOD_NAME_HEAD, METHOD_NAME_SGET, METHOD_NAME_DGET]

        # # Initialization
        #
        # @param methodName - one of SUPPORT_METHOD_NAMES, default is used if None
        # @param urlPatternList - list of url patterns, default is used if None
        # @param maxRedirects - max redirects count, default is used if None
        # @param typesList - list of redirect status codes, default is used if None
        def __init__(self, methodName=None, urlPatternList=None, maxRedirects=None, typesList=None):
            self.methodName = self.DEFAULT_VALUE_METHOD_NAME if methodName is None else methodName
            # The default lists are copied so that changing one instance never
            # alters the class-level defaults shared by all other instances.
            self.urlPatternList = list(self.DEFAULT_VALUE_URL_PATTERN_LIST) \
                if urlPatternList is None else urlPatternList
            self.maxRedirects = self.DEFAULT_VALUE_MAX_REDIRECTS if maxRedirects is None else maxRedirects
            self.typesList = list(self.DEFAULT_VALUE_TYPES_LIST) if typesList is None else typesList


    # # Initialization
    #
    # @param propertyString - contains string with json format
    # @param fetchType - fetcher type, used for the 'HEAD' method
    # @param dbWrapper - DBTaskWrapper instance, passed to the fetcher factory
    # @param siteId - site ID, passed to the fetcher factory
    # @param connectionTimeout - connection timeout assigned to each fetcher
    def __init__(self, propertyString=None, fetchType=BaseFetcher.TYP_NORMAL, dbWrapper=None, siteId=None,
                 connectionTimeout=CONSTS.CONNECTION_TIMEOUT):
        self.redirectProperty = self.__loadProperty(propertyString)
        self.fetchType = fetchType
        self.dbWrapper = dbWrapper
        self.siteId = siteId
        self.connectionTimeout = connectionTimeout


    # # Get redirect property from a dictionary
    #
    # Validation errors are logged, not raised; in that case the returned
    # instance keeps the values accepted before the error and defaults for the rest.
    #
    # @param propertyDict - contains dictionary
    # @return RedirectProperty instance
    def getRedirectProperty(self, propertyDict):
        propertyClass = HTTPRedirectResolver.RedirectProperty
        # variable for result, must exist before the 'try' because it is returned
        # even when the validation fails
        redirectProperty = propertyClass()
        try:
            if not isinstance(propertyDict, dict):
                raise Exception(self.ERROR_BAD_TYPE_PROPERTY_VALUE % (str(type(propertyDict)), varDump(propertyDict)))

            # extract method name
            if propertyClass.PROPERTY_METHOD_NAME in propertyDict:
                methodName = propertyDict[propertyClass.PROPERTY_METHOD_NAME]
                if methodName not in propertyClass.SUPPORT_METHOD_NAMES:
                    raise Exception(self.ERROR_BAD_PROPERTY_VALUE % \
                                    (varDump(methodName), str(propertyClass.PROPERTY_METHOD_NAME)))
                redirectProperty.methodName = methodName

            # extract url pattern list
            if propertyClass.PROPERTY_URL_PATTERN_LIST in propertyDict:
                urlPatterns = propertyDict[propertyClass.PROPERTY_URL_PATTERN_LIST]
                if not isinstance(urlPatterns, list):
                    raise Exception(self.ERROR_BAD_PROPERTY_VALUE % \
                                    (varDump(urlPatterns), str(propertyClass.PROPERTY_URL_PATTERN_LIST)))
                # nested dictionaries become nested RedirectProperty instances,
                # a new list is built so the input dictionary is not modified
                redirectProperty.urlPatternList = [
                    self.getRedirectProperty(elem) if isinstance(elem, dict) else elem
                    for elem in urlPatterns
                ]

            # extract max redirects value
            if propertyClass.PROPERTY_MAX_REDIRECTS in propertyDict:
                redirectProperty.maxRedirects = int(propertyDict[propertyClass.PROPERTY_MAX_REDIRECTS])

            # extract redirect types list
            if propertyClass.PROPERTY_TYPES_LIST in propertyDict:
                typesList = propertyDict[propertyClass.PROPERTY_TYPES_LIST]
                if not isinstance(typesList, list):
                    raise Exception(self.ERROR_BAD_PROPERTY_VALUE % \
                                    (varDump(typesList), str(propertyClass.PROPERTY_TYPES_LIST)))
                redirectProperty.typesList = typesList

        except Exception as err:
            logger.error(self.ERROR_INITIALIZATION, self.__class__.__name__, str(err))
            logger.info(getTracebackInfo())

        return redirectProperty


    # # Load property from input json
    #
    # Any error (wrong type, empty string, invalid json) is logged and the
    # default RedirectProperty is returned.
    #
    # @param propertyString - contains string with json format
    # @return RedirectProperty instance
    def __loadProperty(self, propertyString):
        # variable for result
        redirectProperty = HTTPRedirectResolver.RedirectProperty()

        if propertyString is not None:
            try:
                if not isinstance(propertyString, basestring) or propertyString == "":
                    raise Exception(self.ERROR_BAD_TYPE_PROPERTY_VALUE % \
                                    (str(type(propertyString)), varDump(propertyString)))

                propertyDict = json.loads(propertyString)
                redirectProperty = self.getRedirectProperty(propertyDict)

            except Exception as err:
                logger.error(self.ERROR_INITIALIZATION, self.__class__.__name__, str(err))
                logger.info(getTracebackInfo())

        return redirectProperty


    # # Check whether the url is allowed by the incoming pattern list
    #
    # Only string elements of the list are used as patterns. If the list
    # contains no string elements (or is not a list) the url is allowed.
    # If a pattern causes an error the error is logged and checking stops.
    #
    # @param url - url string
    # @param patterns - regular expression pattern list
    # @return True if allowed or False - otherwise
    @staticmethod
    def isAllowedUrl(url, patterns):
        # variable for result
        ret = True
        try:
            if isinstance(patterns, list):
                for pattern in patterns:
                    if not isinstance(pattern, basestring):
                        continue
                    # at least one string pattern exists, so a match is required now
                    ret = False
                    if re.search(pattern, url, re.UNICODE | re.IGNORECASE) is not None:
                        # the result is stored before logging, and the values are passed
                        # without str(), so a non-ASCII url cannot turn a match into a failure
                        ret = True
                        logger.debug("pattern: '%s' allowed for '%s'", pattern, url)
                        break

        except Exception as err:
            logger.error(str(err))

        return ret


    # # Repair headers dictionary values if necessary
    #
    # Whitespace-separated parts of each string value are joined with ';'.
    # The dictionary is modified in place; non-string values are left untouched.
    #
    # @param headers - headers dictionary
    # @return the same headers object after repair values
    def __repairHeaders(self, headers):
        if isinstance(headers, dict):
            for key, value in list(headers.items()):
                if isinstance(value, basestring):
                    headers[key] = ';'.join(value.split())

        return headers


    # # Resolve redirect url
    #
    # The url is resolved with the main redirect property first and then with
    # each nested RedirectProperty found in its url pattern list; the last
    # non-None result wins. Errors are logged and not raised.
    #
    # @param url - the url to fetch
    # @param headers - request headers dict
    # @param timeout - request timeout(seconds)
    # @param allowRedirects - boolean flag allowed redirects
    # @param proxies - proxy setting tuple
    # @param auth - basic auth setting, tuple of name and password
    # @param postData - post data, used only when method is post
    # @param maxRedirects - max allowed redirects count
    # @param filters - filters dict
    # @return resolved result url or None if it was not resolved
    def resolveRedirectUrl(self, url, headers, timeout=None,
                           allowRedirects=True, proxies=None, auth=None, postData=None,
                           maxRedirects=RedirectProperty.DEFAULT_VALUE_MAX_REDIRECTS, filters=None):
        # variable for result
        ret = None

        logger.debug("Input url: %s \nheaders: %s", str(url), varDump(headers))

        try:
            ret = self.__resolveRedirect(url=url, method=self.redirectProperty.methodName, headers=headers,
                                         timeout=timeout, allowRedirects=allowRedirects, proxies=proxies,
                                         auth=auth, postData=postData, maxRedirects=maxRedirects,
                                         filters=filters, redirectProperty=self.redirectProperty)

            for urlPatternElem in self.redirectProperty.urlPatternList:
                logger.debug("urlPatternElem: %s", varDump(urlPatternElem))
                logger.debug("type(urlPatternElem) = %s", str(type(urlPatternElem)))

                # plain string patterns were already applied by the call above
                if isinstance(urlPatternElem, HTTPRedirectResolver.RedirectProperty):
                    res = self.__resolveRedirect(url=url, method=urlPatternElem.methodName, headers=headers,
                                                 timeout=timeout, allowRedirects=allowRedirects, proxies=proxies,
                                                 auth=auth, postData=postData, maxRedirects=maxRedirects,
                                                 filters=filters, redirectProperty=urlPatternElem)
                    if res is not None:
                        ret = res

        except CrawlerFilterException:
            logger.debug("Url '%s' should be skipped.", str(url))
        except (requests.exceptions.RequestException, Exception) as err:
            logger.debug("Resolve redirect url failed: %s", str(err))
            logger.info(Utils.getTracebackInfo())

        return ret


    # # Resolve redirect
    #
    # @param url - the url to fetch
    # @param headers - request headers dict
    # @param method - not used, the method is taken from redirectProperty
    # @param timeout - request timeout(seconds)
    # @param allowRedirects - boolean flag allowed redirects
    # @param proxies - proxy setting tuple
    # @param auth - basic auth setting, tuple of name and password
    # @param postData - post data, used only when method is post
    # @param maxRedirects - max allowed redirects count
    # @param filters - filters dict
    # @param redirectProperty - RedirectProperty instance
    # @return resolved result url or None if the url was not processed
    def __resolveRedirect(self, url, headers, method, timeout=None,
                          allowRedirects=True, proxies=None, auth=None, postData=None,
                          maxRedirects=RedirectProperty.DEFAULT_VALUE_MAX_REDIRECTS, filters=None,
                          redirectProperty=None):
        # variable for result
        ret = None
        propertyClass = HTTPRedirectResolver.RedirectProperty

        logger.debug("type(redirectProperty) = %s", str(type(redirectProperty)))
        if isinstance(redirectProperty, propertyClass):
            logger.debug("type is GOOD!!!")

            # check is allowed url for processing by pattern list
            if HTTPRedirectResolver.isAllowedUrl(url, redirectProperty.urlPatternList):
                # The method name of the property passed in is checked in every branch,
                # so nested properties are handled by their own method name.
                if redirectProperty.methodName == propertyClass.METHOD_NAME_HEAD:
                    # method 'HEAD' execution
                    ret = self.__fetch(url=url, method=propertyClass.METHOD_NAME_HEAD,
                                       headers=headers, timeout=timeout, allowRedirects=allowRedirects,
                                       proxies=proxies, auth=auth, postData=postData,
                                       maxRedirects=maxRedirects, filters=filters,
                                       fetchType=self.fetchType)

                elif redirectProperty.methodName == propertyClass.METHOD_NAME_SGET:
                    # method 'GET' for static fetcher type execution
                    ret = self.__fetch(url=url, method=propertyClass.METHOD_NAME_GET,
                                       headers=headers, timeout=timeout, allowRedirects=allowRedirects,
                                       proxies=proxies, auth=auth, postData=postData,
                                       maxRedirects=maxRedirects, filters=filters,
                                       fetchType=BaseFetcher.TYP_NORMAL)

                elif redirectProperty.methodName == propertyClass.METHOD_NAME_DGET:
                    # method 'GET' for dynamic fetcher type execution
                    ret = self.__fetch(url=url, method=propertyClass.METHOD_NAME_GET,
                                       headers=headers, timeout=timeout, allowRedirects=allowRedirects,
                                       proxies=proxies, auth=auth, postData=postData,
                                       maxRedirects=maxRedirects, filters=filters,
                                       fetchType=BaseFetcher.TYP_DYNAMIC)

        return ret


    # # Make fetch
    #
    # @param url - the url to fetch
    # @param headers - request headers dict
    # @param method - fetch by HTTP method
    # @param timeout - request timeout(seconds)
    # @param allowRedirects - boolean flag allowed redirects
    # @param proxies - proxy setting tuple
    # @param auth - basic auth setting, tuple of name and password
    # @param postData - post data, used only when method is post
    # @param maxRedirects - max allowed redirects count
    # @param filters - filters dict
    # @param fetchType - fetch type
    # @return the 'url' attribute of the fetch result or None if it is absent
    def __fetch(self, url, headers, method, timeout=None,
                allowRedirects=True, proxies=None, auth=None, postData=None,
                maxRedirects=RedirectProperty.DEFAULT_VALUE_MAX_REDIRECTS, filters=None,
                fetchType=BaseFetcher.TYP_NORMAL):
        # variable for result
        ret = None
        fetcher = BaseFetcher.get_fetcher(fetchType, self.dbWrapper, self.siteId)
        fetcher.connectionTimeout = self.connectionTimeout

        res = fetcher.open(url=url, method=method, headers=headers, timeout=timeout,
                           allow_redirects=allowRedirects, proxies=proxies, auth=auth, data=postData,
                           log=logger, max_redirects=maxRedirects, filters=filters)

        # guard against a fetcher that returns no result object
        if res is not None and res.url is not None:
            ret = res.url

        return ret
def __resolveRedirect(self, url, headers, method, timeout=None, allowRedirects=True, proxies=None, auth=None, postData=None, maxRedirects=RedirectProperty.DEFAULT_VALUE_MAX_REDIRECTS, filters=None, redirectProperty=None)
def __fetch(self, url, headers, method, timeout=None, allowRedirects=True, proxies=None, auth=None, postData=None, maxRedirects=RedirectProperty.DEFAULT_VALUE_MAX_REDIRECTS, filters=None, fetchType=BaseFetcher.TYP_NORMAL)
def resolveRedirectUrl(self, url, headers, timeout=None, allowRedirects=True, proxies=None, auth=None, postData=None, maxRedirects=RedirectProperty.DEFAULT_VALUE_MAX_REDIRECTS, filters=None)
def varDump(obj, stringify=True, strTypeMaxLen=256, strTypeCutSuffix='...', stringifyType=1, ignoreErrors=False, objectsHash=None, depth=0, indent=2, ensure_ascii=False, maxDepth=10)
Definition: Utils.py:410
def __init__(self, methodName=None, urlPatternList=None, maxRedirects=None, typesList=None)
Definition: join.py:1
def getTracebackInfo(linesNumberMax=None)
Definition: Utils.py:218