HCE Project Python language Distributed Tasks Manager Application, Distributed Crawler Application and client API bindings.  2.0.0-chaika
Hierarchical Cluster Engine Python language binding
HTTPCookieResolver.py
Go to the documentation of this file.
1 """
2 HCE project, Python bindings, Distributed Tasks Manager application.
3 HTTPCookieResolver class contains the main functionality for collecting and resolving cookies
4 
5 @package: dc_crawler
6 @file HTTPCookieResolver.py
7 @author Alexander Vybornyh <alexander.hce.cluster@gmail.com>
8 @link: http://hierarchical-cluster-engine.com/
9 @copyright: Copyright &copy; 2013-2016 IOIX Ukraine
10 @license: http://hierarchical-cluster-engine.com/license/
11 @since: 0.1
12 """
13 
14 import re
15 import json
16 import datetime
17 
18 import app.Utils as Utils
19 from app.Utils import parseHost
20 from app.DateTimeType import DateTimeType
21 from app.Utils import varDump
22 
# Module-level logger used by HTTPCookieResolver below; obtained via Utils.MPLogger().getLogger().
23 logger = Utils.MPLogger().getLogger()
24 
# # Collects raw cookie strings per URL and resolves which cookies may be sent for a given URL
#
# Cookies are stored as raw strings keyed by URL (see addCookie). On lookup (see getCookie) an exact
# URL match is returned as is; otherwise every stored string is parsed and filtered by the per-domain
# properties (allowed stage bitmask, optional fixed cookie) and by the cookie 'domain', 'path' and
# 'expires' attributes.
class HTTPCookieResolver(object):
    # Constants names of usage stages (bit flags, combined with '|')
    STAGE_REGULAR = 1
    STAGE_REDIRECT = 2
    STAGE_ROBOTS = 4
    STAGE_RSS = 8
    # Default value of stage: every stage is allowed
    STAGE_DEFAULT = STAGE_REGULAR | STAGE_REDIRECT | STAGE_ROBOTS | STAGE_RSS
    # Key of the fallback entry in the properties dictionary
    DOMAIN_DEFAULT = '.*'

    # Constants error message
    ERROR_BAD_TYPE_INPUT_PROPERTY = "Bad type (%s) of input property: %s"
    ERROR_BAD_TYPE_FOUND_PROPERTY = "Bad type (%s) of found property: %s"
    ERROR_INPUT_PROPERTY = "Input wrong properties: %s"
    ERROR_INITIALIZATION = "Initialization class '%s' was failed. Error: '%s'"

    # # Internal class for cookie properties
    class Cookie(object):
        # Lower-case element names that are cookie attributes rather than a cookie name
        supportNames = ['expires', 'domain', 'path']

        def __init__(self):
            self.expires = None
            self.domain = None
            self.path = None
            self.name = None
            self.value = None

        # # get data
        #
        # @param - None
        # @return cookie data as string 'name=value; ', or empty string if name or value
        #         is not a non-empty string
        def getData(self):
            # variable for result
            ret = ''
            if isinstance(self.name, basestring) and isinstance(self.value, basestring) and \
               self.name != "" and self.value != "":
                ret = str(self.name) + '=' + str(self.value) + '; '

            return ret


    # # Internal class for domain properties
    class DomainProperty(object):
        # @param stage - bitmask of stages allowed for the domain
        # @param cookie - fixed cookie value to use for the domain
        def __init__(self, stage=None, cookie=None):
            self.stage = stage
            self.cookie = cookie


    # Constructor
    # @param propertyString - contains string with json format; expected to decode to a dict
    #                         {domain: {'stage': <bitmask>, 'cookie': <value>}}
    def __init__(self, propertyString=None):
        self.cookiesDict = {}
        self.property = {self.DOMAIN_DEFAULT: {'stage': self.STAGE_DEFAULT}}

        if propertyString is not None and isinstance(propertyString, basestring):
            loaded = self.__loadProperty(propertyString)
            # Keep the defaults when the json could not be decoded instead of replacing them with
            # None, which would make every later lookup log a 'bad type' error.
            if loaded is not None:
                self.property = loaded
            logger.debug("!!! self.property: %s", varDump(self.property))


    # # load property from input json
    #
    # @param propertyString - contains string with json format
    # @return object properties, or None if decoding failed (the error is logged)
    def __loadProperty(self, propertyString):
        # variable for result
        ret = None
        try:
            ret = json.loads(propertyString)
        except Exception as err:
            logger.error(self.ERROR_INITIALIZATION, self.__class__.__name__, str(err))

        return ret


    # # Add cookie to dict as raw string
    #
    # @param url - url string use as key in dict
    # @param cookie - cookie string value or dict; an empty string is ignored
    # @param strip - use strip for url (drop the query part before using it as key)
    # @return - None
    def addCookie(self, url, cookie, strip=True):
        localUrl = self.__stripUrl(url) if strip else url

        if isinstance(cookie, basestring):
            if cookie != "":
                # NOTE(review): every whitespace run is replaced by ';', so '; '-separated input is
                # stored with empty elements (';;') - confirm this is intended.
                self.cookiesDict[localUrl] = ';'.join(cookie.split())
        elif isinstance(cookie, dict):
            self.cookiesDict[localUrl] = ';'.join(['%s=%s' % (k, v) for k, v in cookie.items()])


    # # Strip url (remove parameters from url string)
    #
    # @param url - url string
    # @return url - url without everything from the first '?'; non-string input is returned unchanged
    def __stripUrl(self, url):
        if isinstance(url, basestring) and '?' in url:
            url = url[:url.find('?')]

        return url


    # # Extract pair of name and values
    #
    # @param element - element string
    # @return name and value extracted from element string; value is '' if there is no '='
    def __extractPair(self, element):
        # Split on the first '=' only: cookie values may contain '=' themselves
        # (e.g. base64 padding) and must not be truncated.
        name, _, value = element.partition('=')

        return name, value


    # # Split cookies string
    #
    # @param cookie - cookie string value
    # @return cookies - list of Cookie class instances
    def __splitCookieString(self, cookieString):
        # variable for result
        cookies = []
        # extract elements
        elements = []
        for element in cookieString.split(';'):
            if element.count('=') > 1:
                # Several '=' may mean two pairs glued together ("path=/, name=value"):
                # cut at the last space before the last '='.
                endPos = element.rfind('=')
                begPos = element.rfind(' ', 0, endPos)
                if begPos < 0:
                    # No space to cut at: the extra '=' belong to the value. Keep the element whole
                    # (slicing with -1 would chop off its last character).
                    elements.append(element.strip())
                    continue

                first = element[:begPos].strip(',')
                second = element[begPos:]

                elements.append(first.strip())
                elements.append(second.strip())
            else:
                elements.append(element.strip())

        supportNames = HTTPCookieResolver.Cookie.supportNames
        cookieObj = HTTPCookieResolver.Cookie()
        for element in elements:
            name, value = self.__extractPair(element)
            attrName = name.lower()
            isAttribute = attrName in supportNames

            # A non-attribute element starts a new cookie: flush the one collected so far
            if not isAttribute and cookieObj.name is not None and cookieObj.value is not None:
                cookies.append(cookieObj)
                cookieObj = HTTPCookieResolver.Cookie()

            if isAttribute:
                setattr(cookieObj, attrName, value)
            elif cookieObj.name is None and cookieObj.value is None:
                cookieObj.name = name
                cookieObj.value = value

        if cookieObj.name is not None and cookieObj.value is not None:
            cookies.append(cookieObj)

        logger.debug("!!! cookies:")
        for cookie in cookies:
            logger.debug("%s", varDump(cookie))

        return cookies


    # # Check is allowed cookie instance
    #
    # @param url - url string for search cookie
    # @param cookieObj - cookie instance for check
    # @return True if allowed or otherwise False
    def __isAllowedCookie(self, url, cookieObj):
        isAllowedPath = isAllowedExpires = True

        # check 'path'
        # NOTE(review): the path is used as a regular expression as is (not escaped) - confirm
        if cookieObj.path is not None and re.search('.*' + cookieObj.path + '.*', url, re.UNICODE) is None:
            isAllowedPath = False

        # check 'expired'; a value that DateTimeType.parse returns None for is not treated as expired
        if cookieObj.expires is not None:
            expiresDatetime = DateTimeType.parse(cookieObj.expires, True, logger, False)
            if expiresDatetime is not None:
                expiresDatetime = DateTimeType.toUTC(expiresDatetime)
                currentDatetime = datetime.datetime.utcnow().replace(tzinfo=None)
                if currentDatetime > expiresDatetime:
                    isAllowedExpires = False

        logger.debug("Is allowed = %s for path '%s'", str(isAllowedPath), str(cookieObj.path))
        logger.debug("Is allowed = %s for expired '%s'", str(isAllowedExpires), str(cookieObj.expires))

        return isAllowedPath and isAllowedExpires


    # # Exctract domain properties
    #
    # @param url - url string for search cookie
    # @return propertyStage and propertyCookie - extracted stage and cookies from domain properties;
    #         STAGE_DEFAULT and None are used for values that are not configured
    def __extractDomainProperty(self, url):
        # variables for result
        propertyStage = HTTPCookieResolver.STAGE_DEFAULT
        propertyCookie = None

        propertyObj = self.__getDomainProperty(self.property, url)

        if propertyObj is not None:
            if propertyObj.stage is not None:
                propertyStage = propertyObj.stage
            propertyCookie = propertyObj.cookie

        return propertyStage, propertyCookie


    # # Check is allowed stage
    #
    # @param stage - allowed stage of usage (support of different stages use bitmask)
    # @param propertyStage - stage from properties (converted with int())
    # @return True if allowed or otherwise False
    def __isAllowedStage(self, stage, propertyStage):
        return bool(stage & int(propertyStage))


    # # Get domain property
    #
    # @param properties - properties structure
    # @param domain - domain name for extract from property
    # @return DomainProperty instance is success or otherwise None
    def __getDomainProperty(self, properties, domain):
        # variable for result
        ret = None
        try:
            if not isinstance(properties, dict):
                # report the actual argument, not the builtin 'property'
                raise Exception(self.ERROR_BAD_TYPE_INPUT_PROPERTY % (str(type(properties)), str(properties)))

            foundProperty = None
            if domain in properties:
                foundProperty = properties[domain]
            elif self.DOMAIN_DEFAULT in properties:
                foundProperty = properties[self.DOMAIN_DEFAULT]
            else:
                logger.debug(self.ERROR_INPUT_PROPERTY, varDump(properties))

            if foundProperty is not None:
                if not isinstance(foundProperty, dict):
                    raise Exception(self.ERROR_BAD_TYPE_FOUND_PROPERTY % (str(type(foundProperty)), str(foundProperty)))

                # copy only the keys DomainProperty knows about ('stage', 'cookie')
                domainPropertyObj = HTTPCookieResolver.DomainProperty()
                for name, value in foundProperty.items():
                    if hasattr(domainPropertyObj, name):
                        setattr(domainPropertyObj, name, value)
                ret = domainPropertyObj

        except Exception as err:
            logger.error(str(err))

        return ret


    # # Get valid cookie
    #
    # An exact match of url among the stored keys is returned immediately. Otherwise all stored
    # entries are examined; when several entries produce a result the last examined one is returned.
    #
    # @param url - url string for search cookie
    # @param stage - allowed stage of usage (support of different stages use bitmask)
    # @return ret - cookie string if found cookie or otherwise None
    def getCookie(self, url, stage=STAGE_DEFAULT):
        # variable for result
        ret = None

        logger.debug('!!! getCookie ENTER !!! url: ' + str(url))
        if url in self.cookiesDict:
            stored = self.cookiesDict[url]
            if stored is not None and stored != "":
                exact = ';'.join(stored.split())
                logger.debug('!!! getCookie LEAVE !!! return: ' + str(exact))
                return exact

        for localUrl, cookieString in self.cookiesDict.items():
            logger.debug("!!! localUrl = %s, cookieString: %s", str(localUrl), str(cookieString))

            propertyStage, propertyCookie = self.__extractDomainProperty(parseHost(localUrl))
            logger.debug("!!! propertyStage = %s, propertyCookie: %s", str(propertyStage), varDump(propertyCookie))

            # evaluate once, use for both the log line and the decision
            stageAllowed = self.__isAllowedStage(stage, propertyStage)
            logger.debug('is allowed stage: ' + str(stageAllowed))
            if not stageAllowed:
                continue

            if propertyCookie is not None:  # apply default value of cookie
                ret = propertyCookie
                continue

            # not exist default cookie
            logger.debug('cookieString: ' + str(cookieString))

            cookies = self.__splitCookieString(cookieString)
            logger.debug('cookies: ' + varDump(cookies))
            logger.debug('localUrl: ' + str(localUrl))
            logger.debug('url: ' + str(url))

            resList = []
            seen = set()
            for cookie in cookies:
                # NOTE(review): the domain is used as a regular expression as is (not escaped) - confirm
                if cookie.domain is not None and \
                   re.search('.*' + cookie.domain + '.*', url, re.UNICODE) is None:
                    continue

                cookieAllowed = self.__isAllowedCookie(url, cookie)
                logger.debug('is allowed: ' + str(cookieAllowed))
                if not cookieAllowed:
                    continue

                # remove dublicate, keeping the order in which the cookies were found
                data = cookie.getData()
                if data not in seen:
                    seen.add(data)
                    resList.append(data)

            resStr = ''.join(resList)
            if resStr != '':
                ret = resStr

        logger.debug('return cookie: ' + str(ret))

        return ret
def parseHost(url)
Definition: Utils.py:947
def varDump(obj, stringify=True, strTypeMaxLen=256, strTypeCutSuffix='...', stringifyType=1, ignoreErrors=False, objectsHash=None, depth=0, indent=2, ensure_ascii=False, maxDepth=10)
Definition: Utils.py:410
Definition: join.py:1