HCE Project Python language Distributed Tasks Manager Application, Distributed Crawler Application and client API bindings.  2.0.0-chaika
Hierarchical Cluster Engine Python language binding
dc_crawler.HTTPCookieResolver.HTTPCookieResolver Class Reference
Inheritance diagram for dc_crawler.HTTPCookieResolver.HTTPCookieResolver:
Collaboration diagram for dc_crawler.HTTPCookieResolver.HTTPCookieResolver:

Classes

class  Cookie
 
class  DomainProperty
 

Public Member Functions

def __init__ (self, propertyString=None)
 
def addCookie (self, url, cookie, strip=True)
 
def getCookie (self, url, stage=STAGE_DEFAULT)
 

Public Attributes

 cookiesDict
 
 property
 

Static Public Attributes

int STAGE_REGULAR = 1
 
int STAGE_REDIRECT = 2
 
int STAGE_ROBOTS = 4
 
int STAGE_RSS = 8
 
int STAGE_DEFAULT = STAGE_REGULAR | STAGE_REDIRECT | STAGE_ROBOTS | STAGE_RSS
 
string DOMAIN_DEFAULT = '.*'
 
string ERROR_BAD_TYPE_INPUT_PROPERTY = "Bad type (%s) of input property: %s"
 
string ERROR_BAD_TYPE_FOUND_PROPERTY = "Bad type (%s) of found property: %s"
 
string ERROR_INPUT_PROPERTY = "Input wrong properties: %s"
 
string ERROR_INITIALIZATION = "Initialization class '%s' was failed. Error: '%s'"
 

Private Member Functions

def __loadProperty (self, propertyString)
 
def __stripUrl (self, url)
 
def __extractPair (self, element)
 
def __splitCookieString (self, cookieString)
 
def __isAllowedCookie (self, url, cookieObj)
 
def __extractDomainProperty (self, url)
 
def __isAllowedStage (self, stage, propertyStage)
 
def __getDomainProperty (self, properties, domain)
 

Detailed Description

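Stores raw cookie strings keyed by URL (addCookie) and resolves the cookie string to use for a given URL and request stage (getCookie), taking the optional per-domain properties into account.

Definition at line 25 of file HTTPCookieResolver.py.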

Constructor & Destructor Documentation

◆ __init__()

def dc_crawler.HTTPCookieResolver.HTTPCookieResolver.__init__ (   self,
  propertyString = None 
)

Definition at line 74 of file HTTPCookieResolver.py.

def __init__(self, propertyString=None):
    """Create an empty resolver, optionally configured from a JSON string.

    :param propertyString: optional JSON text holding a mapping of domain to
        property dict; ignored unless it is a string.
    """
    self.cookiesDict = {}
    self.property = {self.DOMAIN_DEFAULT: {'stage': self.STAGE_DEFAULT}}

    if isinstance(propertyString, basestring):
        loaded = self.__loadProperty(propertyString)
        # __loadProperty returns None when parsing fails, and valid JSON may
        # still not be a mapping. Keep the default map in both cases: the
        # property lookup only accepts a dict and would log an error on
        # every call otherwise.
        if isinstance(loaded, dict):
            self.property = loaded
        logger.debug("!!! self.property: %s", varDump(self.property))
def varDump(obj, stringify=True, strTypeMaxLen=256, strTypeCutSuffix='...', stringifyType=1, ignoreErrors=False, objectsHash=None, depth=0, indent=2, ensure_ascii=False, maxDepth=10)
Definition: Utils.py:410
def __init__(self)
constructor
Definition: UIDGenerator.py:19

Member Function Documentation

◆ __extractDomainProperty()

def dc_crawler.HTTPCookieResolver.HTTPCookieResolver.__extractDomainProperty (   self,
  url 
)
private

Definition at line 238 of file HTTPCookieResolver.py.

def __extractDomainProperty(self, url):
    """Return the stage mask and default cookie configured for ``url``.

    :param url: key handed to the domain-property lookup.
    :return: tuple ``(stage, cookie)``; ``stage`` falls back to
        ``STAGE_DEFAULT`` and ``cookie`` to ``None`` when nothing is configured.
    """
    domainProperty = self.__getDomainProperty(self.property, url)
    if domainProperty is None:
        return HTTPCookieResolver.STAGE_DEFAULT, None

    stageMask = domainProperty.stage
    if stageMask is None:
        stageMask = HTTPCookieResolver.STAGE_DEFAULT

    return stageMask, domainProperty.cookie
Here is the call graph for this function:
Here is the caller graph for this function:

◆ __extractPair()

def dc_crawler.HTTPCookieResolver.HTTPCookieResolver.__extractPair (   self,
  element 
)
private

Definition at line 135 of file HTTPCookieResolver.py.

def __extractPair(self, element):
    """Split a ``name=value`` element into its two parts.

    Only the first ``=`` separates name from value, so a value that itself
    contains ``=`` (for example base64 padding) is returned intact.

    :param element: single cookie element such as ``'sid=abc'`` or ``'secure'``.
    :return: tuple ``(name, value)``; ``value`` is ``''`` when there is no ``=``.
    """
    name, _, value = element.partition('=')
    return name, value
Here is the caller graph for this function:

◆ __getDomainProperty()

def dc_crawler.HTTPCookieResolver.HTTPCookieResolver.__getDomainProperty (   self,
  properties,
  domain 
)
private

Definition at line 273 of file HTTPCookieResolver.py.

def __getDomainProperty(self, properties, domain):
    """Build a ``DomainProperty`` for ``domain`` from the property mapping.

    The entry stored under ``domain`` is used when present, otherwise the
    entry stored under ``DOMAIN_DEFAULT``. Errors are logged, not raised.

    :param properties: dict mapping a domain key to a dict of property values.
    :param domain: key to look up in ``properties``.
    :return: populated ``DomainProperty`` instance, or ``None`` when nothing
        suitable was found or the input had the wrong type.
    """
    ret = None
    try:
        if not isinstance(properties, dict):
            # Report the offending argument itself, not the builtin `property`.
            raise Exception(self.ERROR_BAD_TYPE_INPUT_PROPERTY % (str(type(properties)), str(properties)))

        foundProperty = None
        if domain in properties:
            foundProperty = properties[domain]
        elif self.DOMAIN_DEFAULT in properties:
            foundProperty = properties[self.DOMAIN_DEFAULT]
        else:
            logger.debug(self.ERROR_INPUT_PROPERTY, varDump(properties))

        if foundProperty is not None:
            if not isinstance(foundProperty, dict):
                raise Exception(self.ERROR_BAD_TYPE_FOUND_PROPERTY % (str(type(foundProperty)), str(foundProperty)))

            domainPropertyObj = HTTPCookieResolver.DomainProperty()
            # Copy only the keys that DomainProperty declares; ignore the rest.
            for name, value in foundProperty.items():
                if hasattr(domainPropertyObj, name):
                    setattr(domainPropertyObj, name, value)
            ret = domainPropertyObj

    except Exception as err:
        logger.error(str(err))

    return ret
def varDump(obj, stringify=True, strTypeMaxLen=256, strTypeCutSuffix='...', stringifyType=1, ignoreErrors=False, objectsHash=None, depth=0, indent=2, ensure_ascii=False, maxDepth=10)
Definition: Utils.py:410
Here is the call graph for this function:
Here is the caller graph for this function:

◆ __isAllowedCookie()

def dc_crawler.HTTPCookieResolver.HTTPCookieResolver.__isAllowedCookie (   self,
  url,
  cookieObj 
)
private

Definition at line 205 of file HTTPCookieResolver.py.

def __isAllowedCookie(self, url, cookieObj):
    """Check a cookie's ``path`` and ``expires`` attributes against ``url``.

    :param url: URL the cookie is being considered for.
    :param cookieObj: cookie object exposing ``path`` and ``expires``.
    :return: ``True`` when the path occurs in ``url`` (or is unset) and the
        cookie has not expired (or has no parsable expiry), else ``False``.
    """
    isAllowedPath = isAllowedExpires = True

    # check 'path': the path is literal text, so escape it before searching.
    # Unescaped, characters such as '+', '(' or '[' raised re.error and '.'
    # matched any character.
    if cookieObj.path is not None and re.search(re.escape(cookieObj.path), url, re.UNICODE) is None:
        isAllowedPath = False

    # check 'expired'
    if cookieObj.expires is not None:
        expiresDatetime = DateTimeType.parse(cookieObj.expires, True, logger, False)
        if expiresDatetime is not None:
            expiresDatetime = DateTimeType.toUTC(expiresDatetime)
            # utcnow() already returns a naive datetime, so no tzinfo reset is needed.
            if datetime.datetime.utcnow() > expiresDatetime:
                isAllowedExpires = False

    logger.debug("Is allowed = %s for path '%s'", str(isAllowedPath), str(cookieObj.path))
    logger.debug("Is allowed = %s for expired '%s'", str(isAllowedExpires), str(cookieObj.expires))

    return isAllowedPath and isAllowedExpires
Here is the caller graph for this function:

◆ __isAllowedStage()

def dc_crawler.HTTPCookieResolver.HTTPCookieResolver.__isAllowedStage (   self,
  stage,
  propertyStage 
)
private

Definition at line 259 of file HTTPCookieResolver.py.

def __isAllowedStage(self, stage, propertyStage):
    """Tell whether ``stage`` shares at least one bit with ``propertyStage``.

    :param stage: requested stage bit mask.
    :param propertyStage: configured stage mask; converted with ``int()``.
    :return: ``True`` if the two masks overlap, otherwise ``False``.
    """
    return bool(stage & int(propertyStage))
Here is the caller graph for this function:

◆ __loadProperty()

def dc_crawler.HTTPCookieResolver.HTTPCookieResolver.__loadProperty (   self,
  propertyString 
)
private

Definition at line 86 of file HTTPCookieResolver.py.

def __loadProperty(self, propertyString):
    """Decode the JSON property string.

    :param propertyString: JSON text to decode.
    :return: the decoded value, or ``None`` if decoding failed (the failure
        is logged as an error).
    """
    try:
        return json.loads(propertyString)
    except Exception as err:
        logger.error(self.ERROR_INITIALIZATION, self.__class__.__name__, str(err))

    return None
Here is the caller graph for this function:

◆ __splitCookieString()

def dc_crawler.HTTPCookieResolver.HTTPCookieResolver.__splitCookieString (   self,
  cookieString 
)
private

Definition at line 152 of file HTTPCookieResolver.py.

def __splitCookieString(self, cookieString):
    """Parse a stored cookie string into a list of ``Cookie`` objects.

    The string is cut on ``';'``. An element holding more than one ``=`` is
    treated as two elements glued together and is cut at the last space
    before its final ``=``. Elements whose lower-cased name is listed in
    ``Cookie.supportNames`` become attributes of the current cookie; any
    other element starts a new cookie.

    :param cookieString: cookie text as stored in ``cookiesDict``.
    :return: list of ``Cookie`` objects that have both name and value set.
    """
    cookies = []

    # extract elements
    elements = []
    for rawElement in cookieString.split(';'):
        element = rawElement.strip()
        if element == '':
            # An empty element (from ';;' or a trailing ';') carries nothing.
            # Processing it used to close the current cookie and start an
            # empty-named one that swallowed the attributes that followed.
            continue

        if element.count('=') > 1:
            endPos = element.rfind('=')
            begPos = element.rfind(' ', 0, endPos)
            if begPos >= 0:
                elements.append(element[:begPos].strip(',').strip())
                elements.append(element[begPos:].strip())
                continue
            # No space to cut at: the extra '=' belongs to the value. Cutting
            # at -1 used to chop off the last character as a bogus element.

        elements.append(element)

    supportNames = HTTPCookieResolver.Cookie.supportNames
    cookieObj = HTTPCookieResolver.Cookie()
    for element in elements:
        name, value = self.__extractPair(element)
        lowerName = name.lower()

        # A non-attribute name after a complete cookie begins the next cookie.
        if lowerName not in supportNames and \
           cookieObj.name is not None and cookieObj.value is not None:
            cookies.append(cookieObj)
            cookieObj = HTTPCookieResolver.Cookie()

        if lowerName in supportNames:
            if hasattr(cookieObj, lowerName):
                setattr(cookieObj, lowerName, value)
        elif cookieObj.name is None and cookieObj.value is None:
            cookieObj.name = name
            cookieObj.value = value

    if cookieObj.name is not None and cookieObj.value is not None:
        cookies.append(cookieObj)

    logger.debug("!!! cookies:")
    for cookie in cookies:
        logger.debug("%s", varDump(cookie))

    return cookies
def varDump(obj, stringify=True, strTypeMaxLen=256, strTypeCutSuffix='...', stringifyType=1, ignoreErrors=False, objectsHash=None, depth=0, indent=2, ensure_ascii=False, maxDepth=10)
Definition: Utils.py:410
Here is the call graph for this function:
Here is the caller graph for this function:

◆ __stripUrl()

def dc_crawler.HTTPCookieResolver.HTTPCookieResolver.__stripUrl (   self,
  url 
)
private

Definition at line 124 of file HTTPCookieResolver.py.

def __stripUrl(self, url):
    """Drop the query part (from the first ``'?'`` on) of a string URL.

    :param url: URL to shorten; non-string values are returned untouched.
    :return: ``url`` without its query part.
    """
    if not isinstance(url, basestring):
        return url

    return url.partition('?')[0]
Here is the caller graph for this function:

◆ addCookie()

def dc_crawler.HTTPCookieResolver.HTTPCookieResolver.addCookie (   self,
  url,
  cookie,
  strip = True 
)

Definition at line 103 of file HTTPCookieResolver.py.

def addCookie(self, url, cookie, strip=True):
    """Remember ``cookie`` for ``url`` in ``cookiesDict``.

    :param url: URL the cookie belongs to; used as the dictionary key.
    :param cookie: non-empty string (runs of whitespace are replaced by
        ``';'``) or dict (stored as ``name=value`` pairs joined by ``';'``).
        Any other value, and the empty string, is ignored.
    :param strip: when true, the query part of ``url`` is removed first.
    """
    key = self.__stripUrl(url) if strip else url

    if isinstance(cookie, basestring):
        if cookie == "":
            return
        self.cookiesDict[key] = ';'.join(cookie.split())
    elif isinstance(cookie, dict):
        pairs = ['%s=%s' % (name, value) for name, value in cookie.items()]
        self.cookiesDict[key] = ';'.join(pairs)
Definition: join.py:1
Here is the call graph for this function:

◆ getCookie()

def dc_crawler.HTTPCookieResolver.HTTPCookieResolver.getCookie (   self,
  url,
  stage = STAGE_DEFAULT 
)

Definition at line 309 of file HTTPCookieResolver.py.

def getCookie(self, url, stage=STAGE_DEFAULT):
    """Resolve the cookie string to use for ``url`` at the given stage.

    A non-empty cookie stored under exactly ``url`` is returned at once, with
    whitespace replaced by ``';'``. Otherwise every stored entry is examined.
    If the entry's host properties allow ``stage``, the result becomes either
    the configured default cookie or the concatenated data of the stored
    cookies whose domain, path and expiry admit ``url``. Later entries
    override earlier ones.

    :param url: URL a cookie is requested for.
    :param stage: stage bit mask of the request.
    :return: cookie string, or ``None`` when nothing applies.
    """
    ret = None

    logger.debug('!!! getCookie ENTER !!! url: ' + str(url))
    storedCookie = self.cookiesDict.get(url)
    if storedCookie is not None and storedCookie != "":
        exactCookie = ';'.join(storedCookie.split())
        logger.debug('!!! getCookie LEAVE !!! return: ' + str(exactCookie))
        return exactCookie

    for localUrl, cookieString in self.cookiesDict.items():
        logger.debug("!!! localUrl = %s, cookieString: %s", str(localUrl), str(cookieString))

        propertyStage, propertyCookie = self.__extractDomainProperty(parseHost(localUrl))
        logger.debug("!!! propertyStage = %s, propertyCookie: %s", str(propertyStage), varDump(propertyCookie))

        # Evaluate each predicate once and reuse the result for logging.
        stageAllowed = self.__isAllowedStage(stage, propertyStage)
        logger.debug('is allowed stage: ' + str(stageAllowed))
        if not stageAllowed:
            continue

        if propertyCookie is not None:
            # apply default value of cookie
            ret = propertyCookie
            continue

        # no default cookie configured: select from the stored cookie string
        logger.debug('cookieString: ' + str(cookieString))

        cookies = self.__splitCookieString(cookieString)
        logger.debug('cookies: ' + varDump(cookies))
        logger.debug('localUrl: ' + str(localUrl))
        logger.debug('url: ' + str(url))

        resList = []
        for cookie in cookies:
            # The domain is used as a regular-expression fragment, unescaped:
            # the leading '.' of '.example.com' matches the '/' before the host.
            if cookie.domain is None or \
               re.search('.*' + cookie.domain + '.*', url, re.UNICODE) is not None:

                cookieAllowed = self.__isAllowedCookie(url, cookie)
                logger.debug('is allowed: ' + str(cookieAllowed))
                if cookieAllowed:
                    data = cookie.getData()
                    # Drop duplicates but keep first-seen order, so the
                    # result is deterministic (set() gave an arbitrary order).
                    if data not in resList:
                        resList.append(data)

        resStr = ''.join(resList)

        # Compare by value: 'is not' tested object identity.
        if resStr != '':
            ret = resStr

    logger.debug('return cookie: ' + str(ret))

    return ret
def parseHost(url)
Definition: Utils.py:947
def varDump(obj, stringify=True, strTypeMaxLen=256, strTypeCutSuffix='...', stringifyType=1, ignoreErrors=False, objectsHash=None, depth=0, indent=2, ensure_ascii=False, maxDepth=10)
Definition: Utils.py:410
Definition: join.py:1
Here is the call graph for this function:

Member Data Documentation

◆ cookiesDict

dc_crawler.HTTPCookieResolver.HTTPCookieResolver.cookiesDict

Definition at line 75 of file HTTPCookieResolver.py.

◆ DOMAIN_DEFAULT

string dc_crawler.HTTPCookieResolver.HTTPCookieResolver.DOMAIN_DEFAULT = '.*'
static

Definition at line 33 of file HTTPCookieResolver.py.

◆ ERROR_BAD_TYPE_FOUND_PROPERTY

string dc_crawler.HTTPCookieResolver.HTTPCookieResolver.ERROR_BAD_TYPE_FOUND_PROPERTY = "Bad type (%s) of found property: %s"
static

Definition at line 37 of file HTTPCookieResolver.py.

◆ ERROR_BAD_TYPE_INPUT_PROPERTY

string dc_crawler.HTTPCookieResolver.HTTPCookieResolver.ERROR_BAD_TYPE_INPUT_PROPERTY = "Bad type (%s) of input property: %s"
static

Definition at line 36 of file HTTPCookieResolver.py.

◆ ERROR_INITIALIZATION

string dc_crawler.HTTPCookieResolver.HTTPCookieResolver.ERROR_INITIALIZATION = "Initialization class '%s' was failed. Error: '%s'"
static

Definition at line 39 of file HTTPCookieResolver.py.

◆ ERROR_INPUT_PROPERTY

string dc_crawler.HTTPCookieResolver.HTTPCookieResolver.ERROR_INPUT_PROPERTY = "Input wrong properties: %s"
static

Definition at line 38 of file HTTPCookieResolver.py.

◆ property

dc_crawler.HTTPCookieResolver.HTTPCookieResolver.property

Definition at line 76 of file HTTPCookieResolver.py.

◆ STAGE_DEFAULT

int dc_crawler.HTTPCookieResolver.HTTPCookieResolver.STAGE_DEFAULT = STAGE_REGULAR | STAGE_REDIRECT | STAGE_ROBOTS | STAGE_RSS
static

Definition at line 32 of file HTTPCookieResolver.py.

◆ STAGE_REDIRECT

int dc_crawler.HTTPCookieResolver.HTTPCookieResolver.STAGE_REDIRECT = 2
static

Definition at line 28 of file HTTPCookieResolver.py.

◆ STAGE_REGULAR

int dc_crawler.HTTPCookieResolver.HTTPCookieResolver.STAGE_REGULAR = 1
static

Definition at line 27 of file HTTPCookieResolver.py.

◆ STAGE_ROBOTS

int dc_crawler.HTTPCookieResolver.HTTPCookieResolver.STAGE_ROBOTS = 4
static

Definition at line 29 of file HTTPCookieResolver.py.

◆ STAGE_RSS

int dc_crawler.HTTPCookieResolver.HTTPCookieResolver.STAGE_RSS = 8
static

Definition at line 30 of file HTTPCookieResolver.py.


The documentation for this class was generated from the following file: