HCE Project Python language Distributed Tasks Manager Application, Distributed Crawler Application and client API bindings.  2.0.0-chaika
Hierarchical Cluster Engine Python language binding
RefererHeaderResolver.py
Go to the documentation of this file.
1 """
2 HCE project, Python bindings, Distributed Tasks Manager application.
3 Referer HTTP header resolver definitions.
4 
5 @package: dc
6 @file RefererHeaderResolver.py
7 @author scorp <developers.hce@gmail.com>
8 @copyright: Copyright &copy; 2013-2014 IOIX Ukraine
9 @license: http://hierarchical-cluster-engine.com/license/
10 @since: 0.1
11 """
12 
13 import app.Utils as Utils # pylint: disable=F0401
14 import dc.EventObjects
15 import dc.Constants as DC_CONSTS
16 
17 logger = Utils.MPLogger().getLogger()
18 
19 
# #RefererHeaderResolver class calculates the "Referer" header value and adds it to the headers dict
#
class RefererHeaderResolver(object):

    # Modes of the "Referer" value calculation accepted by resolveRefererHeader()
    MODE_NONE = 0  # headers are left untouched
    MODE_SIMPLE = 1  # the resource's own url is used
    MODE_DOMAIN = 2  # the result of Utils.UrlParser.generateDomainUrl(url) is used
    MODE_PARENT = 3  # the parent resource's url is used, with the own url as fallback
    HEADER_NAME = "Referer"


    # #constructor
    #
    # @param dbWrapper default db-task wrapper, used by resolveRefererHeader() when the call passes none
    def __init__(self, dbWrapper=None):
        self.dbWrapper = dbWrapper


    # #fetchParentUrl fetches the parent url from the db
    #
    # @param siteId current resource's siteId
    # @param parentMd5 current resource's parentMd5
    # @param dbWrapper db-task wrapper
    # @return url of the first returned item, or None if any argument is None or nothing usable was returned
    def fetchParentUrl(self, siteId, parentMd5, dbWrapper):
        # Without all three values no request can be built, so no db call is made
        if siteId is None or parentMd5 is None or dbWrapper is None:
            return None

        statusRequest = dc.EventObjects.URLStatus(siteId, parentMd5)
        statusRequest.urlType = dc.EventObjects.URLStatus.URL_TYPE_MD5
        syncTask = DC_CONSTS.DRCESyncTasksCover(DC_CONSTS.EVENT_TYPES.URL_STATUS, [statusRequest])
        rows = dbWrapper.process(syncTask).eventObject

        # Only the first item of the response is inspected
        if rows is not None and len(rows) > 0 and rows[0] is not None:
            return rows[0].url
        return None


    # #resolveRefererHeader public method, adds the "Referer" header to the headers dict according to the mode
    #
    # An already present "Referer" key (compared case-insensitively) is never overwritten. The headers dict is
    # modified in place and nothing is returned. A mode value that matches none of the MODE_* constants leaves
    # the headers untouched.
    #
    # @param headers incoming http headers dict
    # @param mode mode of the "Referer" header value calculation, converted with int()
    # @param url current resource's url
    # @param siteId current resource's siteId
    # @param parentMd5 current resource's parentMd5
    # @param dbWrapper db-task wrapper, the one given to the constructor is used when None
    def resolveRefererHeader(self, headers, mode, url, siteId=None, parentMd5=None, dbWrapper=None):
        mode = int(mode)

        wantedName = self.HEADER_NAME.lower()
        if any(headerName.lower() == wantedName for headerName in headers):
            logger.info(">>> Referer field already in dict headers")
            return

        if mode == self.MODE_SIMPLE:
            headers[self.HEADER_NAME] = url
        elif mode == self.MODE_DOMAIN:
            headers[self.HEADER_NAME] = Utils.UrlParser.generateDomainUrl(url)
        elif mode == self.MODE_PARENT:
            # A wrapper passed with the call takes precedence over the instance-level one
            wrapper = self.dbWrapper if dbWrapper is None else dbWrapper
            parentUrl = self.fetchParentUrl(siteId, parentMd5, wrapper)
            # Fall back to the resource's own url when no parent url could be fetched
            headers[self.HEADER_NAME] = url if parentUrl is None else parentUrl
78 
def fetchParentUrl(self, siteId, parentMd5, dbWrapper)
def resolveRefererHeader(self, headers, mode, url, siteId=None, parentMd5=None, dbWrapper=None)