HCE Project Python language Distributed Tasks Manager Application, Distributed Crawler Application and client API bindings.  2.0.0-chaika
Hierarchical Cluster Engine Python language binding
DetectModified.py
Go to the documentation of this file.
1 """
2 HCE project, Python bindings, Distributed Tasks Manager application.
3 Detection of the "resource not modified" state.
4 
5 @package: dc
6 @file DetectModified.py
7 @author Scorp <developers.hce@gmail.com>
8 @copyright: Copyright &copy; 2013-2014 IOIX Ukraine
9 @license: http://hierarchical-cluster-engine.com/license/
10 @since: 0.1
11 """
12 
13 import json
14 import hashlib
15 import rfc822
16 import time
17 # import datetime
18 # from email.Utils import formatdate # pylint: disable=E0611,E0401
19 
20 import app.Utils as Utils # pylint: disable=F0401
21 from dc_crawler.Fetcher import BaseFetcher
22 import dc.EventObjects as dc_event
23 
24 logger = Utils.MPLogger().getLogger()
25 
26 
# #exception that carries an HTTP code in addition to the regular message
#
class NotModifiedException(Exception):

  # #class constructor
  #
  # @param message text passed on to the base Exception
  # @param httpCode value stored as-is in the httpCode attribute
  def __init__(self, message, httpCode):
    super(NotModifiedException, self).__init__(message)
    self.httpCode = httpCode
32 
33 
34 # #class implementing all the logic needed to detect the "Resource Not Modified" state
35 #
class DetectModified(object):
  """Detects whether a fetched resource is "not modified" since a previous fetch.

  The behaviour is driven by a settings dictionary (decoded from JSON in the
  constructor) that is read through the keys "algorithm", "mode", "compare"
  and "behavior"; their values are compared against the ALGO_*, MODE_*,
  COMPARE_* and BEHAVIOR_* constants below.
  """

  MODE_DISABLED = 0
  MODE_ONE_REQUEST = 1
  MODE_TWO_REQUESTS = 2

  ALGO_AUTO = 0
  ALGO_IF_NOT_MATCH = 1
  ALGO_IF_MOFIFIED_SINCE = 2
  # Correctly spelled alias; the misspelled name above is kept for existing callers.
  ALGO_IF_MODIFIED_SINCE = ALGO_IF_MOFIFIED_SINCE
  ALGO_HEAD = 3

  COMPARE_AUTO = 0
  COMPARE_DATE = 1
  # NOTE: despite the name, this mode compares MD5 digests (see resourceComparing).
  COMPARE_CRC32 = 2
  # NOTE: despite the name, this mode compares content lengths (see resourceComparing).
  COMPARE_RAW_CONTENT = 3

  BEHAVIOR_DEFAULT = 0
  BEHAVIOR_CRAWLED_STATUS = 1
  BEHAVIOR_PROCESSED_STATUS = 2
  BEHAVIOR_SAVE_UDATE = 3

  # Request headers (lower case) that this class owns and resets before each request.
  _CONDITIONAL_HEADERS = ("if-none-match", "if-modified-since")


  # #class constructor
  #
  # @param modifiedSettingsStr contains string with values for internal algorithm (JSON object)
  def __init__(self, modifiedSettingsStr):
    try:
      settings = json.loads(modifiedSettingsStr)
    except Exception:  # pylint: disable=W0703
      # Deliberate best effort: an absent or malformed settings string means "no settings".
      settings = None
    # Settings are always indexed by key name, so any JSON value other than an object
    # is unusable and is treated the same way as a missing one.
    self.modifiedSettings = settings if isinstance(settings, dict) else None
    # Values describing the previously fetched copy of the resource. Nothing in this class
    # assigns them after construction - presumably the caller fills them in before
    # makeHTTPRequest() is used (verify against callers).
    self.lastModified = None
    self.eTags = None
    self.isResourceNotChanged = False
    self.prevContentLen = None
    self.prevContentMd5 = None
    self.prevContentDate = None


  # #generateETagsString method pastes all elements from eTags into one string
  #
  # @param eTags contains list of e-tags or a single e-tag string
  # @return resulting string: the elements joined with '","' and wrapped into double quotes,
  #         or an empty string if eTags is None
  def generateETagsString(self, eTags):
    ret = ""
    if eTags is not None:
      if isinstance(eTags, list):
        # join() accepts strings only, so convert everything that is not a string yet
        ret = '","'.join([eTag if isinstance(eTag, basestring) else str(eTag) for eTag in eTags])
      elif isinstance(eTags, basestring):
        ret = eTags
      # NOTE(review): a value that already carries its own quotes gets quoted twice here - confirm
      # how callers store e-tags.
      ret = '"' + ret + '"'
    return ret


  # #headersClearing removes the conditional request headers (in any letter case)
  #
  # @param httpParams dict of request parameters; its "httpHeader" dict is modified in place
  def headersClearing(self, httpParams):
    headers = httpParams.get("httpHeader")
    if not headers:
      # nothing to clean up (key is absent, None or an empty dict)
      return
    # collect the keys first, a dict must not change its size while it is iterated
    dellKeys = [key for key in headers if key.lower() in self._CONDITIONAL_HEADERS]
    for key in dellKeys:
      del headers[key]


  # #_openResource performs one request through the fetcher selected by fetchType
  #
  # @param fetchType value passed to BaseFetcher.get_fetcher()
  # @param httpParams dict of request parameters
  # @param method HTTP method name; when None the "method" argument is not passed at all
  # @return the value returned by the fetcher's open()
  def _openResource(self, fetchType, httpParams, method=None):
    fetcherOpen = BaseFetcher.get_fetcher(fetchType).open
    url = httpParams["url"]
    kwargs = {"timeout": httpParams["httpTimeout"],
              "headers": httpParams["httpHeader"],
              "allow_redirects": httpParams["allowHttpRedirects"],
              "proxies": httpParams["proxies"],
              "auth": httpParams["auth"],
              "data": httpParams["postData"],
              "log": logger,
              "allowed_content_types": httpParams["processContentTypes"],
              "max_resource_size": httpParams["maxResourceSize"],
              "max_redirects": httpParams["maxHttpRedirects"]}
    if method is not None:
      kwargs["method"] = method
    return fetcherOpen(url, **kwargs)


  # #makeHTTPRequest fetches the resource and evaluates the "not modified" state
  #
  # Does nothing and returns None if there are no settings. Otherwise the conditional headers
  # are reset, set again according to the "algorithm" setting and the request is made
  # ("head" method for ALGO_HEAD, "get" otherwise). The response is passed to resourceComparing();
  # if the resource is found changed and "mode" is MODE_TWO_REQUESTS, a second request without
  # the conditional headers and without an explicit method is made and its response is returned.
  #
  # @param fetchType value passed to BaseFetcher.get_fetcher()
  # @param httpParams dict of request parameters; its "httpHeader" dict is modified in place
  # @return response of the last performed request or None
  # @raise Exception if ALGO_HEAD is combined with MODE_ONE_REQUEST
  # @raise KeyError if a required setting or request parameter is missing
  def makeHTTPRequest(self, fetchType, httpParams):
    ret = None
    self.isResourceNotChanged = False
    if self.modifiedSettings is not None:
      if httpParams is None:
        httpParams = {}
      # The conditional headers are written into this dict below, so it has to exist.
      if httpParams.get("httpHeader") is None:
        httpParams["httpHeader"] = {}
      headers = httpParams["httpHeader"]

      self.headersClearing(httpParams)

      algorithm = self.modifiedSettings["algorithm"]
      localMethod = "get"
      if algorithm == self.ALGO_HEAD:
        if self.modifiedSettings["mode"] == self.MODE_ONE_REQUEST:
          raise Exception(">>> Error [algorithm == 3 and mode == 1] not compatible !!!")
        localMethod = "head"
      else:
        # ALGO_AUTO applies both conditional headers, the two specific algorithms one each.
        # The e-tags are compared by value: an identity test against "" is not reliable.
        if algorithm in (self.ALGO_AUTO, self.ALGO_IF_NOT_MATCH) and \
           self.eTags is not None and self.eTags != "":
          headers["if-none-match"] = self.generateETagsString(self.eTags)
        if algorithm in (self.ALGO_AUTO, self.ALGO_IF_MOFIFIED_SINCE) and self.lastModified is not None:
          headers["if-modified-since"] = self.lastModified

      ret = self._openResource(fetchType, httpParams, method=localMethod)

      if ret is not None:
        self.resourceComparing(ret)
        if not self.isResourceNotChanged and self.modifiedSettings["mode"] == self.MODE_TWO_REQUESTS:
          self.headersClearing(httpParams)
          ret = self._openResource(fetchType, httpParams)

    return ret


  # #resourceComparing sets isResourceNotChanged according to the "compare" setting
  #
  # COMPARE_AUTO        - response status code is 304
  # COMPARE_DATE        - "last-modified" response header is not later than prevContentDate
  # COMPARE_CRC32       - MD5 hex digest of the content equals prevContentMd5
  # COMPARE_RAW_CONTENT - length of the content equals prevContentLen
  #
  # @param res response object; status_code, headers and rendered_unicode_content are read
  # @raise Exception if one of the compared dates cannot be converted
  def resourceComparing(self, res):
    self.isResourceNotChanged = False
    compare = self.modifiedSettings["compare"]
    if compare == self.COMPARE_AUTO:
      if res.status_code == 304:
        self.isResourceNotChanged = True
    elif compare == self.COMPARE_DATE:
      if "last-modified" in res.headers and self.prevContentDate is not None:
        try:
          # parsedate() returns None for an unparsable value, which makes mktime() fail
          resDate = time.mktime(rfc822.parsedate(res.headers["last-modified"]))
          prevResDate = time.mktime(rfc822.parsedate(self.prevContentDate))
        except Exception:
          raise Exception(">>> Bad data format - resDate -" + str(res.headers["last-modified"]) + " or prevResDate -" +
                          str(self.prevContentDate))
        if resDate <= prevResDate:
          self.isResourceNotChanged = True
    elif compare == self.COMPARE_CRC32:
      content = res.rendered_unicode_content
      if content is not None:
        # md5() of a unicode object implies an ASCII encoding and fails on any other character;
        # for pure ASCII text the UTF-8 encoding yields the very same digest.
        # NOTE(review): assumes prevContentMd5 was computed from UTF-8 bytes - confirm with the producer.
        if isinstance(content, unicode):
          content = content.encode("utf-8")
        if hashlib.md5(content).hexdigest() == self.prevContentMd5:
          self.isResourceNotChanged = True
    elif compare == self.COMPARE_RAW_CONTENT:
      content = res.rendered_unicode_content
      if content is not None and len(content) == self.prevContentLen:
        self.isResourceNotChanged = True


  # #getBehaviour returns the configured behavior
  #
  # @return value of the "behavior" setting or None if there are no settings
  # @raise KeyError if the settings have no "behavior" key
  def getBehaviour(self):
    if self.modifiedSettings is None:
      return None
    return self.modifiedSettings["behavior"]


  # # Check not modified property
  #
  # @return True - if the resource was detected as not changed and the "behavior" setting is
  #         BEHAVIOR_DEFAULT or BEHAVIOR_SAVE_UDATE, otherwise False
  def isNotModified(self):
    logger.debug("!!! isNotModified() enter ... self.isResourceNotChanged = %s",
                 str(bool(self.isResourceNotChanged)))

    ret = bool(self.isResourceNotChanged and self.modifiedSettings is not None and
               self.modifiedSettings["behavior"] in [self.BEHAVIOR_DEFAULT, self.BEHAVIOR_SAVE_UDATE])

    logger.debug("!!! isNotModified() leave ... ret = %s", str(ret))
    return ret


  # #notModifiedStateProcessing process resource in case of "not modified" state
  #
  # If the resource was not detected as unchanged, the defaults are returned untouched.
  # Otherwise the result starts as (STATUS_UNDEFINED, False) and is adjusted by the behavior:
  # BEHAVIOR_CRAWLED_STATUS   - status is STATUS_CRAWLED
  # BEHAVIOR_PROCESSED_STATUS - status is STATUS_PROCESSED if the database returns processed content
  # BEHAVIOR_SAVE_UDATE       - updateUDate is True
  #
  # @param siteId - resource's siteId
  # @param url - resource's url
  # @param dbWrapper - database wrapper instance
  # @param defaultStatus - default status
  # @param defaultUpdateUDate - default updateUDate
  # @return status and updateUDate
  def notModifiedStateProcessing(self, siteId, url, dbWrapper, defaultStatus=dc_event.URL.STATUS_CRAWLED,
                                 defaultUpdateUDate=True):
    if not self.isResourceNotChanged:
      return defaultStatus, defaultUpdateUDate

    status = dc_event.URL.STATUS_UNDEFINED
    updateUDate = False
    behaviour = self.getBehaviour()
    if behaviour == self.BEHAVIOR_CRAWLED_STATUS:
      status = dc_event.URL.STATUS_CRAWLED
    elif behaviour == self.BEHAVIOR_PROCESSED_STATUS:
      urlContentObj = dc_event.URLContentRequest(siteId, url, dc_event.URLContentRequest.CONTENT_TYPE_PROCESSED)
      urlContentResponse = dbWrapper.urlContent([urlContentObj])
      if len(urlContentResponse.processedContents) > 0:
        status = dc_event.URL.STATUS_PROCESSED
    elif behaviour == self.BEHAVIOR_SAVE_UDATE:
      updateUDate = True

    return status, updateUDate
def makeHTTPRequest(self, fetchType, httpParams)
def notModifiedStateProcessing(self, siteId, url, dbWrapper, defaultStatus=dc_event.URL.STATUS_CRAWLED, defaultUpdateUDate=True)
def __init__(self, modifiedSettingsStr)
Definition: join.py:1