2 HCE project, Python bindings, Distributed Tasks Manager application. 3 Event objects definitions. 6 @file DetectModified.py 7 @author Scorp <developers.hce@gmail.com> 8 @copyright: Copyright © 2013-2014 IOIX Ukraine 9 @license: http://hierarchical-cluster-engine.com/license/ 29 Exception.__init__(self, message)
44 ALGO_IF_MOFIFIED_SINCE = 2
50 COMPARE_RAW_CONTENT = 3
53 BEHAVIOR_CRAWLED_STATUS = 1
54 BEHAVIOR_PROCESSED_STATUS = 2
55 BEHAVIOR_SAVE_UDATE = 3
81 if isinstance(eTags, list):
87 ret =
'","'.
join(eTags)
88 elif isinstance(eTags, basestring):
96 for key
in httpParams[
"httpHeader"]:
97 if key.lower() ==
"if-none-match" or key.lower() ==
"if-modified-since":
100 del httpParams[
"httpHeader"][elem]
110 if httpParams
is None:
117 if self.
eTags is not None and self.
eTags is not "":
119 httpParams[
"httpHeader"][
"if-none-match"] = eTagsString
122 httpParams[
"httpHeader"][
"if-modified-since"] = self.
lastModified 125 if self.
eTags is not None and self.
eTags is not "":
127 httpParams[
"httpHeader"][
"if-none-match"] = eTagsString
131 httpParams[
"httpHeader"][
"if-modified-since"] = self.
lastModified 135 raise Exception(
">>> Error [algorithm == 3 and mode == 1] not compatible !!!")
138 ret = BaseFetcher.get_fetcher(fetchType).open(httpParams[
"url"],
139 timeout=httpParams[
"httpTimeout"],
140 headers=httpParams[
"httpHeader"],
141 allow_redirects=httpParams[
"allowHttpRedirects"],
142 proxies=httpParams[
"proxies"], auth=httpParams[
"auth"],
143 data=httpParams[
"postData"], log=logger,
144 allowed_content_types=httpParams[
"processContentTypes"],
145 max_resource_size=httpParams[
"maxResourceSize"],
146 max_redirects=httpParams[
"maxHttpRedirects"],
153 ret = BaseFetcher.get_fetcher(fetchType).open(httpParams[
"url"],
154 timeout=httpParams[
"httpTimeout"],
155 headers=httpParams[
"httpHeader"],
156 allow_redirects=httpParams[
"allowHttpRedirects"],
157 proxies=httpParams[
"proxies"], auth=httpParams[
"auth"],
158 data=httpParams[
"postData"], log=logger,
159 allowed_content_types=httpParams[
"processContentTypes"],
160 max_resource_size=httpParams[
"maxResourceSize"],
161 max_redirects=httpParams[
"maxHttpRedirects"])
172 if res.status_code == 304:
175 if "last-modified" in res.headers
and self.
prevContentDate is not None:
177 resDate = time.mktime(rfc822.parsedate(res.headers[
"last-modified"]))
179 if resDate <= prevResDate:
182 raise Exception(
">>> Bad data format - resDate -" + str(res.headers[
"last-modified"]) +
" or prevResDate -" +
185 if res.rendered_unicode_content
is not None and \
186 hashlib.md5(res.rendered_unicode_content).hexdigest() == self.
prevContentMd5:
189 if res.rendered_unicode_content
is not None and len(res.rendered_unicode_content) == self.
prevContentLen:
213 logger.debug(
"!!! isNotModified() enter ... self.isResourceNotChanged = " + str(bool(self.
isResourceNotChanged)))
222 logger.debug(
"!!! isNotModified() leave ... ret = " + str(bool(ret)))
235 defaultUpdateUDate=True):
237 status = defaultStatus
238 updateUDate = defaultUpdateUDate
240 status = dc_event.URL.STATUS_UNDEFINED
243 status = dc_event.URL.STATUS_CRAWLED
245 urlContentObj = dc_event.URLContentRequest(siteId, url, dc_event.URLContentRequest.CONTENT_TYPE_PROCESSED)
246 urlContentResponse = dbWrapper.urlContent([urlContentObj])
247 if len(urlContentResponse.processedContents) > 0:
248 status = dc_event.URL.STATUS_PROCESSED
252 return status, updateUDate
def __init__(self, message, httpCode)
def generateETagsString(self, eTags)
def makeHTTPRequest(self, fetchType, httpParams)
int BEHAVIOR_PROCESSED_STATUS
int ALGO_IF_MOFIFIED_SINCE
def notModifiedStateProcessing(self, siteId, url, dbWrapper, defaultStatus=dc_event.URL.STATUS_CRAWLED, defaultUpdateUDate=True)
def __init__(self, modifiedSettingsStr)
def headersClearing(self, httpParams)
def resourceComparing(self, res)
int BEHAVIOR_CRAWLED_STATUS