"""
@file ResourceProcess.py
@author Scorp <developers.hce@gmail.com>
@link: http://hierarchical-cluster-engine.com/
@copyright: Copyright © 2013-2014 IOIX Ukraine
@license: http://hierarchical-cluster-engine.com/license/
"""

RECOVER_IF_FAILED = "2"


def checkFieldsIsNone(self, checkList):
    # Raise if any mandatory attribute named in checkList is missing or None.
    for name in checkList:
        if not hasattr(self, name) or getattr(self, name) is None:
            raise Exception("Some mandatory field `%s` must be initialized!" % name)
def getCodec(self, charset):
    # Map an HTTP charset name onto a known codec from
    # CRAWLER_CONSTS.standardEncodings ({codec: comma-separated aliases}).
    codec = None
    if isinstance(charset, basestring):
        charset = charset.split(',')[0]
        if charset in CRAWLER_CONSTS.standardEncodings.keys():
            codec = charset
        else:
            for codec, aliases in CRAWLER_CONSTS.standardEncodings.items():
                if aliases.find(charset) > -1 or aliases.find(charset.lower()) > -1:
                    break
    return codec
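# Usage sketch for getCodec (values illustrative; assumes standardEncodings maps a
# codec name to a comma-separated alias string, e.g. 'cp1251': 'windows-1251,win-1251'):
#
#   codec = self.getCodec('windows-1251, text/html')
#   # -> 'cp1251', because 'windows-1251' occurs in the alias string for 'cp1251'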
def convertCharset(self, headers, charset):
    # Re-encode response headers to UTF-8 when the response codec is known.
    responseHeader = ""
    logger.debug("headers: %s, type: %s", str(headers), str(type(headers)))
    logger.debug("charset: %s, type: %s", str(charset), str(type(charset)))
    try:
        if isinstance(headers, requests.structures.CaseInsensitiveDict) and isinstance(charset, basestring):
            codec = self.getCodec(charset)
            logger.debug("codec: %s", str(codec))
            if codec is None:
                responseHeader = '\r\n'.join(['%s: %s' % (k, v) for k, v in headers.iteritems()])
            else:
                responseHeader = '\r\n'.join(['%s: %s' % (k.decode(codec).encode('utf-8'),
                                                          v.decode(codec).encode('utf-8'))
                                              for k, v in headers.iteritems()])
    except Exception, err:
        logger.error(str(err))
    return responseHeader
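# Standalone sketch of the join used above (names local to the example):
#
#   headers = {'Content-Type': 'text/html', 'Server': 'nginx'}
#   raw = '\r\n'.join(['%s: %s' % (k, v) for k, v in headers.iteritems()])
#   # raw == 'Content-Type: text/html\r\nServer: nginx'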
def generateResource(self, startTime, res, headers, crawledTime, defaultIcrCrawlTime, contentTypeMap=None):
    # Build a crawled-resource object from the fetcher response `res`.
    # ...
    resource.meta_content = res.meta_res
    resource.crawling_time = int((crawledTime - startTime) * 1000)
    if res.content_size is not None and resource.crawling_time != 0:
        resource.bps = res.content_size / resource.crawling_time * 1000
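    # Worked example for the bps figure above (Python 2 integer division):
    # content_size = 50000 bytes fetched in crawling_time = 250 ms gives
    # 50000 / 250 * 1000 = 200 * 1000 = 200000 bytes per second.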
    logger.info("crawling_time: %s, bps: %s", resource.crawling_time, resource.bps)
    resource.http_code = res.status_code
    logger.debug("headers is: %s", res.headers)
    localHeaders = {}
    if res.headers is not None:
        for elem in res.headers:
            localHeaders[elem.lower()] = res.headers[elem]
    logger.debug("!!! localHeaders = %s", str(localHeaders))
    logger.debug("!!! localHeaders.get('content-type', '') = %s", str(localHeaders.get('content-type', '')))
    resource.content_type = localHeaders.get('content-type', 'text/xml').split(';')[0]
    resource.cookies = res.cookies
    logger.debug("!!! res.encoding = '%s'", str(res.encoding))
    if isinstance(res.encoding, basestring):
        resource.charset = res.encoding.split(',')[0]
    elif res.encoding is not None:
        resource.charset = res.encoding
    else:
        resource.charset = "utf-8"
    if res.request is not None and hasattr(res.request, 'headers') and res.request.headers is not None:
        resource.html_request = '\r\n'.join(['%s: %s' % (k, v) for k, v in res.request.headers.iteritems()])
    elif res.request is not None and isinstance(res.request, dict) and 'headers' in res.request and \
            res.request['headers'] is not None:
        resource.html_request = '\r\n'.join(['%s: %s' % (k, v) for k, v in res.request['headers'].iteritems()])
    else:
        resource.html_request = ""
    if res.headers is not None:
        try:
            resource.response_header = self.convertCharset(res.headers, resource.charset)
        except Exception, err:
            logger.error(str(err))
    resource.last_modified = self.calcLastModified(resource, res, defaultIcrCrawlTime)
    if contentTypeMap is not None and resource.content_type in contentTypeMap:
        logger.debug(">>> Mime type replaced from %s to %s", resource.content_type,
                     contentTypeMap[resource.content_type])
        resource.content_type = copy.deepcopy(contentTypeMap[resource.content_type])
    logger.debug("request is: %s", resource.html_request)
    logger.debug("response is: %s", resource.response_header)
    return resource
def calcLastModified(self, resource, res, defaultIcrCrawlTime):
    # Derive a last-modified timestamp from the response, with fallbacks.
    lastModified = None
    try:
        if resource.http_code == 304:
            lastModified = self.urlObj.tcDate
        elif 'Last-Modified' in res.headers:
            d = DateTimeType.parse(res.headers['Last-Modified'], True, logger)
            if d is not None:
                lastModified = d.strftime('%Y-%m-%d %H:%M:%S')
        elif 'Date' in res.headers:
            d = DateTimeType.parse(res.headers['Date'], True, logger)
            if d is not None:
                lastModified = d.strftime('%Y-%m-%d %H:%M:%S')
        else:
            lastModified = time.strftime('%Y-%m-%d %H:%M:%S', time.gmtime(time.time() - defaultIcrCrawlTime))
        logger.debug("LastModified date: " + str(lastModified))
    except Exception, err:
        logger.debug('calcLastModified failed conversion, using current datetime, err: ' + str(err))
    if lastModified is None:
        d = DateTimeType.parse(datetime.datetime.today().isoformat())
        lastModified = d.strftime('%Y-%m-%d %H:%M:%S')
    return str(lastModified)
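# Standard-library equivalent of the header-date parsing above (illustrative only;
# the method itself uses the project's DateTimeType helper):
#
#   import email.utils, time
#   t = email.utils.parsedate('Wed, 21 Oct 2015 07:28:00 GMT')
#   if t is not None:
#       lastModified = time.strftime('%Y-%m-%d %H:%M:%S', t)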
def addSiteSize(self, size):
    # Accumulate `size` bytes onto the site's stored Size counter.
    localSiteUpdate = dc_event.SiteUpdate(self.batchItem.siteId)
    for attr in localSiteUpdate.__dict__:
        if hasattr(localSiteUpdate, attr):
            setattr(localSiteUpdate, attr, None)
    localSiteUpdate.id = self.batchItem.siteId
    localSiteUpdate.size = SQLExpression("`Size` + %s" % str(size))
    self.dbWrapper.siteNewOrUpdate(localSiteUpdate)
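# The SQLExpression above is intended to render as a raw SQL fragment, so the
# increment happens server-side rather than as a read-modify-write round trip.
# Roughly equivalent SQL (table and column names illustrative):
#
#   UPDATE sites SET `Size` = `Size` + 1024 WHERE `Id` = <siteId>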
def checkResourcesResponse(self, res, maxResourceSize, updateSiteCallback):
    # Validate response size and HTTP status, setting error masks as needed.
    ret = True
    resourceSize = res.content_size
    logger.debug("MaxResourceSize: " + str(maxResourceSize) + " ResourceSize: " + str(resourceSize))
    if resourceSize == 0 and self.resource.http_code / 100 != 3:
        self.resource.error_mask = APP_CONSTS.ERROR_EMPTY_RESPONSE
        updateSiteCallback(APP_CONSTS.ERROR_EMPTY_RESPONSE)
        ret = False
    elif maxResourceSize and resourceSize > maxResourceSize:
        self.resource.error_mask = APP_CONSTS.ERROR_RESPONSE_SIZE_ERROR
        updateSiteCallback(APP_CONSTS.ERROR_RESPONSE_SIZE_ERROR)
        logger.debug("Site MaxResourceSize limit exceeded.")
        ret = False
    else:
        self.resource.html_content = res.rendered_unicode_content
        self.resource.binary_content = res.str_content
    if ret and (res.status_code / 100 == 4 or res.status_code / 100 == 5):
        self.resource.error_mask = APP_CONSTS.ERROR_HTTP_ERROR
        if res.status_code == CRAWLER_CONSTS.HTTP_CODE_403:
            self.resource.error_mask = APP_CONSTS.ERROR_FETCH_FORBIDDEN
        updateSiteCallback(self.resource.error_mask)
    return ret
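# Note on the status checks above: Python 2 integer division buckets codes by
# family, e.g. 304 / 100 == 3 (redirect family, exempt from the empty-response
# check), while 404 / 100 == 4 and 503 / 100 == 5 are flagged as HTTP errors.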
def domParser(self, htmlRecover, rendered_unicode_content, http_code, charset):
    # Parse HTML into an lxml DOM, optionally recovering broken markup via tidylib.
    ret = None
    if charset is None or charset == "":
        charset = 'utf-8'
    parser = lxml.etree.HTMLParser(encoding=charset)
    if http_code == CRAWLER_CONSTS.HTTP_CODE_304:
        ret = lxml.html.fromstring("<html></html>", parser=parser)
    else:
        try:
            rendered_unicode_content = rendered_unicode_content.decode(charset).encode('utf-8')
            ret = lxml.html.fromstring(rendered_unicode_content.decode('utf-8').encode(charset), parser=parser)
        except Exception, err:
            logger.debug("Wrong DOM model structure. Description: " + str(err))
            if htmlRecover == RECOVER_IF_FAILED:
                logger.debug("Try to fix DOM by tidylib.")
                tidy_content, errors = tidylib.tidy_document(
                    rendered_unicode_content.decode('utf-8').encode(charset))
                logger.debug("tidylib errors: %s", str(errors))
                try:
                    ret = lxml.html.fromstring(tidy_content, parser=parser)
                except Exception, err:
                    logger.error('domParser error: ' + str(err))
    return ret
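# Standalone sketch of the tidylib recovery path (assumes pytidylib and lxml are
# installed; names are local to the example):
#
#   import tidylib, lxml.html, lxml.etree
#   fixed, errors = tidylib.tidy_document('<html><body><p>unclosed')
#   doc = lxml.html.fromstring(fixed, parser=lxml.etree.HTMLParser(encoding='utf-8'))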
def mimeDetectByContent(self, crawledResource, contentTypeMap=None, urlObj=None):
    # Detect the MIME type from raw content via libmagic, with optional remapping.
    ret = None
    if crawledResource.dynamic_fetcher_type:
        rawUnicodeContent = crawledResource.meta_content
    else:
        rawUnicodeContent = crawledResource.binary_content
    if rawUnicodeContent is not None:
        ret = magic.from_buffer(str(rawUnicodeContent), mime=True)
        if contentTypeMap is not None and ret in contentTypeMap:
            logger.debug(">>> Mime type replaced from %s to %s", ret, contentTypeMap[ret])
            ret = contentTypeMap[ret]
    return ret
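# Quick illustration of the libmagic call above (python-magic):
#
#   import magic
#   magic.from_buffer('<html><body></body></html>', mime=True)  # -> 'text/html'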
@staticmethod
def isAllowedReplaceMimeType(inputData=None, urlObj=None):
    # Evaluate a list of MIME-type replacement rules against a URL object;
    # the replacement is allowed if at least one rule passes all its checks.
    logger.debug('>>> isAllowedReplaceMimeType enter....')
    isOkElemList = []
    if inputData is not None:
        for element in inputData:
            logger.debug('>>> element: ' + str(element))
            if "url_expression" in element and urlObj is not None and urlObj.url is not None:
                logger.debug('>>> url: ' + str(urlObj.url))
                match = re.search(element["url_expression"], str(urlObj.url))
                if match is None:
                    logger.debug('>>> url_expression fail')
                    isOkElemList.append(False)
                    continue
                logger.debug('>>> url_expression good')
            modeNumber = 0
            urlTypes = []
            urlParent = []
            contentTypes = []
            if "mode" in element:
                modeNumber = int(element["mode"])
            if "url_types" in element:
                urlTypes = element["url_types"]
            if "url_parent" in element:
                urlParent = element["url_parent"]
            if "content_types" in element:
                contentTypes = element["content_types"]
            logger.debug('>>> modeNumber: ' + str(modeNumber))
            logger.debug('>>> urlTypes: ' + str(urlTypes))
            logger.debug('>>> urlParent: ' + str(urlParent))
            logger.debug('>>> contentTypes: ' + str(contentTypes))
            logger.debug('>>>>> urlObj.contentType: ' + str(urlObj.contentType))
            if modeNumber == 0:
                pass  # mode 0 branch not shown in this excerpt
            elif modeNumber == 1 and urlObj.contentType != "":
                logger.debug('>>> mode (' + str(modeNumber) + ') fail, contentType: ' + str(urlObj.contentType))
                isOkElemList.append(False)
                continue
            elif modeNumber == 2 and urlObj.contentType not in urlTypes:
                logger.debug('>>> mode (' + str(modeNumber) + ') fail, contentType: ' + str(urlObj.contentType) +
                             ' urlTypes: ' + str(urlTypes))
                isOkElemList.append(False)
                continue
            elif modeNumber == 3 and urlObj.contentType in urlTypes:
                logger.debug('>>> mode (' + str(modeNumber) + ') fail, contentType: ' + str(urlObj.contentType) +
                             ' urlTypes: ' + str(urlTypes))
                isOkElemList.append(False)
                continue
            if len(urlTypes) > 0:
                for urlType in urlTypes:
                    if urlType == urlObj.type:
                        break
                else:
                    logger.debug('>>> urlTypes fail: ' + str(urlTypes) + ' urlObj.type = ' + str(urlObj.type))
                    isOkElemList.append(False)
                    continue
            if len(urlParent) > 0:
                for parentElem in urlParent:
                    if parentElem == 0 and not urlObj.parentMd5:
                        break
                    elif parentElem == 1 and urlObj.parentMd5:
                        break
                else:
                    logger.debug('>>> urlParent fail: ' + str(urlParent) +
                                 ' urlObj.parentMd5: ' + str(urlObj.parentMd5))
                    isOkElemList.append(False)
                    continue
            isOkElemList.append(True)
    logger.debug('isOkElemList: ' + str(isOkElemList))
    if True in isOkElemList:
        return True
    return False
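# Example of a rule list this filter evaluates (keys taken from the checks above;
# values and the class name are illustrative):
#
#   inputData = [{
#       "url_expression": ".*\\.php.*",
#       "mode": 2,
#       "url_types": [0, 1],
#       "url_parent": [0],
#       "content_types": ["text/html"]
#   }]
#   allowed = ResourceProcess.isAllowedReplaceMimeType(inputData, urlObj)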
# Method index:
#   def checkResourcesResponse(self, res, maxResourceSize, updateSiteCallback)
#   def calcLastModified(self, resource, res, defaultIcrCrawlTime)
#   def domParser(self, htmlRecover, rendered_unicode_content, http_code, charset)
#   def mimeDetectByContent(self, crawledResource, contentTypeMap=None, urlObj=None)
#   def addSiteSize(self, size)
#   def getCodec(self, charset)
#   def checkFieldsIsNone(self, checkList)
#   def convertCharset(self, headers, charset)
#   def isAllowedReplaceMimeType(inputData=None, urlObj=None)
#   def generateResource(self, startTime, res, headers, crawledTime, defaultIcrCrawlTime, contentTypeMap=None)
#   def getTracebackInfo(linesNumberMax=None)
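# Rough call-order sketch for one fetched page, inferred from the signatures above
# (the surrounding class and its construction are not shown in this excerpt):
#
#   processor.checkFieldsIsNone(['batchItem', 'dbWrapper'])
#   resource = processor.generateResource(startTime, res, res.headers, crawledTime,
#                                         defaultIcrCrawlTime)
#   processor.checkResourcesResponse(res, maxResourceSize, updateSiteCallback)
#   dom = processor.domParser(htmlRecover, res.rendered_unicode_content,
#                             resource.http_code, resource.charset)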