HCE Project: Python Distributed Tasks Manager Application, Distributed Crawler Application, and client API bindings. 2.0.0-chaika
Hierarchical Cluster Engine Python language binding
ResourceProcess.py
1 """
2 @package: dc
3 @file ResourceProcess.py
4 @author Scorp <developers.hce@gmail.com>
5 @link: http://hierarchical-cluster-engine.com/
6 @copyright: Copyright &copy; 2013-2014 IOIX Ukraine
7 @license: http://hierarchical-cluster-engine.com/license/
8 @since: 0.1
9 """

import time
import datetime
import copy
import re
import magic
import tidylib
import lxml.html
import lxml.etree
import requests
import dc_crawler.Constants as CRAWLER_CONSTS
from dc_crawler.CrawledResource import CrawledResource
import app.Consts as APP_CONSTS
from app.Utils import SQLExpression
from app.DateTimeType import DateTimeType
from app.Utils import getTracebackInfo
import app.Utils as Utils  # pylint: disable=F0401
import dc.EventObjects as dc_event

logger = Utils.MPLogger().getLogger()


class ResourceProcess(object):

  RECOVER_IF_FAILED = "2"

  def __init__(self):
    self.dbWrapper = None
    self.batchItem = None
    self.resource = None
    self.urlObj = None


  # #checkFieldsIsNone checks that all mandatory fields of the instance are initialized
  #
  # @param checkList - list of attribute names that must not be None
  def checkFieldsIsNone(self, checkList):
    for name in checkList:
      if not hasattr(self, name) or getattr(self, name) is None:
        raise Exception("Mandatory field `%s` must be initialized!" % name)


  # #getCodec resolves a charset name to a known codec name
  #
  # @param charset - charset name
  # @return codec name, or None if the charset is unknown
  def getCodec(self, charset):
    # variable for result
    ret = None
    if isinstance(charset, basestring):
      charset = charset.split(',')[0]
      if charset in CRAWLER_CONSTS.standardEncodings:
        ret = charset
      else:
        for codec, aliases in CRAWLER_CONSTS.standardEncodings.items():
          if aliases.find(charset) > -1 or aliases.find(charset.lower()) > -1:
            ret = codec
            break

    return ret

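  # A hedged usage sketch (not part of the original source): mapping charset tokens to
  # codec names via CRAWLER_CONSTS.standardEncodings. The concrete alias values depend
  # on that table, so the expected outputs below are assumptions.
  #
  #   rp = ResourceProcess()
  #   print rp.getCodec('utf-8')              # 'utf-8' if listed as a standard codec
  #   print rp.getCodec('UTF-8,ISO-8859-1')   # only the first comma-separated token is used
  #   print rp.getCodec('no-such-charset')    # None when nothing matches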

  # #convertCharset re-encodes the response headers to utf-8
  #
  # @param headers - resource http headers
  # @param charset - charset name
  # @return responseHeader - response header as string
  def convertCharset(self, headers, charset):
    # variable for result
    responseHeader = ''
    logger.debug("headers: %s, type: %s", str(headers), str(type(headers)))
    logger.debug("charset: %s, type: %s", str(charset), str(type(charset)))

    try:
      if isinstance(headers, requests.structures.CaseInsensitiveDict) and isinstance(charset, basestring):
        codec = self.getCodec(charset)
        logger.debug("codec: %s", str(codec))
        if codec is None:
          responseHeader = '\r\n'.join(['%s: %s' % (k, v) for k, v in headers.iteritems()])
        else:
          responseHeader = '\r\n'.join(['%s: %s' % (k.decode(codec).encode('utf-8'), v.decode(codec).encode('utf-8')) \
                                        for k, v in headers.iteritems()])
    except Exception, err:
      logger.error(str(err))

    return responseHeader

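  # A hedged usage sketch (not from the original source): converting the headers of a
  # requests response whose declared charset may not be utf-8. Keys and values are
  # decoded with the detected codec and re-encoded as utf-8; the URL is illustrative.
  #
  #   r = requests.get('http://example.com/')
  #   rp = ResourceProcess()
  #   headerBlock = rp.convertCharset(r.headers, r.encoding or 'utf-8')
  #   print headerBlock  # "Content-Type: text/html; charset=..." lines joined with CRLF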

  # #generateResource builds a CrawledResource object from a fetched response
  #
  # @param startTime - resource fetch start time
  # @param res - resource object fetched from the request library
  # @param headers - resource http headers
  # @param crawledTime - resource crawled time
  # @param defaultIcrCrawlTime - default incremental crawl interval
  # @param contentTypeMap - optional mime type replacement map
  # @returns the generated CrawledResource object
  def generateResource(self, startTime, res, headers, crawledTime, defaultIcrCrawlTime, contentTypeMap=None):  # pylint: disable=W0613
    # use charset to improve encoding detection
    resource = CrawledResource()
    resource.meta_content = res.meta_res
    resource.crawling_time = int((crawledTime - startTime) * 1000)
    if res.content_size is not None and resource.crawling_time != 0:
      # bytes per second; note this is integer arithmetic under Python 2
      resource.bps = res.content_size / resource.crawling_time * 1000

    logger.info("crawling_time: %s, bps: %s", resource.crawling_time, resource.bps)
    resource.http_code = res.status_code
    logger.debug("headers is: %s", res.headers)
    localHeaders = {}
    if res.headers is not None:
      for elem in res.headers:
        localHeaders[elem.lower()] = res.headers[elem]

    logger.debug("!!! localHeaders = %s", str(localHeaders))
    logger.debug("!!! localHeaders.get('content-type', '') = %s", str(localHeaders.get('content-type', '')))

    # resource.content_type = localHeaders.get('content-type', 'text/html').split(';')[0]
    resource.content_type = localHeaders.get('content-type', 'text/xml').split(';')[0]

    # save cookies
    resource.cookies = res.cookies

    if res.encoding:
      logger.debug("!!! res.encoding = '%s'", str(res.encoding))
      if isinstance(res.encoding, basestring):
        resource.charset = res.encoding.split(',')[0]
      else:
        resource.charset = res.encoding
    else:
      resource.charset = "utf-8"

    if res.request is not None and hasattr(res.request, 'headers') and res.request.headers is not None:
      resource.html_request = '\r\n'.join(['%s: %s' % (k, v) for k, v in res.request.headers.iteritems()])
    elif res.request is not None and isinstance(res.request, dict) and 'headers' in res.request and \
        res.request['headers'] is not None:
      resource.html_request = '\r\n'.join(['%s: %s' % (k, v) for k, v in res.request['headers'].iteritems()])
    else:
      resource.html_request = ""

    if res.headers is not None:
      try:
        resource.response_header = self.convertCharset(res.headers, resource.charset)
      except Exception, err:
        logger.error(str(err))
        logger.info(getTracebackInfo())

    resource.last_modified = self.calcLastModified(resource, res, defaultIcrCrawlTime)

    if contentTypeMap is not None and resource.content_type in contentTypeMap:
      logger.debug(">>> Mime type replaced from %s to %s", resource.content_type, contentTypeMap[resource.content_type])
      resource.content_type = copy.deepcopy(contentTypeMap[resource.content_type])
    logger.debug("request is: %s", resource.html_request)
    logger.debug("response is: %s", resource.response_header)

    return resource

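  # A hedged call-flow sketch (assumed, not from the original source): generateResource
  # is driven by the crawler after a fetch completes. The `res` object here stands in
  # for the project's fetcher response (with meta_res, content_size, status_code,
  # headers, encoding, cookies and request attributes), not a bare requests.Response;
  # `fetcher` and `urlObj` are hypothetical.
  #
  #   startTime = time.time()
  #   res = fetcher.fetch(url)                 # hypothetical fetcher call
  #   crawledTime = time.time()
  #   rp = ResourceProcess()
  #   rp.urlObj = urlObj                       # required by calcLastModified()
  #   resource = rp.generateResource(startTime, res, res.headers, crawledTime,
  #                                  defaultIcrCrawlTime=86400,
  #                                  contentTypeMap={"text/xml": "text/html"})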

  # #calcLastModified generates and returns the lastModified string
  #
  # @param resource - own resource object
  # @param res - resource object fetched from the request library
  # @param defaultIcrCrawlTime - default incremental crawl interval in seconds
  # @returns lastModified string
  def calcLastModified(self, resource, res, defaultIcrCrawlTime):
    # variable for result
    lastModified = None
    self.checkFieldsIsNone(["urlObj"])
    try:
      if resource.http_code == 304:
        lastModified = self.urlObj.tcDate
        # ret = self.url["TcDate"]
      elif 'Last-Modified' in res.headers:
        d = DateTimeType.parse(res.headers['Last-Modified'], True, logger)
        if d is not None:
          lastModified = d.strftime('%Y-%m-%d %H:%M:%S')
      elif 'Date' in res.headers:
        d = DateTimeType.parse(res.headers['Date'], True, logger)
        if d is not None:
          lastModified = d.strftime('%Y-%m-%d %H:%M:%S')
      else:
        lastModified = time.strftime('%Y-%m-%d %H:%M:%S', time.gmtime(time.time() - defaultIcrCrawlTime))
      logger.debug("LastModified date: " + str(lastModified))
    except Exception, err:
      logger.debug('calcLastModified failed conversion, using current datetime, err: ' + str(err))
    finally:
      if lastModified is None:
        d = DateTimeType.parse(datetime.datetime.today().isoformat())
        if d is not None:
          lastModified = d.strftime('%Y-%m-%d %H:%M:%S')

    return str(lastModified)

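  # A hedged sketch of the fallback order (example values are assumptions): HTTP 304
  # reuses the stored tcDate; otherwise Last-Modified wins over Date; with neither
  # header present, "now minus defaultIcrCrawlTime" is used; any parse failure falls
  # back to the current datetime.
  #
  #   resource.http_code = 200
  #   res.headers = {'Last-Modified': 'Wed, 01 Jan 2014 00:00:00 GMT'}
  #   print rp.calcLastModified(resource, res, 86400)  # '2014-01-01 00:00:00'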

  # #addSiteSize updates the sites table to increase the stored site size
  #
  # @param size - content size crawled for this resource
  def addSiteSize(self, size):
    if self.dbWrapper is not None:
      self.checkFieldsIsNone(["dbWrapper", "batchItem"])
      localSiteUpdate = dc_event.SiteUpdate(self.batchItem.siteId)
      # clear all fields so only the ones set below are written
      for attr in localSiteUpdate.__dict__:
        if hasattr(localSiteUpdate, attr):
          setattr(localSiteUpdate, attr, None)
      localSiteUpdate.id = self.batchItem.siteId
      localSiteUpdate.tcDate = SQLExpression("NOW()")
      localSiteUpdate.size = SQLExpression("`Size` + %s" % str(size))
      self.dbWrapper.siteNewOrUpdate(localSiteUpdate)

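  # A hedged sketch (an assumption about SQLExpression semantics): wrapping values in
  # SQLExpression appears to make the db wrapper emit them as raw SQL rather than
  # quoted literals, so the update becomes an atomic server-side increment, roughly:
  #
  #   UPDATE `sites` SET `TcDate` = NOW(), `Size` = `Size` + 1024 WHERE `Id` = <siteId>
  #
  #   rp.dbWrapper = dbWrapper     # hypothetical wiring
  #   rp.batchItem = batchItem
  #   rp.addSiteSize(1024)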

  # #checkResourcesResponse validates the fetched resource's response
  #
  # @param res - current resource object, just fetched from the request library
  # @param maxResourceSize - max allowed resource size
  # @param updateSiteCallback - callback used to report an error mask for the site
  # @returns True if the response is valid, False otherwise
  def checkResourcesResponse(self, res, maxResourceSize, updateSiteCallback):
    ret = True
    self.checkFieldsIsNone(["resource"])
    resourceSize = res.content_size
    logger.debug("MaxResourceSize: " + str(maxResourceSize) + " ResourceSize: " + str(resourceSize))
    if resourceSize == 0 and self.resource.http_code / 100 != 3:
      self.resource.error_mask = APP_CONSTS.ERROR_EMPTY_RESPONSE
      updateSiteCallback(APP_CONSTS.ERROR_EMPTY_RESPONSE)
      ret = False
    elif maxResourceSize and resourceSize > maxResourceSize:
      self.resource.error_mask = APP_CONSTS.ERROR_RESPONSE_SIZE_ERROR
      updateSiteCallback(APP_CONSTS.ERROR_RESPONSE_SIZE_ERROR)
      logger.debug("Site MaxResourceSize limit exceeded.")
      ret = False
    else:
      self.resource.html_content = res.rendered_unicode_content
      self.resource.binary_content = res.str_content

    if ret and (res.status_code / 100 == 4 or res.status_code / 100 == 5):
      self.resource.error_mask = APP_CONSTS.ERROR_HTTP_ERROR
      # Add error mask about forbidden fetch
      if res.status_code == CRAWLER_CONSTS.HTTP_CODE_403:
        self.resource.error_mask = APP_CONSTS.ERROR_FETCH_FORBIDDEN

      updateSiteCallback(self.resource.error_mask)
      ret = False
    if ret:
      self.addSiteSize(resourceSize)
    return ret

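  # A hedged usage sketch (assumed wiring, not from the original source): the callback
  # typically records the error mask on the site; here a plain function stands in.
  #
  #   def updateSite(errorMask):
  #     logger.debug("site error mask updated: %s", errorMask)
  #
  #   rp.resource = resource                     # filled by generateResource()
  #   if rp.checkResourcesResponse(res, maxResourceSize=1048576,
  #                                updateSiteCallback=updateSite):
  #     processContent(resource)                 # hypothetical downstream step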

  # #domParser generates a DOM object from the crawled content and returns it
  #
  # @param htmlRecover - html recover mode, RECOVER_IF_FAILED enables tidylib repair
  # @param rendered_unicode_content - rendered page content
  # @param http_code - http status code of the response
  # @param charset - content charset, utf-8 is assumed when empty
  # @returns the generated DOM object, or None on failure
  def domParser(self, htmlRecover, rendered_unicode_content, http_code, charset):
    ret = None

    # logger.debug("!!! domParser ENTER !!! http_code: %s, charset: '%s'\nrendered_unicode_content: %s",
    #              str(http_code), str(charset), str(rendered_unicode_content))
    if charset is None or charset == "":
      charset = 'utf-8'
    parser = lxml.etree.HTMLParser(encoding=charset)  # pylint: disable=E1101
    if http_code == CRAWLER_CONSTS.HTTP_CODE_304:
      ret = lxml.html.fromstring("<html></html>", parser=parser)
    else:
      try:
        rendered_unicode_content = rendered_unicode_content.decode(charset).encode('utf-8')
        ret = lxml.html.fromstring(rendered_unicode_content.decode('utf-8').encode(charset), parser=parser)
      except Exception, err:
        logger.debug("Wrong DOM model structure. Description: " + str(err))
        if htmlRecover is not None and htmlRecover == self.RECOVER_IF_FAILED:
          logger.debug("Trying to fix the DOM with tidylib.")
          tidy_content, errors = tidylib.tidy_document(rendered_unicode_content.decode('utf-8').encode(charset))
          logger.debug("tidylib errors: %s", str(errors))
          try:
            ret = lxml.html.fromstring(tidy_content, parser=parser)
          except Exception, err:
            logger.error('domParser error: ' + str(err))

    return ret

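  # A hedged, self-contained sketch of the recovery path (sample markup is assumed):
  # a malformed document is repaired by tidylib, then re-parsed with lxml.
  #
  #   import lxml.html, lxml.etree, tidylib
  #   broken = "<html><body><p>unclosed <b>tag</body>"
  #   fixed, errors = tidylib.tidy_document(broken)
  #   parser = lxml.etree.HTMLParser(encoding='utf-8')
  #   dom = lxml.html.fromstring(fixed, parser=parser)
  #   print dom.xpath('//p/b/text()')  # ['tag']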

  # #mimeDetectByContent autodetects the mime type from the crawled content buffer
  #
  # @param crawledResource - the crawled resource object
  # @param contentTypeMap - optional mime type replacement map
  # @param urlObj - input urlObj
  # @returns autodetected mime type or None
  def mimeDetectByContent(self, crawledResource, contentTypeMap=None, urlObj=None):  # pylint: disable=W0613
    ret = None
    if crawledResource.dynamic_fetcher_type:
      rawUnicodeContent = crawledResource.meta_content
    else:
      # rawUnicodeContent = crawledResource.html_content
      rawUnicodeContent = crawledResource.binary_content
    if rawUnicodeContent is not None:
      ret = magic.from_buffer(str(rawUnicodeContent), mime=True)
      if contentTypeMap is not None and ret in contentTypeMap:
        logger.debug(">>> Mime type replaced from %s to %s", ret, contentTypeMap[ret])
        ret = contentTypeMap[ret]
    return ret

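  # A hedged sketch of the underlying detection call (python-magic): the mime type is
  # derived from the leading bytes of the buffer, independent of any HTTP headers.
  #
  #   import magic
  #   print magic.from_buffer("<html><body>hi</body></html>", mime=True)  # 'text/html'
  #   print magic.from_buffer("%PDF-1.4 ...", mime=True)                  # 'application/pdf'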

  # The contentTypeMap rules have the following structure:
  # [
  #   {"url_expression": "regular_expression", "mode": mode_number, "url_types": [list_of_url_types],
  #    "url_parent": [list_of_parent_types], "content_types": [list_of_content_types]}
  #   , ...]

  # #isAllowedReplaceMimeType checks whether the content type should be replaced using the map
  #
  # @param inputData - input rule list in the structure shown above
  # @param urlObj - input urlObj
  # @return boolean value - True if replacement is allowed, False otherwise
  @staticmethod
  def isAllowedReplaceMimeType(inputData=None, urlObj=None):
    logger.debug('>>> isAllowedReplaceMimeType enter....')
    # variable for result
    ret = False
    if inputData is not None:
      isOkElemList = []
      for element in inputData:
        logger.debug('>>> element: ' + str(element))

        if "url_expression" in element and urlObj is not None and urlObj.url is not None:
          logger.debug('>>> url: ' + str(urlObj.url))
          match = re.search(element["url_expression"], str(urlObj.url))
          if match is None:
            logger.debug('>>> url_expression fail')
            isOkElemList.append(False)
            continue
          else:
            logger.debug('>>> url_expression good')

        modeNumber = 0
        urlTypes = []
        urlParent = []
        contentTypes = []

        if "mode" in element:
          modeNumber = int(element["mode"])

        if "url_types" in element:
          urlTypes = element["url_types"]

        if "url_parent" in element:
          urlParent = element["url_parent"]

        if "content_types" in element:
          contentTypes = element["content_types"]

        logger.debug('>>> modeNumber: ' + str(modeNumber))
        logger.debug('>>> urlTypes: ' + str(urlTypes))
        logger.debug('>>> urlParent: ' + str(urlParent))
        logger.debug('>>> contentTypes: ' + str(contentTypes))

        logger.debug('>>>>> urlObj.contentType: ' + str(urlObj.contentType))

        if modeNumber == 0:
          pass
        elif modeNumber == 1 and urlObj.contentType != "":
          logger.debug('>>> mode (' + str(modeNumber) + ') fail, contentType: ' + str(urlObj.contentType))
          isOkElemList.append(False)
          continue
        elif modeNumber == 2 and urlObj.contentType not in urlTypes:
          logger.debug('>>> mode (' + str(modeNumber) + ') fail, contentType: ' + str(urlObj.contentType) + \
                       ' urlTypes: ' + str(urlTypes))
          isOkElemList.append(False)
          continue
        elif modeNumber == 3 and urlObj.contentType in urlTypes:
          logger.debug('>>> mode (' + str(modeNumber) + ') fail, contentType: ' + str(urlObj.contentType) + \
                       ' urlTypes: ' + str(urlTypes))
          isOkElemList.append(False)
          continue

        isOk = False
        if len(urlTypes) > 0:
          for urlType in urlTypes:
            if urlType == urlObj.type:
              isOk = True
        else:
          isOk = True

        if not isOk:
          logger.debug('>>> urlTypes fail: ' + str(urlTypes) + ' urlObj.type = ' + str(urlObj.type))
          isOkElemList.append(False)
          continue

        isOk = False
        if len(urlParent) > 0:
          for parentElem in urlParent:
            if parentElem == 0 and not urlObj.parentMd5:
              isOk = True
            elif parentElem == 1 and urlObj.parentMd5:
              isOk = True
        else:
          isOk = True

        if not isOk:
          logger.debug('>>> urlParent fail: ' + str(urlParent) + ' urlObj.parentMd5: ' + str(urlObj.parentMd5))
          isOkElemList.append(False)
          continue

        # all checks passed for this rule
        isOkElemList.append(True)

      # Make result after the loop: any single matching rule allows the replacement
      logger.debug('isOkElemList: ' + str(isOkElemList))
      if True in isOkElemList:
        ret = True

    return ret
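

# A hedged, self-contained usage sketch (not part of the original source): a minimal
# rule list in the structure documented above, applied to a stub urlObj. Mode 2
# requires urlObj.contentType to appear in url_types; the stub class, field values,
# and rule contents are assumptions for illustration only.
if __name__ == "__main__":
  class _StubUrlObj(object):
    def __init__(self):
      self.url = "http://example.com/feed.xml"
      self.contentType = "text/xml"
      self.type = 0
      self.parentMd5 = ""

  rules = [{"url_expression": r"\.xml$", "mode": 2, "url_types": ["text/xml", 0],
            "url_parent": [0], "content_types": ["text/html"]}]
  print ResourceProcess.isAllowedReplaceMimeType(rules, _StubUrlObj())  # True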