HCE Project: Python language Distributed Tasks Manager Application, Distributed Crawler Application, and client API bindings. Version 2.0.0-chaika
Hierarchical Cluster Engine Python language binding
dc_crawler.ResourceProcess.ResourceProcess Class Reference

Public Member Functions

def __init__ (self)
 
def checkFieldsIsNone (self, checkList)
 
def getCodec (self, charset)
 
def convertCharset (self, headers, charset)
 
def generateResource (self, startTime, res, headers, crawledTime, defaultIcrCrawlTime, contentTypeMap=None)
 
def calcLastModified (self, resource, res, defaultIcrCrawlTime)
 
def addSiteSize (self, size)
 
def checkResourcesResponse (self, res, maxResourceSize, updateSiteCallback)
 
def domParser (self, htmlRecover, rendered_unicode_content, http_code, charset)
 
def mimeDetectByContent (self, crawledResource, contentTypeMap=None, urlObj=None)
 

Static Public Member Functions

def isAllowedReplaceMimeType (inputData=None, urlObj=None)
 

Public Attributes

 dbWrapper
 
 batchItem
 
 resource
 
 urlObj
 

Static Public Attributes

string RECOVER_IF_FAILED = "2"
 

Detailed Description

Helper class of the crawler that post-processes fetched resources: charset detection and conversion, CrawledResource generation, last-modified calculation, DOM parsing, and MIME-type detection and replacement.

Definition at line 32 of file ResourceProcess.py.

Constructor & Destructor Documentation

◆ __init__()

def dc_crawler.ResourceProcess.ResourceProcess.__init__(self)

Definition at line 36 of file ResourceProcess.py.

36  def __init__(self):
37  self.dbWrapper = None
38  self.batchItem = None
39  self.resource = None
40  self.urlObj = None
41 
42 
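A minimal usage sketch. The attribute values below are placeholders; in the crawler they are supplied by the task context (a database wrapper exposing siteNewOrUpdate(), the current batch item, and its URL object):

from dc_crawler.ResourceProcess import ResourceProcess

processor = ResourceProcess()
processor.dbWrapper = None  # database wrapper exposing siteNewOrUpdate() in real use
processor.batchItem = None  # current batch item (provides siteId)
processor.urlObj = None     # URL object of the resource being processed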

Member Function Documentation

◆ addSiteSize()

def dc_crawler.ResourceProcess.ResourceProcess.addSiteSize(self, size)

Definition at line 206 of file ResourceProcess.py.

206  def addSiteSize(self, size):
207  if self.dbWrapper is not None:
208  self.checkFieldsIsNone(["dbWrapper", "batchItem"])
209  localSiteUpdate = dc_event.SiteUpdate(self.batchItem.siteId)
210  for attr in localSiteUpdate.__dict__:
211  if hasattr(localSiteUpdate, attr):
212  setattr(localSiteUpdate, attr, None)
213  localSiteUpdate.id = self.batchItem.siteId
214  localSiteUpdate.tcDate = SQLExpression("NOW()")
215  localSiteUpdate.size = SQLExpression(("`Size` + %s" % str(size)))
216  self.dbWrapper.siteNewOrUpdate(localSiteUpdate)
217 
218 
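A hedged usage sketch: dbWrapper and batchItem must be wired first, otherwise the method is a silent no-op (dbWrapper is None). The stubs below are hypothetical stand-ins for the real database wrapper and batch item; the real call also depends on dc_event.SiteUpdate being importable by the module:

class _StubBatchItem(object):
    siteId = 'site-0'  # hypothetical site identifier

class _StubDbWrapper(object):
    def siteNewOrUpdate(self, siteUpdate):
        # the real wrapper persists the SiteUpdate; here we only trace the call
        print('siteNewOrUpdate, site id:', siteUpdate.id)

processor = ResourceProcess()
processor.dbWrapper = _StubDbWrapper()
processor.batchItem = _StubBatchItem()
processor.addSiteSize(2048)  # issues `Size` = `Size` + 2048 for the site row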

◆ calcLastModified()

def dc_crawler.ResourceProcess.ResourceProcess.calcLastModified(self, resource, res, defaultIcrCrawlTime)

Definition at line 173 of file ResourceProcess.py.

173  def calcLastModified(self, resource, res, defaultIcrCrawlTime):
174  # variables for result
175  lastModified = None
176  self.checkFieldsIsNone(["urlObj"])
177  try:
178  if resource.http_code == 304:
179  lastModified = self.urlObj.tcDate
180  # ret = self.url["TcDate"]
181  elif 'Last-Modified' in res.headers:
182  d = DateTimeType.parse(res.headers['Last-Modified'], True, logger)
183  if d is not None:
184  lastModified = d.strftime('%Y-%m-%d %H:%M:%S')
185  elif 'Date' in res.headers:
186  d = DateTimeType.parse(res.headers['Date'], True, logger)
187  if d is not None:
188  lastModified = d.strftime('%Y-%m-%d %H:%M:%S')
189  else:
190  lastModified = time.strftime('%Y-%m-%d %H:%M:%S', time.gmtime(time.time() - defaultIcrCrawlTime))
191  logger.debug("LastModified date:" + str(lastModified))
192  except Exception, err:
193  logger.debug('calcLastModified failed to convert the date, using current datetime, err: ' + str(err))
194  finally:
195  if lastModified is None:
196  d = DateTimeType.parse(datetime.datetime.today().isoformat())
197  if d is not None:
198  lastModified = d.strftime('%Y-%m-%d %H:%M:%S')
199 
200  return str(lastModified)
201 
202 
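A sketch of the 'Last-Modified' branch, with hypothetical stubs (assumes DateTimeType parses RFC 1123 dates, as the code above implies):

class _StubRes(object):
    headers = {'Last-Modified': 'Wed, 21 Oct 2015 07:28:00 GMT'}

class _StubResource(object):
    http_code = 200  # not 304, so urlObj.tcDate is not consulted

processor = ResourceProcess()
processor.urlObj = object()  # any non-None value satisfies checkFieldsIsNone()
print(processor.calcLastModified(_StubResource(), _StubRes(), 86400))
# expected: '2015-10-21 07:28:00'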

◆ checkFieldsIsNone()

def dc_crawler.ResourceProcess.ResourceProcess.checkFieldsIsNone(self, checkList)

Definition at line 45 of file ResourceProcess.py.

45  def checkFieldsIsNone(self, checkList):
46  # for field in self.__dict__:
47  # if field in checkList and (not hasattr(self, field) or getattr(self, field) is None):
48  # raise Exception(">>> [ResourceProcess] Mandatory field must be initialized, field Name = " + field)
49  for name in checkList:
50  if not hasattr(self, name) or getattr(self, name) is None:
 51  raise Exception("Mandatory field `%s` must be initialized!" % name)
52 
53 
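The guard raises as soon as any listed attribute is missing or None; for example, right after construction:

processor = ResourceProcess()
try:
    processor.checkFieldsIsNone(["dbWrapper", "batchItem"])
except Exception as err:
    print(err)  # both attributes are still None at this point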

◆ checkResourcesResponse()

def dc_crawler.ResourceProcess.ResourceProcess.checkResourcesResponse(self, res, maxResourceSize, updateSiteCallback)

Definition at line 225 of file ResourceProcess.py.

225  def checkResourcesResponse(self, res, maxResourceSize, updateSiteCallback):
226  ret = True
227  self.checkFieldsIsNone(["resource"])
228  resourceSize = res.content_size
229  logger.debug("MaxResourceSize: " + str(maxResourceSize) + " ResourceSize: " + str(resourceSize))
230  if resourceSize == 0 and self.resource.http_code / 100 != 3:
231  self.resource.error_mask = APP_CONSTS.ERROR_EMPTY_RESPONSE
232  updateSiteCallback(APP_CONSTS.ERROR_EMPTY_RESPONSE)
233  ret = False
234  elif maxResourceSize and resourceSize > maxResourceSize:
235  self.resource.error_mask = APP_CONSTS.ERROR_RESPONSE_SIZE_ERROR
236  updateSiteCallback(APP_CONSTS.ERROR_RESPONSE_SIZE_ERROR)
237  logger.debug("Site MaxResourceSize limit overshooted.")
238  ret = False
239  else:
240  self.resource.html_content = res.rendered_unicode_content
241  self.resource.binary_content = res.str_content
242 
243  if ret and (res.status_code / 100 == 4 or res.status_code / 100 == 5):
244  self.resource.error_mask = APP_CONSTS.ERROR_HTTP_ERROR
245  # Add error mask about forbidden fetch
246  if res.status_code == CRAWLER_CONSTS.HTTP_CODE_403:
247  self.resource.error_mask = APP_CONSTS.ERROR_FETCH_FORBIDDEN
248 
249  updateSiteCallback(self.resource.error_mask)
250  ret = False
251  if ret:
252  self.addSiteSize(resourceSize)
253  return ret
254 
255 
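A usage sketch with hypothetical stubs; updateSiteCallback is any callable taking an error mask, and resource is normally the CrawledResource built by generateResource():

class _StubRes(object):
    content_size = 512
    status_code = 200
    rendered_unicode_content = u'<html></html>'
    str_content = '<html></html>'

class _StubCrawledResource(object):
    http_code = 200
    error_mask = 0

def updateSite(errorMask):
    print('site error mask updated:', errorMask)  # hypothetical callback

processor = ResourceProcess()
processor.resource = _StubCrawledResource()
print(processor.checkResourcesResponse(_StubRes(), 1048576, updateSite))
# True: non-empty response, within the size limit, HTTP 2xx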

◆ convertCharset()

def dc_crawler.ResourceProcess.ResourceProcess.convertCharset(self, headers, charset)

Definition at line 78 of file ResourceProcess.py.

78  def convertCharset(self, headers, charset):
79  # variable for result
80  responseHeader = ''
81  logger.debug("headers: %s, type: %s", str(headers), str(type(headers)))
82  logger.debug("charset: %s, type: %s", str(charset), str(type(charset)))
83 
84  try:
85  if isinstance(headers, requests.structures.CaseInsensitiveDict) and isinstance(charset, basestring):
86  codec = self.getCodec(charset)
87  logger.debug("codec: %s", str(codec))
88  if codec is None:
89  responseHeader = '\r\n'.join(['%s: %s' % (k, v) for k, v in headers.iteritems()])
90  else:
91  responseHeader = '\r\n'.join(['%s: %s' % (k.decode(codec).encode('utf-8'), v.decode(codec).encode('utf-8')) \
92  for k, v in headers.iteritems()])
93  except Exception, err:
94  logger.error(str(err))
95 
96  return responseHeader
97 
98 
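A sketch (assumes the Python 2 environment of the module and the requests library, matching the isinstance checks above):

import requests.structures

headers = requests.structures.CaseInsensitiveDict()
headers['Content-Type'] = 'text/html; charset=utf-8'
headers['Server'] = 'nginx'

processor = ResourceProcess()
print(processor.convertCharset(headers, 'utf-8'))
# 'Content-Type: text/html; charset=utf-8\r\nServer: nginx' (header order may vary)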

◆ domParser()

def dc_crawler.ResourceProcess.ResourceProcess.domParser(self, htmlRecover, rendered_unicode_content, http_code, charset)

Definition at line 261 of file ResourceProcess.py.

261  def domParser(self, htmlRecover, rendered_unicode_content, http_code, charset):
262  ret = None
263 
264 # logger.debug("!!! domParser ENTER !!! http_code: %s, charset: '%s'\nrendered_unicode_content: %s",
265 # str(http_code), str(charset), str(rendered_unicode_content))
266  if charset is None or charset == "":
267  charset = 'utf-8'
268  parser = lxml.etree.HTMLParser(encoding=charset) # pylint: disable=E1101
269  if http_code == CRAWLER_CONSTS.HTTP_CODE_304:
270  ret = lxml.html.fromstring("<html></html>", parser=parser)
271  else:
272  try:
273  rendered_unicode_content = rendered_unicode_content.decode(charset).encode('utf-8')
274  ret = lxml.html.fromstring(rendered_unicode_content.decode('utf-8').encode(charset), parser=parser)
275  except Exception, err:
276  logger.debug("Wrong DOM model structure. Description: " + str(err))
277  if htmlRecover is not None and htmlRecover == self.RECOVER_IF_FAILED:
278  logger.debug("Try to fix DOM by tidylib.")
279  tidy_content, errors = tidylib.tidy_document(rendered_unicode_content.decode('utf-8').encode(charset))
280  logger.debug("tidylib errors: %s", str(errors))
281  try:
282  ret = lxml.html.fromstring(tidy_content, parser=parser)
283  except Exception, err:
284  logger.error('domParser error: ' + str(err))
285 
286  return ret
287 
288 
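A sketch of a successful parse; htmlRecover=RECOVER_IF_FAILED only matters when the first lxml parse fails and the tidylib fallback runs:

processor = ResourceProcess()
dom = processor.domParser(ResourceProcess.RECOVER_IF_FAILED,
                          '<html><body><p>hello</p></body></html>',
                          200, 'utf-8')
if dom is not None:
    print(dom.xpath('//p/text()'))  # ['hello']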

◆ generateResource()

def dc_crawler.ResourceProcess.ResourceProcess.generateResource(self, startTime, res, headers, crawledTime, defaultIcrCrawlTime, contentTypeMap=None)

Definition at line 107 of file ResourceProcess.py.

107  def generateResource(self, startTime, res, headers, crawledTime, defaultIcrCrawlTime, contentTypeMap=None): # pylint: disable=W0613
108  # use charset to improve encoding detection
109  resource = CrawledResource()
110  resource.meta_content = res.meta_res
111  resource.crawling_time = int((crawledTime - startTime) * 1000)
112  if res.content_size is not None and resource.crawling_time != 0:
113  resource.bps = res.content_size / resource.crawling_time * 1000
114 
115  logger.info("crawling_time: %s, bps: %s", resource.crawling_time, resource.bps)
116  resource.http_code = res.status_code
117  logger.debug("headers is :%s", res.headers)
118  localHeaders = {}
119  if res.headers is not None:
120  for elem in res.headers:
121  localHeaders[elem.lower()] = res.headers[elem]
122 
123  logger.debug("!!! localHeaders = %s", str(localHeaders))
124  logger.debug("!!! localHeaders.get('content-type', '') = %s", str(localHeaders.get('content-type', '')))
125 
126  # resource.content_type = localHeaders.get('content-type', 'text/html').split(';')[0]
127  resource.content_type = localHeaders.get('content-type', 'text/xml').split(';')[0]
128 
129  # save cookies
130  resource.cookies = res.cookies
131 
132  if res.encoding:
133  logger.debug("!!! res.encoding = '%s'", str(res.encoding))
134  if isinstance(res.encoding, basestring):
135  resource.charset = res.encoding.split(',')[0]
136  else:
137  resource.charset = res.encoding
138  else:
139  resource.charset = "utf-8"
140 
141  if res.request is not None and hasattr(res.request, 'headers') and res.request.headers is not None:
142  resource.html_request = '\r\n'.join(['%s: %s' % (k, v) for k, v in res.request.headers.iteritems()])
143  elif res.request is not None and isinstance(res.request, dict) and 'headers' in res.request and\
144  res.request['headers'] is not None:
145  resource.html_request = '\r\n'.join(['%s: %s' % (k, v) for k, v in res.request['headers'].iteritems()])
146  else:
147  resource.html_request = ""
148 
149  if res.headers is not None:
150  try:
151  resource.response_header = self.convertCharset(res.headers, resource.charset)
152  except Exception, err:
153  logger.error(str(err))
154  logger.info(getTracebackInfo())
155 
156  resource.last_modified = self.calcLastModified(resource, res, defaultIcrCrawlTime)
157 
158  if contentTypeMap is not None and resource.content_type in contentTypeMap:
159  logger.debug(">>> Mime type replaced from %s to %s", resource.content_type, contentTypeMap[resource.content_type])
160  resource.content_type = copy.deepcopy(contentTypeMap[resource.content_type])
161  logger.debug("request is: %s", resource.html_request)
162  logger.debug("response is: %s", resource.response_header)
163 
164  return resource
165 
166 
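A sketch with a hypothetical response stub exposing exactly the attributes the method reads (meta_res, content_size, status_code, headers, cookies, encoding, request):

import time

class _StubRes(object):
    meta_res = None
    content_size = 2048
    status_code = 200
    headers = {'Content-Type': 'text/html; charset=utf-8'}
    cookies = {}
    encoding = 'utf-8'
    request = None

processor = ResourceProcess()
processor.urlObj = object()    # calcLastModified() only checks it is not None
startTime = time.time() - 0.5  # pretend the fetch took half a second
res = _StubRes()
resource = processor.generateResource(startTime, res, res.headers, time.time(), 86400)
print(resource.content_type, resource.charset)  # 'text/html' 'utf-8'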

◆ getCodec()

def dc_crawler.ResourceProcess.ResourceProcess.getCodec(self, charset)

Definition at line 58 of file ResourceProcess.py.

58  def getCodec(self, charset):
59  # variable for result
60  ret = None
61  if isinstance(charset, basestring):
62  charset = charset.split(',')[0]
63  if charset in CRAWLER_CONSTS.standardEncodings.keys():
64  ret = charset
65  else:
66  for codec, aliases in CRAWLER_CONSTS.standardEncodings.items():
67  if aliases.find(charset) > -1 or aliases.find(charset.lower()) > -1:
68  ret = codec
69  break
70 
71  return ret
72 
73 
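Resolution examples; the exact results depend on the codecs and aliases listed in CRAWLER_CONSTS.standardEncodings:

processor = ResourceProcess()
print(processor.getCodec('utf-8'))             # 'utf-8' when listed as a codec
print(processor.getCodec('UTF-8,ISO-8859-1'))  # only the part before ',' is looked up
print(processor.getCodec(None))                # None: not a string at all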

◆ isAllowedReplaceMimeType()

def dc_crawler.ResourceProcess.ResourceProcess.isAllowedReplaceMimeType(inputData=None, urlObj=None)
static

Definition at line 319 of file ResourceProcess.py.

319  def isAllowedReplaceMimeType(inputData=None, urlObj=None):
320  logger.debug('>>> isAllowedReplaceMimeType enter....')
321  # variable for result
322  ret = False
323  if inputData is not None:
324  isOkElemList = []
325  for element in inputData:
326  logger.debug('>>> element: ' + str(element))
327 
328  if "url_expression" in element and urlObj is not None and urlObj.url is not None:
329  logger.debug('>>> url: ' + str(urlObj.url))
330  match = re.search(element["url_expression"], str(urlObj.url))
331  if match is None:
332  logger.debug('>>> url_expression fail')
333  isOkElemList.append(False)
334  continue
335  else:
336  logger.debug('>>> url_expression good')
337 
338  modeNumber = 0
339  urlTypes = []
340  urlParent = []
341  contentTypes = []
342 
343  if "mode" in element:
344  modeNumber = int(element["mode"])
345 
346  if "url_types" in element:
347  urlTypes = element["url_types"]
348 
349  if "url_parent" in element:
350  urlParent = element["url_parent"]
351 
352  if "content_types" in element:
353  contentTypes = element["content_types"]
354 
355  logger.debug('>>> modeNumber: ' + str(modeNumber))
356  logger.debug('>>> urlTypes: ' + str(urlTypes))
357  logger.debug('>>> urlParent: ' + str(urlParent))
358  logger.debug('>>> contentTypes: ' + str(contentTypes))
359 
360 
361  logger.debug('>>>>> urlObj.contentType: ' + str(urlObj.contentType))
362 
363  if modeNumber == 0:
364  pass
365  elif modeNumber == 1 and urlObj.contentType != "":
366  logger.debug('>>> mode (' + str(modeNumber) + ') fail, contentType: ' + str(urlObj.contentType))
367  isOkElemList.append(False)
368  continue
369  elif modeNumber == 2 and urlObj.contentType not in urlTypes:
370  logger.debug('>>> mode (' + str(modeNumber) + ') fail, contentType: ' + str(urlObj.contentType) + \
371  ' urlTypes: ' + str(urlTypes))
372  isOkElemList.append(False)
373  continue
374  elif modeNumber == 3 and urlObj.contentType in urlTypes:
375  logger.debug('>>> mode (' + str(modeNumber) + ') fail, contentType: ' + str(urlObj.contentType) + \
376  ' urlTypes: ' + str(urlTypes))
377  isOkElemList.append(False)
378  continue
379 
380  isOk = False
381  if len(urlTypes) > 0:
382  for urlType in urlTypes:
383  if urlType == urlObj.type:
384  isOk = True
385  else:
386  isOk = True
387 
388  if not isOk:
389  logger.debug('>>> urlTypes fail: ' + str(urlTypes) + ' urlObj.type = ' + str(urlObj.type))
390  isOkElemList.append(False)
391  continue
392 
393  isOk = False
394  if len(urlParent) > 0:
395  for parentElem in urlParent:
396  if parentElem == 0 and not urlObj.parentMd5:
397  isOk = True
398  elif parentElem == 1 and urlObj.parentMd5:
399  isOk = True
400  else:
401  isOk = True
402 
403  if not isOk:
404  logger.debug('>>> urlParent fail: ' + str(urlParent) + ' urlObj.parentMd5: ' + str(urlObj.parentMd5))
405  isOkElemList.append(False)
406  continue
407 
408  # all success
409  isOkElemList.append(True)
410 
411  # Make result after loop
412  logger.debug('isOkElemList: ' + str(isOkElemList))
413  if True in isOkElemList:
414  ret = True
415 
416  return ret
417 
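A sketch with a hypothetical rule list and URL stub: mode 0 skips the content-type checks, and empty url_types/url_parent lists accept any URL:

class _StubUrl(object):
    url = 'http://example.com/feed.xml'  # hypothetical URL
    contentType = ''
    type = 0
    parentMd5 = ''

rules = [{
    'url_expression': r'\.xml$',
    'mode': 0,
    'url_types': [],
    'url_parent': [],
    'content_types': [],
}]

print(ResourceProcess.isAllowedReplaceMimeType(rules, _StubUrl()))  # True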

◆ mimeDetectByContent()

def dc_crawler.ResourceProcess.ResourceProcess.mimeDetectByContent(self, crawledResource, contentTypeMap=None, urlObj=None)

Definition at line 293 of file ResourceProcess.py.

293  def mimeDetectByContent(self, crawledResource, contentTypeMap=None, urlObj=None): # pylint: disable=W0613
294  ret = None
295  if crawledResource.dynamic_fetcher_type:
296  rawUnicodeContent = crawledResource.meta_content
297  else:
298  # rawUnicodeContent = crawledResource.html_content
299  rawUnicodeContent = crawledResource.binary_content
300  if rawUnicodeContent is not None:
301  ret = magic.from_buffer(str(rawUnicodeContent), mime=True)
302  if contentTypeMap is not None and ret in contentTypeMap:
303  logger.debug(">>> Mime type replaced from %s to %s", ret, contentTypeMap[ret])
304  ret = contentTypeMap[ret]
305  return ret
306 
307 
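A sketch (assumes the python-magic binding the module imports as magic; a falsy dynamic_fetcher_type selects binary_content over meta_content):

class _StubCrawledResource(object):
    dynamic_fetcher_type = None
    meta_content = None
    binary_content = '<html><body>hi</body></html>'

processor = ResourceProcess()
print(processor.mimeDetectByContent(_StubCrawledResource()))  # e.g. 'text/html'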

Member Data Documentation

◆ batchItem

dc_crawler.ResourceProcess.ResourceProcess.batchItem

Definition at line 38 of file ResourceProcess.py.

◆ dbWrapper

dc_crawler.ResourceProcess.ResourceProcess.dbWrapper

Definition at line 37 of file ResourceProcess.py.

◆ RECOVER_IF_FAILED

string dc_crawler.ResourceProcess.ResourceProcess.RECOVER_IF_FAILED = "2"
static

Definition at line 34 of file ResourceProcess.py.

◆ resource

dc_crawler.ResourceProcess.ResourceProcess.resource

Definition at line 39 of file ResourceProcess.py.

◆ urlObj

dc_crawler.ResourceProcess.ResourceProcess.urlObj

Definition at line 40 of file ResourceProcess.py.


The documentation for this class was generated from the following file:
ResourceProcess.py