HCE Project Python language Distributed Tasks Manager Application, Distributed Crawler Application and client API bindings.  2.0.0-chaika
Hierarchical Cluster Engine Python language binding
custom_extractor.py
Go to the documentation of this file.
1 # coding: utf-8
2 '''
3 Created on Mar 02, 2016
4 
5 @package: dc_processor
6 @author: scorp
7 @link: http://hierarchical-cluster-engine.com/
8 @copyright: Copyright © 2013-2014 IOIX Ukraine
9 @license: http://hierarchical-cluster-engine.com/license/
10 @since: 0.1
11 '''
12 
13 import signal
14 import types
15 import dc_processor.Constants as CONSTS
16 from dc_processor.base_extractor import BaseExtractor
17 from dc_processor.base_extractor import signal_handler
18 from app.Utils import ExceptionLog
19 from app.Utils import varDump
20 import app.Utils as Utils # pylint: disable=F0401
21 
22 # Logger initialization
23 logger = Utils.MPLogger().getLogger()
24 
25 
26 # CustomExtractor extractor class, extracts data from a custom structure
28 
29 
30  # #constructor
31  # initialize default fields
32  # @param config - Scraper's config
33  # @param templ - default template
34  # @param domain - processing url's domain
35  # @param processorProperties - Scraper's processorProperties
def __init__(self, config, templ=None, domain=None, processorProperties=None):
    '''
    Initialize the extractor through BaseExtractor and register its name.

    @param config - Scraper's config
    @param templ - default template
    @param domain - processing url's domain
    @param processorProperties - Scraper's processorProperties
    @raise Exception - any initialization error is logged and re-raised unchanged
    '''
    try:
        BaseExtractor.__init__(self, config, templ, domain, processorProperties)
        logger.debug("Properties: %s", varDump(self.properties))
        self.name = "Custom extractor"
        # set module rank from module's properties (keyed by the concrete class name)
        className = self.__class__.__name__
        self.rankReading(className)
        self.data["extractor"] = self.name
    except Exception as exc:
        # Log through the shared handler, then let the caller see the original error.
        logLevels = {ExceptionLog.LEVEL_NAME_ERROR: ExceptionLog.LEVEL_VALUE_DEBUG}
        ExceptionLog.handler(
            logger,
            exc,
            "Custom extractor constructor error: possible /tmp not permitted to write",
            (),
            logLevels)
        raise
48 
49 
50  # #extractTags method, common data extraction method
51  # @param resource - incoming resource element
52  # @param reslut - incoming reslut element, which filled inside method
53  # @return reslut element
def extractTags(self, resource, reslut):
    '''
    Copy every entry of resource.raw_html into reslut as a tag.

    Only a dict-typed resource.raw_html is processed. Each key becomes a tag
    name; a list value is passed through as is, any other value is wrapped
    into a one-element list holding its str() form.

    The work runs under a SIGALRM based time limit taken from
    processorProperties['EXTRACTOR_CUSTOM_MAX_EXECUTION'] when present,
    otherwise from CONSTS.TIME_EXECUTION_LIMIT. Errors raised while copying
    are logged and swallowed, so reslut is always returned (possibly
    partially filled).

    @param resource - incoming resource element
    @param reslut - incoming reslut element, which filled inside method
    @return reslut element
    '''
    # support time execution limit
    # NOTE(review): signal_handler presumably raises to interrupt the work - confirm in base_extractor
    signal.signal(signal.SIGALRM, signal_handler)
    if 'EXTRACTOR_CUSTOM_MAX_EXECUTION' in self.processorProperties:
        t = int(self.processorProperties['EXTRACTOR_CUSTOM_MAX_EXECUTION'])
    else:
        t = CONSTS.TIME_EXECUTION_LIMIT
    signal.alarm(t)
    logger.debug("Max execution time signal handler set timeout as: %s", str(t))

    try:
        rawData = resource.raw_html
        # isinstance() already rejects None, so no separate None test is needed
        if isinstance(rawData, dict):
            for key in rawData:
                value = rawData[key]
                localTagValue = value if isinstance(value, list) else [str(value)]
                self.addTag(result=reslut, tag_name=key, tag_value=localTagValue)
    except IOError as err:
        ExceptionLog.handler(logger, err, "Custom extractor file error. It may be unsupported encoding like jp", (),
                             {ExceptionLog.LEVEL_NAME_ERROR: ExceptionLog.LEVEL_VALUE_DEBUG})
    except Exception as err:
        ExceptionLog.handler(logger, err, "Custom extractor error", (),
                             {ExceptionLog.LEVEL_NAME_ERROR: ExceptionLog.LEVEL_VALUE_DEBUG})
    finally:
        # Cancel the pending alarm; otherwise it would fire later, outside this
        # method, and interrupt whatever code happens to be running then.
        signal.alarm(0)
    return reslut
def __init__(self, config, templ=None, domain=None, processorProperties=None)
def varDump(obj, stringify=True, strTypeMaxLen=256, strTypeCutSuffix='...', stringifyType=1, ignoreErrors=False, objectsHash=None, depth=0, indent=2, ensure_ascii=False, maxDepth=10)
Definition: Utils.py:410
def addTag(self, result, tag_name, tag_value, xpath="", isDefaultTag=False, callAdjustment=True, tagType=None, allowNotFilled=False)