HCE Project Python language Distributed Tasks Manager Application, Distributed Crawler Application and client API bindings.  2.0.0-chaika
Hierarchical Cluster Engine Python language binding
boilerpipe_extractor.py
Go to the documentation of this file.
1 """@package docstring
2  @file boilerpipe_extractor.py
3  @author Alexey <developers.hce@gmail.com>
4  @link http://hierarchical-cluster-engine.com/
5  @copyright Copyright &copy; 2013 IOIX Ukraine
6  @license http://hierarchical-cluster-engine.com/license/
7  @package HCE project node API
8  @since 0.1
9  """
10 
11 from dc_processor.base_extractor import BaseExtractor
12 import dc_processor.Constants as CONSTS
13 from boilerpipe.extract import Extractor # pylint: disable=F0401
14 from app.Utils import varDump
15 from app.Utils import ExceptionLog
16 import app.Utils as Utils # pylint: disable=F0401
17 
18 # Logger initialization
19 logger = Utils.MPLogger().getLogger()
20 
21 
23 
24 
25  def __init__(self, config, templ=None, domain=None, processorProperties=None):
26  BaseExtractor.__init__(self, config, templ, domain, processorProperties)
27  self.name = CONSTS.EXTRACTOR_NAME_BOILERPIPE
28  self.data["extractor"] = CONSTS.EXTRACTOR_NAME_BOILERPIPE
29  logger.debug("Properties: %s", varDump(self.properties))
30 
31  self.rankReading(self.__class__.__name__)
32 
33 
34  def extractTags(self, resource, reslt):
35  try:
36  extractor = Extractor(extractor='ArticleExtractor', html=resource.raw_html)
37  text = extractor.getText()
38  logger.info("Article's corpus: %s", text)
39  self.addTag(result=reslt, tag_name=CONSTS.TAG_CONTENT_UTF8_ENCODED, tag_value=text)
40  except Exception, err:
41  ExceptionLog.handler(logger, err, 'extractTags:', (err), \
42  {ExceptionLog.LEVEL_NAME_ERROR:ExceptionLog.LEVEL_VALUE_DEBUG})
43  return reslt
def varDump(obj, stringify=True, strTypeMaxLen=256, strTypeCutSuffix='...', stringifyType=1, ignoreErrors=False, objectsHash=None, depth=0, indent=2, ensure_ascii=False, maxDepth=10)
Definition: Utils.py:410
def addTag(self, result, tag_name, tag_value, xpath="", isDefaultTag=False, callAdjustment=True, tagType=None, allowNotFilled=False)
def __init__(self, config, templ=None, domain=None, processorProperties=None)