HCE Project Python language Distributed Tasks Manager Application, Distributed Crawler Application and client API bindings.  2.0.0-chaika
Hierarchical Cluster Engine Python language binding
alchemy_extractor.py
Go to the documentation of this file.
1 """@package docstring
2  @file alchemy_extractor.py
3  @author Alexey <developers.hce@gmail.com>
4  @link http://hierarchical-cluster-engine.com/
5  @copyright Copyright &copy; 2013 IOIX Ukraine
6  @license http://hierarchical-cluster-engine.com/license/
7  @package HCE project node API
8  @since 0.1
9  """
10 
11 from dc_processor.base_extractor import BaseExtractor
12 import dc_processor.Constants as CONSTS
13 from dc_processor.alchemyapi import AlchemyAPI
14 from app.Utils import varDump
15 import app.Utils as Utils # pylint: disable=F0401
16 
17 # Module-level logger obtained through the project's MPLogger wrapper; used by the extractor methods in this file.
18 logger = Utils.MPLogger().getLogger()
19 
20 
22 
23 
def __init__(self, config, templ=None, domain=None, processorProperties=None):
    """Initialise the Alchemy extractor on top of BaseExtractor.

    Forwards all arguments to ``BaseExtractor.__init__``, labels the
    instance (``self.name`` and ``self.data["extractor"]``) with
    ``CONSTS.EXTRACTOR_NAME_ALCHEMY``, logs the extractor properties at
    debug level and finally calls ``rankReading`` with this class's name.

    @param config passed through to BaseExtractor
    @param templ passed through to BaseExtractor (default None)
    @param domain passed through to BaseExtractor (default None)
    @param processorProperties passed through to BaseExtractor (default None)
    """
    BaseExtractor.__init__(self, config, templ, domain, processorProperties)

    # Label this instance and its result data with the same extractor name.
    extractorName = CONSTS.EXTRACTOR_NAME_ALCHEMY
    self.name = extractorName
    self.data["extractor"] = extractorName

    logger.debug("Properties: %s", varDump(self.properties))

    # Set the module rank from the module's properties, keyed by class name.
    ownClassName = self.__class__.__name__
    self.rankReading(ownClassName)
32 
33 
def extractTags(self, resource, reslt):
    """Extract the article text from a resource's HTML through AlchemyAPI.

    Calls ``AlchemyAPI().text("html", resource.raw_html)`` and stores the
    returned text in ``reslt`` under the ``CONSTS.TAG_CONTENT_UTF8_ENCODED``
    tag via ``self.addTag``. Extraction is best-effort: any exception is
    logged and swallowed, so ``reslt`` is always returned to the caller.

    @param resource object exposing the page markup as ``raw_html``
    @param reslt result object that receives the extracted tag
    @return the ``reslt`` object that was passed in
    """
    try:
        parser = AlchemyAPI()
        text = parser.text("html", resource.raw_html)
        logger.info("Article's corpus: %s", text)
        self.addTag(result=reslt,
                    tag_name=CONSTS.TAG_CONTENT_UTF8_ENCODED,
                    tag_value=text)
    except Exception as err:  # pylint: disable=W0703
        # Deliberately broad: a failing extractor must not abort processing,
        # so the failure is only reported and the result is still returned.
        logger.error("Alchemy extraction failed: %s", varDump(err))
    return reslt
49 
50 
def varDump(obj, stringify=True, strTypeMaxLen=256, strTypeCutSuffix='...', stringifyType=1, ignoreErrors=False, objectsHash=None, depth=0, indent=2, ensure_ascii=False, maxDepth=10)
Definition: Utils.py:410
def addTag(self, result, tag_name, tag_value, xpath="", isDefaultTag=False, callAdjustment=True, tagType=None, allowNotFilled=False)
def __init__(self, config, templ=None, domain=None, processorProperties=None)