2 @file boilerpipe_extractor.py 3 @author Alexey <developers.hce@gmail.com> 4 @link http://hierarchical-cluster-engine.com/ 5 @copyright Copyright © 2013 IOIX Ukraine 6 @license http://hierarchical-cluster-engine.com/license/ 7 @package HCE project node API 13 from boilerpipe.extract
import Extractor
def __init__(self, config, templ=None, domain=None, processorProperties=None):
  """Initialise the base extractor, then label this instance as the boilerpipe one.

  All four arguments are forwarded unchanged, in positional order, to
  BaseExtractor.__init__; this constructor does not inspect them itself.
  """
  BaseExtractor.__init__(self, config, templ, domain, processorProperties)
  # The same constant identifies the extractor both as an attribute and
  # under the "extractor" key of the data mapping.
  # NOTE(review): self.data is presumably created by the base constructor - confirm.
  extractorName = CONSTS.EXTRACTOR_NAME_BOILERPIPE
  self.name = extractorName
  self.data["extractor"] = extractorName
36 extractor = Extractor(extractor=
'ArticleExtractor', html=resource.raw_html)
37 text = extractor.getText()
38 logger.info(
"Article's corpus: %s", text)
39 self.
addTag(result=reslt, tag_name=CONSTS.TAG_CONTENT_UTF8_ENCODED, tag_value=text)
40 except Exception, err:
41 ExceptionLog.handler(logger, err,
'extractTags:', (err), \
42 {ExceptionLog.LEVEL_NAME_ERROR:ExceptionLog.LEVEL_VALUE_DEBUG})
def varDump(obj, stringify=True, strTypeMaxLen=256, strTypeCutSuffix='...', stringifyType=1, ignoreErrors=False, objectsHash=None, depth=0, indent=2, ensure_ascii=False, maxDepth=10)