"""
@file goose_extractor.py
@author Alexey <developers.hce@gmail.com>
@link http://hierarchical-cluster-engine.com/
@copyright Copyright © 2013 IOIX Ukraine
@license http://hierarchical-cluster-engine.com/license/
@package HCE project node API
"""

from goose import Goose
def __init__(self, config, templ=None, domain=None, processorProperties=None):
    """Set up the Goose-backed extractor.

    Delegates to BaseExtractor.__init__ with the same arguments, names the
    extractor "Goose extractor", creates the Goose instance and records the
    extractor name under self.data["extractor"].

    When processorProperties contains a non-None "EXTRACTOR_USER_AGENT"
    entry, its value is handed to Goose as 'browser_user_agent'; otherwise
    Goose is created with its default configuration.

    Any exception raised during set-up is passed to ExceptionLog.handler
    together with a hint that /tmp may not be writable.

    NOTE(review): this method was rebuilt from a listing in which some
    source lines were missing (the `try:` line, the branch taken when no
    user agent is configured, and whatever followed the handler call).
    Confirm those parts against the repository copy -- in particular
    whether the original handler re-raised the exception.
    """
    try:
        BaseExtractor.__init__(self, config, templ, domain, processorProperties)
        self.name = "Goose extractor"

        # processorProperties defaults to None; testing membership on None
        # raises TypeError, which the handler below would misreport as a
        # /tmp permission problem. Guard it explicitly.
        userAgent = None
        if processorProperties is not None and "EXTRACTOR_USER_AGENT" in processorProperties:
            userAgent = processorProperties["EXTRACTOR_USER_AGENT"]

        if userAgent is not None:
            self.goose = Goose({'browser_user_agent': userAgent})
            logger.debug(">>> NewspaperExtractor sets userAgent, is" + str(userAgent))
        else:
            # Always leave self.goose defined so later extraction calls do
            # not fail with AttributeError when no user agent is configured.
            self.goose = Goose()

        self.data["extractor"] = "Goose extractor"
    except Exception as err:
        ExceptionLog.handler(logger, err,
                             "Goose extractor constructor error: possible /tmp not permitted to write", (),
                             {ExceptionLog.LEVEL_NAME_ERROR: ExceptionLog.LEVEL_VALUE_DEBUG})
# Body of the tag-extraction method: runs Goose over the resource's raw HTML
# and copies the article fields into `reslt` through self.addTag.
#
# NOTE(review): the enclosing `def` line, the `signal_handler` definition and
# several statements between the visible ones are absent from the listing this
# was rebuilt from. The position of `try:` and of the `else:` branch are
# reconstructions; any alarm arming/reset and the return statement are not
# visible here -- confirm against the repository copy before merging.
signal.signal(signal.SIGALRM, signal_handler)

try:
    t = CONSTS.TIME_EXECUTION_LIMIT
    logger.debug("Max execution time signal handler set timeout as: %s", str(t))

    article = self.goose.extract(raw_html=str(resource.raw_html), url=resource.url)

    self.addTag(result=reslt, tag_name=CONSTS.TAG_TITLE, tag_value=article.title)
    self.addTag(result=reslt, tag_name=CONSTS.TAG_LINK, tag_value=article.canonical_link)
    self.addTag(result=reslt, tag_name=CONSTS.TAG_PUB_DATE, tag_value=article.publish_date)
    self.addTag(result=reslt, tag_name=CONSTS.TAG_DC_DATE, tag_value=article.additional_data)
    self.addTag(result=reslt, tag_name=CONSTS.TAG_CONTENT_UTF8_ENCODED, tag_value=article.cleaned_text)
    self.addTag(result=reslt, tag_name=CONSTS.TAG_KEYWORDS, tag_value=article.meta_keywords)

    # Keep a media tag that is already present and filled.
    if CONSTS.TAG_MEDIA in reslt.tags and not self.isTagNotFilled(reslt, CONSTS.TAG_MEDIA):
        logger.debug("!!! Tag 'media' already selected. Skipped... value = %s", str(reslt.tags[CONSTS.TAG_MEDIA]))
    else:
        # getattr() does not follow dotted paths: looking up "top_image.src"
        # on the article always fell back to None. Resolve the two attributes
        # one at a time; a missing top image still yields None.
        topImage = getattr(article, "top_image", None)
        self.addTag(result=reslt, tag_name=CONSTS.TAG_MEDIA, tag_value=getattr(topImage, "src", None))
except IOError as err:
    logger.debug("Goose open file error. It may be unsupported encoding like jp. Error: " + str(err))
except Exception as err:
    logger.debug("Goose parse error. Error: " + str(err))
def varDump(obj, stringify=True, strTypeMaxLen=256, strTypeCutSuffix='...', stringifyType=1, ignoreErrors=False, objectsHash=None, depth=0, indent=2, ensure_ascii=False, maxDepth=10)