2 @file newspaper_extractor.py 3 @author Alexey <developers.hce@gmail.com> 4 @link http://hierarchical-cluster-engine.com/ 5 @copyright Copyright © 2013 IOIX Ukraine 6 @license http://hierarchical-cluster-engine.com/license/ 7 @package HCE project node API 29 EXTRACTOR_NAME =
"Newspaper extractor" 30 SECTION_NAME =
"extractor" 33 def __init__(self, config, templ=None, domain=None, processorProperties=None):
34 BaseExtractor.__init__(self, config, templ, domain, processorProperties)
42 self.
name =
"Newspaper extractor" 43 self.
data[
"extractor"] =
"Newspaper extractor" 44 self.
userAgent = processorProperties[
"EXTRACTOR_USER_AGENT"]
if "EXTRACTOR_USER_AGENT" in\
45 processorProperties
else None 50 if article.top_img
is not None:
52 ret.append(article.top_img)
53 ret.extend([x
for x
in article.imgs
if x != article.top_img])
62 ret.append(localValue)
69 signal.signal(signal.SIGALRM, signal_handler)
73 t = CONSTS.TIME_EXECUTION_LIMIT
75 logger.debug(
"Max execution time signal handler set timeout as: %s", str(t))
77 isLoadUrlsParam =
False 85 if imageRation
is not None:
86 kArgs = {
"title":
u'',
"source_url":
u'',
"config":
None,
"image_dimension_ration": imageRation}
88 kArgs[
"browser_user_agent"] = self.
userAgent 89 logger.debug(
">>> NewspaperExtractor sets userAgent, is = " + str(self.
userAgent))
92 kArgs[
"isLoadUrls"] = isLoadUrlsParam
93 if CONSTS.TAG_MEDIA
in reslt.tags.keys()
and not self.
isTagNotFilled(reslt, CONSTS.TAG_MEDIA):
94 logger.debug(
"!!! Tag 'media' already selected. Skipped")
95 kArgs[
"isLoadUrls"] =
False 99 article.html = resource.raw_html
100 article.is_downloaded =
True 104 self.
addTag(result=reslt, tag_name=CONSTS.TAG_TITLE, tag_value=article.title)
106 self.
addTag(result=reslt, tag_name=CONSTS.TAG_LINK, tag_value=resource.url)
107 self.
addTag(result=reslt, tag_name=CONSTS.TAG_DESCRIPTION, tag_value=article.summary)
108 if hasattr(article,
"published_date"):
109 self.
addTag(result=reslt, tag_name=CONSTS.TAG_PUB_DATE, tag_value=str(article.published_date))
110 elif hasattr(article,
"publish_date"):
111 self.
addTag(result=reslt, tag_name=CONSTS.TAG_PUB_DATE, tag_value=str(article.publish_date))
112 self.
addTag(result=reslt, tag_name=CONSTS.TAG_AUTHOR, tag_value=article.authors)
113 self.
addTag(result=reslt, tag_name=CONSTS.TAG_DC_DATE, tag_value=article.additional_data)
114 self.
addTag(result=reslt, tag_name=CONSTS.TAG_CONTENT_UTF8_ENCODED, tag_value=article.text)
115 self.
addTag(result=reslt, tag_name=CONSTS.TAG_KEYWORDS, tag_value=article.meta_keywords)
117 logger.debug(
"!!! Tag 'media' imgList: %s", str(imgList))
118 if imgList
is not None:
119 self.
addTag(result=reslt, tag_name=CONSTS.TAG_MEDIA, tag_value=imgList)
121 except Exception
as err:
122 ExceptionLog.handler(logger, err,
'Newspaper parse error:', (), \
123 {ExceptionLog.LEVEL_NAME_ERROR:ExceptionLog.LEVEL_VALUE_DEBUG})
def varDump(obj, stringify=True, strTypeMaxLen=256, strTypeCutSuffix='...', stringifyType=1, ignoreErrors=False, objectsHash=None, depth=0, indent=2, ensure_ascii=False, maxDepth=10)