def extractTags(self, resource, reslt):
    """Extract article tags from ``resource`` and store them on ``reslt``.

    The raw HTML held by ``resource.raw_html`` is handed to a
    ``NewspaperWrapper`` instance and the fields it exposes (title, summary,
    publication date, authors, additional data, text, meta keywords and
    images) are written to ``reslt`` through ``self.addTag``.

    Processor properties read (all optional):
      * ``EXTRACTOR_NEWSPAPER_MAX_EXECUTION`` - timeout, converted with
        ``int``; falls back to ``CONSTS.TIME_EXECUTION_LIMIT``.
      * ``SCRAPER_DOWNLOAD_IMAGES`` - converted with ``bool(int(...))`` and
        passed to the wrapper as ``isLoadUrls``.
      * ``IMAGE_RATION`` - converted with ``float`` and passed to the wrapper
        as ``image_dimension_ration``.

    Any exception raised while extracting is passed to
    ``ExceptionLog.handler`` and not re-raised.

    NOTE(review): this block was reconstructed from a listing that omits
    several original lines; every spot where a statement had to be inferred
    is marked below and should be checked against the real file.

    :param resource: object providing ``raw_html`` and ``url``.
    :param reslt: result object providing ``tags``; receives the new tags.
    """
    # Treat a missing property map as empty so every lookup below is safe.
    props = self.processorProperties if self.processorProperties is not None else {}

    try:
        signal.signal(signal.SIGALRM, signal_handler)
        # The configured limit must win over the default; the default is only
        # a fallback (previously it overwrote the configured value).
        if 'EXTRACTOR_NEWSPAPER_MAX_EXECUTION' in props:
            t = int(props['EXTRACTOR_NEWSPAPER_MAX_EXECUTION'])
        else:
            t = CONSTS.TIME_EXECUTION_LIMIT
        # NOTE(review): the listing skips the line between computing `t` and
        # logging it; arming the alarm here is inferred from the log message
        # below - confirm against the original file.
        signal.alarm(t)
        logger.debug("Max execution time signal handler set timeout as: %s", str(t))

        isLoadUrlsParam = False
        imageRation = None  # must exist even when IMAGE_RATION is not configured
        if "SCRAPER_DOWNLOAD_IMAGES" in props:
            isLoadUrlsParam = bool(int(props["SCRAPER_DOWNLOAD_IMAGES"]))
        if "IMAGE_RATION" in props:
            imageRation = float(props["IMAGE_RATION"])

        # Start from an empty mapping so the assignments below never hit an
        # unbound name when no image ratio is configured.
        kArgs = {}
        if imageRation is not None:
            kArgs = {
                "title": u'',
                "source_url": u'',
                "config": None,
                "image_dimension_ration": imageRation,
            }
        if self.userAgent is not None:
            kArgs["browser_user_agent"] = self.userAgent
            logger.debug(">>> NewspaperExtractor sets userAgent, is = %s", str(self.userAgent))

        kArgs["isLoadUrls"] = isLoadUrlsParam
        # Do not load images again when the media tag is already filled.
        if CONSTS.TAG_MEDIA in reslt.tags and not self.isTagNotFilled(reslt, CONSTS.TAG_MEDIA):
            logger.debug("!!! Tag 'media' already selected. Skipped")
            kArgs["isLoadUrls"] = False

        article = NewspaperWrapper(" ", **kArgs)
        # Feed the already fetched HTML to the wrapper instead of downloading.
        article.html = resource.raw_html
        article.is_downloaded = True
        # NOTE(review): the listing skips three lines here. A parse step is
        # presumably required before the fields below are populated; the call
        # is inferred - confirm the exact call(s) against the original file.
        article.parse()

        self.addTag(result=reslt, tag_name=CONSTS.TAG_TITLE, tag_value=article.title)
        self.addTag(result=reslt, tag_name=CONSTS.TAG_LINK, tag_value=resource.url)
        self.addTag(result=reslt, tag_name=CONSTS.TAG_DESCRIPTION, tag_value=article.summary)
        # The date attribute name differs between wrapper versions; take
        # whichever one is present, preferring `published_date`.
        if hasattr(article, "published_date"):
            self.addTag(result=reslt, tag_name=CONSTS.TAG_PUB_DATE,
                        tag_value=str(article.published_date))
        elif hasattr(article, "publish_date"):
            self.addTag(result=reslt, tag_name=CONSTS.TAG_PUB_DATE,
                        tag_value=str(article.publish_date))
        self.addTag(result=reslt, tag_name=CONSTS.TAG_AUTHOR, tag_value=article.authors)
        self.addTag(result=reslt, tag_name=CONSTS.TAG_DC_DATE, tag_value=article.additional_data)
        self.addTag(result=reslt, tag_name=CONSTS.TAG_CONTENT_UTF8_ENCODED, tag_value=article.text)
        self.addTag(result=reslt, tag_name=CONSTS.TAG_KEYWORDS, tag_value=article.meta_keywords)

        imgList = self.imagesProcessing(article)
        logger.debug("!!! Tag 'media' imgList: %s", str(imgList))
        if imgList is not None:
            self.addTag(result=reslt, tag_name=CONSTS.TAG_MEDIA, tag_value=imgList)
    except Exception as err:
        ExceptionLog.handler(logger, err, 'Newspaper parse error:', (),
                             {ExceptionLog.LEVEL_NAME_ERROR: ExceptionLog.LEVEL_VALUE_DEBUG})
    finally:
        # Always cancel the pending alarm so it cannot fire after this call.
        signal.alarm(0)