def extractTags(self, resource, reslt):
    """Extract article tags from a resource with Goose and store them in reslt.

    The raw HTML and URL of ``resource`` are passed to ``self.goose.extract``
    and the resulting article fields (title, canonical link, publish date,
    additional data, cleaned text, meta keywords and top image source) are
    recorded through ``self.addTag``.

    The time limit is taken from the ``EXTRACTOR_GOOSE_MAX_EXECUTION`` entry
    of ``self.processorProperties`` when present, otherwise from
    ``CONSTS.TIME_EXECUTION_LIMIT``.

    Args:
        resource: object exposing ``raw_html`` and ``url``.
        reslt: result object exposing a ``tags`` mapping; it is filled in
            place via ``self.addTag``.

    Errors are not propagated: ``IOError`` and any other ``Exception`` raised
    while setting up or extracting are logged at debug level and swallowed.
    """
    try:
        # signal_handler is defined elsewhere in this module; presumably it
        # raises to interrupt a long-running extraction -- TODO confirm.
        signal.signal(signal.SIGALRM, signal_handler)
        if 'EXTRACTOR_GOOSE_MAX_EXECUTION' in self.processorProperties:
            timeout = int(
                self.processorProperties['EXTRACTOR_GOOSE_MAX_EXECUTION'])
        else:
            timeout = CONSTS.TIME_EXECUTION_LIMIT
        # Arm the alarm so the limit announced in the log line below is
        # actually enforced; installing the handler alone never fires it.
        signal.alarm(timeout)
        logger.debug(
            "Max execution time signal handler set timeout as: %s",
            str(timeout))

        article = self.goose.extract(
            raw_html=str(resource.raw_html), url=resource.url)

        self.addTag(result=reslt, tag_name=CONSTS.TAG_TITLE,
                    tag_value=article.title)
        self.addTag(result=reslt, tag_name=CONSTS.TAG_LINK,
                    tag_value=article.canonical_link)
        self.addTag(result=reslt, tag_name=CONSTS.TAG_PUB_DATE,
                    tag_value=article.publish_date)
        self.addTag(result=reslt, tag_name=CONSTS.TAG_DC_DATE,
                    tag_value=article.additional_data)
        self.addTag(result=reslt, tag_name=CONSTS.TAG_CONTENT_UTF8_ENCODED,
                    tag_value=article.cleaned_text)
        self.addTag(result=reslt, tag_name=CONSTS.TAG_KEYWORDS,
                    tag_value=article.meta_keywords)

        media_already_set = (
            CONSTS.TAG_MEDIA in reslt.tags
            and not self.isTagNotFilled(reslt, CONSTS.TAG_MEDIA))
        if media_already_set:
            logger.debug(
                "!!! Tag 'media' already selected. Skipped... value = %s",
                str(reslt.tags[CONSTS.TAG_MEDIA]))
        else:
            # getattr() does not follow dotted paths: asking for
            # "top_image.src" in one call always yields the default.
            # Resolve the two attributes one step at a time instead.
            top_image = getattr(article, "top_image", None)
            self.addTag(result=reslt, tag_name=CONSTS.TAG_MEDIA,
                        tag_value=getattr(top_image, "src", None))
    except IOError as err:
        logger.debug(
            "Goose open file error. It may be unsupported encoding like jp. "
            "Error: %s", str(err))
    except Exception as err:
        # Deliberate best-effort: a failed extraction must not abort the
        # caller, so the error is only logged.
        logger.debug("Goose parse error. Error: %s", str(err))
    finally:
        # Cancel any pending alarm so it cannot fire in unrelated code
        # after this method returns.
        signal.alarm(0)