Inheritance diagram for dc_processor.goose_extractor.GooseExtractor:

Collaboration diagram for dc_processor.goose_extractor.GooseExtractor:

Public Member Functions
def	__init__ (self, config, templ=None, domain=None, processorProperties=None)

def	extractTags (self, resource, reslt)

Public Member Functions inherited from dc_processor.base_extractor.BaseExtractor
def	__init__ (self, config, templ=None, domain=None, processorProperties=None)

def	__str__ (self)

def	__repr__ (self)

def	loadScraperProperties (self, scraperPropFileName)

def	isTagNotFilled (self, result, tagName)

def	isTagValueNotEmpty (self, tagValue)

def	tagValueElemValidate (self, tagValueElem, conditionElem)

def	tagValueValidate (self, tagName, tagValue)

def	addTag (self, result, tag_name, tag_value, xpath="", isDefaultTag=False, callAdjustment=True, tagType=None, allowNotFilled=False)

def	calculateMetrics (self, response)

def	rankReading (self, exctractorName)

Public Attributes
	name

Public Attributes inherited from dc_processor.base_extractor.BaseExtractor
	config

	processorProperties

	name

	rank

	process_mode

	modules

	data

	db_dc_scraper_db

	DBConnector

	imgDelimiter

	tagsValidator

Static Public Attributes
	goose = None

Static Public Attributes inherited from dc_processor.base_extractor.BaseExtractor
	properties = None

dictionary	tag

dictionary	tagsMask

Detailed Description

Definition at line 25 of file goose_extractor.py.

Constructor & Destructor Documentation

◆ __init__()

def dc_processor.goose_extractor.GooseExtractor.__init__	(	self,
		config,
		templ = `None`,
		domain = `None`,
		processorProperties = `None`
	)

Definition at line 28 of file goose_extractor.py.

   def __init__(self, config, templ=None, domain=None, processorProperties=None):
     try:
       BaseExtractor.__init__(self, config, templ, domain, processorProperties)
       logger.debug("Properties: %s", varDump(self.properties))
       self.name = "Goose extractor"
       # set module rank from module's properties
       self.rankReading(self.__class__.__name__)
       if "EXTRACTOR_USER_AGENT" in processorProperties and processorProperties["EXTRACTOR_USER_AGENT"] is not None:
         self.goose = Goose({'browser_user_agent': processorProperties["EXTRACTOR_USER_AGENT"]})
         logger.debug(">>>  NewspaperExtractor sets userAgent, is" + str(processorProperties["EXTRACTOR_USER_AGENT"]))
       else:
         self.goose = Goose()
       self.data["extractor"] = "Goose extractor"
     except Exception as err:
       ExceptionLog.handler(logger, err, "Goose extractor constructor error: possible /tmp not permitted to write", (), \
                            {ExceptionLog.LEVEL_NAME_ERROR:ExceptionLog.LEVEL_VALUE_DEBUG})
       raise
 
 

Here is the call graph for this function:

Member Function Documentation

◆ extractTags()

def dc_processor.goose_extractor.GooseExtractor.extractTags	(	self,
		resource,
		reslt
	)

Definition at line 47 of file goose_extractor.py.

   def extractTags(self, resource, reslt):
     # support time execution limit
     signal.signal(signal.SIGALRM, signal_handler)
     if 'EXTRACTOR_GOOSE_MAX_EXECUTION' in self.processorProperties:
       t = int(self.processorProperties['EXTRACTOR_GOOSE_MAX_EXECUTION'])
     else:
       t = CONSTS.TIME_EXECUTION_LIMIT
     signal.alarm(t)
     logger.debug("Max execution time signal handler set timeout as: %s", str(t))
 
     try:
       article = self.goose.extract(raw_html=str(resource.raw_html), url=resource.url)
       self.addTag(result=reslt, tag_name=CONSTS.TAG_TITLE, tag_value=article.title)
       self.addTag(result=reslt, tag_name=CONSTS.TAG_LINK, tag_value=article.canonical_link)
       self.addTag(result=reslt, tag_name=CONSTS.TAG_PUB_DATE, tag_value=article.publish_date)
       # self.addTag(result=reslt, tag_name=CONSTS.TAG_GUID, tag_value=article.link_hash)
       self.addTag(result=reslt, tag_name=CONSTS.TAG_DC_DATE, tag_value=article.additional_data)
       self.addTag(result=reslt, tag_name=CONSTS.TAG_CONTENT_UTF8_ENCODED, tag_value=article.cleaned_text)
       self.addTag(result=reslt, tag_name=CONSTS.TAG_KEYWORDS, tag_value=article.meta_keywords)
 
       if CONSTS.TAG_MEDIA in reslt.tags.keys() and not self.isTagNotFilled(reslt, CONSTS.TAG_MEDIA):
         logger.debug("!!! Tag 'media' already selected. Skipped... value = %s", str(reslt.tags[CONSTS.TAG_MEDIA]))
       else:
         self.addTag(result=reslt, tag_name=CONSTS.TAG_MEDIA, tag_value=getattr(article, "top_image.src", None))
 
     except IOError as err:
       # ExceptionLog.handler(logger, err, "Goose open file error. It may be unsupported encoding like jp", (), \
       #                      {ExceptionLog.LEVEL_NAME_ERROR:ExceptionLog.LEVEL_VALUE_DEBUG})
       logger.debug("Goose open file error. It may be unsupported encoding like jp. Error: " + str(err))
     except Exception as err:
       # ExceptionLog.handler(logger, err, "Goose parse error", (), \
       #                      {ExceptionLog.LEVEL_NAME_ERROR:ExceptionLog.LEVEL_VALUE_DEBUG})
       logger.debug("Goose parse error. Error: " + str(err))
 
     return reslt
 

Here is the call graph for this function:

Member Data Documentation

◆ goose

dc_processor.goose_extractor.GooseExtractor.goose = None

static

Definition at line 26 of file goose_extractor.py.

◆ name

dc_processor.goose_extractor.GooseExtractor.name

Definition at line 32 of file goose_extractor.py.

The documentation for this class was generated from the following file:

sources/hce/dc_processor/goose_extractor.py

Public Member Functions

Public Attributes

Static Public Attributes