Inheritance diagram for dc_processor.newspaper_extractor.NewspaperExtractor:

Collaboration diagram for dc_processor.newspaper_extractor.NewspaperExtractor:

Public Member Functions
def	__init__ (self, config, templ=None, domain=None, processorProperties=None)

def	imagesProcessing (self, article)

def	extractTags (self, resource, reslt)

Public Member Functions inherited from dc_processor.base_extractor.BaseExtractor
def	__init__ (self, config, templ=None, domain=None, processorProperties=None)

def	__str__ (self)

def	__repr__ (self)

def	loadScraperProperties (self, scraperPropFileName)

def	isTagNotFilled (self, result, tagName)

def	isTagValueNotEmpty (self, tagValue)

def	tagValueElemValidate (self, tagValueElem, conditionElem)

def	tagValueValidate (self, tagName, tagValue)

def	addTag (self, result, tag_name, tag_value, xpath="", isDefaultTag=False, callAdjustment=True, tagType=None, allowNotFilled=False)

def	calculateMetrics (self, response)

def	rankReading (self, exctractorName)

Public Attributes
	name

	userAgent

Public Attributes inherited from dc_processor.base_extractor.BaseExtractor
	config

	processorProperties

	name

	rank

	process_mode

	modules

	data

	db_dc_scraper_db

	DBConnector

	imgDelimiter

	tagsValidator

Static Public Attributes
string	EXTRACTOR_NAME = "Newspaper extractor"

string	SECTION_NAME = "extractor"

Static Public Attributes inherited from dc_processor.base_extractor.BaseExtractor
	properties = None

dictionary	tag

dictionary	tagsMask

Detailed Description

Definition at line 25 of file newspaper_extractor.py.

Constructor & Destructor Documentation

◆ __init__()

def dc_processor.newspaper_extractor.NewspaperExtractor.__init__	(	self,
		config,
		templ = `None`,
		domain = `None`,
		processorProperties = `None`
	)

Definition at line 33 of file newspaper_extractor.py.

   def __init__(self, config, templ=None, domain=None, processorProperties=None):
     BaseExtractor.__init__(self, config, templ, domain, processorProperties)
 
     # self.processorProperties = processorProperties
     logger.debug("Properties: %s", varDump(self.properties))
 
     # set module rank from module's properties
     self.rankReading(self.__class__.__name__)
 
     self.name = "Newspaper extractor"
     self.data["extractor"] = "Newspaper extractor"
     self.userAgent = processorProperties["EXTRACTOR_USER_AGENT"] if "EXTRACTOR_USER_AGENT" in\
      processorProperties else None
 
 

Here is the call graph for this function:

Member Function Documentation

◆ extractTags()

def dc_processor.newspaper_extractor.NewspaperExtractor.extractTags	(	self,
		resource,
		reslt
	)

Definition at line 67 of file newspaper_extractor.py.

   def extractTags(self, resource, reslt):
     # support time execution limit
     signal.signal(signal.SIGALRM, signal_handler)
     if 'EXTRACTOR_NEWSPAPER_MAX_EXECUTION' in self.processorProperties:
       t = int(self.processorProperties['EXTRACTOR_NEWSPAPER_MAX_EXECUTION'])
     else:
       t = CONSTS.TIME_EXECUTION_LIMIT
     signal.alarm(t)
     logger.debug("Max execution time signal handler set timeout as: %s", str(t))
 
     isLoadUrlsParam = False
     imageRation = None
     if self.processorProperties is not None and "SCRAPER_DOWNLOAD_IMAGES" in self.processorProperties:
       isLoadUrlsParam = bool(int(self.processorProperties["SCRAPER_DOWNLOAD_IMAGES"]))
     if self.processorProperties is not None and "IMAGE_RATION" in self.processorProperties:
       imageRation = float(self.processorProperties["IMAGE_RATION"])
 
     kArgs = {}
     if imageRation is not None:
       kArgs = {"title": u'', "source_url": u'', "config": None, "image_dimension_ration": imageRation}
     if self.userAgent is not None:
       kArgs["browser_user_agent"] = self.userAgent
       logger.debug(">>> NewspaperExtractor sets userAgent, is = " + str(self.userAgent))
 
 
     kArgs["isLoadUrls"] = isLoadUrlsParam
     if CONSTS.TAG_MEDIA in reslt.tags.keys() and not self.isTagNotFilled(reslt, CONSTS.TAG_MEDIA):
       logger.debug("!!! Tag 'media' already selected. Skipped")
       kArgs["isLoadUrls"] = False
 
     article = NewspaperWrapper(" ", **kArgs)
 
     article.html = resource.raw_html
     article.is_downloaded = True
 
     try:
       article.parse()
       self.addTag(result=reslt, tag_name=CONSTS.TAG_TITLE, tag_value=article.title)
       # self.addTag(result=reslt, tag_name=CONSTS.TAG_LINK, tag_value=article.canonical_link)
       self.addTag(result=reslt, tag_name=CONSTS.TAG_LINK, tag_value=resource.url)
       self.addTag(result=reslt, tag_name=CONSTS.TAG_DESCRIPTION, tag_value=article.summary)
       if hasattr(article, "published_date"):
         self.addTag(result=reslt, tag_name=CONSTS.TAG_PUB_DATE, tag_value=str(article.published_date))  # pylint: disable=E1101
       elif hasattr(article, "publish_date"):
         self.addTag(result=reslt, tag_name=CONSTS.TAG_PUB_DATE, tag_value=str(article.publish_date))
       self.addTag(result=reslt, tag_name=CONSTS.TAG_AUTHOR, tag_value=article.authors)
       self.addTag(result=reslt, tag_name=CONSTS.TAG_DC_DATE, tag_value=article.additional_data)
       self.addTag(result=reslt, tag_name=CONSTS.TAG_CONTENT_UTF8_ENCODED, tag_value=article.text)
       self.addTag(result=reslt, tag_name=CONSTS.TAG_KEYWORDS, tag_value=article.meta_keywords)
       imgList = self.imagesProcessing(article)
       logger.debug("!!! Tag 'media' imgList: %s", str(imgList))
       if imgList is not None:
         self.addTag(result=reslt, tag_name=CONSTS.TAG_MEDIA, tag_value=imgList)
 
     except Exception as err:
       ExceptionLog.handler(logger, err, 'Newspaper parse error:', (), \
                            {ExceptionLog.LEVEL_NAME_ERROR:ExceptionLog.LEVEL_VALUE_DEBUG})
     return reslt
 

Here is the call graph for this function:

◆ imagesProcessing()

def dc_processor.newspaper_extractor.NewspaperExtractor.imagesProcessing	(	self,
		article
	)

Definition at line 48 of file newspaper_extractor.py.

   def imagesProcessing(self, article):
     ret = None
     if article.top_img is not None:
       ret = []
       ret.append(article.top_img)
       ret.extend([x for x in article.imgs if x != article.top_img])
     else:
       ret = article.imgs
 
     if ret is not None:
       ret = self.tagValueValidate(CONSTS.TAG_MEDIA, ret)
       if ret is not None:
         localValue = self.imgDelimiter.join(ret)
         ret = []
         ret.append(localValue)
 
     return ret
 
 

Here is the call graph for this function:

Here is the caller graph for this function:

Member Data Documentation

◆ EXTRACTOR_NAME

string dc_processor.newspaper_extractor.NewspaperExtractor.EXTRACTOR_NAME = "Newspaper extractor"

static

Definition at line 29 of file newspaper_extractor.py.

◆ name

dc_processor.newspaper_extractor.NewspaperExtractor.name

Definition at line 42 of file newspaper_extractor.py.

◆ SECTION_NAME

string dc_processor.newspaper_extractor.NewspaperExtractor.SECTION_NAME = "extractor"

static

Definition at line 30 of file newspaper_extractor.py.

◆ userAgent

dc_processor.newspaper_extractor.NewspaperExtractor.userAgent

Definition at line 44 of file newspaper_extractor.py.

The documentation for this class was generated from the following file:

sources/hce/dc_processor/newspaper_extractor.py

Public Member Functions

Public Attributes

Static Public Attributes