HCE Project Python language Distributed Tasks Manager Application, Distributed Crawler Application and client API bindings.  2.0.0-chaika
Hierarchical Cluster Engine Python language binding
dc_processor.newspaper_extractor.NewspaperExtractor Class Reference
Inheritance diagram for dc_processor.newspaper_extractor.NewspaperExtractor:
Collaboration diagram for dc_processor.newspaper_extractor.NewspaperExtractor:

Public Member Functions

def __init__ (self, config, templ=None, domain=None, processorProperties=None)
 
def imagesProcessing (self, article)
 
def extractTags (self, resource, reslt)
 
- Public Member Functions inherited from dc_processor.base_extractor.BaseExtractor
def __init__ (self, config, templ=None, domain=None, processorProperties=None)
 
def __str__ (self)
 
def __repr__ (self)
 
def loadScraperProperties (self, scraperPropFileName)
 
def isTagNotFilled (self, result, tagName)
 
def isTagValueNotEmpty (self, tagValue)
 
def tagValueElemValidate (self, tagValueElem, conditionElem)
 
def tagValueValidate (self, tagName, tagValue)
 
def addTag (self, result, tag_name, tag_value, xpath="", isDefaultTag=False, callAdjustment=True, tagType=None, allowNotFilled=False)
 
def calculateMetrics (self, response)
 
def rankReading (self, exctractorName)
 

Public Attributes

 name
 
 userAgent
 
- Public Attributes inherited from dc_processor.base_extractor.BaseExtractor
 config
 
 processorProperties
 
 name
 
 rank
 
 process_mode
 
 modules
 
 data
 
 db_dc_scraper_db
 
 DBConnector
 
 imgDelimiter
 
 tagsValidator
 

Static Public Attributes

string EXTRACTOR_NAME = "Newspaper extractor"
 
string SECTION_NAME = "extractor"
 
- Static Public Attributes inherited from dc_processor.base_extractor.BaseExtractor
 properties = None
 
dictionary tag
 
dictionary tagsMask
 

Detailed Description

Definition at line 25 of file newspaper_extractor.py.

Constructor & Destructor Documentation

◆ __init__()

def dc_processor.newspaper_extractor.NewspaperExtractor.__init__ (   self,
  config,
  templ = None,
  domain = None,
  processorProperties = None 
)

Definition at line 33 of file newspaper_extractor.py.

def __init__(self, config, templ=None, domain=None, processorProperties=None):
    """Set up the extractor on top of the common BaseExtractor initialisation.

    Args:
        config: forwarded unchanged to BaseExtractor.__init__.
        templ: forwarded unchanged to BaseExtractor.__init__.
        domain: forwarded unchanged to BaseExtractor.__init__.
        processorProperties: optional mapping of processor properties. Only
            the "EXTRACTOR_USER_AGENT" key is read here; the mapping is also
            forwarded to BaseExtractor.__init__. May be None.
    """
    BaseExtractor.__init__(self, config, templ, domain, processorProperties)

    logger.debug("Properties: %s", varDump(self.properties))

    # set module rank from module's properties
    self.rankReading(self.__class__.__name__)

    # Reuse the class-level constants instead of repeating their literal values.
    self.name = self.EXTRACTOR_NAME
    self.data[self.SECTION_NAME] = self.EXTRACTOR_NAME

    # processorProperties defaults to None, and a membership test on None
    # raises TypeError, so guard before looking the key up.
    if processorProperties is not None and "EXTRACTOR_USER_AGENT" in processorProperties:
        self.userAgent = processorProperties["EXTRACTOR_USER_AGENT"]
    else:
        self.userAgent = None
46 
47 
Referenced definitions (cross-reference tooltips for the listing above):
def varDump(obj, stringify=True, strTypeMaxLen=256, strTypeCutSuffix='...', stringifyType=1, ignoreErrors=False, objectsHash=None, depth=0, indent=2, ensure_ascii=False, maxDepth=10)
Definition: Utils.py:410
def __init__(self)
constructor
Definition: UIDGenerator.py:19
Here is the call graph for this function:

Member Function Documentation

◆ extractTags()

def dc_processor.newspaper_extractor.NewspaperExtractor.extractTags (   self,
  resource,
  reslt 
)

Definition at line 67 of file newspaper_extractor.py.

def extractTags(self, resource, reslt):
    """Parse resource.raw_html with NewspaperWrapper and add the found tags to reslt.

    The whole extraction runs under a SIGALRM-based execution time limit taken
    from the 'EXTRACTOR_NEWSPAPER_MAX_EXECUTION' processor property, falling
    back to CONSTS.TIME_EXECUTION_LIMIT when the property is absent.

    Args:
        resource: object providing `raw_html` (the page markup) and `url`.
        reslt: result object providing a `tags` mapping; tags are added to it
            through self.addTag.

    Returns:
        The same `reslt` object. Exceptions raised while parsing or adding
        tags are logged through ExceptionLog.handler and not re-raised.
    """
    # Treat missing properties as empty so every lookup below is safe; a
    # membership test on None would raise TypeError.
    properties = self.processorProperties if self.processorProperties is not None else {}

    # support time execution limit
    # NOTE(review): signal_handler is defined elsewhere in the module;
    # presumably it raises to abort the extraction - confirm.
    signal.signal(signal.SIGALRM, signal_handler)
    if 'EXTRACTOR_NEWSPAPER_MAX_EXECUTION' in properties:
        timeout = int(properties['EXTRACTOR_NEWSPAPER_MAX_EXECUTION'])
    else:
        timeout = CONSTS.TIME_EXECUTION_LIMIT
    signal.alarm(timeout)
    logger.debug("Max execution time signal handler set timeout as: %s", timeout)

    try:
        isLoadUrlsParam = False
        imageRation = None
        if "SCRAPER_DOWNLOAD_IMAGES" in properties:
            isLoadUrlsParam = bool(int(properties["SCRAPER_DOWNLOAD_IMAGES"]))
        if "IMAGE_RATION" in properties:
            imageRation = float(properties["IMAGE_RATION"])

        kArgs = {}
        if imageRation is not None:
            kArgs = {"title": u'', "source_url": u'', "config": None, "image_dimension_ration": imageRation}
        if self.userAgent is not None:
            kArgs["browser_user_agent"] = self.userAgent
            logger.debug(">>> NewspaperExtractor sets userAgent, is = %s", self.userAgent)

        kArgs["isLoadUrls"] = isLoadUrlsParam
        # Do not load image URLs again when the 'media' tag is already filled.
        if CONSTS.TAG_MEDIA in reslt.tags and not self.isTagNotFilled(reslt, CONSTS.TAG_MEDIA):
            logger.debug("!!! Tag 'media' already selected. Skipped")
            kArgs["isLoadUrls"] = False

        article = NewspaperWrapper(" ", **kArgs)

        # Feed the already fetched markup to the article and mark it as
        # downloaded before parsing.
        article.html = resource.raw_html
        article.is_downloaded = True

        try:
            article.parse()
            self.addTag(result=reslt, tag_name=CONSTS.TAG_TITLE, tag_value=article.title)
            # The resource URL is used for the link tag, not the article's canonical link.
            self.addTag(result=reslt, tag_name=CONSTS.TAG_LINK, tag_value=resource.url)
            self.addTag(result=reslt, tag_name=CONSTS.TAG_DESCRIPTION, tag_value=article.summary)
            # The date attribute is looked up under both spellings.
            if hasattr(article, "published_date"):
                self.addTag(result=reslt, tag_name=CONSTS.TAG_PUB_DATE, tag_value=str(article.published_date))  # pylint: disable=E1101
            elif hasattr(article, "publish_date"):
                self.addTag(result=reslt, tag_name=CONSTS.TAG_PUB_DATE, tag_value=str(article.publish_date))
            self.addTag(result=reslt, tag_name=CONSTS.TAG_AUTHOR, tag_value=article.authors)
            self.addTag(result=reslt, tag_name=CONSTS.TAG_DC_DATE, tag_value=article.additional_data)
            self.addTag(result=reslt, tag_name=CONSTS.TAG_CONTENT_UTF8_ENCODED, tag_value=article.text)
            self.addTag(result=reslt, tag_name=CONSTS.TAG_KEYWORDS, tag_value=article.meta_keywords)
            imgList = self.imagesProcessing(article)
            logger.debug("!!! Tag 'media' imgList: %s", str(imgList))
            if imgList is not None:
                self.addTag(result=reslt, tag_name=CONSTS.TAG_MEDIA, tag_value=imgList)
        except Exception as err:
            ExceptionLog.handler(logger, err, 'Newspaper parse error:', (),
                                 {ExceptionLog.LEVEL_NAME_ERROR: ExceptionLog.LEVEL_VALUE_DEBUG})
    finally:
        # Cancel the pending alarm; otherwise SIGALRM could still fire after
        # this method has returned and interrupt unrelated code.
        signal.alarm(0)

    return reslt
125 
Here is the call graph for this function:

◆ imagesProcessing()

def dc_processor.newspaper_extractor.NewspaperExtractor.imagesProcessing (   self,
  article 
)

Definition at line 48 of file newspaper_extractor.py.

def imagesProcessing(self, article):
    """Build the value of the media tag from the images of a parsed article.

    When article.top_img is set it is placed first, followed by every entry of
    article.imgs that differs from it; otherwise article.imgs is used as is.
    The candidates are passed through self.tagValueValidate for
    CONSTS.TAG_MEDIA and the validated values are joined with
    self.imgDelimiter.

    Args:
        article: object providing `top_img` and `imgs`.

    Returns:
        A one-element list holding the joined string, or None when there are
        no candidates (both `top_img` and `imgs` are None) or when
        tagValueValidate returns None.
    """
    topImage = article.top_img
    if topImage is None:
        candidates = article.imgs
    else:
        # Top image goes first; skip its duplicates in the remaining images.
        candidates = [topImage]
        for image in article.imgs:
            if image != topImage:
                candidates.append(image)

    if candidates is None:
        return None

    validated = self.tagValueValidate(CONSTS.TAG_MEDIA, candidates)
    if validated is None:
        return None

    return [self.imgDelimiter.join(validated)]
65 
66 
Definition: join.py:1
Here is the call graph for this function:
Here is the caller graph for this function:

Member Data Documentation

◆ EXTRACTOR_NAME

string dc_processor.newspaper_extractor.NewspaperExtractor.EXTRACTOR_NAME = "Newspaper extractor"
static

Definition at line 29 of file newspaper_extractor.py.

◆ name

dc_processor.newspaper_extractor.NewspaperExtractor.name

Definition at line 42 of file newspaper_extractor.py.

◆ SECTION_NAME

string dc_processor.newspaper_extractor.NewspaperExtractor.SECTION_NAME = "extractor"
static

Definition at line 30 of file newspaper_extractor.py.

◆ userAgent

dc_processor.newspaper_extractor.NewspaperExtractor.userAgent

Definition at line 44 of file newspaper_extractor.py.


The documentation for this class was generated from the following file: newspaper_extractor.py