HCE Project Python language Distributed Tasks Manager Application, Distributed Crawler Application and client API bindings.  2.0.0-chaika
Hierarchical Cluster Engine Python language binding
dc_processor.goose_extractor.GooseExtractor Class Reference
Inheritance diagram for dc_processor.goose_extractor.GooseExtractor:
Collaboration diagram for dc_processor.goose_extractor.GooseExtractor:

Public Member Functions

def __init__ (self, config, templ=None, domain=None, processorProperties=None)
 
def extractTags (self, resource, reslt)
 
- Public Member Functions inherited from dc_processor.base_extractor.BaseExtractor
def __init__ (self, config, templ=None, domain=None, processorProperties=None)
 
def __str__ (self)
 
def __repr__ (self)
 
def loadScraperProperties (self, scraperPropFileName)
 
def isTagNotFilled (self, result, tagName)
 
def isTagValueNotEmpty (self, tagValue)
 
def tagValueElemValidate (self, tagValueElem, conditionElem)
 
def tagValueValidate (self, tagName, tagValue)
 
def addTag (self, result, tag_name, tag_value, xpath="", isDefaultTag=False, callAdjustment=True, tagType=None, allowNotFilled=False)
 
def calculateMetrics (self, response)
 
def rankReading (self, exctractorName)
 

Public Attributes

 name
 
- Public Attributes inherited from dc_processor.base_extractor.BaseExtractor
 config
 
 processorProperties
 
 name
 
 rank
 
 process_mode
 
 modules
 
 data
 
 db_dc_scraper_db
 
 DBConnector
 
 imgDelimiter
 
 tagsValidator
 

Static Public Attributes

 goose = None
 
- Static Public Attributes inherited from dc_processor.base_extractor.BaseExtractor
 properties = None
 
dictionary tag
 
dictionary tagsMask
 

Detailed Description

Definition at line 25 of file goose_extractor.py.

Constructor & Destructor Documentation

◆ __init__()

def dc_processor.goose_extractor.GooseExtractor.__init__ (   self,
  config,
  templ = None,
  domain = None,
  processorProperties = None 
)

Definition at line 28 of file goose_extractor.py.

def __init__(self, config, templ=None, domain=None, processorProperties=None):
    """Set up the Goose-backed extractor.

    Runs the BaseExtractor constructor, sets the extractor name, reads the
    module rank through rankReading() and creates the Goose instance that
    is stored in self.goose.

    Args:
        config: forwarded unchanged to BaseExtractor.__init__.
        templ: forwarded unchanged to BaseExtractor.__init__.
        domain: forwarded unchanged to BaseExtractor.__init__.
        processorProperties: optional mapping. When it holds a non-None
            "EXTRACTOR_USER_AGENT" entry, that value is handed to Goose as
            'browser_user_agent'. May be None (the declared default).

    Raises:
        Exception: any error raised during set-up is reported through
            ExceptionLog.handler and then re-raised unchanged.
    """
    try:
        BaseExtractor.__init__(self, config, templ, domain, processorProperties)
        logger.debug("Properties: %s", varDump(self.properties))
        self.name = "Goose extractor"
        # set module rank from module's properties
        self.rankReading(self.__class__.__name__)

        # processorProperties defaults to None; a bare membership test on
        # None would raise TypeError, so guard before looking the key up.
        userAgent = None
        if processorProperties is not None and "EXTRACTOR_USER_AGENT" in processorProperties:
            userAgent = processorProperties["EXTRACTOR_USER_AGENT"]

        if userAgent is not None:
            self.goose = Goose({'browser_user_agent': userAgent})
            logger.debug(">>> GooseExtractor sets userAgent, is %s", str(userAgent))
        else:
            self.goose = Goose()
        self.data["extractor"] = "Goose extractor"
    except Exception as err:
        ExceptionLog.handler(logger, err, "Goose extractor constructor error: possible /tmp not permitted to write", (),
                             {ExceptionLog.LEVEL_NAME_ERROR: ExceptionLog.LEVEL_VALUE_DEBUG})
        raise
45 
46 
Referenced definitions:
def varDump(obj, stringify=True, strTypeMaxLen=256, strTypeCutSuffix='...', stringifyType=1, ignoreErrors=False, objectsHash=None, depth=0, indent=2, ensure_ascii=False, maxDepth=10) (Definition: Utils.py:410)
def __init__(self): constructor (Definition: UIDGenerator.py:19)
Here is the call graph for this function:

Member Function Documentation

◆ extractTags()

def dc_processor.goose_extractor.GooseExtractor.extractTags (   self,
  resource,
  reslt 
)

Definition at line 47 of file goose_extractor.py.

def extractTags(self, resource, reslt):
    """Extract article tags from a resource with Goose and add them to reslt.

    Arms a SIGALRM timer before the extraction, runs Goose over the
    resource's raw HTML, and adds the title, link, publication date,
    dc_date, content, keywords and (unless already filled) media tags via
    addTag(). Extraction errors are logged at debug level and swallowed, so
    the call is best-effort.

    Args:
        resource: object exposing ``raw_html`` and ``url``.
        reslt: result object exposing ``tags``; it is passed to addTag()
            and isTagNotFilled().

    Returns:
        The same ``reslt`` object that was passed in.
    """
    # support time execution limit
    signal.signal(signal.SIGALRM, signal_handler)
    # NOTE(review): treats a None processorProperties as "no overrides";
    # confirm whether BaseExtractor already normalises it.
    properties = self.processorProperties or {}
    if 'EXTRACTOR_GOOSE_MAX_EXECUTION' in properties:
        t = int(properties['EXTRACTOR_GOOSE_MAX_EXECUTION'])
    else:
        t = CONSTS.TIME_EXECUTION_LIMIT
    signal.alarm(t)
    logger.debug("Max execution time signal handler set timeout as: %s", str(t))

    try:
        article = self.goose.extract(raw_html=str(resource.raw_html), url=resource.url)
        self.addTag(result=reslt, tag_name=CONSTS.TAG_TITLE, tag_value=article.title)
        self.addTag(result=reslt, tag_name=CONSTS.TAG_LINK, tag_value=article.canonical_link)
        self.addTag(result=reslt, tag_name=CONSTS.TAG_PUB_DATE, tag_value=article.publish_date)
        # self.addTag(result=reslt, tag_name=CONSTS.TAG_GUID, tag_value=article.link_hash)
        self.addTag(result=reslt, tag_name=CONSTS.TAG_DC_DATE, tag_value=article.additional_data)
        self.addTag(result=reslt, tag_name=CONSTS.TAG_CONTENT_UTF8_ENCODED, tag_value=article.cleaned_text)
        self.addTag(result=reslt, tag_name=CONSTS.TAG_KEYWORDS, tag_value=article.meta_keywords)

        if CONSTS.TAG_MEDIA in reslt.tags and not self.isTagNotFilled(reslt, CONSTS.TAG_MEDIA):
            logger.debug("!!! Tag 'media' already selected. Skipped... value = %s", str(reslt.tags[CONSTS.TAG_MEDIA]))
        else:
            # getattr() does not follow dotted paths: "top_image.src" was
            # looked up as one attribute name and always gave None.
            # Resolve the two levels separately instead.
            topImage = getattr(article, "top_image", None)
            mediaSrc = getattr(topImage, "src", None)
            self.addTag(result=reslt, tag_name=CONSTS.TAG_MEDIA, tag_value=mediaSrc)

    except IOError as err:
        logger.debug("Goose open file error. It may be unsupported encoding like jp. Error: %s", str(err))
    except Exception as err:
        logger.debug("Goose parse error. Error: %s", str(err))
    finally:
        # Cancel the pending alarm so it cannot fire later in unrelated code.
        signal.alarm(0)

    return reslt
82 
Here is the call graph for this function:

Member Data Documentation

◆ goose

dc_processor.goose_extractor.GooseExtractor.goose = None
static

Definition at line 26 of file goose_extractor.py.

◆ name

dc_processor.goose_extractor.GooseExtractor.name

Definition at line 32 of file goose_extractor.py.


The documentation for this class was generated from the following file: