HCE Project Python language Distributed Tasks Manager Application, Distributed Crawler Application and client API bindings.  2.0.0-chaika
Hierarchical Cluster Engine Python language binding
dc_processor.base_extractor.BaseExtractor Class Reference
Inheritance diagram for dc_processor.base_extractor.BaseExtractor:
Collaboration diagram for dc_processor.base_extractor.BaseExtractor:

Public Member Functions

def __init__ (self, config, templ=None, domain=None, processorProperties=None)
 
def __str__ (self)
 
def __repr__ (self)
 
def loadScraperProperties (self, scraperPropFileName)
 
def isTagNotFilled (self, result, tagName)
 
def isTagValueNotEmpty (self, tagValue)
 
def tagValueElemValidate (self, tagValueElem, conditionElem)
 
def tagValueValidate (self, tagName, tagValue)
 
def addTag (self, result, tag_name, tag_value, xpath="", isDefaultTag=False, callAdjustment=True, tagType=None, allowNotFilled=False)
 
def calculateMetrics (self, response)
 
def rankReading (self, exctractorName)
 

Public Attributes

 config
 
 processorProperties
 
 name
 
 rank
 
 process_mode
 
 modules
 
 data
 
 db_dc_scraper_db
 
 DBConnector
 
 imgDelimiter
 
 tagsValidator
 

Static Public Attributes

 properties = None
 
dictionary tag
 
dictionary tagsMask
 

Detailed Description

Definition at line 101 of file base_extractor.py.

Constructor & Destructor Documentation

◆ __init__()

def dc_processor.base_extractor.BaseExtractor.__init__ (   self,
  config,
  templ = None,
  domain = None,
  processorProperties = None 
)

Definition at line 161 of file base_extractor.py.

def __init__(self, config, templ=None, domain=None, processorProperties=None):  # pylint: disable=W0612,W0613
    """Set up the extractor state from the application config and processor properties.

    Args:
        config: configuration object; ``config.get("Application", "property_file_name")``
            is read to locate the scraper properties file.
        templ: accepted for interface compatibility; not used by this constructor.
        domain: accepted for interface compatibility; not used by this constructor.
        processorProperties: optional container of processor options. The keys
            ``"SCRAPER_TAG_ITEMS_DELIMITER"`` and ``"tagsValidator"`` are honoured
            when present.
    """
    self.config = config
    self.processorProperties = processorProperties
    self.properties = None

    # Per-class scraper properties are loaded before the remaining attributes are set.
    propertyFile = self.config.get("Application", "property_file_name")
    if propertyFile is not None:
        self.loadScraperProperties(propertyFile)

    self.name = "Base extractor"
    self.rank = CONSTS.SCRAPER_RANK_INIT

    # support processing modes
    self.process_mode = CONSTS.PROCESS_ALGORITHM_REGULAR
    self.modules = {}

    self.data = {"extractor": "Base extractor", "data": "", "name": ""}
    self.db_dc_scraper_db = None
    self.DBConnector = None

    hasProps = processorProperties is not None

    # Delimiter falls back to a single space when no override is configured.
    if hasProps and "SCRAPER_TAG_ITEMS_DELIMITER" in processorProperties:
        self.imgDelimiter = processorProperties["SCRAPER_TAG_ITEMS_DELIMITER"]
    else:
        self.imgDelimiter = ' '

    # The validator is supplied as a JSON string; a malformed value is logged
    # and leaves tagsValidator as None.
    self.tagsValidator = None
    if hasProps and "tagsValidator" in processorProperties:
        try:
            self.tagsValidator = json.loads(processorProperties["tagsValidator"])
        except Exception as excp:
            ExceptionLog.handler(logger, excp, '>>> tagsValidator wronj json format', (),
                                 {ExceptionLog.LEVEL_NAME_ERROR: ExceptionLog.LEVEL_VALUE_DEBUG})
191 
192 
def __init__(self)
constructor
Definition: UIDGenerator.py:19

Member Function Documentation

◆ __repr__()

def dc_processor.base_extractor.BaseExtractor.__repr__ (   self)

Definition at line 197 of file base_extractor.py.

def __repr__(self):
    """Return the ``repr`` of the ``(name, rank)`` pair of this extractor."""
    identity = (self.name, self.rank)
    return repr(identity)
199 
200 

◆ __str__()

def dc_processor.base_extractor.BaseExtractor.__str__ (   self)

Definition at line 193 of file base_extractor.py.

def __str__(self):
    """Return the extractor name formatted as a string."""
    label = self.name
    return "%s" % (label)
195 
196 

◆ addTag()

def dc_processor.base_extractor.BaseExtractor.addTag (   self,
  result,
  tag_name,
  tag_value,
  xpath = "",
  isDefaultTag = False,
  callAdjustment = True,
  tagType = None,
  allowNotFilled = False 
)

Definition at line 291 of file base_extractor.py.

291  allowNotFilled=False):
292  ret = False
293  if tag_name not in result.blockedByXpathTags:
294  tag_value = self.tagValueValidate(tag_name, tag_value)
295  if tag_value is not None:
296  if callAdjustment:
297  try:
298  if tag_value and not isinstance(tag_value, list):
299  pass
300  if tag_value and isinstance(tag_value, list):
301  pass
302  tag_value = self.tag[tag_name](tag_value)
303  except Exception as err:
304  logger.debug('No tag name in result template: %s', str(err))
305 
306  result.errorCode = 0
307  result.errorMessage = ERR_MSG_OK
308 
309  if (tag_name not in result.tags.keys() and self.isTagValueNotEmpty(tag_value) is not None) or \
310  (self.isTagNotFilled(result, tag_name) and self.isTagValueNotEmpty(tag_value) is not None) or \
311  allowNotFilled:
312  data = {"extractor": "Base extractor", "data": "", "name": ""}
313  data["data"] = tag_value
314  data["name"] = tag_name
315  data["xpath"] = xpath
316  data["type"] = tagType
317  data["lang"] = dc_processor.scraper_result.Result.TAGS_LANG_DEFAULT
318  data["lang_suffix"] = dc_processor.scraper_result.Result.TAGS_LANG_SUFFIX_DEFAULT
319  data["extractor"] = self.__class__.__name__
320  result.tags[tag_name] = data
321  if isDefaultTag and tag_name not in result.defaultTags:
322  result.defaultTags.append(tag_name)
323  ret = True
324  else:
325  logger.debug(">>> BaseExtractor.addTag, tags in break list; tag is = " + tag_name)
326  return ret
327 
328 
Here is the call graph for this function:
Here is the caller graph for this function:

◆ calculateMetrics()

def dc_processor.base_extractor.BaseExtractor.calculateMetrics (   self,
  response 
)

Definition at line 331 of file base_extractor.py.

def calculateMetrics(self, response):
    """Ask every metric attached to ``response`` to compute its value from the tags.

    Args:
        response: object exposing ``metrics`` (an iterable of metric objects that
            provide ``calculateMetricValue``) and ``tags``.

    Raises:
        Exception: any error raised while iterating or calculating is logged
            through ``ExceptionLog.handler`` and then re-raised unchanged.
    """
    try:
        for metric in response.metrics:
            logger.debug("response.tags:\n%s\nmetric:\n%s", varDump(response.tags), varDump(metric))
            metric.calculateMetricValue(response.tags)
    except Exception as err:
        ExceptionLog.handler(logger, err, CONSTS.MSG_ERROR_CALC_METRICS)
        # Bare raise keeps the original traceback instead of restarting it here.
        raise
339 
340 
def varDump(obj, stringify=True, strTypeMaxLen=256, strTypeCutSuffix='...', stringifyType=1, ignoreErrors=False, objectsHash=None, depth=0, indent=2, ensure_ascii=False, maxDepth=10)
Definition: Utils.py:410
Here is the call graph for this function:

◆ isTagNotFilled()

def dc_processor.base_extractor.BaseExtractor.isTagNotFilled (   self,
  result,
  tagName 
)

Definition at line 217 of file base_extractor.py.

def isTagNotFilled(self, result, tagName):
    """Tell whether ``result.tags`` lacks non-blank content for ``tagName``.

    Args:
        result: object whose ``tags`` attribute maps tag names to stored values.
        tagName: name of the tag to inspect.

    Returns:
        ``False`` only when the stored value is a non-blank string, a non-empty
        list, or a dict whose ``"data"`` entry is a non-blank string or a list
        containing at least one non-blank string. ``True`` in every other case,
        including a missing tag, a dict without ``"data"`` and unsupported types.
    """
    if tagName not in result.tags:
        return True

    stored = result.tags[tagName]

    if isinstance(stored, basestring):
        return stored.strip() == ""

    if isinstance(stored, list):
        return len(stored) == 0

    if isinstance(stored, dict) and "data" in stored:
        payload = stored["data"]
        if isinstance(payload, basestring):
            return payload.strip() == ""
        if isinstance(payload, list):
            # Stops at the first non-blank element, exactly like the explicit loop did.
            return all(item.strip() == "" for item in payload)

    return True
236 
237 
Here is the caller graph for this function:

◆ isTagValueNotEmpty()

def dc_processor.base_extractor.BaseExtractor.isTagValueNotEmpty (   self,
  tagValue 
)

Definition at line 240 of file base_extractor.py.

def isTagValueNotEmpty(self, tagValue):
    """Return ``tagValue`` itself, or ``None`` when it is an empty list.

    Despite the name, the result is not a boolean: callers compare it against
    ``None``. Only an empty list is mapped to ``None``; every other value
    (including an empty string or ``None``) is passed through unchanged.
    """
    if isinstance(tagValue, list) and len(tagValue) == 0:
        return None
    return tagValue
250 
251 
Here is the caller graph for this function:

◆ loadScraperProperties()

def dc_processor.base_extractor.BaseExtractor.loadScraperProperties (   self,
  scraperPropFileName 
)

Definition at line 205 of file base_extractor.py.

def loadScraperProperties(self, scraperPropFileName):
    """Load this class's section of the scraper properties file into ``self.properties``.

    The file is parsed as JSON and the value found at
    ``[<class name>][CONSTS.PROPERTIES_KEY]`` is stored. Any failure (unreadable
    file, invalid JSON, missing key) is logged at debug level and leaves
    ``self.properties`` untouched.

    Args:
        scraperPropFileName: path of the JSON properties file; ``None`` is a no-op.
    """
    if scraperPropFileName is None:
        return

    try:
        with open(scraperPropFileName, "rb") as propFile:
            rawContent = propFile.read()
            allProperties = json.loads(rawContent)
            ownSection = allProperties[self.__class__.__name__]
            self.properties = ownSection[CONSTS.PROPERTIES_KEY]
    except Exception as excp:
        message = ">>> Some error with scraper property loads = " + str(excp)
        logger.debug(message)
213 
214 
Here is the caller graph for this function:

◆ rankReading()

def dc_processor.base_extractor.BaseExtractor.rankReading (   self,
  exctractorName 
)

Definition at line 343 of file base_extractor.py.

def rankReading(self, exctractorName):
    """Resolve ``self.rank`` for the given extractor name.

    Lookup order:
      1. The JSON document stored under ``CONSTS.RANK_KEY`` in
         ``self.processorProperties``; if it has an entry for
         ``exctractorName`` that entry becomes the rank.
      2. Otherwise ``self.properties[CONSTS.RANK_KEY]`` when available.
      3. Otherwise the current rank is kept.

    Args:
        exctractorName: key looked up inside the parsed rank document; ``None``
            skips the processor-properties lookup.
    """
    wasSet = False
    props = self.processorProperties
    if props is not None and exctractorName is not None and CONSTS.RANK_KEY in props:
        try:
            # The rank JSON lives in the RANK_KEY entry; parsing the whole
            # container always failed for mappings, so the override was silently
            # dropped. A plain string is still parsed as a whole, as before.
            if isinstance(props, basestring):
                rankSource = props
            else:
                rankSource = props[CONSTS.RANK_KEY]
            rankProp = json.loads(rankSource)
            if exctractorName in rankProp:
                self.rank = rankProp[exctractorName]
                wasSet = True
        except Exception:
            logger.debug(">>> Wrong json string in processorProperties[\"%s\"]", CONSTS.RANK_KEY)

    if not wasSet and self.properties is not None and CONSTS.RANK_KEY in self.properties:
        self.rank = self.properties[CONSTS.RANK_KEY]

    logger.debug(">>> Rank is : %s", str(self.rank))
359 
Here is the caller graph for this function:

◆ tagValueElemValidate()

def dc_processor.base_extractor.BaseExtractor.tagValueElemValidate (   self,
  tagValueElem,
  conditionElem 
)

Definition at line 254 of file base_extractor.py.

def tagValueElemValidate(self, tagValueElem, conditionElem):
    """Check a single tag value against one include/exclude regular-expression rule.

    Args:
        tagValueElem: the string to test.
        conditionElem: mapping with ``"type"`` (``"include"`` or ``"exclude"``)
            and, for those two types, ``"RE"`` holding the pattern.

    Returns:
        For ``"include"``: ``True`` only if the pattern matches at the start of
        the value. For ``"exclude"``: ``True`` only if it does not match. Any
        other type is accepted (``True``) without reading ``"RE"``.
    """
    ruleType = conditionElem["type"]
    if ruleType != "include" and ruleType != "exclude":
        return True

    matched = re.match(conditionElem["RE"], tagValueElem) is not None
    if ruleType == "include":
        return matched
    return not matched
264 
265 
Here is the caller graph for this function:

◆ tagValueValidate()

def dc_processor.base_extractor.BaseExtractor.tagValueValidate (   self,
  tagName,
  tagValue 
)

Definition at line 268 of file base_extractor.py.

def tagValueValidate(self, tagName, tagValue):
    """Filter a tag value through the validator rule configured for this extractor.

    A rule applies only when ``self.tagsValidator`` has an entry for
    ``self.name`` that contains ``tagName``; otherwise the value is returned
    unchanged.

    Args:
        tagName: name of the tag being validated.
        tagValue: a string, a list of elements, or any other value.

    Returns:
        For a list: the elements that pass the rule, or ``None`` if none pass.
        For a string: the string itself, or ``None`` if it fails the rule.
        Other types are returned as-is. If validation raises, the error is
        logged and whatever was accumulated so far is returned.
    """
    validated = tagValue
    rules = self.tagsValidator
    if rules is None or self.name not in rules or tagName not in rules[self.name]:
        return validated

    try:
        if isinstance(tagValue, list):
            # Built incrementally on purpose: a failure part-way through keeps
            # the elements accepted so far.
            validated = []
            for item in tagValue:
                if self.tagValueElemValidate(item, self.tagsValidator[self.name][tagName]):
                    validated.append(item)
            if not validated:
                validated = None
        elif isinstance(tagValue, basestring):
            accepted = self.tagValueElemValidate(tagValue, self.tagsValidator[self.name][tagName])
            if not accepted:
                validated = None
    except Exception as excp:
        ExceptionLog.handler(logger, excp, '>>> something wrong in tagValueValidate method', (),
                             {ExceptionLog.LEVEL_NAME_ERROR: ExceptionLog.LEVEL_VALUE_DEBUG})
    return validated
286 
287 
Here is the call graph for this function:
Here is the caller graph for this function:

Member Data Documentation

◆ config

dc_processor.base_extractor.BaseExtractor.config

Definition at line 162 of file base_extractor.py.

◆ data

dc_processor.base_extractor.BaseExtractor.data

Definition at line 177 of file base_extractor.py.

◆ db_dc_scraper_db

dc_processor.base_extractor.BaseExtractor.db_dc_scraper_db

Definition at line 178 of file base_extractor.py.

◆ DBConnector

dc_processor.base_extractor.BaseExtractor.DBConnector

Definition at line 179 of file base_extractor.py.

◆ imgDelimiter

dc_processor.base_extractor.BaseExtractor.imgDelimiter

Definition at line 181 of file base_extractor.py.

◆ modules

dc_processor.base_extractor.BaseExtractor.modules

Definition at line 175 of file base_extractor.py.

◆ name

dc_processor.base_extractor.BaseExtractor.name

Definition at line 170 of file base_extractor.py.

◆ process_mode

dc_processor.base_extractor.BaseExtractor.process_mode

Definition at line 174 of file base_extractor.py.

◆ processorProperties

dc_processor.base_extractor.BaseExtractor.processorProperties

Definition at line 163 of file base_extractor.py.

◆ properties

dc_processor.base_extractor.BaseExtractor.properties = None
static

Definition at line 103 of file base_extractor.py.

◆ rank

dc_processor.base_extractor.BaseExtractor.rank

Definition at line 171 of file base_extractor.py.

◆ tag

dictionary dc_processor.base_extractor.BaseExtractor.tag
static
Initial value:
= {CONSTS.TAG_MEDIA: adjustMedia,
CONSTS.TAG_CONTENT_UTF8_ENCODED: adjustContentUTF8Encoded,
CONSTS.TAG_PUB_DATE: adjustPubDate,
CONSTS.TAG_TITLE: adjustNone,
CONSTS.TAG_LINK: adjustLink,
CONSTS.TAG_DESCRIPTION: adjustNone,
CONSTS.TAG_DC_DATE: adjustNone,
CONSTS.TAG_AUTHOR: adjustNone,
CONSTS.TAG_GUID: adjustNone,
CONSTS.TAG_KEYWORDS: adjustNone,
CONSTS.TAG_MEDIA_THUMBNAIL: adjustNone,
CONSTS.TAG_ENCLOSURE: adjustNone,
CONSTS.TAG_MEDIA_CONTENT: adjustNone,
CONSTS.TAG_GOOGLE: adjustNone,
CONSTS.TAG_GOOGLE_TOTAL: adjustNone,
CONSTS.HTML_LANG: adjustNone
}

Definition at line 105 of file base_extractor.py.

◆ tagsMask

dictionary dc_processor.base_extractor.BaseExtractor.tagsMask
static

Definition at line 124 of file base_extractor.py.

◆ tagsValidator

dc_processor.base_extractor.BaseExtractor.tagsValidator

Definition at line 184 of file base_extractor.py.


The documentation for this class was generated from the following file: