HCE Project Python language Distributed Tasks Manager Application, Distributed Crawler Application and client API bindings.  2.0.0-chaika
Hierarchical Cluster Engine Python language binding
dc_processor.ml_extractor.MLExtractor Class Reference
Inheritance diagram for dc_processor.ml_extractor.MLExtractor:
Collaboration diagram for dc_processor.ml_extractor.MLExtractor:

Public Member Functions

def __init__ (self, config, templ=None, domain=None, processorProperties=None)
 
def processAttributes (self, elem)
 
def extractTags (self, resource, reslt)
 
def getXPathFromContent (self, content)
 
- Public Member Functions inherited from dc_processor.base_extractor.BaseExtractor
def __init__ (self, config, templ=None, domain=None, processorProperties=None)
 
def __str__ (self)
 
def __repr__ (self)
 
def loadScraperProperties (self, scraperPropFileName)
 
def isTagNotFilled (self, result, tagName)
 
def isTagValueNotEmpty (self, tagValue)
 
def tagValueElemValidate (self, tagValueElem, conditionElem)
 
def tagValueValidate (self, tagName, tagValue)
 
def addTag (self, result, tag_name, tag_value, xpath="", isDefaultTag=False, callAdjustment=True, tagType=None, allowNotFilled=False)
 
def calculateMetrics (self, response)
 
def rankReading (self, exctractorName)
 

Public Attributes

 name
 
- Public Attributes inherited from dc_processor.base_extractor.BaseExtractor
 config
 
 processorProperties
 
 name
 
 rank
 
 process_mode
 
 modules
 
 data
 
 db_dc_scraper_db
 
 DBConnector
 
 imgDelimiter
 
 tagsValidator
 

Additional Inherited Members

- Static Public Attributes inherited from dc_processor.base_extractor.BaseExtractor
 properties = None
 
dictionary tag
 
dictionary tagsMask
 

Detailed Description

Definition at line 24 of file ml_extractor.py.

Constructor & Destructor Documentation

◆ __init__()

def dc_processor.ml_extractor.MLExtractor.__init__ (   self,
  config,
  templ = None,
  domain = None,
  processorProperties = None 
)

Definition at line 27 of file ml_extractor.py.

def __init__(self, config, templ=None, domain=None, processorProperties=None):
    """Initialise the base extractor and label this instance as the ML extractor.

    All four arguments are forwarded unchanged to ``BaseExtractor.__init__``.
    Afterwards the extractor name is stored both in ``self.name`` and in
    ``self.data["extractor"]``, the current properties are written to the
    debug log, and the rank is loaded through ``rankReading`` using this
    class's name.

    Args:
        config: forwarded to ``BaseExtractor.__init__``.
        templ: forwarded to ``BaseExtractor.__init__``.
        domain: forwarded to ``BaseExtractor.__init__``.
        processorProperties: forwarded to ``BaseExtractor.__init__``.
    """
    BaseExtractor.__init__(self, config, templ, domain, processorProperties)
    self.name = CONSTS.EXTRACTOR_NAME_ML
    self.data["extractor"] = CONSTS.EXTRACTOR_NAME_ML

    # Disabled stub, kept for reference (it used to live in a bare string
    # literal): set the properties manually until they are filled from the db.
    #   # prepare algorithm dict
    #   properties_dict = json.loads(CONSTS.ML_EXTRACTOR_PROPERTIES_JSON)
    #   logger.debug("properties_dict: %s" % varDump(properties_dict))
    #   self.properties = properties_dict[CONSTS.PROPERTIES_KEY]
    logger.debug("Properties: %s", varDump(self.properties))

    # set module rank from module's properties
    self.rankReading(self.__class__.__name__)
44 
45 
def varDump(obj, stringify=True, strTypeMaxLen=256, strTypeCutSuffix='...', stringifyType=1, ignoreErrors=False, objectsHash=None, depth=0, indent=2, ensure_ascii=False, maxDepth=10)
Definition: Utils.py:410
def __init__(self)
constructor
Definition: UIDGenerator.py:19

Member Function Documentation

◆ extractTags()

def dc_processor.ml_extractor.MLExtractor.extractTags (   self,
  resource,
  reslt 
)

Definition at line 74 of file ml_extractor.py.

def extractTags(self, resource, reslt):
    """Pick the largest text container of a page and store it as a tag.

    ``resource.raw_html`` is encoded to UTF-8 and parsed incrementally as
    HTML.  Every ``<article>`` element, and every ``<div>`` accepted by
    ``processAttributes``, becomes a candidate whose value is the
    concatenation of its whitespace-stripped text nodes (``<script>``
    elements below it are cleared first).  The candidate with the longest
    text -- the first one on ties -- is passed to ``addTag`` under
    ``CONSTS.TAG_CONTENT_UTF8_ENCODED``.

    Args:
        resource: object exposing the page markup as ``raw_html``.
        reslt: result object handed to ``addTag``.

    Returns:
        ``reslt``.  Errors raised while parsing are logged at debug level;
        any other error is passed to ``ExceptionLog.handler``.
    """
    try:
        markup = resource.raw_html
        stream = BytesIO(markup.encode("utf-8"))
        context = etree.iterparse(stream, html=True, events=("start", "end"))  # pylint: disable=E1101
        candidates = []
        try:
            # NOTE(review): candidates are inspected on "start" events; lxml
            # only guarantees the tag and attributes at that point, so the
            # subtree text may be incomplete -- confirm this is intended.
            for action, elem in context:
                if action != "start" or elem.tag not in ("div", "article"):
                    continue
                if elem.tag != "article" and not self.processAttributes(elem):
                    continue
                attr = elem.items()
                # Clear scripts so their source does not count as page text.
                for node in elem.iter():
                    if node.tag == "script":
                        node.clear()
                full_text = "".join(text.strip("\r\n\t ") for text in elem.itertext())
                candidates.append({"value": full_text, "attr": attr})
        except Exception as err:
            # Best effort: keep whatever candidates were collected so far.
            logger.debug("Empty DOM. %s", err)
        if candidates:
            # max() returns the first maximal item, i.e. the earliest of the
            # longest candidates.
            best = max(candidates, key=lambda item: len(item["value"]))
            self.addTag(result=reslt, tag_name=CONSTS.TAG_CONTENT_UTF8_ENCODED, tag_value=best["value"])
        else:
            logger.debug("Nothing to extarct")
    except Exception as err:
        ExceptionLog.handler(logger, err, 'Parse error:', (err))
    return reslt
112 
113 
Here is the call graph for this function:

◆ getXPathFromContent()

def dc_processor.ml_extractor.MLExtractor.getXPathFromContent (   self,
  content 
)

Definition at line 114 of file ml_extractor.py.

def getXPathFromContent(self, content):  # pylint: disable=W0613
    """Return an XPath locating ``content``; not implemented, always ``None``.

    Args:
        content: currently ignored.

    Returns:
        None.
    """
    # Candidate expression noted for a future implementation:
    #   //*[contains(., content)]
    return None
118 

◆ processAttributes()

def dc_processor.ml_extractor.MLExtractor.processAttributes (   self,
  elem 
)

Definition at line 46 of file ml_extractor.py.

def processAttributes(self, elem):
    """Tell whether ``elem`` itself looks like the page's content container.

    Attribute values are split into lower-case words on camelCase
    boundaries, underscores and hyphens before being compared.

    Args:
        elem: element exposing ``items()`` and iteration over its children,
            whose children in turn support ``iter(tag="div")``.

    Returns:
        False as soon as any ``div`` found at or below one of ``elem``'s
        children carries an attribute word "article" or "content".
        Otherwise True only when one of ``elem``'s own attributes, ``style``
        excluded, contains the word "article", "content" or "text".
    """
    def attribute_words(value):
        # Break camelCase first ("articleBody" -> "article Body"), lower-case,
        # then treat "_" and "-" as word separators as well.
        spaced = re.sub(r'((?<=[a-z])[A-Z]|(?<!\A)[A-Z](?=[a-z]))', r' \1', value).lower()
        return spaced.replace("_", " ").replace("-", " ").split()

    nested_markers = ("article", "content")
    # Iterating the element yields its direct children; this replaces the
    # deprecated getchildren() call.
    for child in elem:
        for div in child.iter(tag="div"):
            # Every attribute counts here, "style" included.
            for _, value in div.items():
                if any(word in nested_markers for word in attribute_words(value)):
                    return False

    own_words = set()
    for name, value in elem.items():
        if name != "style":
            own_words.update(attribute_words(value))
    return any(marker in own_words for marker in ("article", "content", "text"))
72 
73 
Here is the caller graph for this function:

Member Data Documentation

◆ name

dc_processor.ml_extractor.MLExtractor.name

Definition at line 29 of file ml_extractor.py.


The documentation for this class was generated from the following file: