HCE Project Python language Distributed Tasks Manager Application, Distributed Crawler Application and client API bindings.  2.0.0-chaika
Hierarchical Cluster Engine Python language binding
ml_extractor.py
Go to the documentation of this file.
1 """@package docstring
2  @file ml_extractor.py
3  @author Alexey <developers.hce@gmail.com>
4  @link http://hierarchical-cluster-engine.com/
5  @copyright Copyright &copy; 2013 IOIX Ukraine
6  @license http://hierarchical-cluster-engine.com/license/
7  @package HCE project node API
8  @since 0.1
9  """
10 
11 import re
12 from io import BytesIO
13 from lxml import etree
14 from dc_processor.base_extractor import BaseExtractor
15 import dc_processor.Constants as CONSTS
16 from app.Utils import varDump
17 from app.Utils import ExceptionLog
18 import app.Utils as Utils # pylint: disable=F0401
19 
20 # Logger initialization
21 logger = Utils.MPLogger().getLogger()
22 
23 
25 
26 
def __init__(self, config, templ=None, domain=None, processorProperties=None):
    """Initialize the ML extractor on top of BaseExtractor.

    All arguments are forwarded unchanged to BaseExtractor.__init__.

    @param config forwarded to BaseExtractor.__init__
    @param templ forwarded to BaseExtractor.__init__, default None
    @param domain forwarded to BaseExtractor.__init__, default None
    @param processorProperties forwarded to BaseExtractor.__init__, default None
    """
    BaseExtractor.__init__(self, config, templ, domain, processorProperties)

    # Identify this extractor both on the instance and in its data record.
    self.name = CONSTS.EXTRACTOR_NAME_ML
    self.data["extractor"] = CONSTS.EXTRACTOR_NAME_ML

    # TODO: properties are meant to be filled from the DB later. A former stub
    # loaded them from CONSTS.ML_EXTRACTOR_PROPERTIES_JSON under
    # CONSTS.PROPERTIES_KEY; it was disabled and has been removed from the body.
    # NOTE(review): self.properties is presumably set by BaseExtractor.__init__
    # -- confirm against the base class.
    logger.debug("Properties: %s", varDump(self.properties))

    # set module rank from module's properties
    self.rankReading(self.__class__.__name__)
44 
45 
def processAttributes(self, elem):
    """Tell from attribute values whether elem looks like a content container.

    Attribute values are split into lower-case words on camelCase boundaries,
    underscores, hyphens and whitespace. The element is rejected when any
    "div" found under its children (a child that is itself a "div" included)
    carries the word "article" or "content" in any attribute value, so that a
    wrapper around the real container is not selected. Otherwise it is
    accepted when one of its own attribute values (the "style" attribute
    excepted) contains the word "article", "content" or "text".

    @param elem element exposing items(), iteration over children and iter(tag=...)
    @return True if elem is a content candidate, False otherwise
    """
    camel_boundary = re.compile(r'((?<=[a-z])[A-Z]|(?<!\A)[A-Z](?=[a-z]))')

    def split_words(value):
        # "articleBody" -> "article body"; "_" and "-" also separate words.
        spaced = camel_boundary.sub(r' \1', value).lower()
        return spaced.replace("_", " ").replace("-", " ").split()

    # Iterate the element directly instead of the deprecated getchildren().
    nested_markers = ("article", "content")
    for child in elem:
        for div in child.iter(tag="div"):
            for _, value in div.items():
                if any(word in nested_markers for word in split_words(value)):
                    return False

    own_markers = ("article", "content", "text")
    for name, value in elem.items():
        # Inline CSS is skipped for the element's own attributes.
        if name == "style":
            continue
        if any(word in own_markers for word in split_words(value)):
            return True
    return False
72 
73 
def extractTags(self, resource, reslt):
    """Extract the main text block of an HTML page and add it as the content tag.

    The markup in resource.raw_html is parsed incrementally. Every "article"
    element, and every "div" accepted by processAttributes(), becomes a
    candidate whose value is the concatenation of its stripped text nodes
    after "script" elements have been cleared. The candidate with the longest
    text (the first one on ties) is added to reslt under
    CONSTS.TAG_CONTENT_UTF8_ENCODED via addTag().

    Errors are logged, never raised.

    @param resource object whose raw_html attribute holds the page markup as text
    @param reslt result object passed to addTag()
    @return reslt, the same object that was passed in
    """
    try:
        markup = resource.raw_html
        context = etree.iterparse(BytesIO(markup.encode("utf-8")), html=True, events=("start", "end"))  # pylint: disable=E1101
        candidates = []
        try:
            # NOTE(review): candidates are inspected on the "start" event; the
            # lxml documentation says children and text are only guaranteed to
            # be complete on "end" -- confirm this is intended.
            for action, elem in context:
                if action != "start" or elem.tag not in ("div", "article"):
                    continue
                if elem.tag != "article" and not self.processAttributes(elem):
                    continue
                attr = elem.items()
                # Drop script bodies so they do not end up in the text.
                for node in elem.iter():
                    if node.tag == "script":
                        node.clear()
                pieces = [text.strip("\r\n\t ") for text in elem.itertext()]
                candidates.append({"value": "".join(pieces), "attr": attr})
        except Exception as err:
            # Parsing problems are tolerated: use whatever was collected so far.
            logger.debug("Empty DOM. %s", err)
        if candidates:
            # max() returns the first of equally long candidates.
            best = max(candidates, key=lambda item: len(item["value"]))
            self.addTag(result=reslt, tag_name=CONSTS.TAG_CONTENT_UTF8_ENCODED, tag_value=best["value"])
        else:
            logger.debug("Nothing to extarct")
    except Exception as err:
        ExceptionLog.handler(logger, err, 'Parse error:', (err))
    return reslt
112 
113 
def getXPathFromContent(self, content):  # pylint: disable=W0613
    """Stub for deriving an XPath from a content fragment.

    Not implemented: the argument is ignored and None is always returned.

    @param content unused
    @return None
    """
    # Sketch kept from the original: xpath = //*[contains(., content)]
    return None
def __init__(self, config, templ=None, domain=None, processorProperties=None)
Definition: ml_extractor.py:27
def extractTags(self, resource, reslt)
Definition: ml_extractor.py:74
def varDump(obj, stringify=True, strTypeMaxLen=256, strTypeCutSuffix='...', stringifyType=1, ignoreErrors=False, objectsHash=None, depth=0, indent=2, ensure_ascii=False, maxDepth=10)
Definition: Utils.py:410
def addTag(self, result, tag_name, tag_value, xpath="", isDefaultTag=False, callAdjustment=True, tagType=None, allowNotFilled=False)