Definition at line 24 of file ml_extractor.py.
◆ __init__()
def dc_processor.ml_extractor.MLExtractor.__init__(self, config, templ=None, domain=None, processorProperties=None)
Definition at line 27 of file ml_extractor.py.
27 def __init__(self, config, templ=None, domain=None, processorProperties=None):
28 BaseExtractor.__init__(self, config, templ, domain, processorProperties)
29 self.name = CONSTS.EXTRACTOR_NAME_ML
30 self.data["extractor"] = CONSTS.EXTRACTOR_NAME_ML
33 #set properties manually
34 #later it will be filled from db
35 #prepate algorithm dict
36 properties_dict = json.loads(CONSTS.ML_EXTRACTOR_PROPERTIES_JSON)
37 logger.debug("properties_dict: %s" % varDump(properties_dict))
38 self.properties = properties_dict[CONSTS.PROPERTIES_KEY]
40 logger.debug("Properties: %s", varDump(self.properties))
43 self.rankReading(self.__class__.__name__)
def varDump(obj, stringify=True, strTypeMaxLen=256, strTypeCutSuffix='...', stringifyType=1, ignoreErrors=False, objectsHash=None, depth=0, indent=2, ensure_ascii=False, maxDepth=10)
def __init__(self)
constructor
◆ extractTags()
def dc_processor.ml_extractor.MLExtractor.extractTags(self, resource, reslt)
Definition at line 74 of file ml_extractor.py.
74 def extractTags(self, resource, reslt):
76 xml = resource.raw_html
77 context = etree.iterparse(BytesIO(xml.encode("utf-8")), html=True, events=("start", "end"))
80 for action, elem in context:
81 if (elem.tag == "div" or elem.tag == "article") and action == "start":
82 child_tags = [child.tag for child in elem.getchildren()]
83 if elem.tag == "article" or self.processAttributes(elem):
90 for text in elem.itertext():
91 text = text.strip("\r\n\t ")
92 full_text = full_text + text if len(text) > 0 else full_text
93 X["data"].append({"value":full_text, "attr":attr})
94 except Exception, err:
95 logger.debug("Empty DOM. %s", str(err.message))
96 if len(X["data"]) > 0:
101 for xx in x["value"]:
105 I = [i for i, j in enumerate(L) if j == m]
106 self.addTag(result=reslt, tag_name=CONSTS.TAG_CONTENT_UTF8_ENCODED, tag_value=X["data"][I[0]]["value"])
108 logger.debug("Nothing to extarct")
109 except Exception as err:
110 ExceptionLog.handler(logger, err, 'Parse error:', (err))
◆ getXPathFromContent()
def dc_processor.ml_extractor.MLExtractor.getXPathFromContent(self, content)
Definition at line 114 of file ml_extractor.py.
114 def getXPathFromContent(self, content):
◆ processAttributes()
def dc_processor.ml_extractor.MLExtractor.processAttributes(self, elem)
Definition at line 46 of file ml_extractor.py.
46 def processAttributes(self, elem):
49 A = elem.getchildren()
51 childs = a.iter(tag="div")
55 words = re.sub(r'((?<=[a-z])[A-Z]|(?<!\A)[A-Z](?=[a-z]))', r' \1', items[1]).lower()
56 words = re.sub("_", " ", words)
57 words = re.sub("-", " ", words)
58 for word in words.split():
59 candidates.append(word)
60 if "article" in candidates or "content" in candidates:
65 if items[0] != "style":
66 words = re.sub(r'((?<=[a-z])[A-Z]|(?<!\A)[A-Z](?=[a-z]))', r' \1', items[1]).lower()
67 words = re.sub("_", " ", words)
68 words = re.sub("-", " ", words)
69 for word in words.split():
70 candidates.append(word)
71 return True if "article" in candidates or "content" in candidates or "text" in candidates else False
◆ name
dc_processor.ml_extractor.MLExtractor.name |
The documentation for this class was generated from the following file: