3 @author Alexey <developers.hce@gmail.com> 4 @link http://hierarchical-cluster-engine.com/ 5 @copyright Copyright © 2013 IOIX Ukraine 6 @license http://hierarchical-cluster-engine.com/license/ 7 @package HCE project node API 12 from io
import BytesIO
13 from lxml
import etree
27 def __init__(self, config, templ=None, domain=None, processorProperties=None):
28 BaseExtractor.__init__(self, config, templ, domain, processorProperties)
29 self.
name = CONSTS.EXTRACTOR_NAME_ML
30 self.
data[
"extractor"] = CONSTS.EXTRACTOR_NAME_ML
33 #set properties manually 34 #later it will be filled from db 35 #prepate algorithm dict 36 properties_dict = json.loads(CONSTS.ML_EXTRACTOR_PROPERTIES_JSON) 37 logger.debug("properties_dict: %s" % varDump(properties_dict)) 38 self.properties = properties_dict[CONSTS.PROPERTIES_KEY] 49 A = elem.getchildren()
51 childs = a.iter(tag=
"div")
55 words = re.sub(
r'((?<=[a-z])[A-Z]|(?<!\A)[A-Z](?=[a-z]))',
r' \1', items[1]).lower()
56 words = re.sub(
"_",
" ", words)
57 words = re.sub(
"-",
" ", words)
58 for word
in words.split():
59 candidates.append(word)
60 if "article" in candidates
or "content" in candidates:
65 if items[0] !=
"style":
66 words = re.sub(
r'((?<=[a-z])[A-Z]|(?<!\A)[A-Z](?=[a-z]))',
r' \1', items[1]).lower()
67 words = re.sub(
"_",
" ", words)
68 words = re.sub(
"-",
" ", words)
69 for word
in words.split():
70 candidates.append(word)
71 return True if "article" in candidates
or "content" in candidates
or "text" in candidates
else False 76 xml = resource.raw_html
77 context = etree.iterparse(BytesIO(xml.encode(
"utf-8")), html=
True, events=(
"start",
"end"))
80 for action, elem
in context:
81 if (elem.tag ==
"div" or elem.tag ==
"article")
and action ==
"start":
82 child_tags = [child.tag
for child
in elem.getchildren()]
90 for text
in elem.itertext():
91 text = text.strip(
"\r\n\t ")
92 full_text = full_text + text
if len(text) > 0
else full_text
93 X[
"data"].append({
"value":full_text,
"attr":attr})
94 except Exception, err:
95 logger.debug(
"Empty DOM. %s", str(err.message))
96 if len(X[
"data"]) > 0:
101 for xx
in x[
"value"]:
105 I = [i
for i, j
in enumerate(L)
if j == m]
106 self.
addTag(result=reslt, tag_name=CONSTS.TAG_CONTENT_UTF8_ENCODED, tag_value=X[
"data"][I[0]][
"value"])
108 logger.debug(
"Nothing to extarct")
109 except Exception
as err:
110 ExceptionLog.handler(logger, err,
'Parse error:', (err))
def varDump(obj, stringify=True, strTypeMaxLen=256, strTypeCutSuffix='...', stringifyType=1, ignoreErrors=False, objectsHash=None, depth=0, indent=2, ensure_ascii=False, maxDepth=10)