2 @file scraper_result.py 3 @author Alexey, bgv <developers.hce@gmail.com> 4 @link http://hierarchical-cluster-engine.com/ 5 @copyright Copyright © 2013 IOIX Ukraine 6 @license http://hierarchical-cluster-engine.com/license/ 7 @package HCE project node API 25 TAGS_LANG_DEFAULT =
"en" 26 TAGS_LANG_SUFFIX_DEFAULT =
"_language" 28 def __init__(self, config, resId, metrics=None):
29 self.
name =
"Scraper result object" 42 self.
article_tags = [tag
for tag
in config.options(
"article_tags")]
68 if value
is not None and value !=
"":
75 if tag[
"name"]
not in self.
defaultTags and "data" in value:
77 if isinstance(value[
"data"], basestring):
78 realValueString = value[
"data"]
79 elif isinstance(value[
"data"], list)
and len(value[
"data"]) > 0:
80 realValueString = value[
"data"][0]
81 if realValueString
is not None and realValueString.strip() !=
"":
82 if altTagsMask
is not None:
83 if tag[
"name"]
in altTagsMask:
85 elif tag[
"name"]
in BaseExtractor.tagsMask:
89 if container
is not None:
90 container.append(copy.copy(tag))
95 Metrics.fillMetricModulesList()
96 Metrics.metricsPrecalculate(self.
metrics, self)
101 data[
"resId"] = self.
resId 105 data[
"tagList"].append([])
109 self.
data[
"data"] = data
117 return json.dumps(self.
data, ensure_ascii=
False, sort_keys=
True, indent=4, separators=(
",",
":"))
122 for blockedTag
in result.blockedByXpathTags:
129 for tagName
in result.tags:
131 self.
tags[tagName] = result.tags[tagName]
132 if tagName
in result.defaultTags
and tagName
not in self.
defaultTags:
137 tmp = [item
for item
in items_list
if item !=
""]
143 if tmp[0][
"name"] ==
"content_encoded":
144 response = max(tmp, key=
lambda x: x[
"data"])
147 response = max(tmp, key=
lambda x: x[
"data"])
159 for key
in self.
tags:
160 if isinstance(self.
tags[key], basestring):
161 self.
tags[key] = self.
tags[key].strip()
162 if self.
tags[key] ==
"":
163 removeKeys.append(key)
164 elif isinstance(self.
tags[key], dict)
and "data" in self.
tags[key]:
165 if isinstance(self.
tags[key][
"data"], basestring):
166 self.
tags[key][
"data"] = self.
tags[key][
"data"].strip()
167 if self.
tags[key][
"data"] ==
"":
168 removeKeys.append(key)
169 elif isinstance(self.
tags[key][
"data"], list)
and len(self.
tags[key][
"data"]) > 0
and \
170 isinstance(self.
tags[key][
"data"][0], basestring):
171 self.
tags[key][
"data"][0] = self.
tags[key][
"data"][0].strip()
172 if self.
tags[key][
"data"][0] ==
"":
173 removeKeys.append(key)
175 removeKeys.append(key)
177 for key
in removeKeys:
179 logger.debug(
">>> Remove " + key +
" element because it empty")
185 if tagsName
in self.
tags:
186 if isinstance(self.
tags[tagsName], basestring):
187 if self.
tags[tagsName].strip() !=
"":
189 elif isinstance(self.
tags[tagsName], dict)
and "data" in self.
tags[tagsName]:
190 if isinstance(self.
tags[tagsName][
"data"], basestring):
191 if self.
tags[tagsName][
"data"].strip() !=
"":
193 elif isinstance(self.
tags[tagsName][
"data"], list):
194 for elem
in self.
tags[tagsName][
"data"]:
195 if isinstance(elem, basestring)
and elem !=
"":
def recalcTagMaskCount(self, container=None, altTagsMask=None)
def mergeResults(self, result)
def metricsPrecalculate(self)
def isTagFilled(self, tagsName)
def __init__(self, config, resId, metrics=None)
def getBestValue(self, items_list)