HCE Project Python language Distributed Tasks Manager Application, Distributed Crawler Application and client API bindings.  2.0.0-chaika
Hierarchical Cluster Engine Python language binding
scraper_result.py
Go to the documentation of this file.
1 """@package docstring
2  @file scraper_result.py
3  @author Alexey, bgv <developers.hce@gmail.com>
4  @link http://hierarchical-cluster-engine.com/
5  @copyright Copyright &copy; 2013 IOIX Ukraine
6  @license http://hierarchical-cluster-engine.com/license/
7  @package HCE project node API
8  @since 0.1
9 """
10 import json
11 import time
12 import copy
13 import dc_processor.Constants as CONSTS
14 from dc_processor.base_extractor import BaseExtractor
15 from app.Metrics import Metrics
16 from app.ContentHashCalculator import ContentHashCalculator
17 # from app.Utils import varDump
18 import app.Utils as Utils # pylint: disable=F0401
19 
20 # Logger initialization: module-level logger obtained from Utils.MPLogger(); Result.stripResult() writes its debug messages to it
21 logger = Utils.MPLogger().getLogger()
22 
class Result(object):
    """Holds the tags collected for one resource and serializes them to JSON.

    A tag value stored in ``self.tags`` is handled by the methods below in one
    of these shapes: an empty/plain string, or a dict carrying at least
    ``"name"`` and usually ``"data"`` (a string or a list of strings).
    """

    TAGS_LANG_DEFAULT = "en"
    TAGS_LANG_SUFFIX_DEFAULT = "_language"

    # Types treated as "text" by the isinstance checks below: ``basestring``
    # where it exists (Python 2), plain ``str`` otherwise, so the class also
    # works on interpreters that have no ``basestring`` builtin.
    try:
        _STRING_TYPES = (basestring,)  # pylint: disable=undefined-variable
    except NameError:
        _STRING_TYPES = (str,)

    def __init__(self, config, resId, metrics=None):
        """Create an empty result.

        @param config object exposing ``options(section)``; the option names
               of the ``"article_tags"`` section become the initial (empty)
               tags. ``None`` means no predefined tags.
        @param resId identifier stored as-is and emitted by get()
        @param metrics optional metrics dict; a new empty dict when ``None``
        """
        self.name = "Scraper result object"
        self.metrics = {} if metrics is None else metrics
        self.data = {}
        self.tags = {}
        # mergeResults() reads and appends to this list, so it has to exist on
        # every instance.
        self.blockedByXpathTags = []
        self.defaultTags = []
        self.resId = resId
        if config is None:
            self.article_tags = []
        else:
            self.article_tags = list(config.options("article_tags"))
        # Every configured tag starts out as an empty string.
        for tag in self.article_tags:
            self.tags[tag] = ""
        self.start = time.time()
        self.finish = self.start
        self.errorCode = CONSTS.ERROR_OK
        self.errorMessage = CONSTS.MSG_ERROR_OK
        self.tagsCount = 0
        self.tagsMask = 0

    def getEmptyTags(self):
        """Return the names of configured article tags whose value is falsy."""
        return [key for key, value in self.tags.items()
                if key in self.article_tags and not value]

    def getFilledTags(self):
        """Return the names of configured article tags whose value is truthy."""
        return [key for key, value in self.tags.items()
                if key in self.article_tags and value]

    def recalcTagMaskCount(self, container=None, altTagsMask=None):
        """Recompute ``self.tagsCount`` and ``self.tagsMask`` from ``self.tags``.

        A tag is counted when it is not listed in ``self.defaultTags``, has a
        ``"data"`` entry, and its text (the string itself, or the first list
        element) is not blank. Its mask bit is OR-ed in from ``altTagsMask``
        when that is given, otherwise from ``BaseExtractor.tagsMask``.

        @param container optional list; a shallow copy of every non-empty tag
               is appended to it
        @param altTagsMask optional dict mapping tag name to mask value; when
               given it is used instead of ``BaseExtractor.tagsMask``
        """
        self.tagsCount = 0
        self.tagsMask = 0

        for value in self.tags.values():
            if value is None or value == "":
                continue

            # Field-by-field copy, so the caller's container never aliases the
            # dict stored in self.tags.
            tag = {field: value[field] for field in value}

            # Only non-default tags that carry data take part in mask/count.
            if tag["name"] not in self.defaultTags and "data" in value:
                data = value["data"]
                realValueString = ""
                if isinstance(data, self._STRING_TYPES):
                    realValueString = data
                elif isinstance(data, list) and len(data) > 0:
                    realValueString = data[0]

                if realValueString is not None and realValueString.strip() != "":
                    if altTagsMask is not None:
                        if tag["name"] in altTagsMask:
                            self.tagsMask |= altTagsMask[tag["name"]]
                    elif tag["name"] in BaseExtractor.tagsMask:
                        self.tagsMask |= BaseExtractor.tagsMask[tag["name"]]
                    # NOTE(review): this file was recovered from a listing with
                    # collapsed indentation; the increment is assumed to apply
                    # to every non-blank tag, registered in a mask or not -
                    # confirm against the original source.
                    self.tagsCount += 1

            if container is not None:
                container.append(copy.copy(tag))

        # Metrics are precalculated once, after all tags were processed.
        if len(self.metrics) > 0:
            Metrics.fillMetricModulesList()
            Metrics.metricsPrecalculate(self.metrics, self)

    def get(self):
        """Serialize the result to a JSON string.

        Refreshes tag count/mask via recalcTagMaskCount(), fills ``self.data``
        and returns it dumped as JSON. ``self.metrics`` is left untouched (only
        its JSON dump is stored under ``"metrics"``), so calling get() again
        produces the same output instead of double-encoding the metrics.

        @return JSON text with the keys ``data``, ``error_code``,
                ``error_message``, ``time`` and ``metrics``
        """
        # "tagList" is a list of tag collections; this object provides one.
        tagList = []
        self.recalcTagMaskCount(tagList)

        self.data["data"] = {"resId": self.resId, "tagList": [tagList]}
        self.data["error_code"] = self.errorCode
        self.data["error_message"] = self.errorMessage
        self.data["time"] = "%s" % (self.finish - self.start)
        self.data["metrics"] = json.dumps(self.metrics)

        return json.dumps(self.data, ensure_ascii=False, sort_keys=True, indent=4, separators=(",", ":"))

    def mergeResults(self, result):
        """Merge another result into this one.

        Blocked-by-xpath entries missing here are appended. A tag of
        ``result`` is adopted when this object does not have it or has it
        unfilled (see isTagFilled()).

        @param result object exposing ``tags``, ``defaultTags`` and
               ``blockedByXpathTags``
        """
        for blockedTag in result.blockedByXpathTags:
            if blockedTag not in self.blockedByXpathTags:
                self.blockedByXpathTags.append(blockedTag)

        for tagName in result.tags:
            if tagName not in self.tags or not self.isTagFilled(tagName):
                self.tags[tagName] = result.tags[tagName]
                # NOTE(review): nesting reconstructed from a listing with
                # collapsed indentation; the default flag is assumed to be
                # carried over only together with an adopted value - confirm.
                if tagName in result.defaultTags and tagName not in self.defaultTags:
                    self.defaultTags.append(tagName)

    def getBestValue(self, items_list):
        """Pick one item out of several candidates.

        Empty-string entries are ignored. With several candidates left the one
        with the greatest ``item["data"]`` wins; a single candidate is
        returned as is.

        @param items_list list of candidates (dicts with a ``"data"`` entry,
               or ``""`` placeholders)
        @return the chosen item, or ``""`` when there is no candidate
        """
        candidates = [item for item in items_list if item != ""]
        if len(candidates) > 1:
            # The same rule is applied to every tag, "content_encoded"
            # included.
            # NOTE(review): max() compares the "data" values themselves
            # (lexicographically for strings), not their length, although the
            # original comment spoke of the "biggest text" - confirm intent.
            return max(candidates, key=lambda item: item["data"])
        if candidates:
            return candidates[0]
        return ""

    def stripResult(self):
        """Strip whitespace from tag texts and delete tags that end up empty.

        Plain strings and string ``"data"`` values are stripped entirely; for
        list ``"data"`` only the first element is stripped and checked.
        """
        removeKeys = []
        for key in self.tags:
            value = self.tags[key]
            if isinstance(value, self._STRING_TYPES):
                self.tags[key] = value.strip()
                if self.tags[key] == "":
                    removeKeys.append(key)
            elif isinstance(value, dict) and "data" in value:
                data = value["data"]
                if isinstance(data, self._STRING_TYPES):
                    value["data"] = data.strip()
                    if value["data"] == "":
                        removeKeys.append(key)
                elif isinstance(data, list) and len(data) > 0 and \
                        isinstance(data[0], self._STRING_TYPES):
                    data[0] = data[0].strip()
                    if data[0] == "":
                        removeKeys.append(key)
            else:
                # NOTE(review): this "else" is paired with the outer chain
                # (value is neither text nor a dict with "data"); the listing
                # this was recovered from had its indentation collapsed, so
                # it may instead belong to the inner "data" chain - confirm.
                removeKeys.append(key)

        # Deletion is deferred so the dict is not resized while iterating.
        for key in removeKeys:
            if key in self.tags:
                logger.debug(">>> Remove %s element because it empty", key)
                del self.tags[key]

    def isTagFilled(self, tagsName):
        """Tell whether the named tag holds non-empty text.

        @param tagsName tag name to look up in ``self.tags``
        @return True for a non-blank string, a dict whose ``"data"`` is a
                non-blank string, or a dict whose ``"data"`` list has at
                least one non-empty string element; False otherwise
        """
        if tagsName not in self.tags:
            return False

        value = self.tags[tagsName]
        if isinstance(value, self._STRING_TYPES):
            return value.strip() != ""

        if isinstance(value, dict) and "data" in value:
            data = value["data"]
            if isinstance(data, self._STRING_TYPES):
                return data.strip() != ""
            if isinstance(data, list):
                # List elements are compared unstripped, so a whitespace-only
                # element counts as filled.
                return any(isinstance(elem, self._STRING_TYPES) and elem != ""
                           for elem in data)

        return False
200 
201 
202 # # # retTagsText
203 # #
204 # def retTagsText(self, tagName):
205 # ret = None
206 # if tagName in self.tags:
207 # if isinstance(self.tags[tagName], basestring):
208 # ret = self.tags[tagName]
209 # elif isinstance(self.tags[tagName], dict) and "data" in self.tags[tagName]:
210 # if isinstance(self.tags[tagName]["data"], basestring):
211 # ret = self.tags[tagName]["data"]
212 # elif isinstance(self.tags[tagName]["data"], list):
213 # ret = ""
214 # for elem in self.tags[tagName]["data"]:
215 # ret += elem
216 # ret += ' '
217 # ret = ret.strip()
218 # return ret
219 #
220 #
221 # # # setLangField
222 # #
223 # def setLangField(self, text, tagName, fieldName, suffixName):
224 #
225 # logger.info("Enter setLangField() text = '%s', tagName = '%s', fieldName = '%s', suffixName = '%s'",
226 # str(text), str(tagName), str(fieldName), str(suffixName))
227 # if text is not None:
228 # lang = ContentHashCalculator.langDetect(text, False)
229 # logger.info("lang = '%s'", str(lang))
230 # logger.info("self.tags[tagName] = '%s', type = %s", str(self.tags[tagName]), str(type(self.tags[tagName])))
231 # if lang is not None and isinstance(self.tags[tagName], dict):
232 # self.tags[tagName][fieldName] = lang
233 # self.tags[tagName]["lang_suffix"] = suffixName
234 #
235 # logger.info("self.tags[%s]: '%s'", str(tagName), str(self.tags[tagName]))
236 #
237 #
238 # # # tagsLangDetecting
239 # #
240 # def tagsLangDetecting(self, scraperLangDetect):
241 # if "tags" in scraperLangDetect and "suffix" in scraperLangDetect:
242 # if isinstance(scraperLangDetect["tags"], basestring) and scraperLangDetect["tags"] == "*":
243 # for tagName in self.tags:
244 # localTextValue = self.retTagsText(tagName)
245 # self.setLangField(localTextValue, tagName, "lang", scraperLangDetect["suffix"])
246 # elif isinstance(scraperLangDetect["tags"], basestring) and scraperLangDetect["tags"] == "&":
247 # localTextResult = None
248 # for tagName in self.tags:
249 # localTextResult = ""
250 # localTextValue = self.retTagsText(tagName)
251 # if localTextValue is not None:
252 # localTextResult += localTextValue
253 # localTextResult += ' '
254 # localTextResult = localTextResult.strip()
255 # for tagName in self.tags:
256 # self.setLangField(localTextResult, tagName, "summary_lang", scraperLangDetect["suffix"])
257 # elif isinstance(scraperLangDetect["tags"], list):
258 # for tagName in scraperLangDetect["tags"]:
259 # localTextValue = self.retTagsText(tagName)
260 # self.setLangField(localTextValue, tagName, "lang", scraperLangDetect["suffix"])
def recalcTagMaskCount(self, container=None, altTagsMask=None)
def __init__(self, config, resId, metrics=None)
def getBestValue(self, items_list)