HCE Project Python language Distributed Tasks Manager Application, Distributed Crawler Application and client API bindings.  2.0.0-chaika
Hierarchical Cluster Engine Python language binding
dc_processor.scraper_result.Result Class Reference
Inheritance diagram for dc_processor.scraper_result.Result:
Collaboration diagram for dc_processor.scraper_result.Result:

Public Member Functions

def __init__ (self, config, resId, metrics=None)
 
def getEmptyTags (self)
 
def getFilledTags (self)
 
def recalcTagMaskCount (self, container=None, altTagsMask=None)
 
def metricsPrecalculate (self)
 
def get (self)
 
def mergeResults (self, result)
 
def getBestValue (self, items_list)
 
def stripResult (self)
 
def isTagFilled (self, tagsName)
 

Public Attributes

 name
 
 metrics
 
 data
 
 tags
 
 blockedByXpathTags
 
 defaultTags
 
 resId
 
 article_tags
 
 start
 
 finish
 
 errorCode
 
 errorMessage
 
 tagsCount
 
 tagsMask
 

Static Public Attributes

string TAGS_LANG_DEFAULT = "en"
 
string TAGS_LANG_SUFFIX_DEFAULT = "_language"
 

Detailed Description

Definition at line 23 of file scraper_result.py.

Constructor & Destructor Documentation

◆ __init__()

def dc_processor.scraper_result.Result.__init__ (   self,
  config,
  resId,
  metrics = None 
)

Definition at line 28 of file scraper_result.py.

def __init__(self, config, resId, metrics=None):
    """Create an empty scraper result.

    Args:
        config: object exposing ``options(section)``, or None. When given,
            every option name of the "article_tags" section is registered
            as an article tag and receives an empty value in ``self.tags``.
        resId: identifier stored as ``self.resId``.
        metrics: optional metrics container; a new dict is used when None.
    """
    self.name = "Scraper result object"
    self.metrics = {} if metrics is None else metrics
    self.data = {}
    self.blockedByXpathTags = []
    self.defaultTags = []
    self.resId = resId
    self.article_tags = [] if config is None else list(config.options("article_tags"))
    # Every registered article tag starts with an empty string value.
    self.tags = dict.fromkeys(self.article_tags, "")
    # Start and finish share one timestamp until the caller moves finish.
    now = time.time()
    self.start = now
    self.finish = now
    self.errorCode = CONSTS.ERROR_OK
    self.errorMessage = CONSTS.MSG_ERROR_OK
    self.tagsCount = 0
    self.tagsMask = 0
51 
52 
def __init__(self)
constructor
Definition: UIDGenerator.py:19

Member Function Documentation

◆ get()

def dc_processor.scraper_result.Result.get (   self)

Definition at line 99 of file scraper_result.py.

def get(self):
    """Serialize this result into a JSON document.

    The tag list is rebuilt through ``recalcTagMaskCount`` (which also resets
    and recomputes ``self.tagsCount`` and ``self.tagsMask``). The payload,
    error code, error message, elapsed time and metrics are stored in
    ``self.data`` and that dict is returned as JSON text.

    ``self.metrics`` itself is not modified: only its JSON form is placed in
    ``self.data["metrics"]``, so calling ``get()`` repeatedly yields the same
    document instead of re-encoding an already encoded string.

    Returns:
        ``self.data`` dumped as JSON (keys sorted, 4-space indent).
    """
    # "tagList" is a collection of tag lists; the old flat format becomes
    # its single first element.
    payload = {"resId": self.resId, "tagList": [[]]}
    self.recalcTagMaskCount(payload["tagList"][0])

    self.data["data"] = payload
    self.data["error_code"] = self.errorCode
    self.data["error_message"] = self.errorMessage
    self.data["time"] = "%s" % (self.finish - self.start)
    # Metrics travel as an embedded JSON string; encode a copy and keep the
    # live container intact for later use.
    self.data["metrics"] = json.dumps(self.metrics)

    return json.dumps(self.data, ensure_ascii=False, sort_keys=True, indent=4, separators=(",", ":"))
118 
119 
Here is the call graph for this function:

◆ getBestValue()

def dc_processor.scraper_result.Result.getBestValue (   self,
  items_list 
)

Definition at line 136 of file scraper_result.py.

def getBestValue(self, items_list):
    """Choose the best candidate among several suggestions for one tag.

    Empty-string placeholders are ignored. With several remaining candidates
    the one holding the biggest text wins; the same rule is used for every
    tag, ``content_encoded`` included. "Biggest" is measured as text length,
    not as lexicographic order of the values.

    Args:
        items_list: candidates; each is either "" (no suggestion) or a
            mapping with a "data" entry (a string or a list of strings).

    Returns:
        The selected candidate, or "" when there is no candidate at all.
    """
    def _textSize(data):
        # Amount of text in a "data" value: characters of a string, or the
        # summed size of the elements of a list/tuple.
        if data is None:
            return 0
        if isinstance(data, (list, tuple)):
            return sum(_textSize(part) for part in data)
        try:
            return len(data)
        except TypeError:
            return 0

    candidates = [item for item in items_list if item != ""]
    if not candidates:
        return ""
    if len(candidates) == 1:
        return candidates[0]
    # max() returns the first candidate on ties, so earlier suggestions win.
    return max(candidates, key=lambda item: _textSize(item["data"]))
155 
156 

◆ getEmptyTags()

def dc_processor.scraper_result.Result.getEmptyTags (   self)

Definition at line 53 of file scraper_result.py.

def getEmptyTags(self):
    """Return the names of registered article tags whose value is falsy."""
    registered = self.article_tags
    empty_tags = []
    for name, content in self.tags.items():
        # Skip tags that are not registered or that already hold a value.
        if name not in registered or content:
            continue
        empty_tags.append(name)
    return empty_tags
56 
57 

◆ getFilledTags()

def dc_processor.scraper_result.Result.getFilledTags (   self)

Definition at line 58 of file scraper_result.py.

def getFilledTags(self):
    """Return the names of registered article tags whose value is truthy."""
    registered = self.article_tags
    filled_tags = []
    for name, content in self.tags.items():
        # Only registered tags that actually hold something are reported.
        if name in registered and content:
            filled_tags.append(name)
    return filled_tags
61 
62 

◆ isTagFilled()

def dc_processor.scraper_result.Result.isTagFilled (   self,
  tagsName 
)

Definition at line 183 of file scraper_result.py.

def isTagFilled(self, tagsName):
    """Tell whether the named tag currently holds non-blank text.

    A tag counts as filled when its value is
      * a string that is not blank after stripping whitespace, or
      * a dict whose "data" entry is such a string, or
      * a dict whose "data" entry is a list containing at least one such
        string.
    List elements are stripped before the test, exactly like the two string
    cases, so a list holding only whitespace is not reported as filled.

    Args:
        tagsName: key to look up in ``self.tags``.

    Returns:
        True when the tag is filled, False otherwise (including when the tag
        is absent or has any other shape).
    """
    if tagsName not in self.tags:
        return False

    value = self.tags[tagsName]
    if isinstance(value, basestring):
        return value.strip() != ""

    if isinstance(value, dict) and "data" in value:
        data = value["data"]
        if isinstance(data, basestring):
            return data.strip() != ""
        if isinstance(data, list):
            # Non-string elements are ignored; one non-blank string suffices.
            return any(isinstance(elem, basestring) and elem.strip() != "" for elem in data)

    return False
200 
201 
202 # # # retTagsText
203 # #
204 # def retTagsText(self, tagName):
205 # ret = None
206 # if tagName in self.tags:
207 # if isinstance(self.tags[tagName], basestring):
208 # ret = self.tags[tagName]
209 # elif isinstance(self.tags[tagName], dict) and "data" in self.tags[tagName]:
210 # if isinstance(self.tags[tagName]["data"], basestring):
211 # ret = self.tags[tagName]["data"]
212 # elif isinstance(self.tags[tagName]["data"], list):
213 # ret = ""
214 # for elem in self.tags[tagName]["data"]:
215 # ret += elem
216 # ret += ' '
217 # ret = ret.strip()
218 # return ret
219 #
220 #
221 # # # setLangField
222 # #
223 # def setLangField(self, text, tagName, fieldName, suffixName):
224 #
225 # logger.info("Enter setLangField() text = '%s', tagName = '%s', fieldName = '%s', suffixName = '%s'",
226 # str(text), str(tagName), str(fieldName), str(suffixName))
227 # if text is not None:
228 # lang = ContentHashCalculator.langDetect(text, False)
229 # logger.info("lang = '%s'", str(lang))
230 # logger.info("self.tags[tagName] = '%s', type = %s", str(self.tags[tagName]), str(type(self.tags[tagName])))
231 # if lang is not None and isinstance(self.tags[tagName], dict):
232 # self.tags[tagName][fieldName] = lang
233 # self.tags[tagName]["lang_suffix"] = suffixName
234 #
235 # logger.info("self.tags[%s]: '%s'", str(tagName), str(self.tags[tagName]))
236 #
237 #
238 # # # tagsLangDetecting
239 # #
240 # def tagsLangDetecting(self, scraperLangDetect):
241 # if "tags" in scraperLangDetect and "suffix" in scraperLangDetect:
242 # if isinstance(scraperLangDetect["tags"], basestring) and scraperLangDetect["tags"] == "*":
243 # for tagName in self.tags:
244 # localTextValue = self.retTagsText(tagName)
245 # self.setLangField(localTextValue, tagName, "lang", scraperLangDetect["suffix"])
246 # elif isinstance(scraperLangDetect["tags"], basestring) and scraperLangDetect["tags"] == "&":
247 # localTextResult = None
248 # for tagName in self.tags:
249 # localTextResult = ""
250 # localTextValue = self.retTagsText(tagName)
251 # if localTextValue is not None:
252 # localTextResult += localTextValue
253 # localTextResult += ' '
254 # localTextResult = localTextResult.strip()
255 # for tagName in self.tags:
256 # self.setLangField(localTextResult, tagName, "summary_lang", scraperLangDetect["suffix"])
257 # elif isinstance(scraperLangDetect["tags"], list):
258 # for tagName in scraperLangDetect["tags"]:
259 # localTextValue = self.retTagsText(tagName)
260 # self.setLangField(localTextValue, tagName, "lang", scraperLangDetect["suffix"])
261 
Here is the caller graph for this function:

◆ mergeResults()

def dc_processor.scraper_result.Result.mergeResults (   self,
  result 
)

Definition at line 120 of file scraper_result.py.

# Merge another Result object into this one (nothing is returned).
#  - Entries of result.blockedByXpathTags not yet present are appended to
#    self.blockedByXpathTags.
#  - A tag of result.tags is copied into self.tags when this object lacks the
#    tag or self.isTagFilled() reports it as not filled.
#  - Tag names listed in result.defaultTags are appended to self.defaultTags
#    when they are not there yet.
# NOTE(review): this listing has lost its indentation, so it cannot be told
# from here whether the defaultTags update (lines 132-133) is nested under the
# overwrite condition of line 130 - confirm against scraper_result.py.
120  def mergeResults(self, result):
121  # logger.debug(">>> incoming result: %s", varDump(result))
# Collect blocked tags without introducing duplicates.
122  for blockedTag in result.blockedByXpathTags:
123  if blockedTag not in self.blockedByXpathTags:
124  self.blockedByXpathTags.append(blockedTag)
125 
126 # logger.debug("!!! self.tags: %s", varDump(self.tags))
127 # logger.debug("!!! result.tags: %s", varDump(result.tags))
128 
# Values already filled in this object are kept; only gaps are taken over.
129  for tagName in result.tags:
130  if tagName not in self.tags or not self.isTagFilled(tagName):
131  self.tags[tagName] = result.tags[tagName]
132  if tagName in result.defaultTags and tagName not in self.defaultTags:
133  self.defaultTags.append(tagName)
134 
135 
Here is the call graph for this function:

◆ metricsPrecalculate()

def dc_processor.scraper_result.Result.metricsPrecalculate (   self)

Definition at line 93 of file scraper_result.py.

# Run the metrics pre-calculation for this result.
# Guarded by a non-empty self.metrics: Metrics.fillMetricModulesList() is
# called, and Metrics.metricsPrecalculate() receives self.metrics together
# with this object.
# NOTE(review): indentation is lost in this listing - presumably both Metrics
# calls sit under the len() guard; confirm against scraper_result.py.
93  def metricsPrecalculate(self):
94  if len(self.metrics) > 0:
95  Metrics.fillMetricModulesList()
96  Metrics.metricsPrecalculate(self.metrics, self)
97 
98 

◆ recalcTagMaskCount()

def dc_processor.scraper_result.Result.recalcTagMaskCount (   self,
  container = None,
  altTagsMask = None 
)

Definition at line 63 of file scraper_result.py.

# Recompute self.tagsCount and self.tagsMask from self.tags; both are reset
# to 0 before the tags are walked.
#   container   - optional list; a shallow copy (copy.copy) of each tag dict
#                 built here is appended to it.
#   altTagsMask - optional mapping of tag name to mask bits that are OR-ed
#                 into self.tagsMask; BaseExtractor.tagsMask is the other
#                 source of mask bits.
# NOTE(review): this listing has lost its indentation, so the pairing of the
# elif on line 85 and the exact scope of the increment on line 87 and of the
# append on lines 89-90 cannot be determined here - confirm against
# scraper_result.py.
63  def recalcTagMaskCount(self, container=None, altTagsMask=None):
64  self.tagsCount = 0
65  self.tagsMask = 0
66 
67  for key, value in self.tags.items():
68  if value is not None and value != "":
# Build a field-by-field copy of the tag.
# NOTE: the inner loop reuses the name `key` of the outer loop variable.
69  tag = {}
70  for key in value:
71  tag[key] = value[key]
72 
73  # set tag's mask only if tag is registered, also increment tags count.
74  # Execute it functionaly if tag's value not default
75  if tag["name"] not in self.defaultTags and "data" in value:
# The text that is tested: the string itself, or only the first element of a
# non-empty list.
76  realValueString = ""
77  if isinstance(value["data"], basestring):
78  realValueString = value["data"]
79  elif isinstance(value["data"], list) and len(value["data"]) > 0:
80  realValueString = value["data"][0]
# Blank text contributes neither mask bits nor to the count.
81  if realValueString is not None and realValueString.strip() != "":
82  if altTagsMask is not None:
83  if tag["name"] in altTagsMask:
84  self.tagsMask = self.tagsMask | altTagsMask[tag["name"]]
85  elif tag["name"] in BaseExtractor.tagsMask:
86  self.tagsMask = self.tagsMask | BaseExtractor.tagsMask[tag["name"]]
87  self.tagsCount += 1
88 
89  if container is not None:
90  container.append(copy.copy(tag))
91 
92 
Here is the caller graph for this function:

◆ stripResult()

def dc_processor.scraper_result.Result.stripResult (   self)

Definition at line 157 of file scraper_result.py.

# Strip surrounding whitespace from tag values in place and delete the tags
# whose keys were collected in removeKeys.
# Shapes handled: a plain string value, a dict whose "data" is a string, and
# a dict whose "data" is a non-empty list with a string first element (only
# element 0 is stripped and tested).
# NOTE(review): this listing has lost its indentation, so it cannot be told
# from here which conditional the `else:` on line 174 belongs to, i.e. which
# remaining shapes are removed - confirm against scraper_result.py.
157  def stripResult(self):
# Keys are collected first and deleted afterwards, so self.tags is not
# modified while it is being iterated.
158  removeKeys = []
159  for key in self.tags:
160  if isinstance(self.tags[key], basestring):
161  self.tags[key] = self.tags[key].strip()
162  if self.tags[key] == "":
163  removeKeys.append(key)
164  elif isinstance(self.tags[key], dict) and "data" in self.tags[key]:
165  if isinstance(self.tags[key]["data"], basestring):
166  self.tags[key]["data"] = self.tags[key]["data"].strip()
167  if self.tags[key]["data"] == "":
168  removeKeys.append(key)
169  elif isinstance(self.tags[key]["data"], list) and len(self.tags[key]["data"]) > 0 and \
170  isinstance(self.tags[key]["data"][0], basestring):
171  self.tags[key]["data"][0] = self.tags[key]["data"][0].strip()
172  if self.tags[key]["data"][0] == "":
173  removeKeys.append(key)
174  else:
175  removeKeys.append(key)
176 
# Second pass: drop everything that was marked above, logging each removal.
177  for key in removeKeys:
178  if key in self.tags:
179  logger.debug(">>> Remove " + key + " element because it empty")
180  del self.tags[key]
181 
182 

Member Data Documentation

◆ article_tags

dc_processor.scraper_result.Result.article_tags

Definition at line 40 of file scraper_result.py.

◆ blockedByXpathTags

dc_processor.scraper_result.Result.blockedByXpathTags

Definition at line 36 of file scraper_result.py.

◆ data

dc_processor.scraper_result.Result.data

Definition at line 34 of file scraper_result.py.

◆ defaultTags

dc_processor.scraper_result.Result.defaultTags

Definition at line 37 of file scraper_result.py.

◆ errorCode

dc_processor.scraper_result.Result.errorCode

Definition at line 47 of file scraper_result.py.

◆ errorMessage

dc_processor.scraper_result.Result.errorMessage

Definition at line 48 of file scraper_result.py.

◆ finish

dc_processor.scraper_result.Result.finish

Definition at line 46 of file scraper_result.py.

◆ metrics

dc_processor.scraper_result.Result.metrics

Definition at line 31 of file scraper_result.py.

◆ name

dc_processor.scraper_result.Result.name

Definition at line 29 of file scraper_result.py.

◆ resId

dc_processor.scraper_result.Result.resId

Definition at line 38 of file scraper_result.py.

◆ start

dc_processor.scraper_result.Result.start

Definition at line 45 of file scraper_result.py.

◆ tags

dc_processor.scraper_result.Result.tags

Definition at line 35 of file scraper_result.py.

◆ TAGS_LANG_DEFAULT

string dc_processor.scraper_result.Result.TAGS_LANG_DEFAULT = "en"
static

Definition at line 25 of file scraper_result.py.

◆ TAGS_LANG_SUFFIX_DEFAULT

string dc_processor.scraper_result.Result.TAGS_LANG_SUFFIX_DEFAULT = "_language"
static

Definition at line 26 of file scraper_result.py.

◆ tagsCount

dc_processor.scraper_result.Result.tagsCount

Definition at line 49 of file scraper_result.py.

◆ tagsMask

dc_processor.scraper_result.Result.tagsMask

Definition at line 50 of file scraper_result.py.


The documentation for this class was generated from the following file: