HCE Project Python language Distributed Tasks Manager Application, Distributed Crawler Application and client API bindings.  2.0.0-chaika
Hierarchical Cluster Engine Python language binding
dc_processor.ScraperMultiItemsTask.ScraperResultDocuments Class Reference
Inheritance diagram for dc_processor.ScraperMultiItemsTask.ScraperResultDocuments:
Collaboration diagram for dc_processor.ScraperMultiItemsTask.ScraperResultDocuments:

Public Member Functions

def __init__ (self, keys, urlId)
 
def addEtree (self, key, value)
 
def addDoc (self, key, value, join, isExtract, mandatory)
 
def getMaxCount (self, inDict)
 
def getTagNamesExistAllDocs (self)
 
def getCommonPath (self, lhs, rhs, logger=None)
 
def calculateIndexPath (self, etree, logger=None)
 
def getIndexNumberOfPath (self, indexPath, elemPath, logger=None)
 
def getAllTags (self, mandatoryTags, logger=None)
 
def updateTagValue (self, result, tags, tag_name)
 
def getAllDocs (self, mandatoryTags, logger=None)
 

Public Attributes

 urlId
 
 docs
 
 join
 
 isExtract
 
 mandatory
 
 etree
 

Detailed Description

Accumulates per-tag scraping results for a single URL: the scraped documents with their join, isExtract and mandatory flags (addDoc()) and the element paths they were found at (addEtree()), then assembles them into per-item result documents via getAllTags() and getAllDocs().

Definition at line 65 of file ScraperMultiItemsTask.py.

Constructor & Destructor Documentation

◆ __init__()

def dc_processor.ScraperMultiItemsTask.ScraperResultDocuments.__init__ (   self,
  keys,
  urlId 
)

Definition at line 70 of file ScraperMultiItemsTask.py.

70  def __init__(self, keys, urlId):
71  self.urlId = urlId
72  self.docs = {}
73  self.join = {}
74  self.isExtract = {}
75  self.mandatory = {}
76  self.etree = {}
77  for key in keys:
78  self.docs[key] = []
79  self.join[key] = []
80  self.isExtract[key] = []
81  self.mandatory[key] = []
82  self.etree[key] = []
83 
84 
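A minimal usage sketch (not part of the original sources): the import path follows the fully qualified class name above, and the tag names and urlId value are hypothetical. The constructor creates one empty list per tag name in each of the parallel containers:

from dc_processor.ScraperMultiItemsTask import ScraperResultDocuments

# Hypothetical tag names and URL id, used only for illustration.
resultDocs = ScraperResultDocuments(["title", "price"], 12345)

# After construction every container holds an empty list per tag name:
# resultDocs.docs  -> {"title": [], "price": []}
# resultDocs.etree -> {"title": [], "price": []}
# (join, isExtract and mandatory have the same per-key shape)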

Member Function Documentation

◆ addDoc()

def dc_processor.ScraperMultiItemsTask.ScraperResultDocuments.addDoc (   self,
  key,
  value,
  join,
  isExtract,
  mandatory 
)

Definition at line 105 of file ScraperMultiItemsTask.py.

105  def addDoc(self, key, value, join, isExtract, mandatory):
106  if not self.docs.has_key(key):
107  self.docs[key] = []
108  self.join[key] = []
109  self.isExtract[key] = []
110  self.mandatory[key] = []
111 
112  self.docs.get(key).append(copy.deepcopy(value))
113  self.join.get(key).append(copy.deepcopy(join))
114  self.isExtract.get(key).append(copy.deepcopy(isExtract))
115  self.mandatory.get(key).append(copy.deepcopy(mandatory))
116 
117 
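A usage sketch continuing the constructor example above (not from the sources). addDoc() only stores deep copies of its arguments, so the values shown are placeholders; in the real pipeline the value argument is a scraper result object whose tags dictionary getAllTags() reads later, and the join, isExtract and mandatory flags are whatever the caller supplies for that tag:

# Placeholder values; a previously unseen key gets its parallel lists created lazily.
resultDocs.addDoc("title", {"data": ["Example title"]}, join="", isExtract=True, mandatory=True)
resultDocs.addDoc("price", {"data": ["9.99"]}, join="", isExtract=False, mandatory=False)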

◆ addEtree()

def dc_processor.ScraperMultiItemsTask.ScraperResultDocuments.addEtree (   self,
  key,
  value 
)

Definition at line 90 of file ScraperMultiItemsTask.py.

90  def addEtree(self, key, value):
91  if not self.etree.has_key(key):
92  self.docs[key] = []
93  self.join[key] = []
94  self.isExtract[key] = []
95  self.mandatory[key] = []
96  self.etree[key] = []
97 
98  self.etree.get(key).append(copy.deepcopy(value))
99 
100 
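A usage sketch (not from the sources): addEtree() appends a deep copy of the element path found for the given tag. Representing each path as a sequence of tag names and (tag name, index) tuples is an assumption inferred from how getCommonPath() and getIndexNumberOfPath() consume the stored values; the concrete paths are hypothetical.

# Hypothetical element paths for two scraped "title" occurrences.
resultDocs.addEtree("title", ["html", "body", ("div", 1), ("h2", 1)])
resultDocs.addEtree("title", ["html", "body", ("div", 2), ("h2", 1)])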

◆ calculateIndexPath()

def dc_processor.ScraperMultiItemsTask.ScraperResultDocuments.calculateIndexPath (   self,
  etree,
  logger = None 
)

Definition at line 188 of file ScraperMultiItemsTask.py.

188  def calculateIndexPath(self, etree, logger=None):
189  # variable for result
190  ret = []
191  pathDict = {}
192  pathList = []
193 
194  for key in etree.keys():
195  pathList.extend(etree.get(key))
196 
197  for index in range(len(pathList) - 1):
198  commonPath = self.getCommonPath(pathList[index], pathList[index + 1], logger)
199  commonPathCount = 0
200  if pathDict.has_key(str(commonPath)):
201  commonPathCount = int(pathDict.get(str(commonPath))[1])
202 
203  pathDict[str(commonPath)] = (commonPath, commonPathCount + 1)
204 
205  localpathList = []
206  for elem in pathDict.values():
207  localpathList.append(elem)
208 
209  localpathList.sort(key=lambda tup: tup[1], reverse=True)
210  if len(localpathList) > 0:
211  ret = (localpathList[0])[0]
212 
213  return ret
214 
215 
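The method gathers all stored element paths, takes the common prefix of each consecutive pair via getCommonPath(), and returns the prefix that occurs most often; this "index path" appears to identify the repeating container element of a multi-item page. A standalone, simplified sketch of that idea (plain common-prefix logic on hypothetical paths, independent of the class):

paths = [
    ["html", "body", ("div", 1), ("h2", 1)],
    ["html", "body", ("div", 2), ("h2", 1)],
    ["html", "body", ("div", 3), ("h2", 1)],
]
counts = {}
for lhs, rhs in zip(paths, paths[1:]):
    common = []
    for a, b in zip(lhs, rhs):
        if a != b:
            break
        common.append(a)
    counts[tuple(common)] = counts.get(tuple(common), 0) + 1
indexPath = list(max(counts, key=counts.get)) if counts else []
print(indexPath)   # ['html', 'body'], the path of the repeating container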

◆ getAllDocs()

def dc_processor.ScraperMultiItemsTask.ScraperResultDocuments.getAllDocs (   self,
  mandatoryTags,
  logger = None 
)

Definition at line 358 of file ScraperMultiItemsTask.py.

358  def getAllDocs(self, mandatoryTags, logger=None):
359  # variable for result
360  resDocs = []
361 
362  resTags = self.getAllTags(mandatoryTags, logger)
363  count = len(resTags)
364 
365  tagsNames = self.getTagNamesExistAllDocs()
366 
367  if len(tagsNames) > 0:
368  key = tagsNames[0]
369 
370  for index in range(count):
371  if len(self.join.get(key)) > index and \
372  len(self.isExtract.get(key)) > index and \
373  len(self.mandatory.get(key)) > index:
374  resDocs.append({"obj": resTags[index],
375  "join": self.join.get(key)[index],
376  "isExtract": self.isExtract.get(key)[index],
377  "mandatory": self.mandatory.get(key)[index],
378  CONSTS.TAG_ORDER_NUMBER: len(resDocs) + 1})
379 
380  return resDocs
381 
382 
383 
384 # # ScraperMultiItemsTask Class content main functional scrapering for multi items,
385 # class inherits from foundation.CementApp
386 #
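For reference, the shape of one element of the list returned by getAllDocs(); all values below are placeholders, and the last key is CONSTS.TAG_ORDER_NUMBER in the real code:

doc = {
    "obj": None,           # a Result object assembled by getAllTags()
    "join": "",            # join mode stored by addDoc() for this position
    "isExtract": True,     # isExtract flag stored by addDoc()
    "mandatory": True,     # mandatory flag stored by addDoc()
    "TAG_ORDER_NUMBER": 1  # 1-based order of the document in the returned list
}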

◆ getAllTags()

def dc_processor.ScraperMultiItemsTask.ScraperResultDocuments.getAllTags (   self,
  mandatoryTags,
  logger = None 
)

Definition at line 264 of file ScraperMultiItemsTask.py.

264  def getAllTags(self, mandatoryTags, logger=None):
265  # variable for result
266  resTags = []
267  count = self.getMaxCount(self.docs)
268 
269  # #Calculate index block
270  indexPath = self.calculateIndexPath(self.etree, logger)
271  if logger is not None:
272  logger.info('Calculated indexPath: ' + str(indexPath))
273 
274  if logger is not None:
275  for key in self.etree:
276  logger.debug('len(self.etree.get(' + str(key) + ') = ' + str(len(self.etree.get(key))))
277  for key in self.docs:
278  logger.debug('len(self.docs.get(' + str(key) + ') = ' + str(len(self.docs.get(key))))
279 
280  resultList = []
281  for index in range(self.getMaxCount(self.etree)):
282  localRes = Result(None, self.urlId)
283  resultList.append(localRes)
284 
285  if logger is not None:
286  logger.debug('count = ' + str(count))
287  logger.debug('len(resultList) = ' + str(len(resultList)))
288 
289  for key in self.docs.keys():
290  for index in range(len(self.docs.get(key))):
291  if logger is not None:
292  logger.debug('==== key: ' + str(key) + ' index: ' + str(index) + ' ====')
293 
294  if len(self.etree.get(key)) > index:
295  number = int(self.getIndexNumberOfPath(indexPath, self.etree.get(key)[index], logger))
296  if logger is not None:
297  logger.debug('number = ' + str(number) + ' self.docs.get(' + str(key) + ')[' + str(index) + '].tags: ' + \
298  varDump(self.docs.get(key)[index].tags))
299 
300  if int(number) > 0 and int(number) <= len(self.docs.get(key)):
301  if resultList[int(number) - 1].tags.has_key(key):
302  result = self.updateTagValue(resultList[int(number) - 1], self.docs.get(key)[index].tags, key)
303  resultList[int(number) - 1].tags.update(result.tags)
304  else:
305  resultList[int(number) - 1].tags.update({key:self.docs.get(key)[index].tags[key]})
306 
307  if logger is not None:
308  logger.debug("resultList[" + str(int(number) - 1) + "].tags.update({" + str(key) + ":self.docs.get(" + \
309  str(key) + ")[" + str(index) + "].tags[" + str(key) + "]})")
310 
311  for index in range(0, len(resultList)):
312  isMandatory = True
313  countSelected = 0
314  for key in self.docs.keys():
315  if not resultList[index].tags.has_key(key) and bool(mandatoryTags[key]) is True:
316  isMandatory = False
317  break
318 
319  if resultList[index].tags.has_key(key):
320  countSelected = countSelected + 1
321 
322  if countSelected == 0:
323  isMandatory = False
324 
325  if isMandatory:
326  resTags.append(resultList[index])
327 
328  if len(resTags) == 0:
329  resTags.append(Result(None, self.urlId))
330 
331  return resTags
332 
333 
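getAllTags() assigns every stored document to an item slot using the index number derived from its element path, merges tags that land in the same slot (concatenating duplicates via updateTagValue()), and finally drops slots that miss a mandatory tag or hold no tags at all. A standalone, much simplified sketch of the grouping and filtering idea (plain tuples stand in for the docs/etree containers; it omits the fallback to a single empty Result):

mandatoryTags = {"title": True, "price": False}
# (tag name, item number, scraped value) triples standing in for the stored documents.
scraped = [("title", 1, "First"), ("price", 1, "9.99"), ("title", 2, "Second")]

slots = {}
for tag, number, value in scraped:
    slots.setdefault(number, {})[tag] = value

items = [tags for tags in slots.values()
         if all(tag in tags for tag, required in mandatoryTags.items() if required)]
print(items)   # [{'title': 'First', 'price': '9.99'}, {'title': 'Second'}]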

◆ getCommonPath()

def dc_processor.ScraperMultiItemsTask.ScraperResultDocuments.getCommonPath (   self,
  lhs,
  rhs,
  logger = None 
)

Definition at line 151 of file ScraperMultiItemsTask.py.

151  def getCommonPath(self, lhs, rhs, logger=None): # pylint: disable=W0612,W0613
152  # variable for result
153  ret = []
154  length = min(len(lhs), len(rhs))
155 
156  # if logger is not None:
157  # logger.debug('>>> lhs: ' + str(lhs))
158  # logger.debug('>>> rhs: ' + str(rhs))
159 
160  for i in range(length):
161  if isinstance(lhs[i], str) and isinstance(rhs[i], str) and lhs[i] != rhs[i]:
162  if i > 0:
163  ret = lhs[:i]
164  return ret
165 
166  # logger.info('len(lhs[' + str(i) + ']) = ' + str(len(lhs[i])) + \
167  # ' len(rhs[' + str(i) + ']) = ' + str(len(rhs[i])))
168 
169  if isinstance(lhs[i], tuple) and isinstance(rhs[i], tuple) and len(lhs[i]) == len(rhs[i]):
170  for j in range(len(lhs[i])):
171  # logger.info('lhs[' + str(j) + '] = ' + str(lhs[i][j] + ' rhs[' + str(j) + '] = ' + str(rhs[i][j])))
172  if lhs[i][j] != rhs[i][j]:
173 
174  # logger.info('lhs[:i] = ' + str(lhs[:i]))
175  if i > 0:
176  ret = lhs[:i]
177 
178  # logger.debug('ret = ' + str(ret))
179  return ret
180 
181  return ret
182 
183 
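An illustrative input/output pair (hypothetical paths), following the comparison rules in the listing above:

lhs = ["html", "body", ("div", 1), ("span", 1)]
rhs = ["html", "body", ("div", 2), ("span", 1)]
# The elements at position 2 are tuples of equal length whose second components
# differ, so the prefix before that position is returned:
# getCommonPath(lhs, rhs) -> ["html", "body"]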

◆ getIndexNumberOfPath()

def dc_processor.ScraperMultiItemsTask.ScraperResultDocuments.getIndexNumberOfPath (   self,
  indexPath,
  elemPath,
  logger = None 
)

Definition at line 221 of file ScraperMultiItemsTask.py.

221  def getIndexNumberOfPath(self, indexPath, elemPath, logger=None):
222  elementPath = copy.deepcopy(elemPath)
223  length = min(len(indexPath), len(elementPath))
224 
225  if logger is not None:
226  logger.debug('\n>>> indexPath: ' + str(indexPath))
227  logger.debug('\n>>> elementPath: ' + str(elementPath))
228 
229  for i in range(length):
230  if isinstance(indexPath[i], str) and isinstance(elementPath[i], str) and indexPath[i] != elementPath[i]:
231  if logger is not None:
232  logger.debug("Both have type 'str' and indexPath[" + str(i) + "] != elementPath[" + str(i) + "]")
233  return -1
234 
235  if isinstance(indexPath[i], tuple) and isinstance(elementPath[i], tuple):
236  size = min(len(indexPath[i]), len(elementPath[i]))
237  for j in range(size):
238  if indexPath[i][j] != elementPath[i][j]:
239  if logger is not None:
240  logger.debug("Both have type 'tuple' and indexPath[" + str(i) + "][" + str(j) + "] != elementPath[" + \
241  str(i) + "][" + str(j) + "]")
242  return -1
243 
244  if len(elementPath) > len(indexPath):
245  if logger is not None:
246  logger.debug('type(elementPath[len(indexPath)])) = ' + str(type(elementPath[len(indexPath)])) + \
247  ' elementPath[' + str(len(indexPath)) + ']: ' + str(elementPath[len(indexPath)]))
248 
249  if isinstance(elementPath[len(indexPath)], tuple):
250  if len(elementPath[len(indexPath)]) > 1:
251  if logger is not None:
252  logger.debug('>>> elementPath[' + str(len(indexPath)) + '][1] = ' + str(elementPath[len(indexPath)][1]))
253 
254  return elementPath[len(indexPath)][1]
255 
256  return -1
257 
258 
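An illustrative call (hypothetical paths) showing how the returned number, used by getAllTags() as a 1-based item index, is derived:

indexPath = ["html", "body"]
elemPath  = ["html", "body", ("div", 2), ("h2", 1)]
# elemPath matches indexPath over its whole length and the first element beyond
# it is the tuple ("div", 2), so its second component is returned:
# getIndexNumberOfPath(indexPath, elemPath) -> 2
# Paths that diverge from indexPath, or do not extend beyond it with an indexed
# tuple, yield -1.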

◆ getMaxCount()

def dc_processor.ScraperMultiItemsTask.ScraperResultDocuments.getMaxCount (   self,
  inDict 
)

Definition at line 122 of file ScraperMultiItemsTask.py.

122  def getMaxCount(self, inDict):
123  # variable for result
124  count = 0
125  for key in inDict.keys():
126  count = max(count, len(inDict.get(key)))
127 
128  return count
129 
130 
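A small illustration with hypothetical contents: the method simply returns the length of the longest list stored under any key.

inDict = {"title": ["a", "b", "c"], "price": ["9.99"]}
# getMaxCount(inDict) -> 3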

◆ getTagNamesExistAllDocs()

def dc_processor.ScraperMultiItemsTask.ScraperResultDocuments.getTagNamesExistAllDocs (   self)

Definition at line 135 of file ScraperMultiItemsTask.py.

135  def getTagNamesExistAllDocs(self):
136  # variable for result
137  tagNames = []
138  count = self.getMaxCount(self.docs)
139  for key in self.docs.keys():
140  size = len(self.docs.get(key))
141  if count == size:
142  tagNames.append(key)
143 
144  return tagNames
145 
146 
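A standalone illustration of the same logic on hypothetical contents of self.docs: a tag name is reported only if it was scraped for every item, i.e. its list is as long as the longest one.

docs = {"title": ["d1", "d2", "d3"], "price": ["d4"]}
maxCount = max(len(v) for v in docs.values())                  # 3, as getMaxCount() returns
tagNames = [k for k, v in docs.items() if len(v) == maxCount]
print(tagNames)   # ['title'], the only tag present in all scraped items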

◆ updateTagValue()

def dc_processor.ScraperMultiItemsTask.ScraperResultDocuments.updateTagValue (   self,
  result,
  tags,
  tag_name 
)

Definition at line 340 of file ScraperMultiItemsTask.py.

340  def updateTagValue(self, result, tags, tag_name):
341 
342  data = {"extractor":"Base extractor", "data":"", "name":""}
343  data["data"] = [result.tags[tag_name]["data"][0] + tags[tag_name]["data"][0]]
344  data["name"] = result.tags[tag_name]["name"]
345  data["xpath"] = result.tags[tag_name]["xpath"]
346  data["type"] = result.tags[tag_name]["type"]
347  data["extractor"] = result.tags[tag_name]["extractor"]
348  result.tags[tag_name] = data
349 
350  return result
351 
352 
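The merge rule: the first elements of the two tags' "data" lists are concatenated, and the remaining fields (name, xpath, type, extractor) are kept from the existing tag. A standalone sketch with hypothetical field values:

existing = {"data": ["First part "], "name": "title", "xpath": "//h2",
            "type": "text", "extractor": "Base extractor"}
incoming = {"data": ["second part"]}
merged = dict(existing, data=[existing["data"][0] + incoming["data"][0]])
print(merged["data"])   # ['First part second part']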

Member Data Documentation

◆ docs

dc_processor.ScraperMultiItemsTask.ScraperResultDocuments.docs

Definition at line 72 of file ScraperMultiItemsTask.py.

◆ etree

dc_processor.ScraperMultiItemsTask.ScraperResultDocuments.etree

Definition at line 76 of file ScraperMultiItemsTask.py.

◆ isExtract

dc_processor.ScraperMultiItemsTask.ScraperResultDocuments.isExtract

Definition at line 74 of file ScraperMultiItemsTask.py.

◆ join

dc_processor.ScraperMultiItemsTask.ScraperResultDocuments.join

Definition at line 73 of file ScraperMultiItemsTask.py.

◆ mandatory

dc_processor.ScraperMultiItemsTask.ScraperResultDocuments.mandatory

Definition at line 75 of file ScraperMultiItemsTask.py.

◆ urlId

dc_processor.ScraperMultiItemsTask.ScraperResultDocuments.urlId

Definition at line 71 of file ScraperMultiItemsTask.py.


The documentation for this class was generated from the following file:
ScraperMultiItemsTask.py