HCE Project: Python-language Distributed Tasks Manager application, Distributed Crawler application, and client API bindings. Version 2.0.0-chaika
Hierarchical Cluster Engine Python language binding
dc_processor.TemplateExtractorXPathPreparing.TemplateExtractorXPathPreparing Class Reference
Inheritance diagram for dc_processor.TemplateExtractorXPathPreparing.TemplateExtractorXPathPreparing:
Collaboration diagram for dc_processor.TemplateExtractorXPathPreparing.TemplateExtractorXPathPreparing:

Public Member Functions

def __init__ (self, innerTextTagReplacers=None, attrConditions=None)
 
def resolveDelimiter (self, path, properties, defautlDelimiter=" ")
 
def resolveInnerDelimiter (self, path, properties, defautlDelimiter=" ")
 
def process (self, path, sel, delimiter=' ', innerDelimiter=' ', innerTextFunc=Utils.innerText)
 
def getXpathValueForDTime (self, initXpath, sel, innerTextFunc=Utils.innerText)
 
def extractXpathFromSelectorList (self, sList, localXpath, lambdaCall)
 

Public Attributes

 innerTextTagReplacers
 
 attrConditions
 

Detailed Description

Definition at line 16 of file TemplateExtractorXPathPreparing.py.

Constructor & Destructor Documentation

◆ __init__()

def dc_processor.TemplateExtractorXPathPreparing.TemplateExtractorXPathPreparing.__init__ (   self,
  innerTextTagReplacers = None,
  attrConditions = None 
)

Definition at line 18 of file TemplateExtractorXPathPreparing.py.

def __init__(self, innerTextTagReplacers=None, attrConditions=None):
    """Remember the two optional settings on the instance.

    innerTextTagReplacers -- stored as-is in ``self.innerTextTagReplacers``;
      defaults to None.
    attrConditions -- stored as-is in ``self.attrConditions``; defaults to
      None.

    Neither value is validated or copied here.
    """
    self.attrConditions = attrConditions
    self.innerTextTagReplacers = innerTextTagReplacers
21 
22 
def __init__(self)
constructor
Definition: UIDGenerator.py:19

Member Function Documentation

◆ extractXpathFromSelectorList()

def dc_processor.TemplateExtractorXPathPreparing.TemplateExtractorXPathPreparing.extractXpathFromSelectorList (   self,
  sList,
  localXpath,
  lambdaCall 
)

Definition at line 183 of file TemplateExtractorXPathPreparing.py.

def extractXpathFromSelectorList(self, sList, localXpath, lambdaCall):
    """Return the values extracted from the first accepted element of sList.

    sList -- iterable of selector-like elements; each accepted element must
      offer ``xpath(expr)`` whose result offers ``extract()``.
    localXpath -- expression evaluated on the accepted element.
    lambdaCall -- predicate called with each element in order; only the
      first element for which it returns a truthy value is extracted.

    Returns the extracted list, or ``[]`` when no element is accepted. When
    more than one value is extracted and at least one of them begins with an
    ASCII digit, only the first value is kept.
    """
    # NOTE(review): the search stops at the first accepted element; confirm
    # the position of `break` against TemplateExtractorXPathPreparing.py:190.
    for elem in sList:
        if not lambdaCall(elem):
            continue
        ret = elem.xpath(localXpath).extract()
        # Test the first character only. Comparing the whole string against
        # '0'..'9' is lexicographic, so "9 Jan 2015" > "9" and a value that
        # starts with '9' would be missed.
        # NOTE(review): "begins with a digit" is the narrowest reading of
        # the original range test; confirm "contains a digit" was not meant.
        if len(ret) > 1 and any('0' <= value[:1] <= '9' for value in ret):
            ret = [ret[0]]
        return ret
    return []
192 
Here is the caller graph for this function:

◆ getXpathValueForDTime()

def dc_processor.TemplateExtractorXPathPreparing.TemplateExtractorXPathPreparing.getXpathValueForDTime (   self,
  initXpath,
  sel,
  innerTextFunc = Utils.innerText 
)

Definition at line 159 of file TemplateExtractorXPathPreparing.py.

def getXpathValueForDTime(self, initXpath, sel, innerTextFunc=Utils.innerText):
    """Extract a datetime value for initXpath, trying three sources in order.

    1. ``@content`` of the first selected node whose markup starts with
       ``<meta``.
    2. ``@datetime`` of the first selected node whose markup starts with
       ``<time``.
    3. The inner text of the selection, as produced by innerTextFunc.

    initXpath -- expression evaluated with ``sel.xpath``; returned unchanged.
    sel -- selector object offering ``xpath(expr)``.
    innerTextFunc -- callable invoked as ``innerTextFunc(selection, ' ', ' ',
      self.innerTextTagReplacers, None, self.attrConditions)``.

    Returns ``(initXpath, values)``. ``values`` is the first non-empty
    attribute result, else a one-element list holding the inner text when it
    is not ``''``, else the empty result of the ``@datetime`` attempt.
    """
    selection = sel.xpath(initXpath)
    attempts = (
        (">>> Datetime | Meta extraction", "@content", "<meta"),
        (">>> Datetime | any tag @datetime argument extraction", "@datetime", "<time"),
    )
    values = []
    for message, attrXpath, tagPrefix in attempts:
        logger.info(message)
        # Bind the prefix as a default so each predicate keeps its own tag.
        values = self.extractXpathFromSelectorList(
            selection, attrXpath,
            lambda elem, prefix=tagPrefix: elem.extract().startswith(prefix))
        if len(values) != 0:
            return initXpath, values

    logger.info(">>> Datetime | inner Text Extraction")
    innerText = innerTextFunc(selection, ' ', ' ', self.innerTextTagReplacers, None, self.attrConditions)
    if innerText != '':
        values = [innerText]
    return initXpath, values
175 
176 
Here is the call graph for this function:
Here is the caller graph for this function:

◆ process()

def dc_processor.TemplateExtractorXPathPreparing.TemplateExtractorXPathPreparing.process (   self,
  path,
  sel,
  delimiter = ' ',
  innerDelimiter = ' ',
  innerTextFunc = Utils.innerText 
)

Definition at line 67 of file TemplateExtractorXPathPreparing.py.

# process(): resolve one template rule against a selector.
#
# path           -- rule mapping; "type" and "target" are always read, "format" is read for
#                   the "image", "link" and "attribute" types.
# sel            -- selector object; sel.xpath(expr) must return something with .extract().
# delimiter,
# innerDelimiter -- forwarded unchanged to innerTextFunc.
# innerTextFunc  -- called as innerTextFunc(selection, delimiter, innerDelimiter,
#                   self.innerTextTagReplacers, None, self.attrConditions).
#
# Returns the pair (xpath, xpathValue). A branch may set either one; whatever is still None
# afterwards is filled by the generic fallback at the bottom (xpath = path["target"],
# xpathValue = sel.xpath(xpath).extract()).
#
# NOTE(review): this listing has lost its indentation, so the notes below describe statement
# order only; confirm nesting against TemplateExtractorXPathPreparing.py.
67  def process(self, path, sel, delimiter=' ', innerDelimiter=' ', innerTextFunc=Utils.innerText):
68  xpath = None
69  xpathValue = None
70  # Added new template type specification
71  #---> "text" rule type.
72  if path["type"] == "text":
73  localXpath = sel.xpath(path["target"])
74  xpathValue = innerTextFunc(localXpath, delimiter, innerDelimiter, self.innerTextTagReplacers, None,
75  self.attrConditions)
76  xpath = path["target"]
77  #---> "datetime" rule type.
78  elif path["type"] == "datetime":
79  xpath, xpathValue = self.getXpathValueForDTime(path["target"], sel, innerTextFunc)
80  logger.info(">>> final XPath = " + str(xpathValue))
81  #---> "image" rule type.
82  elif path["type"] == "image":
83  logger.info(">>> img format = " + str(path["format"]))
# Attribute suffixes are only added when the target's first character is not listed in
# SelectorWrapper.CSS_DETECT_SYMBOLS (presumably the markers of a CSS selector -- confirm).
84  if path["target"][0] not in SelectorWrapper.CSS_DETECT_SYMBOLS:
85  if path["format"] == "URL":
# A target that already contains '/@' is left alone and handled by the generic fallback.
86  if path["target"].find('/@') == -1:
87  localXPathPattern = '/@%s'
88  subXPathesList = ['src', 'srcset', 'image-src']
89  xpath, xpathValue = Utils.getFirstNotEmptySubXPath(path["target"], sel, localXPathPattern, subXPathesList)
90  elif path["format"] == "DATA":
# Nothing is set for DATA: xpath and xpathValue stay None for the generic fallback.
91  pass
92  elif path["format"] == "ALT":
93  xpath = path["target"] + "/@alt"
94  elif path["format"] == "TITLE":
95  xpath = path["target"] + "/@title"
96  #---> "html" rule type.
97  elif path["type"] == "html":
# Nothing is set for "html": the target is extracted as-is by the generic fallback.
98  pass
99  #---> "link" rule type.
100  elif path["type"] == "link":
101  if path["target"][0] not in SelectorWrapper.CSS_DETECT_SYMBOLS:
102  formatName = path["format"]
# "format" may be a comma-separated string; when it has more than one field the second is used.
103  if len(formatName.split(',')) > 1:
104  formatName = formatName.split(',')[1]
105  if formatName == "email-text":
106  isEmail = False
# NOTE(review): localXpath is assigned only when the isTailSubstr check fails (presumably
# "target does not already end with /@href" -- confirm in Utils). If line 109 is not nested
# under that check, a target ending in "/@href" reaches it with localXpath unbound.
107  if not Utils.isTailSubstr(path["target"], "/@href"):
108  localXpath = path["target"] + "/@href"
109  xpathValue = sel.xpath(localXpath).extract()
110  for xpathValueElem in xpathValue:
# basestring exists only in Python 2.
111  if isinstance(xpathValueElem, basestring) and xpathValueElem.find("mailto:") >= 0:
112  isEmail = True
113  break
# With a "mailto:" href found the value becomes the target's inner text; otherwise it is [].
114  if isEmail:
115  localXpath = sel.xpath(path["target"])
116  xpathValue = innerTextFunc(localXpath, delimiter, innerDelimiter, self.innerTextTagReplacers, None,
117  self.attrConditions)
118  else:
119  xpathValue = []
120  xpath = path["target"]
121  else:
122  if not Utils.isTailSubstr(path["target"], "/@href"):
123  xpath = path["target"] + "/@href"
124  #---> "attribute" rule type.
125  elif path["type"] == "attribute":
126  if path["format"] == "":
127  xpathValue = []
128  else:
129  splittedFormatString = path["format"].split(',')
130  attrName = None
# The attribute name is the second comma-separated field of "format" when present, else the first.
131  if len(splittedFormatString) >= 2:
132  attrName = splittedFormatString[1]
133  else:
134  attrName = splittedFormatString[0]
# True when attrName is absent from the target or its last occurrence is not at the very end,
# i.e. the target does not already end with attrName; only then is "/@<attrName>" appended.
135  if path["target"].rfind(attrName) == -1 or \
136  (len(path["target"]) - len(attrName)) != path["target"].rfind(attrName):
137  xpath = path["target"]
138  xpath += "/@"
139  xpath += attrName
# Generic fallback, part 1: no branch chose an xpath, so use the target itself.
140  if xpath is None:
141  xpath = path["target"]
142 
# Generic fallback, part 2: no branch produced a value, so extract with the chosen xpath.
# Any exception is logged at info level and turned into an empty list.
143  if xpathValue is None:
144  try:
145  xpathValue = sel.xpath(xpath).extract()
146  except Exception as excp:
147  logger.info(">>> Common xPath extractor exception=" + str(excp))
148  xpathValue = []
149 
150  return xpath, xpathValue
151 
152 
Here is the call graph for this function:
Here is the caller graph for this function:

◆ resolveDelimiter()

def dc_processor.TemplateExtractorXPathPreparing.TemplateExtractorXPathPreparing.resolveDelimiter (   self,
  path,
  properties,
  defautlDelimiter = " " 
)

Definition at line 28 of file TemplateExtractorXPathPreparing.py.

def resolveDelimiter(self, path, properties, defautlDelimiter=" "):
    """Choose the delimiter for a rule; the first matching source wins.

    1. ``path["delimiter"]`` when that key exists and its value is truthy.
    2. ``properties["SCRAPER_TAG_ITEMS_DELIMITER"]`` when that key exists.
    3. A type-based fallback: ``' '`` when ``path["type"]`` is "text" or
       "html", ``','`` for any other type.

    path -- rule mapping; "type" is required whenever steps 1 and 2 do not
      apply.
    properties -- container checked for the override key.
    defautlDelimiter -- kept for interface compatibility only. Every outcome
      above supplies its own value, so this argument never influences the
      result. The spelling is part of the public signature.

    Returns the selected delimiter.
    Raises KeyError when step 3 is reached and path has no "type" entry.
    """
    if "delimiter" in path and path["delimiter"]:
        return path["delimiter"]
    if "SCRAPER_TAG_ITEMS_DELIMITER" in properties:
        return properties["SCRAPER_TAG_ITEMS_DELIMITER"]
    return ' ' if path["type"] in ("text", "html") else ','
39 
40 

◆ resolveInnerDelimiter()

def dc_processor.TemplateExtractorXPathPreparing.TemplateExtractorXPathPreparing.resolveInnerDelimiter (   self,
  path,
  properties,
  defautlDelimiter = " " 
)

Definition at line 46 of file TemplateExtractorXPathPreparing.py.

def resolveInnerDelimiter(self, path, properties, defautlDelimiter=" "):
    """Choose the sub-item delimiter for a rule; the first matching source wins.

    1. ``path["delimiter_sub_items"]`` when that key exists and its value is
       truthy.
    2. ``properties["SCRAPER_TAG_ITEMS_INNER_DELIMITER"]`` when that key
       exists.
    3. A type-based fallback: ``' '`` when ``path["type"]`` is "text" or
       "html", ``','`` for any other type.

    path -- rule mapping; "type" is required whenever steps 1 and 2 do not
      apply.
    properties -- container checked for the override key.
    defautlDelimiter -- kept for interface compatibility only. Every outcome
      above supplies its own value, so this argument never influences the
      result. The spelling is part of the public signature.

    Returns the selected delimiter.
    Raises KeyError when step 3 is reached and path has no "type" entry.
    """
    if "delimiter_sub_items" in path and path["delimiter_sub_items"]:
        return path["delimiter_sub_items"]
    if "SCRAPER_TAG_ITEMS_INNER_DELIMITER" in properties:
        return properties["SCRAPER_TAG_ITEMS_INNER_DELIMITER"]
    return ' ' if path["type"] in ("text", "html") else ','
57 
58 

Member Data Documentation

◆ attrConditions

dc_processor.TemplateExtractorXPathPreparing.TemplateExtractorXPathPreparing.attrConditions

Definition at line 20 of file TemplateExtractorXPathPreparing.py.

◆ innerTextTagReplacers

dc_processor.TemplateExtractorXPathPreparing.TemplateExtractorXPathPreparing.innerTextTagReplacers

Definition at line 19 of file TemplateExtractorXPathPreparing.py.


The documentation for this class was generated from the following file: