HCE Project Python language Distributed Tasks Manager Application, Distributed Crawler Application and client API bindings.  2.0.0-chaika
Hierarchical Cluster Engine Python language binding
TemplateExtractorXPathPreparing.py
Go to the documentation of this file.
1 '''
2 @package: dc
3 @author scorp
4 @link: http://hierarchical-cluster-engine.com/
5 @copyright: Copyright © 2013-2014 IOIX Ukraine
6 @license: http://hierarchical-cluster-engine.com/license/
7 @since: 0.1
8 '''
9 
10 # import dc_processor.Constants as Constants
11 from app.SelectorWrapper import SelectorWrapper
12 import app.Utils as Utils # pylint: disable=F0401
13 
14 logger = Utils.MPLogger().getLogger()
15 
17 
def __init__(self, innerTextTagReplacers=None, attrConditions=None):
    """Store the options that are later forwarded to the inner-text extraction function.

    @param innerTextTagReplacers - value passed through unchanged to innerTextFunc calls
    @param attrConditions - value passed through unchanged to innerTextFunc calls
    """
    # Both values are only stored here; they are consumed by process() and
    # getXpathValueForDTime() when those methods call innerTextFunc.
    self.innerTextTagReplacers, self.attrConditions = innerTextTagReplacers, attrConditions
21 
22 
23  # #resolveDelimiter method resolves the content delimiter based on the path["delimiter"] value
24  #
25  # @param path - path container from incoming json
26  # @param defautlDelimiter - incoming default delimiter
27  # @return just resolved delimiter
def resolveDelimiter(self, path, properties, defautlDelimiter=" "):
    """Pick the delimiter used to join extracted items of one rule.

    Lookup order:
      1. a non-empty path["delimiter"];
      2. properties["SCRAPER_TAG_ITEMS_DELIMITER"], when that key is present;
      3. a single space for the "text" and "html" rule types, a comma otherwise.

    @param path - rule container; "delimiter" is optional, "type" is read when
                  neither of the first two sources applies
    @param properties - mapping that may hold "SCRAPER_TAG_ITEMS_DELIMITER"
    @param defautlDelimiter - kept for interface compatibility; one of the three
                              steps above always produces the result, so this
                              value is never returned
    @return the resolved delimiter
    """
    if "delimiter" in path and path["delimiter"]:
        return path["delimiter"]
    if "SCRAPER_TAG_ITEMS_DELIMITER" in properties:
        return properties["SCRAPER_TAG_ITEMS_DELIMITER"]
    # Type-based fallback: textual content is space separated, everything else comma separated.
    return ' ' if path["type"] in ("text", "html") else ','
39 
40 
41  # #resolveInnerDelimiter method resolves the content innerDelimiter based on the path["delimiter_sub_items"] value
42  #
43  # @param path - path container from incoming json
44  # @param defautlDelimiter - incoming default delimiter
45  # @return just resolved delimiter
def resolveInnerDelimiter(self, path, properties, defautlDelimiter=" "):
    """Pick the delimiter used between the sub-items of one extracted item.

    Lookup order:
      1. a non-empty path["delimiter_sub_items"];
      2. properties["SCRAPER_TAG_ITEMS_INNER_DELIMITER"], when that key is present;
      3. a single space for the "text" and "html" rule types, a comma otherwise.

    @param path - rule container; "delimiter_sub_items" is optional, "type" is
                  read when neither of the first two sources applies
    @param properties - mapping that may hold "SCRAPER_TAG_ITEMS_INNER_DELIMITER"
    @param defautlDelimiter - kept for interface compatibility; one of the three
                              steps above always produces the result, so this
                              value is never returned
    @return the resolved inner delimiter
    """
    if "delimiter_sub_items" in path and path["delimiter_sub_items"]:
        return path["delimiter_sub_items"]
    if "SCRAPER_TAG_ITEMS_INNER_DELIMITER" in properties:
        return properties["SCRAPER_TAG_ITEMS_INNER_DELIMITER"]
    # Type-based fallback: textual content is space separated, everything else comma separated.
    return ' ' if path["type"] in ("text", "html") else ','
57 
58 
59  # #process main class's functional method
60  #
61  # @param path - path container from incoming json
62  # @param sel - incoming x-path selector
63  # @param delimiter - delimiter used for processing
64  # @param innerDelimiter - inner delimiter used for processing
65  # @param innerTextFunc - function pointer used innerText extraction algorithm
66  # @return tuple of extracted (xpath, xpathValue) elements
def process(self, path, sel, delimiter=' ', innerDelimiter=' ', innerTextFunc=Utils.innerText):
    """Derive the effective XPath for one template rule and extract its value.

    The rule kind comes from path["type"]:
      "text"      - value is innerTextFunc applied to the nodes selected by the target;
      "datetime"  - delegated to getXpathValueForDTime();
      "image"     - for the "URL" format the first non-empty of the src / srcset /
                    image-src attributes is taken (only when the target holds no "/@");
                    "ALT" and "TITLE" append the matching attribute; other formats
                    leave the target as is;
      "html"      - target is used unchanged;
      "link"      - "email-text" format returns the link text when an href contains
                    "mailto:", otherwise "/@href" is appended when not already present;
      "attribute" - the attribute named in path["format"] is appended unless the
                    target already ends with it; an empty format yields [].
    For "image" and "link" the target is rewritten only when its first character
    is not listed in SelectorWrapper.CSS_DETECT_SYMBOLS (presumably CSS selectors
    are recognised that way - confirm against SelectorWrapper).

    @param path - rule container with "type", "target" and, for some types, "format"
    @param sel - selector object exposing xpath()
    @param delimiter - delimiter forwarded to innerTextFunc
    @param innerDelimiter - inner delimiter forwarded to innerTextFunc
    @param innerTextFunc - callable performing the inner text extraction
    @return tuple (xpath, xpathValue)
    """
    ruleType = path["type"]
    xpath = None
    xpathValue = None

    if ruleType == "text":
        nodes = sel.xpath(path["target"])
        xpathValue = innerTextFunc(nodes, delimiter, innerDelimiter, self.innerTextTagReplacers, None,
                                   self.attrConditions)
        xpath = path["target"]

    elif ruleType == "datetime":
        xpath, xpathValue = self.getXpathValueForDTime(path["target"], sel, innerTextFunc)
        logger.info(">>> final XPath = " + str(xpathValue))

    elif ruleType == "image":
        imageFormat = path["format"]
        logger.info(">>> img format = " + str(imageFormat))
        target = path["target"]
        if target[0] not in SelectorWrapper.CSS_DETECT_SYMBOLS:
            if imageFormat == "URL":
                # An explicit attribute step in the target is respected as is.
                if '/@' not in target:
                    xpath, xpathValue = Utils.getFirstNotEmptySubXPath(target, sel, '/@%s',
                                                                       ['src', 'srcset', 'image-src'])
            elif imageFormat == "ALT":
                xpath = target + "/@alt"
            elif imageFormat == "TITLE":
                xpath = target + "/@title"
            # "DATA" (and any other format) needs no XPath adjustment.

    elif ruleType == "link":
        target = path["target"]
        if target[0] not in SelectorWrapper.CSS_DETECT_SYMBOLS:
            formatParts = path["format"].split(',')
            # With a comma separated format the second item is the one that matters.
            formatName = formatParts[1] if len(formatParts) > 1 else formatParts[0]
            if formatName == "email-text":
                isEmail = False
                if not Utils.isTailSubstr(target, "/@href"):
                    xpathValue = sel.xpath(target + "/@href").extract()
                    isEmail = any(isinstance(href, basestring) and "mailto:" in href for href in xpathValue)
                if isEmail:
                    xpathValue = innerTextFunc(sel.xpath(target), delimiter, innerDelimiter,
                                               self.innerTextTagReplacers, None, self.attrConditions)
                else:
                    xpathValue = []
                xpath = target
            elif not Utils.isTailSubstr(target, "/@href"):
                xpath = target + "/@href"

    elif ruleType == "attribute":
        formatString = path["format"]
        if formatString == "":
            xpathValue = []
        else:
            formatParts = formatString.split(',')
            attrName = formatParts[1] if len(formatParts) >= 2 else formatParts[0]
            target = path["target"]
            # Append the attribute step only when the target does not already end with its name.
            if not target.endswith(attrName):
                xpath = target + "/@" + attrName

    # The "html" rule type, and every branch that left the XPath unset, uses the raw target.
    if xpath is None:
        xpath = path["target"]

    # Branches that did not compute a value fall back to a plain XPath extraction.
    if xpathValue is None:
        try:
            xpathValue = sel.xpath(xpath).extract()
        except Exception as excp:
            logger.info(">>> Common xPath extractor exception=" + str(excp))
            xpathValue = []

    return xpath, xpathValue
151 
152 
153  # #getXpathValueForDTime special method for data extraction in datetime cases
154  #
155  # @param initXpath - initial element's xPath
156  # @param sel - incoming selector
157  # @param innerTextFunc - function pointer used innerText extraction algorithm
158  # @return tuple of extracted (xpath, xpathValue) elements
def getXpathValueForDTime(self, initXpath, sel, innerTextFunc=Utils.innerText):
    """Extract a datetime value from the nodes selected by initXpath.

    Three sources are tried in order, stopping at the first that yields something:
      1. the @content attribute of a selected node whose markup starts with "<meta";
      2. the @datetime attribute of a selected node whose markup starts with "<time";
      3. the inner text of the selected nodes, wrapped in a one-element list
         when it is not the empty string.

    @param initXpath - initial element's XPath; returned unchanged
    @param sel - selector object exposing xpath()
    @param innerTextFunc - callable performing the inner text extraction
    @return tuple (xpath, xpathValue)
    """
    def isMetaTag(elem):
        return elem.extract().startswith("<meta")

    def isTimeTag(elem):
        return elem.extract().startswith("<time")

    nodes = sel.xpath(initXpath)

    logger.info(">>> Datetime | Meta extraction")
    values = self.extractXpathFromSelectorList(nodes, "@content", isMetaTag)

    if len(values) == 0:
        logger.info(">>> Datetime | any tag @datetime argument extraction")
        values = self.extractXpathFromSelectorList(nodes, "@datetime", isTimeTag)

    if len(values) == 0:
        logger.info(">>> Datetime | inner Text Extraction")
        text = innerTextFunc(nodes, ' ', ' ', self.innerTextTagReplacers, None, self.attrConditions)
        if text != '':
            values = [text]

    return initXpath, values
175 
176 
177  # #extractXpathFromSelectorList returns first xPath in case of selector List
178  #
179  # @param sList - incoming selectors list
180  # @param localXpath - incoming xPath
181  # @param lambdaCall - boolean lambda call for checking elements in sList
182  # @return extracted xPath
def extractXpathFromSelectorList(self, sList, localXpath, lambdaCall):
    """Extract localXpath from the first element of sList accepted by lambdaCall.

    Only the first accepted element is examined; later elements are ignored.
    When that element yields more than one value and at least one of the values
    begins with a decimal digit, the result is narrowed to the first value.

    @param sList - iterable of selector elements exposing xpath()
    @param localXpath - XPath evaluated relative to the accepted element
    @param lambdaCall - predicate taking an element and returning a truthy value
                        for the element to use
    @return list of extracted values, or [] when no element is accepted
    """
    ret = []
    for elem in sList:
        if not lambdaCall(elem):
            continue
        ret = elem.xpath(localXpath).extract()
        # Compare only the first character: comparing the whole string against '9'
        # rejected every multi-character value starting with '9' ("9 March" > "9").
        if len(ret) > 1 and any('0' <= value[:1] <= '9' for value in ret):
            ret = [ret[0]]
        break
    return ret
def process(self, path, sel, delimiter=' ', innerDelimiter=' ', innerTextFunc=Utils.innerText)