4 @link: http://hierarchical-cluster-engine.com/ 5 @copyright: Copyright © 2013-2014 IOIX Ukraine 6 @license: http://hierarchical-cluster-engine.com/license/ 18 def __init__(self, innerTextTagReplacers=None, attrConditions=None):
29 ret = defautlDelimiter
30 if "delimiter" in path
and path[
"delimiter"]:
31 ret = path[
"delimiter"]
32 elif "SCRAPER_TAG_ITEMS_DELIMITER" in properties:
33 ret = properties[
"SCRAPER_TAG_ITEMS_DELIMITER"]
34 elif path[
"type"] ==
"text" or path[
"type"] ==
"html":
47 ret = defautlDelimiter
48 if "delimiter_sub_items" in path
and path[
"delimiter_sub_items"]:
49 ret = path[
"delimiter_sub_items"]
50 elif "SCRAPER_TAG_ITEMS_INNER_DELIMITER" in properties:
51 ret = properties[
"SCRAPER_TAG_ITEMS_INNER_DELIMITER"]
52 elif path[
"type"] ==
"text" or path[
"type"] ==
"html":
67 def process(self, path, sel, delimiter=' ', innerDelimiter=' ', innerTextFunc=Utils.innerText):
72 if path[
"type"] ==
"text":
73 localXpath = sel.xpath(path[
"target"])
76 xpath = path[
"target"]
78 elif path[
"type"] ==
"datetime":
80 logger.info(
">>> final XPath = " + str(xpathValue))
82 elif path[
"type"] ==
"image":
83 logger.info(
">>> img format = " + str(path[
"format"]))
84 if path[
"target"][0]
not in SelectorWrapper.CSS_DETECT_SYMBOLS:
85 if path[
"format"] ==
"URL":
86 if path[
"target"].find(
'/@') == -1:
87 localXPathPattern =
'/@%s' 88 subXPathesList = [
'src',
'srcset',
'image-src']
89 xpath, xpathValue = Utils.getFirstNotEmptySubXPath(path[
"target"], sel, localXPathPattern, subXPathesList)
90 elif path[
"format"] ==
"DATA":
92 elif path[
"format"] ==
"ALT":
93 xpath = path[
"target"] +
"/@alt" 94 elif path[
"format"] ==
"TITLE":
95 xpath = path[
"target"] +
"/@title" 97 elif path[
"type"] ==
"html":
100 elif path[
"type"] ==
"link":
101 if path[
"target"][0]
not in SelectorWrapper.CSS_DETECT_SYMBOLS:
102 formatName = path[
"format"]
103 if len(formatName.split(
',')) > 1:
104 formatName = formatName.split(
',')[1]
105 if formatName ==
"email-text":
107 if not Utils.isTailSubstr(path[
"target"],
"/@href"):
108 localXpath = path[
"target"] +
"/@href" 109 xpathValue = sel.xpath(localXpath).extract()
110 for xpathValueElem
in xpathValue:
111 if isinstance(xpathValueElem, basestring)
and xpathValueElem.find(
"mailto:") >= 0:
115 localXpath = sel.xpath(path[
"target"])
120 xpath = path[
"target"]
122 if not Utils.isTailSubstr(path[
"target"],
"/@href"):
123 xpath = path[
"target"] +
"/@href" 125 elif path[
"type"] ==
"attribute":
126 if path[
"format"] ==
"":
129 splittedFormatString = path[
"format"].split(
',')
131 if len(splittedFormatString) >= 2:
132 attrName = splittedFormatString[1]
134 attrName = splittedFormatString[0]
135 if path[
"target"].rfind(attrName) == -1
or \
136 (len(path[
"target"]) - len(attrName)) != path[
"target"].rfind(attrName):
137 xpath = path[
"target"]
141 xpath = path[
"target"]
143 if xpathValue
is None:
145 xpathValue = sel.xpath(xpath).extract()
146 except Exception
as excp:
147 logger.info(
">>> Common xPath extractor exception=" + str(excp))
150 return xpath, xpathValue
161 localXpath = sel.xpath(xpath)
162 logger.info(
">>> Datetime | Meta extraction")
164 lambda elem: elem.extract().find(
"<meta") == 0)
165 if len(xpathValue) == 0:
166 logger.info(
">>> Datetime | any tag @datetime argument extraction")
168 lambda elem: elem.extract().find(
"<time") == 0)
169 if len(xpathValue) == 0:
170 logger.info(
">>> Datetime | inner Text Extraction")
173 xpathValue = [localStr]
174 return xpath, xpathValue
187 ret = elem.xpath(localXpath).extract()
188 if len(ret) > 1
and any(
True for ch
in ret
if ch >=
'0' and ch <=
'9'):