2 Created on Mar 28, 2014 6 @link: http://hierarchical-cluster-engine.com/ 7 @copyright: Copyright © 2013-2014 IOIX Ukraine 8 @license: http://hierarchical-cluster-engine.com/license/ 17 logger = logging.getLogger(APP_CONSTS.LOGGER_NAME)
22 NONE_CLOSED_HTML_TAGS = [
'area',
'base',
'br',
'col',
'command',
'embed',
'hr',
'img',
'input',
'keygen',
'link',
23 'meta',
'param',
'source',
'track',
'wbr']
25 CANONIZATION_TAGS = [
'href',
'src']
27 MACRO_ATTRIBUTES =
'%ATTRIBUTES%' 29 PATTERN_CLOSE_VOID =
r"<%s.*?(/)>" 31 CLOSE_VOID_NOT_CLOSE=0
35 def __init__(self, tagReplacers=None, delimiter=' ', innerDelimiter=' ', REconditions=None, attrConditions=None,
36 keepAttributes=None, baseUrl=None, closeVoid=None, excludeNodes=None):
52 openTagName = str(nodeElem.xpath(
"name()")[0].extract())
62 elif openTagName !=
"":
79 closeTagName = str(nodeElem.xpath(
"name()")[0].extract())
82 self.
stripHtml +=
'</' + closeTagName +
'>' 84 closeTag =
"</" + closeTagName +
">" 85 if (len(str(nodeElem.extract())) >= len(closeTag))
and \
86 str(nodeElem.extract()).rfind(closeTag) == (len(str(nodeElem.extract())) - len(closeTag)):
87 closeTagName =
'/' + closeTagName
95 buff = str(nodeElem.extract())
96 if buff.strip() !=
"":
97 for excludeTag
in excludeTags:
99 pattern =
'<' + excludeTag +
'.*>' 100 buff = re.sub(pattern=pattern, repl=
'', string=buff, flags=re.I + re.U + re.M)
109 def innerText(self, contentBuf, xPath, tagRemoves=None):
112 if xPath
is not None:
113 if tagRemoves
is None:
114 tagRemoves = [
'script',
'style',
'']
116 if isinstance(xPath, basestring):
118 selectorElem = sel.xpath(xPath)
123 for elem
in selectorElem:
130 elemList.append(elem)
137 except Exception
as excp:
139 logger.error(
"!!! Exception: %s", str(self.
errorString))
141 logger.info(Utils.getTracebackInfo())
147 if attrConditions
is not None:
148 if attrConditions[
"TYPE"] ==
"include":
151 attrList = elem.xpath(
"@*")
152 if len(attrList) > 0:
153 for internalElem
in attrList:
154 for key
in attrConditions:
155 attrName =
"".
join(elem.xpath(
"name(@*[%s])" % str(i)).extract())
156 if key !=
"type" and (key ==
"*" or key == attrName)
and \
157 re.compile(attrConditions[key]).match(internalElem.extract()):
161 elif "NO_ATTRIBUTES" in attrConditions:
167 def traversalNodes(elemList, level=0, nodeCallbackOpen=None, nodeCallbackClose=None, textCallback=None,
168 excludeTags=None, attrConditions=None, excludeNodes=None):
169 if excludeTags
is None:
170 excludeTags = [
'script',
'style',
'']
175 for elem
in elemList:
177 if not ExtendInnerText.checkElemAttributes(attrConditions, elem):
180 if ExtendInnerText.isExcludeNode(excludeNodes, elem):
183 if len(elem.xpath(
"name()")) > 0:
184 if nodeCallbackOpen
is not None and str(elem.xpath(
"name()")[0].extract())
not in excludeTags:
185 nodeCallbackOpen(elem, level)
188 if str(elem.xpath(
"name()")[0].extract())
not in excludeTags:
189 ExtendInnerText.traversalNodes(elem.xpath(
"node()"), level + 1, nodeCallbackOpen, nodeCallbackClose,
190 textCallback, excludeTags, attrConditions, excludeNodes)
191 if nodeCallbackClose
is not None and str(elem.xpath(
"name()")[0].extract())
not in excludeTags:
192 nodeCallbackClose(elem, level)
194 if textCallback
is not None:
195 textCallback(elem, level, excludeTags)
202 if xPath
is not None:
203 if tagRemoves
is None:
204 tagRemoves = [
'script',
'style',
'']
206 if isinstance(xPath, types.StringTypes):
208 selectorElem = sel.xpath(xPath)
212 for elem
in selectorElem:
219 elemList.append(elem)
226 except Exception
as excp:
242 if keepAttributes
is not None and tagName
in keepAttributes.keys():
243 attrList = keepAttributes[tagName]
245 for attrName
in attrList:
246 value = nodeElem.xpath(
'@' + attrName).extract()
248 if len(value) > 0
and value[0] !=
"":
252 values.append(attrName +
'="' + value[0].replace(
'\n',
' ').replace(
'"',
'\\\"') +
'"')
255 ret =
' ' +
' '.
join(values)
284 logger.info(
"!!!!! nodeElem.extract(): '%s'", str(nodeElem.extract()))
286 logger.info(
"!!!!! pattern: '%s'", str(pattern))
287 res = nodeElem.re(pattern)
288 logger.info(
"!!!!! nodeElem.re(pattern): '%s'", str(res))
306 if len(elem.xpath(
"name()")) > 0:
307 nodeName = str(elem.xpath(
"name()")[0].extract())
312 if isinstance(excludeNodes, list):
313 for excludeNode
in excludeNodes:
314 if isinstance(excludeNode, dict):
315 for tagName, attributes
in excludeNode.items():
317 if Utils.reMatch(tagName, nodeName, logger):
319 if attributes
is None:
320 logger.debug(
"Found exclude node rule for '%s' with attributes: %s", str(tagName), str(attributes))
324 if isinstance(attributes, dict):
325 for attrName, attrValue
in attributes.items():
327 values = elem.xpath(
'@' + attrName).extract()
332 if Utils.reMatch(attrValue, value, logger):
333 logger.debug(
"Found exclude node rule for '%s' with attributes: %s", str(tagName), str(attributes))
def textCallbackHandler(self, nodeElem, level, excludeTags)
def innerTextToList(self, contentBuf, xPath, tagRemoves=None)
def applyCloseVoid(self, nodeElem, tagName)
def checkElemAttributes(attrConditions, elem)
def extractAttributes(self, nodeElem, tagName, keepAttributes, baseUrl)
def isExcludeNode(excludeNodes, elem)
check whether the node is excluded
def __init__(self, tagReplacers=None, delimiter=' ', innerDelimiter=' ', REconditions=None, attrConditions=None, keepAttributes=None, baseUrl=None, closeVoid=None, excludeNodes=None)
list NONE_CLOSED_HTML_TAGS
def innerText(self, contentBuf, xPath, tagRemoves=None)
def traversalNodes(elemList, level=0, nodeCallbackOpen=None, nodeCallbackClose=None, textCallback=None, excludeTags=None, attrConditions=None, excludeNodes=None)
def urlNormalization(base, url, supportProtocols=None, log=None)
string PATTERN_CLOSE_VOID
def nodeCallbackOpenHandler(self, nodeElem, level)
def nodeCallbackCloseHandler(self, nodeElem, level)