HCE Project Python language Distributed Tasks Manager Application, Distributed Crawler Application and client API bindings.  2.0.0-chaika
Hierarchical Cluster Engine Python language binding
app.ExtendInnerText.ExtendInnerText Class Reference
Inheritance diagram for app.ExtendInnerText.ExtendInnerText:
Collaboration diagram for app.ExtendInnerText.ExtendInnerText:

Public Member Functions

def __init__ (self, tagReplacers=None, delimiter=' ', innerDelimiter=' ', REconditions=None, attrConditions=None, keepAttributes=None, baseUrl=None, closeVoid=None, excludeNodes=None)
 
def nodeCallbackOpenHandler (self, nodeElem, level)
 
def nodeCallbackCloseHandler (self, nodeElem, level)
 
def textCallbackHandler (self, nodeElem, level, excludeTags)
 
def innerText (self, contentBuf, xPath, tagRemoves=None)
 
def innerTextToList (self, contentBuf, xPath, tagRemoves=None)
 
def extractAttributes (self, nodeElem, tagName, keepAttributes, baseUrl)
 
def applyCloseVoid (self, nodeElem, tagName)
 

Static Public Member Functions

def checkElemAttributes (attrConditions, elem)
 
def traversalNodes (elemList, level=0, nodeCallbackOpen=None, nodeCallbackClose=None, textCallback=None, excludeTags=None, attrConditions=None, excludeNodes=None)
 
def isExcludeNode (excludeNodes, elem)
 Checks whether a node should be excluded. More...
 

Public Attributes

 stripHtml
 
 stripHtmlList
 
 errorString
 
 delimiter
 
 innerDelimiter
 
 REconditions
 
 attrConditions
 
 tagReplacers
 
 keepAttributes
 
 baseUrl
 
 closeVoid
 
 excludeNodes
 

Static Public Attributes

list NONE_CLOSED_HTML_TAGS
 
list CANONIZATION_TAGS = ['href', 'src']
 
string MACRO_ATTRIBUTES = '%ATTRIBUTES%'
 
string PATTERN_CLOSE_VOID = r"<%s.*?(/)>"
 
int CLOSE_VOID_NOT_CLOSE = 0
 
int CLOSE_VOID_CLOSE = 1
 
int CLOSE_VOID_AUTO = 2
 
 values = elem.xpath('@' + attrName).extract()
 
bool found = False
 
bool ret = True
 

Detailed Description

Definition at line 20 of file ExtendInnerText.py.

Constructor & Destructor Documentation

◆ __init__()

def app.ExtendInnerText.ExtendInnerText.__init__ (   self,
  tagReplacers = None,
  delimiter = ' ',
  innerDelimiter = ' ',
  REconditions = None,
  attrConditions = None,
  keepAttributes = None,
  baseUrl = None,
  closeVoid = None,
  excludeNodes = None 
)

Definition at line 36 of file ExtendInnerText.py.

36  keepAttributes=None, baseUrl=None, closeVoid=None, excludeNodes=None):
37  self.stripHtml = ''
38  self.stripHtmlList = []
39  self.errorString = ''
40  self.delimiter = delimiter
41  self.innerDelimiter = innerDelimiter
42  self.REconditions = REconditions
43  self.attrConditions = attrConditions
44  self.tagReplacers = tagReplacers
45  self.keepAttributes = keepAttributes
46  self.baseUrl = baseUrl
47  self.closeVoid = closeVoid
48  self.excludeNodes = excludeNodes
49 
50 

Member Function Documentation

◆ applyCloseVoid()

def app.ExtendInnerText.ExtendInnerText.applyCloseVoid (   self,
  nodeElem,
  tagName 
)

Definition at line 266 of file ExtendInnerText.py.

266  def applyCloseVoid(self, nodeElem, tagName):
267  # variable for result
268  ret = ''
269 
270  if tagName in self.NONE_CLOSED_HTML_TAGS:
271  closeVoid = self.CLOSE_VOID_NOT_CLOSE
272  if self.closeVoid is not None:
273  closeVoid = int(self.closeVoid)
274 
275  if closeVoid == self.CLOSE_VOID_NOT_CLOSE:
276  ret = ''
277  elif closeVoid == self.CLOSE_VOID_CLOSE:
278  ret = '/'
279  elif closeVoid == self.CLOSE_VOID_AUTO:
280 # logger.info("!!!!! BEFORE nodeElem.select()")
281 # for sel in nodeElem.select('//*'):
282 # logger.info("!!!!! sel: '%s'", str(sel.extract()))
283 
284  logger.info("!!!!! nodeElem.extract(): '%s'", str(nodeElem.extract()))
285  pattern = self.PATTERN_CLOSE_VOID % str(tagName)
286  logger.info("!!!!! pattern: '%s'", str(pattern))
287  res = nodeElem.re(pattern)
288  logger.info("!!!!! nodeElem.re(pattern): '%s'", str(res))
289  if len(res) > 0:
290  ret = '/'
291 
292  return ret
293 
294 
Here is the caller graph for this function:

◆ checkElemAttributes()

def app.ExtendInnerText.ExtendInnerText.checkElemAttributes (   attrConditions,
  elem 
)
static

Definition at line 145 of file ExtendInnerText.py.

145  def checkElemAttributes(attrConditions, elem):
146  ret = True
147  if attrConditions is not None:
148  if attrConditions["TYPE"] == "include":
149  ret = False
150  i = 1
151  attrList = elem.xpath("@*")
152  if len(attrList) > 0:
153  for internalElem in attrList:
154  for key in attrConditions:
155  attrName = "".join(elem.xpath("name(@*[%s])" % str(i)).extract())
156  if key != "type" and (key == "*" or key == attrName) and \
157  re.compile(attrConditions[key]).match(internalElem.extract()):
158  ret = not ret
159  break
160  i += 1
161  elif "NO_ATTRIBUTES" in attrConditions:
162  ret = True
163  return ret
164 
165 
Definition: join.py:1
Here is the call graph for this function:

◆ extractAttributes()

def app.ExtendInnerText.ExtendInnerText.extractAttributes (   self,
  nodeElem,
  tagName,
  keepAttributes,
  baseUrl 
)

Definition at line 236 of file ExtendInnerText.py.

236  def extractAttributes(self, nodeElem, tagName, keepAttributes, baseUrl):
237  # variable for result
238  ret = ''
239 
240 # logger.info("!!! tagName = %s", str(tagName))
241  import app.Utils
242  if keepAttributes is not None and tagName in keepAttributes.keys():
243  attrList = keepAttributes[tagName]
244  values = []
245  for attrName in attrList:
246  value = nodeElem.xpath('@' + attrName).extract()
247 # logger.info("!!! for %s extracted: %s", str(attrName), str(value))
248  if len(value) > 0 and value[0] != "":
249  if attrName in self.CANONIZATION_TAGS:
250  value[0] = app.Utils.urlNormalization(baseUrl, value[0])
251 
252  values.append(attrName + '="' + value[0].replace('\n', ' ').replace('"', '\\\"') + '"')
253 
254  # ret = "<" + tagName + ' ' + ' '.join(values) + '>'
255  ret = ' ' + ' '.join(values)
256 
257 # logger.debug("!!! return: '%s'", str(ret))
258  return ret
259 
260 
def urlNormalization(base, url, supportProtocols=None, log=None)
Definition: Utils.py:561
Definition: join.py:1
Here is the call graph for this function:
Here is the caller graph for this function:

◆ innerText()

def app.ExtendInnerText.ExtendInnerText.innerText (   self,
  contentBuf,
  xPath,
  tagRemoves = None 
)

Definition at line 109 of file ExtendInnerText.py.

109  def innerText(self, contentBuf, xPath, tagRemoves=None):
110  self.stripHtml = ''
111  self.errorString = ''
112  if xPath is not None:
113  if tagRemoves is None:
114  tagRemoves = ['script', 'style', '']
115  try:
116  if isinstance(xPath, basestring):
117  sel = SelectorWrapper(text=contentBuf)
118  selectorElem = sel.xpath(xPath)
119  else:
120  selectorElem = xPath
121  localBuf = ''
122 
123  for elem in selectorElem:
124  if self.REconditions is not None:
125  if (self.REconditions["type"] == "include" and re.compile(self.REconditions["RE"]).match(elem) is None) or \
126  (self.REconditions["type"] == "exclude" and re.compile(self.REconditions["RE"]).match(elem) is not None):
127  continue
128  self.stripHtml = ''
129  elemList = []
130  elemList.append(elem)
131  ExtendInnerText.traversalNodes(elemList, 0, self.nodeCallbackOpenHandler, self.nodeCallbackCloseHandler,
132  self.textCallbackHandler, tagRemoves, self.attrConditions, self.excludeNodes)
133  localBuf += self.stripHtml.strip(self.innerDelimiter)
134  localBuf += self.delimiter
135 
136  self.stripHtml = localBuf.strip(self.delimiter)
137  except Exception as excp:
138  self.errorString = str(excp)
139  logger.error("!!! Exception: %s", str(self.errorString))
140  import app.Utils as Utils
141  logger.info(Utils.getTracebackInfo())
142 
143 
def innerText(selectorList, delimiter=' ', innerDelimiter=' ', tagReplacers=None, REconditions=None, attrConditions=None, keepAttributes=None, baseUrl=None, closeVoid=None, excludeNodes=None)
Definition: Utils.py:1148
Here is the call graph for this function:

◆ innerTextToList()

def app.ExtendInnerText.ExtendInnerText.innerTextToList (   self,
  contentBuf,
  xPath,
  tagRemoves = None 
)

Definition at line 198 of file ExtendInnerText.py.

198  def innerTextToList(self, contentBuf, xPath, tagRemoves=None):
199  stripHtmlList = []
200  self.stripHtml = ''
201  self.errorString = ''
202  if xPath is not None:
203  if tagRemoves is None:
204  tagRemoves = ['script', 'style', '']
205  try:
206  if isinstance(xPath, types.StringTypes):
207  sel = SelectorWrapper(text=contentBuf)
208  selectorElem = sel.xpath(xPath)
209  else:
210  selectorElem = xPath
211 
212  for elem in selectorElem:
213  if self.REconditions is not None:
214  if (self.REconditions["type"] == "include" and re.compile(self.REconditions["RE"]).match(elem) is None) or \
215  (self.REconditions["type"] == "exclude" and re.compile(self.REconditions["RE"]).match(elem) is not None):
216  continue
217  self.stripHtml = ''
218  elemList = []
219  elemList.append(elem)
220  ExtendInnerText.traversalNodes(elemList, 0, self.nodeCallbackOpenHandler, self.nodeCallbackCloseHandler,
221  self.textCallbackHandler, tagRemoves, self.attrConditions, self.excludeNodes)
222 
223  stripHtmlList.append((self.stripHtml.strip(self.innerDelimiter) + self.delimiter).strip(self.delimiter))
224 
225  self.stripHtmlList = stripHtmlList
226  except Exception as excp:
227  self.errorString = str(excp)
228 
229 
def innerTextToList(selectorList, delimiter=' ', innerDelimiter=' ', tagReplacers=None, REconditions=None, attrConditions=None, keepAttributes=None, baseUrl=None, closeVoid=None, excludeNodes=None)
Definition: Utils.py:1160
Here is the call graph for this function:

◆ isExcludeNode()

def app.ExtendInnerText.ExtendInnerText.isExcludeNode (   excludeNodes,
  elem 
)
static

Checks whether a node should be excluded.

Parameters
excludeNodes - exclusion criteria: a list of dictionaries, each mapping a tag name to its attribute conditions
elem - element to check
Returns
True if the node must be excluded, False otherwise

Definition at line 301 of file ExtendInnerText.py.

301  def isExcludeNode(excludeNodes, elem):
302  # variable for result
303  ret = False
304 # logger.debug("!!! excludeNodes: %s, type: %s", str(excludeNodes), str(type(excludeNodes)))
305 
306  if len(elem.xpath("name()")) > 0:
307  nodeName = str(elem.xpath("name()")[0].extract())
308 # logger.debug("!!! nodeName: %s, type: %s", str(nodeName), str(type(nodeName)))
309 
310  import app.Utils as Utils
311 
312  if isinstance(excludeNodes, list):
313  for excludeNode in excludeNodes:
314  if isinstance(excludeNode, dict):
315  for tagName, attributes in excludeNode.items():
316 # logger.debug("!!! tagName: %s, attributes: %s", str(tagName), str(attributes))
317  if Utils.reMatch(tagName, nodeName, logger):
318 # logger.debug("tagName: %s == nodeName: %s", str(tagName), str(nodeName))
319  if attributes is None:
320  logger.debug("Found exclude node rule for '%s' with attributes: %s", str(tagName), str(attributes))
321  ret = True
322  break
323 
324  if isinstance(attributes, dict):
325  for attrName, attrValue in attributes.items():

◆ nodeCallbackCloseHandler()

def app.ExtendInnerText.ExtendInnerText.nodeCallbackCloseHandler (   self,
  nodeElem,
  level 
)

Definition at line 78 of file ExtendInnerText.py.

78  def nodeCallbackCloseHandler(self, nodeElem, level): # pylint: disable=W0613
79  closeTagName = str(nodeElem.xpath("name()")[0].extract())
80 # logger.info("closeTagName: %s", str(closeTagName))
81  if self.tagReplacers is None and closeTagName not in self.NONE_CLOSED_HTML_TAGS and closeTagName != "":
82  self.stripHtml += '</' + closeTagName + '>'
83  else:
84  closeTag = "</" + closeTagName + ">"
85  if (len(str(nodeElem.extract())) >= len(closeTag)) and \
86  str(nodeElem.extract()).rfind(closeTag) == (len(str(nodeElem.extract())) - len(closeTag)):
87  closeTagName = '/' + closeTagName
88  if self.tagReplacers is not None and closeTagName in self.tagReplacers:
89  self.stripHtml += self.tagReplacers[closeTagName]
90 
91 
Here is the caller graph for this function:

◆ nodeCallbackOpenHandler()

def app.ExtendInnerText.ExtendInnerText.nodeCallbackOpenHandler (   self,
  nodeElem,
  level 
)

Definition at line 51 of file ExtendInnerText.py.

51  def nodeCallbackOpenHandler(self, nodeElem, level): # pylint: disable=W0613
52  openTagName = str(nodeElem.xpath("name()")[0].extract())
53 
54 # logger.info("self.tagReplacers: %s", str(self.tagReplacers))
55 # logger.info("openTagName: %s", str(openTagName))
56 
57  if self.tagReplacers is None:
58 
59  if self.MACRO_ATTRIBUTES in openTagName:
60  self.stripHtml += '<' + openTagName.replace(self.MACRO_ATTRIBUTES, self.extractAttributes(nodeElem, \
61  openTagName, self.keepAttributes, self.baseUrl)) + self.applyCloseVoid(nodeElem, openTagName) + '>'
62  elif openTagName != "":
63  self.stripHtml += '<' + openTagName + self.extractAttributes(nodeElem, openTagName, self.keepAttributes, \
64  self.baseUrl) + self.applyCloseVoid(nodeElem, openTagName) + '>'
65 
66 # logger.info("self.stripHtml1: %s", str(self.stripHtml))
67  else:
68  if openTagName in self.tagReplacers:
69  self.stripHtml += self.tagReplacers[openTagName].replace(self.MACRO_ATTRIBUTES, \
70  self.extractAttributes(nodeElem, openTagName, self.keepAttributes, self.baseUrl))
71 
72 # logger.info("self.stripHtml2: %s", str(self.stripHtml))
73 
74 # logger.info("!!! nodeElem.xpath('name(@href)').extract() = %s", str(nodeElem.xpath('name(@href)').extract()))
75 # logger.info("!!! nodeElem.xpath('@href').extract() = %s", str(nodeElem.xpath('@href').extract()))
76 
77 
Here is the call graph for this function:
Here is the caller graph for this function:

◆ textCallbackHandler()

def app.ExtendInnerText.ExtendInnerText.textCallbackHandler (   self,
  nodeElem,
  level,
  excludeTags 
)

Definition at line 92 of file ExtendInnerText.py.

92  def textCallbackHandler(self, nodeElem, level, excludeTags): # pylint: disable=W0613
93 # logger.debug("excludeTags: %s", str(excludeTags))
94 
95  buff = str(nodeElem.extract())
96  if buff.strip() != "":
97  for excludeTag in excludeTags:
98  if excludeTag != "":
99  pattern = '<' + excludeTag + '.*>'
100  buff = re.sub(pattern=pattern, repl='', string=buff, flags=re.I + re.U + re.M)
101 
102  if self.tagReplacers is None:
103  self.stripHtml += buff
104  else:
105  self.stripHtml += buff + self.innerDelimiter
106 # self.stripHtml += str(nodeElem.extract()) + self.innerDelimiter
107 
108 
Here is the caller graph for this function:

◆ traversalNodes()

def app.ExtendInnerText.ExtendInnerText.traversalNodes (   elemList,
  level = 0,
  nodeCallbackOpen = None,
  nodeCallbackClose = None,
  textCallback = None,
  excludeTags = None,
  attrConditions = None,
  excludeNodes = None 
)
static

Definition at line 168 of file ExtendInnerText.py.

168  excludeTags=None, attrConditions=None, excludeNodes=None):
169  if excludeTags is None:
170  excludeTags = ['script', 'style', '']
171  # print str(level) + " " + str(len(elemList))
172 
173 # logger.debug("elemList: %s", str(elemList))
174 # logger.debug("excludeNodes: %s", str(excludeNodes))
175  for elem in elemList:
176 
177  if not ExtendInnerText.checkElemAttributes(attrConditions, elem):
178  continue
179 
180  if ExtendInnerText.isExcludeNode(excludeNodes, elem):
181  continue
182 
183  if len(elem.xpath("name()")) > 0:
184  if nodeCallbackOpen is not None and str(elem.xpath("name()")[0].extract()) not in excludeTags:
185  nodeCallbackOpen(elem, level)
186 
187 
188  if str(elem.xpath("name()")[0].extract()) not in excludeTags:
189  ExtendInnerText.traversalNodes(elem.xpath("node()"), level + 1, nodeCallbackOpen, nodeCallbackClose,
190  textCallback, excludeTags, attrConditions, excludeNodes)
191  if nodeCallbackClose is not None and str(elem.xpath("name()")[0].extract()) not in excludeTags:
192  nodeCallbackClose(elem, level)
193  else:
194  if textCallback is not None:
195  textCallback(elem, level, excludeTags)
196 
197 
Here is the caller graph for this function:

Member Data Documentation

◆ attrConditions

app.ExtendInnerText.ExtendInnerText.attrConditions

Definition at line 43 of file ExtendInnerText.py.

◆ baseUrl

app.ExtendInnerText.ExtendInnerText.baseUrl

Definition at line 46 of file ExtendInnerText.py.

◆ CANONIZATION_TAGS

list app.ExtendInnerText.ExtendInnerText.CANONIZATION_TAGS = ['href', 'src']
static

Definition at line 25 of file ExtendInnerText.py.

◆ CLOSE_VOID_AUTO

int app.ExtendInnerText.ExtendInnerText.CLOSE_VOID_AUTO = 2
static

Definition at line 33 of file ExtendInnerText.py.

◆ CLOSE_VOID_CLOSE

int app.ExtendInnerText.ExtendInnerText.CLOSE_VOID_CLOSE = 1
static

Definition at line 32 of file ExtendInnerText.py.

◆ CLOSE_VOID_NOT_CLOSE

int app.ExtendInnerText.ExtendInnerText.CLOSE_VOID_NOT_CLOSE = 0
static

Definition at line 31 of file ExtendInnerText.py.

◆ closeVoid

app.ExtendInnerText.ExtendInnerText.closeVoid

Definition at line 47 of file ExtendInnerText.py.

◆ delimiter

app.ExtendInnerText.ExtendInnerText.delimiter

Definition at line 40 of file ExtendInnerText.py.

◆ errorString

app.ExtendInnerText.ExtendInnerText.errorString

Definition at line 39 of file ExtendInnerText.py.

◆ excludeNodes

app.ExtendInnerText.ExtendInnerText.excludeNodes

Definition at line 48 of file ExtendInnerText.py.

◆ found

bool app.ExtendInnerText.ExtendInnerText.found = False
static

Definition at line 330 of file ExtendInnerText.py.

◆ innerDelimiter

app.ExtendInnerText.ExtendInnerText.innerDelimiter

Definition at line 41 of file ExtendInnerText.py.

◆ keepAttributes

app.ExtendInnerText.ExtendInnerText.keepAttributes

Definition at line 45 of file ExtendInnerText.py.

◆ MACRO_ATTRIBUTES

string app.ExtendInnerText.ExtendInnerText.MACRO_ATTRIBUTES = '%ATTRIBUTES%'
static

Definition at line 27 of file ExtendInnerText.py.

◆ NONE_CLOSED_HTML_TAGS

list app.ExtendInnerText.ExtendInnerText.NONE_CLOSED_HTML_TAGS
static
Initial value:
= ['area', 'base', 'br', 'col', 'command', 'embed', 'hr', 'img', 'input', 'keygen', 'link',
'meta', 'param', 'source', 'track', 'wbr']

Definition at line 22 of file ExtendInnerText.py.

◆ PATTERN_CLOSE_VOID

string app.ExtendInnerText.ExtendInnerText.PATTERN_CLOSE_VOID = r"<%s.*?(/)>"
static

Definition at line 29 of file ExtendInnerText.py.

◆ REconditions

app.ExtendInnerText.ExtendInnerText.REconditions

Definition at line 42 of file ExtendInnerText.py.

◆ ret

bool app.ExtendInnerText.ExtendInnerText.ret = True
static

Definition at line 338 of file ExtendInnerText.py.

◆ stripHtml

app.ExtendInnerText.ExtendInnerText.stripHtml

Definition at line 37 of file ExtendInnerText.py.

◆ stripHtmlList

app.ExtendInnerText.ExtendInnerText.stripHtmlList

Definition at line 38 of file ExtendInnerText.py.

◆ tagReplacers

app.ExtendInnerText.ExtendInnerText.tagReplacers

Definition at line 44 of file ExtendInnerText.py.

◆ values

app.ExtendInnerText.ExtendInnerText.values = elem.xpath('@' + attrName).extract()
static

Definition at line 327 of file ExtendInnerText.py.


The documentation for this class was generated from the following file: