HCE Project Python language Distributed Tasks Manager Application, Distributed Crawler Application and client API bindings.  2.0.0-chaika
Hierarchical Cluster Engine Python language binding
scrapy_extractor.py
Go to the documentation of this file.
1 # coding: utf-8
2 """@package docstring
3  @file scrapy_extractor.py
4  @author Alexey <developers.hce@gmail.com>
5  @link http://hierarchical-cluster-engine.com/
6  @copyright Copyright &copy; 2013 IOIX Ukraine
7  @license http://hierarchical-cluster-engine.com/license/
8  @package HCE project node API
9  @since 0.1
10 """
11 
12 import re
13 import json
14 import copy
15 import ConfigParser
16 import dc_processor.Constants as CONSTS
17 from dc_processor.base_extractor import BaseExtractor
18 from app.SelectorWrapper import SelectorWrapper
19 from app.Utils import varDump
20 from app.Utils import ExceptionLog
21 import app.Utils as Utils # pylint: disable=F0401
22 from app.Url import Url
23 
24 # Logger initialization
25 logger = Utils.MPLogger().getLogger()
26 
27 
28 # #ScrapyExtractor class implements data extraction using the Scrapy module with prepared XPath/CSS rules
29 #
31 
32  SELF_NAME = "Scrapy extractor"
# SELF_NAME is the name the constructor stores in self.name and in self.data["extractor"].
33 
34  # Constants used in class
# Keys of a per-tag rule dict inside a template; pasteLists reads them as:
#   'xpath'   - list of xpaths,
#   'replace' - dict of replacement rules (collected into self.postReplace),
#   'exclude' - list collected into self.postExclude.
35  TEMPLATE_FILE_RULE_XPATH = 'xpath'
36  TEMPLATE_FILE_RULE_REPLACE = 'replace'
37  TEMPLATE_FILE_RULE_EXCLUDE = 'exclude'
38 
# templatePreparer removes every xpath whose first character is in this list
# (such xpaths are treated as disabled).
39  DISABLE_XPATH_CHARS_LIST = [';', '#']
40 
41  # #class constructor
42  #
43  # @param config - incoming app config
44  # @param templ - optional dict with the base set of extractor rules/xpaths
45  # @param domain - optional param, domain of the url being processed
46  # @param processorProperties - optional param, incoming app processor properties
47  # NOTE: the signature has no separate `template` parameter; the template set comes from templ or processorProperties
def __init__(self, config, templ=None, domain=None, processorProperties=None):
    """Initialise the extractor from the app config and the processor properties.

    @param config - incoming app config, read through config.get("Application", ...)
    @param templ - optional base set of extractor rules/xpaths
    @param domain - optional domain of the url being processed
    @param processorProperties - optional dict with the app processor properties
    """
    BaseExtractor.__init__(self, config, templ, domain, processorProperties)
    logger.debug("Properties: %s", varDump(self.properties))

    # set module rank from module's properties
    self.rankReading(self.__class__.__name__)

    def propertyValue(name):
        # Value of a processor property; None when it is absent or unset.
        if processorProperties is not None and name in processorProperties:
            return processorProperties[name]
        return None

    def withLowerKeys(mapping):
        # New dict with the same values stored under lower-cased keys.
        lowered = {}
        for key in mapping:
            lowered[key.lower()] = mapping[key]
        return lowered

    closeVoid = propertyValue(CONSTS.TAG_CLOSE_VOID_PROP_NAME)
    self.closeVoid = int(closeVoid) if closeVoid is not None else None

    keepAttributes = propertyValue(CONSTS.TAG_KEEP_ATTRIBUTES_PROP_NAME)
    self.keepAttributes = withLowerKeys(keepAttributes) if keepAttributes is not None else None

    # Always built as a fresh dict, so filling it never depends on the
    # attribute having been created somewhere else beforehand.
    markup = propertyValue(CONSTS.TAG_MARKUP_PROP_NAME)
    self.innerTextTagReplacers = withLowerKeys(markup) if markup is not None else None

    self.name = self.SELF_NAME
    self.data["extractor"] = self.SELF_NAME
    self.sel = None
    self.resource = None
    # extractTag reads this list; extractTags resets it for every run.
    self.blockedByXpathTags = []
    # for post processing (filled by pasteLists while the templates are prepared)
    self.postReplace = {}
    self.postExclude = {}

    if processorProperties is not None and "SCRAPER_SCRAPY_PRECONFIGURED" in processorProperties:
        preconfigured = json.loads(processorProperties["SCRAPER_SCRAPY_PRECONFIGURED"])
        self.templates = self.generateTemplatesFromRowTemplates(preconfigured, domain)
    else:
        self.templates = [{self.SELF_NAME + "_default": self.templateLoad(config, templ, domain)}]

    try:
        defaultConfigTemplate = config.get("Application", "default_template", None)
    except ConfigParser.NoOptionError:
        defaultConfigTemplate = None
    if defaultConfigTemplate is not None:
        logger.debug(">>> Extend Templates with config default template")
        configTemplates = self.generateTemplatesFromRowTemplates(json.loads(defaultConfigTemplate), domain)
        if len(configTemplates) > 0:
            # Append only the config sets that share no set name with an
            # already loaded template.
            newTemplates = []
            for configSet in configTemplates:
                nameClash = any(name in configSet for loaded in self.templates for name in loaded)
                if not nameClash:
                    newTemplates.append(configSet)
            self.templates = self.templates + newTemplates

    logger.debug("!!! INIT Template Domain: '%s'", str(domain))
112 
113 
114  # #generateTemplatesFromRowTemplates method extracts templates from the incoming rowTemplates data
115  #
116  # @param rowTemplates - incoming rowTemplates
 # @param domain - optional domain passed on to templatePreparer
117  # @return result - list of template dicts
def generateTemplatesFromRowTemplates(self, rowTemplates, domain=None):
    """Prepare the template sets listed under rowTemplates["sets"].

    Every item of "sets" is treated as a dict and only its first key (the set
    name) is processed. The value stored under that key may be a template
    dict or the path of a JSON file holding one; anything else is replaced by
    an empty template. The value is then passed through templatePreparer and
    stored back, so the incoming structure is modified in place.

    @param rowTemplates - incoming raw templates, may contain the "sets" key
    @param domain - optional domain passed on to templatePreparer
    @return the "sets" value with prepared templates, [] when "sets" is absent
    """
    ret = []
    try:
        if "sets" in rowTemplates:
            ret = rowTemplates["sets"]
            for elem in ret:
                for setName in elem:
                    if isinstance(elem[setName], basestring):
                        # A string value is the path of a JSON template file.
                        try:
                            with open(elem[setName], "rb") as fd:
                                elem[setName] = json.loads(fd.read())
                        except Exception as excp:
                            # Log the actual failure, not the type of the path.
                            logger.debug(">>> generateTemplatesFromRowTemplates element[%s] file/json operations error, %s",
                                         setName, str(excp))
                            elem[setName] = {}

                    # Also covers a file whose JSON content is not an object.
                    if not isinstance(elem[setName], dict):
                        logger.debug(">>> generateTemplatesFromRowTemplates element[%s] wrong type is %s", setName,
                                     str(type(elem[setName])))
                        elem[setName] = {}

                    elem[setName] = self.templatePreparer(None, domain, elem[setName])
                    # only the first key of each set item is used
                    break
    except Exception as excp:
        logger.debug(">>> Some error during generateTemplatesFromRowTemplates = " + str(excp))
    return ret
143 
144 
145  # #templateLoad method builds the template dict from the prepared extractor rules/xpaths
146  #
147  # @param config - incoming app config
148  # @param templ - optional dict with the base set of extractor rules/xpaths
149  # @param domain - optional param, domain of the url being processed
150  # @return resulting template dict
def templateLoad(self, config, templ=None, domain=None):
    """Build the template dict for the given domain.

    The custom template is taken from self.properties[CONSTS.TEMPLATE_KEY]
    when that key is present, otherwise from templ. When the "template"
    option of the "Application" config section names a readable file, the
    template prepared from that file supplies every tag that the custom
    template does not define.

    @param config - incoming app config
    @param templ - optional base set of extractor rules/xpaths, a dict or a
                   value that templatePreparer parses as JSON
    @param domain - optional param, domain of the url being processed
    @return resulting template dict (may be empty)
    """
    ret = {}
    defaultTemplate = None
    try:
        templateFile = config.get("Application", "template", None)
    except ConfigParser.NoOptionError:
        templateFile = None
    if templateFile:
        try:
            logger.debug("Read template from file. %s", templateFile)
            with open(templateFile, "rb") as fd:
                defaultTemplate = self.templatePreparer(fd.read(), domain, {})
        except Exception as err:
            logger.error("Error Read template from file. %s", str(err))

    # The key has to be present before it is read; testing for its absence
    # here would make the lookup below raise KeyError.
    if self.properties is not None and CONSTS.TEMPLATE_KEY in self.properties:
        ret = self.templatePreparer(self.properties[CONSTS.TEMPLATE_KEY], domain, {})
        logger.debug("template: " + str(ret))
    elif templ is not None:
        logger.debug("template: %s", str(templ))
        if isinstance(templ, dict):
            ret = self.templatePreparer(None, domain, templ)
        else:
            ret = self.templatePreparer(templ, domain, {})

    # merge default template and custom one
    if defaultTemplate is not None:
        logger.debug("merge default template and custom one")
        defaultTags = defaultTemplate.keys()
        customTags = ret.keys()
        logger.debug("tags in default template:\n%s\nin custom template:\n%s", str(defaultTags), str(customTags))
        for tag in defaultTemplate:
            if tag not in ret:
                ret[tag] = defaultTemplate[tag]
                logger.debug("%s was replaced from custom template", str(tag))
    else:
        # NOTE(review): logged whenever no default template file was loaded,
        # even if a custom template was found - confirm this is intended.
        logger.error("Error Read template.")
    return ret
193 
194 
195  # #pasteLists merges the per-tag rules of the source dict into the destination dict
196  #
197  # @param lhs - incoming destination dict
198  # @param rhs - incoming source dict
def pasteLists(self, lhs, rhs):
    """Merge the per-tag rules of rhs into lhs.

    For every tag found in rhs:
      - self.postReplace[tag] is reset and receives the 'replace' dict of
        lhs[tag] and then of rhs[tag], when present;
      - self.postExclude[tag] is reset and receives the items of the
        'exclude' lists of lhs[tag] and then of rhs[tag], when present;
      - lhs[tag] becomes a new plain list: the xpaths of lhs[tag] followed by
        the xpaths of rhs[tag].
    An entry supplies xpaths either as a list or as a dict with an 'xpath'
    list. Entries of any other type (None, string, number) contribute
    nothing instead of raising TypeError. Nothing is done unless both
    arguments are dicts.

    @param lhs - incoming destination dict, modified in place
    @param rhs - incoming source dict
    """
    if not (isinstance(lhs, dict) and isinstance(rhs, dict)):
        return

    def ruleOf(container, tag, ruleName, ruleType):
        # Rule ruleName of container[tag] when that entry is a dict holding a
        # value of ruleType under ruleName, otherwise None.
        entry = container.get(tag)
        if isinstance(entry, dict) and isinstance(entry.get(ruleName), ruleType):
            return entry[ruleName]
        return None

    def xpathsOf(container, tag):
        # Xpath list of container[tag]: the entry itself when it is a list,
        # its 'xpath' rule when it is a dict, otherwise an empty list.
        entry = container.get(tag)
        if isinstance(entry, list):
            return entry
        xpaths = ruleOf(container, tag, self.TEMPLATE_FILE_RULE_XPATH, list)
        return xpaths if xpaths is not None else []

    for elem in rhs:
        self.postReplace[elem] = []
        self.postExclude[elem] = []
        # destination rules first, then source rules
        for container in (lhs, rhs):
            replaceRule = ruleOf(container, elem, self.TEMPLATE_FILE_RULE_REPLACE, dict)
            if replaceRule is not None:
                self.postReplace[elem].append(replaceRule)
        for container in (lhs, rhs):
            excludeRule = ruleOf(container, elem, self.TEMPLATE_FILE_RULE_EXCLUDE, list)
            if excludeRule is not None:
                self.postExclude[elem].extend(excludeRule)

        # Both xpath lists are read before lhs[elem] is replaced.
        lhs[elem] = xpathsOf(lhs, elem) + xpathsOf(rhs, elem)
250 
251 
252  # #Common method that selects the prepared template for the given domain(s)
253  #
254  # @param jsonBuf - incoming json text with templates, parsed only when globalTemplate is empty
255  # @param domains - domain name or list of domain names
256  # @param globalTemplate - incoming templates dict
257  # @return template dict that corresponds to the incoming domain, merged with the "*" entry
258  def templatePreparer(self, jsonBuf, domains, globalTemplate):
# Picks, out of a set of templates keyed by domain pattern, the template for the
# given domain(s) and returns it as a dict ({} when nothing was selected).
# NOTE(review): this listing has lost the original indentation, so any nesting
# mentioned in the comments below is inferred from statement order - confirm it
# against the real file before relying on it.
259  ret = {}
# An empty globalTemplate means "parse jsonBuf instead".
260  if len(globalTemplate) == 0:
261  try:
262  globalTemplate = json.loads(jsonBuf)
263  except Exception, err:
# A parse failure is only logged; globalTemplate then keeps the value passed in.
264  logger.error(">>> Wrong json format. %s", str(err))
265 
266  if len(globalTemplate) > 0:
267  try:
268  if domains is not None:
269 # logger.debug("!!! domains: '%s', type: %s", str(domains), str(type(domains)))
270 # logger.debug("!!! globalTemplate: '%s'", str(globalTemplate))
271 # logger.debug("!!! type(globalTemplate): '%s'", str(type(globalTemplate)))
# A single domain string is wrapped into a one-element list.
272  if isinstance(domains, basestring):
273  domains = [domains]
274 
275  for domain in domains:
276  for pattern in globalTemplate:
277  try:
# A template key may hold several whitespace-separated regular expressions.
278  searchPatterns = pattern.split()
279  # logger.debug("!!! searchPatterns: '%s'", str(searchPatterns))
280  found = False
281  for searchPattern in searchPatterns:
# The literal '*' is never used as a regular expression here; the "*" entry
# is handled separately further down.
282  if searchPattern != '*':
283  if re.search(searchPattern, domain, re.UNICODE) is not None:
284  logger.debug("!!! Found pattern: '%s'", str(pattern))
# Only dict values are accepted. ret then refers to the very same dict object
# that is stored in globalTemplate (no copy is made), so the pasteLists calls
# below modify that stored entry as well.
285  if isinstance(globalTemplate[pattern], dict):
286  ret = globalTemplate[pattern]
287  found = True
288  break
289 
290  if found:
291  break
292  except Exception, err:
# e.g. a key that is not a valid regular expression; the key is skipped.
293  logger.debug("Regular expression error: %s, pattern: '%s', domain: '%s'",
294  str(err), str(pattern), str(domain))
295 
296  # If the pattern search selected nothing, fall back to the old algorithm: exact key lookup by domain
297  if len(ret) == 0 and domain in globalTemplate and isinstance(globalTemplate[domain], dict):
298  ret = globalTemplate[domain]
299 
# Parent-domain pass: the left-most label (everything up to the first '.') is
# cut off repeatedly until no '.' is left, and globalTemplate entries stored
# under a shortened domain are merged into ret with pasteLists.
# NOTE(review): whether the two pasteLists calls below sit inside or after the
# while loop / the len(ret) == 0 check cannot be told from this listing.
300  if domains is not None:
301  for domain in domains:
302  if len(ret) == 0:
303  while domain.find(".") != -1:
304  domain = domain[domain.find(".") + 1: len(domain)]
305  if domain is not None and domain in globalTemplate:
306  self.pasteLists(ret, globalTemplate[domain])
307  if domain is not None and domain in globalTemplate:
308  self.pasteLists(ret, globalTemplate[domain])
309 
# The "*" entry, when present, is merged as well, but only when domains was given.
310  domain = "*"
311  if domains is not None and domain in globalTemplate:
312  self.pasteLists(ret, globalTemplate[domain])
313 
314  except Exception, err:
315  ExceptionLog.handler(logger, err, 'Exception: ', (ret))
316 
# Finally drop, from every list value of ret, the xpaths whose first character
# is in DISABLE_XPATH_CHARS_LIST. They are collected first and removed
# afterwards, so the list is not modified while it is being iterated.
317  for key, value in ret.items():
318  if isinstance(value, list):
319  removeList = []
320  for elemXPath in value:
321  if elemXPath != "" and elemXPath[0] in self.DISABLE_XPATH_CHARS_LIST:
322  removeList.append(elemXPath)
323 
324  for removeElem in removeList:
325  value.remove(removeElem)
326  logger.debug("For '%s' found disabled xpath: %s", str(key), str(removeElem))
327 
328  return ret
329 
330 
331  # #extractTag method extracts the value of one concrete tag
332  #
333  # @param tagName - incoming tag name
334  # @param result - incoming result object
 # @param template - template dict holding the list of xpaths for each tag name
335  # @param textHandler - optional text processing callback function
336  # @param delimiter - optional delimiter between extracted elements
337  def extractTag(self, tagName, result, template, textHandler=None, delimiter=' '):
# Applies the xpaths listed in template[tagName] to self.sel and adds the
# extracted values to result through self.addTag. Any exception is handed to
# ExceptionLog.handler and not re-raised.
# NOTE(review): this listing has lost the original indentation, so the nesting
# described in the comments below is inferred from statement order - confirm.
338  try:
339  if tagName in template:
340  for path in template[tagName]:
341 # logger.debug("!!! ENTER tagName: %s, xpath: '%s'", str(tagName), str(path))
342 
# A tag recorded in self.blockedByXpathTags is not processed any further.
343  if tagName in self.blockedByXpathTags:
344  break
345 
# Two special xpath values stop the processing of this tag:
#   ""     - the tag is recorded in result.blockedByXpathTags,
#   "none" - the tag is recorded in self.blockedByXpathTags.
# NOTE(review): presumably each break applies whether or not the tag was
# already recorded - confirm against the real indentation.
346  if path == "":
347  if tagName not in result.blockedByXpathTags:
348  result.blockedByXpathTags.append(tagName)
349  break
350  elif path == "none":
351  if tagName not in self.blockedByXpathTags:
352  self.blockedByXpathTags.append(tagName)
353  break
# With a text handler the selection is passed to it together with the delimiter
# (given twice), the tag replacers and the validator conditions found under
# self.tagsValidator[self.name][tagName]; without one the raw result of
# .extract() is used.
354  if textHandler is not None:
355  conditions = None
356  if self.tagsValidator is not None and self.name in self.tagsValidator and \
357  tagName in self.tagsValidator[self.name]:
358  conditions = self.tagsValidator[self.name][tagName]
359  localValue = textHandler(self.sel.xpath(path), delimiter, delimiter, self.innerTextTagReplacers, conditions,
360  keepAttributes=self.keepAttributes, baseUrl=self.resource.url,
361  closeVoid=self.closeVoid, excludeNodes=self.postExclude[tagName] if tagName in self.postExclude else None)
362  else:
363  localValue = self.sel.xpath(path).extract()
364 
365 # if tagName == 'content_encoded':
366 # logger.debug("!!! tagName: %s", str(tagName))
367 # logger.debug("!!! xpath: %s", str(path))
368 # logger.debug("!!! value: '%s'", varDump(localValue))
369 
370 # if tagName == 'title' or tagName == 'html_lang':
371 # logger.debug("!!! tagName: %s", str(tagName))
372 # logger.debug("!!! xpath: %s", str(path))
373 # logger.debug("!!! value: '%s'", varDump(localValue))
374 
375  # apply post-processing: every string pattern/replacement pair collected in self.postReplace[tagName] is applied with re.sub
376  if isinstance(self.postReplace, dict) and tagName in self.postReplace and \
377  isinstance(self.postReplace[tagName], list) and localValue != "":
378 # if len(localValue) > 0:
379 # logger.debug("!!! localValue before: %s", varDump(localValue))
380 # logger.info("POST PROCESSING FOR TAG '%s', len = %s", str(tagName), len(localValue))
381  for postReplace in self.postReplace[tagName]:
382  if isinstance(postReplace, dict):
383  for pattern, repl in postReplace.items():
384  if isinstance(pattern, basestring) and isinstance(repl, basestring):
385 # logger.debug("!!! pattern: '%s', repl: '%s'", str(pattern), str(repl))
# NOTE(review): localValue.decode() assumes a string value; when textHandler is
# None the value comes from .extract(), which presumably returns a list - in
# that case this line would raise and the handler below would end the
# processing of this tag. Confirm.
386  localValue = re.sub(pattern=pattern, repl=repl, string=localValue.decode('utf-8'), flags=re.U + re.M + re.I + re.DOTALL)
387 # logger.debug("!!! localValue after replace: %s", varDump(localValue))
388 
# A link tag is added only when Url(localValue).isValid() is true; the else
# branch presumably belongs to the tagName test, so other tags are always added.
389  if tagName == CONSTS.TAG_LINK:
390  urlObj = Url(localValue)
391  if urlObj.isValid():
392  self.addTag(result=result, tag_name=tagName, tag_value=localValue, xpath=path)
393  else:
394  self.addTag(result=result, tag_name=tagName, tag_value=localValue, xpath=path)
395  except Exception, err:
396  ExceptionLog.handler(logger, err, 'Exception in ScrapyExtractor.extractTag:')
397 
398 
399  # #extractTagsForOneTemplate method extracts data by tags for one template and fills the incoming result object
400  #
401  # @param resource - incoming raw data
402  # @param result - incoming result object
403  # @param template - current template
404  # @return the incoming result with additionally filled fields/tags
def extractTagsForOneTemplate(self, resource, result, template):
    """Run extractTag for the fixed sequence of tags against one template.

    Stores resource in self.resource and a SelectorWrapper built from
    resource.raw_html in self.sel, then extracts the tags in a fixed order.
    Any exception is passed to ExceptionLog.handler and not re-raised.

    @param resource - incoming raw data (raw_html is read from it)
    @param result - incoming result object
    @param template - current template
    @return the incoming result with additionally filled fields/tags
    """
    def extractionPlan():
        # Yields (tag name, extra extractTag arguments) lazily, so each
        # constant and attribute is looked up right before its own call.
        yield CONSTS.TAG_TITLE, (Utils.innerText,)
        yield CONSTS.TAG_AUTHOR, (Utils.innerText,)
        yield CONSTS.TAG_PUB_DATE, ()
        yield CONSTS.TAG_DESCRIPTION, (Utils.innerText,)
        yield CONSTS.TAG_DC_DATE, ()
        yield CONSTS.TAG_MEDIA, (Utils.innerText, self.imgDelimiter)
        yield CONSTS.TAG_LINK, (Utils.innerText,)
        yield CONSTS.TAG_CONTENT_UTF8_ENCODED, (Utils.innerText,)
        yield CONSTS.TAG_KEYWORDS, (Utils.innerText,)
        # support of html_lang tag
        yield CONSTS.HTML_LANG, (Utils.innerText,)

    try:
        self.resource = resource
        self.sel = SelectorWrapper(text=resource.raw_html)

        for tagName, extraArgs in extractionPlan():
            self.extractTag(tagName, result, template, *extraArgs)
    except Exception as parseError:
        ExceptionLog.handler(logger, parseError, "Parse error:", (parseError))

    return result
431 
432 
433  # #extractTags public method extracts data by tags and fills the incoming result object
434  #
435  # @param resource - incoming raw data
436  # @param result - incoming result object
437  # @return the incoming result with additionally filled fields/tags
def extractTags(self, resource, result):
    """Extract tags with every configured template and merge the outcomes.

    self.blockedByXpathTags is reset first. For each dict in self.templates
    the template stored under its first key is applied to a deep copy of
    result; after all templates were processed the copies are merged into
    result, in order, through result.mergeResults.

    @param resource - incoming raw data
    @param result - incoming result object
    @return the incoming result with additionally filled fields/tags
    """
    self.blockedByXpathTags = []
    perTemplateResults = []
    for templateSet in self.templates:
        setNames = iter(templateSet)
        try:
            firstName = next(setNames)
        except StopIteration:
            # an empty template dict contributes nothing
            continue
        scratch = copy.deepcopy(result)
        self.extractTagsForOneTemplate(resource, scratch, templateSet[firstName])
        perTemplateResults.append(scratch)

    for scratch in perTemplateResults:
        result.mergeResults(scratch)
    return result
def extractTagsForOneTemplate(self, resource, result, template)
def templatePreparer(self, jsonBuf, domains, globalTemplate)
def generateTemplatesFromRowTemplates(self, rowTemplates, domain=None)
Definition: Url.py:1
def templateLoad(self, config, templ=None, domain=None)
def varDump(obj, stringify=True, strTypeMaxLen=256, strTypeCutSuffix='...', stringifyType=1, ignoreErrors=False, objectsHash=None, depth=0, indent=2, ensure_ascii=False, maxDepth=10)
Definition: Utils.py:410
def addTag(self, result, tag_name, tag_value, xpath="", isDefaultTag=False, callAdjustment=True, tagType=None, allowNotFilled=False)
def extractTag(self, tagName, result, template, textHandler=None, delimiter=' ')
def __init__(self, config, templ=None, domain=None, processorProperties=None)