3 @file scrapy_extractor.py 4 @author Alexey <developers.hce@gmail.com> 5 @link http://hierarchical-cluster-engine.com/ 6 @copyright Copyright © 2013 IOIX Ukraine 7 @license http://hierarchical-cluster-engine.com/license/ 8 @package HCE project node API 32 SELF_NAME =
"Scrapy extractor" 35 TEMPLATE_FILE_RULE_XPATH =
'xpath' 36 TEMPLATE_FILE_RULE_REPLACE =
'replace' 37 TEMPLATE_FILE_RULE_EXCLUDE =
'exclude' 39 DISABLE_XPATH_CHARS_LIST = [
';',
'#']
48 def __init__(self, config, templ=None, domain=None, processorProperties=None):
49 BaseExtractor.__init__(self, config, templ, domain, processorProperties)
56 if processorProperties
is not None and CONSTS.TAG_CLOSE_VOID_PROP_NAME
in processorProperties
and \
57 processorProperties[CONSTS.TAG_CLOSE_VOID_PROP_NAME]
is not None:
58 self.
closeVoid = int(processorProperties[CONSTS.TAG_CLOSE_VOID_PROP_NAME])
61 if processorProperties
is not None and CONSTS.TAG_KEEP_ATTRIBUTES_PROP_NAME
in processorProperties
and \
62 processorProperties[CONSTS.TAG_KEEP_ATTRIBUTES_PROP_NAME]
is not None:
64 for key
in processorProperties[CONSTS.TAG_KEEP_ATTRIBUTES_PROP_NAME]:
65 self.
keepAttributes[key.lower()] = processorProperties[CONSTS.TAG_KEEP_ATTRIBUTES_PROP_NAME][key]
67 if processorProperties
is not None and CONSTS.TAG_MARKUP_PROP_NAME
in processorProperties
and \
68 processorProperties[CONSTS.TAG_MARKUP_PROP_NAME]
is not None:
70 for key
in processorProperties[CONSTS.TAG_MARKUP_PROP_NAME]:
83 if processorProperties
is not None and "SCRAPER_SCRAPY_PRECONFIGURED" in processorProperties:
85 [
"SCRAPER_SCRAPY_PRECONFIGURED"]), domain)
90 defaultConfigTemplate = config.get(
"Application",
"default_template",
None)
91 except ConfigParser.NoOptionError:
92 defaultConfigTemplate =
None 93 if defaultConfigTemplate
is not None:
94 logger.debug(
">>> Extend Templates with config default template")
96 if len(tempTemplates) > 0:
98 for templeteElemConfig
in tempTemplates:
99 for templeteElemProperty
in self.
templates:
100 for templeteKeyProperty
in templeteElemProperty:
101 if templeteKeyProperty
in templeteElemConfig:
102 templeteElemConfig =
None 104 if templeteElemConfig
is None:
106 if templeteElemConfig
is not None:
107 newTemplates.append(templeteElemConfig)
110 logger.debug(
"!!! INIT Template Domain: '%s'", str(domain))
121 if "sets" in rowTemplates:
122 ret = rowTemplates[
"sets"]
125 if isinstance(elem[setName], basestring):
127 with open(elem[setName],
"rb")
as fd:
128 elem[setName] = json.loads(fd.read())
129 except Exception
as excp:
130 logger.debug(
">>> generateTemplatesFromRowTemplates element[%s] file/json operations error, %s",
131 setName, str(
type(elem[setName])))
133 elif not isinstance(elem[setName], dict):
134 logger.debug(
">>> generateTemplatesFromRowTemplates element[%s] wrong type is %s", setName,
135 str(
type(elem[setName])))
140 except Exception
as excp:
141 logger.debug(
">>> Some error during generateTemplatesFromRowTemplates = " + str(excp))
153 defaultTemplate =
None 155 templateFile = config.get(
"Application",
"template",
None)
156 except ConfigParser.NoOptionError:
160 logger.debug(
"Read template from file. %s", templateFile)
161 with open(templateFile,
"rb")
as fd:
163 except Exception, err:
164 logger.error(
"Error Read template from file. %s", str(err))
168 logger.debug(
"template: " + str(ret))
169 elif templ
is not None:
170 logger.debug(
"template: %s", str(templ))
171 if isinstance(templ, dict):
179 if defaultTemplate
is not None:
180 logger.debug(
"merge default template and custom one")
181 defaultTags = defaultTemplate.keys()
182 customTags = ret.keys()
183 logger.debug(
"tags in default template:\n%s\nin custom template:\n%s", str(defaultTags), str(customTags))
184 for tag
in defaultTags:
185 if tag
not in customTags:
186 ret[tag] = defaultTemplate[tag]
187 logger.debug(
"%s was replaced from custom template", str(tag))
188 elif defaultTemplate
is not None:
189 ret = defaultTemplate
191 logger.error(
"Error Read template.")
203 if isinstance(lhs, dict)
and isinstance(rhs, dict):
236 if elem
in lhs
and isinstance(lhs[elem], list):
237 lXpathList = lhs[elem]
242 if elem
in rhs
and isinstance(rhs[elem], list):
243 rXpathList = rhs[elem]
248 lhs[elem] = lXpathList + rXpathList
260 if len(globalTemplate) == 0:
262 globalTemplate = json.loads(jsonBuf)
263 except Exception, err:
264 logger.error(
">>> Wrong json format. %s", str(err))
266 if len(globalTemplate) > 0:
268 if domains
is not None:
272 if isinstance(domains, basestring):
275 for domain
in domains:
276 for pattern
in globalTemplate:
278 searchPatterns = pattern.split()
281 for searchPattern
in searchPatterns:
282 if searchPattern !=
'*':
283 if re.search(searchPattern, domain, re.UNICODE)
is not None:
284 logger.debug(
"!!! Found pattern: '%s'", str(pattern))
285 if isinstance(globalTemplate[pattern], dict):
286 ret = globalTemplate[pattern]
292 except Exception, err:
293 logger.debug(
"Regular expression error: %s, pattern: '%s', domain: '%s'",
294 str(err), str(pattern), str(domain))
297 if len(ret) == 0
and domain
in globalTemplate
and isinstance(globalTemplate[domain], dict):
298 ret = globalTemplate[domain]
300 if domains
is not None:
301 for domain
in domains:
303 while domain.find(
".") != -1:
304 domain = domain[domain.find(
".") + 1: len(domain)]
305 if domain
is not None and domain
in globalTemplate:
307 if domain
is not None and domain
in globalTemplate:
311 if domains
is not None and domain
in globalTemplate:
314 except Exception, err:
315 ExceptionLog.handler(logger, err,
'Exception: ', (ret))
317 for key, value
in ret.items():
318 if isinstance(value, list):
320 for elemXPath
in value:
322 removeList.append(elemXPath)
324 for removeElem
in removeList:
325 value.remove(removeElem)
326 logger.debug(
"For '%s' found disabled xpath: %s", str(key), str(removeElem))
337 def extractTag(self, tagName, result, template, textHandler=None, delimiter=' '):
339 if tagName
in template:
340 for path
in template[tagName]:
347 if tagName
not in result.blockedByXpathTags:
348 result.blockedByXpathTags.append(tagName)
354 if textHandler
is not None:
363 localValue = self.
sel.xpath(path).extract()
377 isinstance(self.
postReplace[tagName], list)
and localValue !=
"":
382 if isinstance(postReplace, dict):
383 for pattern, repl
in postReplace.items():
384 if isinstance(pattern, basestring)
and isinstance(repl, basestring):
386 localValue = re.sub(pattern=pattern, repl=repl, string=localValue.decode(
'utf-8'), flags=re.U + re.M + re.I + re.DOTALL)
389 if tagName == CONSTS.TAG_LINK:
390 urlObj =
Url(localValue)
392 self.
addTag(result=result, tag_name=tagName, tag_value=localValue, xpath=path)
394 self.
addTag(result=result, tag_name=tagName, tag_value=localValue, xpath=path)
395 except Exception, err:
396 ExceptionLog.handler(logger, err,
'Exception in ScrapyExtractor.extractTag:')
413 self.
extractTag(CONSTS.TAG_TITLE, result, template, Utils.innerText)
414 self.
extractTag(CONSTS.TAG_AUTHOR, result, template, Utils.innerText)
415 self.
extractTag(CONSTS.TAG_PUB_DATE, result, template)
416 self.
extractTag(CONSTS.TAG_DESCRIPTION, result, template, Utils.innerText)
417 self.
extractTag(CONSTS.TAG_DC_DATE, result, template)
419 self.
extractTag(CONSTS.TAG_LINK, result, template, Utils.innerText)
420 self.
extractTag(CONSTS.TAG_CONTENT_UTF8_ENCODED, result, template, Utils.innerText)
423 self.
extractTag(CONSTS.TAG_KEYWORDS, result, template, Utils.innerText)
425 self.
extractTag(CONSTS.HTML_LANG, result, template, Utils.innerText)
427 except Exception
as err:
428 ExceptionLog.handler(logger, err,
"Parse error:", (err))
442 for templateName
in templateDict:
443 localResult = copy.deepcopy(result)
445 localResults.append(localResult)
448 for localResult
in localResults:
449 result.mergeResults(localResult)
def varDump(obj, stringify=True, strTypeMaxLen=256, strTypeCutSuffix='...', stringifyType=1, ignoreErrors=False, objectsHash=None, depth=0, indent=2, ensure_ascii=False, maxDepth=10)