3 @author Alexey <developers.hce@gmail.com>, scorp, bgv 4 @link http://hierarchical-cluster-engine.com/ 5 @copyright Copyright © 2013-2016 IOIX Ukraine 6 @license http://hierarchical-cluster-engine.com/license/ 7 @package HCE project node API 17 import xml.sax.saxutils
24 from dateutil
import parser
25 from cement.core
import foundation
# --- Log / error message constants -------------------------------------------
# Fixed typo in the first message: "Exciting." -> "Exiting." (matches the
# wording of MSG_ERROR_LOAD_LOG_CONFIG_FILE below).
MSG_ERROR_LOAD_CONFIG = "Error loading config file. Exiting."
MSG_ERROR_LOAD_LOG_CONFIG_FILE = "Error loading logging config file. Exiting."
MSG_ERROR_LOAD_EXTRACTORS = "Error load extractors "
MSG_ERROR_TEMPLATE_EXTRACTION = "Error template extraction "
MSG_ERROR_DYNAMIC_EXTRACTION = "Error dynamic extraction "
MSG_ERROR_LOAD_DB_BACKEND = "Error load db backend"
MSG_ERROR_LOAD_OPTIONS = "Error load options"
MSG_INFO_PREPARE_CONTENT = "Prepare content: "
MSG_ERROR_ADJUST_PR = "Error adjust partial references. "
MSG_ERROR_ADJUST_PUBDATE = "PUBDATE_ERROR "
MSG_ERROR_ADJUST_TITLE = "Can't adjust title. "

# Name of the environment variable that holds the scraper store path.
ENV_SCRAPER_STORE_PATH = "ENV_SCRAPER_STORE_PATH"

# Literal fragments stripped/normalized out of raw content before processing.
# NOTE(review): the fourth entry is the HTML non-breaking-space entity
# '&nbsp;' kept byte-for-byte from the original.
CONTENT_REPLACEMENT_LIST = ['\n', '\r\n', '\t', '&nbsp;', '<br>', '<p>', '</p>']

# Default bit mask applied when reducing tag masks (0xFFFF).
DEFAULT_TAG_REDUCE_MASK = 65535

# Additional xpath rules for news resources; used to fill tags the regular
# template left empty (see the empty-tags handling later in this module).
EXTENDED_NEWS_TAGS = {"description": ["//meta[@name='description']//@content"]}
# Tag names whose data is treated as link-like for news resources.
# The CONSTS.* members come from the project-level constants module imported
# earlier in this file.
LINKS_NEWS_TAGS = [CONSTS.TAG_MEDIA, CONSTS.TAG_LINK, CONSTS.TAG_MEDIA_CONTENT,
                   "links", "href"]

# Response tag names that carry publication date/time values.
TAGS_DATETIME_NEWS_NAMES = [CONSTS.TAG_PUB_DATE, CONSTS.TAG_DC_DATE]

# Template rule types interpreted as date/time values.
TAGS_DATETIME_TEMPLATE_TYPES = [CONSTS.TAG_TYPE_DATETIME]
# --- Option-file section names ------------------------------------------------
OPTION_SECTION_DATETIME_NEWS_NAMES = 'tags_datetime_news_names'
OPTION_SECTION_DATETIME_TEMPLATE_TYPES = 'tags_datetime_template_types'
OPTION_SECTION_TAGS_TYPE = 'tagsTypes'
OPTION_SECTION_URL_SOURCES_RULES = 'urlSourcesRules'

# Keys of a url-sources rule entry.
URL_SOURCES_RULE_DATA_URL = 'd_url'
# NOTE(review): trailing space kept byte-for-byte from the original value;
# confirm whether 'r_url ' (vs 'r_url') is intentional before changing it.
URL_SOURCES_RULE_REDIRECT_URL = 'r_url '
URL_SOURCES_RULE_FEED_URL = 'f_url'

MSG_ERROR_WRONG_CONFIG_FILE_NAME = "Config file name is wrong"
"Config file name is wrong" 121 def __init__(self, usageModel=APP_CONSTS.APP_USAGE_MODEL_PROCESS, configFile=None, logger=None, inputData=None):
122 if usageModel == APP_CONSTS.APP_USAGE_MODEL_PROCESS:
124 foundation.CementApp.__init__(self)
167 if self.
usageModel == APP_CONSTS.APP_USAGE_MODEL_PROCESS:
169 foundation.CementApp.setup(self)
175 if self.
usageModel == APP_CONSTS.APP_USAGE_MODEL_PROCESS:
177 foundation.CementApp.run(self)
194 if self.
usageModel == APP_CONSTS.APP_USAGE_MODEL_PROCESS:
196 self.
logger.info(APP_CONSTS.LOGGER_DELIMITER_LINE)
205 if re.search(
'<', elem):
206 self.
logger.debug(
"Media tag contain DOM element: %s", elem)
220 if "link" in response.tags
and isinstance(response.tags[
"link"], dict)
and \
221 "media" in response.tags
and isinstance(response.tags[
"media"], dict):
225 self.
logger.debug(
"url type: %s", str(
type(response.tags[
"link"][
"data"])))
226 if isinstance(response.tags[
"link"][
"data"], basestring):
227 url = response.tags[
"link"][
"data"]
229 url = response.tags[
"link"][
"data"][0]
232 response.tags[
"link"][
"data"] = url
242 if isinstance(response.tags[
"media"][
"data"], basestring):
243 mediaData = [response.tags[
"media"][
"data"]]
244 elif isinstance(response.tags[
"media"][
"data"], list):
245 mediaData = list(set(response.tags[
"media"][
"data"]))
247 self.
logger.
error(
"!!! Wrong type of tag 'media': %s", str(
type(response.tags[
"media"][
"data"])))
249 filter_patterns, filter_types = [], []
253 filter_types = [filter_item.type
for filter_item
in self.
input_data.filters]
254 filter_patterns = [re.compile(filter_item.pattern)
for filter_item
in self.
input_data.filters]
257 for media
in mediaData:
258 self.
logger.debug(
"Media link: '%s'", media)
267 for filter_type, filter_pattern
in zip(filter_types, filter_patterns):
268 match = filter_pattern.search(media)
269 if filter_type == SiteFilter.TYPE_EXCLUDE
and match:
271 if filter_type == SiteFilter.TYPE_INCLUDE
and match:
273 if len(allowedUrls) > 0:
274 res.append(
','.
join(allowedUrls))
277 self.
logger.debug(
"media: %s", media)
278 self.
logger.debug(
"url: %s", url)
280 if len(allowedUrls) > 0:
281 res.append(
','.
join(allowedUrls))
285 self.
logger.debug(
"media tag is empty. Remove media tag from response.")
286 del response.tags[
"media"]
288 self.
logger.debug(
"media tag is adjusted. Copy media tag to response.")
289 response.tags[
"media"][
"data"] = res
293 except Exception
as err:
294 ExceptionLog.handler(self.
logger, err, MSG_ERROR_ADJUST_PR, (), \
295 {ExceptionLog.LEVEL_NAME_ERROR:ExceptionLog.LEVEL_VALUE_DEBUG})
298 self.
logger.debug(
">>> Response has not have link or media tag, Don't need adjust media")
305 if self.
input_data.template
and "title" in self.
input_data.template
and "title" in response.tags:
306 self.
logger.debug(
"resource has template with title tag. Try to adjust title.")
307 self.
logger.debug(
"response.tags['title']: " + str(response.tags[
"title"]))
309 if localExtractor
is None:
313 raise Exception(
">>> Wrong! self.extractors list doesn't have 3'th element (index 2)")
314 if isinstance(response.tags[
"title"], basestring):
315 self.
logger.debug(
"response has not have title tag")
317 title = sel.xpath(
"//title/text()").extract()
318 localExtractor.addTag(result=response, tag_name=
"title", tag_value=title)
319 self.
logger.debug(
"TYPE response.tags['title']['data']" + str(
type(response.tags[
"title"][
"data"])))
321 self.
logger.debug(
"resource hasn't template with title tag. Don't need adjust title.")
322 except Exception
as err:
323 ExceptionLog.handler(self.
logger, err, MSG_ERROR_ADJUST_TITLE, (), \
324 {ExceptionLog.LEVEL_NAME_ERROR:ExceptionLog.LEVEL_VALUE_DEBUG})
332 if response.tags
and "link" in response.tags:
333 self.
logger.debug(
"resource has template with link tag. Try to adjust link.")
334 self.
logger.debug(
"response.tags['link']: " + str(response.tags[
"link"]))
338 self.
logger.debug(
"Extractor exists")
339 if isinstance(response.tags[
"link"], basestring):
340 self.
logger.debug(
"response has not have link tag")
344 response.tags[
"link"][
"data"] = self.
input_data.url
349 self.
logger.debug(
">>> Wrong! self.extractors list doesn't have 3'th element (index 2)")
350 self.
logger.debug(
"TYPE response.tags['link']['data']" + str(
type(response.tags[
"link"][
"data"])))
352 self.
logger.debug(
"resource hasn't template with link tag. Don't need adjust link.")
353 except Exception
as err:
354 ExceptionLog.handler(self.
logger, err, MSG_ERROR_ADJUST_PR, (), \
355 {ExceptionLog.LEVEL_NAME_ERROR:ExceptionLog.LEVEL_VALUE_DEBUG})
368 if response
is not None and response.tags
is not None:
372 if AuthorType.MAIN_TAG_NAME
in response.tags
and response.tags[AuthorType.MAIN_TAG_NAME]
is not None and \
373 "data" in response.tags[AuthorType.MAIN_TAG_NAME]:
374 inputData = response.tags[AuthorType.MAIN_TAG_NAME][
"data"]
375 self.
logger.debug(
"normalizeAuthor response has '" + str(AuthorType.MAIN_TAG_NAME) +
"' is: " + \
377 self.
logger.debug(
"normalizeAuthor type of '" + str(AuthorType.MAIN_TAG_NAME) +
"' is: " + \
378 str(
type(inputData)))
381 if isinstance(inputData, str)
or isinstance(inputData, unicode):
382 inputList = [inputData]
383 elif isinstance(inputData, list):
384 inputList = inputData
388 self.
logger.debug(
"normalizeAuthor confProp: " +
varDump(confProp))
389 self.
logger.debug(
"normalizeAuthor procProp: " +
varDump(procProp))
392 for inputElem
in inputList:
393 author = AuthorType.parse(confProp, procProp, inputElem, self.
logger)
394 if author
is not None:
395 authors.append(author)
397 self.
logger.debug(
"normalizeAuthor result author: " + str(authors))
399 response.tags[AuthorType.MAIN_TAG_NAME][
"data"] = authors
401 except Exception, err:
402 ExceptionLog.handler(self.
logger, err,
'normalizeAuthor error:', (), \
403 {ExceptionLog.LEVEL_NAME_ERROR:ExceptionLog.LEVEL_VALUE_DEBUG})
415 if response
is not None and response.tags
is not None:
418 if self.
input_data.template
and algorithmName == CONSTS.PROCESS_ALGORITHM_REGULAR:
421 for responseTagName
in response.tags:
422 self.
logger.debug(
"normalizeDatetime responseTagName: '" + str(responseTagName) +
"'")
423 if (response.tags.get(responseTagName)
is not None and \
424 'type' in response.tags[responseTagName]
and \
425 response.tags[responseTagName][
'type'] == responseType)
or \
426 (responseTagName == CONSTS.TAG_PUB_DATE
and response.tags.get(responseTagName)
is not None):
427 tagNames.append(responseTagName)
432 self.
logger.debug(
'normalizeDatetime tagNames: ' +
varDump(tagNames))
434 for tagName
in tagNames:
436 if self.
extractor and tagName
in response.tags:
437 self.
extractor.addTag(result=response, tag_name=tagName +
'_normalized', tag_value=pubdate, \
438 xpath=response.tags[tagName][
'xpath'])
440 self.
logger.debug(
'tagName: ' + str(tagName) +
' pubdate: ' + str(pubdate))
441 retDict[tagName] = pubdate
443 if tagName == CONSTS.TAG_PUB_DATE:
450 for key, value
in retDict.items():
451 if value
is not None:
453 self.
logger.debug(
'set return value from ' + str(key) +
' : ' + str(value))
456 except Exception, err:
457 ExceptionLog.handler(self.
logger, err,
'normalizeDatetime error:', (), \
458 {ExceptionLog.LEVEL_NAME_ERROR:ExceptionLog.LEVEL_VALUE_DEBUG})
473 if response
is not None and dataTagName
in response.tags
and response.tags[dataTagName]
is not None:
477 inputData = response.tags[dataTagName][
"data"]
478 self.
logger.debug(
"extractPubDate response has '" + str(dataTagName) +
"' is: " + str(inputData))
479 self.
logger.debug(
"extractPubDate type of '" + str(dataTagName) +
"' is: " + str(
type(inputData)))
482 if isinstance(inputData, basestring):
483 inputList = [inputData]
484 elif isinstance(inputData, list):
485 inputList = inputData
491 for inputElem
in inputList:
493 self.
logger.debug(
'pubdate: ' + str(d))
496 d, tzone = DateTimeType.split(d)
497 pubdate.append(d.isoformat(DateTimeType.ISO_SEP))
498 timezones.append(tzone)
500 self.
logger.debug(
"extractPubDate result pubdate: " + str(pubdate))
501 response.tags[dataTagName][
"data"] = pubdate
505 if len(timezones) > 0:
506 timezone = timezones[0]
508 except Exception, err:
509 ExceptionLog.handler(self.
logger, err,
'extractPubDate error:', (), \
510 {ExceptionLog.LEVEL_NAME_ERROR:ExceptionLog.LEVEL_VALUE_DEBUG})
525 timezone = rawTimezone
528 if CONSTS.PDATE_TIMEZONES_NAME
in properties:
529 propertyString = properties[CONSTS.PDATE_TIMEZONES_NAME]
530 self.
logger.debug(
'inputted ' + CONSTS.PDATE_TIMEZONES_NAME +
':' + str(propertyString))
533 self.
logger.debug(
'pubdate: ' + str(dt))
536 utcOffset = DateTimeType.extractUtcOffset(rawTimezone, self.
logger)
537 self.
logger.debug(
'utcOffset: ' + str(utcOffset))
539 d = PDateTimezonesHandler.transform(dt, utcOffset, propertyString, urlString, self.
logger)
544 d, tzone = DateTimeType.split(dt)
545 pubdate = d.isoformat(DateTimeType.ISO_SEP)
548 return pubdate, timezone
555 for key
in response.tags:
556 if key
in DATA_NEWS_TAGS:
559 if isinstance(response.tags[key], basestring):
560 tagsValue = response.tags[key]
561 elif isinstance(response.tags[key], dict)
and "data" in response.tags[key]:
562 if isinstance(response.tags[key][
"data"], basestring):
563 tagsValue = response.tags[key][
"data"]
564 elif isinstance(response.tags[key][
"data"], list)
and len(response.tags[key][
"data"]) > 0
and \
565 isinstance(response.tags[key][
"data"][0], basestring):
566 tagsValue = response.tags[key][
"data"][0]
568 if tagsValue
is not None:
570 dt = parser.parse(tagsValue)
571 int(time.mktime(dt.timetuple()))
573 removeKeys.append(key)
575 for key
in removeKeys:
576 if key
in response.tags:
577 logging.debug(
">>> Remove " + key +
" element besause it empty")
578 del response.tags[key]
586 auth = urlparse.urlsplit(url)[1]
588 urlHost = (re.search(
'([^@]*@)?([^:]*):?(.*)', auth).groups())[1]
589 if urlHost
is not None and urlHost.find(self.
WWW_PREFIX) == 0:
590 urlHost = urlHost[len(self.
WWW_PREFIX): len(urlHost)]
613 self.
itr = iter(sorted(self.
extractors, key=
lambda extractor: 0, reverse=
True))
617 if CONSTS.MEDIA_LIMITS_NAME
in self.
input_data.batch_item.properties:
618 self.
logger.debug(
"Found property '%s'", str(CONSTS.MEDIA_LIMITS_NAME))
621 for response
in responses:
622 response.metricsPrecalculate()
623 response.stripResult()
625 self.
addCustomTag(result=response, tag_name=CONSTS.TAG_SOURCE_URL, \
630 self.
logger.debug(
"!!! Enter '%s' !!!", str(CONSTS.LANG_PROP_NAME))
633 langDetector.process(response, self.
logger)
634 langTagsDict = langDetector.getLangTags()
659 for tagName, langValue
in langTagsDict.items():
660 self.
addCustomTag(result=response, tag_name=tagName, tag_value=langValue)
662 summaryLang = langDetector.getSummaryLang(response, self.
logger)
663 self.
addCustomTag(result=response, tag_name=CONSTS.TAG_SUMMARY_LANG, tag_value=summaryLang)
664 self.
logger.debug(
"!!! Leave '%s' !!!", str(CONSTS.LANG_PROP_NAME))
679 if CONSTS.TAGS_TYPES_NAME
in self.
input_data.batch_item.properties:
680 tagsTypes = self.
input_data.batch_item.properties[CONSTS.TAGS_TYPES_NAME]
682 self.
logger.info(
'=' * 50)
689 pdateSourceMask = APP_CONSTS.PDATE_SOURCES_MASK_BIT_DEFAULT
690 pdateSourceMaskOverwrite = APP_CONSTS.PDATE_SOURCES_MASK_OVERWRITE_DEFAULT
693 if APP_CONSTS.PDATE_SOURCES_MASK_PROP_NAME
in self.
input_data.batch_item.properties:
694 pdateSourceMask = int(self.
input_data.batch_item.properties[APP_CONSTS.PDATE_SOURCES_MASK_PROP_NAME])
697 if APP_CONSTS.PDATE_SOURCES_MASK_OVERWRITE_PROP_NAME
in self.
input_data.batch_item.properties:
698 pdateSourceMaskOverwrite = \
699 int(self.
input_data.batch_item.properties[APP_CONSTS.PDATE_SOURCES_MASK_OVERWRITE_PROP_NAME])
701 self.
logger.debug(
'pdateSourceMask = %s, pdateSourceMaskOverwrite = %s',
702 str(pdateSourceMask), str(pdateSourceMaskOverwrite))
704 self.
logger.debug(
"!!! self.input_data.batch_item.urlObj.pDate = " + str(self.
input_data.batch_item.urlObj.pDate))
708 if pdateSourceMask & APP_CONSTS.PDATE_SOURCES_MASK_RSS_FEED:
709 if (pdateSourceMaskOverwrite & APP_CONSTS.PDATE_SOURCES_MASK_RSS_FEED)
or \
710 not pdateSourceMaskOverwrite & APP_CONSTS.PDATE_SOURCES_MASK_RSS_FEED:
714 if CONSTS.TAG_DC_DATE
in response.tags
and pdateSourceMask & APP_CONSTS.PDATE_SOURCES_MASK_DC_DATE:
715 if (pdateSourceMaskOverwrite & APP_CONSTS.PDATE_SOURCES_MASK_DC_DATE
and self.
pubdate is None)
or \
716 not pdateSourceMaskOverwrite & APP_CONSTS.PDATE_SOURCES_MASK_DC_DATE:
717 if CONSTS.TAG_PUB_DATE
not in response.tags
or \
718 (isinstance(response.tags[CONSTS.TAG_PUB_DATE][
"data"], basestring)
and \
719 response.tags[CONSTS.TAG_PUB_DATE][
"data"].strip() ==
""):
720 response.tags[CONSTS.TAG_PUB_DATE] = copy.deepcopy(response.tags[CONSTS.TAG_DC_DATE])
721 response.tags[CONSTS.TAG_PUB_DATE][
"name"] = CONSTS.TAG_PUB_DATE
722 if len(response.tags[CONSTS.TAG_PUB_DATE][
"data"]) > 0
and response.tags[CONSTS.TAG_PUB_DATE][
"data"][0]:
723 self.
pubdate = response.tags[CONSTS.TAG_PUB_DATE][
"data"][0]
724 self.
logger.debug(
"Pubdate from 'dc_date': " + str(self.
pubdate))
727 self.
logger.debug(
'Check format pubdate: ' + str(d))
729 d, timezone = DateTimeType.split(d)
730 self.
pubdate = d.isoformat(DateTimeType.ISO_SEP)
731 self.
logger.debug(
"Result pubdate from 'dc_date': %s, timezone: %s", str(self.
pubdate), str(timezone))
736 if pdateSourceMask & APP_CONSTS.PDATE_SOURCES_MASK_PUBDATE:
737 if (pdateSourceMaskOverwrite & APP_CONSTS.PDATE_SOURCES_MASK_PUBDATE
and self.
pubdate is None)
or \
738 not pdateSourceMaskOverwrite & APP_CONSTS.PDATE_SOURCES_MASK_PUBDATE:
740 if pubdate
is not None:
743 self.
logger.debug(
"Pubdate from 'pubdate': " + str(self.
pubdate) +
" timezone: " + str(timezone))
746 if pdateSourceMask & APP_CONSTS.PDATE_SOURCES_MASK_NOW:
747 if (pdateSourceMaskOverwrite & APP_CONSTS.PDATE_SOURCES_MASK_NOW
and self.
pubdate is None)
or \
748 not pdateSourceMaskOverwrite & APP_CONSTS.PDATE_SOURCES_MASK_NOW:
750 self.
logger.debug(
"Pubdate from 'SQL NOW()': " + str(self.
pubdate))
753 if pdateSourceMask & APP_CONSTS.PDATE_SOURCES_MASK_SQL_EXPRESSION
and \
754 APP_CONSTS.PDATE_SOURCES_EXPRESSION_PROP_NAME
in self.
properties:
755 if (pdateSourceMaskOverwrite & APP_CONSTS.PDATE_SOURCES_MASK_SQL_EXPRESSION
and self.
pubdate is None)
or \
756 not pdateSourceMaskOverwrite & APP_CONSTS.PDATE_SOURCES_MASK_SQL_EXPRESSION:
758 self.
logger.debug(
"Pubdate from 'sql expression': " + str(self.
pubdate))
765 self.
pubdate = FieldsSQLExpressionEvaluator.evaluatePDateTime(self.
input_data.batch_item.properties,
778 self.
addCustomTag(result=response, tag_name=CONSTS.TAG_PUBDATE_TZ, tag_value=[timezone])
788 if feedUrl
is not None:
789 self.
addCustomTag(result=response, tag_name=CONSTS.TAG_FEED_URL, tag_value=[feedUrl])
794 self.
logger.debug(
">>> Warning, can't extract output format")
798 response.recalcTagMaskCount(
None, self.
altTagsMask)
804 response.finish = time.time()
805 response.data[
"time"] =
"%s" % (response.finish - response.start)
808 self.
input_data.batch_item.properties, response)
819 if isinstance(pubdate, SQLExpression)
and str(pubdate) ==
"NOW()":
820 pubdate = datetime.datetime.now().strftime(
"%Y-%m-%d %H:%M:%S")
823 self.
logger.debug(
"Check pubdate: '%s'", str(d))
825 pubdate = d.strftime(
"%Y-%m-%d %H:%M:%S")
829 if "pubdate" in response.tags
and "data" not in response.tags[
"pubdate"]:
830 response.tags[
"pubdate"][
"data"] = []
832 if "pubdate" in response.tags
and "data" in response.tags[
"pubdate"]:
833 if len(response.tags[
"pubdate"][
"data"]) > 0:
834 response.tags[
"pubdate"][
"data"][0] = pubdate
836 response.tags[
"pubdate"][
"data"] = [pubdate]
838 if "pubdate" not in response.tags:
839 self.
addCustomTag(result=response, tag_name=CONSTS.TAG_PUB_DATE, tag_value=[pubdate])
843 for key
in response.tags:
844 if "data" in response.tags[key]:
845 if isinstance(response.tags[key][
"data"], basestring):
846 localStr = response.tags[key][
"data"]
847 response.tags[key][
"data"] = []
848 response.tags[key][
"data"].append(localStr)
853 if localOutputFormat ==
"json":
854 localStr = json.dumps(elem, ensure_ascii=
False)
856 if len(localStr) > 0:
857 if localStr[0] ==
'\"' or localStr[0] ==
'\'':
858 localStr = localStr[1:]
859 if localStr[-1] ==
'\"' or localStr[-1] ==
'\'':
860 localStr = localStr[0:-1]
863 elif localOutputFormat ==
"html" or localOutputFormat ==
"xml":
864 ret = xml.sax.saxutils.escape(elem, {
"'":
"'",
"\"" :
"""})
865 elif localOutputFormat ==
"sql":
867 ret = Utils.escape(elem)
873 for key
in response.tags:
874 if "data" in response.tags[key]:
875 if isinstance(response.tags[key][
"data"], list):
876 for i, elem
in enumerate(response.tags[key][
"data"]):
877 if len(response.tags[key][
"data"]) > i:
880 elif isinstance(response.tags[key][
"data"], str)
or isinstance(response.tags[key][
"data"], unicode):
881 response.tags[key][
"data"] = self.
formatOutpuElement(response.tags[key][
"data"], localOutputFormat)
885 if isinstance(self.
input_data.template, dict):
891 self.
logger.
error(
"Wrong template structure: `%s` but dict expected, assumed empty!",
901 if rule.get(
'postProcessing')
is not None and rule[
"postProcessing"] !=
"":
902 self.
logger.debug(
"Post-processing applied for tag `%s` with expression: %s",
903 str(tag), str(rule[
"postProcessing"]))
906 self.
logger.debug(
"Post-processing is not applied for tag `%s`", str(tag))
922 self.
logger.debug(
"Template tag: " + tag)
923 if "state" in template[tag]
and not bool(int(template[tag][
"state"])):
924 self.
logger.debug(
"Tag skipped because state disabled, name: %s", str(tag))
927 if CONSTS.TAG_MARKUP_PROP_NAME
in self.
properties else None)
928 for rule
in template[tag]:
929 if not isinstance(rule, dict):
930 self.
logger.
error(
"Rule skipped because wrong structure - is not dict() type: %s", str(
type(rule)))
932 if "attributesExclude" in rule:
934 if rule[
"attributesExclude"] !=
"":
936 except Exception
as err:
937 self.
logger.
error(
"Feature of attributesExclude ignored because wrong structure: %s", str(err))
942 pathDict = Utils.getPairsDicts(rule)
951 innerDelimiter = xPathPreparing.resolveInnerDelimiter(rule, self.
properties)
953 self.
logger.debug(
">>> innerDelimiter: '" + str(innerDelimiter) +
"'")
955 xpath, xpathValue = xPathPreparing.process(rule, sel, self.
xpathSplitString, innerDelimiter)
956 except Exception
as excp:
957 ExceptionLog.handler(self.
logger, excp,
"Rule/xpath exception: ", (), \
958 {ExceptionLog.LEVEL_NAME_ERROR:ExceptionLog.LEVEL_VALUE_DEBUG})
960 self.
logger.debug(
"xpath: `%s`, xpathType: `%s`, xpathValue: `%s`",
961 str(xpath), str(
type(xpathValue)), str(xpathValue))
962 if (isinstance(xpathValue, list)
and len(xpathValue) == 0)
or\
963 (isinstance(xpathValue, basestring)
and xpathValue ==
''):
964 self.
logger.debug(
">>> set default xpathValue")
966 xpathValue.append(rule[
"default"])
968 self.
logger.debug(
"result before:\n%s",
varDump(localResult))
969 self.
extractor.addTag(localResult, tag, xpathValue, xpath,
not isExtract,
False, rule[
"type"])
972 self.
logger.debug(
"Tag type: `%s`, tags data type: `%s`",
973 str(
type(localResult.tags)), str(
type(localResult.tags[tag][
"data"])))
974 if tag
in localResult.tags
and isinstance(localResult.tags[tag][
"data"], basestring):
975 self.
logger.debug(
"Convert result for tag: `%s`", str(tag))
976 localString = localResult.tags[tag][
"data"]
977 localResult.tags[tag][
"data"] = []
978 localResult.tags[tag][
"data"].append(localString)
980 self.
formatTag(localResult, rule, tag, pathDict, isExtract)
985 localResult.finish = time.time()
987 resultsList.append({
"obj": localResult,
"join": rule[
"join"],
"isExtract": isExtract,
"mandatory":
988 (bool(rule[
"mandatory"])
if "mandatory" in rule
else False),
989 "delimiter": (rule[
"delimiter"]
if "delimiter" in rule
else self.
xpathSplitString),
990 "type": rule[
"type"]})
993 self.
compileResults(result, prepareResultsList, tag, xPathPreparing)
995 result.finish = time.time()
1007 if tag_name
not in result.tags:
1008 data = {
"extractor":
"Base extractor",
"data":
"",
"name":
""}
1009 data[
"data"] = tag_value
1010 data[
"name"] = tag_name
1011 data[
"xpath"] =
None 1013 data[
"extractor"] = self.__class__.__name__
1014 result.tags[tag_name] = data
1036 for elem
in resultsList:
1037 if key
in result.tags:
1038 if result.tags[key]
is not None:
1039 if result.tags[key][
"xpath"]
is None:
1040 result.tags[key][
"xpath"] = elem[
"obj"].tags[key][
"xpath"]
1042 result.tags[key][
"xpath"] +=
' ' 1043 result.tags[key][
"xpath"] += elem[
"obj"].tags[key][
"xpath"]
1044 if result.tags[key][
"data"]
is None or len(result.tags[key][
"data"]) == 0:
1045 result.tags[key][
"data"] = elem[
"obj"].tags[key][
"data"]
1047 if xPathPreparing
is not None:
1051 result.tags[key][
"data"][0] +=
' ' 1052 result.tags[key][
"data"][0] += elem[
"obj"].tags[key][
"data"][0]
1054 result.tags.update(elem[
"obj"].tags)
1059 if len(resultsList) > 0:
1064 for elem
in resultsList:
1066 if elem[
"join"] ==
"concat":
1067 tempList.append(elem)
1069 if elem[
"mandatory"]:
1071 if not elem[
"isExtract"]:
1074 localElemWeight = localElemWeight | CONSTS.TAGS_RULES_MASK_MANDATORY_FIELD
1075 if elem[
"join"] ==
"best":
1076 localElemWeight = localElemWeight | CONSTS.TAGS_RULES_MASK_RULE_PRIORITY
1077 if elem[
"isExtract"]:
1078 localElemWeight = localElemWeight | CONSTS.TAGS_RULES_MASK_DEFAULT_VALUE
1080 self.
logger.debug(
">>> Rule weight = " + str(localElemWeight))
1081 self.
logger.debug(
">>> Rule join = " + elem[
"join"])
1082 if localElemWeight > firstElemWeight:
1083 firstElemWeight = localElemWeight
1086 if firstElem
is not None:
1087 tempList = [firstElem] + tempList
1088 isExtractResults = any([elem[
"isExtract"]
for elem
in tempList])
1089 if isExtractResults:
1090 ret = [elem
for elem
in tempList
if elem[
"isExtract"]]
1091 elif len(tempList) > 0:
1092 ret.append(tempList[0])
1101 def elemUrlsCanoizator(self, data, baseUrl=None, firstDelim=' ', secondDelim=',', useAdditionEncoding=False):
1102 normMask = UrlNormalizator.NORM_NONE
1103 if "URL_NORMALIZE_MASK_PROCESSOR" in self.
properties:
1104 normMask = int(self.
properties[
"URL_NORMALIZE_MASK_PROCESSOR"])
1107 if data.strip() !=
"":
1109 for elem
in data.split(firstDelim):
1110 if elem.strip() !=
"":
1112 if baseUrl
is not None:
1115 processedUrl = dc_event.URL(0, localUrl, normalizeMask=normMask).getURL(normMask)
1116 if useAdditionEncoding:
1117 processedUrl = xml.sax.saxutils.escape(processedUrl, {})
1118 ret += processedUrl + secondDelim
1119 if ret !=
"" and ret[-1] == secondDelim:
1120 ret = ret[0: len(ret) - 1]
1132 if isinstance(data, basestring):
1133 ret = self.
elemUrlsCanoizator(data, baseUrl, useAdditionEncoding=useAdditionEncoding)
1134 elif isinstance(data, list):
1137 elem = self.
elemUrlsCanoizator(elem, baseUrl, useAdditionEncoding=useAdditionEncoding)
1144 def formatTag(self, result, path, key, pathDict, isExtract):
1146 self.
logger.debug(
"Tag name: '%s', tag type: %s, tag format: '%s'",
1147 str(key), str(path[
"type"]), str(path[
"format"]))
1149 if path[
"type"] ==
"text":
1151 for elem
in result.tags[key][
"data"]:
1155 if "format" in pathDict
and "maxCh" in pathDict[
"format"]:
1156 localMaxCh = pathDict[
"format"][
"maxCh"]
1157 self.
logger.debug(
"!!! get localMaxCh from pathDict[\"format\"][\"maxCh\"] = %s", str(localMaxCh))
1159 localMaxCh = path[
"format"]
1160 if isinstance(localMaxCh, basestring)
and localMaxCh ==
"":
1162 self.
logger.debug(
"!!! get localMaxCh from [\"format\"] = %s", str(localMaxCh))
1165 if localMaxCh
is not None and int(localMaxCh) > 0
and len(localText) > int(localMaxCh):
1166 localText = localText[0: int(localMaxCh)]
1167 except ValueError, err:
1168 self.
logger.debug(
"!!! Use wrong value, error: %s", str(err))
1170 result.tags[key][
"data"] = []
1171 result.tags[key][
"data"].append(localText)
1172 elif path[
"type"] ==
"html":
1174 for i, elem
in enumerate(result.tags[key][
"data"]):
1175 result.tags[key][
"data"][i] = re.sub(
r"<![ \r\n\t]*(--([^\-]|[\r\n]|-[^\-])*--[ \r\n\t]*)>",
"", elem)
1176 self.
logger.debug(
">>> After RE = " + str(result.tags[key][
"data"]))
1180 elif path[
"type"] ==
"datetime":
1184 self.
logger.debug(
"Try to convert data")
1187 if len(result.tags[key][
"data"][0]) > 0
and result.tags[key][
"data"][0][0] ==
'@':
1188 localFormatStr = result.tags[key][
"data"][0][1: len(result.tags[key][
"data"][0])]
1189 localTm = datetime.datetime.fromtimestamp(time.time())
1190 result.tags[key][
"data"][0] = datetime.datetime.strftime(localTm, localFormatStr)
1193 self.
logger.debug(
">>> Time log Before = " + bestData)
1194 if path[
"format"] !=
"" and path[
"format"] !=
"FULL":
1195 result.tags[key][
"data"][0] = datetime.datetime.strftime(parser.parse(bestData), path[
"format"])
1197 result.tags[key][
"data"][0] = str(parser.parse(bestData))
1198 self.
logger.debug(
">>> Time log after = " + result.tags[key][
"data"][0])
1199 except Exception
as err:
1200 self.
logger.debug(
"Can't convert data <<< " + str(result.tags) +
" " + str(key) +
" err = " + str(err))
1201 result.tags[key][
"data"][0] = bestData
1202 if len(result.tags[key][
"data"]) > 0:
1203 result.tags[key][
"data"] = [result.tags[key][
"data"][0]]
1205 elif path[
"type"] ==
"image":
1206 if path[
"format"] ==
"URL" and "canonicalizeURLs" in path
and int(path[
"canonicalizeURLs"]) == 1:
1208 elif path[
"type"] ==
"link":
1209 formatName = path[
"format"]
1210 if len(formatName.split(
',')) > 1:
1211 formatName = formatName.split(
',')[1]
1212 if formatName ==
"email-address" or formatName ==
"email-to":
1214 if isinstance(result.tags[key][
"data"], basestring):
1215 self.
logger.debug(
">>> mail to str type")
1217 index = localText.find(
"mailto:")
1219 localText = localText[index + len(
"mailto:"), len(localText)]
1222 elif isinstance(result.tags[key][
"data"], list):
1223 self.
logger.debug(
">>> mail to list type")
1224 for elem
in result.tags[key][
"data"]:
1226 index = elemText.find(
"mailto:")
1228 elemText = elemText[index + len(
"mailto:"): len(elemText)]
1229 if formatName ==
"email-address":
1230 elemText = Utils.emailParse(elemText)
1232 elemText = Utils.emailParse(elemText,
True)
1238 result.tags[key][
"data"] = []
1239 result.tags[key][
"data"].append(localText)
1240 if "canonicalizeURLs" in path
and int(path[
"canonicalizeURLs"]) == 1:
1242 elif path[
"type"] ==
"attribute":
1245 if isinstance(result.tags[key][
"data"], basestring):
1246 localText = result.tags[key][
"data"]
1247 elif isinstance(result.tags[key][
"data"], list):
1248 localText = self.
xpathSplitString.
join([elem
for elem
in result.tags[key][
"data"]
if elem !=
''])
1249 splittedFormatString = path[
"format"].split(
',')
1250 if len(splittedFormatString) >= 2:
1252 if int(splittedFormatString[0]) < len(localText):
1253 localText = localText[0: int(splittedFormatString[0])]
1254 except Exception
as err:
1255 self.
logger.debug(
"Error: %s; Wrong path format for attribute rule, format=%s", str(err), path[
"format"])
1256 result.tags[key][
"data"] = []
1257 result.tags[key][
"data"].append(localText)
1260 for elem
in result.tags[key][
"data"]:
1263 result.tags[key][
"data"][0] = localElem
1264 result.tags[key][
"data"][0] = result.tags[key][
"data"][0].strip(self.
xpathSplitString)
1268 if key
in result.tags
and "data" in result.tags[key]
and result.tags[key][
"data"]
is not None and \
1269 len(result.tags[key][
"data"]) > 0:
1271 matchingVal = re.compile(postProcessingRE)
1272 except re.error
as err:
1273 self.
logger.debug(
"Post-processing RE error: %s", str(err))
1276 self.
logger.debug(
"!!! type(result.tags[%s][\"data\"] = %s", str(key),
type(result.tags[key][
"data"]))
1280 if isinstance(result.tags[key][
"data"], basestring):
1281 matchingResult = matchingVal.findall(result.tags[key][
"data"])
1282 elif isinstance(result.tags[key][
"data"], list):
1284 for tagData
in result.tags[key][
"data"]:
1285 self.
logger.debug(
"!!! type(tagData) = %s, tagData: %s", str(
type(tagData)),
varDump(tagData))
1286 localRes = matchingVal.findall(tagData)
1287 matchingResult.extend(localRes)
1293 innerSplitString =
'|||||' 1294 self.
logger.debug(
"Post-processing has %s matched results!", str(len(matchingResult)))
1295 self.
logger.debug(
"Post-processing matchingResult: %s",
varDump(matchingResult))
1296 if len(matchingResult) > 0:
1297 for elem
in matchingResult:
1298 if isinstance(elem, basestring):
1302 for innerElem
in elem:
1303 if innerElem
is not None and innerElem !=
'':
1304 tmpStr += str(innerElem)
1305 tmpStr += innerSplitString
1307 self.
logger.debug(
"Post-processing has no matched results!")
1311 self.
logger.debug(
"Post-processing matched and replaced with pieces!")
1312 self.
logger.debug(
"!!! type(result.tags[%s][\"data\"])) = %s", str(key), str(
type(result.tags[key][
"data"])))
1314 if isinstance(result.tags[key][
"data"], basestring):
1315 result.tags[key][
"data"] = tmpStr
1318 elif isinstance(result.tags[key][
"data"], list):
1319 result.tags[key][
"data"] = matchingResult
1322 self.
logger.debug(
"Post-processing not matched, value replaced with None or empty!")
1323 if isinstance(result.tags[key][
"data"], basestring):
1324 result.tags[key][
"data"] =
'' 1326 result.tags[key][
"data"][0] =
None 1328 self.
logger.debug(
"Post-processing keys not found!")
1333 if bufFormat.find(
"NO_SCRIPT") >= 0:
1334 ret = Utils.stripHTMLComments(htmlBuf, soup=
None)
1335 if bufFormat.find(
"NO_META") >= 0:
1337 if bufFormat.find(
"NO_COMMENTS") >= 0:
1339 if bufFormat.find(
"ENTITIES_ENCODED") >= 0:
1346 if isinstance(data, list):
1349 if ch >=
'0' and ch <=
'9':
1358 if isinstance(ret, basestring):
1359 ret = ret.replace(
'\n',
'')
1360 ret = ret.replace(
'\t',
'')
1374 resource_set[
"resId"] = self.
input_data.urlId
1375 resource_set[
"siteId"] = self.
input_data.siteId
1376 resource_set[
"raw_html"] = self.
input_data.raw_content
1380 blockedByXpathTags = []
1384 self.
logger.debug(
"Got best matching extractor: " + str(self.
extractor))
1386 self.
logger.debug(
"No more extractors, exiting loop")
1391 if CONSTS.TAG_MEDIA
in collectResult.tags.keys()
and \
1392 not self.
extractor.isTagNotFilled(collectResult, CONSTS.TAG_MEDIA):
1393 self.
logger.debug(
"!!! Check collectResult. Tag 'media' already selected. Copy.")
1394 result.tags[CONSTS.TAG_MEDIA] = collectResult.tags[CONSTS.TAG_MEDIA]
1396 result.blockedByXpathTags = blockedByXpathTags
1398 result = self.
extractor.extractTags(resource, result)
1400 self.
logger.debug(
">>> TAG END")
1401 empty_tags = result.getEmptyTags()
1402 self.
logger.debug(
"get list of empty tags from result: " + str(empty_tags))
1403 filled_tags = result.getFilledTags()
1404 self.
logger.debug(
"get list of filled_tags from result: " + str(filled_tags))
1407 for tag
in result.tags:
1409 for rule
in template[tag]:
1411 if tag
not in collectResult.tags
or not collectResult.isTagFilled(tag):
1412 collectResult.tags[tag] = copy.deepcopy(result.tags[tag])
1413 blockedByXpathTags = result.blockedByXpathTags
1414 result.finish = time.time()
1417 collectResult.blockedByXpathTags = blockedByXpathTags
1418 ret = [collectResult] + ret
1424 empty_tags = result.getEmptyTags()
1425 for localKey
in EXTENDED_NEWS_TAGS:
1426 if localKey
in empty_tags
or (localKey
in result.tags
and result.isTagFilled(localKey)
is False):
1428 for tagName
in LINKS_NEWS_TAGS:
1429 if tagName
in result.tags:
1430 if isinstance(result.tags[tagName], dict)
and (result.tags[tagName][
"xpath"] ==
"" or \
1431 result.tags[tagName][
"xpath"].find(
"/@src") != -1
or result.tags[tagName][
"xpath"].find(
"/@href") != -1):
1432 result.tags[tagName][
"data"] = \
1441 replaceValue = localValue.replace(replaceFrom, replaceTo)
1442 while len(replaceValue) != len(buf):
1443 localValue = replaceValue
1444 replaceValue = localValue.replace(replaceFrom, replaceTo)
1449 if tagName
in result.tags:
1450 if isinstance(result.tags[tagName], dict):
1452 if isinstance(result.tags[tagName][
"data"], list)
and len(result.tags[tagName][
"data"]) > 0:
1453 localValue = result.tags[tagName][
"data"][0]
1454 elif isinstance(result.tags[tagName][
"data"], basestring):
1455 localValue = result.tags[tagName][
"data"]
1456 if localValue
is not None:
1458 if CONSTS.TAG_REDUCE_PROP_NAME
in self.
properties:
1460 replaceList = json.loads(self.
properties[CONSTS.TAG_REDUCE_PROP_NAME])
1462 self.
logger.debug(
">>> Bad processor_property json format, [" + CONSTS.TAG_REDUCE_PROP_NAME +
"]")
1463 if replaceList
is None:
1464 replaceList = CONTENT_REPLACEMENT_LIST
1466 if CONSTS.TAG_REDUCE_MASK_PROP_NAME
in self.
properties:
1470 self.
logger.
error(
"Bad processor property '%s' value: '%s'", CONSTS.TAG_REDUCE_MASK_PROP_NAME,
1471 str(self.
properties[CONSTS.TAG_REDUCE_MASK_PROP_NAME]))
1476 replaceList = [replaceList[i]
for i
in xrange(len(replaceList))
if 1 << i & self.
tagReduceMask]
1481 for elem
in replaceList:
1483 localValue = Utils.replaceLoopValue(localValue, (elem * 2), elem)
1485 localValue = localValue.replace(
"\r",
" ")
1487 if isinstance(result.tags[tagName][
"data"], list)
and len(result.tags[tagName][
"data"]) > 0:
1488 result.tags[tagName][
"data"][0] = localValue
1489 elif isinstance(result.tags[tagName][
"data"], basestring):
1490 result.tags[tagName][
"data"] = localValue
1494 self.
logger.debug(
">>> Start addition news extracting")
1496 if extractor
is not None:
1498 for tagsXpath
in tagsXpaths:
1499 if tagsXpath
is not None and tagsXpath !=
"":
1500 localXpath = sel.xpath(tagsXpath)
1501 localValue = Utils.innerText(localXpath,
' ',
' ', self.
properties[CONSTS.TAG_MARKUP_PROP_NAME] \
1502 if CONSTS.TAG_MARKUP_PROP_NAME
in self.
properties else None,
None,
1504 if localValue !=
"":
1505 extractor.addTag(localResult, key, localValue, tagsXpath)
1508 self.
logger.debug(
">>> Cant extract tag=%s for xpath=%s" % (key, tagsXpath))
1514 extractor = next(self.
itr)
1515 except StopIteration:
1541 if "pubdate" in result[0].tags
and "data" in result[0].tags[
"pubdate"]
and \
1542 len(result[0].tags[
"pubdate"][
"data"]) > 0:
1543 self.
pubdate = result[0].tags[
"pubdate"][
"data"][0]
1544 self.
logger.debug(
'>>>> Set self.pubdate = ' + str(self.
pubdate))
1556 self.
logger.debug(
">>> No moduler_key or algorithm_name in self.properties")
1560 self.
logger.debug(
"Modules: %s" % modules)
1563 for module
in modules:
1566 if exrtactor
is not None:
1570 self.
logger.debug(
"*******************")
1571 self.
logger.debug(
"Loaded extractors:")
1573 self.
logger.debug(exrtactor.name)
1574 self.
logger.debug(
"*******************")
1576 except Exception
as err:
1577 ExceptionLog.handler(self.
logger, err, MSG_ERROR_LOAD_EXTRACTORS)
1588 if self.
usageModel == APP_CONSTS.APP_USAGE_MODEL_PROCESS:
1590 input_pickled_object = sys.stdin.read()
1593 if self.
usageModel == APP_CONSTS.APP_USAGE_MODEL_PROCESS:
1594 scraper_in_data = pickle.loads(input_pickled_object)
1595 except Exception
as err:
1596 ExceptionLog.handler(self.
logger, err,
'pickle.loads() error:')
1597 self.
logger.debug(
"input_pickled_object:\n" + str(input_pickled_object))
1599 raise Exception(err)
1602 if self.
usageModel == APP_CONSTS.APP_USAGE_MODEL_PROCESS:
1604 if self.
input_data.batch_item.urlObj
is not None:
1605 urlString = self.
input_data.batch_item.urlObj.url
1608 logMsg =
"BatchItem.siteId=" + str(self.
input_data.batch_item.siteId) + \
1609 ", BatchItem.urlId=" + str(self.
input_data.batch_item.urlId) + \
1610 ", BatchItem.urlObj.url=" + urlString
1611 app.Profiler.messagesList.append(logMsg)
1612 self.
logger.info(
"Incoming data: %s", logMsg)
1620 len(self.
input_data.batch_item.properties[
"template"][
"templates"]) > 0
and \
1621 "output_format" in self.
input_data.batch_item.properties[
"template"][
"templates"][0]
and \
1622 "name" in self.
input_data.batch_item.properties[
"template"][
"templates"][0][
"output_format"]:
1623 self.
outputFormat = self.
input_data.batch_item.properties[
"template"][
"templates"][0][
"output_format"][
"name"]
1625 if "TAGS_MAPPING" in self.
input_data.batch_item.properties
and \
1626 self.
input_data.batch_item.properties[
"TAGS_MAPPING"]
is not None:
1630 except Exception
as exp:
1631 self.
logger.debug(
">>> Bad TAGS_MAPPING properties value, err=" + str(exp))
1635 processor_properties = self.
input_data.processor_properties
1638 if not isinstance(processor_properties, dict):
1639 processor_properties = json.loads(self.
input_data.processor_properties)
1640 self.
logger.debug(
"Processor's properties was taken from input data: %s" % processor_properties)
1642 except Exception
as err:
1643 ExceptionLog.handler(self.
logger, err,
'Error load properties from input data:')
1647 if self.
usageModel == APP_CONSTS.APP_USAGE_MODEL_PROCESS:
1648 Utils.storePickleOnDisk(input_pickled_object, ENV_SCRAPER_STORE_PATH,
"scraper.in." + \
1653 self.
logger.debug(
">>> Metrics loads = " + str(self.
metrics))
1654 except Exception
as excp:
1655 self.
logger.debug(
">>> Metrcis dumps exception = " + str(excp))
1658 sys.stdout = open(
"/dev/null",
"wb")
1681 if self.
usageModel == APP_CONSTS.APP_USAGE_MODEL_PROCESS:
1682 output_pickled_object = pickle.dumps(scraperResponse)
1683 Utils.storePickleOnDisk(output_pickled_object, ENV_SCRAPER_STORE_PATH,
1685 print output_pickled_object
1690 except Exception
as err:
1691 ExceptionLog.handler(self.
logger, err,
'Scraper process batch error:')
1693 raise Exception(
'Scraper process batch error:' + str(err))
1702 self.
config.optionxform = str
1703 if self.
usageModel == APP_CONSTS.APP_USAGE_MODEL_PROCESS:
1704 if self.pargs.config:
1705 self.
config.read(self.pargs.config)
1707 self.
config.read(APP_NAME)
1711 print MSG_ERROR_LOAD_CONFIG
1720 if self.
usageModel == APP_CONSTS.APP_USAGE_MODEL_PROCESS:
1721 log_conf_file = self.
config.get(
"Application",
"log")
1722 logging.config.fileConfig(log_conf_file)
1725 except Exception, err:
1726 raise Exception(CONSTS.MSG_ERROR_LOAD_CONFIG +
" : " + str(err))
1734 class_name = self.__class__.__name__
1737 dbTaskIniConfigFileName = self.
config.get(self.__class__.__name__,
"db-task_ini")
1738 config = ConfigParser.ConfigParser()
1739 config.optionxform = str
1740 readOk = config.read(dbTaskIniConfigFileName)
1741 if len(readOk) == 0:
1746 urlSourcesList = self.
config.get(self.__class__.__name__, OPTION_SECTION_URL_SOURCES_RULES)
1747 if isinstance(urlSourcesList, basestring):
1748 self.
urlSourcesRules = [urlSourcesRule.strip()
for urlSourcesRule
in urlSourcesList.split(
',')]
1755 self.
tagsTypes = self.
config.get(class_name, OPTION_SECTION_TAGS_TYPE)
1757 if self.
config.has_section(OPTION_SECTION_DATETIME_NEWS_NAMES):
1759 for item
in self.
config.
items(OPTION_SECTION_DATETIME_NEWS_NAMES):
1762 self.
logger.debug(
"Config file hasn't section: " + str(OPTION_SECTION_DATETIME_NEWS_NAMES))
1765 if self.
config.has_section(OPTION_SECTION_DATETIME_TEMPLATE_TYPES):
1767 for item
in self.
config.
items(OPTION_SECTION_DATETIME_TEMPLATE_TYPES):
1770 self.
logger.debug(
"Config file hasn't section: " + str(OPTION_SECTION_DATETIME_TEMPLATE_TYPES))
1773 print MSG_ERROR_LOAD_OPTIONS
1783 scraperProperies = json.loads(fd.read())
1784 self.
properties = scraperProperies[self.__class__.__name__][CONSTS.PROPERTIES_KEY]
1785 except Exception
as excp:
1786 self.
logger.debug(
">>> Some error with scraper property loads = " + str(excp))
1798 appInst = (module_name, eval(module_name)(self.
config,
1802 self.
logger.debug(
"%s has been created!" % module_name)
1803 except Exception
as err:
1804 ExceptionLog.handler(self.
logger, err,
"Can't create module %s. Error is:" % (module_name))
1816 if extractor.__class__.__name__ == extractorName:
1841 if CONSTS.PUBLISHED
in self.
article.tags:
1846 self.
logger.debug(
"Resource %s hasn't publish date" % str(self.
article.tags[CONSTS.TAG_LINK][
"data"]))
1848 self.
logger.debug(
"Resource hasn't raw content. Exit.")
1855 resid = self.
entry[
"urlMd5"]
1858 for tag
in self.
entry[
"entry"]:
1859 data = {
"extractor":
"feedParser extractor",
"data":
"",
"name":
""}
1860 data[
"data"] = self.
entry[
"entry"][tag]
1864 date_tags = [
"published",
"updated",
"updated_parsed"]
1865 if len(set(self.
entry[
"entry"].keys()).intersection(date_tags)) == 0:
1866 self.
logger.debug(
"PUBDATE_ERROR: list of tags from rss feed: %s" % str(self.
entry[
"entry"].keys()))
1868 if "pubdate" in self.
entry and self.
article.tags[
"pubdate"] ==
"":
1869 data = {
"extractor":
"feedParser extractor",
"data":
"",
"name":
""}
1870 data[
"data"] = self.
entry[
"pubdate"]
1871 data[
"name"] =
"pubdate" 1872 self.
article.tags[
"pubdate"] = data
1875 data = {
"extractor":
"feedParser extractor",
"data":
"",
"name":
""}
1876 data[
"data"] = self.
entry[
"parent_rss_feed"]
1877 data[
"name"] =
"parent_rss_feed" 1879 data[
"extractor"] = self.__class__.__name__
1880 self.
article.tags[
"parent_rss_feed"] = data
1883 data = {
"extractor":
"feedParser extractor",
"data":
"",
"name":
""}
1884 data[
"data"] = self.
entry[
"parent_rss_feed_urlMd5"]
1885 data[
"name"] =
"parent_rss_feed_urlMd5" 1887 data[
"extractor"] = self.__class__.__name__
1888 self.
article.tags[
"parent_rss_feed_urlMd5"] = data
1902 self.putArticleToDB({
"default":self.
article})
1903 except ValueError, err:
1904 ExceptionLog.handler(self.
logger, err,
'Bad raw content:', (self.
input_data.raw_content), \
1905 {ExceptionLog.LEVEL_NAME_ERROR:ExceptionLog.LEVEL_VALUE_DEBUG})
1921 self.
logger.debug(
"!!! siteId: %s, url: %s", str(siteId), str(url))
1926 if rawPubdate
is not None:
1928 dt = DateTimeType.parse(rawPubdate,
True, self.
logger,
False)
1930 dt, timezone = DateTimeType.split(dt)
1931 pubdate = dt.strftime(
"%Y-%m-%d %H:%M:%S")
1935 except Exception, err:
1936 self.
logger.debug(
"Unsupported date format: '%s', error: %s", str(rawPubdate), str(err))
1938 return pubdate, timezone
1950 self.
logger.debug(
"!!! siteId: %s, url: %s", str(siteId), str(url))
1952 if headerContent
is not None:
1955 self.
logger.debug(
'!!! ret: ' + str(ret))
1969 self.
logger.debug(
"!!! siteId: %s, url: %s", str(siteId), str(url))
1971 if headerContent
is not None:
1974 self.
logger.debug(
'!!! ret: ' + str(ret))
1986 headerContent =
None 1987 urlContentObj = dc_event.URLContentRequest(siteId, url, \
1988 dc_event.URLContentRequest.CONTENT_TYPE_RAW_LAST + \
1989 dc_event.URLContentRequest. CONTENT_TYPE_RAW + \
1990 dc_event.URLContentRequest.CONTENT_TYPE_HEADERS)
1992 rawContentData = self.
dbWrapper.urlContent([urlContentObj])
1994 if rawContentData
is not None and len(rawContentData) > 0:
1995 if rawContentData[0].headers
is not None and len(rawContentData[0].headers) > 0
and \
1996 rawContentData[0].headers[0]
is not None:
1997 headerContent = rawContentData[0].headers[0].buffer
1999 return headerContent
2013 if isinstance(headerContent, basestring):
2015 header = base64.b64decode(headerContent)
2017 header = headerContent
2019 headerList = header.split(
'\r\n')
2022 for elem
in headerList:
2023 pos = elem.find(name +
':')
2026 ret = elem.replace(name +
':',
'').strip()
2027 self.
logger.debug(
"Found '" + name +
"' has value: " + str(ret))
2041 pubdate = rawPubdate
2043 self.
logger.debug(
'pubdateMonthOrder() enter... rawPubdate: ' + str(rawPubdate))
2044 if CONSTS.PDATE_DAY_MONTH_ORDER_NAME
in properties
and isinstance(rawPubdate, basestring):
2047 self.
logger.debug(
'inputted ' + CONSTS.PDATE_DAY_MONTH_ORDER_NAME +
':' + \
2048 str(properties[CONSTS.PDATE_DAY_MONTH_ORDER_NAME]))
2049 propertyObj = json.loads(properties[CONSTS.PDATE_DAY_MONTH_ORDER_NAME])
2050 except Exception, err:
2051 self.
logger.
error(
"Fail loads '%s', error: %s", str(CONSTS.PDATE_DAY_MONTH_ORDER_NAME), str(err))
2053 for propertyElem
in propertyObj:
2055 if "pattern" not in propertyElem:
2056 raise Exception(
'Property "pattern" not found')
2058 if "order" not in propertyElem:
2059 raise Exception(
'Property "order" not found')
2061 pattern = str(propertyElem[
"pattern"])
2062 order = int(propertyElem[
"order"])
2064 if re.search(pattern, urlString, re.UNICODE)
is not None:
2065 self.
logger.debug(
"Pattern '%' found in url: %s", str(pattern), str(urlString))
2069 dt = datetime.datetime.strptime(rawPubdate,
"%Y-%d-%m %H:%M:%S")
2071 dt = datetime.datetime.strptime(rawPubdate,
"%Y-%m-%d %H:%M:%S")
2073 raise Exception(
"Unsupported value of 'order' == " + str(order))
2076 pubdate = dt.strftime(
"%Y-%d-%m %H:%M:%S")
2078 except Exception, err:
2079 self.
logger.
error(
"Fail execution '%s', error: %s", str(CONSTS.PDATE_DAY_MONTH_ORDER_NAME), str(err))
2081 self.
logger.debug(
'pubdateMonthOrder() leave... pubdate: ' + str(pubdate))
2097 for media
in mediaUrls:
2099 if re.search(MediaLimitsHandler.BINARY_IMAGE_SEARCH_STR, media, re.UNICODE)
is not None:
2100 self.
logger.debug(
"Tag 'media' has binary picture...")
2103 allowedUrls.append(media)
2106 allowedUrls.append(media)
2108 self.
logger.debug(
"Binary media tag has not allowed limits. Skipped...")
2112 self.
logger.debug(
"Tag 'media' has valid url: %s", str(media))
2114 allowedUrls.append(media)
2117 allowedUrls.append(media)
2119 self.
logger.debug(
"Media tag has not allowed limits. Skipped. Url: %s", str(media))
2123 self.
logger.debug(
"Invalid url in tag 'media'... Url: %s", str(media))
2135 PROTOCOL_STR =
'http' 2137 DELIMITER_NEW =
'|||||' 2138 urlStringMedia = urlStringMedia.replace(DELIMITER_OLD + PROTOCOL_STR, DELIMITER_NEW + PROTOCOL_STR)
2140 REPLACE_STR =
'base64|' 2141 if urlStringMedia.find(MediaLimitsHandler.BINARY_IMAGE_SEARCH_STR) > -1:
2142 urlStringMedia = urlStringMedia.replace(MediaLimitsHandler.BINARY_IMAGE_SEARCH_STR, REPLACE_STR)
2143 urls = urlStringMedia.split(DELIMITER_NEW)
2145 urls = [url.replace(REPLACE_STR, MediaLimitsHandler.BINARY_IMAGE_SEARCH_STR)
for url
in urls]
2148 urls = urlStringMedia.split(DELIMITER_NEW)
2161 if CONSTS.HTTP_REDIRECT_LINK_NAME
in properties:
2162 self.
logger.debug(
"Found property '%s'", str(CONSTS.HTTP_REDIRECT_LINK_NAME))
2163 propertyValue = int(properties[CONSTS.HTTP_REDIRECT_LINK_NAME])
2165 self.
logger.debug(
"siteId: %s, url: %s, propertyValue: %s", str(siteId), str(url), str(propertyValue))
2170 self.
logger.debug(
"%s value: %s", str(CONSTS.LOCATION_NAME), str(urlValue))
2172 if propertyValue == CONSTS.HTTP_REDIRECT_LINK_VALUE_URL:
2173 self.
logger.debug(
"!!! propertyValue & %s", str(CONSTS.HTTP_REDIRECT_LINK_VALUE_URL))
2175 if CONSTS.HTTP_REDIRECT_LINK_LINK_TAG_NAME
in response.tags
and \
2176 "data" in response.tags[CONSTS.HTTP_REDIRECT_LINK_LINK_TAG_NAME]
and \
2177 len(response.tags[CONSTS.HTTP_REDIRECT_LINK_LINK_TAG_NAME][
"data"]) > 0:
2178 response.tags[CONSTS.HTTP_REDIRECT_LINK_LINK_TAG_NAME][
"data"][0] = url
2180 if urlValue
is not None and propertyValue == CONSTS.HTTP_REDIRECT_LINK_VALUE_LOCATION:
2181 self.
logger.debug(
"!!! propertyValue & %s", str(CONSTS.HTTP_REDIRECT_LINK_VALUE_LOCATION))
2183 if CONSTS.HTTP_REDIRECT_LINK_LINK_TAG_NAME
in response.tags
and \
2184 "data" in response.tags[CONSTS.HTTP_REDIRECT_LINK_LINK_TAG_NAME]
and \
2185 len(response.tags[CONSTS.HTTP_REDIRECT_LINK_LINK_TAG_NAME][
"data"]) > 0:
2186 response.tags[CONSTS.HTTP_REDIRECT_LINK_LINK_TAG_NAME][
"data"][0] = str(urlValue)
2188 if urlValue
is not None and propertyValue == CONSTS.HTTP_REDIRECT_LINK_VALUE_REDIRECT_URL:
2189 self.
logger.debug(
"!!! propertyValue & %s", str(CONSTS.HTTP_REDIRECT_LINK_VALUE_REDIRECT_URL))
2190 self.
addCustomTag(result=response, tag_name=CONSTS.REDIRECT_URL_NAME, tag_value=[str(urlValue)])
2192 if propertyValue == CONSTS.HTTP_REDIRECT_LINK_VALUE_SOURCE_URL:
2193 self.
logger.debug(
"!!! propertyValue & %s", str(CONSTS.HTTP_REDIRECT_LINK_VALUE_SOURCE_URL))
2195 if urlValue
is not None:
2196 self.
addCustomTag(result=response, tag_name=CONSTS.REDIRECT_URL_NAME, tag_value=[str(urlValue)])
2198 self.
addCustomTag(result=response, tag_name=CONSTS.REDIRECT_URL_NAME, tag_value=[url])
2208 self.
logger.debug(
"Incoming value urlSourcesRules: %s",
varDump(urlSourcesRules))
2212 for urlSourcesRule
in urlSourcesRules:
2213 if urlSourcesRule == URL_SOURCES_RULE_DATA_URL:
2218 self.
logger.debug(
"domain: %s", str(domain))
2220 if domain
is not None:
2221 domains.append(domain)
2223 if urlSourcesRule == URL_SOURCES_RULE_REDIRECT_URL:
2226 self.
logger.debug(
"redirectUrl: %s", str(redirectUrl))
2228 if isinstance(redirectUrl, basestring):
2230 self.
logger.debug(
"domain: %s", str(domain))
2232 if domain
is not None:
2233 domains.append(domain)
2235 if urlSourcesRule == URL_SOURCES_RULE_FEED_URL:
2237 self.
logger.debug(
"feedUrl: %s", str(feedUrl))
2239 if isinstance(feedUrl, basestring):
2241 self.
logger.debug(
"domain: %s", str(domain))
2243 if domain
is not None:
2244 domains.append(domain)
2246 if len(domains) == 0:
def pubdateMonthOrder(self, rawPubdate, properties, urlString)
def formatTag(self, result, path, key, pathDict, isExtract)
def process(self, config)
def getNextBestExtractor(self)
def normalizeAuthor(self, confProp, procProp, response)
def applyHTTPRedirectLink(self, siteId, url, properties, response)
def __init__(self, usageModel=APP_CONSTS.APP_USAGE_MODEL_PROCESS, configFile=None, logger=None, inputData=None)
def addCustomTag(self, result, tag_name, tag_value)
def compileResults(self, result, resultsList, key, xPathPreparing=None)
def applyPubdate(self, response, pubdate)
def calcUrlDomainCrc(self, url)
def extractPubDate(self, response, dataTagName)
def extractAdditionTagsByScrapy(self, localResult, key, tagsXpaths)
def extractPubdateRssFeed(self, siteId, url)
def getVariableFromHeaderContent(self, headerContent, name, makeDecode=True)
def getTemplate(self, explicit=True)
def dataUrlsCanonizator(self, data, baseUrl=None, useAdditionEncoding=False)
def prepareResults(self, resultsList)
def normalizeDatetime(self, response, algorithmName)
def checkDOMElement(self, elem)
def getProcessedContent(self, result)
def pubdateTransform(self, rawPubdate, rawTimezone, properties, urlString)
def preparseResponse(self, response)
def refineBadDateTags(self, response)
def createModule(self, module_name)
def adjustTitle(self, response)
def adjustPartialReferences(self, response)
def formatOutpuElement(self, elem, localOutputFormat)
def applyPostProcessing(self, result, key, postProcessingRE)
def loadLogConfigFile(self)
def loadScraperProperties(self)
def getHeaderContent(self, siteId, url)
def getDomainsForUrlSourcesRules(self, urlSourcesRules)
def replaceLoopValue(self, buf, replaceFrom, replaceTo)
def postprocessing(self, result, rule, tag)
def varDump(obj, stringify=True, strTypeMaxLen=256, strTypeCutSuffix='...', stringifyType=1, ignoreErrors=False, objectsHash=None, depth=0, indent=2, ensure_ascii=False, maxDepth=10)
def splitMediaTagString(self, urlStringMedia)
def urlNormalization(base, url, supportProtocols=None, log=None)
def elemUrlsCanoizator(self, data, baseUrl=None, firstDelim=' ', secondDelim=', useAdditionEncoding=False)
def checkMediaTag(self, urlStringMedia)
def extractFeedUrlRssFeed(self, siteId, url)
def extractBaseUrlRssFeed(self, siteId, url)
def getBestDatatimeData(self, data)
def getExtractorByName(self, extractorName)
pubdate
response.tagsLangDetecting(self.properties[CONSTS.LANG_PROP_NAME])
def feedParserProcess(self)
def formatOutputData(self, response, localOutputFormat)
def templateExtraction(self, config, urlHost)
def adjustLinkURL(self, response)
def commonResultOperations(self, result)
def refineCommonText(self, tagName, result)
string MSG_ERROR_WRONG_CONFIG_FILE_NAME
def processingHTMLData(self, htmlBuf, bufFormat)