601 self.logger.info(
"input_data url: %s, urlId: %s, siteId: %s", str(self.input_data.url), str(self.input_data.urlId),
602 str(self.input_data.siteId))
604 self.baseUrl = self.extractBaseUrlRssFeed(self.input_data.siteId, self.input_data.url)
605 if self.baseUrl
is None:
606 self.baseUrl = self.input_data.url
608 if self.input_data.template
and self.algorithm_name == CONSTS.PROCESS_ALGORITHM_REGULAR:
610 responses = self.templateExtraction(config, self.urlHost)
613 self.itr = iter(sorted(self.extractors, key=
lambda extractor: 0, reverse=
True))
614 self.logger.debug(
"Extractors: %s" %
varDump(self.itr))
615 responses = self.newsExtraction()
617 if CONSTS.MEDIA_LIMITS_NAME
in self.input_data.batch_item.properties:
618 self.logger.debug(
"Found property '%s'", str(CONSTS.MEDIA_LIMITS_NAME))
619 self.mediaLimitsHandler = MediaLimitsHandler(self.input_data.batch_item.properties[CONSTS.MEDIA_LIMITS_NAME])
621 for response
in responses:
622 response.metricsPrecalculate()
623 response.stripResult()
625 self.addCustomTag(result=response, tag_name=CONSTS.TAG_SOURCE_URL, \
626 tag_value=str(self.input_data.url))
629 if CONSTS.LANG_PROP_NAME
in self.properties:
630 self.logger.debug(
"!!! Enter '%s' !!!", str(CONSTS.LANG_PROP_NAME))
632 langDetector = ScraperLangDetector(self.properties[CONSTS.LANG_PROP_NAME])
633 langDetector.process(response, self.logger)
634 langTagsDict = langDetector.getLangTags()
635 self.logger.debug(
"langTagsDict: %s",
varDump(langTagsDict))
659 for tagName, langValue
in langTagsDict.items():
660 self.addCustomTag(result=response, tag_name=tagName, tag_value=langValue)
662 summaryLang = langDetector.getSummaryLang(response, self.logger)
663 self.addCustomTag(result=response, tag_name=CONSTS.TAG_SUMMARY_LANG, tag_value=summaryLang)
664 self.logger.debug(
"!!! Leave '%s' !!!", str(CONSTS.LANG_PROP_NAME))
668 if self.algorithm_name != CONSTS.PROCESS_ALGORITHM_REGULAR:
669 self.adjustTitle(response)
670 self.adjustLinkURL(response)
671 self.adjustPartialReferences(response)
675 self.preparseResponse(response)
679 if CONSTS.TAGS_TYPES_NAME
in self.input_data.batch_item.properties:
680 tagsTypes = self.input_data.batch_item.properties[CONSTS.TAGS_TYPES_NAME]
682 self.logger.info(
'=' * 50)
683 self.logger.info(
'self.properties: ' +
varDump(self.properties))
685 self.normalizeAuthor(self.tagsTypes, tagsTypes, response)
689 pdateSourceMask = APP_CONSTS.PDATE_SOURCES_MASK_BIT_DEFAULT
690 pdateSourceMaskOverwrite = APP_CONSTS.PDATE_SOURCES_MASK_OVERWRITE_DEFAULT
693 if APP_CONSTS.PDATE_SOURCES_MASK_PROP_NAME
in self.input_data.batch_item.properties:
694 pdateSourceMask = int(self.input_data.batch_item.properties[APP_CONSTS.PDATE_SOURCES_MASK_PROP_NAME])
697 if APP_CONSTS.PDATE_SOURCES_MASK_OVERWRITE_PROP_NAME
in self.input_data.batch_item.properties:
698 pdateSourceMaskOverwrite = \
699 int(self.input_data.batch_item.properties[APP_CONSTS.PDATE_SOURCES_MASK_OVERWRITE_PROP_NAME])
701 self.logger.debug(
'pdateSourceMask = %s, pdateSourceMaskOverwrite = %s',
702 str(pdateSourceMask), str(pdateSourceMaskOverwrite))
704 self.logger.debug(
"!!! self.input_data.batch_item.urlObj.pDate = " + str(self.input_data.batch_item.urlObj.pDate))
708 if pdateSourceMask & APP_CONSTS.PDATE_SOURCES_MASK_RSS_FEED:
709 if (pdateSourceMaskOverwrite & APP_CONSTS.PDATE_SOURCES_MASK_RSS_FEED)
or \
710 not pdateSourceMaskOverwrite & APP_CONSTS.PDATE_SOURCES_MASK_RSS_FEED:
711 self.pubdate, timezone = self.extractPubdateRssFeed(self.input_data.siteId, self.input_data.url)
714 if CONSTS.TAG_DC_DATE
in response.tags
and pdateSourceMask & APP_CONSTS.PDATE_SOURCES_MASK_DC_DATE:
715 if (pdateSourceMaskOverwrite & APP_CONSTS.PDATE_SOURCES_MASK_DC_DATE
and self.pubdate
is None)
or \
716 not pdateSourceMaskOverwrite & APP_CONSTS.PDATE_SOURCES_MASK_DC_DATE:
717 if CONSTS.TAG_PUB_DATE
not in response.tags
or \
718 (isinstance(response.tags[CONSTS.TAG_PUB_DATE][
"data"], basestring)
and \
719 response.tags[CONSTS.TAG_PUB_DATE][
"data"].strip() ==
""):
720 response.tags[CONSTS.TAG_PUB_DATE] = copy.deepcopy(response.tags[CONSTS.TAG_DC_DATE])
721 response.tags[CONSTS.TAG_PUB_DATE][
"name"] = CONSTS.TAG_PUB_DATE
722 if len(response.tags[CONSTS.TAG_PUB_DATE][
"data"]) > 0
and response.tags[CONSTS.TAG_PUB_DATE][
"data"][0]:
723 self.pubdate = response.tags[CONSTS.TAG_PUB_DATE][
"data"][0]
724 self.logger.debug(
"Pubdate from 'dc_date': " + str(self.pubdate))
726 d = DateTimeType.parse(self.pubdate, bool(self.useCurrentYear), self.logger,
False)
727 self.logger.debug(
'Check format pubdate: ' + str(d))
729 d, timezone = DateTimeType.split(d)
730 self.pubdate = d.isoformat(DateTimeType.ISO_SEP)
731 self.logger.debug(
"Result pubdate from 'dc_date': %s, timezone: %s", str(self.pubdate), str(timezone))
736 if pdateSourceMask & APP_CONSTS.PDATE_SOURCES_MASK_PUBDATE:
737 if (pdateSourceMaskOverwrite & APP_CONSTS.PDATE_SOURCES_MASK_PUBDATE
and self.pubdate
is None)
or \
738 not pdateSourceMaskOverwrite & APP_CONSTS.PDATE_SOURCES_MASK_PUBDATE:
739 pubdate, tzone = self.normalizeDatetime(response, self.algorithm_name)
740 if pubdate
is not None:
741 self.pubdate = pubdate
743 self.logger.debug(
"Pubdate from 'pubdate': " + str(self.pubdate) +
" timezone: " + str(timezone))
746 if pdateSourceMask & APP_CONSTS.PDATE_SOURCES_MASK_NOW:
747 if (pdateSourceMaskOverwrite & APP_CONSTS.PDATE_SOURCES_MASK_NOW
and self.pubdate
is None)
or \
748 not pdateSourceMaskOverwrite & APP_CONSTS.PDATE_SOURCES_MASK_NOW:
749 self.pubdate = SQLExpression(
"NOW()")
750 self.logger.debug(
"Pubdate from 'SQL NOW()': " + str(self.pubdate))
753 if pdateSourceMask & APP_CONSTS.PDATE_SOURCES_MASK_SQL_EXPRESSION
and \
754 APP_CONSTS.PDATE_SOURCES_EXPRESSION_PROP_NAME
in self.properties:
755 if (pdateSourceMaskOverwrite & APP_CONSTS.PDATE_SOURCES_MASK_SQL_EXPRESSION
and self.pubdate
is None)
or \
756 not pdateSourceMaskOverwrite & APP_CONSTS.PDATE_SOURCES_MASK_SQL_EXPRESSION:
757 self.pubdate = SQLExpression(str(self.properties[APP_CONSTS.PDATE_SOURCES_EXPRESSION_PROP_NAME]))
758 self.logger.debug(
"Pubdate from 'sql expression': " + str(self.pubdate))
761 self.pubdate = self.pubdateMonthOrder(self.pubdate, self.input_data.batch_item.properties, self.input_data.url)
765 self.pubdate = FieldsSQLExpressionEvaluator.evaluatePDateTime(self.input_data.batch_item.properties,
767 self.input_data.batch_item.urlObj,
772 self.pubdate, timezone = self.pubdateTransform(self.pubdate,
774 self.input_data.batch_item.properties,
778 self.addCustomTag(result=response, tag_name=CONSTS.TAG_PUBDATE_TZ, tag_value=[timezone])
780 self.logger.debug(
"!!! self.pubdate: %s", str(self.pubdate))
784 self.applyPubdate(response, self.pubdate)
787 feedUrl = self.extractFeedUrlRssFeed(self.input_data.siteId, self.input_data.url)
788 if feedUrl
is not None:
789 self.addCustomTag(result=response, tag_name=CONSTS.TAG_FEED_URL, tag_value=[feedUrl])
793 if self.outputFormat
is None:
794 self.logger.debug(
">>> Warning, can't extract output format")
796 self.formatOutputData(response, self.outputFormat)
798 response.recalcTagMaskCount(
None, self.altTagsMask)
799 self.tagsCount = response.tagsCount
800 self.tagsMask = response.tagsMask
802 self.logger.debug(
"self.tagsCount: " + str(self.tagsCount) +
" self.tagsMasks: " + str(self.tagsMask))
804 response.finish = time.time()
805 response.data[
"time"] =
"%s" % (response.finish - response.start)
807 response = self.applyHTTPRedirectLink(self.input_data.batch_item.siteId, self.input_data.batch_item.urlObj.url,
808 self.input_data.batch_item.properties, response)
810 self.getProcessedContent(responses)
def varDump(obj, stringify=True, strTypeMaxLen=256, strTypeCutSuffix='...', stringifyType=1, ignoreErrors=False, objectsHash=None, depth=0, indent=2, ensure_ascii=False, maxDepth=10)