HCE Project: Python language Distributed Tasks Manager Application, Distributed Crawler Application, and client API bindings. Version 2.0.0-chaika
Hierarchical Cluster Engine Python language binding
dc_processor.ScraperCustomJson Namespace Reference

Classes

class  Meta
 
class  ScraperCustomJson
 

Functions

def __init__ (self, usageModel=APP_CONSTS.APP_USAGE_MODEL_PROCESS, configFile=None, logger=None, inputData=None)
 
def setup (self)
 
def run (self)
 
def loadConfig (self)
 
def loadLogConfigFile (self)
 
def loadOptions (self)
 
def loadScraperProperties (self)
 
def processBatch (self)
 
def loadExtractors (self)
 
def createModule (self, module_name)
 
def getNextBestExtractor (self)
 
def resourceExtraction (self, jsonElem)
 
def formatOutpuElement (self, elem, localOutputFormat)
 
def formatOutputData (self, response, localOutputFormat)
 
def jsonParserExtractor (self, jsonElem)
 
def getProcessedContent (self, result)
 
def fillScraperResponse (self, jsonElem)
 
def generateEmptyResponse (self)
 
def jsonParserProcess (self)
 
def getExitCode (self)
 

Variables

int ERROR_OK = 0
 
int EXIT_SUCCESS = 0
 
int EXIT_FAILURE = 1
 
string MSG_ERROR_LOAD_EXTRACTORS = "Error load extractors "
 
string ENV_SCRAPER_STORE_PATH = "ENV_SCRAPER_STORE_PATH"
 
list TAGS_DATETIME_NEWS_NAMES = [CONSTS.TAG_PUB_DATE, CONSTS.TAG_DC_DATE]
 
string MSG_ERROR_WRONG_CONFIG_FILE_NAME = "Config file name is wrong"
 
list TAGS_DATETIME_TEMPLATE_TYPES = [CONSTS.TAG_TYPE_DATETIME]
 
string OPTION_SECTION_DATETIME_TEMPLATE_TYPES = 'tags_datetime_template_types'
 
 exitCode
 
 usageModel
 
 configFile
 
 logger
 
 input_data
 
 properties
 
 extractor
 
 extractors
 
 itr
 
 pubdate
 
 timezone
 
 errorMask
 
 scraperPropFileName
 
 algorithm_name
 
 scraperResponses
 
 tagsCount
 
 tagsMask
 
 processedContent
 
 outputFormat
 
 metrics
 
 altTagsMask
 
 urlHost
 
 output_data
 
 dbWrapper
 
 datetimeTemplateTypes
 
 useCurrentYear
 
 config
 

Function Documentation

◆ __init__()

def dc_processor.ScraperCustomJson.__init__ (   self,
  usageModel = APP_CONSTS.APP_USAGE_MODEL_PROCESS,
  configFile = None,
  logger = None,
  inputData = None 
)

Definition at line 85 of file ScraperCustomJson.py.

85  def __init__(self, usageModel=APP_CONSTS.APP_USAGE_MODEL_PROCESS, configFile=None, logger=None, inputData=None):
86  if usageModel == APP_CONSTS.APP_USAGE_MODEL_PROCESS:
87  # call base class __init__ method
88  # #foundation.CementApp.__init__(self)
89  Scraper.__init__(self)
90 
91  self.exitCode = APP_CONSTS.EXIT_SUCCESS
92  self.usageModel = usageModel
93  self.configFile = configFile
94  self.logger = logger
95  self.input_data = inputData
96  self.properties = {}
97  self.extractor = None
98  self.extractors = []
99  self.itr = None
100  self.pubdate = None
101  self.timezone = None
102  self.errorMask = APP_CONSTS.ERROR_OK
103  self.scraperPropFileName = None
104  self.algorithm_name = None
105  self.scraperResponses = []
106  self.tagsCount = 0
107  self.tagsMask = 0
108  self.pubdate = None
109  self.processedContent = None
110  self.outputFormat = None
111  self.metrics = None
112  self.altTagsMask = None
113  self.errorMask = APP_CONSTS.ERROR_OK
114  self.urlHost = None
115  self.output_data = None
116  self.dbWrapper = None
117  self.datetimeTemplateTypes = []
118  self.useCurrentYear = 0
119 
120 

◆ createModule()

def dc_processor.ScraperCustomJson.createModule (   self,
  module_name 
)

Definition at line 374 of file ScraperCustomJson.py.

374  def createModule(self, module_name):
375  appInst = None
376  try:
377  appInst = (module_name, eval(module_name)(self.config, None, self.urlHost, self.properties))[1] # pylint: disable=W0123
378  self.logger.debug("%s has been created!" % module_name)
379  except Exception as err:
380  ExceptionLog.handler(self.logger, err, "Can't create module %s. Error is:" % (module_name))
381 
382  return appInst
383 
384 
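The tuple-and-index expression above uses eval() to resolve an extractor class by its name and instantiate it with (config, None, urlHost, properties). A minimal sketch of the same lookup using an explicit registry instead of eval(); NewsExtractor is a hypothetical stand-in, not a real module:

# Hypothetical stand-in for a real extractor module class.
class NewsExtractor(object):
    def __init__(self, config, dbWrapper, urlHost, properties):
        self.name = self.__class__.__name__

def create_module(module_name, config, urlHost, properties):
    # A dict registry replaces eval(module_name); None is returned on
    # failure, matching createModule() above.
    cls = {"NewsExtractor": NewsExtractor}.get(module_name)
    return cls(config, None, urlHost, properties) if cls is not None else None

inst = create_module("NewsExtractor", config={}, urlHost="example.com", properties={})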

◆ fillScraperResponse()

def dc_processor.ScraperCustomJson.fillScraperResponse (   self,
  jsonElem 
)

Definition at line 566 of file ScraperCustomJson.py.

566  def fillScraperResponse(self, jsonElem):
567  self.tagsCount = 0
568  self.tagsMask = 0
569  self.pubdate = None
570  self.processedContent = None
571  self.errorMask = APP_CONSTS.ERROR_OK
572  self.jsonParserExtractor(jsonElem)
573  return ScraperResponse(self.tagsCount, self.tagsMask, self.pubdate, self.processedContent, self.errorMask)
574 
575 

◆ formatOutpuElement()

def dc_processor.ScraperCustomJson.formatOutpuElement (   self,
  elem,
  localOutputFormat 
)

Definition at line 433 of file ScraperCustomJson.py.

433  def formatOutpuElement(self, elem, localOutputFormat):
434  ret = elem
435  if localOutputFormat == "json":
436  # self.logger.debug(">>> JSON HTML = " + elem)
437  localStr = json.dumps(elem, ensure_ascii=False)
438  if localStr[0] == '\"' or localStr[0] == '\'':
439  localStr = localStr[1:]
440  if localStr[-1] == '\"' or localStr[-1] == '\'':
441  localStr = localStr[0:-1]
442  ret = localStr
443  # self.logger.debug(">>> JSON HTML = " + ret)
444  elif localOutputFormat == "html" or localOutputFormat == "xml":
445  ret = xml.sax.saxutils.escape(elem, {"'": "&#39;", "\"" : "&quot;"})
446  elif localOutputFormat == "sql":
447  # ret = mdb.escape_string(elem) # pylint: disable=E1101
448  ret = Utils.escape(elem)
449  return ret
450 
451 
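A standalone sketch of the three escaping branches above, runnable on its own; Utils.escape is replaced here by a trivial stand-in, so the "sql" branch is illustrative only:

import json
import xml.sax.saxutils

def format_element(elem, fmt):
    if fmt == "json":
        s = json.dumps(elem, ensure_ascii=False)
        # json.dumps wraps strings in quotes; drop one from each end
        if s and s[0] in ('"', "'"):
            s = s[1:]
        if s and s[-1] in ('"', "'"):
            s = s[:-1]
        return s
    elif fmt in ("html", "xml"):
        return xml.sax.saxutils.escape(elem, {"'": "&#39;", "\"": "&quot;"})
    elif fmt == "sql":
        return elem.replace("'", "\\'")  # stand-in for Utils.escape
    return elem

print(format_element("it's <b>bold</b>", "xml"))  # it&#39;s &lt;b&gt;bold&lt;/b&gt;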

◆ formatOutputData()

def dc_processor.ScraperCustomJson.formatOutputData (   self,
  response,
  localOutputFormat 
)

Definition at line 454 of file ScraperCustomJson.py.

454  def formatOutputData(self, response, localOutputFormat):
455  # result.tags[key]["data"]
456  for key in response.tags:
457  if "data" in response.tags[key]:
458  if isinstance(response.tags[key]["data"], types.ListType):
459  for i, elem in enumerate(response.tags[key]["data"]):
460  response.tags[key]["data"][i] = self.formatOutpuElement(elem, localOutputFormat)
461  elif isinstance(response.tags[key]["data"], types.StringTypes):
462  response.tags[key]["data"] = self.formatOutpuElement(response.tags[key]["data"], localOutputFormat)
463 
464 

◆ generateEmptyResponse()

def dc_processor.ScraperCustomJson.generateEmptyResponse (   self)

Definition at line 578 of file ScraperCustomJson.py.

578  def generateEmptyResponse(self):
579  localResult = Result(self.config, self.input_data.urlId, self.metrics)
580  # Add tag 'source_url'
581  self.addCustomTag(result=localResult, tag_name=CONSTS.TAG_SOURCE_URL, tag_value=[str(self.input_data.url)])
582  self.getProcessedContent([localResult])
583  return ScraperResponse(0, 0, self.pubdate, self.processedContent, APP_CONSTS.ERROR_MASK_SCRAPER_ERROR)
584 
585 

◆ getExitCode()

def dc_processor.ScraperCustomJson.getExitCode (   self)

Definition at line 618 of file ScraperCustomJson.py.

618  def getExitCode(self):
619  return self.exitCode
620 
621 
622 # # # Add custom tag
623 # #
624 # # @param result - Scraper result instance
625 # # @param tag_name - name of the tag
626 # # @param tag_value - value of the tag
627 # # @return - None
628 # def addCustomTag(self, result, tag_name, tag_value):
629 # data = {"extractor": "Base extractor", "data": "", "name": ""}
630 # data["data"] = tag_value
631 # data["name"] = tag_name
632 # data["xpath"] = None
633 # data["type"] = None
634 # data["extractor"] = self.__class__.__name__
635 # if tag_name not in result.tags:
636 # result.tags[tag_name] = data
637 
638 
639 # # # Normalize datetime tags procedure
640 # #
641 # # @param response - scraper response instance
642 # # @param algorithmName - algorithm name
643 # # @return - 'pubdate tag value'
644 # def normalizeDatetime(self, response, algorithmName):
645 # ret = None
646 # timezone = ''
647 # try:
648 # if response is not None and response.tags is not None:
649 # self.logger.debug("normalizeDatetime scraper response: " + varDump(response))
650 # tagNames = []
651 # if self.input_data.template and algorithmName == CONSTS.PROCESS_ALGORITHM_REGULAR:
652 # # template
653 # for responseType in self.datetimeTemplateTypes:
654 # for responseTagName in response.tags:
655 # self.logger.debug("normalizeDatetime responseTagName: '" + str(responseTagName) + "'")
656 # if (response.tags.get(responseTagName) is not None and \
657 # 'type' in response.tags[responseTagName] and \
658 # response.tags[responseTagName]['type'] == responseType) or \
659 # (responseTagName == CONSTS.TAG_PUB_DATE and response.tags.get(responseTagName) is not None):
660 # tagNames.append(responseTagName)
661 # else:
662 # tagNames = TAGS_DATETIME_NEWS_NAMES
663 #
664 # self.logger.debug('normalizeDatetime tagNames: ' + varDump(tagNames))
665 # retDict = {}
666 # for tagName in tagNames:
667 # pubdate, tzone = self.extractPubDate(response, tagName) # , properties, urlString)
668 # if self.extractor and tagName in response.tags:
669 # self.extractor.addTag(result=response, tag_name=tagName + '_normalized', tag_value=pubdate, \
670 # xpath=response.tags[tagName]['xpath'])
671 #
672 # self.logger.debug('tagName: ' + str(tagName) + ' pubdate: ' + str(pubdate))
673 # retDict[tagName] = pubdate
674 #
675 # if tagName == CONSTS.TAG_PUB_DATE:
676 # ret = pubdate
677 # timezone = tzone
678 # else:
679 # pass
680 #
681 # if ret is None:
682 # for key, value in retDict.items():
683 # if value is not None:
684 # ret = value
685 # self.logger.debug('set return value from ' + str(key) + ' : ' + str(value))
686 # break
687 #
688 # except Exception, err:
689 # ExceptionLog.handler(self.logger, err, 'normalizeDatetime error:', (), \
690 # {ExceptionLog.LEVEL_NAME_ERROR:ExceptionLog.LEVEL_VALUE_DEBUG})
691 #
692 # return ret, timezone
693 
694 
695 # # # Extract pubdate
696 # #
697 # # @param response - response instance
698 # # @param dataTagName - tag name for extracting
699 # # @param properties - properties from PROCESSOR_PROPERTIES
700 # # @param urlString - url string value
701 # # @return pubdate if success or None
702 # def extractPubDate(self, response, dataTagName): # , properties, urlString):
703 # # variable for result
704 # ret = None
705 # timezone = ''
706 # try:
707 # if response is not None and dataTagName in response.tags and response.tags[dataTagName] is not None:
708 #
709 # # self.logger.debug("extractPubDate response: " + varDump(response))
710 #
711 # inputData = response.tags[dataTagName]["data"]
712 # self.logger.debug("extractPubDate response has '" + str(dataTagName) + "' is: " + str(inputData))
713 # self.logger.debug("extractPubDate type of '" + str(dataTagName) + "' is: " + str(type(inputData)))
714 #
715 # inputList = []
716 # if isinstance(inputData, basestring):
717 # inputList = [inputData]
718 # elif isinstance(inputData, list):
719 # inputList = inputData
720 # else:
721 # pass
722 #
723 # pubdate = []
724 # timezones = []
725 # for inputElem in inputList:
726 # d = DateTimeType.parse(inputElem, bool(self.useCurrentYear), self.logger, False)
727 # self.logger.debug('pubdate: ' + str(d))
728 #
729 # if d is not None:
730 # d, tzone = DateTimeType.split(d)
731 # pubdate.append(d.isoformat(DateTimeType.ISO_SEP))
732 # timezones.append(tzone)
733 #
734 # self.logger.debug("extractPubDate result pubdate: " + str(pubdate))
735 # response.tags[dataTagName]["data"] = pubdate
736 # if len(pubdate) > 0:
737 # ret = pubdate[0]
738 #
739 # if len(timezones) > 0:
740 # timezone = timezones[0]
741 #
742 # except Exception, err:
743 # ExceptionLog.handler(self.logger, err, 'extractPubDate error:', (), \
744 # {ExceptionLog.LEVEL_NAME_ERROR:ExceptionLog.LEVEL_VALUE_DEBUG})
745 #
746 # return ret, timezone
747 
748 
749 # # # pubdate transformation use timezone value
750 # #
751 # # @param rawPubdate - raw pubdate string
752 # # @param rawTimezone - raw timezone string
753 # # @param properties - properties from PROCESSOR_PROPERTIES
754 # # @param urlString - url string value
755 # # @return pubdate and timezone if success or None and empty string
756 # def pubdateTransform(self, rawPubdate, rawTimezone, properties, urlString):
757 # # variables for result
758 # pubdate = rawPubdate
759 # timezone = rawTimezone
760 #
761 # self.logger.debug('properties: ' + varDump(properties))
762 # if CONSTS.PDATE_TIMEZONES_NAME in properties:
763 # propertyString = properties[CONSTS.PDATE_TIMEZONES_NAME]
764 # self.logger.debug('inputted ' + CONSTS.PDATE_TIMEZONES_NAME + ':' + str(propertyString))
765 #
766 # dt = DateTimeType.parse(rawPubdate, bool(self.useCurrentYear), self.logger, False)
767 # self.logger.debug('pubdate: ' + str(dt))
768 # if dt is not None:
769 # # get utc offset if necessary
770 # utcOffset = DateTimeType.extractUtcOffset(rawTimezone, self.logger)
771 # self.logger.debug('utcOffset: ' + str(utcOffset))
772 # # transformation according to PDATE_TIMEZONES properties
773 # d = PDateTimezonesHandler.transform(dt, utcOffset, propertyString, urlString, self.logger)
774 # if d is not None:
775 # dt = d
776 #
777 # if dt is not None:
778 # d, tzone = DateTimeType.split(dt)
779 # pubdate = d.isoformat(DateTimeType.ISO_SEP)
780 # timezone = tzone
781 #
782 # return pubdate, timezone
783 
784 
785 # # # change month order in pubdate if necessary
786 # #
787 # # @param rawPubdate - raw pubdate string in iso format. sample: '2016-02-07 16:28:00'
788 # # @param properties - properties from PROCESSOR_PROPERTIES
789 # # @param urlString - url string value
790 # # @return pubdate and timezone if success or None and empty string
791 # def pubdateMonthOrder(self, rawPubdate, properties, urlString):
792 # # variables for result
793 # pubdate = rawPubdate
794 #
795 # self.logger.debug('pubdateMonthOrder() enter... rawPubdate: ' + str(rawPubdate))
796 # if CONSTS.PDATE_DAY_MONTH_ORDER_NAME in properties and isinstance(rawPubdate, basestring):
797 # propertyObj = []
798 # try:
799 # self.logger.debug('inputted ' + CONSTS.PDATE_DAY_MONTH_ORDER_NAME + ':' + \
800 # str(properties[CONSTS.PDATE_DAY_MONTH_ORDER_NAME]))
801 # propertyObj = json.loads(properties[CONSTS.PDATE_DAY_MONTH_ORDER_NAME])
802 # except Exception, err:
803 # self.logger.error("Fail loads '%s', error: %s", str(CONSTS.PDATE_DAY_MONTH_ORDER_NAME), str(err))
804 #
805 # for propertyElem in propertyObj:
806 # try:
807 # if "pattern" not in propertyElem:
808 # raise Exception('Property "pattern" not found')
809 #
810 # if "order" not in propertyElem:
811 # raise Exception('Property "order" not found')
812 #
813 # pattern = str(propertyElem["pattern"])
814 # order = int(propertyElem["order"])
815 #
816 # if re.search(pattern, urlString, re.UNICODE) is not None:
817 # self.logger.debug("Pattern '%' found in url: %s", str(pattern), str(urlString))
818 #
819 # dt = None
820 # if order == 0: # means day follows month
821 # dt = datetime.datetime.strptime(rawPubdate, "%Y-%d-%m %H:%M:%S")
822 # elif order == 1: # means month follows day
823 # dt = datetime.datetime.strptime(rawPubdate, "%Y-%m-%d %H:%M:%S")
824 # else:
825 # raise Exception("Unsupported value of 'order' == " + str(order))
826 #
827 # if dt is not None:
828 # pubdate = dt.strftime("%Y-%d-%m %H:%M:%S")
829 #
830 # except Exception, err:
831 # self.logger.error("Fail execution '%s', error: %s", str(CONSTS.PDATE_DAY_MONTH_ORDER_NAME), str(err))
832 #
833 # self.logger.debug('pubdateMonthOrder() leave... pubdate: ' + str(pubdate))
834 #
835 # return pubdate
836 
837 
838 # # # Get header content
839 # #
840 # # @param siteId - Site/Project ID
841 # # @param url - url string
842 # # @return extracted header content
843 # def getHeaderContent(self, siteId, url):
844 # # variable for result
845 # headerContent = None
846 # urlContentObj = dc_event.URLContentRequest(siteId, url, \
847 # dc_event.URLContentRequest.CONTENT_TYPE_RAW_LAST + \
848 # dc_event.URLContentRequest.CONTENT_TYPE_RAW + \
849 # dc_event.URLContentRequest.CONTENT_TYPE_HEADERS)
850 #
851 # rawContentData = self.dbWrapper.urlContent([urlContentObj])
852 #
853 # if rawContentData is not None and len(rawContentData) > 0:
854 # if rawContentData[0].headers is not None and len(rawContentData[0].headers) > 0 and \
855 # rawContentData[0].headers[0] is not None:
856 # headerContent = rawContentData[0].headers[0].buffer
857 #
858 # return headerContent
859 #
860 #
861 # # # Get variable from header content
862 # #
863 # # @param headerContent - header content
864 # # @param name - variable name
865 # # @param makeDecode - boolean flag necessary decode
866 # # @return extracted value of 'Location'
867 # def getVariableFromHeaderContent(self, headerContent, name, makeDecode=True):
868 # # variable for result
869 # ret = None
870 #
871 # header = ''
872 # if makeDecode and headerContent is not None:
873 # header = base64.b64decode(headerContent)
874 #
875 # headerList = header.split('\r\n')
876 # self.logger.debug("headerList: " + varDump(headerList))
877 #
878 # for elem in headerList:
879 # pos = elem.find(name + ':')
880 # if pos > -1:
881 # ret = elem.replace(name + ':', '').strip()
882 # self.logger.debug("Found '" + name + "' has value: " + str(ret))
883 # break
884 #
885 # return ret

◆ getNextBestExtractor()

def dc_processor.ScraperCustomJson.getNextBestExtractor (   self)

Definition at line 385 of file ScraperCustomJson.py.

385  def getNextBestExtractor(self):
386  # return extractor with highest rank
387  try:
388  extractor = next(self.itr)
389  except StopIteration:
390  extractor = None
391  return extractor
392 
393 
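jsonParserExtractor() rebuilds the iterator for each JSON element, so the extractor sequence restarts per element; a sketch of the exhaustion pattern, where None signals that every extractor has been tried:

extractors = ["ExtractorA", "ExtractorB"]   # hypothetical ranked extractor list
itr = iter(extractors)

def next_best():
    try:
        return next(itr)
    except StopIteration:
        return None

extractor = next_best()
while extractor:
    # ... extract tags with this extractor ...
    extractor = next_best()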

◆ getProcessedContent()

def dc_processor.ScraperCustomJson.getProcessedContent (   self,
  result 
)

Definition at line 548 of file ScraperCustomJson.py.

548  def getProcessedContent(self, result):
549  for elem in result:
550  elem.get()
551  self.processedContent = {}
552  self.processedContent["default"] = result[0]
553  self.processedContent["internal"] = result
554  self.processedContent["custom"] = []
555  self.tagsCount = result[0].tagsCount
556  self.tagsMask = result[0].tagsMask
557 
558  if "pubdate" in result[0].tags and "data" in result[0].tags["pubdate"] and \
559  len(result[0].tags["pubdate"]["data"]) > 0:
560  self.pubdate = result[0].tags["pubdate"]["data"][0]
561  self.logger.debug('>>>> Set self.pubdate = ' + str(self.pubdate))
562 
563 
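The resulting structure, sketched with plain strings standing in for Result instances (the merged collect-result built by resourceExtraction() is first in the list):

result = ["merged-result", "per-extractor-result"]   # stand-ins for Result objects
processedContent = {
    "default": result[0],   # merged result; also supplies tagsCount, tagsMask, pubdate
    "internal": result,     # the full list, merged result first
    "custom": [],           # left empty by this scraper
}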

◆ jsonParserExtractor()

def dc_processor.ScraperCustomJson.jsonParserExtractor (   self,
  jsonElem 
)

Definition at line 467 of file ScraperCustomJson.py.

467  def jsonParserExtractor(self, jsonElem):
468  if self.extractors is not None:
469  self.itr = iter(sorted(self.extractors, key=lambda extractor: 0, reverse=True)) # pylint: disable=W0612,W0613
470  self.logger.debug("Extractors: %s" % varDump(self.itr))
471 
472  responses = self.resourceExtraction(jsonElem)
473  for response in responses:
474  response.metricsPrecalculate()
475  response.stripResult()
476  # Add tag 'source_url'
477  self.addCustomTag(result=response, tag_name=CONSTS.TAG_SOURCE_URL,
478  tag_value=[str(self.input_data.url)])
479 
480  if CONSTS.LANG_PROP_NAME in self.properties:
481  # response.tagsLangDetecting(self.properties[CONSTS.LANG_PROP_NAME])
482  langDetector = ScraperLangDetector(self.properties[CONSTS.LANG_PROP_NAME])
483  langDetector.process(response, self.logger)
484  langTagsDict = langDetector.getLangTags()
485  self.logger.debug("langTagsDict: %s", varDump(langTagsDict))
486 
487  # add lang tags to processed content
488  for tagName, langValue in langTagsDict.items():
489  self.addCustomTag(result=response, tag_name=tagName, tag_value=langValue)
490 
491  summaryLang = langDetector.getSummaryLang(response, self.logger)
492  self.addCustomTag(result=response, tag_name=CONSTS.TAG_SUMMARY_LANG, tag_value=summaryLang)
493 
494  pubdate, timezone = self.normalizeDatetime(response, self.algorithm_name)
495  if pubdate is not None:
496  self.pubdate = pubdate
497  self.logger.debug("Pubdate from 'pubdate': " + str(self.pubdate))
498 
499  # Apply property 'PDATE_DAY_MONTH_ORDER'
500  self.pubdate = self.pubdateMonthOrder(self.pubdate, self.input_data.batch_item.properties, self.input_data.url)
501 
502  # Apply property 'PDATE_TIME'
503  self.input_data.batch_item.urlObj.pDate = self.pubdate
504  self.pubdate = FieldsSQLExpressionEvaluator.evaluatePDateTime(self.input_data.batch_item.properties,
505  self.dbWrapper,
506  self.input_data.batch_item.urlObj,
507  self.logger,
508  self.pubdate)
509 
510  # Apply property 'PDATE_TIMEZONES'
511  self.pubdate, timezone = self.pubdateTransform(self.pubdate,
512  timezone,
513  self.input_data.batch_item.properties,
514  self.input_data.url)
515 
516  # Add tag 'pubdate_tz'
517  self.addCustomTag(result=response, tag_name=CONSTS.TAG_PUBDATE_TZ, tag_value=[timezone])
518 
519  if "pubdate" in response.tags and "data" in response.tags["pubdate"] and \
520  len(response.tags["pubdate"]["data"]) > 0:
521  response.tags["pubdate"]["data"][0] = self.pubdate
522 
523  if self.outputFormat is not None:
524  self.formatOutputData(response, self.outputFormat)
525  else:
526  self.logger.debug(">>> Warning, can't extract output format")
527  response.recalcTagMaskCount(None, self.altTagsMask)
528  self.tagsCount = response.tagsCount
529  self.tagsMask = response.tagsMask
530  # self.putArticleToDB({"default":response})
531  self.logger.debug("self.tagsCount: %s", self.tagsCount)
532  self.logger.debug("self.tagsMasks: %s", self.tagsMask)
533 
534  self.logger.debug(">>> Resp: %s\n", varDump(response))
535 
536  # TODO: Seems need to be done more system way
537  response.finish = time.time()
538  response.data["time"] = "%s" % (response.finish - response.start)
539 
540  response = self.applyHTTPRedirectLink(self.input_data.batch_item.siteId, self.input_data.batch_item.urlObj.url,
541  self.input_data.batch_item.properties, response)
542 
543  self.getProcessedContent(responses)
544 
545 

◆ jsonParserProcess()

def dc_processor.ScraperCustomJson.jsonParserProcess (   self)

Definition at line 588 of file ScraperCustomJson.py.

588  def jsonParserProcess(self):
589  rawDataJson = None
590  ret = []
591  try:
592  rawDataJson = json.loads(self.input_data.raw_content)
593  except Exception as excp:
594  self.logger.debug(">>> jsonParserProcess wrong rawData json: " + str(excp))
595 
596  self.logger.debug("!!! type(rawDataJson) = %s", str(type(rawDataJson)))
597  if not isinstance(rawDataJson, list):
598  self.logger.debug("!!! rawDataJson: %s", varDump(rawDataJson))
599 
600 
601  if rawDataJson is not None and isinstance(rawDataJson, list):
602  for elem in rawDataJson:
603  if isinstance(elem, list):
604  for internalElem in elem:
605  ret.append(self.fillScraperResponse(internalElem))
606  else:
607  ret.append(self.fillScraperResponse(elem))
608  else:
609  self.logger.debug(">>> rawDataJson structure not List type")
610 
611  if len(ret) == 0:
612  ret.append(self.generateEmptyResponse())
613  return ret
614 
615 
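A sketch of the input shape jsonParserProcess() accepts: raw_content must decode to a JSON list whose elements are objects or one-deep lists of objects; anything else falls through to generateEmptyResponse():

import json

raw_content = '[{"title": "a"}, [{"title": "b"}, {"title": "c"}]]'
rawDataJson = json.loads(raw_content)

elements = []
if isinstance(rawDataJson, list):
    for elem in rawDataJson:
        if isinstance(elem, list):
            elements.extend(elem)   # one level of nesting is flattened
        else:
            elements.append(elem)
print(elements)  # [{'title': 'a'}, {'title': 'b'}, {'title': 'c'}]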

◆ loadConfig()

def dc_processor.ScraperCustomJson.loadConfig (   self)

Definition at line 157 of file ScraperCustomJson.py.

157  def loadConfig(self):
158  try:
159  self.config = ConfigParser.ConfigParser()
160  self.config.optionxform = str
161  if self.usageModel == APP_CONSTS.APP_USAGE_MODEL_PROCESS:
162  if self.pargs.config:
163  self.config.read(self.pargs.config)
164  else:
165  self.config.read(CONSTS.SCRAPER_CUSTOM_JSON_APP_CLASS_NAME)
166  else:
167  self.config.read(self.configFile)
168  except:
169  raise
170 
171 
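Setting optionxform = str keeps option names case-sensitive; ConfigParser lower-cases them by default. A minimal sketch (the file name is hypothetical):

import ConfigParser  # Python 2; configparser on Python 3

config = ConfigParser.ConfigParser()
config.optionxform = str              # preserve option-name case
config.read("ScraperCustomJson.ini")  # hypothetical config file name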

◆ loadExtractors()

def dc_processor.ScraperCustomJson.loadExtractors (   self)

Definition at line 338 of file ScraperCustomJson.py.

338  def loadExtractors(self):
339  try:
340  # modules
341  if CONSTS.MODULES_KEY in self.properties and self.algorithm_name in self.properties[CONSTS.MODULES_KEY]:
342  modules = self.properties[CONSTS.MODULES_KEY][self.algorithm_name]
343  else:
344  self.logger.debug(">>> No MODULES_KEY or algorithm_name in self.properties")
345  modules = []
346 
347  self.logger.debug("Algorithm name: <%s>" % (self.algorithm_name))
348  self.logger.debug("Modules: %s" % modules)
349 
350  self.extractors = []
351  for module in modules:
352  extractor = self.createModule(module)
353  # Check if module was created successfully and then insert it to extractors
354  if extractor is not None:
355  self.extractors.append(extractor)
356 
357  # Info show extractors loaded
358  self.logger.debug("*******************")
359  self.logger.debug("Loaded extractors:")
360  for extractor in self.extractors:
361  self.logger.debug(extractor.name)
362  self.logger.debug("*******************")
363 
364  except Exception as err:
365  ExceptionLog.handler(self.logger, err, MSG_ERROR_LOAD_EXTRACTORS)
366  raise
367 
368 

◆ loadLogConfigFile()

def dc_processor.ScraperCustomJson.loadLogConfigFile (   self)

Definition at line 175 of file ScraperCustomJson.py.

175  def loadLogConfigFile(self):
176  try:
177  if self.usageModel == APP_CONSTS.APP_USAGE_MODEL_PROCESS:
178  log_conf_file = self.config.get("Application", "log")
179  logging.config.fileConfig(log_conf_file)
180  # Logger initialization
181  self.logger = Utils.MPLogger().getLogger()
182  except Exception, err:
183  raise Exception(CONSTS.MSG_ERROR_LOAD_CONFIG + " : " + str(err))
184 
185 

◆ loadOptions()

def dc_processor.ScraperCustomJson.loadOptions (   self)

Definition at line 189 of file ScraperCustomJson.py.

189  def loadOptions(self):
190  try:
191  # class_name = self.__class__.__name__
192  self.scraperPropFileName = self.config.get("Application", "property_file_name")
193  # self.config_db_dir = self.config.get(class_name, "config_db_dir")
194  # self.sqliteTimeout = self.config.getint("sqlite", "timeout")
195 
196  self.useCurrentYear = self.config.getint("DateTimeType", "useCurrentYear")
197 
198  if self.config.has_section(self.OPTION_SECTION_DATETIME_TEMPLATE_TYPES):
199  self.datetimeTemplateTypes = []
200  for key, value in self.config.items(self.OPTION_SECTION_DATETIME_TEMPLATE_TYPES):
201  self.datetimeTemplateTypes.append(key)
202  if self.logger is not None:
203  self.logger.debug('load form config: ' + str(key) + ' = ' + str(value))
204  else:
205  self.datetimeTemplateTypes = self.TAGS_DATETIME_TEMPLATE_TYPES
206  if self.logger is not None:
207  self.logger.debug("Config file hasn't section: " + str(self.OPTION_SECTION_DATETIME_TEMPLATE_TYPES))
208 
209  # DBWrapper initialization
210  dbTaskIniConfigFileName = self.config.get(self.__class__.__name__, "db-task_ini")
211  config = ConfigParser.ConfigParser()
212  config.optionxform = str
213  readOk = config.read(dbTaskIniConfigFileName)
214  if len(readOk) == 0:
215  raise Exception(self.MSG_ERROR_WRONG_CONFIG_FILE_NAME + ": " + dbTaskIniConfigFileName)
216  self.dbWrapper = DBTasksWrapper(config)
217  except:
218  raise
219 
220 
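The 'tags_datetime_template_types' section is consumed for its option names only; a self-contained sketch with a hypothetical entry:

import ConfigParser

config = ConfigParser.ConfigParser()
config.optionxform = str
config.add_section("tags_datetime_template_types")
config.set("tags_datetime_template_types", "datetime", "1")  # hypothetical entry

# only the keys matter; values are merely logged
datetimeTemplateTypes = [key for key, value in config.items("tags_datetime_template_types")]
print(datetimeTemplateTypes)  # ['datetime']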

◆ loadScraperProperties()

def dc_processor.ScraperCustomJson.loadScraperProperties (   self)

Definition at line 223 of file ScraperCustomJson.py.

223  def loadScraperProperties(self):
224  if self.scraperPropFileName is not None:
225  try:
226  with open(self.scraperPropFileName, "rb") as fd:
227  scraperProperies = json.loads(fd.read())
228  self.properties = scraperProperies[self.__class__.__name__][CONSTS.PROPERTIES_KEY]
229  except Exception as excp:
230  self.logger.debug(">>> Error loading scraper properties: " + str(excp))
231 
232 
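The property file is JSON keyed by class name, with the scraper options nested under CONSTS.PROPERTIES_KEY; a sketch assuming that key is the literal string "properties" (the inner entries are illustrative):

import json

sample = '{"ScraperCustomJson": {"properties": {"metrics": "{}"}}}'
properties = json.loads(sample)["ScraperCustomJson"]["properties"]
print(properties)  # {u'metrics': u'{}'}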

◆ processBatch()

def dc_processor.ScraperCustomJson.processBatch (   self)

Definition at line 235 of file ScraperCustomJson.py.

235  def processBatch(self):
236  if self.usageModel == APP_CONSTS.APP_USAGE_MODEL_PROCESS:
237  # read pickled batch object from stdin
238  input_pickled_object = sys.stdin.read()
239  try:
240  if self.usageModel == APP_CONSTS.APP_USAGE_MODEL_PROCESS:
241  scraper_in_data = pickle.loads(input_pickled_object)
242  except Exception as err:
243  ExceptionLog.handler(self.logger, err, 'pickle.loads() error:')
244  self.logger.debug("input_pickled_object:\n" + str(input_pickled_object))
245  self.exitCode = EXIT_FAILURE
246  raise Exception(err)
247 
248  try:
249  if self.usageModel == APP_CONSTS.APP_USAGE_MODEL_PROCESS:
250  self.input_data = scraper_in_data
251  if self.input_data.batch_item.urlObj is not None:
252  urlString = self.input_data.batch_item.urlObj.url
253  else:
254  urlString = ""
255  logMsg = "BatchItem.siteId=" + str(self.input_data.batch_item.siteId) + \
256  ", BatchItem.urlId=" + str(self.input_data.batch_item.urlId) + \
257  ", BatchItem.urlObj.url=" + urlString
258  app.Profiler.messagesList.append(logMsg)
259  self.logger.info("Incoming data: %s", logMsg)
260 
261  self.urlHost = app.Utils.UrlParser.getDomain(self.input_data.url)
262 
263 
264  if self.input_data.output_format is not None and "name" in self.input_data.output_format:
265  self.outputFormat = self.input_data.output_format["name"]
266 
267  if self.outputFormat is None and "templates" in self.input_data.batch_item.properties["template"] and \
268  len(self.input_data.batch_item.properties["template"]["templates"]) > 0 and \
269  "output_format" in self.input_data.batch_item.properties["template"]["templates"][0] and \
270  "name" in self.input_data.batch_item.properties["template"]["templates"][0]["output_format"]:
271  self.outputFormat = self.input_data.batch_item.properties["template"]["templates"][0]["output_format"]["name"]
272 
273  if "TAGS_MAPPING" in self.input_data.batch_item.properties and \
274  self.input_data.batch_item.properties["TAGS_MAPPING"] is not None:
275  try:
276  self.altTagsMask = json.loads(self.input_data.batch_item.properties["TAGS_MAPPING"])
277  self.logger.debug(">>> AltTags = " + str(self.altTagsMask))
278  except Exception as exp:
279  self.logger.debug(">>> Bad TAGS_MAPPING properties value, err=" + str(exp))
280 
281  try:
282  if (self.input_data is not None) and (self.input_data.processor_properties is not None):
283  processor_properties = self.input_data.processor_properties
284  self.logger.debug("Processor's properties were taken from input data: %s" % processor_properties)
285  self.logger.debug("Processor's properties type: %s" % str(type(processor_properties)))
286  if not isinstance(processor_properties, types.DictType):
287  processor_properties = json.loads(self.input_data.processor_properties)
288  self.logger.debug("Processor's properties were taken from input data: %s" % processor_properties)
289  self.properties.update(processor_properties)
290  except Exception as err:
291  ExceptionLog.handler(self.logger, err, 'Error load properties from input data:')
292 
293  self.algorithm_name = self.properties[CONSTS.ALGORITHM_KEY][CONSTS.ALGORITHM_NAME_KEY]
294  self.logger.debug("Algorithm : %s" % self.algorithm_name)
295  if self.usageModel == APP_CONSTS.APP_USAGE_MODEL_PROCESS:
296  Utils.storePickleOnDisk(input_pickled_object, ENV_SCRAPER_STORE_PATH, "scraper.in." + \
297  str(self.input_data.urlId))
298  if "metrics" in self.properties:
299  try:
300  self.metrics = json.loads(self.properties["metrics"])
301  self.logger.debug(">>> Metrics loads = " + str(self.metrics))
302  except Exception as excp:
303  self.logger.debug(">>> Metrics loads exception = " + str(excp))
304  # TODO main processing over every url from list of urls in the batch object
305  tmp = sys.stdout
306  sys.stdout = open("/dev/null", "wb")
307 
308  # initialization of scraper
309  # load scraper's modules
310  self.loadExtractors()
311 
312  # # Initialization pubdate
313  # self.logger.debug("Initialization pubdate from urlObj.pDate use value: %s",
314  # str(self.input_data.batch_item.urlObj.pDate))
315  # self.pubdate = self.input_data.batch_item.urlObj.pDate
316 
317  scraperResponses = self.jsonParserProcess()
318 
319  sys.stdout = tmp
320 
321  self.logger.debug("scraperResponse:\n%s", varDump(scraperResponses))
322  if self.usageModel == APP_CONSTS.APP_USAGE_MODEL_PROCESS:
323  output_pickled_object = pickle.dumps(scraperResponses)
324  Utils.storePickleOnDisk(output_pickled_object, ENV_SCRAPER_STORE_PATH,
325  "scraper.out." + str(self.input_data.urlId))
326  print output_pickled_object
327  sys.stdout.flush()
328  else:
329  self.output_data = scraperResponses
330  except Exception as err:
331  ExceptionLog.handler(self.logger, err, 'ScraperCustomJson process batch error:')
332  self.exitCode = EXIT_FAILURE
333  raise Exception('ScraperCustomJson process batch error:' + str(err))
334 
335 
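In the process usage model the batch arrives pickled on stdin and the response list leaves pickled on stdout; a parent-side sketch of that contract (the command line and batch object are hypothetical stand-ins):

import pickle
import subprocess

batch = {"urlId": 1}   # stand-in for the real batch object
proc = subprocess.Popen(["python", "ScraperCustomJson.py"],  # hypothetical invocation
                        stdin=subprocess.PIPE, stdout=subprocess.PIPE)
out, _ = proc.communicate(pickle.dumps(batch))
scraperResponses = pickle.loads(out)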

◆ resourceExtraction()

def dc_processor.ScraperCustomJson.resourceExtraction (   self,
  jsonElem 
)

Definition at line 394 of file ScraperCustomJson.py.

394  def resourceExtraction(self, jsonElem):
395  ret = []
396  # get resource as dictionary
397  resource_set = {}
398  resource_set["url"] = self.input_data.url
399  resource_set["resId"] = self.input_data.urlId
400  resource_set["siteId"] = self.input_data.siteId
401  resource_set["raw_html"] = jsonElem
402  resource = Resource(resource_set)
403 
404  # get best matching extractor
405  self.extractor = self.getNextBestExtractor()
406  self.logger.debug("get best matching extractor: " + str(self.extractor))
407 
408  # search engine parsing ???
409  collectResult = Result(self.config, self.input_data.urlId, self.metrics)
411  # main loop
411  while self.extractor:
412  result = Result(self.config, self.input_data.urlId, self.metrics)
413  self.logger.debug(">>> TAG BEGIN extractor = " + str(self.extractor))
414  result = self.extractor.extractTags(resource, result)
415 
416  self.logger.debug(">>> TAG END")
417  empty_tags = result.getEmptyTags()
418  self.logger.debug("get list of empty tags from result: " + str(empty_tags))
419  filled_tags = result.getFilledTags()
420  self.logger.debug("get list of filled_tags from result: " + str(filled_tags))
421  self.extractor = self.getNextBestExtractor()
422  self.logger.debug("get best matching extractor: " + str(self.extractor))
423 
424  for key in result.tags:
425  if key not in collectResult.tags or not collectResult.isTagFilled(key):
426  collectResult.tags[key] = copy.deepcopy(result.tags[key])
427  ret.append(result)
428  self.logger.debug(">>> EXIT LOOP")
429  ret = [collectResult] + ret
430  return ret
431 
432 
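The merge rule of the main loop: the first extractor to fill a tag wins, and later results only supply tags that are still empty; a sketch with plain dicts standing in for Result.tags (isTagFilled() approximated by truthiness):

import copy

collected = {}   # stands in for collectResult.tags
for tags in ({"title": "A"}, {"title": "B", "body": "C"}):
    for key, value in tags.items():
        if key not in collected or not collected[key]:
            collected[key] = copy.deepcopy(value)
print(collected)  # title stays 'A'; body filled with 'C'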

◆ run()

def dc_processor.ScraperCustomJson.run (   self)

Definition at line 130 of file ScraperCustomJson.py.

130  def run(self):
131  if self.usageModel == APP_CONSTS.APP_USAGE_MODEL_PROCESS:
132  # call base class run method
133  foundation.CementApp.run(self)
134 
135  # config section
136  self.loadConfig()
137 
138  # load logger config file
139  self.loadLogConfigFile()
140 
141  # options
142  self.loadOptions()
143 
144  # scraper properties
145  self.loadScraperProperties()
146 
147  # Do applied algorithm's job
148  self.processBatch()
149 
150  if self.usageModel == APP_CONSTS.APP_USAGE_MODEL_PROCESS:
151  # Finish logging
152  self.logger.info(APP_CONSTS.LOGGER_DELIMITER_LINE)
153 
154 

◆ setup()

def dc_processor.ScraperCustomJson.setup (   self)

Definition at line 122 of file ScraperCustomJson.py.

122  def setup(self):
123  if self.usageModel == APP_CONSTS.APP_USAGE_MODEL_PROCESS:
124  # call base class setup method
125  foundation.CementApp.setup(self)
126 
127 

Variable Documentation

◆ algorithm_name

dc_processor.ScraperCustomJson.algorithm_name

Definition at line 104 of file ScraperCustomJson.py.

◆ altTagsMask

dc_processor.ScraperCustomJson.altTagsMask

Definition at line 112 of file ScraperCustomJson.py.

◆ config

dc_processor.ScraperCustomJson.config

Definition at line 159 of file ScraperCustomJson.py.

◆ configFile

dc_processor.ScraperCustomJson.configFile

Definition at line 93 of file ScraperCustomJson.py.

◆ datetimeTemplateTypes

dc_processor.ScraperCustomJson.datetimeTemplateTypes

Definition at line 117 of file ScraperCustomJson.py.

◆ dbWrapper

dc_processor.ScraperCustomJson.dbWrapper

Definition at line 116 of file ScraperCustomJson.py.

◆ ENV_SCRAPER_STORE_PATH

string dc_processor.ScraperCustomJson.ENV_SCRAPER_STORE_PATH = "ENV_SCRAPER_STORE_PATH"

Definition at line 65 of file ScraperCustomJson.py.

◆ ERROR_OK

int dc_processor.ScraperCustomJson.ERROR_OK = 0

Definition at line 57 of file ScraperCustomJson.py.

◆ errorMask

dc_processor.ScraperCustomJson.errorMask

Definition at line 102 of file ScraperCustomJson.py.

◆ EXIT_FAILURE

int dc_processor.ScraperCustomJson.EXIT_FAILURE = 1

Definition at line 61 of file ScraperCustomJson.py.

◆ EXIT_SUCCESS

int dc_processor.ScraperCustomJson.EXIT_SUCCESS = 0

Definition at line 60 of file ScraperCustomJson.py.

◆ exitCode

dc_processor.ScraperCustomJson.exitCode

Definition at line 91 of file ScraperCustomJson.py.

◆ extractor

dc_processor.ScraperCustomJson.extractor

Definition at line 97 of file ScraperCustomJson.py.

◆ extractors

dc_processor.ScraperCustomJson.extractors

Definition at line 98 of file ScraperCustomJson.py.

◆ input_data

dc_processor.ScraperCustomJson.input_data

Definition at line 95 of file ScraperCustomJson.py.

◆ itr

dc_processor.ScraperCustomJson.itr

Definition at line 99 of file ScraperCustomJson.py.

◆ logger

dc_processor.ScraperCustomJson.logger

Definition at line 94 of file ScraperCustomJson.py.

◆ metrics

dc_processor.ScraperCustomJson.metrics

Definition at line 111 of file ScraperCustomJson.py.

◆ MSG_ERROR_LOAD_EXTRACTORS

string dc_processor.ScraperCustomJson.MSG_ERROR_LOAD_EXTRACTORS = "Error load extractors "

Definition at line 63 of file ScraperCustomJson.py.

◆ MSG_ERROR_WRONG_CONFIG_FILE_NAME

string dc_processor.ScraperCustomJson.MSG_ERROR_WRONG_CONFIG_FILE_NAME = "Config file name is wrong"

Definition at line 72 of file ScraperCustomJson.py.

◆ OPTION_SECTION_DATETIME_TEMPLATE_TYPES

string dc_processor.ScraperCustomJson.OPTION_SECTION_DATETIME_TEMPLATE_TYPES = 'tags_datetime_template_types'

Definition at line 75 of file ScraperCustomJson.py.

◆ output_data

dc_processor.ScraperCustomJson.output_data

Definition at line 115 of file ScraperCustomJson.py.

◆ outputFormat

dc_processor.ScraperCustomJson.outputFormat

Definition at line 110 of file ScraperCustomJson.py.

◆ processedContent

dc_processor.ScraperCustomJson.processedContent

Definition at line 109 of file ScraperCustomJson.py.

◆ properties

dc_processor.ScraperCustomJson.properties

Definition at line 96 of file ScraperCustomJson.py.

◆ pubdate

dc_processor.ScraperCustomJson.pubdate

Definition at line 100 of file ScraperCustomJson.py.

◆ scraperPropFileName

dc_processor.ScraperCustomJson.scraperPropFileName

Definition at line 103 of file ScraperCustomJson.py.

◆ scraperResponses

dc_processor.ScraperCustomJson.scraperResponses

Definition at line 105 of file ScraperCustomJson.py.

◆ TAGS_DATETIME_NEWS_NAMES

list dc_processor.ScraperCustomJson.TAGS_DATETIME_NEWS_NAMES = [CONSTS.TAG_PUB_DATE, CONSTS.TAG_DC_DATE]

Definition at line 67 of file ScraperCustomJson.py.

◆ TAGS_DATETIME_TEMPLATE_TYPES

list dc_processor.ScraperCustomJson.TAGS_DATETIME_TEMPLATE_TYPES = [CONSTS.TAG_TYPE_DATETIME]

Definition at line 74 of file ScraperCustomJson.py.

◆ tagsCount

dc_processor.ScraperCustomJson.tagsCount

Definition at line 106 of file ScraperCustomJson.py.

◆ tagsMask

dc_processor.ScraperCustomJson.tagsMask

Definition at line 107 of file ScraperCustomJson.py.

◆ timezone

dc_processor.ScraperCustomJson.timezone

Definition at line 101 of file ScraperCustomJson.py.

◆ urlHost

dc_processor.ScraperCustomJson.urlHost

Definition at line 114 of file ScraperCustomJson.py.

◆ usageModel

dc_processor.ScraperCustomJson.usageModel

Definition at line 92 of file ScraperCustomJson.py.

◆ useCurrentYear

dc_processor.ScraperCustomJson.useCurrentYear

Definition at line 118 of file ScraperCustomJson.py.