HCE Project Python language Distributed Tasks Manager Application, Distributed Crawler Application and client API bindings.  2.0.0-chaika
Hierarchical Cluster Engine Python language binding
dc_processor.ScraperMultiItemsTask Namespace Reference

Classes

class  Meta
 
class  ScraperMultiItemsTask
 
class  ScraperResultDocuments
 

Functions

def __init__ (self, usageModel=APP_CONSTS.APP_USAGE_MODEL_PROCESS, configFile=None, logger=None, inputData=None)
 
def setup (self)
 
def run (self)
 
def __initApp (self, configName=None)
 
def __loadScraperProperties (self, scraperPropertyFileName)
 
def __loadAppConfig (self, configName)
 
def __loadLogConfig (self, configName)
 
def getExtractorByName (self, extractorName)
 
def getExitCode (self)
 
def __getInputData (self)
 
def __checkInputData (self, inputData)
 
def __fillProfilerMessageList (self, inputData)
 
def __getOutputFormat (self, inputData)
 
def __getAltTagsMask (self, inputData)
 
def __getPropertiesFromInputData (self, inputData)
 
def __loadExtractors (self, algorithmName, config, urlHost)
 
def __createModule (self, moduleName, config, urlHost)
 
def refineBadDateTags (self, response)
 
def preparseResponse (self, response)
 
def formatOutpuElement (self, elem, localOutputFormat)
 
def formatOutputData (self, response, localOutputFormat)
 
def templateExtraction (self, config, urlHost)
 
def applyPostProcessing (self, result, key, postProcessingRE)
 
def getProcessedContent (self, result)
 
def process (self, config)
 
def get_path (self, etreeElement, path=None)
 

Variables

string MSG_ERROR_PARSE_CMD_PARAMS = "Error parse command line parameters."
 
string MSG_ERROR_EMPTY_CONFIG_FILE_NAME = "Config file name is empty."
 
string MSG_ERROR_WRONG_CONFIG_FILE_NAME = "Config file name is wrong"
 
string MSG_ERROR_LOAD_PROPERTIES_FROM_FILE = "Error load Scraper multi items properties from file"
 
string MSG_ERROR_LOAD_APP_CONFIG = "Error loading application config file."
 
string MSG_ERROR_READ_LOG_CONFIG = "Error read log config file."
 
string MSG_ERROR_READ_INPUT_DATA = "Error read input data from stdin."
 
string MSG_ERROR_INPUT_DATA_NONE = "Input data is none"
 
string MSG_ERROR_INPUT_DATA_WITHOUT_BATCH = "Input data without batch item."
 
string MSG_ERROR_INPUT_DATA_WITHOUT_PROPERTIES = "Input data has batch item without 'properties'."
 
string MSG_ERROR_GET_PROPERTIES = "Error getting properties from input data"
 
string MSG_ERROR_LOAD_EXTRACTORS = "Error load extractors "
 
string MSG_ERROR_ADJUST_PR = "Error adjust partial references. "
 
string MSG_ERROR_ADJUST_PUBDATE = "Error adjust PUBDATE. "
 
string MSG_ERROR_ADJUST_TITLE = "Error adjust title. "
 
string MSG_ERROR_ADJUST_LINK_URL = "Error adjust link URL. "
 
string SCRAPER_MULTI_ITEMS_OPTION_LOG = "log"
 
string SCRAPER_MULTI_ITEMS_OPTION_PROPERTY_JSON_FILE = "property_file_name"
 
string ENV_SCRAPER_STORE_PATH = "self.ENV_SCRAPER_STORE_PATH"
 
dictionary EXTENDED_NEWS_TAGS = {"description": ["//meta[@name='description']//@content"]}
 
list DATA_NEWS_TAGS = [CONSTS.TAG_DC_DATE]
 
list TAGS_DATETIME_TEMPLATE_TYPES = [CONSTS.TAG_TYPE_DATETIME]
 
string OPTION_SECTION_DATETIME_TEMPLATE_TYPES = 'tags_datetime_template_types'
 
 exitCode
 
 usageModel
 
 configFile
 
 logger
 
 input_data
 
 properties
 
 outputFormat
 
 output_data
 
 extractor
 
 extractors
 
 itr
 
 pubdate
 
 errorMask
 
 xpathSplitString
 
 useCurrentYear
 
 datetimeTemplateTypes
 
 dbWrapper
 
 mediaLimitsHandler
 

Detailed Description

HCE project, Python bindings, Distributed Tasks Manager application.
The ScraperMultiItemsTask class contains the main scraping functionality for multi-item extraction.

@package: dc_processor
@file ScraperMultiItemsTask.py
@author Alexander Vybornyh <alexander.hce.cluster@gmail.com>
@link: http://hierarchical-cluster-engine.com/
@copyright: Copyright © 2013-2015 IOIX Ukraine
@license: http://hierarchical-cluster-engine.com/license/
@since: 0.1
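
A minimal usage sketch (not taken from the project sources; the import path and config file name are assumptions). In the default process usage model the task reads a pickled input object from stdin and prints a pickled list of ScraperResponse objects to stdout:

    from dc_processor.ScraperMultiItemsTask import ScraperMultiItemsTask

    # Hypothetical config path; in the process usage model the pickled input
    # object is expected on stdin (see __getInputData() below).
    task = ScraperMultiItemsTask(configFile="../ini/scraper-multi-items.ini")
    task.setup()                    # CementApp setup
    task.run()                      # reads input, runs process(), prints pickled responses
    exitCode = task.getExitCode()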

Function Documentation

◆ __checkInputData()

def dc_processor.ScraperMultiItemsTask.__checkInputData (   self,
  inputData 
)
private

Definition at line 640 of file ScraperMultiItemsTask.py.

640  def __checkInputData(self, inputData):
641 
642  if inputData is None:
643  raise Exception(self.MSG_ERROR_INPUT_DATA_NONE)
644 
645  if inputData.batch_item is None:
646  raise Exception(self.MSG_ERROR_INPUT_DATA_WITHOUT_BATCH)
647 
648  if inputData.batch_item.properties is None:
649  raise Exception(self.MSG_ERROR_INPUT_DATA_WITHOUT_PROPERTIES)
650 
651 
652 

◆ __createModule()

def dc_processor.ScraperMultiItemsTask.__createModule (   self,
  moduleName,
  config,
  urlHost 
)
private

Definition at line 784 of file ScraperMultiItemsTask.py.

784  def __createModule(self, moduleName, config, urlHost):
785  # varable for result
786  appInst = None
787  try:
788  appInst = (moduleName, eval(moduleName)(config, None, urlHost))[1] # pylint: disable=W0123
789  self.logger.debug("%s has been created!" % moduleName)
790  except Exception, err:
791  ExceptionLog.handler(self.logger, err, "Can't create module %s. Error is:" % (moduleName))
792 
793  return appInst
794 
795 
796 
797 # # #adjust partial references
798 # # adjust partial references
799 # #
800 # def checkDOMElement(self, elem):
801 # ret = False
802 # if re.search('<', elem):
803 # self.logger.debug("Media tag contain DOM element: %s", elem)
804 # ret = True
805 # return ret
806 
807 
808 # # #adjust partial references
809 # # adjust partial references
810 # #
811 # def adjustPartialReferences(self, response):
812 # if "links" in response.tags and isinstance(response.tags["link"], dict) and \
813 # "media" in response.tags and isinstance(response.tags["media"], dict):
814 # try:
815 # url = None
816 # if self.input_data.template and "link" in self.input_data.template:
817 # self.logger.debug("url type: %s", str(type(response.tags["link"]["data"])))
818 # if isinstance(response.tags["link"]["data"], str) or isinstance(response.tags["link"]["data"], unicode):
819 # url = response.tags["link"]["data"]
820 # else:
821 # url = response.tags["link"]["data"][0]
822 # else:
823 # url = self.input_data.url
824 # if self.input_data.template and "media" in self.input_data.template:
825 # self.logger.debug("resource has template with media tag. Try to adjust media.")
826 # # if type(response.tags["media"]) == str and response.tags["media"] == "": return
827 # self.logger.debug("response.tags['media']: " + str(response.tags["media"]))
828 # self.logger.debug("media tag in response: <<%s>>" % str(response.tags["media"]["data"]))
829 # self.logger.debug("link tag in response: <<%s>>" % str(url))
830 # res = []
831 #
832 # filter_patterns, filter_types = [], []
833 # if self.input_data.filters:
834 # # filter_types = [filter_item["Type"] for filter_item in self.input_data.filters]
835 # # filter_patterns = [re.compile(filter_item["Pattern"]) for filter_item in self.input_data.filters]
836 # filter_types = [filter_item.type for filter_item in self.input_data.filters]
837 # filter_patterns = [re.compile(filter_item.pattern) for filter_item in self.input_data.filters]
838 # self.logger.debug("filter: %s" % (str(self.input_data.filters)))
839 # for media in response.tags["media"]["data"]:
840 # self.logger.debug("Media link: <<%s>>", media)
841 # # instead pure url
842 # if self.checkDOMElement(media):
843 # res.append(media)
844 # break
845 # media = urlparse.urljoin(url, media)
846 # for filter_type, filter_pattern in zip(filter_types, filter_patterns):
847 # match = filter_pattern.match(media)
848 # if filter_type == SiteFilter.TYPE_EXCLUDE and match:
849 # break
850 # if filter_type == SiteFilter.TYPE_INCLUDE and match:
851 # res = self.checkMediaTag(media, res)
852 # break
853 # else:
854 # self.logger.debug("media: %s", media)
855 # self.logger.debug("url: %s", url)
856 # res = self.checkMediaTag(media, res)
857 #
858 # # If media tag after adjusting is empty remove it from response
859 # if not len(res):
860 # self.logger.debug("media tag is empty. Remove media tag from response.")
861 # del response.tags["media"]
862 # else:
863 # self.logger.debug("media tag is adjusted. Copy media tag to response.")
864 # response.tags["media"]["data"] = res
865 # # End code block removing empty media tag
866 # else:
867 # self.logger.debug("resource hasn't template with media tag. adjustPartialReferences doesn't execute")
868 # except Exception as err:
869 # ExceptionLog.handler(self.logger, err, self.MSG_ERROR_ADJUST_PR, (err), \
870 # {ExceptionLog.LEVEL_NAME_ERROR:ExceptionLog.LEVEL_VALUE_DEBUG})
871 # else:
872 # self.logger.debug(">>> Response has not have link or media tag, Don't need adjust media")
873 
874 
875 # # adjustTitle
876 # #
877 # def adjustTitle(self, response):
878 # try:
879 # if self.input_data.template and "title" in self.input_data.template:
880 # self.logger.debug("resource has template with title tag. Try to adjust title.")
881 # self.logger.debug("response.tags['title']: " + str(response.tags["title"]))
882 # if not self.extractor:
883 # if len(self.extractors) > 2:
884 # self.extractor = self.extractors[2]
885 # else:
886 # raise Exception(" >>> Wrong! self.extractors list doesn't have 3'th element (index 2)")
887 # if isinstance(response.tags["title"], str):
888 # self.logger.debug("response has not have title tag")
889 # sel = Selector(text=self.input_data.raw_content)
890 # title = sel.xpath("//title/text()").extract()
891 # self.extractor.addTag(result=response, tag_name="title", tag_value=title, xpath="", \
892 # isDefaultTag=False, callAdjustment=True, tagType=None, allowNotFilled=True)
893 # self.logger.debug("TYPE response.tags['title']['data']" + str(type(response.tags["title"]["data"])))
894 # else:
895 # self.logger.debug("resource hasn't template with title tag. Don't need adjust title.")
896 # except Exception as err:
897 # ExceptionLog.handler(self.logger, err, self.MSG_ERROR_ADJUST_TITLE, (err), \
898 # {ExceptionLog.LEVEL_NAME_ERROR:ExceptionLog.LEVEL_VALUE_DEBUG})
899 
900 
901 # # adjustLinkURL
902 # #
903 # def adjustLinkURL(self, response):
904 # flag = False
905 # try:
906 # if response.tags and "link" in response.tags:
907 # self.logger.debug("resource has template with link tag. Try to adjust link.")
908 # self.logger.debug("response.tags['link']: " + str(response.tags["link"]))
909 # self.logger.debug("self.extractor: %s", str(self.extractor))
910 # flag = True
911 # if self.extractor:
912 # self.logger.debug("Extractor exists")
913 # if isinstance(response.tags["link"], str):
914 # self.logger.debug("response has not have link tag")
915 # self.extractor.addTag(result=response, tag_name="link", tag_value=[self.input_data.url], xpath="", \
916 # isDefaultTag=False, callAdjustment=True, tagType=None, allowNotFilled=True)
917 # # bypass
918 # else:
919 # response.tags["link"]["data"] = self.input_data.url
920 # else:
921 # if len(self.extractors) > 2:
922 # self.extractors[2].addTag(result=response, tag_name="link", tag_value=[self.input_data.url], xpath="", \
923 # isDefaultTag=False, callAdjustment=True, tagType=None, allowNotFilled=True)
924 # else:
925 # self.logger.debug(">>> Wrong! self.extractors list doesn't have 3'th element (index 2)")
926 # self.logger.debug("TYPE response.tags['link']['data']" + str(type(response.tags["link"]["data"])))
927 # else:
928 # self.logger.debug("resource hasn't template with link tag. Don't need adjust link.")
929 # except Exception as err:
930 # ExceptionLog.handler(self.logger, err, self.MSG_ERROR_ADJUST_LINK_URL, (err), \
931 # {ExceptionLog.LEVEL_NAME_ERROR:ExceptionLog.LEVEL_VALUE_DEBUG})
932 #
933 # return flag
934 
935 
936 # # # Normalize datetime tags procedure
937 # #
938 # # @param response - scraper response instance
939 # # @param algorithmName - algorithm name
940 # # @return - 'pubdate tag value'
941 # def normalizeDatetime(self, response, algorithmName):
942 # ret = None
943 # timezone = ''
944 # try:
945 # if response is not None and response.tags is not None:
946 # self.logger.debug("normalizeDatetime scraper response: " + varDump(response))
947 # tagNames = []
948 # if self.input_data.template and algorithmName == CONSTS.PROCESS_ALGORITHM_REGULAR:
949 # # temlate
950 # for responseType in self.datetimeTemplateTypes:
951 # for responseTagName in response.tags:
952 # self.logger.debug("normalizeDatetime responseTagName: '" + str(responseTagName) + "'")
953 # if (responseTagName in response.tags and \
954 # response.tags[responseTagName] is not None and \
955 # response.tags[responseTagName].has_key('type') and \
956 # response.tags[responseTagName]['type'] == responseType) or \
957 # (responseTagName in response.tags and response.tags[responseTagName] is not None and \
958 # responseTagName == CONSTS.TAG_PUB_DATE):
959 # tagNames.append(responseTagName)
960 # else:
961 # pass
962 # else:
963 # pass
964 #
965 # self.logger.debug('normalizeDatetime tagNames: ' + varDump(tagNames))
966 # retDict = {}
967 # for tagName in tagNames:
968 # pubdate, tzone = self.extractPubDate(response, tagName) # , properties, urlString)
969 # if self.extractor and tagName in response.tags:
970 # self.extractor.addTag(result=response, tag_name=tagName + '_normalized', tag_value=pubdate, \
971 # xpath=response.tags[tagName]['xpath'], isDefaultTag=False, \
972 # callAdjustment=True, tagType=None, allowNotFilled=True)
973 #
974 # self.logger.debug('tagName: ' + str(tagName) + ' pubdate: ' + str(pubdate))
975 # retDict[tagName] = pubdate
976 #
977 # if tagName == CONSTS.TAG_PUB_DATE:
978 # ret = pubdate
979 # timezone = tzone
980 # else:
981 # pass
982 #
983 # if ret is None:
984 # for key, value in retDict.items():
985 # if value is not None:
986 # ret = value
987 # self.logger.debug('set return value from ' + str(key) + ' : ' + str(value))
988 # break
989 #
990 # except Exception, err:
991 # ExceptionLog.handler(self.logger, err, 'normalizeDatetime error:', (), \
992 # {ExceptionLog.LEVEL_NAME_ERROR:ExceptionLog.LEVEL_VALUE_DEBUG})
993 #
994 # return ret, timezone
995 
996 
997 # # # Extract pubdate
998 # #
999 # # @param response - response instance
1000 # # @param dataTagName - tag name for extracting
1001 # # @param properties - properties from PROCESSOR_PROPERTIES
1002 # # @param urlString - url string value
1003 # # @return pubdate if success or None
1004 # def extractPubDate(self, response, dataTagName): # , properties, urlString):
1005 # # variable for result
1006 # ret = None
1007 # timezone = ''
1008 # try:
1009 # if response is not None and dataTagName in response.tags and response.tags[dataTagName] != "":
1010 #
1011 # self.logger.debug("extractPubDate response: " + varDump(response))
1012 #
1013 # if dataTagName in response.tags and response.tags[dataTagName] is not None:
1014 # inputData = response.tags[dataTagName]["data"]
1015 # self.logger.debug("extractPubDate response has '" + str(dataTagName) + "' is: " + str(inputData))
1016 # self.logger.debug("extractPubDate type of '" + str(dataTagName) + "' is: " + str(type(inputData)))
1017 #
1018 # inputList = []
1019 # if isinstance(inputData, str) or isinstance(inputData, unicode):
1020 # inputList = [inputData]
1021 # elif isinstance(inputData, list):
1022 # inputList = inputData
1023 # else:
1024 # pass
1025 #
1026 # pubdate = []
1027 # timezones = []
1028 # for inputElem in inputList:
1029 # d = DateTimeType.parse(inputElem, bool(self.useCurrentYear), self.logger, False)
1030 # self.logger.debug('pubdate: ' + str(d))
1031 #
1032 # if d is not None:
1033 # d, tzone = DateTimeType.split(d)
1034 # pubdate.append(d.isoformat(DateTimeType.ISO_SEP))
1035 # timezones.append(tzone)
1036 #
1037 # self.logger.debug("extractPubDate result pubdate: " + str(pubdate))
1038 # response.tags[dataTagName]["data"] = pubdate
1039 # if len(pubdate) > 0:
1040 # ret = pubdate[0]
1041 #
1042 # if len(timezones) > 0:
1043 # timezone = timezones[0]
1044 #
1045 # except Exception, err:
1046 # ExceptionLog.handler(self.logger, err, 'extractPubDate error:', (), \
1047 # {ExceptionLog.LEVEL_NAME_ERROR:ExceptionLog.LEVEL_VALUE_DEBUG})
1048 #
1049 # return ret, timezone
1050 
1051 
1052 # # # pubdate transformation use timezone value
1053 # #
1054 # # @param rawPubdate - raw pubdate string
1055 # # @param rawTimezone - raw timezone string
1056 # # @param properties - properties from PROCESSOR_PROPERTIES
1057 # # @param urlString - url string value
1058 # # @return pubdate and timezone if success or None and empty string
1059 # def pubdateTransform(self, rawPubdate, rawTimezone, properties, urlString):
1060 # # variables for result
1061 # pubdate = rawPubdate
1062 # timezone = rawTimezone
1063 #
1064 # self.logger.debug('properties: ' + varDump(properties))
1065 # if CONSTS.PDATE_TIMEZONES_NAME in properties:
1066 # propertyString = properties[CONSTS.PDATE_TIMEZONES_NAME]
1067 # self.logger.debug('inputted ' + CONSTS.PDATE_TIMEZONES_NAME + ':' + str(propertyString))
1068 #
1069 # dt = DateTimeType.parse(rawPubdate, bool(self.useCurrentYear), self.logger, False)
1070 # self.logger.debug('pubdate: ' + str(dt))
1071 # if dt is not None:
1072 # # get utc offset if necessary
1073 # utcOffset = DateTimeType.extractUtcOffset(rawTimezone, self.logger)
1074 # self.logger.debug('utcOffset: ' + str(utcOffset))
1075 # # transformation accord to PDATE_TIMEZONES properties
1076 # d = PDateTimezonesHandler.transform(dt, utcOffset, propertyString, urlString, self.logger)
1077 # if d is not None:
1078 # dt = d
1079 #
1080 # if dt is not None:
1081 # d, tzone = DateTimeType.split(dt)
1082 # pubdate = d.isoformat(DateTimeType.ISO_SEP)
1083 # timezone = tzone
1084 #
1085 # return pubdate, timezone
1086 
1087 

◆ __fillProfilerMessageList()

def dc_processor.ScraperMultiItemsTask.__fillProfilerMessageList (   self,
  inputData 
)
private

Definition at line 657 of file ScraperMultiItemsTask.py.

657  def __fillProfilerMessageList(self, inputData):
658 
659  if inputData.batch_item.urlObj is not None:
660  urlString = inputData.batch_item.urlObj.url
661  else:
662  urlString = ""
663  logMsg = "BatchItem.siteId=" + str(inputData.batch_item.siteId) + \
664  ", BatchItem.urlId=" + str(inputData.batch_item.urlId) + \
665  ", BatchItem.urlObj.url=" + urlString
666  app.Profiler.messagesList.append(logMsg)
667  self.logger.info("Incoming data: %s", logMsg)
668 
669 

◆ __getAltTagsMask()

def dc_processor.ScraperMultiItemsTask.__getAltTagsMask (   self,
  inputData 
)
private

Definition at line 696 of file ScraperMultiItemsTask.py.

696  def __getAltTagsMask(self, inputData):
697  # variable for result
698  altTagsMask = None
699  if "TAGS_MAPPING" in inputData.batch_item.properties and \
700  inputData.batch_item.properties["TAGS_MAPPING"] is not None:
701  try:
702  altTagsMask = json.loads(inputData.batch_item.properties["TAGS_MAPPING"])
703  self.logger.debug(">>> AltTagsMask = " + str(altTagsMask))
704  except Exception, err:
705  ExceptionLog.handler(self.logger, err, 'Bad TAGS_MAPPING properties value:', \
706  (inputData.batch_item.properties["TAGS_MAPPING"]), \
707  {ExceptionLog.LEVEL_NAME_ERROR:ExceptionLog.LEVEL_VALUE_DEBUG})
708 
709  return altTagsMask
710 
711 
712 
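A hedged illustration of the expected property shape: 'TAGS_MAPPING' must hold a JSON document that json.loads() can decode; the concrete mapping below is hypothetical.

    import json

    properties = {"TAGS_MAPPING": '{"title": "alt_title"}'}   # hypothetical mapping value
    altTagsMask = json.loads(properties["TAGS_MAPPING"])      # -> {u'title': u'alt_title'}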

◆ __getInputData()

def dc_processor.ScraperMultiItemsTask.__getInputData (   self)
private

Definition at line 620 of file ScraperMultiItemsTask.py.

620  def __getInputData(self):
621  # variable for result
622  scraperInputData = None
623  try:
624  # read pickled object from stdin and extract it
625  scraperInputData = pickle.loads(sys.stdin.read())
626  except Exception, err:
627  if self.logger is not None:
628  ExceptionLog.handler(self.logger, err, self.MSG_ERROR_READ_INPUT_DATA)
629  else:
630  pass
631  raise Exception(self.MSG_ERROR_READ_INPUT_DATA + ' ' + str(err))
632 
633  return scraperInputData
634 
635 
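A hedged sketch of the process-model protocol used above and in process(): the parent pickles the input object to the child's stdin and unpickles the response list from its stdout. The script invocation, the command line option and the input object are assumptions.

    import pickle
    import subprocess

    child = subprocess.Popen(["python", "ScraperMultiItemsTask.py", "--config", "scraper.ini"],
                             stdin=subprocess.PIPE, stdout=subprocess.PIPE)
    out, _ = child.communicate(pickle.dumps(scraperInputData))   # scraperInputData is hypothetical
    scraperResponseList = pickle.loads(out)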

◆ __getOutputFormat()

def dc_processor.ScraperMultiItemsTask.__getOutputFormat (   self,
  inputData 
)
private

Definition at line 674 of file ScraperMultiItemsTask.py.

674  def __getOutputFormat(self, inputData):
675  # variable for result
676  outputFormat = None
677 
678  if inputData.output_format is not None and "name" in inputData.output_format:
679  outputFormat = inputData.output_format["name"]
680 
681  if outputFormat is None and "templates" in inputData.batch_item.properties["template"] and \
682  len(inputData.batch_item.properties["template"]["templates"]) > 0 and \
683  "output_format" in inputData.batch_item.properties["template"]["templates"][0] and \
684  "name" in inputData.batch_item.properties["template"]["templates"][0]["output_format"]:
685  outputFormat = inputData.batch_item.properties["template"]["templates"][0]["output_format"]["name"]
686  else:
687  self.logger.debug(">>> 'output_format' hasn't in template of input batch.")
688 
689  return outputFormat
690 
691 
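A hedged illustration of the lookup order implemented above; the field names come from the listing, the surrounding objects are simplified stand-ins.

    # Checked first: the batch-level output format.
    output_format = {"name": "json"}
    # Fallback: the first template of the batch item properties.
    batch_item_properties = {
        "template": {
            "templates": [
                {"output_format": {"name": "html"}}
            ]
        }
    }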

◆ __getPropertiesFromInputData()

def dc_processor.ScraperMultiItemsTask.__getPropertiesFromInputData (   self,
  inputData 
)
private

Definition at line 717 of file ScraperMultiItemsTask.py.

717  def __getPropertiesFromInputData(self, inputData):
718  # variable for result
719  properties = {}
720  try:
721  if (self.input_data is not None) and \
722  inputData.processor_properties is not None:
723  processor_properties = inputData.processor_properties
724  self.logger.debug("Processor's properties was taken from input data: %s" % processor_properties)
725  self.logger.debug("Processor's properties type: %s" % str(type(processor_properties)))
726  if not isinstance(processor_properties, dict):
727  processor_properties = json.loads(inputData.processor_properties)
728  self.logger.debug("Processor's properties was taken from input data: %s" % processor_properties)
729  properties = processor_properties
730 
731  self.logger.debug('>>> inputData.batch_item.properties: ' + varDump(inputData.batch_item.properties) + \
732  ' type: ' + str(type(inputData.batch_item.properties)))
733  if isinstance(inputData.batch_item.properties, dict):
734  properties.update(inputData.batch_item.properties)
735 
736  except Exception, err:
737  ExceptionLog.handler(self.logger, err, self.MSG_ERROR_GET_PROPERTIES, (inputData.processor_properties))
738 
739  return properties
740 
741 

◆ __init__()

def dc_processor.ScraperMultiItemsTask.__init__ (   self,
  usageModel = APP_CONSTS.APP_USAGE_MODEL_PROCESS,
  configFile = None,
  logger = None,
  inputData = None 
)

Definition at line 432 of file ScraperMultiItemsTask.py.

432  def __init__(self, usageModel=APP_CONSTS.APP_USAGE_MODEL_PROCESS, configFile=None, logger=None, inputData=None):
433  if usageModel == APP_CONSTS.APP_USAGE_MODEL_PROCESS:
434  # call base class __init__ method
435  # #foundation.CementApp.__init__(self)
436  Scraper.__init__(self)
437 
438  self.exitCode = APP_CONSTS.EXIT_SUCCESS
439  self.usageModel = usageModel
440  self.configFile = configFile
441  self.logger = logger
442  self.input_data = inputData
443  self.properties = {}
444  self.outputFormat = None
445  self.output_data = None
446  self.extractor = None
447  self.extractors = []
448  self.itr = None
449  self.pubdate = None
450  self.errorMask = APP_CONSTS.ERROR_OK
451  self.xpathSplitString = ' '
452  self.useCurrentYear = 0
453  self.datetimeTemplateTypes = []
454  self.dbWrapper = None
455  self.mediaLimitsHandler = None
456 
457 

◆ __initApp()

def dc_processor.ScraperMultiItemsTask.__initApp (   self,
  configName = None 
)
private

Definition at line 486 of file ScraperMultiItemsTask.py.

486  def __initApp(self, configName=None):
487 
488  if configName is None:
489  configName = self.pargs.config
490  else:
491  pass
492 
493  config, confLogFileName, scraperPropertyFileName = self.__loadAppConfig(configName)
494 
495  self.properties = self.__loadScraperProperties(scraperPropertyFileName)
496 
497  if self.logger is None:
498  self.__loadLogConfig(confLogFileName)
499  else:
500  pass
501 
502  self.logger.info('self.properties: ' + varDump(self.properties))
503 
504  return config
505 
506 

◆ __loadAppConfig()

def dc_processor.ScraperMultiItemsTask.__loadAppConfig (   self,
  configName 
)
private

Definition at line 533 of file ScraperMultiItemsTask.py.

533  def __loadAppConfig(self, configName):
534  # variables for result
535  confLogFileName = ''
536  scraperPropertyFileName = ''
537  try:
538  if configName is None or configName == "":
539  raise Exception(self.MSG_ERROR_EMPTY_CONFIG_FILE_NAME)
540 
541  config = ConfigParser.ConfigParser()
542  config.optionxform = str
543 
544  readOk = config.read(configName)
545 
546  if len(readOk) == 0:
547  raise Exception(self.MSG_ERROR_WRONG_CONFIG_FILE_NAME + ": " + configName)
548 
549  if config.has_section(APP_CONSTS.CONFIG_APPLICATION_SECTION_NAME):
550  confLogFileName = Utils.getConfigParameter(config, APP_CONSTS.CONFIG_APPLICATION_SECTION_NAME, \
551  self.SCRAPER_MULTI_ITEMS_OPTION_LOG, '')
552 
553  scraperPropertyFileName = Utils.getConfigParameter(config, APP_CONSTS.CONFIG_APPLICATION_SECTION_NAME, \
554  self.SCRAPER_MULTI_ITEMS_OPTION_PROPERTY_JSON_FILE, '')
555 
556  self.useCurrentYear = config.getint("DateTimeType", "useCurrentYear")
557 
558  if config.has_section(self.OPTION_SECTION_DATETIME_TEMPLATE_TYPES):
559  self.datetimeTemplateTypes = []
560  for key, value in config.items(self.OPTION_SECTION_DATETIME_TEMPLATE_TYPES):
561  self.datetimeTemplateTypes.append(key)
562  if self.logger is not None:
563  self.logger.debug('load form config: ' + str(key) + ' = ' + str(value))
564  else:
565  self.datetimeTemplateTypes = self.TAGS_DATETIME_TEMPLATE_TYPES
566  if self.logger is not None:
567  self.logger.debug("Config file hasn't section: " + str(self.OPTION_SECTION_DATETIME_TEMPLATE_TYPES))
568 
569  # DBWrapper initialization
570  dbTaskIniConfigFileName = config.get(self.__class__.__name__, "db-task_ini")
571  readOk = config.read(dbTaskIniConfigFileName)
572  if len(readOk) == 0:
573  raise Exception(self.MSG_ERROR_WRONG_CONFIG_FILE_NAME + ": " + dbTaskIniConfigFileName)
574  self.dbWrapper = DBTasksWrapper(config)
575  except Exception, err:
576  raise Exception(self.MSG_ERROR_LOAD_APP_CONFIG + ' ' + str(err))
577 
578  return config, confLogFileName, scraperPropertyFileName
579 
580 
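A hedged example of the expected INI layout. The application section name is resolved from APP_CONSTS.CONFIG_APPLICATION_SECTION_NAME and is assumed here to be [Application]; all paths and the 'datetime' key are placeholders, the remaining section and option names are taken from the listing.

    [Application]
    log = ../ini/scraper_log.ini
    property_file_name = ../ini/scraper-multi-items.json

    [DateTimeType]
    useCurrentYear = 0

    [tags_datetime_template_types]
    datetime = 1

    [ScraperMultiItemsTask]
    db-task_ini = ../ini/db-task.ini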

◆ __loadExtractors()

def dc_processor.ScraperMultiItemsTask.__loadExtractors (   self,
  algorithmName,
  config,
  urlHost 
)
private

Definition at line 748 of file ScraperMultiItemsTask.py.

748  def __loadExtractors(self, algorithmName, config, urlHost):
749  # varable for result
750  extractors = []
751  try:
752  # modules
753  modules = self.properties[CONSTS.MODULES_KEY][algorithmName]
754 
755  self.logger.debug("Algorithm name: <%s>" % (algorithmName))
756  self.logger.debug("Modules: %s" % modules)
757 
758  for module in modules:
759  exrtactor = self.__createModule(module, config, urlHost)
760  # Check if module was created successfully and then insert it to extractors
761  if exrtactor is not None:
762  extractors.append(exrtactor)
763 
764  # Info show extractors loaded
765  self.logger.debug("*******************")
766  self.logger.debug("Loaded extractors:")
767  for extractor in extractors:
768  self.logger.debug(extractor.name)
769  self.logger.debug("*******************")
770 
771  except Exception, err:
772  ExceptionLog.handler(self.logger, err, self.MSG_ERROR_LOAD_EXTRACTORS)
773  raise Exception(self.MSG_ERROR_LOAD_EXTRACTORS + ' ' + str(err))
774 
775  return extractors
776 
777 

◆ __loadLogConfig()

def dc_processor.ScraperMultiItemsTask.__loadLogConfig (   self,
  configName 
)
private

Definition at line 585 of file ScraperMultiItemsTask.py.

585  def __loadLogConfig(self, configName):
586  try:
587  if isinstance(configName, str) and len(configName) == 0:
588  raise Exception(self.MSG_ERROR_EMPTY_CONFIG_FILE_NAME)
589 
590  logging.config.fileConfig(configName)
591 
592  # call rotation log files and initialization logger
593  self.logger = Utils.MPLogger().getLogger()
594 
595  except Exception, err:
596  raise Exception(self.MSG_ERROR_READ_LOG_CONFIG + ' ' + str(err))
597 
598 

◆ __loadScraperProperties()

def dc_processor.ScraperMultiItemsTask.__loadScraperProperties (   self,
  scraperPropertyFileName 
)
private

Definition at line 511 of file ScraperMultiItemsTask.py.

511  def __loadScraperProperties(self, scraperPropertyFileName):
512  # variable for result
513  properties = None
514  if scraperPropertyFileName is not None:
515  try:
516  with open(scraperPropertyFileName, "rb") as fd:
517  scraperProperies = json.loads(fd.read())
518  properties = scraperProperies[self.__class__.__name__][CONSTS.PROPERTIES_KEY]
519  except Exception, err:
520  if self.logger is not None:
521  self.logger.error(self.MSG_ERROR_LOAD_PROPERTIES_FROM_FILE + " '" + \
522  str(scraperPropertyFileName) + "': " + str(err))
523 
524  return properties
525 
526 

◆ applyPostProcessing()

def dc_processor.ScraperMultiItemsTask.applyPostProcessing (   self,
  result,
  key,
  postProcessingRE 
)

Definition at line 1392 of file ScraperMultiItemsTask.py.

1392  def applyPostProcessing(self, result, key, postProcessingRE):
1393  if key in result.tags and "data" in result.tags[key] and result.tags[key]["data"] is not None and \
1394  len(result.tags[key]["data"]) > 0:
1395  try:
1396  matchingVal = re.compile(postProcessingRE)
1397  except re.error as err:
1398  self.logger.debug(">>> RE error = " + str(err))
1399  self.errorMask = self.errorMask | APP_CONSTS.ERROR_RE_ERROR
1400  else:
1401  tmpStr = ""
1402  matchingResult = matchingVal.findall(result.tags[key]["data"][0])
1403  if matchingResult is not None:
1404  for elem in matchingResult:
1405  if isinstance(elem, str) or isinstance(elem, unicode):
1406  tmpStr += str(elem)
1407  tmpStr += ' '
1408  else:
1409  for innerElem in elem:
1410  if innerElem is not None and innerElem != '':
1411  tmpStr += str(innerElem)
1412  tmpStr += ' '
1413  tmpStr = tmpStr.strip()
1414  if tmpStr != "":
1415  self.logger.debug(">>> Replace value, prev. value is = " + result.tags[key]["data"][0])
1416  result.tags[key]["data"][0] = tmpStr
1417  else:
1418  # Set not detected value if no match, changed default behavior by bgv
1419  result.tags[key]["data"][0] = None
1420 
1421 
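A hedged standalone sketch of the post-processing rule above: every match of the regular expression is concatenated back into the first data element, and the value becomes None when nothing matches. The sample data and pattern are hypothetical.

    import re

    data = ["Price: 12 USD, was 15 USD"]
    matches = re.compile(r"\d+ USD").findall(data[0])
    data[0] = " ".join(matches).strip() or None   # -> "12 USD 15 USD"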

◆ formatOutpuElement()

def dc_processor.ScraperMultiItemsTask.formatOutpuElement (   self,
  elem,
  localOutputFormat 
)

Definition at line 1138 of file ScraperMultiItemsTask.py.

1138  def formatOutpuElement(self, elem, localOutputFormat):
1139  ret = elem
1140  if localOutputFormat == "json":
1141  # self.logger.debug(">>> JSON HTML = " + elem)
1142  localStr = json.dumps(elem, ensure_ascii=False)
1143  if localStr[0] == '\"' or localStr[0] == '\'':
1144  localStr = localStr[1:]
1145  if localStr[-1] == '\"' or localStr[-1] == '\'':
1146  localStr = localStr[0:-1]
1147  ret = localStr
1148  # self.logger.debug(">>> JSON HTML = " + ret)
1149  elif localOutputFormat == "html":
1150  ret = xml.sax.saxutils.escape(elem, {"'": "&apos;", "\"" : "&quot;"})
1151  elif localOutputFormat == "sql":
1152  # ret = mdb.escape_string(elem) # pylint: disable=E1101
1153  ret = Utils.escape(elem)
1154  return ret
1155 
1156 
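A hedged illustration of the per-format escaping performed above; json and xml.sax.saxutils behave as in the standard library, Utils.escape() is project-specific and omitted here.

    import json
    import xml.sax.saxutils

    elem = 'He said "hi" & left'
    jsonValue = json.dumps(elem, ensure_ascii=False)[1:-1]                      # surrounding quotes stripped
    htmlValue = xml.sax.saxutils.escape(elem, {"'": "&apos;", "\"": "&quot;"})  # 'He said &quot;hi&quot; &amp; left'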

◆ formatOutputData()

def dc_processor.ScraperMultiItemsTask.formatOutputData (   self,
  response,
  localOutputFormat 
)

Definition at line 1157 of file ScraperMultiItemsTask.py.

1157  def formatOutputData(self, response, localOutputFormat):
1158  # result.tags[key]["data"]
1159  for key in response.tags:
1160  if response.tags[key] is not None:
1161  if "data" in response.tags[key]:
1162  if isinstance(response.tags[key]["data"], list):
1163  for i, elem in enumerate(response.tags[key]["data"]):
1164  response.tags[key]["data"][i] = self.formatOutpuElement(elem, localOutputFormat)
1165  elif isinstance(response.tags[key]["data"], str) or isinstance(response.tags[key]["data"], unicode):
1166  response.tags[key]["data"] = self.formatOutpuElement(response.tags[key]["data"], localOutputFormat)
1167 
1168 

◆ get_path()

def dc_processor.ScraperMultiItemsTask.get_path (   self,
  etreeElement,
  path = None 
)

Definition at line 1690 of file ScraperMultiItemsTask.py.

1690  def get_path(self, etreeElement, path=None):
1691  if path is None:
1692  rpath = []
1693  else:
1694  rpath = path
1695 
1696  p = etreeElement.getparent()
1697  if p is not None:
1698  index = p.index(etreeElement) + 1
1699  rpath.insert(0, (etreeElement.tag, str(index)))
1700  return self.get_path(p, rpath)
1701  else:
1702  rpath.insert(0, etreeElement.tag)
1703  return rpath
1704 
1705 
1706 # # # Extract pubdate rss feed from header
1707 # #
1708 # # @param siteId - Site/Project ID
1709 # # @param url - url string
1710 # # @return pubdate from rss feed
1711 # def extractPubdateRssFeed(self, siteId, url):
1712 # # variable for result
1713 # pubdate = None
1714 # timezone = ''
1715 #
1716 # self.logger.debug('!!! extractPubdateRssFeed siteId: ' + str(siteId))
1717 # self.logger.debug('!!! extractPubdateRssFeed url: ' + str(url))
1718 # headerContent = self.getHeaderContent(siteId, url)
1719 # rawPubdate = self.getVariableFromHeaderContent(headerContent, CRAWLER_CONSTS.pubdateRssFeedHeaderName)
1720 #
1721 # self.logger.debug('!!! getVariableFromHeaderContent: ' + str(rawPubdate))
1722 # if rawPubdate is not None:
1723 # try:
1724 # dt = DateTimeType.parse(rawPubdate, True, self.logger, False)
1725 # if dt is not None:
1726 # dt, timezone = DateTimeType.split(dt)
1727 # pubdate = dt.strftime("%Y-%m-%d %H:%M:%S")
1728 #
1729 # if timezone is '':
1730 # timezone = '+0000'
1731 # except Exception, err:
1732 # self.logger.debug("Unsupported date format: <%s>, error: %s", str(rawPubdate), str(err))
1733 #
1734 # return pubdate, timezone
1735 
1736 
1737 # # # Get header content
1738 # #
1739 # # @param siteId - Site/Project ID
1740 # # @param url - url string
1741 # # @return extracted header content
1742 # def getHeaderContent(self, siteId, url):
1743 # # variable for result
1744 # headerContent = None
1745 # urlContentObj = dc_event.URLContentRequest(siteId, url, \
1746 # dc_event.URLContentRequest.CONTENT_TYPE_RAW_LAST + \
1747 # dc_event.URLContentRequest. CONTENT_TYPE_RAW + \
1748 # dc_event.URLContentRequest.CONTENT_TYPE_HEADERS)
1749 #
1750 # rawContentData = self.dbWrapper.urlContent([urlContentObj])
1751 #
1752 # if rawContentData is not None and len(rawContentData) > 0:
1753 # if rawContentData[0].headers is not None and len(rawContentData[0].headers) > 0 and \
1754 # rawContentData[0].headers[0] is not None:
1755 # headerContent = rawContentData[0].headers[0].buffer
1756 #
1757 # return headerContent
1758 #
1759 #
1760 # # #Get variable from header content
1761 # #
1762 # # @param headerContent - header content
1763 # # @param name - variable name
1764 # # @param makeDecode - boolean flag necessary decode
1765 # # @return extracted value of 'Location'
1766 # def getVariableFromHeaderContent(self, headerContent, name, makeDecode=True):
1767 # # variable for result
1768 # ret = None
1769 #
1770 # header = ''
1771 # if makeDecode and headerContent is not None:
1772 # header = base64.b64decode(headerContent)
1773 #
1774 # headerList = header.split('\r\n')
1775 # self.logger.debug("headerList: " + varDump(headerList))
1776 #
1777 # for elem in headerList:
1778 # pos = elem.find(name + ':')
1779 # if pos > -1:
1780 # ret = elem.replace(name + ':', '').strip()
1781 # self.logger.debug("Found '" + name + "' has value: " + str(ret))
1782 # break
1783 #
1784 # return ret
1785 
1786 
1787 # # # change month orden in pubdate if neccessary
1788 # #
1789 # # @param rawPubdate - raw pubdate string in iso format. sample: '2016-02-07 16:28:00'
1790 # # @param properties - properties from PROCESSOR_PROPERTIES
1791 # # @param urlString - url string value
1792 # # @return pubdate and timezone if success or None and empty string
1793 # def pubdateMonthOrder(self, rawPubdate, properties, urlString):
1794 # # variables for result
1795 # pubdate = rawPubdate
1796 #
1797 # self.logger.debug('pubdateMonthOrder() enter... rawPubdate: ' + str(rawPubdate))
1798 # if CONSTS.PDATE_DAY_MONTH_ORDER_NAME in properties and isinstance(rawPubdate, basestring):
1799 # propertyObj = []
1800 # try:
1801 # self.logger.debug('inputted ' + CONSTS.PDATE_DAY_MONTH_ORDER_NAME + ':' + \
1802 # str(properties[CONSTS.PDATE_DAY_MONTH_ORDER_NAME]))
1803 # propertyObj = json.loads(properties[CONSTS.PDATE_DAY_MONTH_ORDER_NAME])
1804 # except Exception, err:
1805 # self.logger.error("Fail loads '%s', error: %s", str(CONSTS.PDATE_DAY_MONTH_ORDER_NAME), str(err))
1806 #
1807 # for propertyElem in propertyObj:
1808 # try:
1809 # if "pattern" not in propertyElem:
1810 # raise Exception('Property "pattern" not found')
1811 #
1812 # if "order" not in propertyElem:
1813 # raise Exception('Property "order" not found')
1814 #
1815 # pattern = str(propertyElem["pattern"])
1816 # order = int(propertyElem["order"])
1817 #
1818 # if re.search(pattern, urlString, re.UNICODE) is not None:
1819 # self.logger.debug("Pattern '%' found in url: %s", str(pattern), str(urlString))
1820 #
1821 # dt = None
1822 # if order == 0: # means day follows month
1823 # dt = datetime.datetime.strptime(rawPubdate, "%Y-%d-%m %H:%M:%S")
1824 # elif order == 1: # means month follows day
1825 # dt = datetime.datetime.strptime(rawPubdate, "%Y-%m-%d %H:%M:%S")
1826 # else:
1827 # raise Exception("Unsupported value of 'order' == " + str(order))
1828 #
1829 # if dt is not None:
1830 # pubdate = dt.strftime("%Y-%d-%m %H:%M:%S")
1831 #
1832 # except Exception, err:
1833 # self.logger.error("Fail execution '%s', error: %s", str(CONSTS.PDATE_DAY_MONTH_ORDER_NAME), str(err))
1834 #
1835 # self.logger.debug('pubdateMonthOrder() leave... pubdate: ' + str(pubdate))
1836 #
1837 # return pubdate
1838 
1839 
1840 # # # Check media tag and append to list
1841 # #
1842 # # @param urlStringMedia - url string of media tag
1843 # # @param allowedUrls - list for accumulate allowed url strings (by validator and limits)
1844 # # @return allowedUrls list already accumulated allowed url strings
1845 # def checkMediaTag(self, urlStringMedia, allowedUrls):
1846 #
1847 # mediaUrls = self.splitMediaTagString(urlStringMedia)
1848 # for media in mediaUrls:
1849 # # Check if media is binary picture
1850 # if re.search(MediaLimitsHandler.BINARY_IMAGE_SEARCH_STR, media, re.UNICODE) is not None:
1851 # self.logger.debug("Tag 'media' has binary picture...")
1852 #
1853 # if self.mediaLimitsHandler is None:
1854 # allowedUrls.append(media)
1855 # else:
1856 # if self.mediaLimitsHandler.isAllowedLimits(urlString=media, binaryType=True):
1857 # allowedUrls.append(media)
1858 # else:
1859 # self.logger.debug("Binary media tag has not allowed limits. Skipped...")
1860 #
1861 # # Check is media content valid url
1862 # elif isValidURL(media):
1863 # self.logger.debug("Tag 'media' has valid url of picture...")
1864 # if self.mediaLimitsHandler is None:
1865 # allowedUrls.append(media)
1866 # else:
1867 # if self.mediaLimitsHandler.isAllowedLimits(media):
1868 # allowedUrls.append(media)
1869 # else:
1870 # self.logger.debug("Media tag has not allowed limits. Skipped. Url: %s", str(media))
1871 #
1872 # # Invalid url of 'media' tag
1873 # else:
1874 # self.logger.debug("Invalid url in tag 'media'... Url: %s", str(media))
1875 #
1876 # return allowedUrls
1877 #
1878 #
1879 # # # Split media tag string
1880 # #
1881 # # @param urlStringMedia - url string of media tag
1882 # # @return list urls extracted from string of media tag
1883 # def splitMediaTagString(self, urlStringMedia):
1884 # # variable for result
1885 # urls = []
1886 # # temporary string for replace in url string
1887 # REPLACE_STR = 'base64|'
1888 # if urlStringMedia.find(MediaLimitsHandler.BINARY_IMAGE_SEARCH_STR) > -1:
1889 # urlStringMedia = urlStringMedia.replace(MediaLimitsHandler.BINARY_IMAGE_SEARCH_STR, REPLACE_STR)
1890 # urls = urlStringMedia.split(',')
1891 # self.logger.debug("!!! urls before: " + varDump(urls))
1892 # urls = [url.replace(REPLACE_STR, MediaLimitsHandler.BINARY_IMAGE_SEARCH_STR) for url in urls]
1893 # self.logger.debug("!!! urls after: " + varDump(urls))
1894 # else:
1895 # urls = urlStringMedia.split(',')
1896 #
1897 # return urls
1898 
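A hedged usage sketch of get_path(): starting from an lxml element it returns the root tag followed by (tag, 1-based index) pairs for each level below it. The HTML snippet is hypothetical; 'task' stands for an initialized ScraperMultiItemsTask instance.

    from lxml import html

    root = html.fromstring("<html><body><div><p>a</p><p>b</p></div></body></html>")
    second_p = root.xpath("//p")[1]
    # task.get_path(second_p) -> ['html', ('body', '1'), ('div', '1'), ('p', '2')]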

◆ getExitCode()

def dc_processor.ScraperMultiItemsTask.getExitCode (   self)

Definition at line 612 of file ScraperMultiItemsTask.py.

612  def getExitCode(self):
613  return self.exitCode
614 
615 

◆ getExtractorByName()

def dc_processor.ScraperMultiItemsTask.getExtractorByName (   self,
  extractorName 
)

Definition at line 603 of file ScraperMultiItemsTask.py.

603  def getExtractorByName(self, extractorName):
604  for extractor in self.extractors:
605  if extractor.__class__.__name__ == extractorName:
606  return extractor
607  # in case if not found
608  return None
609 
610 

◆ getProcessedContent()

def dc_processor.ScraperMultiItemsTask.getProcessedContent (   self,
  result 
)

Definition at line 1422 of file ScraperMultiItemsTask.py.

1422  def getProcessedContent(self, result):
1423  result.get()
1424  processedContent = {}
1425  processedContent["default"] = result
1426  processedContent["internal"] = [result]
1427  processedContent["custom"] = []
1428 
1429  if "pubdate" in result.tags and "data" in result.tags["pubdate"] and \
1430  len(result.tags["pubdate"]["data"]) > 0:
1431  self.pubdate = result.tags["pubdate"]["data"]
1432  self.logger.debug('>>>> Set self.pubdate = ' + str(self.pubdate))
1433 
1434  return processedContent
1435 
1436 
1437 # # #Internal method of url's domain crc calculating
1438 # #
1439 # # @param url - incoming url
1440 # def calcUrlDomainCrc(self, url):
1441 # urlHost = None
1442 # auth = urlparse.urlsplit(url)[1]
1443 # if auth is not None:
1444 # urlHost = (re.search('([^@]*@)?([^:]*):?(.*)', auth).groups())[1]
1445 # if urlHost is not None and urlHost.find(self.WWW_PREFIX) == 0:
1446 # urlHost = urlHost[len(self.WWW_PREFIX): len(urlHost)]
1447 #
1448 # return urlHost
1449 
1450 

◆ preparseResponse()

def dc_processor.ScraperMultiItemsTask.preparseResponse (   self,
  response 
)

Definition at line 1118 of file ScraperMultiItemsTask.py.

1118  def preparseResponse(self, response):
1119  self.logger.debug('>>> preparseResponse enter <<<')
1120 
1121  for key in response.tags:
1122  if response.tags[key] is not None:
1123  if "data" in response.tags[key]:
1124  if isinstance(response.tags[key]["data"], str) or isinstance(response.tags[key]["data"], unicode):
1125  localStr = response.tags[key]["data"]
1126 
1127  self.logger.debug('-----------------------------------------')
1128  self.logger.debug('key: ' + str(key) + ' => ' + str(localStr))
1129  self.logger.debug('-----------------------------------------')
1130 
1131  response.tags[key]["data"] = []
1132  response.tags[key]["data"].append(localStr)
1133 
1134  self.logger.debug('response.tags[key]["data"]: ' + str(response.tags[key]["data"]))
1135  self.logger.debug('-----------------------------------------')
1136 
1137 
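A hedged illustration of the normalization above: scalar tag data is wrapped into a single-element list, values that are already lists are left untouched.

    tags = {"title": {"data": u"Some headline"}}             # hypothetical response fragment
    if isinstance(tags["title"]["data"], (str, unicode)):    # Python 2, as in the listing
        tags["title"]["data"] = [tags["title"]["data"]]
    # tags["title"]["data"] -> [u'Some headline']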

◆ process()

def dc_processor.ScraperMultiItemsTask.process (   self,
  config 
)

Definition at line 1455 of file ScraperMultiItemsTask.py.

1455  def process(self, config):
1456 
1457  # check recieved input data accord to protocol
1458  self.__checkInputData(self.input_data)
1459 
1460  self.logger.info('Start processing on BatchItem from Batch: ' + str(self.input_data.batchId))
1461 
1462  # fill profiler message list
1463  self.__fillProfilerMessageList(self.input_data)
1464  self.logger.debug("self.inputData:\n%s", varDump(self.input_data))
1465 
1466  # get output data format
1467  self.outputFormat = self.__getOutputFormat(self.input_data)
1468 
1469  # get alt tags mask as property from input data
1470  altTagsMask = self.__getAltTagsMask(self.input_data)
1471 
1472  # get property from input data and use in valid case
1473  properties = self.__getPropertiesFromInputData(self.input_data)
1474  if properties is not None:
1475  self.properties = properties
1476 
1477  algorithmName = self.properties[CONSTS.ALGORITHM_KEY][CONSTS.ALGORITHM_NAME_KEY]
1478 
1479  self.logger.debug("Algorithm : %s" % algorithmName)
1480  if self.usageModel == APP_CONSTS.APP_USAGE_MODEL_PROCESS:
1481  Utils.storePickleOnDisk(self.input_data, self.ENV_SCRAPER_STORE_PATH, "scraper.in." + \
1482  str(self.input_data.urlId))
1483 
1484  tmp = sys.stdout
1485  sys.stdout = open("/dev/null", "wb")
1486 
1487  # initialization of scraper
1488  # load scraper's modules
1489 
1490 
1491  urlHost = self.calcUrlDomainCrc(self.input_data.url)
1492  self.logger.info('urlHost: ' + str(urlHost))
1493 
1494  self.extractors = self.__loadExtractors(algorithmName, config, urlHost)
1495 
1496 
1497  # log info input data
1498  self.logger.info("input_data url: %s, urlId: %s, siteId: %s", str(self.input_data.url), str(self.input_data.urlId),
1499  str(self.input_data.siteId))
1500  # self.logger.debug("input_data:\n" + varDump(self.input_data))
1501 
1502  # self.logger.debug("Initialization pubdate from urlObj.pDate use value: %s",
1503  # str(self.input_data.batch_item.urlObj.pDate))
1504  # self.pubdate = self.input_data.batch_item.urlObj.pDate
1505 
1506  # get iterator to ranked list of extractors
1507  self.itr = iter(sorted(self.extractors, key=lambda extractor: extractor.rank, reverse=True))
1508  self.logger.debug("Extractors: %s" % varDump(self.itr))
1509 
1510  # Reconfigure processor's properties to involve only template scraper
1511  responses = self.templateExtraction(config, urlHost)
1512 
1513  if CONSTS.MEDIA_LIMITS_NAME in self.input_data.batch_item.properties:
1514  self.logger.debug("Found property '%s'", str(CONSTS.MEDIA_LIMITS_NAME))
1515  self.mediaLimitsHandler = MediaLimitsHandler(self.input_data.batch_item.properties[CONSTS.MEDIA_LIMITS_NAME])
1516 
1517  # variable for result
1518  scraperResponseList = []
1519  for response in responses:
1520  if response is not None:
1521  response.stripResult()
1522 
1523  # put extracted article to the db
1524  if algorithmName != CONSTS.PROCESS_ALGORITHM_REGULAR:
1525  self.adjustTitle(response)
1526  self.adjustLinkURL(response)
1527  self.adjustPartialReferences(response)
1528  self.logger.debug("PDate: %s" % str(self.input_data.batch_item.urlObj.pDate))
1529  self.logger.debug("PDate type: %s" % str(type(self.input_data.batch_item.urlObj.pDate)))
1530 
1531 
1532  self.preparseResponse(response)
1533 
1534  self.logger.debug('>>>>> self.properties = ' + varDump(self.properties))
1535 
1536  # Setting pubdate in depend of different sources masks
1537  # default values
1538  pdateSourceMask = APP_CONSTS.PDATE_SOURCES_MASK_BIT_DEFAULT
1539  pdateSourceMaskOverwrite = APP_CONSTS.PDATE_SOURCES_MASK_OVERWRITE_DEFAULT
1540 
1541  # get value 'PDATE_SOURCES_MASK' from site properties
1542  if APP_CONSTS.PDATE_SOURCES_MASK_PROP_NAME in self.input_data.batch_item.properties:
1543  pdateSourceMask = int(self.input_data.batch_item.properties[APP_CONSTS.PDATE_SOURCES_MASK_PROP_NAME])
1544 
1545  # get value 'PDATE_SOURCES_MASK_OVERWRITE' from site properties
1546  if APP_CONSTS.PDATE_SOURCES_MASK_OVERWRITE_PROP_NAME in self.input_data.batch_item.properties:
1547  pdateSourceMaskOverwrite = \
1548  int(self.input_data.batch_item.properties[APP_CONSTS.PDATE_SOURCES_MASK_OVERWRITE_PROP_NAME])
1549 
1550  self.logger.debug('pdateSourceMask = %s, pdateSourceMaskOverwrite = %s',
1551  str(pdateSourceMask), str(pdateSourceMaskOverwrite))
1552 
1553  self.logger.debug("!!! self.input_data.batch_item.urlObj.pDate = " + str(self.input_data.batch_item.urlObj.pDate))
1554 
1555  timezone = ''
1556  # URL object the "pdate" field (supposed was got from the RSS feed)
1557  if pdateSourceMask & APP_CONSTS.PDATE_SOURCES_MASK_RSS_FEED:
1558  if (pdateSourceMaskOverwrite & APP_CONSTS.PDATE_SOURCES_MASK_RSS_FEED) or \
1559  not pdateSourceMaskOverwrite & APP_CONSTS.PDATE_SOURCES_MASK_RSS_FEED:
1560  self.pubdate, timezone = self.extractPubdateRssFeed(self.input_data.siteId, self.input_data.url)
1561 
1562  # Normalization procedure after the scraping, supposes the tag dc_date for the NEWS or TEMPLATE scraping.
1563  if CONSTS.TAG_DC_DATE in response.tags and pdateSourceMask & APP_CONSTS.PDATE_SOURCES_MASK_DC_DATE:
1564  if (pdateSourceMaskOverwrite & APP_CONSTS.PDATE_SOURCES_MASK_DC_DATE and self.pubdate is None) or \
1565  not pdateSourceMaskOverwrite & APP_CONSTS.PDATE_SOURCES_MASK_DC_DATE:
1566  if CONSTS.TAG_PUB_DATE not in response.tags or \
1567  (isinstance(response.tags[CONSTS.TAG_PUB_DATE]["data"], basestring) and \
1568  response.tags[CONSTS.TAG_PUB_DATE]["data"].strip() == ""):
1569  response.tags[CONSTS.TAG_PUB_DATE] = copy.deepcopy(response.tags[CONSTS.TAG_DC_DATE])
1570  response.tags[CONSTS.TAG_PUB_DATE]["name"] = CONSTS.TAG_PUB_DATE
1571  if len(response.tags[CONSTS.TAG_PUB_DATE]) > 0 and response.tags[CONSTS.TAG_PUB_DATE][0]:
1572  self.pubdate = response.tags[CONSTS.TAG_PUB_DATE][0]
1573  self.logger.debug("Pubdate from 'dc_date': " + str(self.pubdate))
1574 
1575  # Normalization procedure after the scraping, supposes the "pubdate" tag for the NEWS or TEMPLATE scraping.
1576  timezone = ''
1577  if pdateSourceMask & APP_CONSTS.PDATE_SOURCES_MASK_PUBDATE:
1578  if (pdateSourceMaskOverwrite & APP_CONSTS.PDATE_SOURCES_MASK_PUBDATE and self.pubdate is None) or \
1579  not pdateSourceMaskOverwrite & APP_CONSTS.PDATE_SOURCES_MASK_PUBDATE:
1580  pubdate, timezone = self.normalizeDatetime(response, algorithmName)
1581  if pubdate is not None:
1582  self.pubdate = pubdate
1583  self.logger.debug("Pubdate from 'pubdate': " + str(self.pubdate))
1584 
1585  # Current date (SQL NOW())
1586  if pdateSourceMask & APP_CONSTS.PDATE_SOURCES_MASK_NOW:
1587  if (pdateSourceMaskOverwrite & APP_CONSTS.PDATE_SOURCES_MASK_NOW and self.pubdate is None) or \
1588  not pdateSourceMaskOverwrite & APP_CONSTS.PDATE_SOURCES_MASK_NOW:
1589  self.pubdate = SQLExpression("NOW()")
1590  self.logger.debug("Pubdate from 'SQL NOW()': " + str(self.pubdate))
1591 
1592  # Custom SQL expression defined in the property PDATE_SOURCES_EXPRESSION
1593  if pdateSourceMask & APP_CONSTS.PDATE_SOURCES_MASK_SQL_EXPRESSION and \
1594  APP_CONSTS.PDATE_SOURCES_EXPRESSION_PROP_NAME in self.properties:
1595  if (pdateSourceMaskOverwrite & APP_CONSTS.PDATE_SOURCES_MASK_SQL_EXPRESSION and self.pubdate is None) or \
1596  not pdateSourceMaskOverwrite & APP_CONSTS.PDATE_SOURCES_MASK_SQL_EXPRESSION:
1597  self.pubdate = SQLExpression(str(self.properties[APP_CONSTS.PDATE_SOURCES_EXPRESSION_PROP_NAME]))
1598  self.logger.debug("Pubdate from 'sql expression': " + str(self.pubdate))
1599 
1600  # Apply property 'PDATE_DAY_MONTH_ORDER'
1601  self.pubdate = self.pubdateMonthOrder(self.pubdate, self.input_data.batch_item.properties, self.input_data.url)
1602 
1603  # Apply property 'PDATE_TIME'
1604  self.input_data.batch_item.urlObj.pDate = self.pubdate
1605  self.pubdate = FieldsSQLExpressionEvaluator.evaluatePDateTime(self.input_data.batch_item.properties,
1606  self.dbWrapper,
1607  self.input_data.batch_item.urlObj,
1608  self.logger,
1609  self.pubdate)
1610 
1611  # Apply property 'PDATE_TIMEZONES'
1612  self.pubdate, timezone = self.pubdateTransform(self.pubdate,
1613  timezone,
1614  self.input_data.batch_item.properties,
1615  self.input_data.url)
1616 
1617  # Add tag 'pubdate_tz'
1618  self.addCustomTag(result=response, tag_name=CONSTS.TAG_PUBDATE_TZ, tag_value=[timezone])
1619 
1620  if "pubdate" in response.tags and "data" in response.tags["pubdate"] and \
1621  len(response.tags["pubdate"]["data"]) > 0:
1622  response.tags["pubdate"]["data"][0] = self.pubdate
1623 
1624  if self.outputFormat is None:
1625  self.logger.debug(">>> Warning, can't extract output format")
1626  else:
1627  self.formatOutputData(response, self.outputFormat)
1628 
1629  response.recalcTagMaskCount(None, altTagsMask)
1630 
1631  self.logger.debug("response.tagsCount: " + str(response.tagsCount) + \
1632  " response.tagsMasks: " + str(response.tagsMask) + \
1633  "\n>>> Resp: " + varDump(response))
1634 
1635  # Get start and finish times
1636  startTime = 0
1637  if len(responses) > 0:
1638  startTime = responses[0].start
1639 
1640  finishTime = time.time()
1641  # recalculate spend time
1642  for response in responses:
1643  response.start = startTime
1644  response.finish = finishTime
1645  response.data["time"] = "%s" % str(finishTime - startTime)
1646 
1647  response = self.applyHTTPRedirectLink(self.input_data.batch_item.siteId, self.input_data.batch_item.urlObj.url,
1648  self.input_data.batch_item.properties, response)
1649 
1650  # get processed content and append to list of scraper responses
1651  processedContent = self.getProcessedContent(response)
1652  scraperResponseList.append(ScraperResponse(response.tagsCount, response.tagsMask, self.pubdate, \
1653  processedContent, self.errorMask))
1654 
1655  self.logger.debug('len(scraperResponseList): ' + varDump(len(scraperResponseList)))
1656  self.logger.debug('maxURLsFromPage: ' + str(self.input_data.batch_item.urlObj.maxURLsFromPage))
1657 
1658  # check allowed limits
1659  if self.input_data.batch_item.urlObj.maxURLsFromPage is not None and \
1660  int(self.input_data.batch_item.urlObj.maxURLsFromPage) > 0 and \
1661  int(self.input_data.batch_item.urlObj.maxURLsFromPage) < len(scraperResponseList):
1662  self.logger.debug('>>> scraperResponseList 1')
1663  scraperResponseList = scraperResponseList[0: int(self.input_data.batch_item.urlObj.maxURLsFromPage)]
1664  self.logger.debug('>>> scraperResponseList 2')
1665  scraperResponseList[-1].errorMask |= APP_CONSTS.ERROR_MAX_URLS_FROM_PAGE
1666  self.logger.debug("Truncated scraper responces list because over limit 'maxURLsFromPage' = " + \
1667  str(self.input_data.batch_item.urlObj.maxURLsFromPage) + " set errorMask = " + \
1668  str(APP_CONSTS.ERROR_MAX_URLS_FROM_PAGE))
1669 
1670  # send response to the stdout
1671  sys.stdout = tmp
1672 
1673  # output result of scraping
1674  if self.usageModel == APP_CONSTS.APP_USAGE_MODEL_PROCESS:
1675  output_pickled_object = pickle.dumps(scraperResponseList)
1676  Utils.storePickleOnDisk(output_pickled_object, self.ENV_SCRAPER_STORE_PATH,
1677  "scraper.out." + str(self.input_data.urlId))
1678  print output_pickled_object
1679  sys.stdout.flush()
1680  else:
1681  self.output_data = scraperResponseList
1682  self.logger.debug('self.output_data: ' + str(varDump(self.output_data)))
1683 
1684 

◆ refineBadDateTags()

def dc_processor.ScraperMultiItemsTask.refineBadDateTags (   self,
  response 
)

Definition at line 1090 of file ScraperMultiItemsTask.py.

1090  def refineBadDateTags(self, response):
1091  removeKeys = []
1092  for key in response.tags:
1093  if key in self.DATA_NEWS_TAGS:
1094  tagsValue = None
1095 
1096  if isinstance(response.tags[key], str) or isinstance(response.tags[key], unicode):
1097  tagsValue = response.tags[key]
1098  elif isinstance(response.tags[key], dict) and "data" in response.tags[key]:
1099  if isinstance(response.tags[key]["data"], str) or isinstance(response.tags[key]["data"], unicode):
1100  tagsValue = response.tags[key]["data"]
1101  elif isinstance(response.tags[key]["data"], list) and len(response.tags[key]["data"]) > 0 and \
1102  isinstance(response.tags[key]["data"][0], str) or isinstance(response.tags[key]["data"][0], unicode):
1103  tagsValue = response.tags[key]["data"][0]
1104 
1105  if tagsValue is not None:
1106  try:
1107  dt = parser.parse(tagsValue)
1108  int(time.mktime(dt.timetuple()))
1109  except Exception:
1110  removeKeys.append(key)
1111 
1112  for key in removeKeys:
1113  if key in response.tags:
1114  logging.debug(">>> Remove " + key + " element besause it empty")
1115  del response.tags[key]
1116 
1117 
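refineBadDateTags() keeps a date tag only if its value can be parsed by dateutil and converted to a UNIX timestamp; otherwise the key is collected and removed from response.tags in a second pass. A reduced sketch of just that validation step, assuming a plain dict of tag values and that python-dateutil is installed:

    import time
    from dateutil import parser

    def dropUnparsableDates(tags, dateKeys):
        # collect tag keys whose value does not parse as a calendar date
        removeKeys = []
        for key in dateKeys:
            if key not in tags:
                continue
            try:
                dt = parser.parse(tags[key])
                time.mktime(dt.timetuple())  # also rejects dates outside the mktime range
            except (TypeError, ValueError, OverflowError):
                removeKeys.append(key)
        # second pass: drop the invalid entries, mirroring the listing above
        for key in removeKeys:
            del tags[key]
        return tags

    dropUnparsableDates({"dc_date": "not a date", "title": "x"}, ["dc_date"])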

◆ run()

def dc_processor.ScraperMultiItemsTask.run (   self)

Definition at line 465 of file ScraperMultiItemsTask.py.

465  def run(self):
466  if self.usageModel == APP_CONSTS.APP_USAGE_MODEL_PROCESS:
467  # call base class run method
468  foundation.CementApp.run(self)
469  # get input data from stdin
470  self.input_data = self.__getInputData()
471 
472  # call initialization application
473  config = self.__initApp(self.configFile)
474 
475  self.process(config)
476 
477  if self.usageModel == APP_CONSTS.APP_USAGE_MODEL_PROCESS:
478  # Finish logging
479  self.logger.info(APP_CONSTS.LOGGER_DELIMITER_LINE)
480 
481 
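run() only goes through the Cement/stdin path when usageModel equals APP_CONSTS.APP_USAGE_MODEL_PROCESS; with any other value the task is driven in-process, with the batch passed as inputData to the constructor and the results read back from output_data. A hedged sketch of that in-process mode (the import path and the placeholder usage-model value are assumptions; the real constant for the non-process model is not shown on this page):

    from dc_processor.ScraperMultiItemsTask import ScraperMultiItemsTask

    def runInProcess(configFile, logger, batchInput):
        # any usageModel other than APP_CONSTS.APP_USAGE_MODEL_PROCESS skips the
        # stdin/stdout handling; None is only a placeholder for that other constant
        task = ScraperMultiItemsTask(usageModel=None, configFile=configFile,
                                     logger=logger, inputData=batchInput)
        task.setup()
        task.run()
        # in this mode the scraper responses stay on the instance
        return task.output_data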

◆ setup()

def dc_processor.ScraperMultiItemsTask.setup (   self)

Definition at line 459 of file ScraperMultiItemsTask.py.

459  def setup(self):
460  if self.usageModel == APP_CONSTS.APP_USAGE_MODEL_PROCESS:
461  # call base class setup method
462  foundation.CementApp.setup(self)
463 

◆ templateExtraction()

def dc_processor.ScraperMultiItemsTask.templateExtraction (   self,
  config,
  urlHost 
)

Definition at line 1174 of file ScraperMultiItemsTask.py.

1174  def templateExtraction(self, config, urlHost):
1175  self.extractor = ScrapyExtractor(config, self.input_data.template, urlHost)
1176  sel = Selector(text=self.input_data.raw_content)
1177  if isinstance(self.input_data.template, dict):
1178  template = self.input_data.template
1179  else:
1180  # template = ast.literal_eval(self.input_data.template)
1181  # TODO:strange potential backdoor for malicious code, cancelled by bgv
1182  pass
1183 
1184  # Calculate mandatory properties for exist tags
1185  mandatoryTags = {}
1186  for key, value in template.items():
1187  isMandatory = True
1188  self.logger.debug(">>> Calculate mandatory for '" + str(key) + "'")
1189  for elem in value:
1190  self.logger.debug(">>> mandatory = " + str(elem["mandatory"]) + " type: " + str(type(elem["mandatory"])))
1191  if bool(elem["mandatory"]) is False:
1192  isMandatory = False
1193  continue
1194 
1195  mandatoryTags[key] = isMandatory
1196 
1197  self.logger.debug(">>> Calculated mandatoryTags: " + varDump(mandatoryTags))
1198 
1199  scraperDocs = ScraperResultDocuments(template.keys(), self.input_data.urlId)
1200 
1201  # Add End
1202  for key in template:
1203  self.logger.debug(">>> Template key: " + key)
1204  if "state" in template[key] and not bool(int(template[key]["state"])):
1205  self.logger.debug(">>> Template disable: template name = " + str(key))
1206  continue
1207  for path in template[key]:
1208  if not isinstance(path, dict):
1209  self.logger.debug(">>> WARNING path not DICT type ")
1210  continue
1211 
1212  isExtract = True
1213  localResult = Result(None, self.input_data.urlId)
1214  # Added new template format conversion
1215  xpath = None
1216  xpathValue = None
1217 
1218  # Logging xPath trees
1219  self.logger.debug(">>> Logging xPath trees for key: '" + str(key) + "'")
1220  etrees = sel.xpath(path['target'])
1221  for etree in etrees:
1222 
1223  self.logger.debug(">>> etree: " + varDump(etree))
1224  if isinstance(etree._root, basestring): # pylint: disable=W0212
1225  continue
1226 
1227  etreeValue = self.get_path(etree._root) # pylint: disable=W0212
1228  self.logger.debug('>>> etreeValue: ' + varDump(etreeValue))
1229  scraperDocs.addEtree(key, copy.deepcopy(etreeValue))
1230 
1231  # Added new template type specification
1232  xPathPreparing = TemplateExtractorXPathPreparing(self.properties[CONSTS.TAG_MARKUP_PROP_NAME] \
1233  if CONSTS.TAG_MARKUP_PROP_NAME in self.properties else None)
1234 
1235  self.logger.debug(">>> xPathPreparing: " + varDump(xPathPreparing))
1236  self.logger.debug(">>> path: " + varDump(path))
1237  self.logger.debug(">>> sel: " + varDump(sel))
1238 
1239  self.logger.debug(">>> self.properties: " + varDump(self.properties))
1240  # Added new template type specification
1241  self.xpathSplitString = xPathPreparing.resolveDelimiter(path, self.properties, self.xpathSplitString)
1242  innerDelimiter = xPathPreparing.resolveInnerDelimiter(path, self.properties)
1243  self.logger.debug(">>> xpathSplitString: '" + str(self.xpathSplitString) + "'")
1244  self.logger.debug(">>> innerDelimiter: '" + str(innerDelimiter) + "'")
1245  try:
1246  xpath, xpathValue = xPathPreparing.process(path, sel, self.xpathSplitString, innerDelimiter,
1247  Utils.innerTextToList)
1248  except Exception, err:
1249  ExceptionLog.handler(self.logger, err, "some rule/xpath exception:", (), \
1250  {ExceptionLog.LEVEL_NAME_ERROR:ExceptionLog.LEVEL_VALUE_DEBUG})
1251  continue
1252 
1253  self.logger.debug(">>> xpathValue " + str(type(xpathValue)) + " " + str(xpathValue))
1254  self.logger.debug(">>> xpath: %s" % str(xpath))
1255  if (isinstance(xpathValue, list) and len(xpathValue) == 0) or\
1256  (isinstance(xpathValue, basestring) and xpathValue == ''):
1257  self.logger.debug(">>> set default xpathValue")
1258  xpathValue = []
1259  xpathValue.append(path["default"])
1260  isExtract = False
1261 
1262  if not isinstance(xpathValue, list):
1263  xpathValue = [xpathValue]
1264 
1265  for xpathElem in xpathValue:
1266  elemResult = copy.deepcopy(localResult)
1267  self.logger.debug("result before:\n%s", varDump(elemResult))
1268  self.extractor.addTag(result=elemResult, tag_name=key, tag_value=xpathElem, xpath=xpath,
1269  isDefaultTag=(not isExtract), callAdjustment=False, tagType=path["type"],
1270  allowNotFilled=True)
1271 
1272  self.logger.debug("result after:\n%s", varDump(elemResult))
1273 
1274  self.logger.debug(">>> tag type = " + str(type(elemResult.tags)))
1275  self.logger.debug(">>> tags data type = " + str(type(elemResult.tags[key]["data"])))
1276 
1277  if key in elemResult.tags and isinstance(elemResult.tags[key]["data"], basestring):
1278  self.logger.debug(">>> Convert result = " + str(key))
1279  localString = elemResult.tags[key]["data"]
1280  elemResult.tags[key]["data"] = []
1281  elemResult.tags[key]["data"].append(localString)
1282 
1283  if isExtract and "postProcessing" in path and path["postProcessing"] is not None and \
1284  path["postProcessing"] != "":
1285  self.applyPostProcessing(elemResult, key, path["postProcessing"])
1286 
1287 
1288  self.logger.debug("scraperDocs.addDoc key: " + str(key) + ' mandatory = ' + varDump(mandatoryTags[key]))
1289 
1290  scraperDocs.addDoc(key, elemResult, path["join"], isExtract,
1291  (bool(path["mandatory"]) if "mandatory" in path else False))
1292 
1293  # for response
1294  resultsList = []
1295  resultDocs = scraperDocs.getAllDocs(mandatoryTags, self.logger)
1296 
1297  for elem in resultDocs:
1298  result = Result(None, self.input_data.urlId)
1299  # Add tag 'order_number'
1300  self.addCustomTag(result=result, tag_name=CONSTS.TAG_ORDER_NUMBER, \
1301  tag_value=str(elem[CONSTS.TAG_ORDER_NUMBER]))
1302  # Add tag 'source_url'
1303  self.addCustomTag(result=result, tag_name=CONSTS.TAG_SOURCE_URL, \
1304  tag_value=[self.input_data.url])
1305 
1306  # Prepare result
1307  prepareResultsList = self.prepareResults([elem])
1308  self.compileResults(result, prepareResultsList, key, xPathPreparing)
1309  result.finish = time.time()
1310  resultsList.append(copy.deepcopy(result))
1311 
1312  return resultsList
1313 
1314 
1315 # # # Add custom tag
1316 # #
1317 # # @param result - Scraper result instance
1318 # # @param tag_name - value name of tag
1319 # # @param tag_value - value value of tag
1320 # # @return - None
1321 # def addCustomTag(self, result, tag_name, tag_value):
1322 # data = {"extractor": "Base extractor", "data": "", "name": ""}
1323 # data["data"] = tag_value
1324 # data["name"] = tag_name
1325 # data["xpath"] = None
1326 # data["type"] = None
1327 # data["extractor"] = self.__class__.__name__
1328 # result.tags[tag_name] = data
1329 
1330 
1331 # def compileResults(self, result, resultsList, key, xPathPreparing=None):
1332 # for elem in resultsList:
1333 # if key in result.tags:
1334 # if result.tags[key] is not None:
1335 # if result.tags[key]["xpath"] is None:
1336 # result.tags[key]["xpath"] = elem["obj"].tags[key]["xpath"]
1337 # else:
1338 # result.tags[key]["xpath"] += ' '
1339 # result.tags[key]["xpath"] += elem["obj"].tags[key]["xpath"]
1340 # if result.tags[key]["data"] is None or len(result.tags[key]["data"]) == 0:
1341 # result.tags[key]["data"] = elem["obj"].tags[key]["data"]
1342 # else:
1343 # if xPathPreparing is not None:
1344 # self.xpathSplitString = xPathPreparing.resolveDelimiter(elem, self.properties, self.xpathSplitString)
1345 # result.tags[key]["data"][0] += self.xpathSplitString
1346 # else:
1347 # result.tags[key]["data"][0] += ' '
1348 # result.tags[key]["data"][0] += elem["obj"].tags[key]["data"][0]
1349 # else:
1350 # result.tags.update(elem["obj"].tags)
1351 
1352 
1353 # def prepareResults(self, resultsList):
1354 # ret = []
1355 # if len(resultsList) > 0:
1356 # localElemWeight = 0
1357 # firstElemWeight = 0
1358 # firstElem = None
1359 # tempList = []
1360 # for elem in resultsList:
1361 # localElemWeight = 0
1362 # if elem["join"] == "concat":
1363 # tempList.append(elem)
1364 # else:
1365 # if elem["mandatory"]:
1366 # #>>> Mandatory breaking block -------------
1367 # if not elem["isExtract"]:
1368 # return []
1369 # #-------------
1370 # localElemWeight = localElemWeight | CONSTS.TAGS_RULES_MASK_MANDATORY_FIELD
1371 # if elem["join"] == "best":
1372 # localElemWeight = localElemWeight | CONSTS.TAGS_RULES_MASK_RULE_PRIORITY
1373 # if elem["isExtract"]:
1374 # localElemWeight = localElemWeight | CONSTS.TAGS_RULES_MASK_DEFAULT_VALUE
1375 #
1376 # self.logger.debug(">>> Rule weight = " + str(localElemWeight))
1377 # self.logger.debug(">>> Rule join = " + elem["join"])
1378 # if localElemWeight > firstElemWeight:
1379 # firstElemWeight = localElemWeight
1380 # firstElem = elem
1381 #
1382 # if firstElem is not None:
1383 # tempList = [firstElem] + tempList
1384 # isExtractResults = any([elem["isExtract"] for elem in tempList])
1385 # if isExtractResults:
1386 # ret = [elem for elem in tempList if elem["isExtract"]]
1387 # else:
1388 # ret.append(tempList[0])
1389 # return ret
1390 
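In the listing above each template rule (a "path" dict) is evaluated against the page; an empty XPath result is replaced by the rule's "default" value with isExtract reset to False, and that flag together with the rule's "join" and "mandatory" options drives how ScraperResultDocuments assembles the final documents. A reduced sketch of just the fallback step, with the rule dict layout taken from the listing and everything else simplified away:

    def resolveRuleValue(rule, xpathValue):
        # empty extraction result -> fall back to the rule's default value and
        # mark the tag as non-extracted, as templateExtraction() does
        isExtract = True
        if (isinstance(xpathValue, list) and len(xpathValue) == 0) or xpathValue == '':
            xpathValue = [rule["default"]]
            isExtract = False
        # downstream code always works with a list of values
        if not isinstance(xpathValue, list):
            xpathValue = [xpathValue]
        return xpathValue, isExtract

    values, extracted = resolveRuleValue({"default": "n/a"}, [])
    # -> (["n/a"], False); the flag later feeds the mandatory/join bookkeeping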
1391 

Variable Documentation

◆ configFile

dc_processor.ScraperMultiItemsTask.configFile

Definition at line 440 of file ScraperMultiItemsTask.py.

◆ DATA_NEWS_TAGS

list dc_processor.ScraperMultiItemsTask.DATA_NEWS_TAGS = [CONSTS.TAG_DC_DATE]

Definition at line 418 of file ScraperMultiItemsTask.py.

◆ datetimeTemplateTypes

dc_processor.ScraperMultiItemsTask.datetimeTemplateTypes

Definition at line 453 of file ScraperMultiItemsTask.py.

◆ dbWrapper

dc_processor.ScraperMultiItemsTask.dbWrapper

Definition at line 454 of file ScraperMultiItemsTask.py.

◆ ENV_SCRAPER_STORE_PATH

string dc_processor.ScraperMultiItemsTask.ENV_SCRAPER_STORE_PATH = "self.ENV_SCRAPER_STORE_PATH"

Definition at line 416 of file ScraperMultiItemsTask.py.

◆ errorMask

dc_processor.ScraperMultiItemsTask.errorMask

Definition at line 450 of file ScraperMultiItemsTask.py.

◆ exitCode

dc_processor.ScraperMultiItemsTask.exitCode

Definition at line 438 of file ScraperMultiItemsTask.py.

◆ EXTENDED_NEWS_TAGS

dictionary dc_processor.ScraperMultiItemsTask.EXTENDED_NEWS_TAGS = {"description": ["//meta[@name='description']//@content"]}

Definition at line 417 of file ScraperMultiItemsTask.py.
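This XPath pulls the content attribute of the page's meta description element. A quick way to see what it yields, using the same Scrapy Selector that the scraper builds over raw_content (the HTML snippet below is only a throw-away example):

    from scrapy.selector import Selector

    html = '<html><head><meta name="description" content="sample page"/></head></html>'
    # evaluate the EXTENDED_NEWS_TAGS XPath for the "description" tag
    values = Selector(text=html).xpath("//meta[@name='description']//@content").extract()
    # values == [u'sample page'] under the Python 2 interpreter this codebase targets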

◆ extractor

dc_processor.ScraperMultiItemsTask.extractor

Definition at line 446 of file ScraperMultiItemsTask.py.

◆ extractors

dc_processor.ScraperMultiItemsTask.extractors

Definition at line 447 of file ScraperMultiItemsTask.py.

◆ input_data

dc_processor.ScraperMultiItemsTask.input_data

Definition at line 442 of file ScraperMultiItemsTask.py.

◆ itr

dc_processor.ScraperMultiItemsTask.itr

Definition at line 448 of file ScraperMultiItemsTask.py.

◆ logger

dc_processor.ScraperMultiItemsTask.logger

Definition at line 441 of file ScraperMultiItemsTask.py.

◆ mediaLimitsHandler

dc_processor.ScraperMultiItemsTask.mediaLimitsHandler

Definition at line 455 of file ScraperMultiItemsTask.py.

◆ MSG_ERROR_ADJUST_LINK_URL

string dc_processor.ScraperMultiItemsTask.MSG_ERROR_ADJUST_LINK_URL = "Error adjust link URL. "

Definition at line 408 of file ScraperMultiItemsTask.py.

◆ MSG_ERROR_ADJUST_PR

string dc_processor.ScraperMultiItemsTask.MSG_ERROR_ADJUST_PR = "Error adjust partial references. "

Definition at line 405 of file ScraperMultiItemsTask.py.

◆ MSG_ERROR_ADJUST_PUBDATE

string dc_processor.ScraperMultiItemsTask.MSG_ERROR_ADJUST_PUBDATE = "Error adjust PUBDATE. "

Definition at line 406 of file ScraperMultiItemsTask.py.

◆ MSG_ERROR_ADJUST_TITLE

string dc_processor.ScraperMultiItemsTask.MSG_ERROR_ADJUST_TITLE = "Error adjust title. "

Definition at line 407 of file ScraperMultiItemsTask.py.

◆ MSG_ERROR_EMPTY_CONFIG_FILE_NAME

string dc_processor.ScraperMultiItemsTask.MSG_ERROR_EMPTY_CONFIG_FILE_NAME = "Config file name is empty."

Definition at line 391 of file ScraperMultiItemsTask.py.

◆ MSG_ERROR_GET_PROPERTIES

string dc_processor.ScraperMultiItemsTask.MSG_ERROR_GET_PROPERTIES = "Error getting properties from input data"

Definition at line 402 of file ScraperMultiItemsTask.py.

◆ MSG_ERROR_INPUT_DATA_NONE

string dc_processor.ScraperMultiItemsTask.MSG_ERROR_INPUT_DATA_NONE = "Input data is none"

Definition at line 399 of file ScraperMultiItemsTask.py.

◆ MSG_ERROR_INPUT_DATA_WITHOUT_BATCH

string dc_processor.ScraperMultiItemsTask.MSG_ERROR_INPUT_DATA_WITHOUT_BATCH = "Input data without batch item."

Definition at line 400 of file ScraperMultiItemsTask.py.

◆ MSG_ERROR_INPUT_DATA_WITHOUT_PROPERTIES

string dc_processor.ScraperMultiItemsTask.MSG_ERROR_INPUT_DATA_WITHOUT_PROPERTIES = "Input data has batch item without 'properties'."

Definition at line 401 of file ScraperMultiItemsTask.py.

◆ MSG_ERROR_LOAD_APP_CONFIG

string dc_processor.ScraperMultiItemsTask.MSG_ERROR_LOAD_APP_CONFIG = "Error loading application config file."

Definition at line 395 of file ScraperMultiItemsTask.py.

◆ MSG_ERROR_LOAD_EXTRACTORS

string dc_processor.ScraperMultiItemsTask.MSG_ERROR_LOAD_EXTRACTORS = "Error load extractors "

Definition at line 404 of file ScraperMultiItemsTask.py.

◆ MSG_ERROR_LOAD_PROPERTIES_FROM_FILE

string dc_processor.ScraperMultiItemsTask.MSG_ERROR_LOAD_PROPERTIES_FROM_FILE = "Error load Scraper multi items properties from file"

Definition at line 394 of file ScraperMultiItemsTask.py.

◆ MSG_ERROR_PARSE_CMD_PARAMS

string dc_processor.ScraperMultiItemsTask.MSG_ERROR_PARSE_CMD_PARAMS = "Error parse command line parameters."

Definition at line 390 of file ScraperMultiItemsTask.py.

◆ MSG_ERROR_READ_INPUT_DATA

string dc_processor.ScraperMultiItemsTask.MSG_ERROR_READ_INPUT_DATA = "Error read input data from stdin."

Definition at line 398 of file ScraperMultiItemsTask.py.

◆ MSG_ERROR_READ_LOG_CONFIG

string dc_processor.ScraperMultiItemsTask.MSG_ERROR_READ_LOG_CONFIG = "Error read log config file."

Definition at line 396 of file ScraperMultiItemsTask.py.

◆ MSG_ERROR_WRONG_CONFIG_FILE_NAME

string dc_processor.ScraperMultiItemsTask.MSG_ERROR_WRONG_CONFIG_FILE_NAME = "Config file name is wrong"

Definition at line 392 of file ScraperMultiItemsTask.py.

◆ OPTION_SECTION_DATETIME_TEMPLATE_TYPES

string dc_processor.ScraperMultiItemsTask.OPTION_SECTION_DATETIME_TEMPLATE_TYPES = 'tags_datetime_template_types'

Definition at line 422 of file ScraperMultiItemsTask.py.

◆ output_data

dc_processor.ScraperMultiItemsTask.output_data

Definition at line 445 of file ScraperMultiItemsTask.py.

◆ outputFormat

dc_processor.ScraperMultiItemsTask.outputFormat

Definition at line 444 of file ScraperMultiItemsTask.py.

◆ properties

dc_processor.ScraperMultiItemsTask.properties

Definition at line 443 of file ScraperMultiItemsTask.py.

◆ pubdate

dc_processor.ScraperMultiItemsTask.pubdate

Definition at line 449 of file ScraperMultiItemsTask.py.

◆ SCRAPER_MULTI_ITEMS_OPTION_LOG

string dc_processor.ScraperMultiItemsTask.SCRAPER_MULTI_ITEMS_OPTION_LOG = "log"

Definition at line 412 of file ScraperMultiItemsTask.py.

◆ SCRAPER_MULTI_ITEMS_OPTION_PROPERTY_JSON_FILE

string dc_processor.ScraperMultiItemsTask.SCRAPER_MULTI_ITEMS_OPTION_PROPERTY_JSON_FILE = "property_file_name"

Definition at line 413 of file ScraperMultiItemsTask.py.

◆ TAGS_DATETIME_TEMPLATE_TYPES

list dc_processor.ScraperMultiItemsTask.TAGS_DATETIME_TEMPLATE_TYPES = [CONSTS.TAG_TYPE_DATETIME]

Definition at line 421 of file ScraperMultiItemsTask.py.

◆ usageModel

dc_processor.ScraperMultiItemsTask.usageModel

Definition at line 439 of file ScraperMultiItemsTask.py.

◆ useCurrentYear

dc_processor.ScraperMultiItemsTask.useCurrentYear

Definition at line 452 of file ScraperMultiItemsTask.py.

◆ xpathSplitString

dc_processor.ScraperMultiItemsTask.xpathSplitString

Definition at line 451 of file ScraperMultiItemsTask.py.