HCE Project: Python language Distributed Tasks Manager Application, Distributed Crawler Application, and client API bindings. Version 2.0.0-chaika
Hierarchical Cluster Engine Python language binding
dc_processor.ScraperCustomJson Namespace Reference

Classes

class  Meta
 
class  ScraperCustomJson
 

Functions

def __init__ (self, usageModel=APP_CONSTS.APP_USAGE_MODEL_PROCESS, configFile=None, logger=None, inputData=None)
 
def setup (self)
 
def run (self)
 
def loadConfig (self)
 
def loadLogConfigFile (self)
 
def loadOptions (self)
 
def loadScraperProperties (self)
 
def processBatch (self)
 
def loadExtractors (self)
 
def createModule (self, module_name)
 
def getNextBestExtractor (self)
 
def resourceExtraction (self, jsonElem)
 
def formatOutpuElement (self, elem, localOutputFormat)
 
def formatOutputData (self, response, localOutputFormat)
 
def jsonParserExtractor (self, jsonElem)
 
def getProcessedContent (self, result)
 
def fillScraperResponse (self, jsonElem)
 
def generateEmptyResponse (self)
 
def jsonParserProcess (self)
 
def getExitCode (self)
 

Variables

int ERROR_OK = 0
 
int EXIT_SUCCESS = 0
 
int EXIT_FAILURE = 1
 
string MSG_ERROR_LOAD_EXTRACTORS = "Error load extractors "
 
string ENV_SCRAPER_STORE_PATH = "ENV_SCRAPER_STORE_PATH"
 
list TAGS_DATETIME_NEWS_NAMES = [CONSTS.TAG_PUB_DATE, CONSTS.TAG_DC_DATE]
 
string MSG_ERROR_WRONG_CONFIG_FILE_NAME = "Config file name is wrong"
 
list TAGS_DATETIME_TEMPLATE_TYPES = [CONSTS.TAG_TYPE_DATETIME]
 
string OPTION_SECTION_DATETIME_TEMPLATE_TYPES = 'tags_datetime_template_types'
 
 exitCode
 
 usageModel
 
 configFile
 
 logger
 
 input_data
 
 properties
 
 extractor
 
 extractors
 
 itr
 
 pubdate
 
 timezone
 
 errorMask
 
 scraperPropFileName
 
 algorithm_name
 
 scraperResponses
 
 tagsCount
 
 tagsMask
 
 processedContent
 
 outputFormat
 
 metrics
 
 altTagsMask
 
 urlHost
 
 output_data
 
 dbWrapper
 
 datetimeTemplateTypes
 
 useCurrentYear
 
 config
 

Function Documentation

◆ __init__()

def dc_processor.ScraperCustomJson.__init__ (   self,
  usageModel = APP_CONSTS.APP_USAGE_MODEL_PROCESS,
  configFile = None,
  logger = None,
  inputData = None 
)

Definition at line 85 of file ScraperCustomJson.py.

85  def __init__(self, usageModel=APP_CONSTS.APP_USAGE_MODEL_PROCESS, configFile=None, logger=None, inputData=None):
86  if usageModel == APP_CONSTS.APP_USAGE_MODEL_PROCESS:
87  # call base class __init__ method
88  # #foundation.CementApp.__init__(self)
89  Scraper.__init__(self)
90 
91  self.exitCode = APP_CONSTS.EXIT_SUCCESS
92  self.usageModel = usageModel
93  self.configFile = configFile
94  self.logger = logger
95  self.input_data = inputData
96  self.properties = {}
97  self.extractor = None
98  self.extractors = []
99  self.itr = None
100  self.pubdate = None
101  self.timezone = None
102  self.errorMask = APP_CONSTS.ERROR_OK
103  self.scraperPropFileName = None
104  self.algorithm_name = None
105  self.scraperResponses = []
106  self.tagsCount = 0
107  self.tagsMask = 0
108  self.pubdate = None
109  self.processedContent = None
110  self.outputFormat = None
111  self.metrics = None
112  self.altTagsMask = None
113  self.errorMask = APP_CONSTS.ERROR_OK
114  self.urlHost = None
115  self.output_data = None
116  self.dbWrapper = None
117  self.datetimeTemplateTypes = []
118  self.useCurrentYear = 0
119 
120 

◆ createModule()

def dc_processor.ScraperCustomJson.createModule (   self,
  module_name 
)

Definition at line 374 of file ScraperCustomJson.py.

374  def createModule(self, module_name):
375  appInst = None
376  try:
377  appInst = (module_name, eval(module_name)(self.config, None, self.urlHost, self.properties))[1] # pylint: disable=W0123
378  self.logger.debug("%s has been created!" % module_name)
379  except Exception as err:
380  ExceptionLog.handler(self.logger, err, "Can't create module %s. Error is:" % (module_name))
381 
382  return appInst
383 
384 
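The tuple-and-index expression above uses eval() to resolve an extractor class by its name and instantiate it with (config, None, urlHost, properties). A minimal sketch of the same lookup using an explicit registry instead of eval(); NewsExtractor is a hypothetical stand-in, not a real module:

# Hypothetical stand-in for a real extractor module class.
class NewsExtractor(object):
    def __init__(self, config, dbWrapper, urlHost, properties):
        self.name = self.__class__.__name__

def create_module(module_name, config, urlHost, properties):
    # A dict registry replaces eval(module_name); None is returned on
    # failure, matching createModule() above.
    cls = {"NewsExtractor": NewsExtractor}.get(module_name)
    return cls(config, None, urlHost, properties) if cls is not None else None

inst = create_module("NewsExtractor", config={}, urlHost="example.com", properties={})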

◆ fillScraperResponse()

def dc_processor.ScraperCustomJson.fillScraperResponse (   self,
  jsonElem 
)

Definition at line 566 of file ScraperCustomJson.py.

566  def fillScraperResponse(self, jsonElem):
567  self.tagsCount = 0
568  self.tagsMask = 0
569  self.pubdate = None
570  self.processedContent = None
571  self.errorMask = APP_CONSTS.ERROR_OK
572  self.jsonParserExtractor(jsonElem)
573  return ScraperResponse(self.tagsCount, self.tagsMask, self.pubdate, self.processedContent, self.errorMask)
574 
575 

◆ formatOutpuElement()

def dc_processor.ScraperCustomJson.formatOutpuElement (   self,
  elem,
  localOutputFormat 
)

Definition at line 433 of file ScraperCustomJson.py.

433  def formatOutpuElement(self, elem, localOutputFormat):
434  ret = elem
435  if localOutputFormat == "json":
436  # self.logger.debug(">>> JSON HTML = " + elem)
437  localStr = json.dumps(elem, ensure_ascii=False)
438  if localStr[0] == '\"' or localStr[0] == '\'':
439  localStr = localStr[1:]
440  if localStr[-1] == '\"' or localStr[-1] == '\'':
441  localStr = localStr[0:-1]
442  ret = localStr
443  # self.logger.debug(">>> JSON HTML = " + ret)
444  elif localOutputFormat == "html" or localOutputFormat == "xml":
445  ret = xml.sax.saxutils.escape(elem, {"'": "&#39;", "\"" : "&quot;"})
446  elif localOutputFormat == "sql":
447  # ret = mdb.escape_string(elem) # pylint: disable=E1101
448  ret = Utils.escape(elem)
449  return ret
450 
451 
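A standalone sketch of the three escaping branches above, runnable on its own; Utils.escape is replaced here by a trivial stand-in, so the "sql" branch is illustrative only:

import json
import xml.sax.saxutils

def format_element(elem, fmt):
    if fmt == "json":
        s = json.dumps(elem, ensure_ascii=False)
        # json.dumps wraps strings in quotes; drop one from each end
        if s and s[0] in ('"', "'"):
            s = s[1:]
        if s and s[-1] in ('"', "'"):
            s = s[:-1]
        return s
    elif fmt in ("html", "xml"):
        return xml.sax.saxutils.escape(elem, {"'": "&#39;", "\"": "&quot;"})
    elif fmt == "sql":
        return elem.replace("'", "\\'")  # stand-in for Utils.escape
    return elem

print(format_element("it's <b>bold</b>", "xml"))  # it&#39;s &lt;b&gt;bold&lt;/b&gt;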

◆ formatOutputData()

def dc_processor.ScraperCustomJson.formatOutputData (   self,
  response,
  localOutputFormat 
)

Definition at line 454 of file ScraperCustomJson.py.

454  def formatOutputData(self, response, localOutputFormat):
455  # result.tags[key]["data"]
456  for key in response.tags:
457  if "data" in response.tags[key]:
458  if isinstance(response.tags[key]["data"], types.ListType):
459  for i, elem in enumerate(response.tags[key]["data"]):
460  response.tags[key]["data"][i] = self.formatOutpuElement(elem, localOutputFormat)
461  elif isinstance(response.tags[key]["data"], types.StringTypes):
462  response.tags[key]["data"] = self.formatOutpuElement(response.tags[key]["data"], localOutputFormat)
463 
464 

◆ generateEmptyResponse()

def dc_processor.ScraperCustomJson.generateEmptyResponse (   self)

Definition at line 578 of file ScraperCustomJson.py.

578  def generateEmptyResponse(self):
579  localResult = Result(self.config, self.input_data.urlId, self.metrics)
580  # Add tag 'source_url'
581  self.addCustomTag(result=localResult, tag_name=CONSTS.TAG_SOURCE_URL, tag_value=[str(self.input_data.url)])
582  self.getProcessedContent([localResult])
583  return ScraperResponse(0, 0, self.pubdate, self.processedContent, APP_CONSTS.ERROR_MASK_SCRAPER_ERROR)
584 
585 

◆ getExitCode()

def dc_processor.ScraperCustomJson.getExitCode (   self)

Definition at line 618 of file ScraperCustomJson.py.

618  def getExitCode(self):
619  return self.exitCode
620 
621 
622 # # # Add custom tag
623 # #
624 # # @param result - Scraper result instance
625 # # @param tag_name - name of the tag
626 # # @param tag_value - value of the tag
627 # # @return - None
628 # def addCustomTag(self, result, tag_name, tag_value):
629 # data = {"extractor": "Base extractor", "data": "", "name": ""}
630 # data["data"] = tag_value
631 # data["name"] = tag_name
632 # data["xpath"] = None
633 # data["type"] = None
634 # data["extractor"] = self.__class__.__name__
635 # if tag_name not in result.tags:
636 # result.tags[tag_name] = data
637 
638 
639 # # # Normalize datetime tags procedure
640 # #
641 # # @param response - scraper response instance
642 # # @param algorithmName - algorithm name
643 # # @return - 'pubdate tag value'
644 # def normalizeDatetime(self, response, algorithmName):
645 # ret = None
646 # timezone = ''
647 # try:
648 # if response is not None and response.tags is not None:
649 # self.logger.debug("normalizeDatetime scraper response: " + varDump(response))
650 # tagNames = []
651 # if self.input_data.template and algorithmName == CONSTS.PROCESS_ALGORITHM_REGULAR:
652 # # template
653 # for responseType in self.datetimeTemplateTypes:
654 # for responseTagName in response.tags:
655 # self.logger.debug("normalizeDatetime responseTagName: '" + str(responseTagName) + "'")
656 # if (response.tags.get(responseTagName) is not None and \
657 # 'type' in response.tags[responseTagName] and \
658 # response.tags[responseTagName]['type'] == responseType) or \
659 # (responseTagName == CONSTS.TAG_PUB_DATE and response.tags.get(responseTagName) is not None):
660 # tagNames.append(responseTagName)
661 # else:
662 # tagNames = TAGS_DATETIME_NEWS_NAMES
663 #
664 # self.logger.debug('normalizeDatetime tagNames: ' + varDump(tagNames))
665 # retDict = {}
666 # for tagName in tagNames:
667 # pubdate, tzone = self.extractPubDate(response, tagName) # , properties, urlString)
668 # if self.extractor and tagName in response.tags:
669 # self.extractor.addTag(result=response, tag_name=tagName + '_normalized', tag_value=pubdate, \
670 # xpath=response.tags[tagName]['xpath'])
671 #
672 # self.logger.debug('tagName: ' + str(tagName) + ' pubdate: ' + str(pubdate))
673 # retDict[tagName] = pubdate
674 #
675 # if tagName == CONSTS.TAG_PUB_DATE:
676 # ret = pubdate
677 # timezone = tzone
678 # else:
679 # pass
680 #
681 # if ret is None:
682 # for key, value in retDict.items():
683 # if value is not None:
684 # ret = value
685 # self.logger.debug('set return value from ' + str(key) + ' : ' + str(value))
686 # break
687 #
688 # except Exception, err:
689 # ExceptionLog.handler(self.logger, err, 'normalizeDatetime error:', (), \
690 # {ExceptionLog.LEVEL_NAME_ERROR:ExceptionLog.LEVEL_VALUE_DEBUG})
691 #
692 # return ret, timezone
693 
694 
695 # # # Extract pubdate
696 # #
697 # # @param response - response instance
698 # # @param dataTagName - tag name for extracting
699 # # @param properties - properties from PROCESSOR_PROPERTIES
700 # # @param urlString - url string value
701 # # @return pubdate if success or None
702 # def extractPubDate(self, response, dataTagName): # , properties, urlString):
703 # # variable for result
704 # ret = None
705 # timezone = ''
706 # try:
707 # if response is not None and dataTagName in response.tags and response.tags[dataTagName] is not None:
708 #
709 # # self.logger.debug("extractPubDate response: " + varDump(response))
710 #
711 # inputData = response.tags[dataTagName]["data"]
712 # self.logger.debug("extractPubDate response has '" + str(dataTagName) + "' is: " + str(inputData))
713 # self.logger.debug("extractPubDate type of '" + str(dataTagName) + "' is: " + str(type(inputData)))
714 #
715 # inputList = []
716 # if isinstance(inputData, basestring):
717 # inputList = [inputData]
718 # elif isinstance(inputData, list):
719 # inputList = inputData
720 # else:
721 # pass
722 #
723 # pubdate = []
724 # timezones = []
725 # for inputElem in inputList:
726 # d = DateTimeType.parse(inputElem, bool(self.useCurrentYear), self.logger, False)
727 # self.logger.debug('pubdate: ' + str(d))
728 #
729 # if d is not None:
730 # d, tzone = DateTimeType.split(d)
731 # pubdate.append(d.isoformat(DateTimeType.ISO_SEP))
732 # timezones.append(tzone)
733 #
734 # self.logger.debug("extractPubDate result pubdate: " + str(pubdate))
735 # response.tags[dataTagName]["data"] = pubdate
736 # if len(pubdate) > 0:
737 # ret = pubdate[0]
738 #
739 # if len(timezones) > 0:
740 # timezone = timezones[0]
741 #
742 # except Exception, err:
743 # ExceptionLog.handler(self.logger, err, 'extractPubDate error:', (), \
744 # {ExceptionLog.LEVEL_NAME_ERROR:ExceptionLog.LEVEL_VALUE_DEBUG})
745 #
746 # return ret, timezone
747 
748 
749 # # # pubdate transformation use timezone value
750 # #
751 # # @param rawPubdate - raw pubdate string
752 # # @param rawTimezone - raw timezone string
753 # # @param properties - properties from PROCESSOR_PROPERTIES
754 # # @param urlString - url string value
755 # # @return pubdate and timezone if success or None and empty string
756 # def pubdateTransform(self, rawPubdate, rawTimezone, properties, urlString):
757 # # variables for result
758 # pubdate = rawPubdate
759 # timezone = rawTimezone
760 #
761 # self.logger.debug('properties: ' + varDump(properties))
762 # if CONSTS.PDATE_TIMEZONES_NAME in properties:
763 # propertyString = properties[CONSTS.PDATE_TIMEZONES_NAME]
764 # self.logger.debug('inputted ' + CONSTS.PDATE_TIMEZONES_NAME + ':' + str(propertyString))
765 #
766 # dt = DateTimeType.parse(rawPubdate, bool(self.useCurrentYear), self.logger, False)
767 # self.logger.debug('pubdate: ' + str(dt))
768 # if dt is not None:
769 # # get utc offset if necessary
770 # utcOffset = DateTimeType.extractUtcOffset(rawTimezone, self.logger)
771 # self.logger.debug('utcOffset: ' + str(utcOffset))
772 # # transformation according to PDATE_TIMEZONES properties
773 # d = PDateTimezonesHandler.transform(dt, utcOffset, propertyString, urlString, self.logger)
774 # if d is not None:
775 # dt = d
776 #
777 # if dt is not None:
778 # d, tzone = DateTimeType.split(dt)
779 # pubdate = d.isoformat(DateTimeType.ISO_SEP)
780 # timezone = tzone
781 #
782 # return pubdate, timezone
783 
784 
785 # # # change month order in pubdate if necessary
786 # #
787 # # @param rawPubdate - raw pubdate string in iso format. sample: '2016-02-07 16:28:00'
788 # # @param properties - properties from PROCESSOR_PROPERTIES
789 # # @param urlString - url string value
790 # # @return pubdate and timezone if success or None and empty string
791 # def pubdateMonthOrder(self, rawPubdate, properties, urlString):
792 # # variables for result
793 # pubdate = rawPubdate
794 #
795 # self.logger.debug('pubdateMonthOrder() enter... rawPubdate: ' + str(rawPubdate))
796 # if CONSTS.PDATE_DAY_MONTH_ORDER_NAME in properties and isinstance(rawPubdate, basestring):
797 # propertyObj = []
798 # try:
799 # self.logger.debug('inputted ' + CONSTS.PDATE_DAY_MONTH_ORDER_NAME + ':' + \
800 # str(properties[CONSTS.PDATE_DAY_MONTH_ORDER_NAME]))
801 # propertyObj = json.loads(properties[CONSTS.PDATE_DAY_MONTH_ORDER_NAME])
802 # except Exception, err:
803 # self.logger.error("Fail loads '%s', error: %s", str(CONSTS.PDATE_DAY_MONTH_ORDER_NAME), str(err))
804 #
805 # for propertyElem in propertyObj:
806 # try:
807 # if "pattern" not in propertyElem:
808 # raise Exception('Property "pattern" not found')
809 #
810 # if "order" not in propertyElem:
811 # raise Exception('Property "order" not found')
812 #
813 # pattern = str(propertyElem["pattern"])
814 # order = int(propertyElem["order"])
815 #
816 # if re.search(pattern, urlString, re.UNICODE) is not None:
817 # self.logger.debug("Pattern '%' found in url: %s", str(pattern), str(urlString))
818 #
819 # dt = None
820 # if order == 0: # means day follows month
821 # dt = datetime.datetime.strptime(rawPubdate, "%Y-%d-%m %H:%M:%S")
822 # elif order == 1: # means month follows day
823 # dt = datetime.datetime.strptime(rawPubdate, "%Y-%m-%d %H:%M:%S")
824 # else:
825 # raise Exception("Unsupported value of 'order' == " + str(order))
826 #
827 # if dt is not None:
828 # pubdate = dt.strftime("%Y-%d-%m %H:%M:%S")
829 #
830 # except Exception, err:
831 # self.logger.error("Fail execution '%s', error: %s", str(CONSTS.PDATE_DAY_MONTH_ORDER_NAME), str(err))
832 #
833 # self.logger.debug('pubdateMonthOrder() leave... pubdate: ' + str(pubdate))
834 #
835 # return pubdate
836 
837 
838 # # # Get header content
839 # #
840 # # @param siteId - Site/Project ID
841 # # @param url - url string
842 # # @return extracted header content
843 # def getHeaderContent(self, siteId, url):
844 # # variable for result
845 # headerContent = None
846 # urlContentObj = dc_event.URLContentRequest(siteId, url, \
847 # dc_event.URLContentRequest.CONTENT_TYPE_RAW_LAST + \
848 # dc_event.URLContentRequest.CONTENT_TYPE_RAW + \
849 # dc_event.URLContentRequest.CONTENT_TYPE_HEADERS)
850 #
851 # rawContentData = self.dbWrapper.urlContent([urlContentObj])
852 #
853 # if rawContentData is not None and len(rawContentData) > 0:
854 # if rawContentData[0].headers is not None and len(rawContentData[0].headers) > 0 and \
855 # rawContentData[0].headers[0] is not None:
856 # headerContent = rawContentData[0].headers[0].buffer
857 #
858 # return headerContent
859 #
860 #
861 # # # Get variable from header content
862 # #
863 # # @param headerContent - header content
864 # # @param name - variable name
865 # # @param makeDecode - boolean flag necessary decode
866 # # @return extracted value of 'Location'
867 # def getVariableFromHeaderContent(self, headerContent, name, makeDecode=True):
868 # # variable for result
869 # ret = None
870 #
871 # header = ''
872 # if makeDecode and headerContent is not None:
873 # header = base64.b64decode(headerContent)
874 #
875 # headerList = header.split('\r\n')
876 # self.logger.debug("headerList: " + varDump(headerList))
877 #
878 # for elem in headerList:
879 # pos = elem.find(name + ':')
880 # if pos > -1:
881 # ret = elem.replace(name + ':', '').strip()
882 # self.logger.debug("Found '" + name + "' has value: " + str(ret))
883 # break
884 #
885 # return ret

◆ getNextBestExtractor()

def dc_processor.ScraperCustomJson.getNextBestExtractor (   self)

Definition at line 385 of file ScraperCustomJson.py.

385  def getNextBestExtractor(self):
386  # return extractor with highest rank
387  try:
388  extractor = next(self.itr)
389  except StopIteration:
390  extractor = None
391  return extractor
392 
393 
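jsonParserExtractor() rebuilds the iterator for each JSON element, so the extractor sequence restarts per element; a sketch of the exhaustion pattern, where None signals that every extractor has been tried:

extractors = ["ExtractorA", "ExtractorB"]   # hypothetical ranked extractor list
itr = iter(extractors)

def next_best():
    try:
        return next(itr)
    except StopIteration:
        return None

extractor = next_best()
while extractor:
    # ... extract tags with this extractor ...
    extractor = next_best()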

◆ getProcessedContent()

def dc_processor.ScraperCustomJson.getProcessedContent (   self,
  result 
)

Definition at line 548 of file ScraperCustomJson.py.

548  def getProcessedContent(self, result):
549  for elem in result:
550  elem.get()
551  self.processedContent = {}
552  self.processedContent["default"] = result[0]
553  self.processedContent["internal"] = result
554  self.processedContent["custom"] = []
555  self.tagsCount = result[0].tagsCount
556  self.tagsMask = result[0].tagsMask
557 
558  if "pubdate" in result[0].tags and "data" in result[0].tags["pubdate"] and \
559  len(result[0].tags["pubdate"]["data"]) > 0:
560  self.pubdate = result[0].tags["pubdate"]["data"][0]
561  self.logger.debug('>>>> Set self.pubdate = ' + str(self.pubdate))
562 
563 
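The resulting structure, sketched with plain strings standing in for Result instances (the merged collect-result built by resourceExtraction() is first in the list):

result = ["merged-result", "per-extractor-result"]   # stand-ins for Result objects
processedContent = {
    "default": result[0],   # merged result; also supplies tagsCount, tagsMask, pubdate
    "internal": result,     # the full list, merged result first
    "custom": [],           # left empty by this scraper
}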

◆ jsonParserExtractor()

def dc_processor.ScraperCustomJson.jsonParserExtractor (   self,
  jsonElem 
)

Definition at line 467 of file ScraperCustomJson.py.

467  def jsonParserExtractor(self, jsonElem):
468  if self.extractors is not None:
469  self.itr = iter(sorted(self.extractors, key=lambda extractor: 0, reverse=True)) # pylint: disable=W0612,W0613
470  self.logger.debug("Extractors: %s" % varDump(self.itr))
471 
472  responses = self.resourceExtraction(jsonElem)
473  for response in responses:
474  response.metricsPrecalculate()
475  response.stripResult()
476  # Add tag 'source_url'
477  self.addCustomTag(result=response, tag_name=CONSTS.TAG_SOURCE_URL,
478  tag_value=[str(self.input_data.url)])
479 
480  if CONSTS.LANG_PROP_NAME in self.properties:
481  # response.tagsLangDetecting(self.properties[CONSTS.LANG_PROP_NAME])
482  langDetector = ScraperLangDetector(self.properties[CONSTS.LANG_PROP_NAME])
483  langDetector.process(response, self.logger)
484  langTagsDict = langDetector.getLangTags()
485  self.logger.debug("langTagsDict: %s", varDump(langTagsDict))
486 
487  # add lang tags to processed content
488  for tagName, langValue in langTagsDict.items():
489  self.addCustomTag(result=response, tag_name=tagName, tag_value=langValue)
490 
491  summaryLang = langDetector.getSummaryLang(response, self.logger)
492  self.addCustomTag(result=response, tag_name=CONSTS.TAG_SUMMARY_LANG, tag_value=summaryLang)
493 
494  pubdate, timezone = self.normalizeDatetime(response, self.algorithm_name)
495  if pubdate is not None:
496  self.pubdate = pubdate
497  self.logger.debug("Pubdate from 'pubdate': " + str(self.pubdate))
498 
499  # Apply property 'PDATE_DAY_MONTH_ORDER'
500  self.pubdate = self.pubdateMonthOrder(self.pubdate, self.input_data.batch_item.properties, self.input_data.url)
501 
502  # Apply property 'PDATE_TIME'
503  self.input_data.batch_item.urlObj.pDate = self.pubdate
504  self.pubdate = FieldsSQLExpressionEvaluator.evaluatePDateTime(self.input_data.batch_item.properties,
505  self.dbWrapper,
506  self.input_data.batch_item.urlObj,
507  self.logger,
508  self.pubdate)
509 
510  # Apply property 'PDATE_TIMEZONES'
511  self.pubdate, timezone = self.pubdateTransform(self.pubdate,
512  timezone,
513  self.input_data.batch_item.properties,
514  self.input_data.url)
515 
516  # Add tag 'pubdate_tz'
517  self.addCustomTag(result=response, tag_name=CONSTS.TAG_PUBDATE_TZ, tag_value=[timezone])
518 
519  if "pubdate" in response.tags and "data" in response.tags["pubdate"] and \
520  len(response.tags["pubdate"]["data"]) > 0:
521  response.tags["pubdate"]["data"][0] = self.pubdate
522 
523  if self.outputFormat is not None:
524  self.formatOutputData(response, self.outputFormat)
525  else:
526  self.logger.debug(">>> Warning, can't extract output format")
527  response.recalcTagMaskCount(None, self.altTagsMask)
528  self.tagsCount = response.tagsCount
529  self.tagsMask = response.tagsMask
530  # self.putArticleToDB({"default":response})
531  self.logger.debug("self.tagsCount: %s", self.tagsCount)
532  self.logger.debug("self.tagsMasks: %s", self.tagsMask)
533 
534  self.logger.debug(">>> Resp: %s\n", varDump(response))
535 
536  # TODO: Seems need to be done more system way
537  response.finish = time.time()
538  response.data["time"] = "%s" % (response.finish - response.start)
539 
540  response = self.applyHTTPRedirectLink(self.input_data.batch_item.siteId, self.input_data.batch_item.urlObj.url,
541  self.input_data.batch_item.properties, response)
542 
543  self.getProcessedContent(responses)
544 
545 

◆ jsonParserProcess()

def dc_processor.ScraperCustomJson.jsonParserProcess (   self)

Definition at line 588 of file ScraperCustomJson.py.

588  def jsonParserProcess(self):
589  rawDataJson = None
590  ret = []
591  try:
592  rawDataJson = json.loads(self.input_data.raw_content)
593  except Exception as excp:
594  self.logger.debug(">>> jsonParserProcess wrong rawData json: " + str(excp))
595 
596  self.logger.debug("!!! type(rawDataJson) = %s", str(type(rawDataJson)))
597  if not isinstance(rawDataJson, list):
598  self.logger.debug("!!! rawDataJson: %s", varDump(rawDataJson))
599 
600 
601  if rawDataJson is not None and isinstance(rawDataJson, list):
602  for elem in rawDataJson:
603  if isinstance(elem, list):
604  for internalElem in elem:
605  ret.append(self.fillScraperResponse(internalElem))
606  else:
607  ret.append(self.fillScraperResponse(elem))
608  else:
609  self.logger.debug(">>> rawDataJson structure not List type")
610 
611  if len(ret) == 0:
612  ret.append(self.generateEmptyResponse())
613  return ret
614 
615 
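A sketch of the input shape jsonParserProcess() accepts: raw_content must decode to a JSON list whose elements are objects or one-deep lists of objects; anything else falls through to generateEmptyResponse():

import json

raw_content = '[{"title": "a"}, [{"title": "b"}, {"title": "c"}]]'
rawDataJson = json.loads(raw_content)

elements = []
if isinstance(rawDataJson, list):
    for elem in rawDataJson:
        if isinstance(elem, list):
            elements.extend(elem)   # one level of nesting is flattened
        else:
            elements.append(elem)
print(elements)  # [{'title': 'a'}, {'title': 'b'}, {'title': 'c'}]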

◆ loadConfig()

def dc_processor.ScraperCustomJson.loadConfig (   self)

Definition at line 157 of file ScraperCustomJson.py.

157  def loadConfig(self):
158  try:
159  self.config = ConfigParser.ConfigParser()
160  self.config.optionxform = str
161  if self.usageModel == APP_CONSTS.APP_USAGE_MODEL_PROCESS:
162  if self.pargs.config:
163  self.config.read(self.pargs.config)
164  else:
165  self.config.read(CONSTS.SCRAPER_CUSTOM_JSON_APP_CLASS_NAME)
166  else:
167  self.config.read(self.configFile)
168  except:
169  raise
170 
171 
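Setting optionxform = str keeps option names case-sensitive; ConfigParser lower-cases them by default. A minimal sketch (the file name is hypothetical):

import ConfigParser  # Python 2; configparser on Python 3

config = ConfigParser.ConfigParser()
config.optionxform = str              # preserve option-name case
config.read("ScraperCustomJson.ini")  # hypothetical config file name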

◆ loadExtractors()

def dc_processor.ScraperCustomJson.loadExtractors (   self)

Definition at line 338 of file ScraperCustomJson.py.

338  def loadExtractors(self):
339  try:
340  # modules
341  if CONSTS.MODULES_KEY in self.properties and self.algorithm_name in self.properties[CONSTS.MODULES_KEY]:
342  modules = self.properties[CONSTS.MODULES_KEY][self.algorithm_name]
343  else:
344  self.logger.debug(">>> No MODULES_KEY or algorithm_name in self.properties")
345  modules = []
346 
347  self.logger.debug("Algorithm name: <%s>" % (self.algorithm_name))
348  self.logger.debug("Modules: %s" % modules)
349 
350  self.extractors = []
351  for module in modules:
352  extractor = self.createModule(module)
353  # Check if module was created successfully and then insert it to extractors
354  if extractor is not None:
355  self.extractors.append(extractor)
356 
357  # Info show extractors loaded
358  self.logger.debug("*******************")
359  self.logger.debug("Loaded extractors:")
360  for extractor in self.extractors:
361  self.logger.debug(extractor.name)
362  self.logger.debug("*******************")
363 
364  except Exception as err:
365  ExceptionLog.handler(self.logger, err, MSG_ERROR_LOAD_EXTRACTORS)
366  raise
367 
368 

◆ loadLogConfigFile()

def dc_processor.ScraperCustomJson.loadLogConfigFile (   self)

Definition at line 175 of file ScraperCustomJson.py.

175  def loadLogConfigFile(self):
176  try:
177  if self.usageModel == APP_CONSTS.APP_USAGE_MODEL_PROCESS:
178  log_conf_file = self.config.get("Application", "log")
179  logging.config.fileConfig(log_conf_file)
180  # Logger initialization
181  self.logger = Utils.MPLogger().getLogger()
182  except Exception, err:
183  raise Exception(CONSTS.MSG_ERROR_LOAD_CONFIG + " : " + str(err))
184 
185 

◆ loadOptions()

def dc_processor.ScraperCustomJson.loadOptions (   self)

Definition at line 189 of file ScraperCustomJson.py.

189  def loadOptions(self):
190  try:
191  # class_name = self.__class__.__name__
192  self.scraperPropFileName = self.config.get("Application", "property_file_name")
193  # self.config_db_dir = self.config.get(class_name, "config_db_dir")
194  # self.sqliteTimeout = self.config.getint("sqlite", "timeout")
195 
196  self.useCurrentYear = self.config.getint("DateTimeType", "useCurrentYear")
197 
198  if self.config.has_section(self.OPTION_SECTION_DATETIME_TEMPLATE_TYPES):
199  self.datetimeTemplateTypes = []
200  for key, value in self.config.items(self.OPTION_SECTION_DATETIME_TEMPLATE_TYPES):
201  self.datetimeTemplateTypes.append(key)
202  if self.logger is not None:
203  self.logger.debug('load form config: ' + str(key) + ' = ' + str(value))
204  else:
205  self.datetimeTemplateTypes = self.TAGS_DATETIME_TEMPLATE_TYPES
206  if self.logger is not None:
207  self.logger.debug("Config file hasn't section: " + str(self.OPTION_SECTION_DATETIME_TEMPLATE_TYPES))
208 
209  # DBWrapper initialization
210  dbTaskIniConfigFileName = self.config.get(self.__class__.__name__, "db-task_ini")
211  config = ConfigParser.ConfigParser()
212  config.optionxform = str
213  readOk = config.read(dbTaskIniConfigFileName)
214  if len(readOk) == 0:
215  raise Exception(self.MSG_ERROR_WRONG_CONFIG_FILE_NAME + ": " + dbTaskIniConfigFileName)
216  self.dbWrapper = DBTasksWrapper(config)
217  except:
218  raise
219 
220 
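The 'tags_datetime_template_types' section is consumed for its option names only; a self-contained sketch with a hypothetical entry:

import ConfigParser

config = ConfigParser.ConfigParser()
config.optionxform = str
config.add_section("tags_datetime_template_types")
config.set("tags_datetime_template_types", "datetime", "1")  # hypothetical entry

# only the keys matter; values are merely logged
datetimeTemplateTypes = [key for key, value in config.items("tags_datetime_template_types")]
print(datetimeTemplateTypes)  # ['datetime']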

◆ loadScraperProperties()

def dc_processor.ScraperCustomJson.loadScraperProperties (   self)

Definition at line 223 of file ScraperCustomJson.py.

223  def loadScraperProperties(self):
224  if self.scraperPropFileName is not None:
225  try:
226  with open(self.scraperPropFileName, "rb") as fd:
227  scraperProperies = json.loads(fd.read())
228  self.properties = scraperProperies[self.__class__.__name__][CONSTS.PROPERTIES_KEY]
229  except Exception as excp:
230  self.logger.debug(">>> Error loading scraper properties: " + str(excp))
231 
232 
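The property file is JSON keyed by class name, with the scraper options nested under CONSTS.PROPERTIES_KEY; a sketch assuming that key is the literal string "properties" (the inner entries are illustrative):

import json

sample = '{"ScraperCustomJson": {"properties": {"metrics": "{}"}}}'
properties = json.loads(sample)["ScraperCustomJson"]["properties"]
print(properties)  # {u'metrics': u'{}'}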

◆ processBatch()

def dc_processor.ScraperCustomJson.processBatch (   self)

Definition at line 235 of file ScraperCustomJson.py.

235  def processBatch(self):
236  if self.usageModel == APP_CONSTS.APP_USAGE_MODEL_PROCESS:
237  # read pickled batch object from stdin
238  input_pickled_object = sys.stdin.read()
239  try:
240  if self.usageModel == APP_CONSTS.APP_USAGE_MODEL_PROCESS:
241  scraper_in_data = pickle.loads(input_pickled_object)
242  except Exception as err:
243  ExceptionLog.handler(self.logger, err, 'pickle.loads() error:')
244  self.logger.debug("input_pickled_object:\n" + str(input_pickled_object))
245  self.exitCode = EXIT_FAILURE
246  raise Exception(err)
247 
248  try:
249  if self.usageModel == APP_CONSTS.APP_USAGE_MODEL_PROCESS:
250  self.input_data = scraper_in_data
251  if self.input_data.batch_item.urlObj is not None:
252  urlString = self.input_data.batch_item.urlObj.url
253  else:
254  urlString = ""
255  logMsg = "BatchItem.siteId=" + str(self.input_data.batch_item.siteId) + \
256  ", BatchItem.urlId=" + str(self.input_data.batch_item.urlId) + \
257  ", BatchItem.urlObj.url=" + urlString
258  app.Profiler.messagesList.append(logMsg)
259  self.logger.info("Incoming data: %s", logMsg)
260 
261  self.urlHost = app.Utils.UrlParser.getDomain(self.input_data.url)
262 
263 
264  if self.input_data.output_format is not None and "name" in self.input_data.output_format:
265  self.outputFormat = self.input_data.output_format["name"]
266 
267  if self.outputFormat is None and "templates" in self.input_data.batch_item.properties["template"] and \
268  len(self.input_data.batch_item.properties["template"]["templates"]) > 0 and \
269  "output_format" in self.input_data.batch_item.properties["template"]["templates"][0] and \
270  "name" in self.input_data.batch_item.properties["template"]["templates"][0]["output_format"]:
271  self.outputFormat = self.input_data.batch_item.properties["template"]["templates"][0]["output_format"]["name"]
272 
273  if "TAGS_MAPPING" in self.input_data.batch_item.properties and \
274  self.input_data.batch_item.properties["TAGS_MAPPING"] is not None:
275  try:
276  self.altTagsMask = json.loads(self.input_data.batch_item.properties["TAGS_MAPPING"])
277  self.logger.debug(">>> AltTags = " + str(self.altTagsMask))
278  except Exception as exp:
279  self.logger.debug(">>> Bad TAGS_MAPPING properties value, err=" + str(exp))
280 
281  try:
282  if (self.input_data is not None) and (self.input_data.processor_properties is not None):
283  processor_properties = self.input_data.processor_properties
284  self.logger.debug("Processor's properties were taken from input data: %s" % processor_properties)
285  self.logger.debug("Processor's properties type: %s" % str(type(processor_properties)))
286  if not isinstance(processor_properties, types.DictType):
287  processor_properties = json.loads(self.input_data.processor_properties)
288  self.logger.debug("Processor's properties were taken from input data: %s" % processor_properties)
289  self.properties.update(processor_properties)
290  except Exception as err:
291  ExceptionLog.handler(self.logger, err, 'Error load properties from input data:')
292 
293  self.algorithm_name = self.properties[CONSTS.ALGORITHM_KEY][CONSTS.ALGORITHM_NAME_KEY]
294  self.logger.debug("Algorithm : %s" % self.algorithm_name)
295  if self.usageModel == APP_CONSTS.APP_USAGE_MODEL_PROCESS:
296  Utils.storePickleOnDisk(input_pickled_object, ENV_SCRAPER_STORE_PATH, "scraper.in." + \
297  str(self.input_data.urlId))
298  if "metrics" in self.properties:
299  try:
300  self.metrics = json.loads(self.properties["metrics"])
301  self.logger.debug(">>> Metrics loads = " + str(self.metrics))
302  except Exception as excp:
303  self.logger.debug(">>> Metrics loads exception = " + str(excp))
304  # TODO main processing over every url from list of urls in the batch object
305  tmp = sys.stdout
306  sys.stdout = open("/dev/null", "wb")
307 
308  # initialization of scraper
309  # load scraper's modules
310  self.loadExtractors()
311 
312  # # Initialization pubdate
313  # self.logger.debug("Initialization pubdate from urlObj.pDate use value: %s",
314  # str(self.input_data.batch_item.urlObj.pDate))
315  # self.pubdate = self.input_data.batch_item.urlObj.pDate
316 
317  scraperResponses = self.jsonParserProcess()
318 
319  sys.stdout = tmp
320 
321  self.logger.debug("scraperResponse:\n%s", varDump(scraperResponses))
322  if self.usageModel == APP_CONSTS.APP_USAGE_MODEL_PROCESS:
323  output_pickled_object = pickle.dumps(scraperResponses)
324  Utils.storePickleOnDisk(output_pickled_object, ENV_SCRAPER_STORE_PATH,
325  "scraper.out." + str(self.input_data.urlId))
326  print output_pickled_object
327  sys.stdout.flush()
328  else:
329  self.output_data = scraperResponses
330  except Exception as err:
331  ExceptionLog.handler(self.logger, err, 'ScraperCustomJson process batch error:')
332  self.exitCode = EXIT_FAILURE
333  raise Exception('ScraperCustomJson process batch error:' + str(err))
334 
335 
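In the process usage model the batch arrives pickled on stdin and the response list leaves pickled on stdout; a parent-side sketch of that contract (the command line and batch object are hypothetical stand-ins):

import pickle
import subprocess

batch = {"urlId": 1}   # stand-in for the real batch object
proc = subprocess.Popen(["python", "ScraperCustomJson.py"],  # hypothetical invocation
                        stdin=subprocess.PIPE, stdout=subprocess.PIPE)
out, _ = proc.communicate(pickle.dumps(batch))
scraperResponses = pickle.loads(out)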

◆ resourceExtraction()

def dc_processor.ScraperCustomJson.resourceExtraction (   self,
  jsonElem 
)

Definition at line 394 of file ScraperCustomJson.py.

394  def resourceExtraction(self, jsonElem):
395  ret = []
396  # get resource as dictionary
397  resource_set = {}
398  resource_set["url"] = self.input_data.url
399  resource_set["resId"] = self.input_data.urlId
400  resource_set["siteId"] = self.input_data.siteId
401  resource_set["raw_html"] = jsonElem
402  resource = Resource(resource_set)
403 
404  # get best matching extractor
405  self.extractor = self.getNextBestExtractor()
406  self.logger.debug("get best matching extractor: " + str(self.extractor))
407 
408  # search engine parsing ???
409  collectResult = Result(self.config, self.input_data.urlId, self.metrics)
411  # main loop
411  while self.extractor:
412  result = Result(self.config, self.input_data.urlId, self.metrics)
413  self.logger.debug(">>> TAG BEGIN extractor = " + str(self.extractor))
414  result = self.extractor.extractTags(resource, result)
415 
416  self.logger.debug(">>> TAG END")
417  empty_tags = result.getEmptyTags()
418  self.logger.debug("get list of empty tags from result: " + str(empty_tags))
419  filled_tags = result.getFilledTags()
420  self.logger.debug("get list of filled_tags from result: " + str(filled_tags))
421  self.extractor = self.getNextBestExtractor()
422  self.logger.debug("get best matching extractor: " + str(self.extractor))
423 
424  for key in result.tags:
425  if key not in collectResult.tags or not collectResult.isTagFilled(key):
426  collectResult.tags[key] = copy.deepcopy(result.tags[key])
427  ret.append(result)
428  self.logger.debug(">>> EXIT LOOP")
429  ret = [collectResult] + ret
430  return ret
431 
432 
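The merge rule of the main loop: the first extractor to fill a tag wins, and later results only supply tags that are still empty; a sketch with plain dicts standing in for Result.tags (isTagFilled() approximated by truthiness):

import copy

collected = {}   # stands in for collectResult.tags
for tags in ({"title": "A"}, {"title": "B", "body": "C"}):
    for key, value in tags.items():
        if key not in collected or not collected[key]:
            collected[key] = copy.deepcopy(value)
print(collected)  # title stays 'A'; body filled with 'C'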

◆ run()

def dc_processor.ScraperCustomJson.run (   self)

Definition at line 130 of file ScraperCustomJson.py.

130  def run(self):
131  if self.usageModel == APP_CONSTS.APP_USAGE_MODEL_PROCESS:
132  # call base class run method
133  foundation.CementApp.run(self)
134 
135  # config section
136  self.loadConfig()
137 
138  # load logger config file
139  self.loadLogConfigFile()
140 
141  # options
142  self.loadOptions()
143 
144  # scraper properties
145  self.loadScraperProperties()
146 
147  # Do applied algorithm's job
148  self.processBatch()
149 
150  if self.usageModel == APP_CONSTS.APP_USAGE_MODEL_PROCESS:
151  # Finish logging
152  self.logger.info(APP_CONSTS.LOGGER_DELIMITER_LINE)
153 
154 

◆ setup()

def dc_processor.ScraperCustomJson.setup (   self)

Definition at line 122 of file ScraperCustomJson.py.

122  def setup(self):
123  if self.usageModel == APP_CONSTS.APP_USAGE_MODEL_PROCESS:
124  # call base class setup method
125  foundation.CementApp.setup(self)
126 
127 

Variable Documentation

◆ algorithm_name

dc_processor.ScraperCustomJson.algorithm_name

Definition at line 104 of file ScraperCustomJson.py.

◆ altTagsMask

dc_processor.ScraperCustomJson.altTagsMask

Definition at line 112 of file ScraperCustomJson.py.

◆ config

dc_processor.ScraperCustomJson.config

Definition at line 159 of file ScraperCustomJson.py.

◆ configFile

dc_processor.ScraperCustomJson.configFile

Definition at line 93 of file ScraperCustomJson.py.

◆ datetimeTemplateTypes

dc_processor.ScraperCustomJson.datetimeTemplateTypes

Definition at line 117 of file ScraperCustomJson.py.

◆ dbWrapper

dc_processor.ScraperCustomJson.dbWrapper

Definition at line 116 of file ScraperCustomJson.py.

◆ ENV_SCRAPER_STORE_PATH

string dc_processor.ScraperCustomJson.ENV_SCRAPER_STORE_PATH = "ENV_SCRAPER_STORE_PATH"

Definition at line 65 of file ScraperCustomJson.py.

◆ ERROR_OK

int dc_processor.ScraperCustomJson.ERROR_OK = 0

Definition at line 57 of file ScraperCustomJson.py.

◆ errorMask

dc_processor.ScraperCustomJson.errorMask

Definition at line 102 of file ScraperCustomJson.py.

◆ EXIT_FAILURE

int dc_processor.ScraperCustomJson.EXIT_FAILURE = 1

Definition at line 61 of file ScraperCustomJson.py.

◆ EXIT_SUCCESS

int dc_processor.ScraperCustomJson.EXIT_SUCCESS = 0

Definition at line 60 of file ScraperCustomJson.py.

◆ exitCode

dc_processor.ScraperCustomJson.exitCode

Definition at line 91 of file ScraperCustomJson.py.

◆ extractor

dc_processor.ScraperCustomJson.extractor

Definition at line 97 of file ScraperCustomJson.py.

◆ extractors

dc_processor.ScraperCustomJson.extractors

Definition at line 98 of file ScraperCustomJson.py.

◆ input_data

dc_processor.ScraperCustomJson.input_data

Definition at line 95 of file ScraperCustomJson.py.

◆ itr

dc_processor.ScraperCustomJson.itr

Definition at line 99 of file ScraperCustomJson.py.

◆ logger

dc_processor.ScraperCustomJson.logger

Definition at line 94 of file ScraperCustomJson.py.

◆ metrics

dc_processor.ScraperCustomJson.metrics

Definition at line 111 of file ScraperCustomJson.py.

◆ MSG_ERROR_LOAD_EXTRACTORS

string dc_processor.ScraperCustomJson.MSG_ERROR_LOAD_EXTRACTORS = "Error load extractors "

Definition at line 63 of file ScraperCustomJson.py.

◆ MSG_ERROR_WRONG_CONFIG_FILE_NAME

string dc_processor.ScraperCustomJson.MSG_ERROR_WRONG_CONFIG_FILE_NAME = "Config file name is wrong"

Definition at line 72 of file ScraperCustomJson.py.

◆ OPTION_SECTION_DATETIME_TEMPLATE_TYPES

string dc_processor.ScraperCustomJson.OPTION_SECTION_DATETIME_TEMPLATE_TYPES = 'tags_datetime_template_types'

Definition at line 75 of file ScraperCustomJson.py.

◆ output_data

dc_processor.ScraperCustomJson.output_data

Definition at line 115 of file ScraperCustomJson.py.

◆ outputFormat

dc_processor.ScraperCustomJson.outputFormat

Definition at line 110 of file ScraperCustomJson.py.

◆ processedContent

dc_processor.ScraperCustomJson.processedContent

Definition at line 109 of file ScraperCustomJson.py.

◆ properties

dc_processor.ScraperCustomJson.properties

Definition at line 96 of file ScraperCustomJson.py.

◆ pubdate

dc_processor.ScraperCustomJson.pubdate

Definition at line 100 of file ScraperCustomJson.py.

◆ scraperPropFileName

dc_processor.ScraperCustomJson.scraperPropFileName

Definition at line 103 of file ScraperCustomJson.py.

◆ scraperResponses

dc_processor.ScraperCustomJson.scraperResponses

Definition at line 105 of file ScraperCustomJson.py.

◆ TAGS_DATETIME_NEWS_NAMES

list dc_processor.ScraperCustomJson.TAGS_DATETIME_NEWS_NAMES = [CONSTS.TAG_PUB_DATE, CONSTS.TAG_DC_DATE]

Definition at line 67 of file ScraperCustomJson.py.

◆ TAGS_DATETIME_TEMPLATE_TYPES

list dc_processor.ScraperCustomJson.TAGS_DATETIME_TEMPLATE_TYPES = [CONSTS.TAG_TYPE_DATETIME]

Definition at line 74 of file ScraperCustomJson.py.

◆ tagsCount

dc_processor.ScraperCustomJson.tagsCount

Definition at line 106 of file ScraperCustomJson.py.

◆ tagsMask

dc_processor.ScraperCustomJson.tagsMask

Definition at line 107 of file ScraperCustomJson.py.

◆ timezone

dc_processor.ScraperCustomJson.timezone

Definition at line 101 of file ScraperCustomJson.py.

◆ urlHost

dc_processor.ScraperCustomJson.urlHost

Definition at line 114 of file ScraperCustomJson.py.

◆ usageModel

dc_processor.ScraperCustomJson.usageModel

Definition at line 92 of file ScraperCustomJson.py.

◆ useCurrentYear

dc_processor.ScraperCustomJson.useCurrentYear

Definition at line 118 of file ScraperCustomJson.py.