HCE Project: Python-language Distributed Tasks Manager Application, Distributed Crawler Application, and client API bindings (version 2.0.0-chaika)
Hierarchical Cluster Engine Python language binding
ScraperMultiItemsTask.py File Reference

Go to the source code of this file.

Classes

class  dc_processor.ScraperMultiItemsTask.ScraperResultDocuments
 
class  dc_processor.ScraperMultiItemsTask.ScraperMultiItemsTask
 
class  dc_processor.ScraperMultiItemsTask.Meta
 

Namespaces

 dc_processor.ScraperMultiItemsTask
 

Functions

def dc_processor.ScraperMultiItemsTask.__init__ (self, usageModel=APP_CONSTS.APP_USAGE_MODEL_PROCESS, configFile=None, logger=None, inputData=None)
 
def dc_processor.ScraperMultiItemsTask.setup (self)
 
def dc_processor.ScraperMultiItemsTask.run (self)
 
def dc_processor.ScraperMultiItemsTask.__initApp (self, configName=None)
 
def dc_processor.ScraperMultiItemsTask.__loadScraperProperties (self, scraperPropertyFileName)
 
def dc_processor.ScraperMultiItemsTask.__loadAppConfig (self, configName)
 
def dc_processor.ScraperMultiItemsTask.__loadLogConfig (self, configName)
 
def dc_processor.ScraperMultiItemsTask.getExtractorByName (self, extractorName)
 
def dc_processor.ScraperMultiItemsTask.getExitCode (self)
 
def dc_processor.ScraperMultiItemsTask.__getInputData (self)
 
def dc_processor.ScraperMultiItemsTask.__checkInputData (self, inputData)
 
def dc_processor.ScraperMultiItemsTask.__fillProfilerMessageList (self, inputData)
 
def dc_processor.ScraperMultiItemsTask.__getOutputFormat (self, inputData)
 
def dc_processor.ScraperMultiItemsTask.__getAltTagsMask (self, inputData)
 
def dc_processor.ScraperMultiItemsTask.__getPropertiesFromInputData (self, inputData)
 
def dc_processor.ScraperMultiItemsTask.__loadExtractors (self, algorithmName, config, urlHost)
 
def dc_processor.ScraperMultiItemsTask.__createModule (self, moduleName, config, urlHost)
 
def dc_processor.ScraperMultiItemsTask.refineBadDateTags (self, response)
 
def dc_processor.ScraperMultiItemsTask.preparseResponse (self, response)
 
def dc_processor.ScraperMultiItemsTask.formatOutpuElement (self, elem, localOutputFormat)
 
def dc_processor.ScraperMultiItemsTask.formatOutputData (self, response, localOutputFormat)
 
def dc_processor.ScraperMultiItemsTask.templateExtraction (self, config, urlHost)
 
def dc_processor.ScraperMultiItemsTask.applyPostProcessing (self, result, key, postProcessingRE)
 
def dc_processor.ScraperMultiItemsTask.getProcessedContent (self, result)
 
def dc_processor.ScraperMultiItemsTask.process (self, config)
 
def dc_processor.ScraperMultiItemsTask.get_path (self, etreeElement, path=None)
 

Variables

string dc_processor.ScraperMultiItemsTask.MSG_ERROR_PARSE_CMD_PARAMS = "Error parse command line parameters."
 
string dc_processor.ScraperMultiItemsTask.MSG_ERROR_EMPTY_CONFIG_FILE_NAME = "Config file name is empty."
 
string dc_processor.ScraperMultiItemsTask.MSG_ERROR_WRONG_CONFIG_FILE_NAME = "Config file name is wrong"
 
string dc_processor.ScraperMultiItemsTask.MSG_ERROR_LOAD_PROPERTIES_FROM_FILE = "Error load Scraper multi items properties from file"
 
string dc_processor.ScraperMultiItemsTask.MSG_ERROR_LOAD_APP_CONFIG = "Error loading application config file."
 
string dc_processor.ScraperMultiItemsTask.MSG_ERROR_READ_LOG_CONFIG = "Error read log config file."
 
string dc_processor.ScraperMultiItemsTask.MSG_ERROR_READ_INPUT_DATA = "Error read input data from stdin."
 
string dc_processor.ScraperMultiItemsTask.MSG_ERROR_INPUT_DATA_NONE = "Input data is none"
 
string dc_processor.ScraperMultiItemsTask.MSG_ERROR_INPUT_DATA_WITHOUT_BATCH = "Input data without batch item."
 
string dc_processor.ScraperMultiItemsTask.MSG_ERROR_INPUT_DATA_WITHOUT_PROPERTIES = "Input data has batch item without 'properties'."
 
string dc_processor.ScraperMultiItemsTask.MSG_ERROR_GET_PROPERTIES = "Error getting properties from input data"
 
string dc_processor.ScraperMultiItemsTask.MSG_ERROR_LOAD_EXTRACTORS = "Error load extractors "
 
string dc_processor.ScraperMultiItemsTask.MSG_ERROR_ADJUST_PR = "Error adjust partial references. "
 
string dc_processor.ScraperMultiItemsTask.MSG_ERROR_ADJUST_PUBDATE = "Error adjust PUBDATE. "
 
string dc_processor.ScraperMultiItemsTask.MSG_ERROR_ADJUST_TITLE = "Error adjust title. "
 
string dc_processor.ScraperMultiItemsTask.MSG_ERROR_ADJUST_LINK_URL = "Error adjust link URL. "
 
string dc_processor.ScraperMultiItemsTask.SCRAPER_MULTI_ITEMS_OPTION_LOG = "log"
 
string dc_processor.ScraperMultiItemsTask.SCRAPER_MULTI_ITEMS_OPTION_PROPERTY_JSON_FILE = "property_file_name"
 
string dc_processor.ScraperMultiItemsTask.ENV_SCRAPER_STORE_PATH = "self.ENV_SCRAPER_STORE_PATH"
 
dictionary dc_processor.ScraperMultiItemsTask.EXTENDED_NEWS_TAGS = {"description": ["//meta[@name='description']//@content"]}
 
list dc_processor.ScraperMultiItemsTask.DATA_NEWS_TAGS = [CONSTS.TAG_DC_DATE]
 
list dc_processor.ScraperMultiItemsTask.TAGS_DATETIME_TEMPLATE_TYPES = [CONSTS.TAG_TYPE_DATETIME]
 
string dc_processor.ScraperMultiItemsTask.OPTION_SECTION_DATETIME_TEMPLATE_TYPES = 'tags_datetime_template_types'
 
 dc_processor.ScraperMultiItemsTask.exitCode
 
 dc_processor.ScraperMultiItemsTask.usageModel
 
 dc_processor.ScraperMultiItemsTask.configFile
 
 dc_processor.ScraperMultiItemsTask.logger
 
 dc_processor.ScraperMultiItemsTask.input_data
 
 dc_processor.ScraperMultiItemsTask.properties
 
 dc_processor.ScraperMultiItemsTask.outputFormat
 
 dc_processor.ScraperMultiItemsTask.output_data
 
 dc_processor.ScraperMultiItemsTask.extractor
 
 dc_processor.ScraperMultiItemsTask.extractors
 
 dc_processor.ScraperMultiItemsTask.itr
 
 dc_processor.ScraperMultiItemsTask.pubdate
 
 dc_processor.ScraperMultiItemsTask.errorMask
 
 dc_processor.ScraperMultiItemsTask.xpathSplitString
 
 dc_processor.ScraperMultiItemsTask.useCurrentYear
 
 dc_processor.ScraperMultiItemsTask.datetimeTemplateTypes
 
 dc_processor.ScraperMultiItemsTask.dbWrapper
 
 dc_processor.ScraperMultiItemsTask.mediaLimitsHandler