HCE Project: Python language Distributed Tasks Manager Application, Distributed Crawler Application, and client API bindings. Version 2.0.0-chaika
Hierarchical Cluster Engine Python language binding
Scraper.py File Reference

Go to the source code of this file.

Classes

class  dc_processor.Scraper.Scraper
 
class  dc_processor.Scraper.Scraper.Meta
 

Namespaces

 dc_processor.Scraper
 

Variables

string dc_processor.Scraper.APP_NAME = "scraper"
 
string dc_processor.Scraper.MSG_ERROR_LOAD_CONFIG = "Error loading config file. Exciting."
 
string dc_processor.Scraper.MSG_ERROR_LOAD_LOG_CONFIG_FILE = "Error loading logging config file. Exiting."
 
string dc_processor.Scraper.MSG_ERROR_LOAD_EXTRACTORS = "Error load extractors "
 
string dc_processor.Scraper.MSG_ERROR_TEMPLATE_EXTRACTION = "Error template extraction "
 
string dc_processor.Scraper.MSG_ERROR_DYNAMIC_EXTRACTION = "Error dynamic extraction "
 
string dc_processor.Scraper.MSG_ERROR_LOAD_DB_BACKEND = "Error load db backend"
 
string dc_processor.Scraper.MSG_ERROR_LOAD_OPTIONS = "Error load options"
 
string dc_processor.Scraper.MSG_INFO_PREPARE_CONTENT = "Prepare content: "
 
string dc_processor.Scraper.MSG_ERROR_ADJUST_PR = "Error adjust partial references. "
 
string dc_processor.Scraper.MSG_ERROR_ADJUST_PUBDATE = "PUBDATE_ERROR "
 
string dc_processor.Scraper.MSG_ERROR_ADJUST_TITLE = "Can't adjust title. "
 
int dc_processor.Scraper.EXIT_SUCCESS = 0
 
int dc_processor.Scraper.EXIT_FAILURE = 1
 
int dc_processor.Scraper.SQLITE_TIMEOUT = 30
 
string dc_processor.Scraper.ENV_SCRAPER_STORE_PATH = "ENV_SCRAPER_STORE_PATH"
 
list dc_processor.Scraper.CONTENT_REPLACEMENT_LIST = ['\n', '\r\n', '\t', ' ', '<br>', '<p>', '</p>']
 
int dc_processor.Scraper.DEFAULT_TAG_REDUCE_MASK = 65535
 
dictionary dc_processor.Scraper.EXTENDED_NEWS_TAGS = {"description": ["//meta[@name='description']//@content"]}
 
list dc_processor.Scraper.LINKS_NEWS_TAGS = [CONSTS.TAG_MEDIA, CONSTS.TAG_LINK, CONSTS.TAG_MEDIA_CONTENT, "links", "href"]
 
list dc_processor.Scraper.DATA_NEWS_TAGS = []
 
list dc_processor.Scraper.TAGS_DATETIME_NEWS_NAMES = [CONSTS.TAG_PUB_DATE, CONSTS.TAG_DC_DATE]
 
list dc_processor.Scraper.TAGS_DATETIME_TEMPLATE_TYPES = [CONSTS.TAG_TYPE_DATETIME]
 
string dc_processor.Scraper.OPTION_SECTION_DATETIME_NEWS_NAMES = 'tags_datetime_news_names'
 
string dc_processor.Scraper.OPTION_SECTION_DATETIME_TEMPLATE_TYPES = 'tags_datetime_template_types'
 
string dc_processor.Scraper.OPTION_SECTION_TAGS_TYPE = 'tagsTypes'
 
string dc_processor.Scraper.OPTION_SECTION_URL_SOURCES_RULES = 'urlSourcesRules'
 
string dc_processor.Scraper.URL_SOURCES_RULE_DATA_URL = 'd_url'
 
string dc_processor.Scraper.URL_SOURCES_RULE_REDIRECT_URL = 'r_url '
 
string dc_processor.Scraper.URL_SOURCES_RULE_FEED_URL = 'f_url'