Classes
class	Scraper

Variables
string	APP_NAME = "scraper"

string	MSG_ERROR_LOAD_CONFIG = "Error loading config file. Exciting."

string	MSG_ERROR_LOAD_LOG_CONFIG_FILE = "Error loading logging config file. Exiting."

string	MSG_ERROR_LOAD_EXTRACTORS = "Error load extractors "

string	MSG_ERROR_TEMPLATE_EXTRACTION = "Error template extraction "

string	MSG_ERROR_DYNAMIC_EXTRACTION = "Error dynamic extraction "

string	MSG_ERROR_LOAD_DB_BACKEND = "Error load db backend"

string	MSG_ERROR_LOAD_OPTIONS = "Error load options"

string	MSG_INFO_PREPARE_CONTENT = "Prepare content: "

string	MSG_ERROR_ADJUST_PR = "Error adjust partial references. "

string	MSG_ERROR_ADJUST_PUBDATE = "PUBDATE_ERROR "

string	MSG_ERROR_ADJUST_TITLE = "Can't adjust title. "

int	EXIT_SUCCESS = 0

int	EXIT_FAILURE = 1

int	SQLITE_TIMEOUT = 30

string	ENV_SCRAPER_STORE_PATH = "ENV_SCRAPER_STORE_PATH"

list	CONTENT_REPLACEMENT_LIST = ['\n', '\r\n', '\t', ' ', '<br>', '<p>', '</p>']

int	DEFAULT_TAG_REDUCE_MASK = 65535

dictionary	EXTENDED_NEWS_TAGS = {"description": ["//meta[@name='description']//@content"]}

list	LINKS_NEWS_TAGS = [CONSTS.TAG_MEDIA, CONSTS.TAG_LINK, CONSTS.TAG_MEDIA_CONTENT, "links", "href"]

list	DATA_NEWS_TAGS = []

list	TAGS_DATETIME_NEWS_NAMES = [CONSTS.TAG_PUB_DATE, CONSTS.TAG_DC_DATE]

list	TAGS_DATETIME_TEMPLATE_TYPES = [CONSTS.TAG_TYPE_DATETIME]

string	OPTION_SECTION_DATETIME_NEWS_NAMES = 'tags_datetime_news_names'

string	OPTION_SECTION_DATETIME_TEMPLATE_TYPES = 'tags_datetime_template_types'

string	OPTION_SECTION_TAGS_TYPE = 'tagsTypes'

string	OPTION_SECTION_URL_SOURCES_RULES = 'urlSourcesRules'

string	URL_SOURCES_RULE_DATA_URL = 'd_url'

string	URL_SOURCES_RULE_REDIRECT_URL = 'r_url '

string	URL_SOURCES_RULE_FEED_URL = 'f_url'

Detailed Description

@package docstring
 @file Scraper.py
 @author Alexey <developers.hce@gmail.com>, scorp, bgv
 @link http://hierarchical-cluster-engine.com/
 @copyright Copyright © 2013-2016 IOIX Ukraine
 @license http://hierarchical-cluster-engine.com/license/
 @package HCE project node API
 @since 0.1

Variable Documentation

◆ APP_NAME

string dc_processor.Scraper.APP_NAME = "scraper"

Definition at line 61 of file Scraper.py.

◆ CONTENT_REPLACEMENT_LIST

list dc_processor.Scraper.CONTENT_REPLACEMENT_LIST = ['\n', '\r\n', '\t', ' ', '<br>', '<p>', '</p>']

Definition at line 82 of file Scraper.py.

◆ DATA_NEWS_TAGS

list dc_processor.Scraper.DATA_NEWS_TAGS = []

Definition at line 88 of file Scraper.py.

◆ DEFAULT_TAG_REDUCE_MASK

int dc_processor.Scraper.DEFAULT_TAG_REDUCE_MASK = 65535

Definition at line 83 of file Scraper.py.

◆ ENV_SCRAPER_STORE_PATH

string dc_processor.Scraper.ENV_SCRAPER_STORE_PATH = "ENV_SCRAPER_STORE_PATH"

Definition at line 80 of file Scraper.py.

◆ EXIT_FAILURE

int dc_processor.Scraper.EXIT_FAILURE = 1

Definition at line 76 of file Scraper.py.

◆ EXIT_SUCCESS

int dc_processor.Scraper.EXIT_SUCCESS = 0

Definition at line 75 of file Scraper.py.

◆ EXTENDED_NEWS_TAGS

dictionary dc_processor.Scraper.EXTENDED_NEWS_TAGS = {"description": ["//meta[@name='description']//@content"]}

Definition at line 85 of file Scraper.py.

◆ LINKS_NEWS_TAGS

list dc_processor.Scraper.LINKS_NEWS_TAGS = [CONSTS.TAG_MEDIA, CONSTS.TAG_LINK, CONSTS.TAG_MEDIA_CONTENT, "links", "href"]

Definition at line 86 of file Scraper.py.

◆ MSG_ERROR_ADJUST_PR

string dc_processor.Scraper.MSG_ERROR_ADJUST_PR = "Error adjust partial references. "

Definition at line 71 of file Scraper.py.

◆ MSG_ERROR_ADJUST_PUBDATE

string dc_processor.Scraper.MSG_ERROR_ADJUST_PUBDATE = "PUBDATE_ERROR "

Definition at line 72 of file Scraper.py.

◆ MSG_ERROR_ADJUST_TITLE

string dc_processor.Scraper.MSG_ERROR_ADJUST_TITLE = "Can't adjust title. "

Definition at line 73 of file Scraper.py.

◆ MSG_ERROR_DYNAMIC_EXTRACTION

string dc_processor.Scraper.MSG_ERROR_DYNAMIC_EXTRACTION = "Error dynamic extraction "

Definition at line 67 of file Scraper.py.

◆ MSG_ERROR_LOAD_CONFIG

string dc_processor.Scraper.MSG_ERROR_LOAD_CONFIG = "Error loading config file. Exciting."

Definition at line 63 of file Scraper.py.

◆ MSG_ERROR_LOAD_DB_BACKEND

string dc_processor.Scraper.MSG_ERROR_LOAD_DB_BACKEND = "Error load db backend"

Definition at line 68 of file Scraper.py.

◆ MSG_ERROR_LOAD_EXTRACTORS

string dc_processor.Scraper.MSG_ERROR_LOAD_EXTRACTORS = "Error load extractors "

Definition at line 65 of file Scraper.py.

◆ MSG_ERROR_LOAD_LOG_CONFIG_FILE

string dc_processor.Scraper.MSG_ERROR_LOAD_LOG_CONFIG_FILE = "Error loading logging config file. Exiting."

Definition at line 64 of file Scraper.py.

◆ MSG_ERROR_LOAD_OPTIONS

string dc_processor.Scraper.MSG_ERROR_LOAD_OPTIONS = "Error load options"

Definition at line 69 of file Scraper.py.

◆ MSG_ERROR_TEMPLATE_EXTRACTION

string dc_processor.Scraper.MSG_ERROR_TEMPLATE_EXTRACTION = "Error template extraction "

Definition at line 66 of file Scraper.py.

◆ MSG_INFO_PREPARE_CONTENT

string dc_processor.Scraper.MSG_INFO_PREPARE_CONTENT = "Prepare content: "

Definition at line 70 of file Scraper.py.

◆ OPTION_SECTION_DATETIME_NEWS_NAMES

string dc_processor.Scraper.OPTION_SECTION_DATETIME_NEWS_NAMES = 'tags_datetime_news_names'

Definition at line 93 of file Scraper.py.

◆ OPTION_SECTION_DATETIME_TEMPLATE_TYPES

string dc_processor.Scraper.OPTION_SECTION_DATETIME_TEMPLATE_TYPES = 'tags_datetime_template_types'

Definition at line 94 of file Scraper.py.

◆ OPTION_SECTION_TAGS_TYPE

string dc_processor.Scraper.OPTION_SECTION_TAGS_TYPE = 'tagsTypes'

Definition at line 96 of file Scraper.py.

◆ OPTION_SECTION_URL_SOURCES_RULES

string dc_processor.Scraper.OPTION_SECTION_URL_SOURCES_RULES = 'urlSourcesRules'

Definition at line 98 of file Scraper.py.

◆ SQLITE_TIMEOUT

int dc_processor.Scraper.SQLITE_TIMEOUT = 30

Definition at line 78 of file Scraper.py.

◆ TAGS_DATETIME_NEWS_NAMES

list dc_processor.Scraper.TAGS_DATETIME_NEWS_NAMES = [CONSTS.TAG_PUB_DATE, CONSTS.TAG_DC_DATE]

Definition at line 90 of file Scraper.py.

◆ TAGS_DATETIME_TEMPLATE_TYPES

list dc_processor.Scraper.TAGS_DATETIME_TEMPLATE_TYPES = [CONSTS.TAG_TYPE_DATETIME]

Definition at line 91 of file Scraper.py.

◆ URL_SOURCES_RULE_DATA_URL

string dc_processor.Scraper.URL_SOURCES_RULE_DATA_URL = 'd_url'

Definition at line 99 of file Scraper.py.

◆ URL_SOURCES_RULE_FEED_URL

string dc_processor.Scraper.URL_SOURCES_RULE_FEED_URL = 'f_url'

Definition at line 101 of file Scraper.py.

◆ URL_SOURCES_RULE_REDIRECT_URL

string dc_processor.Scraper.URL_SOURCES_RULE_REDIRECT_URL = 'r_url '

Definition at line 100 of file Scraper.py.

Classes

Variables