HCE Project Python language Distributed Tasks Manager Application, Distributed Crawler Application and client API bindings.
2.0.0-chaika
Hierarchical Cluster Engine Python language binding
|
Classes | |
class | Scraper |
Variables | |
string | APP_NAME = "scraper" |
string | MSG_ERROR_LOAD_CONFIG = "Error loading config file. Exciting." |
string | MSG_ERROR_LOAD_LOG_CONFIG_FILE = "Error loading logging config file. Exiting." |
string | MSG_ERROR_LOAD_EXTRACTORS = "Error load extractors " |
string | MSG_ERROR_TEMPLATE_EXTRACTION = "Error template extraction " |
string | MSG_ERROR_DYNAMIC_EXTRACTION = "Error dynamic extraction " |
string | MSG_ERROR_LOAD_DB_BACKEND = "Error load db backend" |
string | MSG_ERROR_LOAD_OPTIONS = "Error load options" |
string | MSG_INFO_PREPARE_CONTENT = "Prepare content: " |
string | MSG_ERROR_ADJUST_PR = "Error adjust partial references. " |
string | MSG_ERROR_ADJUST_PUBDATE = "PUBDATE_ERROR " |
string | MSG_ERROR_ADJUST_TITLE = "Can't adjust title. " |
int | EXIT_SUCCESS = 0 |
int | EXIT_FAILURE = 1 |
int | SQLITE_TIMEOUT = 30 |
string | ENV_SCRAPER_STORE_PATH = "ENV_SCRAPER_STORE_PATH" |
list | CONTENT_REPLACEMENT_LIST = ['\n', '\r\n', '\t', ' ', '<br>', '<p>', '</p>'] |
int | DEFAULT_TAG_REDUCE_MASK = 65535 |
dictionary | EXTENDED_NEWS_TAGS = {"description": ["//meta[@name='description']//@content"]} |
list | LINKS_NEWS_TAGS = [CONSTS.TAG_MEDIA, CONSTS.TAG_LINK, CONSTS.TAG_MEDIA_CONTENT, "links", "href"] |
list | DATA_NEWS_TAGS = [] |
list | TAGS_DATETIME_NEWS_NAMES = [CONSTS.TAG_PUB_DATE, CONSTS.TAG_DC_DATE] |
list | TAGS_DATETIME_TEMPLATE_TYPES = [CONSTS.TAG_TYPE_DATETIME] |
string | OPTION_SECTION_DATETIME_NEWS_NAMES = 'tags_datetime_news_names' |
string | OPTION_SECTION_DATETIME_TEMPLATE_TYPES = 'tags_datetime_template_types' |
string | OPTION_SECTION_TAGS_TYPE = 'tagsTypes' |
string | OPTION_SECTION_URL_SOURCES_RULES = 'urlSourcesRules' |
string | URL_SOURCES_RULE_DATA_URL = 'd_url' |
string | URL_SOURCES_RULE_REDIRECT_URL = 'r_url ' |
string | URL_SOURCES_RULE_FEED_URL = 'f_url' |
@package docstring
@file Scraper.py
@author Alexey <developers.hce@gmail.com>, scorp, bgv
@link http://hierarchical-cluster-engine.com/
@copyright Copyright © 2013-2016 IOIX Ukraine
@license http://hierarchical-cluster-engine.com/license/
@package HCE project node API
@since 0.1
string dc_processor.Scraper.APP_NAME = "scraper" |
Definition at line 61 of file Scraper.py.
list dc_processor.Scraper.CONTENT_REPLACEMENT_LIST = ['\n', '\r\n', '\t', ' ', '<br>', '<p>', '</p>'] |
Definition at line 82 of file Scraper.py.
list dc_processor.Scraper.DATA_NEWS_TAGS = [] |
Definition at line 88 of file Scraper.py.
int dc_processor.Scraper.DEFAULT_TAG_REDUCE_MASK = 65535 |
Definition at line 83 of file Scraper.py.
string dc_processor.Scraper.ENV_SCRAPER_STORE_PATH = "ENV_SCRAPER_STORE_PATH" |
Definition at line 80 of file Scraper.py.
int dc_processor.Scraper.EXIT_FAILURE = 1 |
Definition at line 76 of file Scraper.py.
int dc_processor.Scraper.EXIT_SUCCESS = 0 |
Definition at line 75 of file Scraper.py.
dictionary dc_processor.Scraper.EXTENDED_NEWS_TAGS = {"description": ["//meta[@name='description']//@content"]} |
Definition at line 85 of file Scraper.py.
list dc_processor.Scraper.LINKS_NEWS_TAGS = [CONSTS.TAG_MEDIA, CONSTS.TAG_LINK, CONSTS.TAG_MEDIA_CONTENT, "links", "href"] |
Definition at line 86 of file Scraper.py.
string dc_processor.Scraper.MSG_ERROR_ADJUST_PR = "Error adjust partial references. " |
Definition at line 71 of file Scraper.py.
string dc_processor.Scraper.MSG_ERROR_ADJUST_PUBDATE = "PUBDATE_ERROR " |
Definition at line 72 of file Scraper.py.
string dc_processor.Scraper.MSG_ERROR_ADJUST_TITLE = "Can't adjust title. " |
Definition at line 73 of file Scraper.py.
string dc_processor.Scraper.MSG_ERROR_DYNAMIC_EXTRACTION = "Error dynamic extraction " |
Definition at line 67 of file Scraper.py.
string dc_processor.Scraper.MSG_ERROR_LOAD_CONFIG = "Error loading config file. Exciting." |
Definition at line 63 of file Scraper.py.
string dc_processor.Scraper.MSG_ERROR_LOAD_DB_BACKEND = "Error load db backend" |
Definition at line 68 of file Scraper.py.
string dc_processor.Scraper.MSG_ERROR_LOAD_EXTRACTORS = "Error load extractors " |
Definition at line 65 of file Scraper.py.
string dc_processor.Scraper.MSG_ERROR_LOAD_LOG_CONFIG_FILE = "Error loading logging config file. Exiting." |
Definition at line 64 of file Scraper.py.
string dc_processor.Scraper.MSG_ERROR_LOAD_OPTIONS = "Error load options" |
Definition at line 69 of file Scraper.py.
string dc_processor.Scraper.MSG_ERROR_TEMPLATE_EXTRACTION = "Error template extraction " |
Definition at line 66 of file Scraper.py.
string dc_processor.Scraper.MSG_INFO_PREPARE_CONTENT = "Prepare content: " |
Definition at line 70 of file Scraper.py.
string dc_processor.Scraper.OPTION_SECTION_DATETIME_NEWS_NAMES = 'tags_datetime_news_names' |
Definition at line 93 of file Scraper.py.
string dc_processor.Scraper.OPTION_SECTION_DATETIME_TEMPLATE_TYPES = 'tags_datetime_template_types' |
Definition at line 94 of file Scraper.py.
string dc_processor.Scraper.OPTION_SECTION_TAGS_TYPE = 'tagsTypes' |
Definition at line 96 of file Scraper.py.
string dc_processor.Scraper.OPTION_SECTION_URL_SOURCES_RULES = 'urlSourcesRules' |
Definition at line 98 of file Scraper.py.
int dc_processor.Scraper.SQLITE_TIMEOUT = 30 |
Definition at line 78 of file Scraper.py.
list dc_processor.Scraper.TAGS_DATETIME_NEWS_NAMES = [CONSTS.TAG_PUB_DATE, CONSTS.TAG_DC_DATE] |
Definition at line 90 of file Scraper.py.
list dc_processor.Scraper.TAGS_DATETIME_TEMPLATE_TYPES = [CONSTS.TAG_TYPE_DATETIME] |
Definition at line 91 of file Scraper.py.
string dc_processor.Scraper.URL_SOURCES_RULE_DATA_URL = 'd_url' |
Definition at line 99 of file Scraper.py.
string dc_processor.Scraper.URL_SOURCES_RULE_FEED_URL = 'f_url' |
Definition at line 101 of file Scraper.py.
string dc_processor.Scraper.URL_SOURCES_RULE_REDIRECT_URL = 'r_url ' |
Definition at line 100 of file Scraper.py.