HCE Project Python language Distributed Tasks Manager Application, Distributed Crawler Application and client API bindings.  2.0.0-chaika
Hierarchical Cluster Engine Python language binding
dc_processor.Scraper Namespace Reference

Classes

class  Scraper
 

Variables

string APP_NAME = "scraper"
 
string MSG_ERROR_LOAD_CONFIG = "Error loading config file. Exciting."
 
string MSG_ERROR_LOAD_LOG_CONFIG_FILE = "Error loading logging config file. Exiting."
 
string MSG_ERROR_LOAD_EXTRACTORS = "Error load extractors "
 
string MSG_ERROR_TEMPLATE_EXTRACTION = "Error template extraction "
 
string MSG_ERROR_DYNAMIC_EXTRACTION = "Error dynamic extraction "
 
string MSG_ERROR_LOAD_DB_BACKEND = "Error load db backend"
 
string MSG_ERROR_LOAD_OPTIONS = "Error load options"
 
string MSG_INFO_PREPARE_CONTENT = "Prepare content: "
 
string MSG_ERROR_ADJUST_PR = "Error adjust partial references. "
 
string MSG_ERROR_ADJUST_PUBDATE = "PUBDATE_ERROR "
 
string MSG_ERROR_ADJUST_TITLE = "Can't adjust title. "
 
int EXIT_SUCCESS = 0
 
int EXIT_FAILURE = 1
 
int SQLITE_TIMEOUT = 30
 
string ENV_SCRAPER_STORE_PATH = "ENV_SCRAPER_STORE_PATH"
 
list CONTENT_REPLACEMENT_LIST = ['\n', '\r\n', '\t', ' ', '<br>', '<p>', '</p>']
 
int DEFAULT_TAG_REDUCE_MASK = 65535
 
dictionary EXTENDED_NEWS_TAGS = {"description": ["//meta[@name='description']//@content"]}
 
list LINKS_NEWS_TAGS = [CONSTS.TAG_MEDIA, CONSTS.TAG_LINK, CONSTS.TAG_MEDIA_CONTENT, "links", "href"]
 
list DATA_NEWS_TAGS = []
 
list TAGS_DATETIME_NEWS_NAMES = [CONSTS.TAG_PUB_DATE, CONSTS.TAG_DC_DATE]
 
list TAGS_DATETIME_TEMPLATE_TYPES = [CONSTS.TAG_TYPE_DATETIME]
 
string OPTION_SECTION_DATETIME_NEWS_NAMES = 'tags_datetime_news_names'
 
string OPTION_SECTION_DATETIME_TEMPLATE_TYPES = 'tags_datetime_template_types'
 
string OPTION_SECTION_TAGS_TYPE = 'tagsTypes'
 
string OPTION_SECTION_URL_SOURCES_RULES = 'urlSourcesRules'
 
string URL_SOURCES_RULE_DATA_URL = 'd_url'
 
string URL_SOURCES_RULE_REDIRECT_URL = 'r_url '
 
string URL_SOURCES_RULE_FEED_URL = 'f_url'
 

Detailed Description

@package docstring
 @file Scraper.py
 @author Alexey <developers.hce@gmail.com>, scorp, bgv
 @link http://hierarchical-cluster-engine.com/
 @copyright Copyright © 2013-2016 IOIX Ukraine
 @license http://hierarchical-cluster-engine.com/license/
 @package HCE project node API
 @since 0.1

Variable Documentation

◆ APP_NAME

string dc_processor.Scraper.APP_NAME = "scraper"

Definition at line 61 of file Scraper.py.

◆ CONTENT_REPLACEMENT_LIST

list dc_processor.Scraper.CONTENT_REPLACEMENT_LIST = ['\n', '\r\n', '\t', ' ', '<br>', '<p>', '</p>']

Definition at line 82 of file Scraper.py.

◆ DATA_NEWS_TAGS

list dc_processor.Scraper.DATA_NEWS_TAGS = []

Definition at line 88 of file Scraper.py.

◆ DEFAULT_TAG_REDUCE_MASK

int dc_processor.Scraper.DEFAULT_TAG_REDUCE_MASK = 65535

Definition at line 83 of file Scraper.py.

◆ ENV_SCRAPER_STORE_PATH

string dc_processor.Scraper.ENV_SCRAPER_STORE_PATH = "ENV_SCRAPER_STORE_PATH"

Definition at line 80 of file Scraper.py.

◆ EXIT_FAILURE

int dc_processor.Scraper.EXIT_FAILURE = 1

Definition at line 76 of file Scraper.py.

◆ EXIT_SUCCESS

int dc_processor.Scraper.EXIT_SUCCESS = 0

Definition at line 75 of file Scraper.py.

◆ EXTENDED_NEWS_TAGS

dictionary dc_processor.Scraper.EXTENDED_NEWS_TAGS = {"description": ["//meta[@name='description']//@content"]}

Definition at line 85 of file Scraper.py.

◆ LINKS_NEWS_TAGS

list dc_processor.Scraper.LINKS_NEWS_TAGS = [CONSTS.TAG_MEDIA, CONSTS.TAG_LINK, CONSTS.TAG_MEDIA_CONTENT, "links", "href"]

Definition at line 86 of file Scraper.py.

◆ MSG_ERROR_ADJUST_PR

string dc_processor.Scraper.MSG_ERROR_ADJUST_PR = "Error adjust partial references. "

Definition at line 71 of file Scraper.py.

◆ MSG_ERROR_ADJUST_PUBDATE

string dc_processor.Scraper.MSG_ERROR_ADJUST_PUBDATE = "PUBDATE_ERROR "

Definition at line 72 of file Scraper.py.

◆ MSG_ERROR_ADJUST_TITLE

string dc_processor.Scraper.MSG_ERROR_ADJUST_TITLE = "Can't adjust title. "

Definition at line 73 of file Scraper.py.

◆ MSG_ERROR_DYNAMIC_EXTRACTION

string dc_processor.Scraper.MSG_ERROR_DYNAMIC_EXTRACTION = "Error dynamic extraction "

Definition at line 67 of file Scraper.py.

◆ MSG_ERROR_LOAD_CONFIG

string dc_processor.Scraper.MSG_ERROR_LOAD_CONFIG = "Error loading config file. Exciting."

Definition at line 63 of file Scraper.py.

◆ MSG_ERROR_LOAD_DB_BACKEND

string dc_processor.Scraper.MSG_ERROR_LOAD_DB_BACKEND = "Error load db backend"

Definition at line 68 of file Scraper.py.

◆ MSG_ERROR_LOAD_EXTRACTORS

string dc_processor.Scraper.MSG_ERROR_LOAD_EXTRACTORS = "Error load extractors "

Definition at line 65 of file Scraper.py.

◆ MSG_ERROR_LOAD_LOG_CONFIG_FILE

string dc_processor.Scraper.MSG_ERROR_LOAD_LOG_CONFIG_FILE = "Error loading logging config file. Exiting."

Definition at line 64 of file Scraper.py.

◆ MSG_ERROR_LOAD_OPTIONS

string dc_processor.Scraper.MSG_ERROR_LOAD_OPTIONS = "Error load options"

Definition at line 69 of file Scraper.py.

◆ MSG_ERROR_TEMPLATE_EXTRACTION

string dc_processor.Scraper.MSG_ERROR_TEMPLATE_EXTRACTION = "Error template extraction "

Definition at line 66 of file Scraper.py.

◆ MSG_INFO_PREPARE_CONTENT

string dc_processor.Scraper.MSG_INFO_PREPARE_CONTENT = "Prepare content: "

Definition at line 70 of file Scraper.py.

◆ OPTION_SECTION_DATETIME_NEWS_NAMES

string dc_processor.Scraper.OPTION_SECTION_DATETIME_NEWS_NAMES = 'tags_datetime_news_names'

Definition at line 93 of file Scraper.py.

◆ OPTION_SECTION_DATETIME_TEMPLATE_TYPES

string dc_processor.Scraper.OPTION_SECTION_DATETIME_TEMPLATE_TYPES = 'tags_datetime_template_types'

Definition at line 94 of file Scraper.py.

◆ OPTION_SECTION_TAGS_TYPE

string dc_processor.Scraper.OPTION_SECTION_TAGS_TYPE = 'tagsTypes'

Definition at line 96 of file Scraper.py.

◆ OPTION_SECTION_URL_SOURCES_RULES

string dc_processor.Scraper.OPTION_SECTION_URL_SOURCES_RULES = 'urlSourcesRules'

Definition at line 98 of file Scraper.py.

◆ SQLITE_TIMEOUT

int dc_processor.Scraper.SQLITE_TIMEOUT = 30

Definition at line 78 of file Scraper.py.

◆ TAGS_DATETIME_NEWS_NAMES

list dc_processor.Scraper.TAGS_DATETIME_NEWS_NAMES = [CONSTS.TAG_PUB_DATE, CONSTS.TAG_DC_DATE]

Definition at line 90 of file Scraper.py.

◆ TAGS_DATETIME_TEMPLATE_TYPES

list dc_processor.Scraper.TAGS_DATETIME_TEMPLATE_TYPES = [CONSTS.TAG_TYPE_DATETIME]

Definition at line 91 of file Scraper.py.

◆ URL_SOURCES_RULE_DATA_URL

string dc_processor.Scraper.URL_SOURCES_RULE_DATA_URL = 'd_url'

Definition at line 99 of file Scraper.py.

◆ URL_SOURCES_RULE_FEED_URL

string dc_processor.Scraper.URL_SOURCES_RULE_FEED_URL = 'f_url'

Definition at line 101 of file Scraper.py.

◆ URL_SOURCES_RULE_REDIRECT_URL

string dc_processor.Scraper.URL_SOURCES_RULE_REDIRECT_URL = 'r_url '

Definition at line 100 of file Scraper.py.