HCE Project: Python-language Distributed Tasks Manager Application, Distributed Crawler Application, and client API bindings.
2.0.0-chaika
Hierarchical Cluster Engine Python language binding
|
Classes | |
class | CrawlerTask |
Variables | |
string | DB_SITES = "dc_sites" |
string | DB_URLS = "dc_urls" |
string | MSG_ERROR_LOAD_CONFIG = "Error loading config file. Exiting. " |
string | MSG_ERROR_LOAD_OPTIONS = "Error loading options. Exiting. " |
string | MSG_ERROR_LOAD_LOG_CONFIG_FILE = "Can't load logging config file. Exiting. " |
string | MSG_ERROR_LOAD_SITE_DATA = "Can't load site data: " |
string | MSG_ERROR_UPDATE_SITE_DATA = "Can't update site data: " |
string | MSG_ERROR_LOAD_URL_DATA = "Can't load url data: " |
string | MSG_ERROR_PROCESS_BATCH_ITEM = "Can't process batch item " |
string | MSG_ERROR_WRITE_CRAWLED_DATA = "Can't write crawled data " |
string | MSG_ERROR_COLLECT_URLS = "Can't collect urls " |
string | MSG_ERROR_ADD_URL_TO_BATCH_ITEM = "Can't add url to batch item " |
string | MSG_ERROR_LOAD_SITE_PROPERTIES = "Can't load site properties " |
string | MSG_ERROR_CRAWL_SITE = "Can't crawl site " |
string | MSG_ERROR_CHECK_SITE = "Site don't passed check site " |
string | MSG_ERROR_GET_DIR = "Can't get dir " |
string | MSG_ERROR_READ_SITE_FROM_DB = "Can't read site data from db" |
string | MSG_ERROR_EMPTY_RESPONSE_SIZE = "Empty response" |
string | MSG_ERROR_NOT_EXIST_ANY_VALID_PROXY = "Not exist any valid proxy" |
string | MSG_ERROR_EMPTY_CONFIG_FILE_NAME = "Config file name is empty." |
string | MSG_ERROR_WRONG_CONFIG_FILE_NAME = "Config file name is wrong: %s" |
string | MSG_ERROR_LOAD_APP_CONFIG = "Error loading application config file. %s" |
string | MSG_ERROR_EXTRACT_BASE_URL = "Extract base url failed. Error: %s" |
string | MSG_INFO_PROCESS_BATCH = "ProcessBatch " |
string | MSG_INFO_STORE_COOKIES_FILE = "Store cookies file on disk." |
string | MSG_DEBUG_NON_PROCESSING = "ProcessorName is NONE. Exclude batch item from further processing." |
string | SITE_MD5_EMPTY = "d41d8cd98f00b204e9800998ecf8427e" |
int | DEFAULT_MAX_SIZE = 1000000 |
string | EMPTY_RESPONSE_SIZE = "0" |
string | APP_NAME = "crawler-task" |
string | HTTP_COOKIE = "HTTP_COOKIE" |
string | DEFAULT_HTTP_COOKIE = "" |
string | HTTP_HEADERS = "HTTP_HEADERS" |
string | DEFAULT_HTTP_HEADER = "" |
string | DC_URLS_DB_NAME = "dc_urls" |
string | DC_URLS_TABLE_PREFIX = "urls_" |
string | DC_SITES_DB_NAME = "dc_sites" |
string | DC_SITES_PROPERTIES_TABLE_NAME = "sites_properties" |
string | DC_SITES_TABLE_NAME = "sites" |
string | DC_URLS_TABLE_NAME = "urls" |
string | COOKIES_FILE_POSTFIX = ".cookies.txt" |
string | NON_PROCESSING = "NONE" |
string | HTTP_REDIRECT = "<Response [301]>" |
string | HTML_REDIRECT = "" |
int | MAX_HTTP_REDIRECTS_UNLIMITED = 0 |
int | MAX_HTML_REDIRECTS_UNLIMITED = 0 |
string | META_XPATH = "//meta[contains(@content, 'url')]/@content" |
Results = namedtuple("Results", "exit_code, output, err") | |
ROBOTS_PATTERN = re.compile(r'(https?://[^/]+).*', re.I) | |
TEXT_CONTENT_TYPE_PATTERN = re.compile('text', re.I) | |
string | ENV_CRAWLER_STORE_PATH = "ENV_CRAWLER_STORE_PATH" |
string | DETECT_MIME_MAIN_CONTENT = "1" |
string | RECOVER_IF_FAILED = "2" |
int | EXIT_SUCCESS = 0 |
int | EXIT_FAILURE = 1 |
string dc_crawler.CrawlerTask.APP_NAME = "crawler-task" |
Definition at line 129 of file CrawlerTask.py.
string dc_crawler.CrawlerTask.COOKIES_FILE_POSTFIX = ".cookies.txt" |
Definition at line 142 of file CrawlerTask.py.
string dc_crawler.CrawlerTask.DB_SITES = "dc_sites" |
Definition at line 93 of file CrawlerTask.py.
string dc_crawler.CrawlerTask.DB_URLS = "dc_urls" |
Definition at line 94 of file CrawlerTask.py.
string dc_crawler.CrawlerTask.DC_SITES_DB_NAME = "dc_sites" |
Definition at line 138 of file CrawlerTask.py.
string dc_crawler.CrawlerTask.DC_SITES_PROPERTIES_TABLE_NAME = "sites_properties" |
Definition at line 139 of file CrawlerTask.py.
string dc_crawler.CrawlerTask.DC_SITES_TABLE_NAME = "sites" |
Definition at line 140 of file CrawlerTask.py.
string dc_crawler.CrawlerTask.DC_URLS_DB_NAME = "dc_urls" |
Definition at line 136 of file CrawlerTask.py.
string dc_crawler.CrawlerTask.DC_URLS_TABLE_NAME = "urls" |
Definition at line 141 of file CrawlerTask.py.
string dc_crawler.CrawlerTask.DC_URLS_TABLE_PREFIX = "urls_" |
Definition at line 137 of file CrawlerTask.py.
string dc_crawler.CrawlerTask.DEFAULT_HTTP_COOKIE = "" |
Definition at line 132 of file CrawlerTask.py.
string dc_crawler.CrawlerTask.DEFAULT_HTTP_HEADER = "" |
Definition at line 134 of file CrawlerTask.py.
int dc_crawler.CrawlerTask.DEFAULT_MAX_SIZE = 1000000 |
Definition at line 126 of file CrawlerTask.py.
string dc_crawler.CrawlerTask.DETECT_MIME_MAIN_CONTENT = "1" |
Definition at line 164 of file CrawlerTask.py.
string dc_crawler.CrawlerTask.EMPTY_RESPONSE_SIZE = "0" |
Definition at line 127 of file CrawlerTask.py.
string dc_crawler.CrawlerTask.ENV_CRAWLER_STORE_PATH = "ENV_CRAWLER_STORE_PATH" |
Definition at line 158 of file CrawlerTask.py.
int dc_crawler.CrawlerTask.EXIT_FAILURE = 1 |
Definition at line 168 of file CrawlerTask.py.
int dc_crawler.CrawlerTask.EXIT_SUCCESS = 0 |
Definition at line 167 of file CrawlerTask.py.
string dc_crawler.CrawlerTask.HTML_REDIRECT = "" |
Definition at line 147 of file CrawlerTask.py.
string dc_crawler.CrawlerTask.HTTP_COOKIE = "HTTP_COOKIE" |
Definition at line 131 of file CrawlerTask.py.
string dc_crawler.CrawlerTask.HTTP_HEADERS = "HTTP_HEADERS" |
Definition at line 133 of file CrawlerTask.py.
string dc_crawler.CrawlerTask.HTTP_REDIRECT = "<Response [301]>" |
Definition at line 146 of file CrawlerTask.py.
int dc_crawler.CrawlerTask.MAX_HTML_REDIRECTS_UNLIMITED = 0 |
Definition at line 149 of file CrawlerTask.py.
int dc_crawler.CrawlerTask.MAX_HTTP_REDIRECTS_UNLIMITED = 0 |
Definition at line 148 of file CrawlerTask.py.
string dc_crawler.CrawlerTask.META_XPATH = "//meta[contains(@content, 'url')]/@content" |
Definition at line 150 of file CrawlerTask.py.
string dc_crawler.CrawlerTask.MSG_DEBUG_NON_PROCESSING = "ProcessorName is NONE. Exclude batch item from further processing." |
Definition at line 122 of file CrawlerTask.py.
string dc_crawler.CrawlerTask.MSG_ERROR_ADD_URL_TO_BATCH_ITEM = "Can't add url to batch item " |
Definition at line 105 of file CrawlerTask.py.
string dc_crawler.CrawlerTask.MSG_ERROR_CHECK_SITE = "Site don't passed check site " |
Definition at line 108 of file CrawlerTask.py.
string dc_crawler.CrawlerTask.MSG_ERROR_COLLECT_URLS = "Can't collect urls " |
Definition at line 104 of file CrawlerTask.py.
string dc_crawler.CrawlerTask.MSG_ERROR_CRAWL_SITE = "Can't crawl site " |
Definition at line 107 of file CrawlerTask.py.
string dc_crawler.CrawlerTask.MSG_ERROR_EMPTY_CONFIG_FILE_NAME = "Config file name is empty." |
Definition at line 113 of file CrawlerTask.py.
string dc_crawler.CrawlerTask.MSG_ERROR_EMPTY_RESPONSE_SIZE = "Empty response" |
Definition at line 111 of file CrawlerTask.py.
string dc_crawler.CrawlerTask.MSG_ERROR_EXTRACT_BASE_URL = "Extract base url failed. Error: %s" |
Definition at line 116 of file CrawlerTask.py.
string dc_crawler.CrawlerTask.MSG_ERROR_GET_DIR = "Can't get dir " |
Definition at line 109 of file CrawlerTask.py.
string dc_crawler.CrawlerTask.MSG_ERROR_LOAD_APP_CONFIG = "Error loading application config file. %s" |
Definition at line 115 of file CrawlerTask.py.
string dc_crawler.CrawlerTask.MSG_ERROR_LOAD_CONFIG = "Error loading config file. Exiting. " |
Definition at line 96 of file CrawlerTask.py.
string dc_crawler.CrawlerTask.MSG_ERROR_LOAD_LOG_CONFIG_FILE = "Can't load logging config file. Exiting. " |
Definition at line 98 of file CrawlerTask.py.
string dc_crawler.CrawlerTask.MSG_ERROR_LOAD_OPTIONS = "Error loading options. Exiting. " |
Definition at line 97 of file CrawlerTask.py.
string dc_crawler.CrawlerTask.MSG_ERROR_LOAD_SITE_DATA = "Can't load site data: " |
Definition at line 99 of file CrawlerTask.py.
string dc_crawler.CrawlerTask.MSG_ERROR_LOAD_SITE_PROPERTIES = "Can't load site properties " |
Definition at line 106 of file CrawlerTask.py.
string dc_crawler.CrawlerTask.MSG_ERROR_LOAD_URL_DATA = "Can't load url data: " |
Definition at line 101 of file CrawlerTask.py.
string dc_crawler.CrawlerTask.MSG_ERROR_NOT_EXIST_ANY_VALID_PROXY = "Not exist any valid proxy" |
Definition at line 112 of file CrawlerTask.py.
string dc_crawler.CrawlerTask.MSG_ERROR_PROCESS_BATCH_ITEM = "Can't process batch item " |
Definition at line 102 of file CrawlerTask.py.
string dc_crawler.CrawlerTask.MSG_ERROR_READ_SITE_FROM_DB = "Can't read site data from db" |
Definition at line 110 of file CrawlerTask.py.
string dc_crawler.CrawlerTask.MSG_ERROR_UPDATE_SITE_DATA = "Can't update site data: " |
Definition at line 100 of file CrawlerTask.py.
string dc_crawler.CrawlerTask.MSG_ERROR_WRITE_CRAWLED_DATA = "Can't write crawled data " |
Definition at line 103 of file CrawlerTask.py.
string dc_crawler.CrawlerTask.MSG_ERROR_WRONG_CONFIG_FILE_NAME = "Config file name is wrong: %s" |
Definition at line 114 of file CrawlerTask.py.
string dc_crawler.CrawlerTask.MSG_INFO_PROCESS_BATCH = "ProcessBatch " |
Definition at line 119 of file CrawlerTask.py.
string dc_crawler.CrawlerTask.MSG_INFO_STORE_COOKIES_FILE = "Store cookies file on disk." |
Definition at line 120 of file CrawlerTask.py.
string dc_crawler.CrawlerTask.NON_PROCESSING = "NONE" |
Definition at line 144 of file CrawlerTask.py.
string dc_crawler.CrawlerTask.RECOVER_IF_FAILED = "2" |
Definition at line 165 of file CrawlerTask.py.
dc_crawler.CrawlerTask.Results = namedtuple("Results", "exit_code, output, err") |
Definition at line 152 of file CrawlerTask.py.
dc_crawler.CrawlerTask.ROBOTS_PATTERN = re.compile(r'(https?://[^/]+).*', re.I) |
Definition at line 154 of file CrawlerTask.py.
string dc_crawler.CrawlerTask.SITE_MD5_EMPTY = "d41d8cd98f00b204e9800998ecf8427e" |
Definition at line 124 of file CrawlerTask.py.
dc_crawler.CrawlerTask.TEXT_CONTENT_TYPE_PATTERN = re.compile('text', re.I) |
Definition at line 156 of file CrawlerTask.py.