HCE Project Python language Distributed Tasks Manager Application, Distributed Crawler Application and client API bindings, version 2.0.0-chaika
Hierarchical Cluster Engine Python language binding
dc_crawler.CrawlerTask Namespace Reference

Classes

class  CrawlerTask
 

Variables

string DB_SITES = "dc_sites"
 
string DB_URLS = "dc_urls"
 
string MSG_ERROR_LOAD_CONFIG = "Error loading config file. Exciting. "
 
string MSG_ERROR_LOAD_OPTIONS = "Error loading options. Exciting. "
 
string MSG_ERROR_LOAD_LOG_CONFIG_FILE = "Can't load logging config file. Exiting. "
 
string MSG_ERROR_LOAD_SITE_DATA = "Can't load site data: "
 
string MSG_ERROR_UPDATE_SITE_DATA = "Can't update site data: "
 
string MSG_ERROR_LOAD_URL_DATA = "Can't load url data: "
 
string MSG_ERROR_PROCESS_BATCH_ITEM = "Can't process batch item "
 
string MSG_ERROR_WRITE_CRAWLED_DATA = "Can't write crawled data "
 
string MSG_ERROR_COLLECT_URLS = "Can't collect urls "
 
string MSG_ERROR_ADD_URL_TO_BATCH_ITEM = "Can't add url to batch item "
 
string MSG_ERROR_LOAD_SITE_PROPERTIES = "Can't load site properties "
 
string MSG_ERROR_CRAWL_SITE = "Can't crawl site "
 
string MSG_ERROR_CHECK_SITE = "Site don't passed check site "
 
string MSG_ERROR_GET_DIR = "Can't get dir "
 
string MSG_ERROR_READ_SITE_FROM_DB = "Can't read site data from db"
 
string MSG_ERROR_EMPTY_RESPONSE_SIZE = "Empty response"
 
string MSG_ERROR_NOT_EXIST_ANY_VALID_PROXY = "Not exist any valid proxy"
 
string MSG_ERROR_EMPTY_CONFIG_FILE_NAME = "Config file name is empty."
 
string MSG_ERROR_WRONG_CONFIG_FILE_NAME = "Config file name is wrong: %s"
 
string MSG_ERROR_LOAD_APP_CONFIG = "Error loading application config file. %s"
 
string MSG_ERROR_EXTRACT_BASE_URL = "Extract base url failed. Error: %s"
 
string MSG_INFO_PROCESS_BATCH = "ProcessBatch "
 
string MSG_INFO_STORE_COOKIES_FILE = "Store cookies file on disk."
 
string MSG_DEBUG_NON_PROCESSING = "ProcessorName is NONE. Exclude batch item from further processing."
 
string SITE_MD5_EMPTY = "d41d8cd98f00b204e9800998ecf8427e"
 
int DEFAULT_MAX_SIZE = 1000000
 
string EMPTY_RESPONSE_SIZE = "0"
 
string APP_NAME = "crawler-task"
 
string HTTP_COOKIE = "HTTP_COOKIE"
 
string DEFAULT_HTTP_COOKIE = ""
 
string HTTP_HEADERS = "HTTP_HEADERS"
 
string DEFAULT_HTTP_HEADER = ""
 
string DC_URLS_DB_NAME = "dc_urls"
 
string DC_URLS_TABLE_PREFIX = "urls_"
 
string DC_SITES_DB_NAME = "dc_sites"
 
string DC_SITES_PROPERTIES_TABLE_NAME = "sites_properties"
 
string DC_SITES_TABLE_NAME = "sites"
 
string DC_URLS_TABLE_NAME = "urls"
 
string COOKIES_FILE_POSTFIX = ".cookies.txt"
 
string NON_PROCESSING = "NONE"
 
string HTTP_REDIRECT = "<Response [301]>"
 
string HTML_REDIRECT = ""
 
int MAX_HTTP_REDIRECTS_UNLIMITED = 0
 
int MAX_HTML_REDIRECTS_UNLIMITED = 0
 
string META_XPATH = "//meta[contains(@content, 'url')]/@content"
 
 Results = namedtuple("Results", "exit_code, output, err")
 
 ROBOTS_PATTERN = re.compile(r'(https?://[^/]+).*', re.I)
 
 TEXT_CONTENT_TYPE_PATTERN = re.compile('text', re.I)
 
string ENV_CRAWLER_STORE_PATH = "ENV_CRAWLER_STORE_PATH"
 
string DETECT_MIME_MAIN_CONTENT = "1"
 
string RECOVER_IF_FAILED = "2"
 
int EXIT_SUCCESS = 0
 
int EXIT_FAILURE = 1
 

Variable Documentation

◆ APP_NAME

string dc_crawler.CrawlerTask.APP_NAME = "crawler-task"

Definition at line 129 of file CrawlerTask.py.

◆ COOKIES_FILE_POSTFIX

string dc_crawler.CrawlerTask.COOKIES_FILE_POSTFIX = ".cookies.txt"

Definition at line 142 of file CrawlerTask.py.

◆ DB_SITES

string dc_crawler.CrawlerTask.DB_SITES = "dc_sites"

Definition at line 93 of file CrawlerTask.py.

◆ DB_URLS

string dc_crawler.CrawlerTask.DB_URLS = "dc_urls"

Definition at line 94 of file CrawlerTask.py.

◆ DC_SITES_DB_NAME

string dc_crawler.CrawlerTask.DC_SITES_DB_NAME = "dc_sites"

Definition at line 138 of file CrawlerTask.py.

◆ DC_SITES_PROPERTIES_TABLE_NAME

string dc_crawler.CrawlerTask.DC_SITES_PROPERTIES_TABLE_NAME = "sites_properties"

Definition at line 139 of file CrawlerTask.py.

◆ DC_SITES_TABLE_NAME

string dc_crawler.CrawlerTask.DC_SITES_TABLE_NAME = "sites"

Definition at line 140 of file CrawlerTask.py.

◆ DC_URLS_DB_NAME

string dc_crawler.CrawlerTask.DC_URLS_DB_NAME = "dc_urls"

Definition at line 136 of file CrawlerTask.py.

◆ DC_URLS_TABLE_NAME

string dc_crawler.CrawlerTask.DC_URLS_TABLE_NAME = "urls"

Definition at line 141 of file CrawlerTask.py.

◆ DC_URLS_TABLE_PREFIX

string dc_crawler.CrawlerTask.DC_URLS_TABLE_PREFIX = "urls_"

Definition at line 137 of file CrawlerTask.py.

◆ DEFAULT_HTTP_COOKIE

string dc_crawler.CrawlerTask.DEFAULT_HTTP_COOKIE = ""

Definition at line 132 of file CrawlerTask.py.

◆ DEFAULT_HTTP_HEADER

string dc_crawler.CrawlerTask.DEFAULT_HTTP_HEADER = ""

Definition at line 134 of file CrawlerTask.py.

◆ DEFAULT_MAX_SIZE

int dc_crawler.CrawlerTask.DEFAULT_MAX_SIZE = 1000000

Definition at line 126 of file CrawlerTask.py.

◆ DETECT_MIME_MAIN_CONTENT

string dc_crawler.CrawlerTask.DETECT_MIME_MAIN_CONTENT = "1"

Definition at line 164 of file CrawlerTask.py.

◆ EMPTY_RESPONSE_SIZE

string dc_crawler.CrawlerTask.EMPTY_RESPONSE_SIZE = "0"

Definition at line 127 of file CrawlerTask.py.

◆ ENV_CRAWLER_STORE_PATH

string dc_crawler.CrawlerTask.ENV_CRAWLER_STORE_PATH = "ENV_CRAWLER_STORE_PATH"

Definition at line 158 of file CrawlerTask.py.

◆ EXIT_FAILURE

int dc_crawler.CrawlerTask.EXIT_FAILURE = 1

Definition at line 168 of file CrawlerTask.py.

◆ EXIT_SUCCESS

int dc_crawler.CrawlerTask.EXIT_SUCCESS = 0

Definition at line 167 of file CrawlerTask.py.

◆ HTML_REDIRECT

string dc_crawler.CrawlerTask.HTML_REDIRECT = ""

Definition at line 147 of file CrawlerTask.py.

◆ HTTP_COOKIE

string dc_crawler.CrawlerTask.HTTP_COOKIE = "HTTP_COOKIE"

Definition at line 131 of file CrawlerTask.py.

◆ HTTP_HEADERS

string dc_crawler.CrawlerTask.HTTP_HEADERS = "HTTP_HEADERS"

Definition at line 133 of file CrawlerTask.py.

◆ HTTP_REDIRECT

string dc_crawler.CrawlerTask.HTTP_REDIRECT = "<Response [301]>"

Definition at line 146 of file CrawlerTask.py.

◆ MAX_HTML_REDIRECTS_UNLIMITED

int dc_crawler.CrawlerTask.MAX_HTML_REDIRECTS_UNLIMITED = 0

Definition at line 149 of file CrawlerTask.py.

◆ MAX_HTTP_REDIRECTS_UNLIMITED

int dc_crawler.CrawlerTask.MAX_HTTP_REDIRECTS_UNLIMITED = 0

Definition at line 148 of file CrawlerTask.py.

◆ META_XPATH

string dc_crawler.CrawlerTask.META_XPATH = "//meta[contains(@content, 'url')]/@content"

Definition at line 150 of file CrawlerTask.py.

◆ MSG_DEBUG_NON_PROCESSING

string dc_crawler.CrawlerTask.MSG_DEBUG_NON_PROCESSING = "ProcessorName is NONE. Exclude batch item from further processing."

Definition at line 122 of file CrawlerTask.py.

◆ MSG_ERROR_ADD_URL_TO_BATCH_ITEM

string dc_crawler.CrawlerTask.MSG_ERROR_ADD_URL_TO_BATCH_ITEM = "Can't add url to batch item "

Definition at line 105 of file CrawlerTask.py.

◆ MSG_ERROR_CHECK_SITE

string dc_crawler.CrawlerTask.MSG_ERROR_CHECK_SITE = "Site don't passed check site "

Definition at line 108 of file CrawlerTask.py.

◆ MSG_ERROR_COLLECT_URLS

string dc_crawler.CrawlerTask.MSG_ERROR_COLLECT_URLS = "Can't collect urls "

Definition at line 104 of file CrawlerTask.py.

◆ MSG_ERROR_CRAWL_SITE

string dc_crawler.CrawlerTask.MSG_ERROR_CRAWL_SITE = "Can't crawl site "

Definition at line 107 of file CrawlerTask.py.

◆ MSG_ERROR_EMPTY_CONFIG_FILE_NAME

string dc_crawler.CrawlerTask.MSG_ERROR_EMPTY_CONFIG_FILE_NAME = "Config file name is empty."

Definition at line 113 of file CrawlerTask.py.

◆ MSG_ERROR_EMPTY_RESPONSE_SIZE

string dc_crawler.CrawlerTask.MSG_ERROR_EMPTY_RESPONSE_SIZE = "Empty response"

Definition at line 111 of file CrawlerTask.py.

◆ MSG_ERROR_EXTRACT_BASE_URL

string dc_crawler.CrawlerTask.MSG_ERROR_EXTRACT_BASE_URL = "Extract base url failed. Error: %s"

Definition at line 116 of file CrawlerTask.py.

◆ MSG_ERROR_GET_DIR

string dc_crawler.CrawlerTask.MSG_ERROR_GET_DIR = "Can't get dir "

Definition at line 109 of file CrawlerTask.py.

◆ MSG_ERROR_LOAD_APP_CONFIG

string dc_crawler.CrawlerTask.MSG_ERROR_LOAD_APP_CONFIG = "Error loading application config file. %s"

Definition at line 115 of file CrawlerTask.py.

◆ MSG_ERROR_LOAD_CONFIG

string dc_crawler.CrawlerTask.MSG_ERROR_LOAD_CONFIG = "Error loading config file. Exciting. "

Definition at line 96 of file CrawlerTask.py.

◆ MSG_ERROR_LOAD_LOG_CONFIG_FILE

string dc_crawler.CrawlerTask.MSG_ERROR_LOAD_LOG_CONFIG_FILE = "Can't load logging config file. Exiting. "

Definition at line 98 of file CrawlerTask.py.

◆ MSG_ERROR_LOAD_OPTIONS

string dc_crawler.CrawlerTask.MSG_ERROR_LOAD_OPTIONS = "Error loading options. Exciting. "

Definition at line 97 of file CrawlerTask.py.

◆ MSG_ERROR_LOAD_SITE_DATA

string dc_crawler.CrawlerTask.MSG_ERROR_LOAD_SITE_DATA = "Can't load site data: "

Definition at line 99 of file CrawlerTask.py.

◆ MSG_ERROR_LOAD_SITE_PROPERTIES

string dc_crawler.CrawlerTask.MSG_ERROR_LOAD_SITE_PROPERTIES = "Can't load site properties "

Definition at line 106 of file CrawlerTask.py.

◆ MSG_ERROR_LOAD_URL_DATA

string dc_crawler.CrawlerTask.MSG_ERROR_LOAD_URL_DATA = "Can't load url data: "

Definition at line 101 of file CrawlerTask.py.

◆ MSG_ERROR_NOT_EXIST_ANY_VALID_PROXY

string dc_crawler.CrawlerTask.MSG_ERROR_NOT_EXIST_ANY_VALID_PROXY = "Not exist any valid proxy"

Definition at line 112 of file CrawlerTask.py.

◆ MSG_ERROR_PROCESS_BATCH_ITEM

string dc_crawler.CrawlerTask.MSG_ERROR_PROCESS_BATCH_ITEM = "Can't process batch item "

Definition at line 102 of file CrawlerTask.py.

◆ MSG_ERROR_READ_SITE_FROM_DB

string dc_crawler.CrawlerTask.MSG_ERROR_READ_SITE_FROM_DB = "Can't read site data from db"

Definition at line 110 of file CrawlerTask.py.

◆ MSG_ERROR_UPDATE_SITE_DATA

string dc_crawler.CrawlerTask.MSG_ERROR_UPDATE_SITE_DATA = "Can't update site data: "

Definition at line 100 of file CrawlerTask.py.

◆ MSG_ERROR_WRITE_CRAWLED_DATA

string dc_crawler.CrawlerTask.MSG_ERROR_WRITE_CRAWLED_DATA = "Can't write crawled data "

Definition at line 103 of file CrawlerTask.py.

◆ MSG_ERROR_WRONG_CONFIG_FILE_NAME

string dc_crawler.CrawlerTask.MSG_ERROR_WRONG_CONFIG_FILE_NAME = "Config file name is wrong: %s"

Definition at line 114 of file CrawlerTask.py.

◆ MSG_INFO_PROCESS_BATCH

string dc_crawler.CrawlerTask.MSG_INFO_PROCESS_BATCH = "ProcessBatch "

Definition at line 119 of file CrawlerTask.py.

◆ MSG_INFO_STORE_COOKIES_FILE

string dc_crawler.CrawlerTask.MSG_INFO_STORE_COOKIES_FILE = "Store cookies file on disk."

Definition at line 120 of file CrawlerTask.py.

◆ NON_PROCESSING

string dc_crawler.CrawlerTask.NON_PROCESSING = "NONE"

Definition at line 144 of file CrawlerTask.py.

◆ RECOVER_IF_FAILED

string dc_crawler.CrawlerTask.RECOVER_IF_FAILED = "2"

Definition at line 165 of file CrawlerTask.py.

◆ Results

dc_crawler.CrawlerTask.Results = namedtuple("Results", "exit_code, output, err")

Definition at line 152 of file CrawlerTask.py.

◆ ROBOTS_PATTERN

dc_crawler.CrawlerTask.ROBOTS_PATTERN = re.compile(r'(https?://[^/]+).*', re.I)

Definition at line 154 of file CrawlerTask.py.

◆ SITE_MD5_EMPTY

string dc_crawler.CrawlerTask.SITE_MD5_EMPTY = "d41d8cd98f00b204e9800998ecf8427e"

Definition at line 124 of file CrawlerTask.py.

◆ TEXT_CONTENT_TYPE_PATTERN

dc_crawler.CrawlerTask.TEXT_CONTENT_TYPE_PATTERN = re.compile('text', re.I)

Definition at line 156 of file CrawlerTask.py.