Classes
class	CrawlerTask

Variables
string	DB_SITES = "dc_sites"

string	DB_URLS = "dc_urls"

string	MSG_ERROR_LOAD_CONFIG = "Error loading config file. Exciting. "

string	MSG_ERROR_LOAD_OPTIONS = "Error loading options. Exciting. "

string	MSG_ERROR_LOAD_LOG_CONFIG_FILE = "Can't load logging config file. Exiting. "

string	MSG_ERROR_LOAD_SITE_DATA = "Can't load site data: "

string	MSG_ERROR_UPDATE_SITE_DATA = "Can't update site data: "

string	MSG_ERROR_LOAD_URL_DATA = "Can't load url data: "

string	MSG_ERROR_PROCESS_BATCH_ITEM = "Can't process batch item "

string	MSG_ERROR_WRITE_CRAWLED_DATA = "Can't write crawled data "

string	MSG_ERROR_COLLECT_URLS = "Can't collect urls "

string	MSG_ERROR_ADD_URL_TO_BATCH_ITEM = "Can't add url to batch item "

string	MSG_ERROR_LOAD_SITE_PROPERTIES = "Can't load site properties "

string	MSG_ERROR_CRAWL_SITE = "Can't crawl site "

string	MSG_ERROR_CHECK_SITE = "Site don't passed check site "

string	MSG_ERROR_GET_DIR = "Can't get dir "

string	MSG_ERROR_READ_SITE_FROM_DB = "Can't read site data from db"

string	MSG_ERROR_EMPTY_RESPONSE_SIZE = "Empty response"

string	MSG_ERROR_NOT_EXIST_ANY_VALID_PROXY = "Not exist any valid proxy"

string	MSG_ERROR_EMPTY_CONFIG_FILE_NAME = "Config file name is empty."

string	MSG_ERROR_WRONG_CONFIG_FILE_NAME = "Config file name is wrong: %s"

string	MSG_ERROR_LOAD_APP_CONFIG = "Error loading application config file. %s"

string	MSG_ERROR_EXTRACT_BASE_URL = "Extract base url failed. Error: %s"

string	MSG_INFO_PROCESS_BATCH = "ProcessBatch "

string	MSG_INFO_STORE_COOKIES_FILE = "Store cookies file on disk."

string	MSG_DEBUG_NON_PROCESSING = "ProcessorName is NONE. Exclude batch item from further processing."

string	SITE_MD5_EMPTY = "d41d8cd98f00b204e9800998ecf8427e"

int	DEFAULT_MAX_SIZE = 1000000

string	EMPTY_RESPONSE_SIZE = "0"

string	APP_NAME = "crawler-task"

string	HTTP_COOKIE = "HTTP_COOKIE"

string	DEFAULT_HTTP_COOKIE = ""

string	HTTP_HEADERS = "HTTP_HEADERS"

string	DEFAULT_HTTP_HEADER = ""

string	DC_URLS_DB_NAME = "dc_urls"

string	DC_URLS_TABLE_PREFIX = "urls_"

string	DC_SITES_DB_NAME = "dc_sites"

string	DC_SITES_PROPERTIES_TABLE_NAME = "sites_properties"

string	DC_SITES_TABLE_NAME = "sites"

string	DC_URLS_TABLE_NAME = "urls"

string	COOKIES_FILE_POSTFIX = ".cookies.txt"

string	NON_PROCESSING = "NONE"

string	HTTP_REDIRECT = "<Response [301]>"

string	HTML_REDIRECT = ""

int	MAX_HTTP_REDIRECTS_UNLIMITED = 0

int	MAX_HTML_REDIRECTS_UNLIMITED = 0

string	META_XPATH = "//meta[contains(@content, 'url')]/@content"

	Results = namedtuple("Results", "exit_code, output, err")

	ROBOTS_PATTERN = re.compile(r'(https?://[^/]+).*', re.I)

	TEXT_CONTENT_TYPE_PATTERN = re.compile('text', re.I)

string	ENV_CRAWLER_STORE_PATH = "ENV_CRAWLER_STORE_PATH"

string	DETECT_MIME_MAIN_CONTENT = "1"

string	RECOVER_IF_FAILED = "2"

int	EXIT_SUCCESS = 0

int	EXIT_FAILURE = 1

Variable Documentation

◆ APP_NAME

string dc_crawler.CrawlerTask.APP_NAME = "crawler-task"

Definition at line 129 of file CrawlerTask.py.

◆ COOKIES_FILE_POSTFIX

string dc_crawler.CrawlerTask.COOKIES_FILE_POSTFIX = ".cookies.txt"

Definition at line 142 of file CrawlerTask.py.

◆ DB_SITES

string dc_crawler.CrawlerTask.DB_SITES = "dc_sites"

Definition at line 93 of file CrawlerTask.py.

◆ DB_URLS

string dc_crawler.CrawlerTask.DB_URLS = "dc_urls"

Definition at line 94 of file CrawlerTask.py.

◆ DC_SITES_DB_NAME

string dc_crawler.CrawlerTask.DC_SITES_DB_NAME = "dc_sites"

Definition at line 138 of file CrawlerTask.py.

◆ DC_SITES_PROPERTIES_TABLE_NAME

string dc_crawler.CrawlerTask.DC_SITES_PROPERTIES_TABLE_NAME = "sites_properties"

Definition at line 139 of file CrawlerTask.py.

◆ DC_SITES_TABLE_NAME

string dc_crawler.CrawlerTask.DC_SITES_TABLE_NAME = "sites"

Definition at line 140 of file CrawlerTask.py.

◆ DC_URLS_DB_NAME

string dc_crawler.CrawlerTask.DC_URLS_DB_NAME = "dc_urls"

Definition at line 136 of file CrawlerTask.py.

◆ DC_URLS_TABLE_NAME

string dc_crawler.CrawlerTask.DC_URLS_TABLE_NAME = "urls"

Definition at line 141 of file CrawlerTask.py.

◆ DC_URLS_TABLE_PREFIX

string dc_crawler.CrawlerTask.DC_URLS_TABLE_PREFIX = "urls_"

Definition at line 137 of file CrawlerTask.py.

◆ DEFAULT_HTTP_COOKIE

string dc_crawler.CrawlerTask.DEFAULT_HTTP_COOKIE = ""

Definition at line 132 of file CrawlerTask.py.

◆ DEFAULT_HTTP_HEADER

string dc_crawler.CrawlerTask.DEFAULT_HTTP_HEADER = ""

Definition at line 134 of file CrawlerTask.py.

◆ DEFAULT_MAX_SIZE

int dc_crawler.CrawlerTask.DEFAULT_MAX_SIZE = 1000000

Definition at line 126 of file CrawlerTask.py.

◆ DETECT_MIME_MAIN_CONTENT

string dc_crawler.CrawlerTask.DETECT_MIME_MAIN_CONTENT = "1"

Definition at line 164 of file CrawlerTask.py.

◆ EMPTY_RESPONSE_SIZE

string dc_crawler.CrawlerTask.EMPTY_RESPONSE_SIZE = "0"

Definition at line 127 of file CrawlerTask.py.

◆ ENV_CRAWLER_STORE_PATH

string dc_crawler.CrawlerTask.ENV_CRAWLER_STORE_PATH = "ENV_CRAWLER_STORE_PATH"

Definition at line 158 of file CrawlerTask.py.

◆ EXIT_FAILURE

int dc_crawler.CrawlerTask.EXIT_FAILURE = 1

Definition at line 168 of file CrawlerTask.py.

◆ EXIT_SUCCESS

int dc_crawler.CrawlerTask.EXIT_SUCCESS = 0

Definition at line 167 of file CrawlerTask.py.

◆ HTML_REDIRECT

string dc_crawler.CrawlerTask.HTML_REDIRECT = ""

Definition at line 147 of file CrawlerTask.py.

◆ HTTP_COOKIE

string dc_crawler.CrawlerTask.HTTP_COOKIE = "HTTP_COOKIE"

Definition at line 131 of file CrawlerTask.py.

◆ HTTP_HEADERS

string dc_crawler.CrawlerTask.HTTP_HEADERS = "HTTP_HEADERS"

Definition at line 133 of file CrawlerTask.py.

◆ HTTP_REDIRECT

string dc_crawler.CrawlerTask.HTTP_REDIRECT = "<Response [301]>"

Definition at line 146 of file CrawlerTask.py.

◆ MAX_HTML_REDIRECTS_UNLIMITED

int dc_crawler.CrawlerTask.MAX_HTML_REDIRECTS_UNLIMITED = 0

Definition at line 149 of file CrawlerTask.py.

◆ MAX_HTTP_REDIRECTS_UNLIMITED

int dc_crawler.CrawlerTask.MAX_HTTP_REDIRECTS_UNLIMITED = 0

Definition at line 148 of file CrawlerTask.py.

◆ META_XPATH

string dc_crawler.CrawlerTask.META_XPATH = "//meta[contains(@content, 'url')]/@content"

Definition at line 150 of file CrawlerTask.py.

◆ MSG_DEBUG_NON_PROCESSING

string dc_crawler.CrawlerTask.MSG_DEBUG_NON_PROCESSING = "ProcessorName is NONE. Exclude batch item from further processing."

Definition at line 122 of file CrawlerTask.py.

◆ MSG_ERROR_ADD_URL_TO_BATCH_ITEM

string dc_crawler.CrawlerTask.MSG_ERROR_ADD_URL_TO_BATCH_ITEM = "Can't add url to batch item "

Definition at line 105 of file CrawlerTask.py.

◆ MSG_ERROR_CHECK_SITE

string dc_crawler.CrawlerTask.MSG_ERROR_CHECK_SITE = "Site don't passed check site "

Definition at line 108 of file CrawlerTask.py.

◆ MSG_ERROR_COLLECT_URLS

string dc_crawler.CrawlerTask.MSG_ERROR_COLLECT_URLS = "Can't collect urls "

Definition at line 104 of file CrawlerTask.py.

◆ MSG_ERROR_CRAWL_SITE

string dc_crawler.CrawlerTask.MSG_ERROR_CRAWL_SITE = "Can't crawl site "

Definition at line 107 of file CrawlerTask.py.

◆ MSG_ERROR_EMPTY_CONFIG_FILE_NAME

string dc_crawler.CrawlerTask.MSG_ERROR_EMPTY_CONFIG_FILE_NAME = "Config file name is empty."

Definition at line 113 of file CrawlerTask.py.

◆ MSG_ERROR_EMPTY_RESPONSE_SIZE

string dc_crawler.CrawlerTask.MSG_ERROR_EMPTY_RESPONSE_SIZE = "Empty response"

Definition at line 111 of file CrawlerTask.py.

◆ MSG_ERROR_EXTRACT_BASE_URL

string dc_crawler.CrawlerTask.MSG_ERROR_EXTRACT_BASE_URL = "Extract base url failed. Error: %s"

Definition at line 116 of file CrawlerTask.py.

◆ MSG_ERROR_GET_DIR

string dc_crawler.CrawlerTask.MSG_ERROR_GET_DIR = "Can't get dir "

Definition at line 109 of file CrawlerTask.py.

◆ MSG_ERROR_LOAD_APP_CONFIG

string dc_crawler.CrawlerTask.MSG_ERROR_LOAD_APP_CONFIG = "Error loading application config file. %s"

Definition at line 115 of file CrawlerTask.py.

◆ MSG_ERROR_LOAD_CONFIG

string dc_crawler.CrawlerTask.MSG_ERROR_LOAD_CONFIG = "Error loading config file. Exciting. "

Definition at line 96 of file CrawlerTask.py.

◆ MSG_ERROR_LOAD_LOG_CONFIG_FILE

string dc_crawler.CrawlerTask.MSG_ERROR_LOAD_LOG_CONFIG_FILE = "Can't load logging config file. Exiting. "

Definition at line 98 of file CrawlerTask.py.

◆ MSG_ERROR_LOAD_OPTIONS

string dc_crawler.CrawlerTask.MSG_ERROR_LOAD_OPTIONS = "Error loading options. Exciting. "

Definition at line 97 of file CrawlerTask.py.

◆ MSG_ERROR_LOAD_SITE_DATA

string dc_crawler.CrawlerTask.MSG_ERROR_LOAD_SITE_DATA = "Can't load site data: "

Definition at line 99 of file CrawlerTask.py.

◆ MSG_ERROR_LOAD_SITE_PROPERTIES

string dc_crawler.CrawlerTask.MSG_ERROR_LOAD_SITE_PROPERTIES = "Can't load site properties "

Definition at line 106 of file CrawlerTask.py.

◆ MSG_ERROR_LOAD_URL_DATA

string dc_crawler.CrawlerTask.MSG_ERROR_LOAD_URL_DATA = "Can't load url data: "

Definition at line 101 of file CrawlerTask.py.

◆ MSG_ERROR_NOT_EXIST_ANY_VALID_PROXY

string dc_crawler.CrawlerTask.MSG_ERROR_NOT_EXIST_ANY_VALID_PROXY = "Not exist any valid proxy"

Definition at line 112 of file CrawlerTask.py.

◆ MSG_ERROR_PROCESS_BATCH_ITEM

string dc_crawler.CrawlerTask.MSG_ERROR_PROCESS_BATCH_ITEM = "Can't process batch item "

Definition at line 102 of file CrawlerTask.py.

◆ MSG_ERROR_READ_SITE_FROM_DB

string dc_crawler.CrawlerTask.MSG_ERROR_READ_SITE_FROM_DB = "Can't read site data from db"

Definition at line 110 of file CrawlerTask.py.

◆ MSG_ERROR_UPDATE_SITE_DATA

string dc_crawler.CrawlerTask.MSG_ERROR_UPDATE_SITE_DATA = "Can't update site data: "

Definition at line 100 of file CrawlerTask.py.

◆ MSG_ERROR_WRITE_CRAWLED_DATA

string dc_crawler.CrawlerTask.MSG_ERROR_WRITE_CRAWLED_DATA = "Can't write crawled data "

Definition at line 103 of file CrawlerTask.py.

◆ MSG_ERROR_WRONG_CONFIG_FILE_NAME

string dc_crawler.CrawlerTask.MSG_ERROR_WRONG_CONFIG_FILE_NAME = "Config file name is wrong: %s"

Definition at line 114 of file CrawlerTask.py.

◆ MSG_INFO_PROCESS_BATCH

string dc_crawler.CrawlerTask.MSG_INFO_PROCESS_BATCH = "ProcessBatch "

Definition at line 119 of file CrawlerTask.py.

◆ MSG_INFO_STORE_COOKIES_FILE

string dc_crawler.CrawlerTask.MSG_INFO_STORE_COOKIES_FILE = "Store cookies file on disk."

Definition at line 120 of file CrawlerTask.py.

◆ NON_PROCESSING

string dc_crawler.CrawlerTask.NON_PROCESSING = "NONE"

Definition at line 144 of file CrawlerTask.py.

◆ RECOVER_IF_FAILED

string dc_crawler.CrawlerTask.RECOVER_IF_FAILED = "2"

Definition at line 165 of file CrawlerTask.py.

◆ Results

dc_crawler.CrawlerTask.Results = namedtuple("Results", "exit_code, output, err")

Definition at line 152 of file CrawlerTask.py.

◆ ROBOTS_PATTERN

dc_crawler.CrawlerTask.ROBOTS_PATTERN = re.compile(r'(https?://[^/]+).*', re.I)

Definition at line 154 of file CrawlerTask.py.

◆ SITE_MD5_EMPTY

string dc_crawler.CrawlerTask.SITE_MD5_EMPTY = "d41d8cd98f00b204e9800998ecf8427e"

Definition at line 124 of file CrawlerTask.py.

◆ TEXT_CONTENT_TYPE_PATTERN

dc_crawler.CrawlerTask.TEXT_CONTENT_TYPE_PATTERN = re.compile('text', re.I)

Definition at line 156 of file CrawlerTask.py.

Classes

Variables