HCE Project Python language Distributed Tasks Manager Application, Distributed Crawler Application and client API bindings. 2.0.0-chaika
Hierarchical Cluster Engine Python language binding
CrawlerTask.py File Reference

Go to the source code of this file.

Classes

class  dc_crawler.CrawlerTask.CrawlerTask
 
class  dc_crawler.CrawlerTask.CrawlerTask.Meta
 

Namespaces

 dc_crawler.CrawlerTask
 

Variables

string dc_crawler.CrawlerTask.DB_SITES = "dc_sites"
 
string dc_crawler.CrawlerTask.DB_URLS = "dc_urls"
 
string dc_crawler.CrawlerTask.MSG_ERROR_LOAD_CONFIG = "Error loading config file. Exiting. "
 
string dc_crawler.CrawlerTask.MSG_ERROR_LOAD_OPTIONS = "Error loading options. Exiting. "
 
string dc_crawler.CrawlerTask.MSG_ERROR_LOAD_LOG_CONFIG_FILE = "Can't load logging config file. Exiting. "
 
string dc_crawler.CrawlerTask.MSG_ERROR_LOAD_SITE_DATA = "Can't load site data: "
 
string dc_crawler.CrawlerTask.MSG_ERROR_UPDATE_SITE_DATA = "Can't update site data: "
 
string dc_crawler.CrawlerTask.MSG_ERROR_LOAD_URL_DATA = "Can't load url data: "
 
string dc_crawler.CrawlerTask.MSG_ERROR_PROCESS_BATCH_ITEM = "Can't process batch item "
 
string dc_crawler.CrawlerTask.MSG_ERROR_WRITE_CRAWLED_DATA = "Can't write crawled data "
 
string dc_crawler.CrawlerTask.MSG_ERROR_COLLECT_URLS = "Can't collect urls "
 
string dc_crawler.CrawlerTask.MSG_ERROR_ADD_URL_TO_BATCH_ITEM = "Can't add url to batch item "
 
string dc_crawler.CrawlerTask.MSG_ERROR_LOAD_SITE_PROPERTIES = "Can't load site properties "
 
string dc_crawler.CrawlerTask.MSG_ERROR_CRAWL_SITE = "Can't crawl site "
 
string dc_crawler.CrawlerTask.MSG_ERROR_CHECK_SITE = "Site did not pass the site check "
 
string dc_crawler.CrawlerTask.MSG_ERROR_GET_DIR = "Can't get dir "
 
string dc_crawler.CrawlerTask.MSG_ERROR_READ_SITE_FROM_DB = "Can't read site data from db"
 
string dc_crawler.CrawlerTask.MSG_ERROR_EMPTY_RESPONSE_SIZE = "Empty response"
 
string dc_crawler.CrawlerTask.MSG_ERROR_NOT_EXIST_ANY_VALID_PROXY = "No valid proxy exists"
 
string dc_crawler.CrawlerTask.MSG_ERROR_EMPTY_CONFIG_FILE_NAME = "Config file name is empty."
 
string dc_crawler.CrawlerTask.MSG_ERROR_WRONG_CONFIG_FILE_NAME = "Config file name is wrong: %s"
 
string dc_crawler.CrawlerTask.MSG_ERROR_LOAD_APP_CONFIG = "Error loading application config file. %s"
 
string dc_crawler.CrawlerTask.MSG_ERROR_EXTRACT_BASE_URL = "Extract base url failed. Error: %s"
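
The messages above that carry a %s placeholder are evidently filled with old-style % formatting; a minimal illustration (the file name is hypothetical):

    MSG_ERROR_WRONG_CONFIG_FILE_NAME = "Config file name is wrong: %s"

    # Hypothetical file name, purely for illustration.
    print(MSG_ERROR_WRONG_CONFIG_FILE_NAME % "crawler-task.ini")
    # -> Config file name is wrong: crawler-task.ini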
 
string dc_crawler.CrawlerTask.MSG_INFO_PROCESS_BATCH = "ProcessBatch "
 
string dc_crawler.CrawlerTask.MSG_INFO_STORE_COOKIES_FILE = "Store cookies file on disk."
 
string dc_crawler.CrawlerTask.MSG_DEBUG_NON_PROCESSING = "ProcessorName is NONE. Exclude batch item from further processing."
 
string dc_crawler.CrawlerTask.SITE_MD5_EMPTY = "d41d8cd98f00b204e9800998ecf8427e"
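
SITE_MD5_EMPTY is the MD5 digest of empty input, presumably used as a sentinel for content-less sites or pages. A quick check in Python:

    import hashlib

    # The MD5 of an empty byte string is the well-known value above.
    assert hashlib.md5(b"").hexdigest() == "d41d8cd98f00b204e9800998ecf8427e"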
 
int dc_crawler.CrawlerTask.DEFAULT_MAX_SIZE = 1000000
 
string dc_crawler.CrawlerTask.EMPTY_RESPONSE_SIZE = "0"
 
string dc_crawler.CrawlerTask.APP_NAME = "crawler-task"
 
string dc_crawler.CrawlerTask.HTTP_COOKIE = "HTTP_COOKIE"
 
string dc_crawler.CrawlerTask.DEFAULT_HTTP_COOKIE = ""
 
string dc_crawler.CrawlerTask.HTTP_HEADERS = "HTTP_HEADERS"
 
string dc_crawler.CrawlerTask.DEFAULT_HTTP_HEADER = ""
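
HTTP_COOKIE and HTTP_HEADERS read like keys into the per-site properties (see DC_SITES_PROPERTIES_TABLE_NAME below), with DEFAULT_HTTP_COOKIE and DEFAULT_HTTP_HEADER as empty-string fallbacks. A minimal sketch, assuming the properties arrive as a plain dict (the mapping itself is hypothetical):

    HTTP_COOKIE = "HTTP_COOKIE"
    DEFAULT_HTTP_COOKIE = ""

    # Hypothetical per-site key/value properties loaded from the database.
    properties = {"HTTP_COOKIE": "sid=abc123"}
    cookie = properties.get(HTTP_COOKIE, DEFAULT_HTTP_COOKIE)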
 
string dc_crawler.CrawlerTask.DC_URLS_DB_NAME = "dc_urls"
 
string dc_crawler.CrawlerTask.DC_URLS_TABLE_PREFIX = "urls_"
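
DC_URLS_TABLE_PREFIX suggests per-site URL tables inside the dc_urls database, named by appending a site identifier to the prefix. A sketch under that assumption (the helper and the id format are not confirmed by this page):

    DC_URLS_DB_NAME = "dc_urls"
    DC_URLS_TABLE_PREFIX = "urls_"

    def urls_table_name(site_id):
        # Hypothetical helper: "dc_urls.urls_<site_id>".
        return "%s.%s%s" % (DC_URLS_DB_NAME, DC_URLS_TABLE_PREFIX, site_id)

    print(urls_table_name("d41d8cd98f00b204e9800998ecf8427e"))
    # -> dc_urls.urls_d41d8cd98f00b204e9800998ecf8427e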
 
string dc_crawler.CrawlerTask.DC_SITES_DB_NAME = "dc_sites"
 
string dc_crawler.CrawlerTask.DC_SITES_PROPERTIES_TABLE_NAME = "sites_properties"
 
string dc_crawler.CrawlerTask.DC_SITES_TABLE_NAME = "sites"
 
string dc_crawler.CrawlerTask.DC_URLS_TABLE_NAME = "urls"
 
string dc_crawler.CrawlerTask.COOKIES_FILE_POSTFIX = ".cookies.txt"
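
COOKIES_FILE_POSTFIX, together with MSG_INFO_STORE_COOKIES_FILE above, indicates that cookies are persisted to disk as *.cookies.txt files. A sketch, assuming the file sits under the crawler store path (see ENV_CRAWLER_STORE_PATH below) and is keyed by site id; both are assumptions:

    import os

    COOKIES_FILE_POSTFIX = ".cookies.txt"

    def cookies_file_path(store_path, site_id):
        # Hypothetical layout: <store_path>/<site_id>.cookies.txt
        return os.path.join(store_path, site_id + COOKIES_FILE_POSTFIX)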
 
string dc_crawler.CrawlerTask.NON_PROCESSING = "NONE"
 
string dc_crawler.CrawlerTask.HTTP_REDIRECT = "<Response [301]>"
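
HTTP_REDIRECT equals the str() form of a requests library Response with status code 301, which hints that redirects are detected by stringifying the response object. A sketch of that comparison (assuming the requests library is what produced the value):

    import requests

    HTTP_REDIRECT = "<Response [301]>"

    response = requests.get("http://example.com/old", allow_redirects=False)
    if str(response) == HTTP_REDIRECT:
        # Hypothetical handling: read the redirect target manually.
        location = response.headers.get("Location")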
 
string dc_crawler.CrawlerTask.HTML_REDIRECT = ""
 
int dc_crawler.CrawlerTask.MAX_HTTP_REDIRECTS_UNLIMITED = 0
 
int dc_crawler.CrawlerTask.MAX_HTML_REDIRECTS_UNLIMITED = 0
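
A limit of 0 evidently means "unlimited" for both HTTP and HTML redirects. One plausible reading of the check (the helper is hypothetical):

    MAX_HTTP_REDIRECTS_UNLIMITED = 0

    def redirects_exceeded(count, max_redirects):
        # 0 disables the limit entirely; any other value caps the chain.
        return max_redirects != MAX_HTTP_REDIRECTS_UNLIMITED and count > max_redirects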
 
string dc_crawler.CrawlerTask.META_XPATH = "//meta[contains(@content, 'url')]/@content"
 
 dc_crawler.CrawlerTask.Results = namedtuple("Results", "exit_code, output, err")
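
Results bundles the outcome of a subprocess-style call into named fields; a minimal usage sketch:

    from collections import namedtuple

    Results = namedtuple("Results", "exit_code, output, err")

    # Fields are accessed by name rather than by tuple index.
    res = Results(exit_code=0, output="batch processed", err="")
    if res.exit_code != 0:
        print(res.err)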
 
 dc_crawler.CrawlerTask.ROBOTS_PATTERN = re.compile(r'(https?://[^/]+).*', re.I)
 
 dc_crawler.CrawlerTask.TEXT_CONTENT_TYPE_PATTERN = re.compile('text', re.I)
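
ROBOTS_PATTERN captures the scheme-and-host prefix of a URL, presumably to locate robots.txt, while TEXT_CONTENT_TYPE_PATTERN does a case-insensitive test for textual Content-Type values. A short illustration of both:

    import re

    ROBOTS_PATTERN = re.compile(r'(https?://[^/]+).*', re.I)
    TEXT_CONTENT_TYPE_PATTERN = re.compile('text', re.I)

    m = ROBOTS_PATTERN.match("https://example.com/a/b.html")
    if m:
        robots_url = m.group(1) + "/robots.txt"  # https://example.com/robots.txt

    # Matches "text/html", "Text/Plain; charset=utf-8", and similar.
    is_text = bool(TEXT_CONTENT_TYPE_PATTERN.search("text/html; charset=utf-8"))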
 
string dc_crawler.CrawlerTask.ENV_CRAWLER_STORE_PATH = "ENV_CRAWLER_STORE_PATH"
 
string dc_crawler.CrawlerTask.DETECT_MIME_MAIN_CONTENT = "1"
 
string dc_crawler.CrawlerTask.RECOVER_IF_FAILED = "2"
 
int dc_crawler.CrawlerTask.EXIT_SUCCESS = 0
 
int dc_crawler.CrawlerTask.EXIT_FAILURE = 1
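
EXIT_SUCCESS and EXIT_FAILURE are the conventional process exit codes; a typical entry-point shape (the main() body here is purely illustrative):

    import sys

    EXIT_SUCCESS = 0
    EXIT_FAILURE = 1

    def main():
        try:
            # ... run the crawler task ...
            return EXIT_SUCCESS
        except Exception:
            return EXIT_FAILURE

    if __name__ == "__main__":
        sys.exit(main())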