HCE Project: Python language Distributed Tasks Manager Application, Distributed Crawler Application, and client API bindings (version 2.0.0-chaika)
Hierarchical Cluster Engine Python language binding
Consts.py
'''
Created on Feb 26, 2014

@author: igor, bgv
'''
# Event handlers
NEW_TASK_HANDLER = 1

# Global HCE package version string
VERSION_STRING = "2.0.0-chaika"
# Global HCE project logger name
LOGGER_NAME = "hce"
# Global HCE project profiler logger name
LOGGER_NAME_PROFILER = "profiler"
# Global HCE project traceback logger name
LOGGER_NAME_TRACEBACK = "traceback"
# Application stats field name for the start date mark
START_DATE_NAME = "START_DATE"
# Exit code constants
EXIT_SUCCESS = 0
EXIT_FAILURE = 1

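# Illustrative sketch (the helper name is hypothetical, not part of the HCE
# API): the logger name constants above are assumed to be used with Python's
# standard logging module to obtain the shared HCE loggers.
def _exampleGetLoggers():
    import logging
    log = logging.getLogger(LOGGER_NAME)
    profilerLog = logging.getLogger(LOGGER_NAME_PROFILER)
    tracebackLog = logging.getLogger(LOGGER_NAME_TRACEBACK)
    return log, profilerLog, tracebackLog
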
# 1 - Crawler error range
ERROR_OK = 0
ERROR_BAD_URL = 1
ERROR_REQUEST_TIMEOUT = 1 << 1
ERROR_HTTP_ERROR = 1 << 2
ERROR_EMPTY_RESPONSE = 1 << 3
ERROR_WRONG_MIME = 1 << 4
ERROR_CONNECTION_ERROR = 1 << 5
ERROR_PAGE_CONVERT_ERROR = 1 << 6
ERROR_MACRO = 1 << 7
ERROR_RESPONSE_SIZE_ERROR = 1 << 8
ERROR_AUTH_ERROR = 1 << 9
ERROR_WRITE_FILE_ERROR = 1 << 10
ERROR_ROBOTS_NOT_ALLOW = 1 << 11
ERROR_PARSE_ERROR = 1 << 12
ERROR_BAD_ENCODING = 1 << 13
ERROR_SITE_MAX_ERRORS = 1 << 14
ERROR_SYNCHRONIZE_URL_WITH_DB = 1 << 15
ERROR_CRAWLER_FILTERS_BREAK = 1 << 16
ERROR_MAX_ALLOW_HTTP_REDIRECTS = 1 << 17
ERROR_MAX_ALLOW_HTML_REDIRECTS = 1 << 18
ERROR_GENERAL_CRAWLER = 1 << 19
ERROR_DTD_INVALID = 1 << 20
ERROR_MACRO_DESERIALIZATION = 1 << 21
ERROR_FETCH_AMBIGUOUS_REQUEST = 1 << 22
ERROR_FETCH_CONNECTION_ERROR = 1 << 23
ERROR_FETCH_HTTP_ERROR = 1 << 24
ERROR_FETCH_INVALID_URL = 1 << 25
ERROR_FETCH_TOO_MANY_REDIRECTS = 1 << 26
ERROR_FETCH_CONNECTION_TIMEOUT = 1 << 27
ERROR_FETCH_READ_TIMEOUT = 1 << 28
ERROR_FETCH_TIMEOUT = 1 << 29
ERROR_FETCHER_INTERNAL = 1 << 30

# Processor/Scraper error range
ERROR_MASK_SITE_MAX_RESOURCES_NUMBER = 1 << 31
ERROR_DATABASE_ERROR = 1 << 32
ERROR_MASK_SITE_MAX_RESOURCES_SIZE = 1 << 33
ERROR_MASK_SITE_UNSUPPORTED_CONTENT_TYPE = 1 << 34
ERROR_MASK_URL_ENCODING_ERROR = 1 << 35
ERROR_MASK_SCRAPER_ERROR = 1 << 36
ERROR_MASK_MISSED_RAW_CONTENT_ON_DISK = 1 << 37
ERROR_RE_ERROR = 1 << 38
ERROR_MANDATORY_TEMPLATE = 1 << 39
ERROR_PROCESSOR_FILTERS_BREAK = 1 << 40
ERROR_MASK_SITE_STATE = 1 << 41
ERROR_MAX_ITEMS = 1 << 42
ERROR_MAX_URLS_FROM_PAGE = 1 << 43
ERROR_TEMPLATE_SOURCE = 1 << 44

# 2 - Crawler error range
ERROR_RSS_EMPTY = 1 << 45
ERROR_URLS_SCHEMA_EXTERNAL = 1 << 46
ERROR_NOT_EXIST_ANY_VALID_PROXY = 1 << 47
ERROR_FETCH_FORBIDDEN = 1 << 48
ERROR_NO_TIME_WINDOW = 1 << 49
ERROR_CRAWLER_FATAL_INITIALIZATION_PROJECT_ERROR = 1 << 50
ERROR_PROCESSOR_BATCH_ITEM_PROCESS = 1 << 51
ERROR_MAX_EXECUTION_TIME = 1 << 52

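# Illustrative sketch (the helper name is hypothetical, not an HCE API): the
# ERROR_* constants above are distinct bit flags, so several errors can be
# accumulated in one integer mask and tested individually with bitwise AND.
def _exampleErrorMaskUsage():
    errorMask = ERROR_OK
    errorMask |= ERROR_REQUEST_TIMEOUT
    errorMask |= ERROR_WRONG_MIME
    hasTimeout = bool(errorMask & ERROR_REQUEST_TIMEOUT)  # True
    hasBadUrl = bool(errorMask & ERROR_BAD_URL)           # False
    return hasTimeout, hasBadUrl
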
CONFIG_APPLICATION_SECTION_NAME = "Application"
CONFIG_PROFILER_SECTION_NAME = "Profiler"

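# Illustrative sketch (the helper and the INI layout are assumptions): the
# section names above are expected to match sections of an INI-style
# configuration file, readable with the standard ConfigParser/configparser.
def _exampleReadApplicationSection(configFile):
    try:
        import configparser                   # Python 3
    except ImportError:
        import ConfigParser as configparser   # Python 2
    parser = configparser.ConfigParser()
    parser.read(configFile)
    if parser.has_section(CONFIG_APPLICATION_SECTION_NAME):
        return dict(parser.items(CONFIG_APPLICATION_SECTION_NAME))
    return {}
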
LOGGER_DELIMITER_LINE = "============================================\n"

# Application usage models for the cement-based application class
APP_USAGE_MODEL_PROCESS = 0
APP_USAGE_MODEL_MODULE = 1

# Application names used in modules
URLS_TO_BATCH_TASK_APP_NAME = "UrlsToBatchTask"
URLS_FETCH_JSON_TO_DBTASK_APP_NAME = "URLFetchToJsonDBTaskConvertor"
SOCIAL_TASK_APP_NAME = "SocialTask"
CONTENT_UPDATER_APP_NAME = "ContentUpdater"
POST_PROCESSOR_APP_NAME = "PostprocessorTask"
SOCIAL_PROFILES_VERIFIER_APP_NAME = "SocialProfilesVerifier"
SOCIAL_DATA_GET_API_APP_NAME = "SocialDataGetApi"
SOCIAL_DATA_UPDATER_APP_NAME = "SocialDataUpdater"

# Site properties for work with the source mask
PDATE_SOURCES_MASK_PROP_NAME = "PDATE_SOURCES_MASK"
PDATE_SOURCES_MASK_OVERWRITE_PROP_NAME = "PDATE_SOURCES_MASK_OVERWRITE"
PDATE_SOURCES_EXPRESSION_PROP_NAME = "PDATE_SOURCES_EXPRESSION"

# Site properties for work with SQL expression fields update
SQL_EXPRESSION_FIELDS_UPDATE_CRAWLER = 'SQL_EXPRESSION_FIELDS_UPDATE_CRAWLER'
SQL_EXPRESSION_FIELDS_UPDATE_PROCESSOR = 'SQL_EXPRESSION_FIELDS_UPDATE_PROCESSOR'
SQL_EXPRESSION_FIELDS_PDATE_TIME = 'PDATE_TIME'
# Site property for content replacement
REPLACEMENT_CONTENT_DATA = 'REPLACE'
# Site property for URL normalization
URL_NORMALIZE = 'URL_NORMALIZE'

# Bit constant values for the source mask
# Used by the crawler:
PDATE_SOURCES_MASK_URL_NAME = 1
PDATE_SOURCES_MASK_RSS_FEED = 1 << 1
PDATE_SOURCES_MASK_HTTP_DATE = 1 << 2
PDATE_SOURCES_MASK_HTTP_LAST_MODIFIED = 1 << 3
# Used by the scraper:
PDATE_SOURCES_MASK_DC_DATE = 1 << 4
PDATE_SOURCES_MASK_PUBDATE = 1 << 5
PDATE_SOURCES_MASK_NOW = 1 << 6
PDATE_SOURCES_MASK_SQL_EXPRESSION = 1 << 7
# Default values for the source mask
PDATE_SOURCES_MASK_BIT_DEFAULT = 255
PDATE_SOURCES_MASK_OVERWRITE_DEFAULT = 192
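
# Illustrative sketch (the helper name is hypothetical): the defaults above
# are combinations of the PDATE_SOURCES_MASK_* bits; 255 sets all eight
# source bits, and 192 numerically equals PDATE_SOURCES_MASK_NOW (64)
# combined with PDATE_SOURCES_MASK_SQL_EXPRESSION (128).
def _examplePDateSourcesMaskDefaults():
    allSources = (PDATE_SOURCES_MASK_URL_NAME | PDATE_SOURCES_MASK_RSS_FEED |
                  PDATE_SOURCES_MASK_HTTP_DATE | PDATE_SOURCES_MASK_HTTP_LAST_MODIFIED |
                  PDATE_SOURCES_MASK_DC_DATE | PDATE_SOURCES_MASK_PUBDATE |
                  PDATE_SOURCES_MASK_NOW | PDATE_SOURCES_MASK_SQL_EXPRESSION)
    assert allSources == PDATE_SOURCES_MASK_BIT_DEFAULT                # 255
    overwriteDefault = PDATE_SOURCES_MASK_NOW | PDATE_SOURCES_MASK_SQL_EXPRESSION
    assert overwriteDefault == PDATE_SOURCES_MASK_OVERWRITE_DEFAULT    # 192
    return allSources, overwriteDefault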