HCE Project Python language Distributed Tasks Manager Application, Distributed Crawler Application and client API bindings.  2.0.0-chaika
Hierarchical Cluster Engine Python language binding
Constants.py
Go to the documentation of this file.
1 """@package docstring
2  @file Constants.py
3  @author Alexey, bgv <developers.hce@gmail.com>, Alexander Vybornyh <alexander.hce.cluster@gmail.com>
4  @link http://hierarchical-cluster-engine.com/
5  @copyright Copyright &copy; 2013-2015 IOIX Ukraine
6  @license http://hierarchical-cluster-engine.com/license/
7  @package HCE project node API
8  @since 0.1
9 """
10 import app.Consts as APP_CONSTS
# Logger name, taken unchanged from app.Consts (imported above as APP_CONSTS).
LOGGER_NAME = APP_CONSTS.LOGGER_NAME
12 
# General purpose constants.

# Names carrying the TAG_ prefix.
TAG_TITLE = "title"
TAG_LINK = "link"
TAG_DESCRIPTION = "description"
TAG_AUTHOR = "author"
TAG_GUID = "guid"
TAG_KEYWORDS = "keywords"
TAG_ENCLOSURE = "enclosure"
TAG_CONTENT_UTF8_ENCODED = "content_encoded"
TAG_SUMMARY_LANG = "summary_lang"
TAG_ORDER_NUMBER = "order_number"

# Media related names.
TAG_MEDIA = "media"
TAG_MEDIA_THUMBNAIL = "media_thumbnail"
TAG_MEDIA_CONTENT = "media_content"

# Date related names.
TAG_PUB_DATE = "pubdate"
TAG_PUBDATE_TZ = "pubdate_tz"
TAG_DC_DATE = "dc_date"
TAG_TYPE_DATETIME = "datetime"

# URL related names.
TAG_SOURCE_URL = "source_url"
TAG_FEED_URL = "feed_url"
# TAG_LINKS = "links"  (kept disabled, as in the original listing)

# Search related names.
TAG_GOOGLE = "google_search"
TAG_GOOGLE_TOTAL = "google_search_total"

# Names declared without the TAG_ prefix.
HTML_LANG = "html_lang"
PARENT_RSS_FEED = "parent_rss_feed"
PARENT_RSS_FEED_URLMD5 = "parent_rss_feed_urlMd5"
SUMMARY = "summary"
SUMMARY_DETAIL = "summary_detail"
COMMENTNS = "comments"
TAGS = "tags"
CONTENT = "content"
PUBLISHED = "published"
UPDATED = "updated"
UPDATED_PARSED = "updated_parsed"
# Content hash algorithm identifiers (content hash is used for duplicate detection).
CONTENT_HASH_ALGORITHM_EMPTY = 0
CONTENT_HASH_ALGORITHM_MD5 = 1
CONTENT_HASH_ALGORITHM_CRC32 = 2
CONTENT_HASH_ALGORITHM_SOUNDEX = 3
CONTENT_HASH_ALGORITHM_SHA1 = 4
CONTENT_HASH_ALGORITHM_SDHASH = 5
CONTENT_HASH_ALGORITHM_BBHASH = 6
CONTENT_HASH_ALGORITHM_MRSH_V2 = 7
CONTENT_HASH_ALGORITHM_MVHASH_B = 8
CONTENT_HASH_ALGORITHM_MD5_WITHOUT_HTML = 9

# Content hash action code.
CONTENT_HASH_ACTION_DELETE = 1

# Parent URL MD5 value (empty string).
PARENT_URL_MD5 = ""

# Tags rules mask values; each one is a distinct power of two.
TAGS_RULES_MASK_MANDATORY_FIELD = 1 << 0
TAGS_RULES_MASK_RULE_PRIORITY = 1 << 1
TAGS_RULES_MASK_DEFAULT_VALUE = 1 << 2
64 
# Processing modes: lower-case mode names.
PROCESS_ALGORITHM_REGULAR = "regular"
PROCESS_ALGORITHM_TRAINING = "training"
PROCESS_ALGORITHM_PREDICTION = "prediction"
PROCESS_ALGORITHM_CONCURRENCY = "concurrency"
PROCESS_ALGORITHM_METRIC = "metric_based"
PROCESS_ALGORITHM_FEED_PARSER = "feed_parser"

# Processing modes: upper-case algorithm names.
PROCESS_ALGORITHM_ML = "ML"
PROCESS_ALGORITHM_ALCHEMY = "ALCHEMY"
PROCESS_ALGORITHM_BOILERPIPE = "BOILERPIPE"
PROCESS_ALGORITHM_NEWSPAPER = "NEWSPAPER"
PROCESS_ALGORITHM_GOOSE = "GOOSE"
PROCESS_ALGORITHM_SCRAPY = "SCRAPY"

# Queue names.
TRAINING_QUEUE = "TRAINING_QUEUE"
TRAINED_QUEUE = "TRAINED_QUEUE"
CONCURRENCY_QUEUE = "CONCURRENCY_QUEUE"
82 
# Database related names: section name first, then connection option names.
DB_SECTION = "mysql"
DB_HOST = "db_host"
DB_PORT = "db_port"
DB_USER = "db_user"
DB_PWD = "db_pwd"

# Names referring to the individual databases.
DB_SITES = "db_dc_sites"
DB_URLS = "db_dc_urls"
DB_SCRAPERS = "db_dc_scrapers"
DC_CONTENTS_DB_NAME = "db_dc_contents"

# Temporary SQL table name and MySQL engine option name.
SQL_TMP_TABLE = "metrics"
MYSQL_ENGINE = "mysql_engine"
95 
# Log message texts.
# Several messages end with a trailing space or ": "; that trailing
# whitespace is part of the value and must be kept.
MSG_ERROR_OK = ""

# Loading errors.
MSG_ERROR_LOAD_CONFIG = "Error loading config file."
MSG_ERROR_LOAD_LOG_CONFIG_FILE = "Error loading logging config file."
MSG_ERROR_LOAD_DB_BACKEND = "Error loading DB backend. "
MSG_ERROR_LOAD_EXTRACTORS = "Error load extractors "
MSG_ERROR_LOAD_OPTIONS = "Error load options"

# Extraction and processing errors.
MSG_ERROR_TEMPLATE_EXTRACTION = "Error template extraction "
MSG_ERROR_DYNAMIC_EXTRACTION = "Error dynamic extraction "
MSG_ERROR_ADJUST_PR = "Error adjust partial references. "
MSG_ERROR_PROCESS = "Processor Storing Contents process batch error: "
MSG_ERROR_CALC_METRICS = "Smth goes wrong. See traceback: "

# Informational messages.
MSG_INFO_PREPARE_CONTENT = "Prepare content: "
109 
110 
# Status code.
ERROR_OK = 0

# Exit status codes.
EXIT_SUCCESS = 0
EXIT_FAILURE = 1

# Time limits, in seconds.
SQLITE_TIMEOUT = 30  # sqlite operation timeout
TIME_EXECUTION_LIMIT = 20  # scraping extract tags operation time limit

# Python interpreter path.
PYTHON_BINARY = "/usr/bin/python"
125 
# Processor names with the script path and config argument declared for each.

# Default (empty) processor name.
PROCESSOR_EMPTY = ""
SCRAPER_BINARY = "./scraper.py"
SCRAPER_CFG = "--config=../ini/scraper.ini"

# STORE processor name.
PROCESSOR_STORE = "STORE"
STORE_PROCESSOR_BINARY = "./processor_store_content_kvdb.py"
STORE_PROCESSOR_CFG = "--config=../ini/processor-store-content-in-kvdb.ini"

# FEED_PARSER processor names.
PROCESSOR_FEED_PARSER = "FEED_PARSER"
PROCESSOR_RSS = "RSS"
PROCESSOR_FEED_PARSER_BINARY = "./processor_feed_parser.py"
PROCESSOR_FEED_PARSER_CFG = "--config=../ini/processor_feed_parser.ini"

# SCRAPER MULTI ITEMS processor name.
PROCESSOR_SCRAPER_MULTI_ITEMS = "SCRAPER_MULTI_ITEMS"
SCRAPER_MULTI_ITEMS_BINARY = "./scraper_multi_items_task.py"
SCRAPER_MULTI_ITEMS_CFG = "--config=../ini/scraper_multi_items_task.ini"

# SCRAPER CUSTOM processor name.
PROCESSOR_SCRAPER_CUSTOM = "SCRAPER_CUSTOM"
SCRAPER_CUSTOM_BINARY = "./scraper_custom_task.py"
SCRAPER_CUSTOM_CFG = "--config=../ini/scraper_custom_task.ini"

# Real time crawling: key names and their "no" values.
REPROCESS_KEY = "reprocess"
REPROCESS_VALUE_NO = 0
RECRAWL_KEY = "recrawl"
RECRAWL_VALUE_NO = 0
158 
# Extractor names.
EXTRACTOR_NAME_ML = "ML extractor"
EXTRACTOR_NAME_ALCHEMY = "Alchemy extractor"
EXTRACTOR_NAME_BOILERPIPE = "Boilerpipe extractor"

# Key names.
MODULES_KEY = "modules"
ALGORITHM_KEY = "algorithm"
ALGORITHM_NAME_KEY = "algorithm_name"
PROPERTIES_KEY = "properties"
TEMPLATE_KEY = "template"
RANK_KEY = "rank"
USE_HTML5_KEY = "html5"

# Initial scraper rank value.
SCRAPER_RANK_INIT = 10

# Values for the html5 flag.
USE_HTML5_NO = 0
USE_HTML5_YES = 1

# Time zone abbreviations list and the common date-time format string.
TIMEZONE_LIST = ["JST"]
COMMON_DATE_FORMAT = "%Y-%m-%d %H:%M:%S"
178 
# # METRIC SECTION
# NOTE: the "TRESHOLD" spelling is part of the public constant names and is
# kept as is so that existing references keep working.

# Default (base empty) metric.
DEFAULT_TRESHOLD_VALUE = 0
DEFAULT_METRIC_VALUE = 0
DEFAULT_COMPARATOR = ""

# Words metric: words count in corpus of article, and its comparator type.
WORDS_TRESHOLD_VALUE = 100
WORDS_COMPARATOR = "round"

# Sentences metric: threshold value and comparator type.
SENTENCES_TRESHOLD_VALUE = 5
SENTENCES_COMPARATOR = "round"

# Automated readability index (ARI) metric: threshold value and comparator type.
ARI_TRESHOLD_VALUE = 1
ARI_COMPARATOR = "round"

# Article corpus name.
ARTICLE_CORPUS = "content_encoded"

# # METRIC SECTION END
206 
# Search site ids.
GOOGLE_SEARCH_SITE_ID = "google_search"
CABINET_SEARCH_SITE_ID = "cabinet_search"

# Old site ids: obsolete, will be removed in next release.
OLD_GOOGLE_SEARCH_SITE_ID = "d57f144e7b26c9976769ea94f18b9064"
OLD_CABINET_SEARCH_SITE_ID = "1fe592caf03fd50c5f065c30f82b13bb"
212 
213 
# For the module import algorithms usage mode: class name and config file
# path, declared pairwise.

# Scraper.
SCRAPER_APP_CLASS_NAME = "Scraper"
SCRAPER_APP_CLASS_CFG = "../ini/scraper.ini"

# Store processor.
# NOTE(review): the class name is a "???" placeholder in the original - confirm the real name.
STORE_APP_CLASS_NAME = "???"
STORE_APP_CLASS_CFG = "../ini/processor-store-content-in-kvdb.ini"

# Feed parser processor.
PROCESSOR_FEED_PARSER_CLASS_NAME = "ProcessorFeedParser"
PROCESSOR_FEED_PARSER_CLASS_CFG = "../ini/processor_feed_parser.ini"

# Scraper multi items.
SCRAPER_MULTI_ITEMS_APP_CLASS_NAME = "ScraperMultiItemsTask"
SCRAPER_MULTI_ITEMS_APP_CLASS_CFG = "../ini/scraper_multi_items_task.ini"

# Scraper custom JSON.
SCRAPER_CUSTOM_JSON_APP_CLASS_NAME = "ScraperCustomJson"
SCRAPER_CUSTOM_JSON_APP_CLASS_CFG = "../ini/scraper_custom_task.ini"
227 
# Property names with the SCRAPER_ prefix in their value.
TAG_REDUCE_PROP_NAME = "SCRAPER_TEXT_REDUCER"
TAG_REDUCE_MASK_PROP_NAME = "SCRAPER_TEXT_REDUCER_MASK"
TAG_MARKUP_PROP_NAME = "SCRAPER_TEXT_MARKUP"
TAG_KEEP_ATTRIBUTES_PROP_NAME = "SCRAPER_KEEP_ATTRIBUTES"
LANG_PROP_NAME = "SCRAPER_LANG_DETECT"

# Remaining property names.
TAG_CLOSE_VOID_PROP_NAME = "CLOSE_VOID"
TAGS_TYPES_NAME = "TAGS_TYPES"
PDATE_TIMEZONES_NAME = "PDATE_TIMEZONES"
PDATE_DAY_MONTH_ORDER_NAME = "PDATE_DAY_MONTH_ORDER"
MEDIA_LIMITS_NAME = "MEDIA_LIMITS"
242 
# Constants for property 'HTTP_REDIRECT_LINK'.
HTTP_REDIRECT_LINK_NAME = "HTTP_REDIRECT_LINK"

# Values declared for the property.
HTTP_REDIRECT_LINK_VALUE_URL = 1
HTTP_REDIRECT_LINK_VALUE_LOCATION = 2
HTTP_REDIRECT_LINK_VALUE_REDIRECT_URL = 3
HTTP_REDIRECT_LINK_VALUE_SOURCE_URL = 4

# Related names.
LOCATION_NAME = "Location"
HTTP_REDIRECT_LINK_LINK_TAG_NAME = "link"
REDIRECT_URL_NAME = "redirect_url"

# ML section.
TEMPLATE_CONDITION_TYPE_URL = 0
255 
# HTML5 semantic tags
class HTML5_SEMANTIC_TAGS(object):
    """Holder of HTML5 semantic tag names, exposed as class attributes."""

    ARTICLE = "article"
    SECTION = "section"
    HEADER = "header"
    FOOTER = "footer"

    def __init__(self):
        """Create an instance; no per-instance state is set."""