# @author Alexey, bgv <developers.hce@gmail.com>, Alexander Vybornyh <alexander.hce.cluster@gmail.com>
# @link http://hierarchical-cluster-engine.com/
# @copyright Copyright © 2013-2015 IOIX Ukraine
# @license http://hierarchical-cluster-engine.com/license/
# @package HCE project node API

# Logger name, taken from the LOGGER_NAME attribute of APP_CONSTS.
# NOTE(review): APP_CONSTS is not defined in the visible part of this file;
# it has to be imported before this line -- TODO confirm.
LOGGER_NAME = APP_CONSTS.LOGGER_NAME
# String identifiers for tags / fields. How they are consumed is not visible
# in this part of the file; only the literal values are defined here.
TAG_DESCRIPTION = "description"
TAG_PUB_DATE = "pubdate"
TAG_DC_DATE = "dc_date"
TAG_CONTENT_UTF8_ENCODED = "content_encoded"
TAG_KEYWORDS = "keywords"
TAG_MEDIA_THUMBNAIL = "media_thumbnail"
TAG_MEDIA_CONTENT = "media_content"
TAG_ENCLOSURE = "enclosure"
TAG_GOOGLE = "google_search"
TAG_GOOGLE_TOTAL = "google_search_total"
TAG_SUMMARY_LANG = "summary_lang"
HTML_LANG = "html_lang"
PARENT_RSS_FEED = "parent_rss_feed"
PARENT_RSS_FEED_URLMD5 = "parent_rss_feed_urlMd5"

SUMMARY_DETAIL = "summary_detail"
# NOTE(review): the name looks like a misspelling of COMMENTS; it is kept
# unchanged because other modules may reference it.
COMMENTNS = "comments"
PUBLISHED = "published"
UPDATED_PARSED = "updated_parsed"
TAG_ORDER_NUMBER = "order_number"
TAG_SOURCE_URL = "source_url"
TAG_FEED_URL = "feed_url"

TAG_TYPE_DATETIME = "datetime"
TAG_PUBDATE_TZ = "pubdate_tz"

# Content hash algorithm identifier: "empty" variant.
CONTENT_HASH_ALGORITHM_EMPTY = 0
# Content hash algorithm identifiers (integer codes).
CONTENT_HASH_ALGORITHM_MD5 = 1
CONTENT_HASH_ALGORITHM_CRC32 = 2
CONTENT_HASH_ALGORITHM_SOUNDEX = 3
CONTENT_HASH_ALGORITHM_SHA1 = 4
CONTENT_HASH_ALGORITHM_SDHASH = 5
CONTENT_HASH_ALGORITHM_BBHASH = 6
CONTENT_HASH_ALGORITHM_MRSH_V2 = 7
CONTENT_HASH_ALGORITHM_MVHASH_B = 8
CONTENT_HASH_ALGORITHM_MD5_WITHOUT_HTML = 9

# Content hash action code.
CONTENT_HASH_ACTION_DELETE = 1

# Tags rules mask values. They are distinct powers of two, so presumably
# they are combined as bit flags -- TODO confirm against the callers.
TAGS_RULES_MASK_DEFAULT_VALUE = 4
TAGS_RULES_MASK_RULE_PRIORITY = 2
TAGS_RULES_MASK_MANDATORY_FIELD = 1
# Process algorithm identifiers (string values).
PROCESS_ALGORITHM_REGULAR = "regular"
PROCESS_ALGORITHM_TRAINING = "training"
PROCESS_ALGORITHM_PREDICTION = "prediction"
PROCESS_ALGORITHM_CONCURRENCY = "concurrency"
PROCESS_ALGORITHM_METRIC = "metric_based"
PROCESS_ALGORITHM_FEED_PARSER = "feed_parser"
PROCESS_ALGORITHM_ALCHEMY = "ALCHEMY"
PROCESS_ALGORITHM_BOILERPIPE = "BOILERPIPE"
PROCESS_ALGORITHM_NEWSPAPER = "NEWSPAPER"
PROCESS_ALGORITHM_GOOSE = "GOOSE"
PROCESS_ALGORITHM_SCRAPY = "SCRAPY"
PROCESS_ALGORITHM_ML = "ML"

# Queue names.
TRAINING_QUEUE = "TRAINING_QUEUE"
TRAINED_QUEUE = "TRAINED_QUEUE"
CONCURRENCY_QUEUE = "CONCURRENCY_QUEUE"

# Database and table names.
DB_SITES = "db_dc_sites"
DB_URLS = "db_dc_urls"
DB_SCRAPERS = "db_dc_scrapers"
DC_CONTENTS_DB_NAME = "db_dc_contents"
SQL_TMP_TABLE = "metrics"

MYSQL_ENGINE = "mysql_engine"

# Log / error message prefixes. Several end with a trailing space, which is
# part of the value and must be preserved.
MSG_ERROR_LOAD_DB_BACKEND = "Error loading DB backend. "
MSG_ERROR_LOAD_CONFIG = "Error loading config file."
MSG_ERROR_LOAD_LOG_CONFIG_FILE = "Error loading logging config file."
MSG_ERROR_LOAD_EXTRACTORS = "Error load extractors "
MSG_ERROR_TEMPLATE_EXTRACTION = "Error template extraction "
MSG_ERROR_DYNAMIC_EXTRACTION = "Error dynamic extraction "
MSG_ERROR_LOAD_OPTIONS = "Error load options"
MSG_INFO_PREPARE_CONTENT = "Prepare content: "
MSG_ERROR_ADJUST_PR = "Error adjust partial references. "
MSG_ERROR_PROCESS = "Processor Storing Contents process batch error: "
MSG_ERROR_CALC_METRICS = "Smth goes wrong. See traceback: "

# Execution time limit. The unit is not stated in this part of the file --
# TODO confirm.
TIME_EXECUTION_LIMIT = 20
# Interpreter path.
PYTHON_BINARY = "/usr/bin/python"

# Scraper script and its command-line config option.
SCRAPER_BINARY = "./scraper.py"
SCRAPER_CFG = "--config=../ini/scraper.ini"

# "STORE" processor: name, script and command-line config option.
PROCESSOR_STORE = "STORE"
STORE_PROCESSOR_BINARY = "./processor_store_content_kvdb.py"
STORE_PROCESSOR_CFG = "--config=../ini/processor-store-content-in-kvdb.ini"

# Processor names.
PROCESSOR_FEED_PARSER = "FEED_PARSER"
PROCESSOR_RSS = "RSS"

# Reprocess / recrawl keys and values.
REPROCESS_KEY = "reprocess"
REPROCESS_VALUE_NO = 0
RECRAWL_KEY = "recrawl"

# Feed parser processor script and its command-line config option.
PROCESSOR_FEED_PARSER_BINARY = "./processor_feed_parser.py"
PROCESSOR_FEED_PARSER_CFG = "--config=../ini/processor_feed_parser.ini"

# Multi-items scraper: processor name, script and config option.
PROCESSOR_SCRAPER_MULTI_ITEMS = "SCRAPER_MULTI_ITEMS"
SCRAPER_MULTI_ITEMS_BINARY = "./scraper_multi_items_task.py"
SCRAPER_MULTI_ITEMS_CFG = "--config=../ini/scraper_multi_items_task.ini"

# Custom scraper: processor name, script and config option.
PROCESSOR_SCRAPER_CUSTOM = "SCRAPER_CUSTOM"
SCRAPER_CUSTOM_BINARY = "./scraper_custom_task.py"
SCRAPER_CUSTOM_CFG = "--config=../ini/scraper_custom_task.ini"

# Extractor display names.
EXTRACTOR_NAME_ML = "ML extractor"
EXTRACTOR_NAME_ALCHEMY = "Alchemy extractor"
EXTRACTOR_NAME_BOILERPIPE = "Boilerpipe extractor"

# Key names (the structure they index is not visible in this part of the file).
MODULES_KEY = "modules"
ALGORITHM_KEY = "algorithm"
ALGORITHM_NAME_KEY = "algorithm_name"
PROPERTIES_KEY = "properties"
TEMPLATE_KEY = "template"

USE_HTML5_KEY = "html5"

# Initial scraper rank value.
SCRAPER_RANK_INIT = 10
# Timezone abbreviations list (a single entry is defined here).
TIMEZONE_LIST = ["JST"]

# strftime/strptime-style date-time format string.
COMMON_DATE_FORMAT = "%Y-%m-%d %H:%M:%S"

# Default threshold, metric value and comparator name.
# NOTE(review): "TRESHOLD" is a misspelling of "THRESHOLD"; the names are
# kept unchanged because other modules may reference them.
DEFAULT_TRESHOLD_VALUE = 0
DEFAULT_METRIC_VALUE = 0
DEFAULT_COMPARATOR = ""

# Words threshold and comparator name.
WORDS_TRESHOLD_VALUE = 100
WORDS_COMPARATOR = "round"

# Sentences threshold and comparator name.
SENTENCES_TRESHOLD_VALUE = 5
SENTENCES_COMPARATOR = "round"

# ARI threshold.
ARI_TRESHOLD_VALUE = 1
# ARI comparator name.
ARI_COMPARATOR = "round"

ARTICLE_CORPUS = "content_encoded"

# Search site identifiers.
GOOGLE_SEARCH_SITE_ID = "google_search"
CABINET_SEARCH_SITE_ID = "cabinet_search"

# Older site identifiers: 32 hexadecimal characters each (presumably MD5
# digests -- TODO confirm).
OLD_GOOGLE_SEARCH_SITE_ID = "d57f144e7b26c9976769ea94f18b9064"
OLD_CABINET_SEARCH_SITE_ID = "1fe592caf03fd50c5f065c30f82b13bb"

# Application class names and their config file paths.
SCRAPER_APP_CLASS_NAME = "Scraper"
SCRAPER_APP_CLASS_CFG = "../ini/scraper.ini"
# NOTE(review): "???" looks like an unfinished placeholder -- confirm the
# intended class name before relying on this constant.
STORE_APP_CLASS_NAME = "???"
STORE_APP_CLASS_CFG = "../ini/processor-store-content-in-kvdb.ini"
PROCESSOR_FEED_PARSER_CLASS_NAME = "ProcessorFeedParser"
PROCESSOR_FEED_PARSER_CLASS_CFG = "../ini/processor_feed_parser.ini"

SCRAPER_MULTI_ITEMS_APP_CLASS_NAME = "ScraperMultiItemsTask"
SCRAPER_MULTI_ITEMS_APP_CLASS_CFG = "../ini/scraper_multi_items_task.ini"

SCRAPER_CUSTOM_JSON_APP_CLASS_NAME = "ScraperCustomJson"
SCRAPER_CUSTOM_JSON_APP_CLASS_CFG = "../ini/scraper_custom_task.ini"

# Property names.
TAG_REDUCE_MASK_PROP_NAME = "SCRAPER_TEXT_REDUCER_MASK"
TAG_REDUCE_PROP_NAME = "SCRAPER_TEXT_REDUCER"
TAG_MARKUP_PROP_NAME = "SCRAPER_TEXT_MARKUP"
TAG_KEEP_ATTRIBUTES_PROP_NAME = "SCRAPER_KEEP_ATTRIBUTES"
TAG_CLOSE_VOID_PROP_NAME = "CLOSE_VOID"

TAGS_TYPES_NAME = "TAGS_TYPES"

PDATE_TIMEZONES_NAME = "PDATE_TIMEZONES"
PDATE_DAY_MONTH_ORDER_NAME = "PDATE_DAY_MONTH_ORDER"

LANG_PROP_NAME = "SCRAPER_LANG_DETECT"

MEDIA_LIMITS_NAME = "MEDIA_LIMITS"

# HTTP redirect link property name, related names and integer value codes.
HTTP_REDIRECT_LINK_NAME = "HTTP_REDIRECT_LINK"
LOCATION_NAME = "Location"
HTTP_REDIRECT_LINK_VALUE_URL = 1
HTTP_REDIRECT_LINK_VALUE_LOCATION = 2
HTTP_REDIRECT_LINK_VALUE_REDIRECT_URL = 3
HTTP_REDIRECT_LINK_VALUE_SOURCE_URL = 4
HTTP_REDIRECT_LINK_LINK_TAG_NAME = "link"
REDIRECT_URL_NAME = "redirect_url"

# Template condition type code.
TEMPLATE_CONDITION_TYPE_URL = 0