HCE Project Python language Distributed Tasks Manager Application, Distributed Crawler Application and client API bindings.  2.0.0-chaika
Hierarchical Cluster Engine Python language binding
Constants.py
Go to the documentation of this file.
1 """
2  HCE project, Python bindings, Distributed Tasks Manager application.
3  Constants definitions.
4 
5  @package: dc
6  @file Constants.py
7  @author Oleksii <developers.hce@gmail.com>
8  @author madk <developers.hce@gmail.com>
9  @link: http://hierarchical-cluster-engine.com/
10  @copyright: Copyright &copy; 2013-2014 IOIX Ukraine
11  @license: http://hierarchical-cluster-engine.com/license/
12  @since: 0.1
13  """
14 
# Upper bound on the time a fetcher may run, in seconds.
FETCHER_TIME_LIMIT_MAX = 100
# Connection timeout. NOTE(review): presumably seconds -- confirm against the code that uses it.
CONNECTION_TIMEOUT = 1.0
18 
# Limit on the number of HTTP redirects.
MAX_HTTP_REDIRECTS_LIMIT = 5
# NOTE(review): 0 presumably stands for "no size limit" -- confirm against the fetcher code.
MAX_HTTP_SIZE_UNLIMIT = 0

# Limit on the number of HTML redirects.
# NOTE(review): presumably redirects declared in the page markup -- confirm.
MAX_HTML_REDIRECTS_LIMIT = 1
23 
# Database names.
DB_SITES = "dc_sites"
DB_URLS = "dc_urls"

# Application names of the RTC finalizer and preprocessor.
RTC_FINALIZER_APP_NAME = "rtc-finalizer"
RTC_PREPROCESSOR_APP_NAME = "rtc-preprocessor"
29 
# Field names accepted as the source of the 'pubdate' value when reading a feed.
pubdateFeedNames = [
  "pubdate",
  "published",
  "pubDate",
  "published_parsed",
  "updated_parsed",
]

# Names of the custom "X-" headers.
pubdateRssFeedHeaderName = "X-pubdateRssFeed"
rssFeedUrlHeaderName = "X-feed_url"
baseUrlHeaderName = "X-base_url"
35 
# Named HTTP status codes.
HTTP_CODE_200 = 200
HTTP_CODE_304 = 304
HTTP_CODE_400 = 400
HTTP_CODE_403 = 403

# Status codes handled as redirects.
# NOTE(review): 304 (Not Modified) is not a redirect status in the HTTP spec, and 307/308
# are not listed -- confirm this is intentional before changing the list.
REDIRECT_HTTP_CODES = [301, 302, 303, 304]
# Header fields to remove in connection with a redirect.
# NOTE(review): 'Location' is capitalised while the other names are lower-case -- confirm
# how the consumer compares header names.
REDIRECT_HEADER_FIELDS_FOR_REMOVE = ['referer', 'content-type', 'Location', 'cookie']
43 
# Mapping used by class SimpleCharsetDetector to translate a wrong encoding name (key)
# into the correct encoding name (value).
charsetDetectorMap = {
  'win-1251': 'windows-1251',
  'UTF-8': 'utf8',
  'utf-8': 'utf8',
}
50 
# Dictionary of codecs: the key is the codec name, the value is a string holding the
# codec's aliases separated by ", " (an empty string when the codec has no aliases).
standardEncodings = {
  'ascii': '646, us-ascii',
  'big5': 'big5-tw, csbig5',
  'big5hkscs': 'big5-hkscs, hkscs',
  'cp037': 'IBM037, IBM039',
  'cp424': 'EBCDIC-CP-HE, IBM424',
  'cp437': '437, IBM437',
  'cp500': 'EBCDIC-CP-BE, EBCDIC-CP-CH, IBM500',
  'cp720': '',
  'cp737': '',
  'cp775': 'IBM775',
  'cp850': '850, IBM850',
  'cp852': '852, IBM852',
  'cp855': '855, IBM855',
  'cp856': '',
  'cp857': '857, IBM857',
  'cp858': '858, IBM858',
  'cp860': '860, IBM860',
  'cp861': '861, CP-IS, IBM861',
  'cp862': '862, IBM862',
  'cp863': '863, IBM863',
  'cp864': 'IBM864',
  'cp865': '865, IBM865',
  'cp866': '866, IBM866',
  'cp869': '869, CP-GR, IBM869',
  'cp874': '',
  'cp875': '',
  'cp932': '932, ms932, mskanji, ms-kanji',
  'cp949': '949, ms949, uhc',
  'cp950': '950, ms950',
  'cp1006': '',
  'cp1026': 'ibm1026',
  'cp1140': 'ibm1140',
  'cp1250': 'windows-1250',
  'cp1251': 'windows-1251',
  'cp1252': 'windows-1252',
  'cp1253': 'windows-1253',
  'cp1254': 'windows-1254',
  'cp1255': 'windows-1255',
  'cp1256': 'windows-1256',
  'cp1257': 'windows-1257',
  'cp1258': 'windows-1258',
  'euc_jp': 'eucjp, ujis, u-jis',
  'euc_jis_2004': 'jisx0213, eucjis2004',
  'euc_jisx0213': 'eucjisx0213',
  'euc_kr': 'euckr, korean, ksc5601, ks_c-5601, ks_c-5601-1987, ksx1001, ks_x-1001',
  # The aliases 'euc-cn' and 'iso-ir-58' used to contain a stray space after the hyphen
  # ('euc- cn', 'iso- ir-58'), so they could never equal the real alias names.
  'gb2312': 'chinese, csiso58gb231280, euc-cn, euccn, eucgb2312-cn, gb2312-1980, gb2312-80, iso-ir-58',
  'gbk': '936, cp936, ms936',
  'gb18030': 'gb18030-2000',
  'hz': 'hzgb, hz-gb, hz-gb-2312',
  'iso2022_jp': 'csiso2022jp, iso2022jp, iso-2022-jp',
  'iso2022_jp_1': 'iso2022jp-1, iso-2022-jp-1',
  'iso2022_jp_2': 'iso2022jp-2, iso-2022-jp-2',
  'iso2022_jp_2004': 'iso2022jp-2004, iso-2022-jp-2004',
  'iso2022_jp_3': 'iso2022jp-3, iso-2022-jp-3',
  'iso2022_jp_ext': 'iso2022jp-ext, iso-2022-jp-ext',
  'iso2022_kr': 'csiso2022kr, iso2022kr, iso-2022-kr',
  'latin_1': 'iso-8859-1, iso8859-1, 8859, cp819, latin, latin1, L1',
  'iso8859_2': 'iso-8859-2, latin2, L2',
  'iso8859_3': 'iso-8859-3, latin3, L3',
  'iso8859_4': 'iso-8859-4, latin4, L4',
  'iso8859_5': 'iso-8859-5, cyrillic',
  'iso8859_6': 'iso-8859-6, arabic',
  'iso8859_7': 'iso-8859-7, greek, greek8',
  'iso8859_8': 'iso-8859-8, hebrew',
  'iso8859_9': 'iso-8859-9, latin5, L5',
  'iso8859_10': 'iso-8859-10, latin6, L6',
  'iso8859_11': 'iso-8859-11, thai',
  'iso8859_13': 'iso-8859-13, latin7, L7',
  'iso8859_14': 'iso-8859-14, latin8, L8',
  'iso8859_15': 'iso-8859-15, latin9, L9',
  'iso8859_16': 'iso-8859-16, latin10, L10',
  'johab': 'cp1361, ms1361',
  'koi8_r': '',
  'koi8_u': '',
  'mac_cyrillic': 'maccyrillic',
  'mac_greek': 'macgreek',
  'mac_iceland': 'maciceland',
  'mac_latin2': 'maclatin2, maccentraleurope',
  'mac_roman': 'macroman',
  'mac_turkish': 'macturkish',
  'ptcp154': 'csptcp154, pt154, cp154, cyrillic-asian',
  'shift_jis': 'csshiftjis, shiftjis, sjis, s_jis',
  'shift_jis_2004': 'shiftjis2004, sjis_2004, sjis2004',
  'shift_jisx0213': 'shiftjisx0213, sjisx0213, s_jisx0213',
  'utf_32': 'U32, utf32',
  'utf_32_be': 'UTF-32BE',
  'utf_32_le': 'UTF-32LE',
  'utf_16': 'U16, utf16',
  'utf_16_be': 'UTF-16BE',
  'utf_16_le': 'UTF-16LE',
  'utf_7': 'U7, unicode-1-1-utf-7',
  'utf_8': 'U8, UTF, utf8',
  'utf_8_sig': '',
}