HCE Project Python language Distributed Tasks Manager Application, Distributed Crawler Application and client API bindings.  2.0.0-chaika
Hierarchical Cluster Engine Python language binding
Constants.py
1 """@package docstring
2  @file Constants.py
3  @author Alexey <developers.hce@gmail.com>
4  @link http://hierarchical-cluster-engine.com/
5  @copyright Copyright &copy; 2013 IOIX Ukraine
6  @license http://hierarchical-cluster-engine.com/license/
7  @package HCE project node API
8  @since 0.1
9  """
10 
# exit status codes
EXIT_SUCCESS = 0
EXIT_FAILURE = 1

SITE_ALL = 0

# parameter string constants
DEFAULT_CFG_FILE = "../ini/crawling-optimizer.ini"
APP_NAME = "crawling-optimizer"

# Logging string constants
# INFO level
MSG_INFO_LOAD_DEFAULT_CONFIG_FILE = "Loading default config file: "
MSG_INFO_LOAD_CONFIG_FILE = "Loading config file: "
MSG_INFO_LOAD_DEFAULT_SITE_ID = "Load default site id: "
MSG_INFO_LOAD_SITE_ID = "Load site id: "

# ERROR level
MSG_ERROR_LOAD_CONFIG = "Can't load config file"
MSG_ERROR_LOAD_LOG_CONFIG_FILE = "Can't load logging config file"
MSG_ERROR_PROCESS_GENERAL = "Can't process query"
MSG_ERROR_COLLECT_SITE_DATA = "Can't collect site's data"
MSG_ERROR_STORE_SITE_DATA = "Can't store site's data"

# SQL query templates
# tables
DB_URLS = "dc_urls"
DB_CO = "dc_co"

# Count new URLs for the last recrawl period
SQL_QUERY_NEW_URLS = """SELECT count(*), max(`TcDate`), min(`LastModified`), max(`LastModified`) FROM dc_urls.`urls_%s`
  WHERE
  `CDate`
  BETWEEN
  (SELECT DATE_SUB(`RecrawlDate`, INTERVAL `RecrawlPeriod` minute) FROM dc_sites.`sites` WHERE `Id`='%s')
  AND
  (SELECT `RecrawlDate` FROM dc_sites.`sites` WHERE `Id`='%s')
  AND
  `ParentMd5`<>''
  AND
  `Crawled`<>0
  AND
  `Processed`<>0
  AND
  `TagsCount`<>0
  AND
  `Status`=7"""
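# Usage note (added for illustration, not part of the original module): all
# three %s placeholders above are expected to take the same numeric site id --
# first as the per-site dc_urls.`urls_<id>` table suffix and then in both
# dc_sites sub-selects, e.g. SQL_QUERY_NEW_URLS % (siteId, siteId, siteId).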


# Start time of recrawl period
SQL_QUERY_RECRAWL_PERIOD_START = """SELECT
  DATE_SUB(`RecrawlDate`, INTERVAL `RecrawlPeriod` minute)
  FROM dc_sites.`sites`
  WHERE `Id`='%s'"""

# Get end of crawling time
SQL_QUERY_RECRAWL_END = """SELECT max(`TcDate`) FROM dc_urls.`urls_%s`
  WHERE
  `CDate`
  BETWEEN
  (SELECT DATE_SUB(`RecrawlDate`, INTERVAL `RecrawlPeriod` minute) FROM dc_sites.`sites` WHERE `Id`='%s')
  AND
  (SELECT `RecrawlDate` FROM dc_sites.`sites` WHERE `Id`='%s')"""

# End time of recrawl period
SQL_QUERY_RECRAWL_PERIOD_END = """SELECT `RecrawlDate` FROM dc_sites.`sites` WHERE `Id`='%s'"""

# Create the new site's data table (if it does not already exist)
SQL_QUERY_NEW_SITE_TABLE = """CREATE TABLE IF NOT EXISTS `%s` (
  `host` varchar(126) DEFAULT NULL,
  `Contents` bigint(20) unsigned NOT NULL DEFAULT '0',
  `RecrawlStart` datetime DEFAULT NULL COMMENT 'Start date of re-crawl',
  `RecrawlEnd` datetime DEFAULT NULL COMMENT 'End date of re-crawl',
  `minPDate` datetime DEFAULT NULL COMMENT 'When the resource first appeared',
  `maxPDate` datetime DEFAULT NULL,
  `LastAdded` datetime DEFAULT NULL COMMENT 'When content was inserted into the system',
  `CDate` datetime NOT NULL COMMENT 'Date the row was inserted',
  UNIQUE KEY `RecrawlEnd` (`RecrawlEnd`)
  ) ENGINE=InnoDB DEFAULT CHARSET=utf8;
"""

# Insert the site's data into the table (update on duplicate `RecrawlEnd`)
SQL_QUERY_INSERT_SITE_DATA = """INSERT INTO `%s` VALUES('%s', %s,'%s','%s','%s','%s','%s',NOW()) ON DUPLICATE KEY UPDATE `Contents`=%s, `LastAdded`='%s', `minPDate`='%s', `maxPDate`='%s', `CDate`=NOW()"""
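
A minimal usage sketch (not part of Constants.py) showing how these templates might be filled in and executed. It assumes the MySQLdb client, reachable dc_urls/dc_sites databases, and a guessed mapping of the SQL_QUERY_NEW_URLS result columns onto the statistics table (count(*) -> Contents, max(`TcDate`) -> LastAdded, min/max(`LastModified`) -> minPDate/maxPDate); the function name, table name, and connection parameters are placeholders, not the application's actual code.

# usage_sketch.py -- illustrative only; names, connection parameters and the
# column mapping below are assumptions, not part of the HCE API.
import MySQLdb

import Constants


def collect_and_store(siteId, statsTable, host, connection):
    """Illustrative only: fill in the per-site templates and upsert one statistics row."""
    cursor = connection.cursor()
    # All three placeholders take the same site id: the urls_<id> table suffix
    # and both dc_sites sub-selects.
    cursor.execute(Constants.SQL_QUERY_NEW_URLS % (siteId, siteId, siteId))
    contents, lastAdded, minPDate, maxPDate = cursor.fetchone()
    # Start and end of the recrawl period for the same site.
    cursor.execute(Constants.SQL_QUERY_RECRAWL_PERIOD_START % siteId)
    recrawlStart = cursor.fetchone()[0]
    cursor.execute(Constants.SQL_QUERY_RECRAWL_PERIOD_END % siteId)
    recrawlEnd = cursor.fetchone()[0]
    # Create the per-site statistics table if it does not exist yet.
    cursor.execute(Constants.SQL_QUERY_NEW_SITE_TABLE % statsTable)
    # INSERT placeholder order: table, host, Contents, RecrawlStart, RecrawlEnd,
    # minPDate, maxPDate, LastAdded; the ON DUPLICATE KEY UPDATE part then
    # repeats Contents, LastAdded, minPDate, maxPDate.
    cursor.execute(Constants.SQL_QUERY_INSERT_SITE_DATA %
                   (statsTable, host, contents, recrawlStart, recrawlEnd,
                    minPDate, maxPDate, lastAdded,
                    contents, lastAdded, minPDate, maxPDate))
    connection.commit()


if __name__ == "__main__":
    # Connection parameters are placeholders only.
    conn = MySQLdb.connect(host="localhost", user="dc_user", passwd="dc_password", db="dc_stat")
    collect_and_store(1, "site_stats_1", "example.com", conn)

Note that the templates interpolate table suffixes and site ids directly with the % operator rather than through the driver's parameter binding, and the sketch mirrors that convention only to illustrate the placeholder order.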