HCE Project Python language Distributed Tasks Manager Application, Distributed Crawler Application and client API bindings.  2.0.0-chaika
Hierarchical Cluster Engine Python language binding
Constants.py
Go to the documentation of this file.
1 '''
2 @package: dc
3 @author igor
4 @link: http://hierarchical-cluster-engine.com/
5 @copyright: Copyright © 2013-2014 IOIX Ukraine
6 @license: http://hierarchical-cluster-engine.com/license/
7 @since: 0.1
8 '''
9 
10 import MySQLdb
11 import app.Consts as APP_CONSTS
12 
# Application name and default lock time-to-live
APP_NAME = "db-task"
DEFAULT_LOCK_TTL = 600

# Data storage kind identifiers
DB_DATA_KVDB, DB_DATA_MYSQL = 0, 1

# Task error codes together with their messages
TASK_DUPLICATE_ERR, TASK_DUPLICATE_ERR_MSG = 2020, "Duplicate site"
TASK_SQL_ERR, TASK_SQL_ERR_MSG = 2021, "Some SQL error, look log file for details"

EXEC_INDEX, EXEC_NAME = 0, 1

# Database identifiers
PRIMARY_DB_ID = "primaryDB"
SECONDARY_DB_ID = "secondaryDB"
THIRD_DB_ID = "thirdDB"
FOURTH_DB_ID = "fourthDB"
FIFTH_DB_ID = "fifthDB"
STAT_DB_ID = "statDB"
LOG_DB_ID = "logDB"
ATT_DB_ID = "attDB"
STAT_DOMAINS_DB_ID = "statDomainsDB"

DB_STORAGE_TABLE_NAME = "articles"

# Lock related names
DB_LOCK_APPLICATION_ID = 0
FETCH_LOCK_NAME = "SELECT_LOCK"

# Separators used while building SQL fragments
COMA_SEPARATOR = ","
FIELD_QUOTE_SEPARATOR = "`"

# Process exit codes, numbered consecutively from 0
(EXIT_CODE_OK,
 EXIT_CODE_CONFIG_ERROR,
 EXIT_CODE_GLOBAL_ERROR,
 EXIT_CODE_MYSQL_ERROR) = range(4)
# The logger name is taken from the shared application constants; the
# module-specific literal below is kept only as a disabled alternative.
# LOGGER_NAME = "dc_db"
LOGGER_NAME = APP_CONSTS.LOGGER_NAME
52 
# Names of the working databases
DC_SITES = "dc_sites"
DC_URLS = "dc_urls"

# Template for table names in the dc_urls db, plus the statements derived from it
DC_URLS_TABLE_NAME_TEMPLATE = "urls_%s"
URL_URL_SQL_UPDATE = "UPDATE {0} SET %s WHERE".format(DC_URLS_TABLE_NAME_TEMPLATE)
URL_URL_SQL_SELECT_COUNT = "SELECT COUNT(*) FROM {0} WHERE ".format(DC_URLS_TABLE_NAME_TEMPLATE)

# Remaining table name templates; each takes a single %s placeholder
DC_CONTENTS_TABLE_NAME_TEMPLATE = "contents_%s"
DC_FREQ_TABLE_NAME_TEMPLATE = "freq_%s"
DC_LOG_TABLE_NAME_TEMPLATE = "log_%s"
DC_ATT_TABLE_NAME_TEMPLATE = "att_%s"
65 
# SQL templates
USE_SQL_TEMPLATE = "USE `%s`"
SELECT_DB_STORAGE = "SELECT * FROM `%s` WHERE Id = '%s'"

INSERT_COMMON_TEMPLATE = "INSERT INTO `%s` SET %s"
SITE_SQL_TEMPLATE = "INSERT INTO `sites` %s VALUES %s"
#------------------------------- Site Filters SQL templates -------------------------
SITE_FILTER_SQL_TEMPLATE = (
  "INSERT INTO `sites_filters` SET `Site_Id`='%s', `Pattern`='%s', `Subject`='%s', "
  "`OperationCode`=%s, `Stage`=%s, `Action`=%s, `UDate`=%s, `Type`='%s', `Mode`='%s', `State`='%s', `Group_Id`=%s"
)
SITE_FILTER_SQL_UPDATE = (
  "UPDATE `sites_filters` SET `Pattern`='%s', `Subject`='%s', `OperationCode`=%s, `Stage`=%s, "
  "`Action`=%s, `UDate`=%s,`Group_Id`=%s WHERE `Site_Id`='%s' AND `Type`='%s' AND `Mode`='%s' AND `State`='%s'"
)
#------------------------------- Site Properties SQL templates ----------------------
SITE_PROP_SQL_TEMPLATE = "INSERT INTO `sites_properties` SET `Site_Id`='%s', `Name`='%s', `Value`='%s'"
SITE_PROP_SQL_ADDITIONS = ", `URLMD5`='%s'"
SITE_PROP_SQL_SHOT = "INSERT INTO `sites_properties` SET %s"
SITE_PROP_SQL_UPDATE = "UPDATE `sites_properties` SET %s WHERE `Site_Id`='%s' AND `Name`='%s'"
#------------------------------- Site URLs SQL templates ----------------------------
SITE_URL_SQL_TEMPLATE = "INSERT INTO `sites_urls` SET %s"
SITE_URL_SQL_UPDATE = "UPDATE `sites_urls` SET %s WHERE `Site_Id`='%s'"
SITE_URL_SQL_SELECT_COUNT = 'SELECT COUNT(*) FROM `sites_urls` WHERE '

DEL_BY_ID_QUERY_TEMPLATE = "DELETE FROM `%s` WHERE `Site_Id` = '%s'"

# Yields 1 when the table (first %s) exists in the schema (second %s), otherwise 0
SQL_CHECK_TABLE_EXIST_TEMPLATE = (
  ' SELECT IF( EXISTS(SELECT * FROM information_schema.TABLES WHERE Table_Name="%s" '
  'and TABLE_SCHEMA="%s"), 1, 0) '
)
DC_SITE_URL_SQL_TEMPLATE = "INSERT INTO `%s` (`Site_Id`, `URL`) VALUES('%s', '%s')"

SELECT_SQL_TEMPLATE = " SELECT * FROM `%s` WHERE %s"
SELECT_SQL_TEMPLATE_SIMPLE = " SELECT %s FROM `%s`"
# Picks the Site_Id whose stored URL is the longest prefix of the given URL
SELECT_SITE_ID_BY_URL = (
  "SELECT `Site_Id` FROM `sites_urls` "
  'WHERE SUBSTRING("%s", 1, LENGTH(URL))=`URL` ORDER BY LENGTH(URL) DESC LIMIT 1'
)
SQL_CREATE_QUERY_TEMPLATE = "CREATE TABLE IF NOT EXISTS `%s` LIKE dc_urls.%s"

# Counts the rows of `sites` having the given Id.
CHECK_TABLE_SQL_TEMPLATE = " SELECT COUNT(*) FROM sites WHERE `Id` = '%s'"
# Fragment restricting a query by owner; presumably appended to
# CHECK_TABLE_SQL_TEMPLATE - TODO confirm against the callers.
# This name used to be bound twice in this module; the earlier value
# "`User_Id` = %s" (without the leading " AND ") was overwritten before anything
# could read it, so only the effective definition is kept.
CHECK_TABLE_SQL_ADDITION = " AND `User_Id` = %s"

# template for key value file name
KEY_VALUE_FILE_NAME_TEMPLATE = "%s.db"
104 
# String identifiers sharing the FREQ_ prefix, grouped in one namespace class
class StatFreqConstants(object):

  # operation identifiers
  FREQ_INSERT = "FIns"
  FREQ_DELETE = "FDel"
  FREQ_UPDATE = "FUpd"

  # status identifiers
  FREQ_NEW_STATUS = "FNew"
  FREQ_CRAWLED_STATUS = "FCrawled"
  FREQ_PROCESSED_STATS = "FProcessed"

  # state identifiers
  FREQ_AGED_STATE = "FAged"
  FREQ_DELETED_STATE = "FDeleted"
  FREQ_PURGED_STATE = "FPurged"
116 
# Maps log operation names to their numeric codes
logOperationsDict = {
  "LOG_INSERT": 20,
  "LOG_DELETE": 21,
  "LOG_UPDATE": 22,
  "LOG_URL_CLEANUP": 23,
  "LOG_URL_AGING": 24,
  "LOG_URL_CONTENT": 25,
  "LOG_NEW": 1,
  "LOG_SELECTED_CRAWLING": 2,
  "LOG_CRAWLING": 3,
  "LOG_CRAWLED": 4,
  "LOG_SELECTED_PROCESSING": 5,
  "LOG_PROCESSING": 6,
  "LOG_PROCESSED": 7,
}
130 
# Maps site object attribute names to their column names
siteDict = {
  "id": "Id",
  "uDate": "UDate",
  "tcDate": "TcDate",
  "tcDateProcess": "TcDateProcess",
  "cDate": "CDate",
  "resources": "Resources",
  "contents": "Contents",
  "collectedURLs": "CollectedURLs",
  "newURLs": "NewURLs",
  "deletedURLs": "DeletedURLs",
  "iterations": "Iterations",
  "state": "State",
  "priority": "Priority",
  "maxURLs": "MaxURLs",
  "maxURLsFromPage": "MaxURLsFromPage",
  "maxResources": "MaxResources",
  "maxErrors": "MaxErrors",
  "maxResourceSize": "MaxResourceSize",
  "requestDelay": "RequestDelay",
  "processingDelay": "ProcessingDelay",
  "httpTimeout": "HTTPTimeout",
  "errorMask": "ErrorMask",
  "errors": "Errors",
  "size": "Size",
  "avgSpeed": "AVGSpeed",
  "avgSpeedCounter": "AVGSpeedCounter",
  "urlType": "URLType",
  "userId": "User_Id",
  "recrawlPeriod": "RecrawlPeriod",
  "recrawlDate": "RecrawlDate",
  "fetchType": "FetchType",
  "description": "Description",
  "categoryId": "Category_Id",
}

# Column names listed for exclusion when handling site fields
siteExcludeList = ["Id"]
167 
# Maps site property attribute names to their column names
propDict = {
  "siteId": "Site_Id",
  "urlMd5": "URLMd5",
  "name": "Name",
  "value": "Value",
  "uDate": "UDate",
  "cDate": "CDate",
}
175 
176 
# Maps site filter attribute names to their column names
filterDict = {
  "siteId": "Site_Id",
  "pattern": "Pattern",
  "subject": "Subject",
  "opCode": "OperationCode",
  "stage": "Stage",
  "action": "Action",
  "type": "Type",
  "mode": "Mode",
  "state": "State",
  "uDate": "UDate",
  "cDate": "CDate",
  "groupId": "Group_Id",
}
190 
191 
# Maps URL object attribute names to their column names
URLTableDict = {
  "siteId": "Site_Id",
  "url": "URL",
  "type": "Type",
  "state": "State",
  "status": "Status",
  "crawled": "Crawled",
  "processed": "Processed",
  "urlMd5": "URLMd5",
  "contentType": "ContentType",
  "requestDelay": "RequestDelay",
  "processingDelay": "ProcessingDelay",
  "httpTimeout": "HTTPTimeout",
  "charset": "Charset",
  "batchId": "Batch_Id",
  "errorMask": "ErrorMask",
  "crawlingTime": "CrawlingTime",
  "processingTime": "ProcessingTime",
  "totalTime": "TotalTime",
  "httpCode": "HTTPCode",
  "UDate": "UDate",
  "CDate": "CDate",
  "httpMethod": "HTTPMethod",
  "size": "Size",
  "linksI": "LinksI",
  "linksE": "LinksE",
  "freq": "Freq",
  "depth": "Depth",
  "rawContentMd5": "RawContentMd5",
  "parentMd5": "ParentMd5",
  "lastModified": "LastModified",
  "eTag": "ETag",
  "mRate": "MRate",
  "mRateCounter": "MRateCounter",
  "tcDate": "TcDate",
  "maxURLsFromPage": "MaxURLsFromPage",
  "tagsMask": "TagsMask",
  "tagsCount": "TagsCount",
  "pDate": "PDate",
  "contentURLMd5": "ContentURLMd5",
  "priority": "Priority",
  "classifierMask": "ClassifierMask",
}
234 
# Maps proxy object attribute names to their column names
ProxyTableDict = {
  "id": "Id",
  "siteId": "Site_Id",
  "host": "Host",
  "domains": "Domains",
  "priority": "Priority",
  "state": "State",
  "countryCode": "CountryCode",
  "countryName": "CountryName",
  "regionCode": "RegionCode",
  "regionName": "RegionName",
  "cityName": "CityName",
  "zipCode": "ZipCode",
  "timeZone": "TimeZone",
  "latitude": "Latitude",
  "longitude": "Longitude",
  "metroCode": "MetroCode",
  "faults": "Faults",
  "faultsMax": "FaultsMax",
  "categoryId": "Category_Id",
  "limits": "Limits",
  "description": "Description",
  "cDate": "CDate",
  "uDate": "UDate",
}
259 
# Maps attribute object attribute names to their column names
AttrTableDict = {
  "name": "Name",
  "urlMd5": "URLMD5",
  "value": "Value",
}


# Column names listed for exclusion when handling URL and proxy fields
urlExcludeList = ["URL", "URLMd5"]
proxyExcludeList = ["Id", "Site_Id", "Host", "CDate"]
268 
# URLTableDict extended with the "userId" -> "User_Id" mapping.
# Built with dict(mapping, **kwargs): concatenating the two items() results
# only works while items() returns lists, whereas this form produces the same
# dictionary without that dependency. The misspelled name is kept as is because
# other modules may import it.
SiteURLTableDitct = dict(URLTableDict, userId="User_Id")
270 
# Content field names per storage kind; every kind gets its own list object
DbContentFields = dict((storageKind, ["id", "data", "CDate"]) for storageKind in ("KVDB", "MYSQL"))

SITE_ID_NAME = "SITE_ID"
275 
276 
# #Function reads a datetime field of a db row as a string
#
# @param fName - field name
# @param row - db row (any container supporting "in" and indexing by fName)
# @return str() of the field value, or None when the field is absent or holds None
def readDataTimeField(fName, row):
  if fName not in row:
    return None
  rawValue = row[fName]
  if rawValue is None:
    return None
  return str(rawValue)
287 
288 
# #Function strips the last symbol of the incoming string
#
# @param incomeStr - incoming string
# @param symbol - symbol compared with the last char of the string; when None the
#                 last char is removed unconditionally
# @return the string without its last char, or the unchanged string when it is
#         empty or its last char differs from symbol
def stripSymbol(incomeStr, symbol=None):
  if len(incomeStr) == 0:
    return incomeStr
  if symbol is not None and incomeStr[-1] != symbol:
    return incomeStr
  return incomeStr[:-1]
299 
300 
# #Function walks over inputDict and collects column names and SQL-ready values from obj
#
# @param obj - source of the values: a dict (read by key) or any other object (read by attribute)
# @param inputDict - dict mapping obj key/attribute names to column names
# @param excludeList - names to skip; matched against the keys of inputDict, not against
#                      the column names
# @return tuple (fields, values) of two parallel lists; string values are escaped and wrapped
#         in single quotes, all other values are converted with str()
def getFieldsValuesTuple(obj, inputDict, excludeList=None):
  fields = []
  values = []
  readByKey = isinstance(obj, dict)
  for key in inputDict:
    # Missing entries are treated exactly like entries holding None: skipped
    if readByKey:
      rawValue = obj.get(key)
    else:
      rawValue = getattr(obj, key) if hasattr(obj, key) else None
    if rawValue is None:
      continue
    if excludeList is not None and key in excludeList:
      continue

    fields.append(inputDict[key])
    if isinstance(rawValue, basestring):
      # NOTE(review): str() of a unicode value holding non-ASCII characters raises
      # UnicodeEncodeError on Python 2 - confirm that callers pass byte strings
      escapedValue = MySQLdb.escape_string(str(rawValue))  # pylint: disable=E1101
      values.append("'" + escapedValue + "'")
    else:
      values.append(str(rawValue))
  return (fields, values)
330 
331 
# #Function converts incoming fields and values lists to their string representations
#
# @param fields - list of field names
# @param values - list of values already formatted as strings
# @return tuple (fieldsString, valuesString): field names wrapped in FIELD_QUOTE_SEPARATOR
#         and values as given, both joined with COMA_SEPARATOR; two empty strings when the
#         lists differ in length
def cleateFieldsValuesLists(fields, values):
  # A length mismatch is not reported as an error; callers get two empty strings
  if len(fields) != len(values):
    return ("", "")
  # join() puts the separator only between items, so there is no trailing separator
  # to strip afterwards and no repeated string re-allocation while accumulating
  retFields = COMA_SEPARATOR.join(FIELD_QUOTE_SEPARATOR + field + FIELD_QUOTE_SEPARATOR for field in fields)
  retValues = COMA_SEPARATOR.join(values)
  return (retFields, retValues)
350 
351 
# #Function builds the string representation of incoming lists (fields and values)
#
# @param fields - list of field names
# @param values - list of values already formatted as strings, parallel to fields
# @param excludeList - field names to leave out of the result; None disables the filtering
# @return string of `field`=value pairs joined with COMA_SEPARATOR; an empty string when the
#         lists differ in length or every field is excluded
def createFieldsValuesString(fields, values, excludeList=None):
  # A length mismatch is not reported as an error; callers get an empty string
  if len(fields) != len(values):
    return ""
  # zip() pairs each field with its value, replacing the index-based xrange loop;
  # join() then needs no trailing separator clean-up
  pairs = [FIELD_QUOTE_SEPARATOR + field + FIELD_QUOTE_SEPARATOR + "=" + value
           for field, value in zip(fields, values)
           if excludeList is None or field not in excludeList]
  return COMA_SEPARATOR.join(pairs)
def readDataTimeField(fName, row)
Definition: Constants.py:282
def createFieldsValuesString(fields, values, excludeList=None)
Definition: Constants.py:357
def cleateFieldsValuesLists(fields, values)
Definition: Constants.py:337
def getFieldsValuesTuple(obj, inputDict, excludeList=None)
Definition: Constants.py:307
def stripSymbol(incomeStr, symbol=None)
Definition: Constants.py:293