# @link: http://hierarchical-cluster-engine.com/
# @copyright: Copyright © 2013-2014 IOIX Ukraine
# @license: http://hierarchical-cluster-engine.com/license/

# Default time-to-live for a lock.
# NOTE(review): the unit is not shown in this file; presumably seconds - confirm.
DEFAULT_LOCK_TTL = 600

# Task-level error code and human-readable error messages.
TASK_DUPLICATE_ERR = 2020
TASK_DUPLICATE_ERR_MSG = "Duplicate site"
TASK_SQL_ERR_MSG = "Some SQL error, look log file for details"

# Identifiers of the databases used by the application.
PRIMARY_DB_ID = "primaryDB"
SECONDARY_DB_ID = "secondaryDB"
THIRD_DB_ID = "thirdDB"
FOURTH_DB_ID = "fourthDB"
FIFTH_DB_ID = "fifthDB"
STAT_DOMAINS_DB_ID = "statDomainsDB"

# Name of the storage table.
DB_STORAGE_TABLE_NAME = "articles"

# Lock-related identifiers.
DB_LOCK_APPLICATION_ID = 0
FETCH_LOCK_NAME = "SELECT_LOCK"

# Character wrapped around field names when SQL fragments are assembled
# (a backtick, i.e. MySQL identifier quoting).
FIELD_QUOTE_SEPARATOR = "`"

# Process exit codes.
EXIT_CODE_CONFIG_ERROR = 1
EXIT_CODE_GLOBAL_ERROR = 2
EXIT_CODE_MYSQL_ERROR = 3
# Logger name, taken from APP_CONSTS (defined/imported outside this view).
LOGGER_NAME = APP_CONSTS.LOGGER_NAME
# ---------------------------------------------------------------------------
# SQL templates.
# All templates are plain %-format strings; several place "%s" inside quotes,
# so whoever fills them in is responsible for escaping the values first.
# ---------------------------------------------------------------------------

# Per-site table name templates ("%s" is the table-name suffix).
DC_URLS_TABLE_NAME_TEMPLATE = "urls_%s"
URL_URL_SQL_UPDATE = 'UPDATE ' + DC_URLS_TABLE_NAME_TEMPLATE + ' SET %s WHERE'
URL_URL_SQL_SELECT_COUNT = 'SELECT COUNT(*) FROM ' + DC_URLS_TABLE_NAME_TEMPLATE + ' WHERE '
DC_CONTENTS_TABLE_NAME_TEMPLATE = "contents_%s"
DC_FREQ_TABLE_NAME_TEMPLATE = "freq_%s"
DC_LOG_TABLE_NAME_TEMPLATE = "log_%s"
DC_ATT_TABLE_NAME_TEMPLATE = "att_%s"

USE_SQL_TEMPLATE = "USE `%s`"
SELECT_DB_STORAGE = "SELECT * FROM `%s` WHERE Id = '%s'"

INSERT_COMMON_TEMPLATE = "INSERT INTO `%s` SET %s"
SITE_SQL_TEMPLATE = "INSERT INTO `sites` %s VALUES %s"

# The two long statements below were written with a backslash continuation
# inside the literal; adjacent literals produce the same single-line string.
SITE_FILTER_SQL_TEMPLATE = (
  "INSERT INTO `sites_filters` SET `Site_Id`='%s', `Pattern`='%s', `Subject`='%s', "
  "`OperationCode`=%s, `Stage`=%s, `Action`=%s, `UDate`=%s, `Type`='%s', `Mode`='%s', "
  "`State`='%s', `Group_Id`=%s"
)
SITE_FILTER_SQL_UPDATE = (
  "UPDATE `sites_filters` SET `Pattern`='%s', `Subject`='%s', `OperationCode`=%s, `Stage`=%s, "
  "`Action`=%s, `UDate`=%s,`Group_Id`=%s WHERE `Site_Id`='%s' AND `Type`='%s' AND `Mode`='%s' "
  "AND `State`='%s'"
)
SITE_PROP_SQL_TEMPLATE = "INSERT INTO `sites_properties` SET `Site_Id`='%s', `Name`='%s', `Value`='%s'"
SITE_PROP_SQL_ADDITIONS = ", `URLMD5`='%s'"
SITE_PROP_SQL_SHOT = "INSERT INTO `sites_properties` SET %s"
SITE_PROP_SQL_UPDATE = "UPDATE `sites_properties` SET %s WHERE `Site_Id`='%s' AND `Name`='%s'"
SITE_URL_SQL_TEMPLATE = "INSERT INTO `sites_urls` SET %s"
SITE_URL_SQL_UPDATE = "UPDATE `sites_urls` SET %s WHERE `Site_Id`='%s'"
SITE_URL_SQL_SELECT_COUNT = 'SELECT COUNT(*) FROM `sites_urls` WHERE '

DEL_BY_ID_QUERY_TEMPLATE = "DELETE FROM `%s` WHERE `Site_Id` = '%s'"

# Yields 1 when the table (first %s) exists in the schema (second %s), else 0.
SQL_CHECK_TABLE_EXIST_TEMPLATE = (
  ' SELECT IF( EXISTS(SELECT * FROM information_schema.TABLES WHERE Table_Name="%s" '
  'and TABLE_SCHEMA="%s"), 1, 0) '
)
DC_SITE_URL_SQL_TEMPLATE = "INSERT INTO `%s` (`Site_Id`, `URL`) VALUES('%s', '%s')"

SELECT_SQL_TEMPLATE = " SELECT * FROM `%s` WHERE %s"
SELECT_SQL_TEMPLATE_SIMPLE = " SELECT %s FROM `%s`"
# Picks the site whose stored URL is the longest prefix of the given URL.
SELECT_SITE_ID_BY_URL = (
  "SELECT `Site_Id` FROM `sites_urls` "
  "WHERE SUBSTRING(\"%s\", 1, LENGTH(URL))=`URL` ORDER BY LENGTH(URL) DESC LIMIT 1"
)
SQL_CREATE_QUERY_TEMPLATE = "CREATE TABLE IF NOT EXISTS `%s` LIKE dc_urls.%s"

# CHECK_TABLE_SQL_ADDITION used to be assigned twice in a row; the earlier
# value ("`User_Id` = %s") was always overwritten by the one below, so only
# the effective definition is kept.
CHECK_TABLE_SQL_TEMPLATE = " SELECT COUNT(*) FROM sites WHERE `Id` = '%s'"
CHECK_TABLE_SQL_ADDITION = " AND `User_Id` = %s"

# File name template for key-value storage files.
KEY_VALUE_FILE_NAME_TEMPLATE = "%s.db"

# Frequency-record status names (constant names use mixed STATUS/STATS/STATE
# suffixes; they are kept as-is because other modules may reference them).
FREQ_NEW_STATUS = "FNew"
FREQ_CRAWLED_STATUS = "FCrawled"
FREQ_PROCESSED_STATS = "FProcessed"
FREQ_AGED_STATE = "FAged"
FREQ_DELETED_STATE = "FDeleted"
FREQ_PURGED_STATE = "FPurged"

# Log operation name -> numeric code.
# NOTE(review): only the entries visible in this view are listed and the
# closing bracket was not visible either - confirm against the full file that
# no entries are missing.
logOperationsDict = {
  "LOG_INSERT": 20,
  "LOG_URL_CLEANUP": 23,
  "LOG_URL_CONTENT": 25,
  "LOG_SELECTED_CRAWLING": 2,
  "LOG_SELECTED_PROCESSING": 5,
}
# Site object key (camelCase) -> capitalised field name.
# NOTE(review): the values look like `sites` table column names and the dict
# is presumably passed as `inputDict` to getFieldsValuesTuple - verify against
# callers. Only entries visible in this view are listed.
siteDict = {
  "id": "Id",
  "tcDateProcess": "TcDateProcess",
  "resources": "Resources",
  "contents": "Contents",
  "collectedURLs": "CollectedURLs",
  "newURLs": "NewURLs",
  "deletedURLs": "DeletedURLs",
  "iterations": "Iterations",
  "priority": "Priority",
  "maxURLs": "MaxURLs",
  "maxURLsFromPage": "MaxURLsFromPage",
  "maxResources": "MaxResources",
  "maxErrors": "MaxErrors",
  "maxResourceSize": "MaxResourceSize",
  "requestDelay": "RequestDelay",
  "processingDelay": "ProcessingDelay",
  "httpTimeout": "HTTPTimeout",
  "errorMask": "ErrorMask",
  "avgSpeed": "AVGSpeed",
  "avgSpeedCounter": "AVGSpeedCounter",
  "urlType": "URLType",
  "recrawlPeriod": "RecrawlPeriod",
  "recrawlDate": "RecrawlDate",
  "fetchType": "FetchType",
  "description": "Description",
  "categoryId": "Category_Id",
}
# Field names listed for exclusion when working with site records.
# NOTE(review): presumably supplied as `excludeList` to the field/value
# helpers of this module - verify against callers.
siteExcludeList = ["Id"]
# Site-property key (camelCase) -> capitalised field name.
# NOTE(review): only the first entry is visible in this view and the closing
# bracket was not visible either - confirm the remaining entries against the
# full file before relying on this literal.
propDict = {
  "siteId": "Site_Id",
}
# Site-filter key (camelCase) -> capitalised field name; the values match the
# column names used in the `sites_filters` SQL templates of this module.
# NOTE(review): only entries visible in this view are listed.
filterDict = {
  "siteId": "Site_Id",
  "pattern": "Pattern",
  "subject": "Subject",
  "opCode": "OperationCode",
  "groupId": "Group_Id",
}
# URL object key (camelCase) -> capitalised field name.
# NOTE(review): values look like column names of the per-site URL tables -
# verify against the schema. Only entries visible in this view are listed.
URLTableDict = {
  "siteId": "Site_Id",
  "crawled": "Crawled",
  "processed": "Processed",
  "contentType": "ContentType",
  "requestDelay": "RequestDelay",
  "processingDelay": "ProcessingDelay",
  "httpTimeout": "HTTPTimeout",
  "charset": "Charset",
  "batchId": "Batch_Id",
  "errorMask": "ErrorMask",
  "crawlingTime": "CrawlingTime",
  "processingTime": "ProcessingTime",
  "totalTime": "TotalTime",
  "httpCode": "HTTPCode",
  "httpMethod": "HTTPMethod",
  "rawContentMd5": "RawContentMd5",
  "parentMd5": "ParentMd5",
  "lastModified": "LastModified",
  "mRateCounter": "MRateCounter",
  "maxURLsFromPage": "MaxURLsFromPage",
  "tagsMask": "TagsMask",
  "tagsCount": "TagsCount",
  "contentURLMd5": "ContentURLMd5",
  "priority": "Priority",
  "classifierMask": "ClassifierMask",
}
# Proxy object key (camelCase) -> capitalised field name.
# NOTE(review): the literal was not terminated in this view, so further
# entries may follow "description" - confirm against the full file.
ProxyTableDict = {
  "id": "Id",
  "domains": "Domains",
  "priority": "Priority",
  "countryCode": "CountryCode",
  "countryName": "CountryName",
  "regionCode": "RegionCode",
  "regionName": "RegionName",
  "cityName": "CityName",
  "timeZone": "TimeZone",
  "latitude": "Latitude",
  "longitude": "Longitude",
  "metroCode": "MetroCode",
  "faultsMax": "FaultsMax",
  "categoryId": "Category_Id",
  "description": "Description",
}
# Attribute object key (camelCase) -> capitalised field name.
# NOTE(review): only the first entry is visible in this view and the closing
# bracket was not visible either - confirm the remaining entries against the
# full file before relying on this literal.
AttrTableDict = {
  "name": "Name",
}
# Field names listed for exclusion when working with URL records.
# NOTE(review): presumably supplied as `excludeList` to the field/value
# helpers of this module - verify against callers.
urlExcludeList = ["URL", "URLMd5"]
# Field names listed for exclusion when working with proxy records.
# NOTE(review): presumably supplied as `excludeList` to the field/value
# helpers of this module - verify against callers.
proxyExcludeList = ["Id", "Site_Id", "Host", "CDate"]
# URLTableDict extended with the "userId" -> "User_Id" mapping.
# dict(mapping, key=value) builds a new dict, so URLTableDict itself is left
# untouched; unlike `a.items() + b.items()` this form does not depend on
# items() returning a list.
# The misspelled name ("Ditct") is kept because other modules may import it.
SiteURLTableDitct = dict(URLTableDict, userId="User_Id")
# Content field names per storage backend key ("KVDB" and "MYSQL").
# Each backend gets its own list object, as in the original literal.
DbContentFields = {
  "KVDB": ["id", "data", "CDate"],
  "MYSQL": ["id", "data", "CDate"],
}
274 SITE_ID_NAME =
"SITE_ID" 284 if fName
in row
and row[fName]
is not None:
285 ret = str(row[fName])
295 if len(incomeStr) > 0:
296 if symbol
is None or incomeStr[-1] == symbol:
310 for key
in inputDict:
312 if isinstance(obj, dict):
313 if key
in obj.keys()
and obj[key]
is not None:
314 if excludeList
is None or key
not in excludeList:
315 fields.append(inputDict[key])
318 if hasattr(obj, key)
and getattr(obj, key)
is not None:
319 if excludeList
is None or key
not in excludeList:
320 fields.append(inputDict[key])
321 attr = getattr(obj, key)
324 if isinstance(attr, basestring):
325 escapingStr = MySQLdb.escape_string(str(attr))
326 values.append((
"'" + escapingStr +
"'"))
328 values.append(str(attr))
329 return (fields, values)
340 if len(fields) == len(values):
342 retFields += (FIELD_QUOTE_SEPARATOR + field + FIELD_QUOTE_SEPARATOR + COMA_SEPARATOR)
344 retValues += value + COMA_SEPARATOR
349 return (retFields, retValues)
359 if len(fields) == len(values):
360 for fieldIndex
in xrange(0, len(fields)):
361 if excludeList
is None or fields[fieldIndex]
not in excludeList:
362 ret = ret + FIELD_QUOTE_SEPARATOR + fields[fieldIndex] + FIELD_QUOTE_SEPARATOR +
"=" + \
363 values[fieldIndex] + COMA_SEPARATOR
def readDataTimeField(fName, row)
def createFieldsValuesString(fields, values, excludeList=None)
def cleateFieldsValuesLists(fields, values)
def getFieldsValuesTuple(obj, inputDict, excludeList=None)
def stripSymbol(incomeStr, symbol=None)