HCE Project Python language Distributed Tasks Manager Application, Distributed Crawler Application and client API bindings.  2.0.0-chaika
Hierarchical Cluster Engine Python language binding
URLDeleteTask.py
Go to the documentation of this file.
1 '''
2 @package: dc
3 @author igor
4 @link: http://hierarchical-cluster-engine.com/
5 @copyright: Copyright © 2013-2014 IOIX Ukraine
6 @license: http://hierarchical-cluster-engine.com/license/
7 @since: 0.1
8 '''
9 
10 import hashlib
11 import dc.EventObjects
12 from dtm.EventObjects import GeneralResponse
13 import dc_db.Constants as Constants
14 from dc_db.FieldRecalculator import FieldRecalculator
15 from dc_db.BaseTask import BaseTask
16 from dc_db.URLCleanupTask import URLCleanUpTask
17 from dc_db.StatisticLogManager import StatisticLogManager
18 from dc_db.AttrDeleteTask import AttrDeleteTask
19 import app.Utils as Utils # pylint: disable=F0401
20 
21 logger = Utils.MPLogger().getLogger()
22 
23 
24 # #process urlDelete event
26 
27 
28  # #constructor
29  #
30  # @param keyValueStorageDir path to keyValue storage work dir
31  # @param rawDataDir path to raw data dir
32  def __init__(self, keyValueStorageDir, rawDataDir, dBDataTask):
33  super(URLDeleteTask, self).__init__()
34  self.uRLCleanUpTask = URLCleanUpTask(keyValueStorageDir, rawDataDir, dBDataTask)
36  self.urlMd5 = None
37  # self.dBDataTask = dBDataTask
38 
39 
40  # #make all necessary actions to delete urls data from db
41  #
42  # @param urlDeletes list of URLDelete objects
43  # @param queryCallback function for queries execution
44  # @return generalResponse instance of GeneralResponse object
45  def process(self, urlDeletes, queryCallback):
46  ret = GeneralResponse()
47  for urlDelete in urlDeletes:
48  if urlDelete.siteId == "":
49  urlDelete.siteId = "0"
50  if self.isSiteExist(urlDelete.siteId, queryCallback):
51  try:
52  localUrls = []
53  if urlDelete.url is None:
54  isUrlExtract = False
55  if urlDelete.urlType == dc.EventObjects.URLStatus.URL_TYPE_URL:
56  isUrlExtract = True
57  localUrls = self.uRLCleanUpTask.extractUrlByCriterions(urlDelete.siteId, isUrlExtract,
58  urlDelete.criterions, queryCallback)
59  else:
60  localUrls.append(urlDelete.url)
61  logger.debug(">>> [URLDelete] localUrls size = " + str(len(localUrls)))
62  for localUrl in localUrls:
63  try:
64  urlDelete.url = localUrl
65  if urlDelete.delayedType == dc.EventObjects.NOT_DELAYED_OPERATION:
66  self.uRLCleanUpTask.deleteFromDataStorage(urlDelete, queryCallback)
67  self.uRLCleanUpTask.deleteFromRawStorage(urlDelete)
68  elif urlDelete.delayedType == dc.EventObjects.DELAYED_OPERATION:
69  self.copyUrlToDeleteDB(urlDelete, queryCallback)
70  self.deleteFromMysqlDB(urlDelete, queryCallback)
71  AttrDeleteTask.deleteUrlsAttributes(urlDelete.siteId, self.urlMd5, queryCallback)
72  if self.urlMd5 is not None:
73  StatisticLogManager.statisticUpdate(queryCallback, Constants.StatFreqConstants.FREQ_DELETE, 1,
74  urlDelete.siteId, self.urlMd5)
75  StatisticLogManager.statisticUpdate(queryCallback, Constants.StatFreqConstants.FREQ_DELETED_STATE, 1,
76  urlDelete.siteId, self.urlMd5)
77  StatisticLogManager.logUpdate(queryCallback, "LOG_DELETE", urlDelete, urlDelete.siteId, self.urlMd5)
78  except Exception as ex:
79  logger.debug(">>> [URLDelete] Some Type Exception [LOOP] = " + str(type(ex)) + " " + str(ex))
80  ret.statuses.append(True)
81  self.recalculator.commonRecalc(urlDelete.siteId, queryCallback)
82  except Exception as excp:
83  logger.debug(">>> [URLDelete] Some Type Exception = " + str(type(excp)) + " " + str(excp))
84  ret.statuses.append(False)
85  else:
86  ret.statuses.append(False)
87 
88  return ret
89 
90 
91  # #delete url record from mysql db
92  #
93  # @param urlDelete instance of URLDelete object
94  # @param queryCallback function for queries execution
95  def deleteFromMysqlDB(self, urlDelete, queryCallback):
96  self.urlMd5 = self.calculateMd5FormUrl(urlDelete.url, urlDelete.urlType, True)
97  SQL_CLAUSE = ("`URLMd5` = '%s'" % self.urlMd5)
98  UPDATE_SQL_TEMPLATE = "DELETE FROM `%s` WHERE %s"
99  tableName = Constants.DC_URLS_TABLE_NAME_TEMPLATE % urlDelete.siteId
100  query = UPDATE_SQL_TEMPLATE % (tableName, SQL_CLAUSE)
101  queryCallback(query, Constants.SECONDARY_DB_ID)
102 
103 
104  # #copy url record from dc_urls db to the delete db
105  #
106  # @param urlDelete instance of URLDelete or URLCleanup object
107  # @param queryCallback function for queries execution
108  def copyUrlToDeleteDB(self, urlDelete, queryCallback):
109  if urlDelete.urlType == dc.EventObjects.URLStatus.URL_TYPE_URL:
110  localMd5 = hashlib.md5(urlDelete.url).hexdigest()
111  else:
112  localMd5 = urlDelete.url
113  SQL_COPY_QUERY_TEMPLATE = "INSERT INTO %s SELECT * FROM dc_urls.%s WHERE `URLMd5` = '%s'"
114  tbName = Constants.DC_URLS_TABLE_NAME_TEMPLATE % urlDelete.siteId
115  # TODO: One more query for each delete request is too heavy for DB
116  # query = Constants.SQL_CREATE_QUERY_TEMPLATE % (tbName, tbName)
117  # queryCallback(query, Constants.FOURTH_DB_ID)
118  query = SQL_COPY_QUERY_TEMPLATE % (tbName, tbName, localMd5)
119  queryCallback(query, Constants.FOURTH_DB_ID, Constants.EXEC_INDEX, True)
def isSiteExist(self, siteId, queryCallback, userId=None)
Definition: BaseTask.py:29
def calculateMd5FormUrl(self, url, urlType, useNormilize=False)
Definition: BaseTask.py:188
def copyUrlToDeleteDB(self, urlDelete, queryCallback)
GeneralResponse event object, represents general state response for multipurpose usage.
def process(self, urlDeletes, queryCallback)
def __init__(self, keyValueStorageDir, rawDataDir, dBDataTask)
def deleteFromMysqlDB(self, urlDelete, queryCallback)