HCE Project Python language Distributed Tasks Manager Application, Distributed Crawler Application and client API bindings.  2.0.0-chaika
Hierarchical Cluster Engine Python language binding
URLStatsTask.py
Go to the documentation of this file.
1 '''
2 @package: dc
3 @author scorp
4 @link: http://hierarchical-cluster-engine.com/
5 @copyright: Copyright © 2013-2014 IOIX Ukraine
6 @license: http://hierarchical-cluster-engine.com/license/
7 @since: 0.1
8 '''
9 
10 import copy
11 import dc.EventObjects
12 import dc_db.Constants as Constants
13 from dc_db.BaseTask import BaseTask
14 from dc_db.URLCleanupTask import URLCleanUpTask
15 import app.Utils as Utils # pylint: disable=F0401
16 
# Module-level logger, obtained through Utils.MPLogger().getLogger(); used by process() for debug output.
17 logger = Utils.MPLogger().getLogger()
18 
19 # #process URLStatsTask task
21 
# SQL query template: the first placeholder is the table name, the second the URL MD5
# (both are filled in by fetchStatsFromDB).
22  SQL_STATS_TEMPLATE = "SELECT * FROM %s WHERE `URLMd5`='%s'"
23 
24  # #constructor
25  #
def __init__(self, keyValueStorageDir, rawDataDir, dBDataTask):
    """Initialise the task and create its URL clean-up helper.

    The three arguments are forwarded, unchanged and in the same order, to
    the URLCleanUpTask constructor; this constructor does not use them for
    anything else.

    @param keyValueStorageDir forwarded to URLCleanUpTask
    @param rawDataDir forwarded to URLCleanUpTask
    @param dBDataTask forwarded to URLCleanUpTask
    """
    super(URLStatsTask, self).__init__()
    # process() calls extractUrlByCriterions() on this helper to turn
    # criteria into lists of URL MD5s.
    cleanUpTask = URLCleanUpTask(keyValueStorageDir, rawDataDir, dBDataTask)
    self.uRLCleanUpTask = cleanUpTask
29 
30 
31  # #process - main class's execution point.
32  #
33  # @param urlStatses incoming urlStatses element (list of urlStats)
34  # @param queryCallback function for queries execution
35  # @return uRLStatsResponses element
36  def process(self, urlStatses, queryCallback):
# For every item of urlStatses: work out which URL MD5s to report on, fetch their
# rows through fetchStatsFromDB and collect them in response objects that are
# appended to the returned list.
# NOTE(review): this listing has lost its original indentation, so block nesting
# cannot be read off it; the comments below describe individual statements and
# must be confirmed against the real file before any restructuring.
37  uRLStatsResponses = []
38  for urlStats in urlStatses:
# Per-item state; the response object is only created further down, after a
# fetchStatsFromDB call has returned without raising.
39  uRLStatsResponse = None
40  localMd5s = []
41  if urlStats is not None:
42  if urlStats.urlMd5 is None:
# No explicit MD5 supplied: derive the MD5 list from the criteria instead.
43  if urlStats.urlCriterions is not None and len(urlStats.urlCriterions) > 0:
44  localMd5s = self.uRLCleanUpTask.extractUrlByCriterions(urlStats.siteId, False,
45  urlStats.urlCriterions, queryCallback)
46  if urlStats.statsCriterions is not None:
47  statsMd5s = []
# Stats criteria without URL criteria: run them directly, passing STAT_DB_ID and
# the frequency table name template.
48  if urlStats.urlCriterions is None or len(urlStats.urlCriterions) == 0:
49  statsMd5s = self.uRLCleanUpTask.extractUrlByCriterions(urlStats.siteId,
50  False,
51  urlStats.statsCriterions, queryCallback,
52  Constants.STAT_DB_ID,
53  Constants.DC_FREQ_TABLE_NAME_TEMPLATE)
54  else:
# Both kinds of criteria present: the stats criteria are re-run once per entry
# of localMd5s, each time restricted to that single MD5.
55  SQL_WHERE_TMPL = "`UrlMd5` = '%s'"
56  statsCriterionCopy = copy.deepcopy(urlStats.statsCriterions)
57  for localMd5 in localMd5s:
# Each iteration starts from the untouched copy. Note that this rebinds the
# statsCriterions attribute on the caller's urlStats object.
58  urlStats.statsCriterions = copy.deepcopy(statsCriterionCopy)
# NOTE(review): when a WHERE criterion already exists, the assignment below
# replaces it with ' AND <md5 clause>' instead of appending to it -- confirm
# whether '+=' was intended.
59  if dc.EventObjects.URLFetch.CRITERION_WHERE in urlStats.statsCriterions \
60  and urlStats.statsCriterions[dc.EventObjects.URLFetch.CRITERION_WHERE] is not None:
61  urlStats.statsCriterions[dc.EventObjects.URLFetch.CRITERION_WHERE] = ' AND ' + \
62  (SQL_WHERE_TMPL % localMd5)
63  else:
64  urlStats.statsCriterions[dc.EventObjects.URLFetch.CRITERION_WHERE] = (SQL_WHERE_TMPL % localMd5)
65  statsMd5s += self.uRLCleanUpTask.extractUrlByCriterions(urlStats.siteId,
66  False,
67  urlStats.statsCriterions,
68  queryCallback,
69  Constants.STAT_DB_ID,
70  Constants.DC_FREQ_TABLE_NAME_TEMPLATE)
# The MD5s obtained through the stats criteria replace whatever localMd5s held.
71  localMd5s = statsMd5s
72 
73 # '''
74 # if urlStats.urlCriterions is not None:
75 # urlsMd5s = self.uRLCleanUpTask.extractUrlByCriterions(urlStats.siteId, False,
76 # urlStats.urlCriterions, queryCallback)
77 #
78 # if urlStats.statsCriterions is not None:
79 # statsMd5s = self.uRLCleanUpTask.extractUrlByCriterions(urlStats.siteId, False,
80 # urlStats.statsCriterions, queryCallback, Constants.STAT_DB_ID,
81 # Constants.DC_FREQ_TABLE_NAME_TEMPLATE)
82 # if len(statsMd5s) > 0 and len(urlsMd5s) > 0:
83 # localMd5s = [x for x in statsMd5s if x in urlsMd5s]
84 # elif len(statsMd5s) > 0:
85 # localMd5s = statsMd5s
86 # elif len(urlsMd5s) > 0:
87 # localMd5s = urlsMd5s
88 # '''
89  else:
# An explicit MD5 was supplied: add it to the candidate list.
90  localMd5s.append(urlStats.urlMd5)
91  logger.debug(">>> [URLStatsTask] localUrls size = " + str(len(localMd5s)))
92  for localMd5 in localMd5s:
93  try:
# urlMd5 is overwritten on the caller's object because fetchStatsFromDB reads
# the MD5 to query from urlStats.urlMd5.
94  urlStats.urlMd5 = localMd5
95  res = self.fetchStatsFromDB(urlStats, queryCallback)
# Create the response lazily, then accumulate the rows of every MD5 in freqRows.
96  if uRLStatsResponse is None:
97  uRLStatsResponse = dc.EventObjects.URLStatsResponse([], urlStats.siteId)
98  if res is not None and len(res) > 0:
99  uRLStatsResponse.freqRows.extend(res)
# Best-effort: a failure for one MD5 is only logged at debug level and the loop
# moves on to the next MD5.
100  except Exception as ex:
101  logger.debug(">>> [URLStatsTask] Some Type Exception = " + str(type(ex)) + " " + str(ex))
# NOTE(review): uRLStatsResponse may still be None here (no MD5 resolved, or
# every fetch raised) -- confirm that callers accept None entries.
102  uRLStatsResponses.append(uRLStatsResponse)
103 
104  return uRLStatsResponses
105 
106 
107  # #fetchStatsFromDB - method builds and executes the SQL query that fetches stats data
108  #
109  # @param urlStats element of URLStats object
110  # @param queryCallback function for queries execution
111  # @return SQL response element
def fetchStatsFromDB(self, urlStats, queryCallback):
    """Run the stats query for one URL and return the callback's result.

    The table name is built from Constants.DC_FREQ_TABLE_NAME_TEMPLATE and
    urlStats.siteId; the query text comes from self.SQL_STATS_TEMPLATE filled
    with that table name and urlStats.urlMd5.

    @param urlStats object providing the siteId and urlMd5 attributes
    @param queryCallback callable invoked as
           queryCallback(query, Constants.STAT_DB_ID, Constants.EXEC_NAME)
    @return whatever queryCallback returned (possibly None); any "CDate" and
            "MDate" values in the returned rows are converted to str in place
    """
    freqTable = Constants.DC_FREQ_TABLE_NAME_TEMPLATE % urlStats.siteId
    # NOTE(review): the query is assembled by plain string formatting; if
    # siteId or urlMd5 can come from untrusted input this is injectable --
    # confirm against the callers.
    rows = queryCallback(self.SQL_STATS_TEMPLATE % (freqTable, urlStats.urlMd5),
                         Constants.STAT_DB_ID, Constants.EXEC_NAME)
    if rows is None:
        return rows
    # Rows are presumably dict-like (membership test and item assignment are
    # the only operations used); date values are stringified in place.
    for row in rows:
        for dateField in ("CDate", "MDate"):
            if dateField in row:
                row[dateField] = str(row[dateField])
    return rows
def __init__(self, keyValueStorageDir, rawDataDir, dBDataTask)
Definition: URLStatsTask.py:26
def fetchStatsFromDB(self, urlStats, queryCallback)
def process(self, urlStatses, queryCallback)
Definition: URLStatsTask.py:36