HCE Project Python language Distributed Tasks Manager Application, Distributed Crawler Application and client API bindings.  2.0.0-chaika
Hierarchical Cluster Engine Python language binding
URLStatusTask.py
Go to the documentation of this file.
1 '''
2 @package: dc
3 @author igor
4 @link: http://hierarchical-cluster-engine.com/
5 @copyright: Copyright © 2013-2014 IOIX Ukraine
6 @license: http://hierarchical-cluster-engine.com/license/
7 @since: 0.1
8 '''
9 
10 from dc_db.BaseTask import BaseTask
11 from dc_db import Constants
12 import dc.EventObjects
13 from app.Utils import UrlNormalizator
14 import app.Utils as Utils # pylint: disable=F0401
15 
16 logger = Utils.MPLogger().getLogger()
17 
18 # #process urlStatus event
20 
21  # #constructor
22  #
23  def __init__(self):
24  pass
25 
26 
# #make all necessary actions to get the status of input URLs
#
# @param urlStatuses list of URLStatus objects
# @param queryCallback function for queries execution
# @return list of URL objects
def process(self, urlStatuses, queryCallback):
    '''Collect the URL objects for every requested status entry.

    Each entry whose site passes self.isSiteExist() is looked up with
    self.getURL() and the results are concatenated in input order;
    entries for sites that do not pass the check are skipped.

    @param urlStatuses iterable of objects exposing a siteId attribute
    @param queryCallback function for queries execution, handed through
           unchanged to isSiteExist() and getURL()
    @return flat list of everything returned by getURL()
    '''
    collected = []
    for status in urlStatuses:
        # @todo add more complex case
        if not self.isSiteExist(status.siteId, queryCallback):
            continue
        collected += self.getURL(status, queryCallback)
    return collected
39 
40 
# #select URL fields from the url_siteId table and return them as a filled list[URL]
#
# @param urlStatus object of URLStatus type
# @param queryCallback function for queries execution
# @return list[URL]
def getURL(self, urlStatus, queryCallback):
    '''Select the rows matching one URL status request and wrap them as URL objects.

    The table name is built from Constants.DC_URLS_TABLE_NAME_TEMPLATE and
    urlStatus.siteId. Rows are matched on the URLMd5 column when
    urlStatus.urlType is URLStatus.URL_TYPE_MD5, otherwise on the URL column.

    @param urlStatus object exposing siteId, urlType and url attributes
    @param queryCallback function for queries execution; called with the SQL
           text, Constants.SECONDARY_DB_ID and Constants.EXEC_NAME
    @return result of fillUrlsList() for the selected rows
    '''
    table = Constants.DC_URLS_TABLE_NAME_TEMPLATE % urlStatus.siteId
    matchByMd5 = urlStatus.urlType == dc.EventObjects.URLStatus.URL_TYPE_MD5
    conditionTemplate = "URLMd5 = '%s'" if matchByMd5 else "URL = '%s'"
    # NOTE(review): urlStatus.url and urlStatus.siteId are interpolated into
    # the SQL text as-is; confirm that callers validate/escape them, since a
    # quote character in the value would change the statement.
    condition = conditionTemplate % urlStatus.url
    sql = "SELECT * FROM `%s` WHERE %s" % (table, condition)
    rows = queryCallback(sql, Constants.SECONDARY_DB_ID, Constants.EXEC_NAME)
    return self.fillUrlsList(rows, dc.EventObjects.URL, Constants.URLTableDict)
57 
58 
# #fill urls list in common format
#
# @param res - rows returned by the MySQL SELECT * query
# @param urlType - concrete URL object type to instantiate for each row
# @param urlDict - dict mapping URL object field names to row column names
# @return list of filled urlType objects
def fillUrlsList(self, res, urlType, urlDict):
    '''Convert SELECT result rows into a list of urlType objects.

    Rows lacking a "Site_Id" or "URL" key are skipped. For every other row
    an object is created from those two values, then each attribute named
    in urlDict that the object already has is overwritten from the mapped
    row column when that column is present. The five date attributes are
    always set through Constants.readDataTimeField().

    @param res rows of the query result; anything without an __iter__
           attribute is reported through the logger and yields []
    @param urlType callable accepting siteId, url and normalizeMask keywords
    @param urlDict mapping of object attribute name -> row column name
    @return list of the created objects, in row order
    '''
    filled = []
    if not hasattr(res, '__iter__'):
        logger.error(">>> SQL Select return NULL or not itereble")
        return filled

    # (object attribute, row column) pairs that are read via readDataTimeField
    dateFields = (
        ("UDate", "UDate"),
        ("CDate", "CDate"),
        ("lastModified", "LastModified"),
        ("tcDate", "TcDate"),
        ("pDate", "PDate"),
    )
    for record in res:
        if "Site_Id" not in record or "URL" not in record:
            continue
        item = urlType(siteId=record["Site_Id"], url=record["URL"], normalizeMask=UrlNormalizator.NORM_NONE)
        for attrName, column in urlDict.items():
            if hasattr(item, attrName) and column in record:
                setattr(item, attrName, record[column])
        for attrName, column in dateFields:
            setattr(item, attrName, Constants.readDataTimeField(column, record))
        filled.append(item)
    return filled
def isSiteExist(self, siteId, queryCallback, userId=None)
Definition: BaseTask.py:29
def process(self, urlStatuses, queryCallback)
def fillUrlsList(self, res, urlType, urlDict)
def getURL(self, urlStatus, queryCallback)