HCE Project Python language Distributed Tasks Manager Application, Distributed Crawler Application and client API bindings.  2.0.0-chaika
Hierarchical Cluster Engine Python language binding
SiteFindTask.py
Go to the documentation of this file.
1 '''
2 @package: dc
3 @author igor
4 @link: http://hierarchical-cluster-engine.com/
5 @copyright: Copyright © 2013-2014 IOIX Ukraine
6 @license: http://hierarchical-cluster-engine.com/license/
7 @since: 0.1
8 '''
9 
10 import MySQLdb
11 
12 import dc.EventObjects
13 import dc_db.Constants as Constants
14 from dc_db.SiteTask import SiteTask
15 from dc_db.SiteStatusTask import SiteStatusTask
16 import app.Utils as Utils # pylint: disable=F0401
17 
18 logger = Utils.MPLogger().getLogger()
19 
20 
21 # #sql query templates which load a site row and its urls, properties and filters by site id
22 GET_SITE_SQL_TEMPLATE = " SELECT * FROM %s WHERE `Id`='%s'"
23 GET_SITE_URLS_SQL_TEMPLATE = " SELECT `URL` FROM sites_urls WHERE `Site_Id`='%s'"
24 GET_SITE_PROPERTIES_SQL_TEMPLATE = " SELECT `Name`, `Value` FROM sites_properties WHERE `Site_Id`='%s'"
25 GET_SITE_FILTERS_SQL_TEMPLATE = " SELECT `Pattern`, `Type`, `Mode` FROM sites_filters WHERE `Site_Id`='%s'"
26 
27 # @todo move to appropriate place
28 TASK_NOT_EXIST_ERR = 2020
29 TASK_NOT_EXISTS_ERR_MSG = "Duplicate site"
30 
31 
33 
34 
35  # #constructor
36  #
37  # @param dcSiteTemplate path to sql template for dc_urls_* tables
def __init__(self, dcSiteTemplate, keyValueDefaultFile, keyValueStorageDir, dBDataTask, dcStatTemplates,
             dcLogTemplate, dcAttrTemplate):
    '''
    Initialise the task by passing every argument, unchanged and in the same order, to the
    base class initialiser.

    @param dcSiteTemplate path to sql template for dc_urls_* tables
    @param keyValueDefaultFile forwarded to the base class initialiser as is
    @param keyValueStorageDir forwarded to the base class initialiser as is
    @param dBDataTask forwarded to the base class initialiser as is
    @param dcStatTemplates forwarded to the base class initialiser as is
    @param dcLogTemplate forwarded to the base class initialiser as is
    @param dcAttrTemplate forwarded to the base class initialiser as is
    '''
    baseArgs = (dcSiteTemplate, keyValueDefaultFile, keyValueStorageDir, dBDataTask, dcStatTemplates,
                dcLogTemplate, dcAttrTemplate)
    super(SiteFindTask, self).__init__(*baseArgs)
    # siteFind() calls fillUrls/fillProperties/fillFilters on this attribute, so it must be
    # replaced with a real object before siteFind() runs; only the placeholder is set here.
    self.siteStatusTask = None
42  self.siteStatusTask = None
43 
44 
45  # #find the sites matching the given request in mysql db
46  #
47  # @param siteFind request object describing the sites to look for
48  # @param queryCallback function for queries execution
49  # @return list of site objects produced by siteFind()
def process(self, siteFind, queryCallback):
    '''
    Delegate to siteFind() and return its result unchanged.

    @param siteFind request object passed straight through to siteFind()
    @param queryCallback function for queries execution
    @return the value returned by siteFind()
    '''
    return self.siteFind(siteFind, queryCallback)
54 
55 
56  # #
57  #
def loadSiteFromDB(self, siteFind, site_id, site, queryCallback):
    '''
    Query one row for the given site id and copy its columns onto the `site` object.

    The FROM clause is 'sites'. When the tables criterion of siteFind is present, not None and
    not empty, it is either appended to 'sites' or, if it already contains the text 'sites',
    used on its own.

    @param siteFind request object; only its `criterions` mapping is read here
    @param site_id mapping whose "Site_Id" entry is substituted into the query
    @param site object whose attributes are overwritten in place through its __dict__
    @param queryCallback function for queries execution
    @return the same `site` object that was passed in
    '''
    baseTable = 'sites'
    tablesKey = dc.EventObjects.SiteFind.CRITERION_TABLES
    fromClause = baseTable
    if tablesKey in siteFind.criterions:
        requested = siteFind.criterions[tablesKey]
        if requested is not None and requested != "":
            # substring test on the criterion text, exactly as the query was always built
            if baseTable in requested:
                fromClause = requested
            else:
                fromClause = baseTable + ", " + requested

    query = GET_SITE_SQL_TEMPLATE % (fromClause, site_id["Site_Id"])
    site_row = queryCallback(query, Constants.PRIMARY_DB_ID, Constants.EXEC_NAME)
    logger.debug("Get site from sites: %s", str(site_row))

    # these attributes are stored as the string form of the column value
    stringifiedKeys = ("uDate", "tcDate", "tcDateProcess", "cDate", "recrawlDate")
    for key, column in Constants.siteDict.items():
        columnText = str(column)
        if columnText.startswith("`"):
            # drop the first and the last character (the back-quotes around the column name)
            column = columnText[1:-1]
        logger.debug("site field: %s; table field: %s", str(site.__dict__[key]),
                     str(site_row[0].get(column, None)))
        row = site_row[0]
        if key in stringifiedKeys:
            site.__dict__[key] = str(row.get(column, None))
        else:
            # NOTE(review): "a" is what a missing column resolves to; it looks like a
            # placeholder value - confirm before relying on it.
            site.__dict__[key] = row.get(column, "a")
    return site
92 
93 
94  # #
95  #
def loadListOfSitesFromDB(self, siteFind, queryCallback):
    '''
    Build and execute the query that returns the ids of the sites matching the request.

    Three query shapes are produced:
      - url given, criterions empty or None: ids from sites_urls whose URL starts with the url;
      - url given, criterions not empty: ids from sites_urls, with the url prefix condition
        handed to generateCriterionSQL() together with the criterions;
      - no url: ids from sites (plus additional tables), filtered by generateCriterionSQL().

    In the "no url" case a criterions value of None is handled like an empty dict instead of
    failing on the membership tests.

    @param siteFind request object; its `url` and `criterions` attributes are read
    @param queryCallback function for queries execution
    @return the result of queryCallback for the generated query
    '''
    criterions = siteFind.criterions
    if siteFind.url is not None:
        # NOTE(review): escape_string() is not expected to neutralise the LIKE wildcards
        # (% and _) inside the url - confirm that this is acceptable for callers.
        urlCondition = " `URL` LIKE '" + MySQLdb.escape_string(siteFind.url) + "%' "  # pylint: disable=E1101
        if not criterions:
            query = "SELECT `Site_Id` FROM sites_urls WHERE" + urlCondition + "GROUP BY `Site_Id`"
        else:
            query = "SELECT `Site_Id` FROM sites_urls " + self.generateCriterionSQL(criterions, urlCondition)
    else:
        if criterions is None:
            # without this the "in" tests below raise TypeError for a request with no criterions
            criterions = {}

        urlsTable = "sites_urls"
        sitesTable = 'sites'
        tablesKey = dc.EventObjects.SiteFind.CRITERION_TABLES

        # sites_urls joins the FROM list only when the WHERE criterion mentions it
        addTable = ""
        if ("WHERE" in criterions) and (criterions["WHERE"] is not None) and \
           (urlsTable in criterions["WHERE"]):
            addTable = ", " + urlsTable

        # NOTE(review): a tables criterion without the text 'sites' replaces the addition made
        # above rather than extending it - confirm that dropping sites_urls here is intended.
        if tablesKey in criterions and criterions[tablesKey] is not None and criterions[tablesKey] != "":
            if sitesTable not in criterions[tablesKey]:
                addTable = ", " + criterions[tablesKey]

        query = "SELECT `Id` AS Site_Id FROM " + sitesTable + addTable + self.generateCriterionSQL(criterions)

    site_ids = queryCallback(query, Constants.PRIMARY_DB_ID, Constants.EXEC_NAME)
    logger.debug("List of Site_Id: %s", str(site_ids))

    return site_ids
125 
126 
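127  # #find the sites matching the given request and load their data from current db
128  #
129  # @param siteFind request object with the url, criterions and excludeList to apply
130  # @param queryCallback function for queries execution
131  # @return list of found site objects, empty if nothing was found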
def siteFind(self, siteFind, queryCallback):
    '''
    Build one populated Site object for every site id matched by the request.

    @param siteFind request object; its `excludeList` decides which of urls, properties and
           filters are left unloaded
    @param queryCallback function for queries execution
    @return list of Site objects; empty when the id lookup result has no __iter__ attribute
            or yields no ids
    '''
    found = []
    # all unique site ids selected for the request
    site_ids = self.loadListOfSitesFromDB(siteFind, queryCallback)
    if not hasattr(site_ids, "__iter__"):
        return found

    for site_id in site_ids:
        site = dc.EventObjects.Site("")
        # fill the plain fields from the sites table
        self.loadSiteFromDB(siteFind, site_id, site, queryCallback)

        # an excluded part is reported as None instead of being loaded
        if SiteTask.FIELD_NAME_URLS in siteFind.excludeList:
            site.urls = None
        else:
            site.urls = self.siteStatusTask.fillUrls(site, queryCallback)

        if SiteTask.FIELD_NAME_PROPERTIES in siteFind.excludeList:
            site.properties = None
        else:
            site.properties = self.siteStatusTask.fillProperties(site, queryCallback)

        if SiteTask.FIELD_NAME_FILTERS in siteFind.excludeList:
            site.filters = None
        else:
            site.filters = self.siteStatusTask.fillFilters(site, queryCallback)

        found.append(site)

    return found
def process(self, siteFind, queryCallback)
Definition: SiteFindTask.py:50
def siteFind(self, siteFind, queryCallback)
def loadSiteFromDB(self, siteFind, site_id, site, queryCallback)
Definition: SiteFindTask.py:58
def loadListOfSitesFromDB(self, siteFind, queryCallback)
Definition: SiteFindTask.py:96
def generateCriterionSQL(self, criterions, additionWhere=None, siteId=None)
Definition: BaseTask.py:46
def __init__(self, dcSiteTemplate, keyValueDefaultFile, keyValueStorageDir, dBDataTask, dcStatTemplates, dcLogTemplate, dcAttrTemplate)
Definition: SiteFindTask.py:39