HCE Project Python language Distributed Tasks Manager Application, Distributed Crawler Application and client API bindings.  2.0.0-chaika
Hierarchical Cluster Engine Python language binding
CrawlingOptimiser.py
"""
HCE project, Python bindings, Crawling Optimiser application.

@package: dc
@file CrawlingOptimiser.py
@author Oleksii <developers.hce@gmail.com>
@author madk <developers.hce@gmail.com>
@link: http://hierarchical-cluster-engine.com/
@copyright: Copyright &copy; 2013-2014 IOIX Ukraine
@license: http://hierarchical-cluster-engine.com/license/
@since: 0.1
"""


import logging.config
import ConfigParser
from cement.core import foundation

import dc_co.Constants as CONSTS
import app.Consts as APP_CONSTS
import app.Utils as Utils  # pylint: disable=F0401
from dc_crawler.DBTasksWrapper import DBTasksWrapper



# # The CrawlingOptimiser class collects per-site recrawl statistics from the remote
# tasks database and stores them into the local crawling-optimiser database.
#
# This object is a run-at-once application.
class CrawlingOptimiser(foundation.CementApp):

  # Mandatory
  class Meta(object):
    label = CONSTS.APP_NAME

    def __init__(self):
      pass

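  # The inner Meta class is cement's standard application configuration holder;
  # 'label' names the application and is taken here from CONSTS.APP_NAME in
  # dc_co.Constants.
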
  # #constructor
  # initialize default fields
  def __init__(self):
    # call base class __init__ method
    foundation.CementApp.__init__(self)
    self.exit_code = CONSTS.EXIT_SUCCESS
    self.logger = None
    self.message_queue = []
    # self.url_table = None
    self.site_id = None
    self.recrawl_dict = {}
    self.site_features = {}
    self.local_wrapper = None
    self.remote_wrapper = None
    self.remote_host = None

  # #setup
  # setup application
  def setup(self):
    # call base class setup method
    foundation.CementApp.setup(self)
    self.args.add_argument('-c', '--config', action='store', metavar='config_file', help='config ini-file')
    self.args.add_argument('-s', '--site', action='store', metavar='site alias', help='site alias')

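  # Assumed example invocation using the two arguments registered above (the ini
  # path and the site alias value are illustrative, not taken from the project):
  #   python CrawlingOptimiser.py --config=../ini/crawling-optimiser.ini --site=1234
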
  # #run
  # run application
  def run(self):
    # call base class run method
    foundation.CementApp.run(self)

    # config section
    self.loadConfig()

    # load logger config file
    self.loadLogConfigFile()

    # load mandatory options
    self.loadOptions()

    # make processing
    self.process()

    # Finish logging
    self.logger.info(APP_CONSTS.LOGGER_DELIMITER_LINE)


  # #load config from file
  # load from cli argument or default config file
  def loadConfig(self):
    try:
      self.config = ConfigParser.ConfigParser()
      self.config.optionxform = str

      # config argument
      if self.pargs.config:
        self.config.read(self.pargs.config)
        self.message_queue.append(CONSTS.MSG_INFO_LOAD_CONFIG_FILE + str(self.pargs.config))
      else:
        self.config.read(CONSTS.DEFAULT_CFG_FILE)
        self.message_queue.append(CONSTS.MSG_INFO_LOAD_DEFAULT_CONFIG_FILE + CONSTS.DEFAULT_CFG_FILE)

      # site argument
      if self.pargs.site:
        self.site_id = self.pargs.site
        self.message_queue.append(CONSTS.MSG_INFO_LOAD_SITE_ID + str(self.pargs.site))
      else:
        self.site_id = CONSTS.SITE_ALL
        self.message_queue.append(CONSTS.MSG_INFO_LOAD_DEFAULT_SITE_ID + str(CONSTS.SITE_ALL))

    except Exception, err:
      print CONSTS.MSG_ERROR_LOAD_CONFIG, err.message
      raise

  # #load logging
  # load logging configuration (log file, log level, filters)
  #
  def loadLogConfigFile(self):
    try:
      # print str(vars(self.config))
      log_conf_file = self.config.get("Application", "log")
      logging.config.fileConfig(log_conf_file)
      self.logger = Utils.MPLogger().getLogger()
    except Exception, err:
      print CONSTS.MSG_ERROR_LOAD_LOG_CONFIG_FILE, err.message
      raise

  # #load mandatory options
  # load mandatory options
  #
  def loadOptions(self):
    try:
      # remote host
      remote_db_task_ini = self.config.get(self.__class__.__name__, "db-task_ini_remote")
      remote_cfgParser = ConfigParser.ConfigParser()
      remote_cfgParser.read(remote_db_task_ini)
      self.remote_wrapper = DBTasksWrapper(remote_cfgParser)
      self.remote_host = remote_cfgParser.get("TasksManager", "db_host")
      # local host
      local_db_task_ini = self.config.get(self.__class__.__name__, "db-task_ini_local")
      local_cfgParser = ConfigParser.ConfigParser()
      local_cfgParser.read(local_db_task_ini)
      self.local_wrapper = DBTasksWrapper(local_cfgParser)
    except Exception, err:
      self.logger.error(CONSTS.MSG_ERROR_LOAD_LOG_CONFIG_FILE)
      self.logger.error(str(err.message))
      raise

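  # Assumed configuration layout. The section and option names below come from the
  # get() calls in loadConfig(), loadLogConfigFile() and loadOptions(); the file
  # names and values are illustrative only:
  #
  #   crawling-optimiser.ini
  #     [Application]
  #     log = ../ini/crawling-optimiser_log.ini
  #
  #     [CrawlingOptimiser]
  #     db-task_ini_remote = ../ini/db-task_remote.ini
  #     db-task_ini_local = ../ini/db-task_local.ini
  #
  #   db-task_remote.ini / db-task_local.ini (passed to DBTasksWrapper)
  #     [TasksManager]
  #     db_host = 127.0.0.1
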
  # # process
  #
  def process(self):
    # log message buffer
    for msg in self.message_queue:
      self.logger.info(msg)

    if self.site_id is not None:
      try:
        # collect site's data
        self.recrawl_dict[self.site_id] = self.collectSiteData()
        self.logger.info("self.recrawl_dict: %s" % str(self.recrawl_dict))
        # store site's data
        self.storeSiteData()
      except Exception, err:
        self.logger.error(CONSTS.MSG_ERROR_PROCESS_GENERAL + ' ' + str(err))

  # # collectSiteData
  #
  def collectSiteData(self):
    site_data_dict = {}
    if self.site_id is not None:
      try:
        # New Contents
        query = CONSTS.SQL_QUERY_NEW_URLS % (self.site_id, self.site_id, self.site_id)
        response = self.remote_wrapper.customRequest(query, CONSTS.DB_URLS)
        if response is not None:
          self.logger.info("response: %s" % str(response))
          site_data_dict["Contents"] = response[0][0]
          site_data_dict["LastAdded"] = response[0][1]
          site_data_dict["minPDate"] = response[0][2]
          site_data_dict["maxPDate"] = response[0][3]

        # Recrawl start
        query = CONSTS.SQL_QUERY_RECRAWL_PERIOD_START % (self.site_id)
        response = self.remote_wrapper.customRequest(query, CONSTS.DB_URLS)
        if response is not None:
          self.logger.info("response: %s" % str(response))
          site_data_dict["RecrawlStart"] = response[0][0]

        # Recrawl end
        query = CONSTS.SQL_QUERY_RECRAWL_PERIOD_END % (self.site_id)
        response = self.remote_wrapper.customRequest(query, CONSTS.DB_URLS)
        if response is not None:
          self.logger.info("response: %s" % str(response))
          site_data_dict["RecrawlEnd"] = response[0][0]

      except Exception, err:
        self.logger.error(CONSTS.MSG_ERROR_COLLECT_SITE_DATA + ' ' + str(err))
    self.logger.info("site_data_dict: %s" % str(site_data_dict))
    return site_data_dict

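  # The SQL templates used above live in dc_co.Constants and are not shown in this
  # file; from the assignments in collectSiteData they are assumed to return a
  # single row each: SQL_QUERY_NEW_URLS yields (new contents count, last added
  # time, min publication date, max publication date), and the two recrawl-period
  # queries each yield one value.
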
  # # storeSiteData
  #
  def storeSiteData(self):
    try:
      if True:  # self.recrawl_dict[self.site_id]["Contents"] > 0:
        # Create New table if not exists
        query = CONSTS.SQL_QUERY_NEW_SITE_TABLE % (self.site_id)
        response = self.local_wrapper.customRequest(query, CONSTS.DB_CO)
        if response is not None:
          self.logger.info("response: %s" % str(response))

        # Put site's data to the site's table
        query = CONSTS.SQL_QUERY_INSERT_SITE_DATA % \
            (self.site_id,
             self.remote_host,
             self.recrawl_dict[self.site_id]["Contents"],
             self.recrawl_dict[self.site_id]["RecrawlStart"],
             self.recrawl_dict[self.site_id]["RecrawlEnd"],
             self.recrawl_dict[self.site_id]["minPDate"],
             self.recrawl_dict[self.site_id]["maxPDate"],
             self.recrawl_dict[self.site_id]["LastAdded"],
             self.recrawl_dict[self.site_id]["Contents"],
             self.recrawl_dict[self.site_id]["LastAdded"],
             self.recrawl_dict[self.site_id]["minPDate"],
             self.recrawl_dict[self.site_id]["maxPDate"])
        response = self.local_wrapper.customRequest(query, CONSTS.DB_CO)
        if response is not None:
          self.logger.info("response: %s" % str(response))
      else:
        self.logger.info("Zero contents.")
    except Exception, err:
      self.logger.error(CONSTS.MSG_ERROR_STORE_SITE_DATA + ' ' + str(err))
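

# A minimal, assumed entry point for running the optimiser as a standalone script.
# The original file does not show a __main__ block; the cement application calls
# used below (setup(), run(), close()) exist on foundation.CementApp, while
# CONSTS.EXIT_FAILURE is an assumption and falls back to 1 if the constant is not
# defined in dc_co.Constants.
if __name__ == "__main__":
  import sys

  app = CrawlingOptimiser()
  try:
    app.setup()
    app.run()
  except Exception:
    # any unhandled error maps to a non-zero exit code
    app.exit_code = getattr(CONSTS, "EXIT_FAILURE", 1)
  finally:
    app.close()
  sys.exit(app.exit_code)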