HCE Project: Python language Distributed Tasks Manager Application, Distributed Crawler Application, and client API bindings. 2.0.0-chaika
Hierarchical Cluster Engine Python language binding
dc_co.CrawlingOptimiser.CrawlingOptimiser Class Reference
Inheritance diagram for dc_co.CrawlingOptimiser.CrawlingOptimiser:
Collaboration diagram for dc_co.CrawlingOptimiser.CrawlingOptimiser:

Classes

class  Meta
 

Public Member Functions

def __init__ (self)
 
def setup (self)
 
def run (self)
 
def loadConfig (self)
 
def loadLogConfigFile (self)
 
def loadOptions (self)
 
def process (self)
 
def collectSiteData (self)
 
def storeSiteData (self)
 

Public Attributes

 exit_code
 
 logger
 
 message_queue
 
 site_id
 
 recrawl_dict
 
 site_features
 
 local_wrapper
 
 remote_wrapper
 
 remote_host
 
 config
 

Detailed Description

Definition at line 30 of file CrawlingOptimiser.py.

Constructor & Destructor Documentation

◆ __init__()

def dc_co.CrawlingOptimiser.CrawlingOptimiser.__init__(self)

Definition at line 41 of file CrawlingOptimiser.py.

41  def __init__(self):
42    # call base class __init__ method
43    foundation.CementApp.__init__(self)
44    self.exit_code = CONSTS.EXIT_SUCCESS
45    self.logger = None
46    self.message_queue = []
47    # self.url_table = None
48    self.site_id = None
49    self.recrawl_dict = {}
50    self.site_features = {}
51    self.local_wrapper = None
52    self.remote_wrapper = None
53    self.remote_host = None
54 

Member Function Documentation

◆ collectSiteData()

def dc_co.CrawlingOptimiser.CrawlingOptimiser.collectSiteData(self)

Definition at line 172 of file CrawlingOptimiser.py.

172  def collectSiteData(self):
173    site_data_dict = {}
174    if self.site_id is not None:
175      try:
176        # New Contents
177        query = CONSTS.SQL_QUERY_NEW_URLS % (self.site_id, self.site_id, self.site_id)
178        response = self.remote_wrapper.customRequest(query, CONSTS.DB_URLS)
179        if response is not None:
180          self.logger.info("response: %s" % str(response))
181          site_data_dict["Contents"] = response[0][0]
182          site_data_dict["LastAdded"] = response[0][1]
183          site_data_dict["minPDate"] = response[0][2]
184          site_data_dict["maxPDate"] = response[0][3]
185 
186        # Recrawl start
187        query = CONSTS.SQL_QUERY_RECRAWL_PERIOD_START % (self.site_id)
188        response = self.remote_wrapper.customRequest(query, CONSTS.DB_URLS)
189        if response is not None:
190          self.logger.info("response: %s" % str(response))
191          site_data_dict["RecrawlStart"] = response[0][0]
192 
193        # Recrawl end
194        query = CONSTS.SQL_QUERY_RECRAWL_PERIOD_END % (self.site_id)
195        response = self.remote_wrapper.customRequest(query, CONSTS.DB_URLS)
196        if response is not None:
197          self.logger.info("response: %s" % str(response))
198          site_data_dict["RecrawlEnd"] = response[0][0]
199 
200      except Exception, err:
201        self.logger.error(CONSTS.MSG_ERROR_COLLECT_SITE_DATA + ' ' + str(err))
202    self.logger.info("site_data_dict: %s" % str(site_data_dict))
203    return site_data_dict
204 
205 
Here is the caller graph for this function:
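
collectSiteData() folds the results of three SQL queries into one dictionary keyed by fixed names. A minimal sketch of the returned value, assuming each query yields a single row; every value below is illustrative, only the key names come from the listing above:

  # illustrative only -- values are made up, keys are from the listing
  site_data_dict = {
      "Contents": 1250,                       # count of new URLs
      "LastAdded": "2015-06-01 12:00:00",     # newest URL timestamp
      "minPDate": "2015-05-01 00:00:00",      # earliest publication date
      "maxPDate": "2015-06-01 11:59:59",      # latest publication date
      "RecrawlStart": "2015-05-30 00:00:00",  # recrawl period start
      "RecrawlEnd": "2015-06-05 00:00:00",    # recrawl period end
  }

Note that if self.site_id is None or a query returns no rows, the corresponding keys are absent, so callers such as storeSiteData() may see a partially filled dictionary.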

◆ loadConfig()

def dc_co.CrawlingOptimiser.CrawlingOptimiser.loadConfig(self)

Definition at line 89 of file CrawlingOptimiser.py.

89  def loadConfig(self):
90    try:
91      self.config = ConfigParser.ConfigParser()
92      self.config.optionxform = str
93 
94      # config argument
95      if self.pargs.config:
96        self.config.read(self.pargs.config)
97        self.message_queue.append(CONSTS.MSG_INFO_LOAD_CONFIG_FILE + str(self.pargs.config))
98      else:
99        self.config.read(CONSTS.DEFAULT_CFG_FILE)
100       self.message_queue.append(CONSTS.MSG_INFO_LOAD_DEFAULT_CONFIG_FILE + CONSTS.DEFAULT_CFG_FILE)
101 
102      # site argument
103      if self.pargs.site:
104        self.site_id = self.pargs.site
105        self.message_queue.append(CONSTS.MSG_INFO_LOAD_SITE_ID + str(self.pargs.site))
106      else:
107        self.site_id = CONSTS.SITE_ALL
108        self.message_queue.append(CONSTS.MSG_INFO_LOAD_DEFAULT_SITE_ID + str(CONSTS.SITE_ALL))
109 
110    except Exception, err:
111      print CONSTS.MSG_ERROR_LOAD_CONFIG, err.message
112      raise
113 
114 
Here is the caller graph for this function:

◆ loadLogConfigFile()

def dc_co.CrawlingOptimiser.CrawlingOptimiser.loadLogConfigFile(self)

Definition at line 118 of file CrawlingOptimiser.py.

118  def loadLogConfigFile(self):
119    try:
120      # print str(vars(self.config))
121      log_conf_file = self.config.get("Application", "log")
122      logging.config.fileConfig(log_conf_file)
123      self.logger = Utils.MPLogger().getLogger()
124    except Exception, err:
125      print CONSTS.MSG_ERROR_LOAD_LOG_CONFIG_FILE, err.message
126      raise
127 
128 
Here is the call graph for this function:
Here is the caller graph for this function:
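
loadLogConfigFile() hands the path found under the log option of the [Application] section to the standard logging.config.fileConfig() loader. A minimal sketch of such a file, assuming nothing about the project's actual handlers (all names below are illustrative):

  [loggers]
  keys=root

  [handlers]
  keys=consoleHandler

  [formatters]
  keys=simpleFormatter

  [logger_root]
  level=INFO
  handlers=consoleHandler

  [handler_consoleHandler]
  class=StreamHandler
  level=INFO
  formatter=simpleFormatter
  args=(sys.stdout,)

  [formatter_simpleFormatter]
  format=%(asctime)s %(levelname)s %(name)s: %(message)s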

◆ loadOptions()

def dc_co.CrawlingOptimiser.CrawlingOptimiser.loadOptions(self)

Definition at line 132 of file CrawlingOptimiser.py.

132  def loadOptions(self):
133    try:
134      # remote host
135      remote_db_task_ini = self.config.get(self.__class__.__name__, "db-task_ini_remote")
136      remote_cfgParser = ConfigParser.ConfigParser()
137      remote_cfgParser.read(remote_db_task_ini)
138      self.remote_wrapper = DBTasksWrapper(remote_cfgParser)
139      self.remote_host = remote_cfgParser.get("TasksManager", "db_host")
140      # local host
141      local_db_task_ini = self.config.get(self.__class__.__name__, "db-task_ini_local")
142      local_cfgParser = ConfigParser.ConfigParser()
143      local_cfgParser.read(local_db_task_ini)
144      self.local_wrapper = DBTasksWrapper(local_cfgParser)
145    except Exception, err:
146      self.logger.error(CONSTS.MSG_ERROR_LOAD_LOG_CONFIG_FILE)
147      self.logger.error(str(err.message))
148      raise
149 
150 
151 
Here is the caller graph for this function:
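
Combining the option names read here with the one read in loadLogConfigFile() suggests a main ini-file layout like the sketch below. Only the section and option names are taken from the listings; the paths and host value are assumptions:

  ; main ini-file, passed via --config
  [Application]
  log = ./log.ini

  [CrawlingOptimiser]
  db-task_ini_remote = ./db-task_remote.ini
  db-task_ini_local = ./db-task_local.ini

  ; db-task_remote.ini, read in turn by DBTasksWrapper
  [TasksManager]
  db_host = 127.0.0.1

Note that the section name is looked up via self.__class__.__name__, so it must match the class name CrawlingOptimiser exactly.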

◆ process()

def dc_co.CrawlingOptimiser.CrawlingOptimiser.process(self)

Definition at line 154 of file CrawlingOptimiser.py.

154  def process(self):
155    # log message buffer
156    for msg in self.message_queue:
157      self.logger.info(msg)
158 
159    if self.site_id is not None:
160      try:
161        # collect site's data
162        self.recrawl_dict[self.site_id] = self.collectSiteData()
163        self.logger.info("self.recrawl_dict: %s" % str(self.recrawl_dict))
164        # store site's data
165        self.storeSiteData()
166      except Exception, err:
167        self.logger.error(CONSTS.MSG_ERROR_PROCESS_GENERAL + ' ' + str(err))
168 
169 
Here is the call graph for this function:
Here is the caller graph for this function:
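
Two details are worth noting here. First, message_queue exists because loadConfig() runs before the logger is configured, so informational messages are buffered on the instance and only flushed at the start of process(). Second, recrawl_dict maps a site id to the dictionary returned by collectSiteData(), and storeSiteData() reads it back under the same self.site_id key.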

◆ run()

def dc_co.CrawlingOptimiser.CrawlingOptimiser.run(self)

Definition at line 66 of file CrawlingOptimiser.py.

66  def run(self):
67    # call base class run method
68    foundation.CementApp.run(self)
69 
70    # config section
71    self.loadConfig()
72 
73    # load logger config file
74    self.loadLogConfigFile()
75 
76    # load mandatory options
77    self.loadOptions()
78 
79    # make processing
80    self.process()
81 
82    # Finish logging
83    self.logger.info(APP_CONSTS.LOGGER_DELIMITER_LINE)
84 
85 
86 
Here is the call graph for this function:
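
run() chains the whole pipeline on top of the Cement application lifecycle. A minimal driver sketch, assuming the usual Cement 2.x setup()/run()/close() sequence; the __main__ guard below is not part of the listing:

  if __name__ == "__main__":
      app = CrawlingOptimiser()
      try:
          app.setup()
          app.run()
      finally:
          app.close()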

◆ setup()

def dc_co.CrawlingOptimiser.CrawlingOptimiser.setup(self)

Definition at line 57 of file CrawlingOptimiser.py.

57  def setup(self):
58    # call base class setup method
59    foundation.CementApp.setup(self)
60    self.args.add_argument('-c', '--config', action='store', metavar='config_file', help='config ini-file')
61    self.args.add_argument('-s', '--site', action='store', metavar='site alias', help='site alias')
62 
63 
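
With these two options registered, a typical invocation might look as follows; the script name is hypothetical, only -c/--config and -s/--site come from the listing:

  python CrawlingOptimiser.py --config ./crawling_optimiser.ini --site 1234

If either option is omitted, loadConfig() falls back to CONSTS.DEFAULT_CFG_FILE and CONSTS.SITE_ALL respectively.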

◆ storeSiteData()

def dc_co.CrawlingOptimiser.CrawlingOptimiser.storeSiteData(self)

Definition at line 208 of file CrawlingOptimiser.py.

208  def storeSiteData(self):
209    try:
210      if True: # self.recrawl_dict[self.site_id]["Contents"]>0:
211        # Create New table if not exists
212        query = CONSTS.SQL_QUERY_NEW_SITE_TABLE % (self.site_id)
213        response = self.local_wrapper.customRequest(query, CONSTS.DB_CO)
214        if response is not None:
215          self.logger.info("response: %s" % str(response))
216 
217        # Put site's data to the site's table
218        query = CONSTS.SQL_QUERY_INSERT_SITE_DATA % \
219          (self.site_id, \
220           self.remote_host, \
221           self.recrawl_dict[self.site_id]["Contents"], \
222           self.recrawl_dict[self.site_id]["RecrawlStart"], \
223           self.recrawl_dict[self.site_id]["RecrawlEnd"], \
224           self.recrawl_dict[self.site_id]["minPDate"], \
225           self.recrawl_dict[self.site_id]["maxPDate"], \
226           self.recrawl_dict[self.site_id]["LastAdded"], \
227           self.recrawl_dict[self.site_id]["Contents"], \
228           self.recrawl_dict[self.site_id]["LastAdded"], \
229           self.recrawl_dict[self.site_id]["minPDate"], \
230           self.recrawl_dict[self.site_id]["maxPDate"])
231        response = self.local_wrapper.customRequest(query, CONSTS.DB_CO)
232        if response is not None:
233          self.logger.info("response: %s" % str(response))
234      else:
235        self.logger.info("Zero contents.")
236    except Exception, err:
237      self.logger.error(CONSTS.MSG_ERROR_STORE_SITE_DATA + ' ' + str(err))
238 
Here is the caller graph for this function:
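
The insert query consumes twelve values, with Contents, LastAdded, minPDate and maxPDate passed twice. A hypothetical reconstruction of CONSTS.SQL_QUERY_INSERT_SITE_DATA, inferred purely from that argument order (the repeated trailing group suggests an INSERT ... ON DUPLICATE KEY UPDATE statement; the table and column names are assumptions, not the project's actual schema):

  # hypothetical template -- not the project's actual constant
  SQL_QUERY_INSERT_SITE_DATA = (
      "INSERT INTO site_%s "                               # 1: site_id as table suffix
      "(host, contents, recrawl_start, recrawl_end, "
      "min_pdate, max_pdate, last_added) "
      "VALUES ('%s', %s, '%s', '%s', '%s', '%s', '%s') "   # 2-8: remote_host + collected values
      "ON DUPLICATE KEY UPDATE contents = %s, "
      "last_added = '%s', min_pdate = '%s', max_pdate = '%s'"  # 9-12: repeated values
  )

Also note that the guard at line 210 is hard-coded to True, so the commented-out Contents check never runs and the "Zero contents." branch at line 235 is currently unreachable.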

Member Data Documentation

◆ config

dc_co.CrawlingOptimiser.CrawlingOptimiser.config

Definition at line 91 of file CrawlingOptimiser.py.

◆ exit_code

dc_co.CrawlingOptimiser.CrawlingOptimiser.exit_code

Definition at line 44 of file CrawlingOptimiser.py.

◆ local_wrapper

dc_co.CrawlingOptimiser.CrawlingOptimiser.local_wrapper

Definition at line 51 of file CrawlingOptimiser.py.

◆ logger

dc_co.CrawlingOptimiser.CrawlingOptimiser.logger

Definition at line 45 of file CrawlingOptimiser.py.

◆ message_queue

dc_co.CrawlingOptimiser.CrawlingOptimiser.message_queue

Definition at line 46 of file CrawlingOptimiser.py.

◆ recrawl_dict

dc_co.CrawlingOptimiser.CrawlingOptimiser.recrawl_dict

Definition at line 49 of file CrawlingOptimiser.py.

◆ remote_host

dc_co.CrawlingOptimiser.CrawlingOptimiser.remote_host

Definition at line 53 of file CrawlingOptimiser.py.

◆ remote_wrapper

dc_co.CrawlingOptimiser.CrawlingOptimiser.remote_wrapper

Definition at line 52 of file CrawlingOptimiser.py.

◆ site_features

dc_co.CrawlingOptimiser.CrawlingOptimiser.site_features

Definition at line 50 of file CrawlingOptimiser.py.

◆ site_id

dc_co.CrawlingOptimiser.CrawlingOptimiser.site_id

Definition at line 48 of file CrawlingOptimiser.py.


The documentation for this class was generated from the following file:

CrawlingOptimiser.py