HCE Project Python language Distributed Tasks Manager Application, Distributed Crawler Application and client API bindings.  2.0.0-chaika
Hierarchical Cluster Engine Python language binding
ProcessorStoreContentKVDB.py
Go to the documentation of this file.
1 """@package docstring
2  @file ProcessorStoreContentKVDB.py
3  @author Alexey <developers.hce@gmail.com>
4  @link http://hierarchical-cluster-engine.com/
5  @copyright Copyright &copy; 2013 IOIX Ukraine
6  @license http://hierarchical-cluster-engine.com/license/
7  @package HCE project node API
8  @since 0.1
9 """
10 
11 
12 
13 import pickle
14 import sys
15 import sqlite3 # as sqlite3
16 import logging.config
17 import ConfigParser
18 from cement.core import foundation
19 # import app.Utils as Utils # pylint: disable=F0401
20 import dc_processor.Constants as CONSTS
21 from app.Utils import ExceptionLog
22 import app.Utils as Utils # pylint: disable=F0401
23 
# Application label, used as the cement application Meta.label.
APP_NAME = "ProcessorStoreContentKVDB"

# Messages printed or logged when a processing or start-up step fails.
MSG_ERROR_LOAD_CONFIG = "Error loading config file. Exiting."
MSG_ERROR_LOAD_LOG_CONFIG_FILE = "Error loading logging config file. Exiting."
MSG_ERROR_LOAD_EXTRACTORS = "Error load extractors "
MSG_ERROR_TEMPLATE_EXTRACTION = "Error template extraction "
MSG_ERROR_DYNAMIC_EXTRACTION = "Error dynamic extraction "
MSG_ERROR_LOAD_DB_BACKEND = "Error load db backend"
MSG_ERROR_LOAD_OPTIONS = "Error load options"
MSG_INFO_PREPARE_CONTENT = "Prepare content: "
MSG_ERROR_ADJUST_PR = "Error adjust partial references. "
MSG_ERROR_PROCESS = "Processor Storing Contents process batch error: "

# Default value passed as the `timeout` argument of sqlite3.connect().
SQLITE_TIMEOUT = 30
38 
39 # #ProcessorStoreContentKVDB
40 #
41 #
class ProcessorStoreContentKVDB(foundation.CementApp):
    """Cement application that stores the raw content of one resource in SQLite.

    The input object is read as a pickle from stdin (see processBatch) and its
    `raw_content` is written, keyed by `urlId`, into a database file selected
    by `siteId` inside the configured `config_db_dir` directory.
    """

    # Mandatory
    class Meta(object):
        """Cement application metadata."""
        label = APP_NAME

        def __init__(self):
            pass

    def __init__(self):
        """Initialize the base application and set default field values."""
        # call base class __init__ method
        foundation.CementApp.__init__(self)
        self.exit_code = CONSTS.EXIT_SUCCESS
        self.logger = None
        self.config_db_dir = None
        self.sqliteTimeout = SQLITE_TIMEOUT
        self.input_data = None
        self.raw_contents_tbl = None

    def setup(self):
        """Set up the application and register the -c/--config CLI argument."""
        # call base class setup method
        foundation.CementApp.setup(self)
        self.args.add_argument('-c', '--config', action='store', metavar='config_file', help='config ini-file')

    def run(self):
        """Run the base application, then load config, logging and options.

        Any exception raised by the loaders propagates to the caller.
        """
        # call base class run method
        foundation.CementApp.run(self)

        # config section
        self.loadConfig()

        # load logger config file
        self.loadLogConfigFile()

        # options
        self.loadOptions()

    def process(self):
        """Main content processing: store the current input object in the DB."""
        self.putContentToDB()

    def putContentToDB(self):
        """Insert or replace the raw content of `self.input_data` in SQLite.

        The database file is `<config_db_dir>/<siteId>.db`, or
        `<config_db_dir>/0.db` when `siteId` is empty or None. The table named
        by `self.raw_contents_tbl` is created on demand.

        Raises:
            Exception: any error is logged through ExceptionLog.handler and
                re-raised; the transaction is rolled back in that case.
        """
        # get appropriate db name, depending on siteId
        site_id = self.input_data.siteId
        if site_id:
            db_name = self.config_db_dir + "/" + site_id + ".db"
        else:
            db_name = self.config_db_dir + "/0.db"
        self.logger.info("db_name: " + db_name)

        # The table name is an SQL identifier and cannot be bound as a query
        # parameter, so it is formatted into the statements. It is read from
        # the application's config file (see loadOptions).
        table = self.raw_contents_tbl
        create_query = ("CREATE TABLE IF NOT EXISTS %s(id VARCHAR(32) PRIMARY KEY UNIQUE, "
                        "data TEXT, CDate DATETIME DEFAULT CURRENT_TIMESTAMP)" % table)
        # Use the same configured table as the CREATE statement above.
        insert_query = "INSERT OR REPLACE INTO %s VALUES(?,?,datetime('now','localtime'))" % table

        connector = None
        try:
            # put parsed resource to the db
            connector = sqlite3.connect(db_name, timeout=self.sqliteTimeout)  # @UndefinedVariable
            connector.text_factory = str
            # As a context manager the connection commits on success and rolls
            # back on exception; it does NOT close the connection.
            with connector:
                cur = connector.cursor()
                cur.execute(create_query)
                cur.execute(insert_query, (self.input_data.urlId, self.input_data.raw_content))
        except Exception as err:
            ExceptionLog.handler(self.logger, err, 'putContentToDB')
            raise
        finally:
            # Release the database file handle whether or not the write succeeded.
            if connector is not None:
                connector.close()

    def processBatch(self):
        """Read a pickled input object from stdin and store its content.

        Errors are logged and recorded by setting `exit_code` to
        CONSTS.EXIT_FAILURE; they are not propagated to the caller.
        """
        try:
            # read pickled batch object from stdin and unpickle it
            input_pickled_object = sys.stdin.read()
            # SECURITY NOTE: pickle.loads can execute arbitrary code; stdin must
            # only ever be fed by a trusted producer.
            self.input_data = pickle.loads(input_pickled_object)
            # TODO main processing over every url from list of urls in the batch object
            self.process()
        except Exception as err:
            # NOTE(review): `(err)` is a plain parenthesized expression, not a
            # one-element tuple - confirm what ExceptionLog.handler expects here.
            ExceptionLog.handler(self.logger, err, MSG_ERROR_PROCESS, (err))
            self.exit_code = CONSTS.EXIT_FAILURE

    def loadConfig(self):
        """Create the config parser and read the file given by -c/--config.

        Option names are kept case-sensitive. Nothing is read when the CLI
        argument is not set.

        Raises:
            Exception: re-raised after printing MSG_ERROR_LOAD_CONFIG.
        """
        try:
            self.config = ConfigParser.ConfigParser()
            self.config.optionxform = str
            if self.pargs.config:
                self.config.read(self.pargs.config)
        except Exception as err:
            # str(err) instead of the deprecated err.message, which is empty
            # for exceptions constructed with several arguments.
            print(MSG_ERROR_LOAD_CONFIG + str(err))
            raise

    def loadLogConfigFile(self):
        """Configure logging from the file named by option "log" in [Application].

        Raises:
            Exception: re-raised after printing MSG_ERROR_LOAD_LOG_CONFIG_FILE.
        """
        try:
            log_conf_file = self.config.get("Application", "log")
            logging.config.fileConfig(log_conf_file)
            self.logger = Utils.MPLogger().getLogger()
        except Exception as err:
            print(MSG_ERROR_LOAD_LOG_CONFIG_FILE + str(err))
            raise

    def loadOptions(self):
        """Load mandatory options from the config file.

        Reads `config_db_dir` from the section named after this class, and
        `raw_contents_tbl` and `timeout` from the [sqlite] section.

        Raises:
            Exception: re-raised after printing MSG_ERROR_LOAD_OPTIONS.
        """
        try:
            self.config_db_dir = self.config.get(self.__class__.__name__, "config_db_dir")
            self.raw_contents_tbl = self.config.get("sqlite", "raw_contents_tbl")
            self.sqliteTimeout = self.config.getint("sqlite", "timeout")
        except Exception as err:
            print(MSG_ERROR_LOAD_OPTIONS + str(err))
            raise

    def getExitCode(self):
        """Return the exit code set during processing."""
        return self.exit_code