HCE Project Python language Distributed Tasks Manager Application, Distributed Crawler Application and client API bindings.  2.0.0-chaika
Hierarchical Cluster Engine Python language binding
dc_crawler.RTCFinalizer.RTCFinalizer Class Reference
Inheritance diagram for dc_crawler.RTCFinalizer.RTCFinalizer:
Collaboration diagram for dc_crawler.RTCFinalizer.RTCFinalizer:

Classes

class  Meta
 

Public Member Functions

def __init__ (self)
 
def setup (self)
 
def run (self)
 
def getBatchFromInput (self)
 
def getURLContent (self)
 
def sendURLContent (self)
 
def saveBatchToFile (self)
 
def getURLContentFromBatch (self)
 
def process (self)
 
def deleteURLContent (self)
 
def selectSiteProperty (self, batchItem, propName)
 

Public Attributes

 logger
 
 batch
 
 items
 
 exitCode
 
 urlContentResponse
 
 rb
 
 rc
 
 dbTask
 

Static Public Attributes

string MSG_ERROR_PARSE_CMD_PARAMS = "Error parse command line parameters."
 
string MSG_ERROR_EMPTY_CONFIG_FILE_NAME = "Config file name is empty."
 
string MSG_ERROR_WRONG_CONFIG_FILE_NAME = "Config file name is wrong"
 
string MSG_ERROR_LOAD_APP_CONFIG = "Error loading application config file."
 
string MSG_ERROR_READ_LOG_CONFIG = "Error read log config file."
 
string MSG_ERROR_READ_DB_TASK_CONFIG = "Error read db-task config file."
 
string MSG_ERROR_DELETE_URL = "Delete url has failed"
 
string MSG_DELETE_URL_OK = "URL was deleted"
 
string FINALIZER_OPTION_LOG = "log"
 
string FINALIZER_OPTION_DB_TASK_INI = "db_task_ini"
 

Private Member Functions

def __initApp (self)
 
def __loadAppConfig (self, configName)
 
def __loadLogConfig (self, configName)
 
def __loadDBTaskConfig (self, configName)
 

Detailed Description

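Finalizer application of the Distributed Crawler. It reads a pickled batch from standard input and collects the URL content for its items: directly from the batch for a real-time crawling batch (followed by a URL delete request), or through a URL content request to the DB task manager otherwise. It then optionally saves the batch to the file given by the rb option and, unless the exit code equals the rc option, writes the pickled URL content response to standard output.

Definition at line 33 of file RTCFinalizer.py.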

Constructor & Destructor Documentation

◆ __init__()

def dc_crawler.RTCFinalizer.RTCFinalizer.__init__ (   self)

Definition at line 58 of file RTCFinalizer.py.

def __init__(self):
    """Initialise the Cement base application and reset the finalizer state.

    All working attributes start out empty and are populated by later
    steps (configuration loading, batch reading, content retrieval).
    """
    # The base class must be initialised before any attribute is set.
    foundation.CementApp.__init__(self)

    # Exit status of the application; stays "success" unless a later
    # step overrides it.
    self.exitCode = APP_CONSTS.EXIT_SUCCESS

    # Logger and DB task manager are created while the configuration is loaded.
    self.logger = None
    self.dbTask = None

    # Input batch and the list of its items.
    self.batch = None
    self.items = None

    # Response object(s) that are eventually pickled to stdout.
    self.urlContentResponse = None

    # Values of the "rb" and "rc" command line options.
    self.rb = None
    self.rc = None
def __init__(self)
constructor
Definition: UIDGenerator.py:19

Member Function Documentation

◆ __initApp()

def dc_crawler.RTCFinalizer.RTCFinalizer.__initApp (   self)
private

Definition at line 97 of file RTCFinalizer.py.

def __initApp(self):
    """Load the configuration files and pick up the command line options.

    Reads the application config named by the ``config`` option, sets up
    logging from it and creates the ``DBTasksManager`` instance, then
    copies the ``rb`` and ``rc`` options onto the instance (``rc`` is
    converted to ``int`` when given).

    Raises:
        Exception: if no config file name was passed, or if one of the
            config loaders fails.
    """
    if not self.pargs.config:
        raise Exception(self.MSG_ERROR_LOAD_APP_CONFIG)

    logConfigName, dbTaskConfigName = self.__loadAppConfig(self.pargs.config)
    self.__loadLogConfig(logConfigName)
    dbTaskConfig = self.__loadDBTaskConfig(dbTaskConfigName)
    self.dbTask = DBTasksManager(dbTaskConfig)

    self.rb = self.pargs.rb
    if self.pargs.rc is not None:
        self.rc = int(self.pargs.rc)
    else:
        self.rc = None
def __initApp(self, configName=None)
Here is the call graph for this function:
Here is the caller graph for this function:

◆ __loadAppConfig()

def dc_crawler.RTCFinalizer.RTCFinalizer.__loadAppConfig (   self,
  configName 
)
private

Definition at line 112 of file RTCFinalizer.py.

def __loadAppConfig(self, configName):
    """Read the application config file and return the two file names it holds.

    Args:
        configName: path of the application ini file.

    Returns:
        Tuple ``(confLogFileName, confDBTaskName)``. Both are empty strings
        when the file has no application section.

    Raises:
        Exception: when the file cannot be read or an option is missing;
            the message is prefixed with ``MSG_ERROR_LOAD_APP_CONFIG``.
    """
    logFileName = ""
    dbTaskFileName = ""

    try:
        parser = ConfigParser.ConfigParser()
        # Keep option names exactly as written instead of lower-casing them.
        parser.optionxform = str

        # read() returns the list of files it managed to parse.
        if not parser.read(configName):
            raise Exception(self.MSG_ERROR_WRONG_CONFIG_FILE_NAME + ": " + configName)

        section = APP_CONSTS.CONFIG_APPLICATION_SECTION_NAME
        if parser.has_section(section):
            logFileName = str(parser.get(section, self.FINALIZER_OPTION_LOG))
            dbTaskFileName = str(parser.get(section, self.FINALIZER_OPTION_DB_TASK_INI))
    except Exception as err:
        raise Exception(self.MSG_ERROR_LOAD_APP_CONFIG + ' ' + str(err))

    return logFileName, dbTaskFileName
Here is the caller graph for this function:

◆ __loadDBTaskConfig()

def dc_crawler.RTCFinalizer.RTCFinalizer.__loadDBTaskConfig (   self,
  configName 
)
private

Definition at line 159 of file RTCFinalizer.py.

def __loadDBTaskConfig(self, configName):
    """Create a config parser for the db-task ini file.

    Args:
        configName: path of the db-task ini file.

    Returns:
        A ``ConfigParser`` instance that has read ``configName``, or
        ``None`` when ``configName`` is not a non-empty ``str``.

    Raises:
        Exception: when parsing fails; the message is prefixed with
            ``MSG_ERROR_READ_DB_TASK_CONFIG``.
    """
    if not (isinstance(configName, str) and len(configName) > 0):
        return None

    try:
        parser = ConfigParser.ConfigParser()
        # Keep option names exactly as written instead of lower-casing them.
        parser.optionxform = str
        # NOTE(review): the result of read() is not checked here, so a
        # missing file yields an empty parser rather than an error.
        parser.read(configName)
    except Exception as err:
        raise Exception(self.MSG_ERROR_READ_DB_TASK_CONFIG + ' ' + str(err))

    return parser
Here is the caller graph for this function:

◆ __loadLogConfig()

def dc_crawler.RTCFinalizer.RTCFinalizer.__loadLogConfig (   self,
  configName 
)
private

Definition at line 141 of file RTCFinalizer.py.

def __loadLogConfig(self, configName):
    """Configure logging from an ini file and create the application logger.

    Args:
        configName: path of the logging configuration file.

    Raises:
        Exception: when the name is an empty string or the logging setup
            fails; the message is prefixed with ``MSG_ERROR_READ_LOG_CONFIG``.
    """
    try:
        nameIsEmpty = isinstance(configName, str) and len(configName) == 0
        if nameIsEmpty:
            raise Exception(self.MSG_ERROR_EMPTY_CONFIG_FILE_NAME)

        logging.config.fileConfig(configName)

        # The logger is obtained through the project's MPLogger wrapper.
        self.logger = Utils.MPLogger().getLogger()
    except Exception as err:
        raise Exception(self.MSG_ERROR_READ_LOG_CONFIG + ' ' + str(err))
Here is the call graph for this function:
Here is the caller graph for this function:

◆ deleteURLContent()

def dc_crawler.RTCFinalizer.RTCFinalizer.deleteURLContent (   self)

Definition at line 339 of file RTCFinalizer.py.

def deleteURLContent(self):
    """Send a URL delete request for every non-empty item of the batch.

    One ``dc_event.URLDelete`` is built per item and all of them are
    processed in a single call to the DB task manager. If any returned
    status is false, ``self.exitCode`` is set to
    ``APP_CONSTS.EXIT_FAILURE``.
    """
    self.logger.debug("Num of items to delete in batch: <<%s>>" % (len(self.batch.items)))

    deleteRequests = []
    for item in self.items:
        if item is None:
            continue
        siteId = item.siteId
        url = item.urlObj.url
        urlId = item.urlId
        # Items are numbered from 1, counting only the non-empty ones,
        # i.e. one more than the number of requests collected so far.
        itemNo = len(deleteRequests) + 1
        self.logger.debug("Delete item #%s: siteId: <<%s>>, urlId: <<%s>>, url: <<%s>>" % (itemNo, siteId, urlId, url))
        deleteRequests.append(dc_event.URLDelete(siteId, url, reason=dc_event.URLDelete.REASON_RT_FINALIZER))

    # The DB task manager works in the mode requested by the batch.
    self.dbTask.dbTaskMode = self.batch.dbMode
    request = DC_CONSTS.DRCESyncTasksCover(DC_CONSTS.EVENT_TYPES.URL_DELETE, deleteRequests)
    urlDeleteResponse = self.dbTask.process(request).eventObject
    self.logger.debug("urlDeleteResponse: %s", varDump(urlDeleteResponse))

    for status in urlDeleteResponse.statuses:
        if not status:
            self.exitCode = APP_CONSTS.EXIT_FAILURE
            self.logger.debug(self.MSG_ERROR_DELETE_URL)
        else:
            self.logger.debug(self.MSG_DELETE_URL_OK)
def varDump(obj, stringify=True, strTypeMaxLen=256, strTypeCutSuffix='...', stringifyType=1, ignoreErrors=False, objectsHash=None, depth=0, indent=2, ensure_ascii=False, maxDepth=10)
Definition: Utils.py:410
Here is the call graph for this function:
Here is the caller graph for this function:

◆ getBatchFromInput()

def dc_crawler.RTCFinalizer.RTCFinalizer.getBatchFromInput (   self)

Definition at line 174 of file RTCFinalizer.py.

def getBatchFromInput(self):
    """Read the pickled batch object from stdin and store it on the instance.

    Sets ``self.batch`` to the unpickled object and ``self.items`` to its
    ``items`` attribute.

    Raises:
        Exception: on any read or unpickle failure; the message is
            prefixed with ``getBatchFromInput error: ``.
    """
    try:
        # NOTE(review): pickle.loads() can execute arbitrary code, so
        # stdin must only ever be fed by a trusted producer.
        self.batch = pickle.loads(sys.stdin.read())
        self.items = self.batch.items
    except Exception as err:
        raise Exception('getBatchFromInput error: ' + str(err))
Here is the caller graph for this function:

◆ getURLContent()

def dc_crawler.RTCFinalizer.RTCFinalizer.getURLContent (   self)

Definition at line 192 of file RTCFinalizer.py.

def getURLContent(self):
    """Request the content of all batch items from the DB task manager.

    One ``dc_event.URLContentRequest`` is created per item; empty items
    contribute ``None`` so that the request list keeps the length and
    order of ``self.items``. The event object of the reply is stored in
    ``self.urlContentResponse``.
    """
    requestedDbFields = ("Status", "Crawled", "Processed", "ContentType", "Charset", "ErrorMask",
                         "CrawlingTime", "ProcessingTime", "HTTPCode", "Size", "LinksI", "LinksE",
                         "RawContentMd5", "LastModified", "CDate", "UDate", "TagsMask", "TagsCount",
                         "PDate", "ContentURLMd5", "Batch_Id")

    self.logger.debug("Num of items in batch: <<%s>>" % (len(self.items)))

    requests = []
    itemNo = 1  # numbering of the non-empty items in the log
    for item in self.items:
        if item:
            siteId = item.siteId
            url = item.urlObj.url
            urlId = item.urlId
            self.logger.debug("Item #%s: siteId: <<%s>>, urlId: <<%s>>, url: <<%s>>" % (itemNo, siteId, urlId, url))
            request = dc_event.URLContentRequest(siteId, url)
            # Every request gets its own list object.
            request.dbFieldsList = list(requestedDbFields)
            requests.append(request)
            itemNo += 1
        else:
            requests.append(None)
            self.logger.debug("Item is None.")

    # The DB task manager works in the mode requested by the batch.
    self.dbTask.dbTaskMode = self.batch.dbMode
    cover = DC_CONSTS.DRCESyncTasksCover(DC_CONSTS.EVENT_TYPES.URL_CONTENT, requests)
    self.urlContentResponse = self.dbTask.process(cover).eventObject
    self.logger.debug("urlContentResponse: %s", varDump(obj=self.urlContentResponse, strTypeMaxLen=5000))
def varDump(obj, stringify=True, strTypeMaxLen=256, strTypeCutSuffix='...', stringifyType=1, ignoreErrors=False, objectsHash=None, depth=0, indent=2, ensure_ascii=False, maxDepth=10)
Definition: Utils.py:410
Here is the call graph for this function:
Here is the caller graph for this function:

◆ getURLContentFromBatch()

def dc_crawler.RTCFinalizer.RTCFinalizer.getURLContentFromBatch (   self)

Definition at line 261 of file RTCFinalizer.py.

def getURLContentFromBatch(self):
    """Build ``self.urlContentResponse`` from data carried by the batch itself.

    For every batch item one ``dc_event.URLContentResponse`` is created:

    * processed content comes from ``item.urlPutObj.putDict`` (empty list
      when the item has no ``urlPutObj``);
    * raw content is added only when ``item.urlObj.urlPut`` is set and the
      ``FETCH_RAW_CONTENT`` property resolves to ``1``;
    * ``dbFields`` is filled from the attributes of ``item.urlObj``.

    The resulting list replaces any previous ``self.urlContentResponse``.
    """
    self.urlContentResponse = []
    for item in self.items:
        urlObj = item.urlObj
        url = urlObj.url

        # Reset for every item. Previously the list was initialised once
        # before the loop, so the attributes of an earlier item were
        # attached to later items that had no urlPutObj or no attributes.
        attributes = []

        if item.urlPutObj is not None:
            try:
                if len(urlObj.attributes) > 0:
                    self.logger.debug("item.urlPutObj.attributes: %s", varDump(urlObj.attributes))
                    attributes = urlObj.attributes
            except Exception as err:
                # Best effort: a broken attributes field must not abort the batch.
                self.logger.error("load attributes failed: %s", str(err))

            putDict = item.urlPutObj.putDict
            if putDict["cDate"] is not None:
                contents = [dc_event.Content(putDict["data"], putDict["cDate"],
                                             dc_event.Content.CONTENT_PROCESSOR_CONTENT)]
            else:
                # No creation date available: the cDate argument is left out.
                contents = [dc_event.Content(putDict["data"],
                                             typeId=dc_event.Content.CONTENT_PROCESSOR_CONTENT)]
        else:
            contents = []

        rawContents = None
        isFetchRawContent = self.selectSiteProperty(item, "FETCH_RAW_CONTENT")
        if urlObj.urlPut is not None and isFetchRawContent is not None and int(isFetchRawContent) == 1:
            rawContents = [dc_event.Content(urlObj.urlPut.putDict["data"], urlObj.urlPut.putDict["cDate"],
                                            dc_event.Content.CONTENT_RAW_CONTENT)]

        urlContentResponse = dc_event.URLContentResponse(url, rawContents, processedContents=contents)
        urlContentResponse.status = 7
        urlContentResponse.urlMd5 = urlObj.urlMd5
        urlContentResponse.siteId = item.siteId
        urlContentResponse.contentURLMd5 = urlObj.contentURLMd5
        urlContentResponse.rawContentMd5 = urlObj.rawContentMd5
        urlContentResponse.attributes = attributes
        # NOTE(review): getURLContent requests these fields as "HTTPCode"
        # and "Batch_Id"; confirm that consumers accept the "HttpCode" and
        # "BatchId" spelling used here.
        urlContentResponse.dbFields = {"Status": urlObj.status,
                                       "Crawled": urlObj.crawled,
                                       "Processed": urlObj.processed,
                                       "ContentType": urlObj.contentType,
                                       "Charset": urlObj.charset,
                                       "ErrorMask": urlObj.errorMask,
                                       "CrawlingTime": urlObj.crawlingTime,
                                       "ProcessingTime": urlObj.processingTime,
                                       "HttpCode": urlObj.httpCode,
                                       "Size": urlObj.size,
                                       "LinksI": urlObj.linksI,
                                       "LinksE": urlObj.linksE,
                                       "RawContentMd5": urlObj.rawContentMd5,
                                       "LastModified": urlObj.lastModified,
                                       "CDate": urlObj.CDate,
                                       "UDate": urlObj.UDate,
                                       "TagsMask": urlObj.tagsMask,
                                       "TagsCount": urlObj.tagsCount,
                                       "PDate": urlObj.pDate,
                                       "ContentURLMd5": urlObj.contentURLMd5,
                                       "BatchId": urlObj.batchId}

        if item.urlPutObj is not None and "properties" in item.urlPutObj.putDict:
            urlContentResponse.itemProperties = item.urlPutObj.putDict["properties"]

        self.urlContentResponse.append(urlContentResponse)

    self.logger.debug("urlContentResponse: %s", varDump(obj=self.urlContentResponse, strTypeMaxLen=5000))
def varDump(obj, stringify=True, strTypeMaxLen=256, strTypeCutSuffix='...', stringifyType=1, ignoreErrors=False, objectsHash=None, depth=0, indent=2, ensure_ascii=False, maxDepth=10)
Definition: Utils.py:410
-mask-info
Here is the call graph for this function:
Here is the caller graph for this function:

◆ process()

def dc_crawler.RTCFinalizer.RTCFinalizer.process (   self)

Definition at line 324 of file RTCFinalizer.py.

def process(self):
    """Run the finalizer steps for the batch received on stdin.

    A real-time crawling batch takes its URL content from the batch and
    then deletes the URLs; any other batch requests the content from the
    DB task manager. Afterwards the batch is optionally saved to a file,
    and the response is sent unless ``self.exitCode`` equals ``self.rc``.
    """
    self.getBatchFromInput()

    isRealTime = self.batch.crawlerType == dc_event.Batch.TYPE_REAL_TIME_CRAWLER
    if not isRealTime:
        self.logger.debug("Regular crawling batch")
        self.getURLContent()
    else:
        self.logger.debug("Real-Time crawling batch")
        self.getURLContentFromBatch()
        self.deleteURLContent()

    self.saveBatchToFile()

    # saveBatchToFile() sets exitCode to rc when it schedules a re-crawl;
    # in that case nothing is written to stdout.
    if self.exitCode != self.rc:
        self.sendURLContent()
Here is the call graph for this function:
Here is the caller graph for this function:

◆ run()

def dc_crawler.RTCFinalizer.RTCFinalizer.run (   self)

Definition at line 79 of file RTCFinalizer.py.

def run(self):
    """Application entry point: run the base app, initialise, then process.

    The order matters: the Cement base ``run`` comes first, then the
    configuration is loaded (which creates ``self.logger``), and only
    then is the batch processed.
    """
    foundation.CementApp.run(self)

    # Load configs, set up logging and the DB task manager.
    self.__initApp()

    # Main work of the finalizer.
    self.process()

    # Mark the end of this run in the log.
    self.logger.info(APP_CONSTS.LOGGER_DELIMITER_LINE)
Here is the call graph for this function:

◆ saveBatchToFile()

def dc_crawler.RTCFinalizer.RTCFinalizer.saveBatchToFile (   self)

Definition at line 225 of file RTCFinalizer.py.

def saveBatchToFile(self):
    """Check the batch items and save the (possibly modified) batch to ``self.rb``.

    Does nothing when ``self.rb`` is ``None``. Otherwise every item whose
    site uses ``BaseFetcher.TYP_AUTO`` is checked with ``ContentCheck``.
    When the check result is false, the item is reset for another crawl
    with ``BaseFetcher.TYP_DYNAMIC``, a URL cleanup request is queued and,
    if ``self.rc`` is set, ``self.exitCode`` becomes ``self.rc``. Queued
    cleanups are sent to the DB task manager in one call, and finally the
    batch is pickled into the file named by ``self.rb``.
    """
    if self.rb is None:
        return

    self.logger.debug("batchSaveFile is = " + str(self.rb))
    urlCleanupList = []
    contentCheck = ContentCheck()
    for item in self.items:
        # Empty items are tolerated elsewhere in this class (getURLContent,
        # deleteURLContent); skip them here instead of failing on None.
        if item is None:
            continue
        if item.siteObj is not None and item.siteObj.fetchType == BaseFetcher.TYP_AUTO:
            if item.urlPutObj is not None and contentCheck.lookMetricsinContent(item.urlPutObj):
                self.logger.debug(">>> start checkUrlPutObj")
                metricsApplying = self.selectSiteProperty(item, "FINALIZER_METRICS")
                toRecrawl = contentCheck.checkUrlPutObj(item.urlPutObj, contentCheck.CHECK_TYPE_SIMPLE,
                                                        metricsApplying)
            else:
                self.logger.debug(">>> start urlObj")
                toRecrawl = contentCheck.checkUrlObj(item.urlObj)
            # NOTE(review): this check is kept inside the TYP_AUTO branch,
            # the only place where toRecrawl is assigned and siteObj is
            # known to be set - confirm against the original indentation.
            if not toRecrawl:
                urlCleanup = dc_event.URLCleanup(item.urlObj.siteId, item.urlObj.url)
                # The cleanup addresses the URL by its MD5, not by its text.
                urlCleanup.urlType = dc_event.URLStatus.URL_TYPE_MD5
                urlCleanup.url = item.urlObj.urlMd5
                urlCleanupList.append(urlCleanup)
                # Reset the item so that it is crawled again dynamically.
                item.siteObj.fetchType = BaseFetcher.TYP_DYNAMIC
                item.urlObj.status = dc_event.URL.STATUS_SELECTED_CRAWLING
                item.urlObj.crawled = 0
                item.urlObj.urlPut = None
                item.urlPutObj = None
                if self.rc is not None:
                    self.exitCode = self.rc

    if len(urlCleanupList) > 0:
        drceSyncTasksCoverObj = DC_CONSTS.DRCESyncTasksCover(DC_CONSTS.EVENT_TYPES.URL_CLEANUP, urlCleanupList)
        self.dbTask.process(drceSyncTasksCoverObj)

    # The context manager closes the file even if pickling or writing
    # fails; open() raises on failure, so no None check is needed.
    with open(self.rb, "w") as fd:
        fd.write(pickle.dumps(self.batch))
Here is the call graph for this function:
Here is the caller graph for this function:

◆ selectSiteProperty()

def dc_crawler.RTCFinalizer.RTCFinalizer.selectSiteProperty (   self,
  batchItem,
  propName 
)

Definition at line 372 of file RTCFinalizer.py.

def selectSiteProperty(self, batchItem, propName):
    """Look up a property by name, preferring the batch item over its site.

    Args:
        batchItem: object with a ``properties`` mapping (or ``None``) and a
            ``siteObj`` whose ``properties`` is an iterable of dicts with
            ``"name"`` and ``"value"`` keys (or ``None``).
        propName: name of the property to look up.

    Returns:
        The value from ``batchItem.properties`` when the name is present
        there; otherwise the value of the first matching site property;
        ``None`` when nothing matches. The site properties are consulted
        only when the item-level lookup does not match.
    """
    itemProperties = batchItem.properties
    if itemProperties is not None and propName in itemProperties:
        return itemProperties[propName]

    site = batchItem.siteObj
    if site is None or site.properties is None:
        return None

    for siteProperty in site.properties:
        if siteProperty["name"] == propName:
            return siteProperty["value"]
    return None
Here is the caller graph for this function:

◆ sendURLContent()

def dc_crawler.RTCFinalizer.RTCFinalizer.sendURLContent (   self)

Definition at line 220 of file RTCFinalizer.py.

def sendURLContent(self):
    """Write the pickled ``self.urlContentResponse`` to stdout and flush it."""
    payload = pickle.dumps(self.urlContentResponse)
    # Under the Python 2 print statement the parentheses only group the
    # expression, so this prints the payload followed by a newline.
    print(payload)
    sys.stdout.flush()
Here is the caller graph for this function:

◆ setup()

def dc_crawler.RTCFinalizer.RTCFinalizer.setup (   self)

Definition at line 73 of file RTCFinalizer.py.

def setup(self):
    """Set up the application by delegating to the Cement base class."""
    foundation.CementApp.setup(self)

Member Data Documentation

◆ batch

dc_crawler.RTCFinalizer.RTCFinalizer.batch

Definition at line 63 of file RTCFinalizer.py.

◆ dbTask

dc_crawler.RTCFinalizer.RTCFinalizer.dbTask

Definition at line 69 of file RTCFinalizer.py.

◆ exitCode

dc_crawler.RTCFinalizer.RTCFinalizer.exitCode

Definition at line 65 of file RTCFinalizer.py.

◆ FINALIZER_OPTION_DB_TASK_INI

string dc_crawler.RTCFinalizer.RTCFinalizer.FINALIZER_OPTION_DB_TASK_INI = "db_task_ini"
static

Definition at line 49 of file RTCFinalizer.py.

◆ FINALIZER_OPTION_LOG

string dc_crawler.RTCFinalizer.RTCFinalizer.FINALIZER_OPTION_LOG = "log"
static

Definition at line 48 of file RTCFinalizer.py.

◆ items

dc_crawler.RTCFinalizer.RTCFinalizer.items

Definition at line 64 of file RTCFinalizer.py.

◆ logger

dc_crawler.RTCFinalizer.RTCFinalizer.logger

Definition at line 62 of file RTCFinalizer.py.

◆ MSG_DELETE_URL_OK

string dc_crawler.RTCFinalizer.RTCFinalizer.MSG_DELETE_URL_OK = "URL was deleted"
static

Definition at line 45 of file RTCFinalizer.py.

◆ MSG_ERROR_DELETE_URL

string dc_crawler.RTCFinalizer.RTCFinalizer.MSG_ERROR_DELETE_URL = "Delete url has failed"
static

Definition at line 44 of file RTCFinalizer.py.

◆ MSG_ERROR_EMPTY_CONFIG_FILE_NAME

string dc_crawler.RTCFinalizer.RTCFinalizer.MSG_ERROR_EMPTY_CONFIG_FILE_NAME = "Config file name is empty."
static

Definition at line 37 of file RTCFinalizer.py.

◆ MSG_ERROR_LOAD_APP_CONFIG

string dc_crawler.RTCFinalizer.RTCFinalizer.MSG_ERROR_LOAD_APP_CONFIG = "Error loading application config file."
static

Definition at line 39 of file RTCFinalizer.py.

◆ MSG_ERROR_PARSE_CMD_PARAMS

string dc_crawler.RTCFinalizer.RTCFinalizer.MSG_ERROR_PARSE_CMD_PARAMS = "Error parse command line parameters."
static

Definition at line 36 of file RTCFinalizer.py.

◆ MSG_ERROR_READ_DB_TASK_CONFIG

string dc_crawler.RTCFinalizer.RTCFinalizer.MSG_ERROR_READ_DB_TASK_CONFIG = "Error read db-task config file."
static

Definition at line 41 of file RTCFinalizer.py.

◆ MSG_ERROR_READ_LOG_CONFIG

string dc_crawler.RTCFinalizer.RTCFinalizer.MSG_ERROR_READ_LOG_CONFIG = "Error read log config file."
static

Definition at line 40 of file RTCFinalizer.py.

◆ MSG_ERROR_WRONG_CONFIG_FILE_NAME

string dc_crawler.RTCFinalizer.RTCFinalizer.MSG_ERROR_WRONG_CONFIG_FILE_NAME = "Config file name is wrong"
static

Definition at line 38 of file RTCFinalizer.py.

◆ rb

dc_crawler.RTCFinalizer.RTCFinalizer.rb

Definition at line 67 of file RTCFinalizer.py.

◆ rc

dc_crawler.RTCFinalizer.RTCFinalizer.rc

Definition at line 68 of file RTCFinalizer.py.

◆ urlContentResponse

dc_crawler.RTCFinalizer.RTCFinalizer.urlContentResponse

Definition at line 66 of file RTCFinalizer.py.


The documentation for this class was generated from the following file: