HCE Project Python language Distributed Tasks Manager Application, Distributed Crawler Application and client API bindings.  2.0.0-chaika
Hierarchical Cluster Engine Python language binding
ScraperCustomJson.py
# coding: utf-8
'''
Created on Mar 02, 2016

@package: dc_processor
@author: scorp
@link: http://hierarchical-cluster-engine.com/
@copyright: Copyright © 2013-2014 IOIX Ukraine
@license: http://hierarchical-cluster-engine.com/license/
@since: 0.1
'''

# import re
import json
# import base64
import ConfigParser
import logging.config
import types
import sys
import copy
# import datetime
import time
import xml.sax.saxutils
try:
  import cPickle as pickle
except ImportError:
  import pickle
# import MySQLdb as mdb
from cement.core import foundation

# import dc.EventObjects as dc_event
from app.Utils import varDump
import app.Profiler
import app.Utils as Utils
import app.Consts as APP_CONSTS
from app.Utils import ExceptionLog
# from app.DateTimeType import DateTimeType
from app.FieldsSQLExpressionEvaluator import FieldsSQLExpressionEvaluator
import dc_processor.Constants as CONSTS
from dc_processor.Scraper import Scraper
from dc_processor.ScraperResponse import ScraperResponse
# from dc_processor.PDateTimezonesHandler import PDateTimezonesHandler

from dc_processor.scraper_resource import Resource
from dc_processor.scraper_result import Result as Result
from dc_processor.ScraperLangDetector import ScraperLangDetector

# scraper's modules used via eval()
from dc_processor.newspaper_extractor import NewspaperExtractor  # pylint: disable=W0611
from dc_processor.custom_extractor import CustomExtractor  # pylint: disable=W0611
from dc_processor.goose_extractor import GooseExtractor  # pylint: disable=W0611
from dc_processor.scrapy_extractor import ScrapyExtractor  # pylint: disable=W0611
from dc_processor.ml_extractor import MLExtractor  # pylint: disable=W0611
from dc_crawler.DBTasksWrapper import DBTasksWrapper

# status code
ERROR_OK = 0

# exit status codes
EXIT_SUCCESS = 0
EXIT_FAILURE = 1

MSG_ERROR_LOAD_EXTRACTORS = "Error load extractors "

ENV_SCRAPER_STORE_PATH = "ENV_SCRAPER_STORE_PATH"

TAGS_DATETIME_NEWS_NAMES = [CONSTS.TAG_PUB_DATE, CONSTS.TAG_DC_DATE]

class ScraperCustomJson(Scraper):  # #foundation.CementApp):

  # # Constant error messages used in class
  MSG_ERROR_WRONG_CONFIG_FILE_NAME = "Config file name is wrong"
  # # Constants used in class
  TAGS_DATETIME_TEMPLATE_TYPES = [CONSTS.TAG_TYPE_DATETIME]
  OPTION_SECTION_DATETIME_TEMPLATE_TYPES = 'tags_datetime_template_types'

  # Mandatory
  class Meta(object):
    label = CONSTS.SCRAPER_CUSTOM_JSON_APP_CLASS_NAME

    def __init__(self):
      pass


  # # Constructor
  def __init__(self, usageModel=APP_CONSTS.APP_USAGE_MODEL_PROCESS, configFile=None, logger=None, inputData=None):
    if usageModel == APP_CONSTS.APP_USAGE_MODEL_PROCESS:
      # call base class __init__ method
      # #foundation.CementApp.__init__(self)
      Scraper.__init__(self)

    self.exitCode = APP_CONSTS.EXIT_SUCCESS
    self.usageModel = usageModel
    self.configFile = configFile
    self.logger = logger
    self.input_data = inputData
    self.properties = {}
    self.extractor = None
    self.extractors = []
    self.itr = None
    self.pubdate = None
    self.timezone = None
    self.errorMask = APP_CONSTS.ERROR_OK
    self.scraperPropFileName = None
    self.algorithm_name = None
    self.scraperResponses = []
    self.tagsCount = 0
    self.tagsMask = 0
    self.processedContent = None
    self.outputFormat = None
    self.metrics = None
    self.altTagsMask = None
    self.urlHost = None
    self.output_data = None
    self.dbWrapper = None
    self.datetimeTemplateTypes = []
    self.useCurrentYear = 0

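
  # A hedged usage sketch for the in-module (non-process) usage model; the
  # constant name APP_USAGE_MODEL_MODULE and the prepared scraperInData object
  # are assumptions, only the constructor signature and output_data come from
  # this class:
  #
  #   scraper = ScraperCustomJson(usageModel=APP_CONSTS.APP_USAGE_MODEL_MODULE,
  #                               configFile="scraper-custom-json.ini",
  #                               logger=Utils.MPLogger().getLogger(),
  #                               inputData=scraperInData)
  #   scraper.run()
  #   responses = scraper.output_data  # list of ScraperResponse objects
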

  # # setup application
  def setup(self):
    if self.usageModel == APP_CONSTS.APP_USAGE_MODEL_PROCESS:
      # call base class setup method
      foundation.CementApp.setup(self)

  # #run
  # run application
  def run(self):
    if self.usageModel == APP_CONSTS.APP_USAGE_MODEL_PROCESS:
      # call base class run method
      foundation.CementApp.run(self)

    # config section
    self.loadConfig()

    # load logger config file
    self.loadLogConfigFile()

    # options
    self.loadOptions()

    # scraper properties
    self.loadScraperProperties()

    # Do applied algorithm's job
    self.processBatch()

    if self.usageModel == APP_CONSTS.APP_USAGE_MODEL_PROCESS:
      # Finish logging
      self.logger.info(APP_CONSTS.LOGGER_DELIMITER_LINE)

  # #load config from file
  # load from cli argument or default config file
  def loadConfig(self):
    try:
      self.config = ConfigParser.ConfigParser()
      self.config.optionxform = str
      if self.usageModel == APP_CONSTS.APP_USAGE_MODEL_PROCESS:
        if self.pargs.config:
          self.config.read(self.pargs.config)
        else:
          self.config.read(CONSTS.SCRAPER_CUSTOM_JSON_APP_CLASS_NAME)
      else:
        self.config.read(self.configFile)
    except:
      raise

  # #load logging
  # load logging configuration (log file, log level, filters)
  #
  def loadLogConfigFile(self):
    try:
      if self.usageModel == APP_CONSTS.APP_USAGE_MODEL_PROCESS:
        log_conf_file = self.config.get("Application", "log")
        logging.config.fileConfig(log_conf_file)
        # Logger initialization
        self.logger = Utils.MPLogger().getLogger()
    except Exception as err:
      raise Exception(CONSTS.MSG_ERROR_LOAD_CONFIG + " : " + str(err))

  # #load mandatory options
  #
  def loadOptions(self):
    try:
      # class_name = self.__class__.__name__
      self.scraperPropFileName = self.config.get("Application", "property_file_name")
      # self.config_db_dir = self.config.get(class_name, "config_db_dir")
      # self.sqliteTimeout = self.config.getint("sqlite", "timeout")

      self.useCurrentYear = self.config.getint("DateTimeType", "useCurrentYear")

      if self.config.has_section(self.OPTION_SECTION_DATETIME_TEMPLATE_TYPES):
        self.datetimeTemplateTypes = []
        for key, value in self.config.items(self.OPTION_SECTION_DATETIME_TEMPLATE_TYPES):
          self.datetimeTemplateTypes.append(key)
          if self.logger is not None:
            self.logger.debug('loaded from config: ' + str(key) + ' = ' + str(value))
      else:
        self.datetimeTemplateTypes = self.TAGS_DATETIME_TEMPLATE_TYPES
        if self.logger is not None:
          self.logger.debug("Config file hasn't section: " + str(self.OPTION_SECTION_DATETIME_TEMPLATE_TYPES))

      # DBWrapper initialization
      dbTaskIniConfigFileName = self.config.get(self.__class__.__name__, "db-task_ini")
      config = ConfigParser.ConfigParser()
      config.optionxform = str
      readOk = config.read(dbTaskIniConfigFileName)
      if len(readOk) == 0:
        raise Exception(self.MSG_ERROR_WRONG_CONFIG_FILE_NAME + ": " + dbTaskIniConfigFileName)
      self.dbWrapper = DBTasksWrapper(config)
    except:
      raise

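
  # A hedged sketch of the INI sections read above; the section and option
  # names come from the config.get() calls, while file paths and values are
  # illustrative only:
  #
  #   [Application]
  #   log = /etc/dc_processor/log.ini
  #   property_file_name = /etc/dc_processor/scraper-properties.json
  #
  #   [DateTimeType]
  #   useCurrentYear = 0
  #
  #   [tags_datetime_template_types]
  #   datetime = 1
  #
  #   [ScraperCustomJson]
  #   db-task_ini = /etc/dc_processor/db-task.ini
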

  # #loadScraperProperties
  # loadScraperProperties loads scraper properties from json file
  def loadScraperProperties(self):
    if self.scraperPropFileName is not None:
      try:
        with open(self.scraperPropFileName, "rb") as fd:
          scraperProperties = json.loads(fd.read())
          self.properties = scraperProperties[self.__class__.__name__][CONSTS.PROPERTIES_KEY]
      except Exception as excp:
        self.logger.debug(">>> Some error with scraper property loads = " + str(excp))

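
  # A minimal sketch of the expected properties file layout, assuming
  # CONSTS.PROPERTIES_KEY == "properties", CONSTS.ALGORITHM_KEY == "algorithm",
  # CONSTS.ALGORITHM_NAME_KEY == "name" and CONSTS.MODULES_KEY == "modules"
  # (key names and values are assumptions for illustration):
  #
  #   {
  #     "ScraperCustomJson": {
  #       "properties": {
  #         "algorithm": {"name": "json_parser"},
  #         "modules": {"json_parser": ["CustomExtractor"]}
  #       }
  #     }
  #   }
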

  # #process batch
  # the main processing of the batch object
  def processBatch(self):
    if self.usageModel == APP_CONSTS.APP_USAGE_MODEL_PROCESS:
      # read pickled batch object from stdin
      input_pickled_object = sys.stdin.read()
    try:
      if self.usageModel == APP_CONSTS.APP_USAGE_MODEL_PROCESS:
        scraper_in_data = pickle.loads(input_pickled_object)
    except Exception as err:
      ExceptionLog.handler(self.logger, err, 'pickle.loads() error:')
      self.logger.debug("input_pickled_object:\n" + str(input_pickled_object))
      self.exitCode = EXIT_FAILURE
      raise Exception(err)

    try:
      if self.usageModel == APP_CONSTS.APP_USAGE_MODEL_PROCESS:
        self.input_data = scraper_in_data
      if self.input_data.batch_item.urlObj is not None:
        urlString = self.input_data.batch_item.urlObj.url
      else:
        urlString = ""
      logMsg = "BatchItem.siteId=" + str(self.input_data.batch_item.siteId) + \
               ", BatchItem.urlId=" + str(self.input_data.batch_item.urlId) + \
               ", BatchItem.urlObj.url=" + urlString
      app.Profiler.messagesList.append(logMsg)
      self.logger.info("Incoming data: %s", logMsg)

      self.urlHost = app.Utils.UrlParser.getDomain(self.input_data.url)

      if self.input_data.output_format is not None and "name" in self.input_data.output_format:
        self.outputFormat = self.input_data.output_format["name"]

      if self.outputFormat is None and "templates" in self.input_data.batch_item.properties["template"] and \
          len(self.input_data.batch_item.properties["template"]["templates"]) > 0 and \
          "output_format" in self.input_data.batch_item.properties["template"]["templates"][0] and \
          "name" in self.input_data.batch_item.properties["template"]["templates"][0]["output_format"]:
        self.outputFormat = self.input_data.batch_item.properties["template"]["templates"][0]["output_format"]["name"]

      if "TAGS_MAPPING" in self.input_data.batch_item.properties and \
          self.input_data.batch_item.properties["TAGS_MAPPING"] is not None:
        try:
          self.altTagsMask = json.loads(self.input_data.batch_item.properties["TAGS_MAPPING"])
          self.logger.debug(">>> AltTags = " + str(self.altTagsMask))
        except Exception as exp:
          self.logger.debug(">>> Bad TAGS_MAPPING properties value, err=" + str(exp))

      try:
        if (self.input_data is not None) and (self.input_data.processor_properties is not None):
          processor_properties = self.input_data.processor_properties
          self.logger.debug("Processor's properties were taken from input data: %s" % processor_properties)
          self.logger.debug("Processor's properties type: %s" % str(type(processor_properties)))
          if not isinstance(processor_properties, types.DictType):
            processor_properties = json.loads(self.input_data.processor_properties)
            self.logger.debug("Processor's properties were parsed from input data: %s" % processor_properties)
          self.properties.update(processor_properties)
      except Exception as err:
        ExceptionLog.handler(self.logger, err, 'Error load properties from input data:')

      self.algorithm_name = self.properties[CONSTS.ALGORITHM_KEY][CONSTS.ALGORITHM_NAME_KEY]
      self.logger.debug("Algorithm : %s" % self.algorithm_name)
      if self.usageModel == APP_CONSTS.APP_USAGE_MODEL_PROCESS:
        Utils.storePickleOnDisk(input_pickled_object, ENV_SCRAPER_STORE_PATH, "scraper.in." + \
                                str(self.input_data.urlId))
      if "metrics" in self.properties:
        try:
          self.metrics = json.loads(self.properties["metrics"])
          self.logger.debug(">>> Metrics loads = " + str(self.metrics))
        except Exception as excp:
          self.logger.debug(">>> Metrics loads exception = " + str(excp))
      # TODO: main processing over every url from list of urls in the batch object
      tmp = sys.stdout
      sys.stdout = open("/dev/null", "wb")

      # initialization of scraper
      # load scraper's modules
      self.loadExtractors()

      # # Initialization pubdate
      # self.logger.debug("Initialization pubdate from urlObj.pDate use value: %s",
      #                   str(self.input_data.batch_item.urlObj.pDate))
      # self.pubdate = self.input_data.batch_item.urlObj.pDate

      scraperResponses = self.jsonParserProcess()

      sys.stdout = tmp

      self.logger.debug("scraperResponse:\n%s", varDump(scraperResponses))
      if self.usageModel == APP_CONSTS.APP_USAGE_MODEL_PROCESS:
        output_pickled_object = pickle.dumps(scraperResponses)
        Utils.storePickleOnDisk(output_pickled_object, ENV_SCRAPER_STORE_PATH,
                                "scraper.out." + str(self.input_data.urlId))
        print output_pickled_object
        sys.stdout.flush()
      else:
        self.output_data = scraperResponses
    except Exception as err:
      ExceptionLog.handler(self.logger, err, 'ScraperCustomJson process batch error:')
      self.exitCode = EXIT_FAILURE
      raise Exception('ScraperCustomJson process batch error:' + str(err))

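
  # In the process usage model the scraper is driven over pipes: a pickled
  # input object arrives on stdin and a pickled list of ScraperResponse objects
  # is printed to stdout. A hedged driver sketch; the command line and the
  # scraperInData object are illustrative, only the stdin/stdout contract is
  # taken from processBatch() above:
  #
  #   import pickle
  #   import subprocess
  #   proc = subprocess.Popen(["python", "ScraperCustomJson.py"],
  #                           stdin=subprocess.PIPE, stdout=subprocess.PIPE)
  #   out, _ = proc.communicate(pickle.dumps(scraperInData))
  #   scraperResponses = pickle.loads(out)
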

  # #load extractors
  #
  def loadExtractors(self):
    try:
      # modules
      if CONSTS.MODULES_KEY in self.properties and self.algorithm_name in self.properties[CONSTS.MODULES_KEY]:
        modules = self.properties[CONSTS.MODULES_KEY][self.algorithm_name]
      else:
        self.logger.debug(">>> No modules key or algorithm_name in self.properties")
        modules = []

      self.logger.debug("Algorithm name: <%s>" % (self.algorithm_name))
      self.logger.debug("Modules: %s" % modules)

      self.extractors = []
      for module in modules:
        extractor = self.createModule(module)
        # Check if module was created successfully and then insert it to extractors
        if extractor is not None:
          self.extractors.append(extractor)

      # Info show extractors loaded
      self.logger.debug("*******************")
      self.logger.debug("Loaded extractors:")
      for extractor in self.extractors:
        self.logger.debug(extractor.name)
      self.logger.debug("*******************")

    except Exception as err:
      ExceptionLog.handler(self.logger, err, MSG_ERROR_LOAD_EXTRACTORS)
      raise


  # #createModule
  # create extractor module instance by name
  #
  # @param module_name module class name which instance will be created
  # @return instance of created module or None on failure
  def createModule(self, module_name):
    appInst = None
    try:
      appInst = (module_name, eval(module_name)(self.config, None, self.urlHost, self.properties))[1]  # pylint: disable=W0123
      self.logger.debug("%s has been created!" % module_name)
    except Exception as err:
      ExceptionLog.handler(self.logger, err, "Can't create module %s. Error is:" % (module_name))

    return appInst

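
  # The eval() factory above resolves one of the extractor classes imported at
  # module scope. A minimal sketch of an equivalent lookup that avoids eval(),
  # offered as an alternative, not the project's method; createModuleSafe is a
  # hypothetical name:
  #
  #   def createModuleSafe(self, module_name):
  #     cls = globals().get(module_name)
  #     if cls is None:
  #       return None
  #     return cls(self.config, None, self.urlHost, self.properties)
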

  # #getNextBestExtractor
  # return extractor with highest rank
  def getNextBestExtractor(self):
    try:
      extractor = next(self.itr)
    except StopIteration:
      extractor = None
    return extractor

  # #resourceExtraction
  # extract tags from one json element using the loaded extractors
  def resourceExtraction(self, jsonElem):
    ret = []
    # get resource as dictionary
    resource_set = {}
    resource_set["url"] = self.input_data.url
    resource_set["resId"] = self.input_data.urlId
    resource_set["siteId"] = self.input_data.siteId
    resource_set["raw_html"] = jsonElem
    resource = Resource(resource_set)

    # get best matching extractor
    self.extractor = self.getNextBestExtractor()
    self.logger.debug("get best matching extractor: " + str(self.extractor))

    collectResult = Result(self.config, self.input_data.urlId, self.metrics)
    # main loop
    while self.extractor:
      result = Result(self.config, self.input_data.urlId, self.metrics)
      self.logger.debug(">>> TAG BEGIN extractor = " + str(self.extractor))
      result = self.extractor.extractTags(resource, result)

      self.logger.debug(">>> TAG END")
      empty_tags = result.getEmptyTags()
      self.logger.debug("get list of empty tags from result: " + str(empty_tags))
      filled_tags = result.getFilledTags()
      self.logger.debug("get list of filled_tags from result: " + str(filled_tags))
      self.extractor = self.getNextBestExtractor()
      self.logger.debug("get best matching extractor: " + str(self.extractor))

      for key in result.tags:
        if key not in collectResult.tags or not collectResult.isTagFilled(key):
          collectResult.tags[key] = copy.deepcopy(result.tags[key])
      ret.append(result)
    self.logger.debug(">>> EXIT LOOP")
    ret = [collectResult] + ret
    return ret

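
  # Merge semantics above, illustrated with plain dicts (Result objects are
  # simplified to dicts here and isTagFilled() is modelled as a non-empty
  # check): the first filled value of a tag wins, later extractors only add
  # tags the merged result lacks.
  #
  #   collect = {}
  #   for result in ({"title": "A"}, {"title": "B", "author": "C"}):
  #     for key, value in result.items():
  #       if key not in collect or not collect[key]:
  #         collect[key] = value
  #   # collect == {"title": "A", "author": "C"}
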

  # #formatOutpuElement
  # format one output element according to the given output format
  def formatOutpuElement(self, elem, localOutputFormat):
    ret = elem
    if localOutputFormat == "json":
      # self.logger.debug(">>> JSON HTML = " + elem)
      localStr = json.dumps(elem, ensure_ascii=False)
      if localStr[0] == '\"' or localStr[0] == '\'':
        localStr = localStr[1:]
      if localStr[-1] == '\"' or localStr[-1] == '\'':
        localStr = localStr[0:-1]
      ret = localStr
      # self.logger.debug(">>> JSON HTML = " + ret)
    elif localOutputFormat == "html" or localOutputFormat == "xml":
      ret = xml.sax.saxutils.escape(elem, {"'": "&apos;", "\"": "&quot;"})
    elif localOutputFormat == "sql":
      # ret = mdb.escape_string(elem)  # pylint: disable=E1101
      ret = Utils.escape(elem)
    return ret

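
  # Expected behaviour, for illustration; the json and xml lines follow
  # directly from json.dumps() and xml.sax.saxutils.escape(), while the sql
  # line assumes Utils.escape() backslash-escapes quotes:
  #
  #   formatOutpuElement('say "hi"', "json")  ->  say \"hi\"
  #   formatOutpuElement('say "hi"', "xml")   ->  say &quot;hi&quot;
  #   formatOutpuElement("it's", "sql")       ->  it\'s  (assumed escaping)
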

  # #formatOutputData
  # formatOutputData formats internal response's data by localOutputFormat format
  #
  def formatOutputData(self, response, localOutputFormat):
    for key in response.tags:
      if "data" in response.tags[key]:
        if isinstance(response.tags[key]["data"], types.ListType):
          for i, elem in enumerate(response.tags[key]["data"]):
            response.tags[key]["data"][i] = self.formatOutpuElement(elem, localOutputFormat)
        elif isinstance(response.tags[key]["data"], types.StringTypes):
          response.tags[key]["data"] = self.formatOutpuElement(response.tags[key]["data"], localOutputFormat)


  # #jsonParserExtractor
  # jsonParserExtractor extracts one json element
  #
  def jsonParserExtractor(self, jsonElem):
    if self.extractors is not None:
      self.itr = iter(sorted(self.extractors, key=lambda extractor: 0, reverse=True))  # pylint: disable=W0612,W0613
      self.logger.debug("Extractors: %s" % varDump(self.itr))

    responses = self.resourceExtraction(jsonElem)
    for response in responses:
      response.metricsPrecalculate()
      response.stripResult()
      # Add tag 'source_url'
      self.addCustomTag(result=response, tag_name=CONSTS.TAG_SOURCE_URL,
                        tag_value=[str(self.input_data.url)])

      if CONSTS.LANG_PROP_NAME in self.properties:
        # response.tagsLangDetecting(self.properties[CONSTS.LANG_PROP_NAME])
        langDetector = ScraperLangDetector(self.properties[CONSTS.LANG_PROP_NAME])
        langDetector.process(response, self.logger)
        langTagsDict = langDetector.getLangTags()
        self.logger.debug("langTagsDict: %s", varDump(langTagsDict))

        # add lang tags to processed content
        for tagName, langValue in langTagsDict.items():
          self.addCustomTag(result=response, tag_name=tagName, tag_value=langValue)

        summaryLang = langDetector.getSummaryLang(response, self.logger)
        self.addCustomTag(result=response, tag_name=CONSTS.TAG_SUMMARY_LANG, tag_value=summaryLang)

      pubdate, timezone = self.normalizeDatetime(response, self.algorithm_name)
      if pubdate is not None:
        self.pubdate = pubdate
        self.logger.debug("Pubdate from 'pubdate': " + str(self.pubdate))

      # Apply property 'PDATE_DAY_MONTH_ORDER'
      self.pubdate = self.pubdateMonthOrder(self.pubdate, self.input_data.batch_item.properties, self.input_data.url)

      # Apply property 'PDATE_TIME'
      self.input_data.batch_item.urlObj.pDate = self.pubdate
      self.pubdate = FieldsSQLExpressionEvaluator.evaluatePDateTime(self.input_data.batch_item.properties,
                                                                    self.dbWrapper,
                                                                    self.input_data.batch_item.urlObj,
                                                                    self.logger,
                                                                    self.pubdate)

      # Apply property 'PDATE_TIMEZONES'
      self.pubdate, timezone = self.pubdateTransform(self.pubdate,
                                                     timezone,
                                                     self.input_data.batch_item.properties,
                                                     self.input_data.url)

      # Add tag 'pubdate_tz'
      self.addCustomTag(result=response, tag_name=CONSTS.TAG_PUBDATE_TZ, tag_value=[timezone])

      if "pubdate" in response.tags and "data" in response.tags["pubdate"] and \
          len(response.tags["pubdate"]["data"]) > 0:
        response.tags["pubdate"]["data"][0] = self.pubdate

      if self.outputFormat is not None:
        self.formatOutputData(response, self.outputFormat)
      else:
        self.logger.debug(">>> Warning, can't extract output format")
      response.recalcTagMaskCount(None, self.altTagsMask)
      self.tagsCount = response.tagsCount
      self.tagsMask = response.tagsMask
      # self.putArticleToDB({"default":response})
      self.logger.debug("self.tagsCount: %s", self.tagsCount)
      self.logger.debug("self.tagsMasks: %s", self.tagsMask)

      self.logger.debug(">>> Resp: %s\n", varDump(response))

      # TODO: seems this needs to be done in a more systematic way
      response.finish = time.time()
      response.data["time"] = "%s" % (response.finish - response.start)

      response = self.applyHTTPRedirectLink(self.input_data.batch_item.siteId, self.input_data.batch_item.urlObj.url,
                                            self.input_data.batch_item.properties, response)

    self.getProcessedContent(responses)

  # #getProcessedContent fills self.processedContent's fields
  #
  def getProcessedContent(self, result):
    for elem in result:
      elem.get()
    self.processedContent = {}
    self.processedContent["default"] = result[0]
    self.processedContent["internal"] = result
    self.processedContent["custom"] = []
    self.tagsCount = result[0].tagsCount
    self.tagsMask = result[0].tagsMask

    if "pubdate" in result[0].tags and "data" in result[0].tags["pubdate"] and \
        len(result[0].tags["pubdate"]["data"]) > 0:
      self.pubdate = result[0].tags["pubdate"]["data"][0]
      self.logger.debug('>>>> Set self.pubdate = ' + str(self.pubdate))

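
  # Resulting structure, for illustration (result[0] is the merged Result that
  # resourceExtraction() prepends, the rest are per-extractor results):
  #
  #   self.processedContent = {
  #     "default": result[0],  # merged Result
  #     "internal": result,    # [merged, perExtractor1, perExtractor2, ...]
  #     "custom": []
  #   }
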

  # #fillScraperResponse resets per-element state and returns a new ScraperResponse instance
  #
  def fillScraperResponse(self, jsonElem):
    self.tagsCount = 0
    self.tagsMask = 0
    self.pubdate = None
    self.processedContent = None
    self.errorMask = APP_CONSTS.ERROR_OK
    self.jsonParserExtractor(jsonElem)
    return ScraperResponse(self.tagsCount, self.tagsMask, self.pubdate, self.processedContent, self.errorMask)


  # #generateEmptyResponse generates and returns empty response
  #
  def generateEmptyResponse(self):
    localResult = Result(self.config, self.input_data.urlId, self.metrics)
    # Add tag 'source_url'
    self.addCustomTag(result=localResult, tag_name=CONSTS.TAG_SOURCE_URL, tag_value=[str(self.input_data.url)])
    self.getProcessedContent([localResult])
    return ScraperResponse(0, 0, self.pubdate, self.processedContent, APP_CONSTS.ERROR_MASK_SCRAPER_ERROR)


  # #jsonParserProcess method executed for the json_parser algorithm
  #
  def jsonParserProcess(self):
    rawDataJson = None
    ret = []
    try:
      rawDataJson = json.loads(self.input_data.raw_content)
    except Exception as excp:
      self.logger.debug(">>> jsonParserProcess wrong rawData json: " + str(excp))

    self.logger.debug("!!! type(rawDataJson) = %s", str(type(rawDataJson)))
    if not isinstance(rawDataJson, list):
      self.logger.debug("!!! rawDataJson: %s", varDump(rawDataJson))

    if rawDataJson is not None and isinstance(rawDataJson, list):
      for elem in rawDataJson:
        if isinstance(elem, list):
          for internalElem in elem:
            ret.append(self.fillScraperResponse(internalElem))
        else:
          ret.append(self.fillScraperResponse(elem))
    else:
      self.logger.debug(">>> rawDataJson structure is not a list")

    if len(ret) == 0:
      ret.append(self.generateEmptyResponse())
    return ret

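
  # Accepted raw_content shapes, for illustration: a flat JSON list or a list
  # of lists (flattened one level); anything else yields a single empty
  # response.
  #
  #   '[{"a": 1}, {"a": 2}]'      -> two ScraperResponse objects
  #   '[[{"a": 1}], [{"a": 2}]]'  -> two ScraperResponse objects
  #   '{"a": 1}'                  -> one empty response with ERROR_MASK_SCRAPER_ERROR
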

  # #getExitCode method returns exitCode value
  #
  def getExitCode(self):
    return self.exitCode


# # # Add custom tag
# #
# # @param result - Scraper result instance
# # @param tag_name - name of the tag
# # @param tag_value - value of the tag
# # @return - None
# def addCustomTag(self, result, tag_name, tag_value):
#   data = {"extractor": "Base extractor", "data": "", "name": ""}
#   data["data"] = tag_value
#   data["name"] = tag_name
#   data["xpath"] = None
#   data["type"] = None
#   data["extractor"] = self.__class__.__name__
#   if tag_name not in result.tags:
#     result.tags[tag_name] = data


# # # Normalize datetime tags procedure
# #
# # @param response - scraper response instance
# # @param algorithmName - algorithm name
# # @return - 'pubdate' tag value
# def normalizeDatetime(self, response, algorithmName):
#   ret = None
#   timezone = ''
#   try:
#     if response is not None and response.tags is not None:
#       self.logger.debug("normalizeDatetime scraper response: " + varDump(response))
#       tagNames = []
#       if self.input_data.template and algorithmName == CONSTS.PROCESS_ALGORITHM_REGULAR:
#         # template
#         for responseType in self.datetimeTemplateTypes:
#           for responseTagName in response.tags:
#             self.logger.debug("normalizeDatetime responseTagName: '" + str(responseTagName) + "'")
#             if (response.tags.get(responseTagName) is not None and \
#                 'type' in response.tags[responseTagName] and \
#                 response.tags[responseTagName]['type'] == responseType) or \
#                 (responseTagName == CONSTS.TAG_PUB_DATE and response.tags.get(responseTagName) is not None):
#               tagNames.append(responseTagName)
#       else:
#         tagNames = TAGS_DATETIME_NEWS_NAMES
#
#       self.logger.debug('normalizeDatetime tagNames: ' + varDump(tagNames))
#       retDict = {}
#       for tagName in tagNames:
#         pubdate, tzone = self.extractPubDate(response, tagName)  # , properties, urlString)
#         if self.extractor and tagName in response.tags:
#           self.extractor.addTag(result=response, tag_name=tagName + '_normalized', tag_value=pubdate, \
#                                 xpath=response.tags[tagName]['xpath'])
#
#         self.logger.debug('tagName: ' + str(tagName) + ' pubdate: ' + str(pubdate))
#         retDict[tagName] = pubdate
#
#         if tagName == CONSTS.TAG_PUB_DATE:
#           ret = pubdate
#           timezone = tzone
#         else:
#           pass
#
#       if ret is None:
#         for key, value in retDict.items():
#           if value is not None:
#             ret = value
#             self.logger.debug('set return value from ' + str(key) + ' : ' + str(value))
#             break
#
#   except Exception as err:
#     ExceptionLog.handler(self.logger, err, 'normalizeDatetime error:', (), \
#                          {ExceptionLog.LEVEL_NAME_ERROR: ExceptionLog.LEVEL_VALUE_DEBUG})
#
#   return ret, timezone


# # # Extract pubdate
# #
# # @param response - response instance
# # @param dataTagName - tag name for extracting
# # @return pubdate and timezone if success or None and empty string
# def extractPubDate(self, response, dataTagName):  # , properties, urlString):
#   # variables for result
#   ret = None
#   timezone = ''
#   try:
#     if response is not None and dataTagName in response.tags and response.tags[dataTagName] is not None:
#
#       # self.logger.debug("extractPubDate response: " + varDump(response))
#
#       inputData = response.tags[dataTagName]["data"]
#       self.logger.debug("extractPubDate response has '" + str(dataTagName) + "' is: " + str(inputData))
#       self.logger.debug("extractPubDate type of '" + str(dataTagName) + "' is: " + str(type(inputData)))
#
#       inputList = []
#       if isinstance(inputData, basestring):
#         inputList = [inputData]
#       elif isinstance(inputData, list):
#         inputList = inputData
#       else:
#         pass
#
#       pubdate = []
#       timezones = []
#       for inputElem in inputList:
#         d = DateTimeType.parse(inputElem, bool(self.useCurrentYear), self.logger, False)
#         self.logger.debug('pubdate: ' + str(d))
#
#         if d is not None:
#           d, tzone = DateTimeType.split(d)
#           pubdate.append(d.isoformat(DateTimeType.ISO_SEP))
#           timezones.append(tzone)
#
#       self.logger.debug("extractPubDate result pubdate: " + str(pubdate))
#       response.tags[dataTagName]["data"] = pubdate
#       if len(pubdate) > 0:
#         ret = pubdate[0]
#
#       if len(timezones) > 0:
#         timezone = timezones[0]
#
#   except Exception as err:
#     ExceptionLog.handler(self.logger, err, 'extractPubDate error:', (), \
#                          {ExceptionLog.LEVEL_NAME_ERROR: ExceptionLog.LEVEL_VALUE_DEBUG})
#
#   return ret, timezone


# # # pubdate transformation using timezone value
# #
# # @param rawPubdate - raw pubdate string
# # @param rawTimezone - raw timezone string
# # @param properties - properties from PROCESSOR_PROPERTIES
# # @param urlString - url string value
# # @return pubdate and timezone if success or None and empty string
# def pubdateTransform(self, rawPubdate, rawTimezone, properties, urlString):
#   # variables for result
#   pubdate = rawPubdate
#   timezone = rawTimezone
#
#   self.logger.debug('properties: ' + varDump(properties))
#   if CONSTS.PDATE_TIMEZONES_NAME in properties:
#     propertyString = properties[CONSTS.PDATE_TIMEZONES_NAME]
#     self.logger.debug('inputted ' + CONSTS.PDATE_TIMEZONES_NAME + ':' + str(propertyString))
#
#     dt = DateTimeType.parse(rawPubdate, bool(self.useCurrentYear), self.logger, False)
#     self.logger.debug('pubdate: ' + str(dt))
#     if dt is not None:
#       # get utc offset if necessary
#       utcOffset = DateTimeType.extractUtcOffset(rawTimezone, self.logger)
#       self.logger.debug('utcOffset: ' + str(utcOffset))
#       # transformation according to PDATE_TIMEZONES properties
#       d = PDateTimezonesHandler.transform(dt, utcOffset, propertyString, urlString, self.logger)
#       if d is not None:
#         dt = d
#
#     if dt is not None:
#       d, tzone = DateTimeType.split(dt)
#       pubdate = d.isoformat(DateTimeType.ISO_SEP)
#       timezone = tzone
#
#   return pubdate, timezone


# # # change month order in pubdate if necessary
# #
# # @param rawPubdate - raw pubdate string in iso format, sample: '2016-02-07 16:28:00'
# # @param properties - properties from PROCESSOR_PROPERTIES
# # @param urlString - url string value
# # @return pubdate if success or unchanged rawPubdate
# def pubdateMonthOrder(self, rawPubdate, properties, urlString):
#   # variable for result
#   pubdate = rawPubdate
#
#   self.logger.debug('pubdateMonthOrder() enter... rawPubdate: ' + str(rawPubdate))
#   if CONSTS.PDATE_DAY_MONTH_ORDER_NAME in properties and isinstance(rawPubdate, basestring):
#     propertyObj = []
#     try:
#       self.logger.debug('inputted ' + CONSTS.PDATE_DAY_MONTH_ORDER_NAME + ':' + \
#                         str(properties[CONSTS.PDATE_DAY_MONTH_ORDER_NAME]))
#       propertyObj = json.loads(properties[CONSTS.PDATE_DAY_MONTH_ORDER_NAME])
#     except Exception as err:
#       self.logger.error("Fail loads '%s', error: %s", str(CONSTS.PDATE_DAY_MONTH_ORDER_NAME), str(err))
#
#     for propertyElem in propertyObj:
#       try:
#         if "pattern" not in propertyElem:
#           raise Exception('Property "pattern" not found')
#
#         if "order" not in propertyElem:
#           raise Exception('Property "order" not found')
#
#         pattern = str(propertyElem["pattern"])
#         order = int(propertyElem["order"])
#
#         if re.search(pattern, urlString, re.UNICODE) is not None:
#           self.logger.debug("Pattern '%s' found in url: %s", str(pattern), str(urlString))
#
#           dt = None
#           if order == 0:  # means day follows month
#             dt = datetime.datetime.strptime(rawPubdate, "%Y-%d-%m %H:%M:%S")
#           elif order == 1:  # means month follows day
#             dt = datetime.datetime.strptime(rawPubdate, "%Y-%m-%d %H:%M:%S")
#           else:
#             raise Exception("Unsupported value of 'order' == " + str(order))
#
#           if dt is not None:
#             pubdate = dt.strftime("%Y-%d-%m %H:%M:%S")
#
#       except Exception as err:
#         self.logger.error("Fail execution '%s', error: %s", str(CONSTS.PDATE_DAY_MONTH_ORDER_NAME), str(err))
#
#   self.logger.debug('pubdateMonthOrder() leave... pubdate: ' + str(pubdate))
#
#   return pubdate


# # # Get header content
# #
# # @param siteId - Site/Project ID
# # @param url - url string
# # @return extracted header content
# def getHeaderContent(self, siteId, url):
#   # variable for result
#   headerContent = None
#   urlContentObj = dc_event.URLContentRequest(siteId, url, \
#                                              dc_event.URLContentRequest.CONTENT_TYPE_RAW_LAST + \
#                                              dc_event.URLContentRequest.CONTENT_TYPE_RAW + \
#                                              dc_event.URLContentRequest.CONTENT_TYPE_HEADERS)
#
#   rawContentData = self.dbWrapper.urlContent([urlContentObj])
#
#   if rawContentData is not None and len(rawContentData) > 0:
#     if rawContentData[0].headers is not None and len(rawContentData[0].headers) > 0 and \
#         rawContentData[0].headers[0] is not None:
#       headerContent = rawContentData[0].headers[0].buffer
#
#   return headerContent
#
#
# # # Get variable from header content
# #
# # @param headerContent - header content
# # @param name - variable name
# # @param makeDecode - boolean flag whether decoding is necessary
# # @return extracted value of the named header
# def getVariableFromHeaderContent(self, headerContent, name, makeDecode=True):
#   # variable for result
#   ret = None
#
#   header = ''
#   if makeDecode and headerContent is not None:
#     header = base64.b64decode(headerContent)
#
#   headerList = header.split('\r\n')
#   self.logger.debug("headerList: " + varDump(headerList))
#
#   for elem in headerList:
#     pos = elem.find(name + ':')
#     if pos > -1:
#       ret = elem.replace(name + ':', '').strip()
#       self.logger.debug("Found '" + name + "' has value: " + str(ret))
#       break
#
#   return ret