HCE Project Python language Distributed Tasks Manager Application, Distributed Crawler Application and client API bindings.  2.0.0-chaika
Hierarchical Cluster Engine Python language binding
ScraperMultiItemsTask.py
"""
HCE project, Python bindings, Distributed Tasks Manager application.
ScraperMultiItemsTask class implements the main scraping functionality for multi items.

@package: dc_processor
@file ScraperMultiItemsTask.py
@author Alexander Vybornyh <alexander.hce.cluster@gmail.com>
@link: http://hierarchical-cluster-engine.com/
@copyright: Copyright &copy; 2013-2015 IOIX Ukraine
@license: http://hierarchical-cluster-engine.com/license/
@since: 0.1
"""

import os  # pylint: disable=W0611
import re
import sys
import time
import logging.config
import ConfigParser
import pickle
import json
# import datetime
# import base64
import copy
# import urlparse
import xml.sax.saxutils
# from contextlib import closing  # pylint: disable=W0611
from cement.core import foundation
from dateutil.parser import *  # pylint: disable=W0401,W0614
from dateutil import parser
from scrapy.selector import Selector

from dc.EventObjects import SiteFilter  # pylint: disable=W0611
from dc.EventObjects import Batch  # pylint: disable=W0611
import dc.EventObjects as dc_event
from app.Utils import varDump
from app.Utils import isValidURL
import app.Utils as Utils
import app.Consts as APP_CONSTS
from app.Utils import SQLExpression
import app.Profiler
from app.DateTimeType import DateTimeType
from app.FieldsSQLExpressionEvaluator import FieldsSQLExpressionEvaluator
from app.Utils import ExceptionLog
import dc_processor.Constants as CONSTS
from dc_processor.Scraper import Scraper
from dc_processor.ScraperInData import ScraperInData  # pylint: disable=W0611
from dc_processor.ScraperResponse import ScraperResponse
from dc_processor.TemplateExtractorXPathPreparing import TemplateExtractorXPathPreparing
from dc_processor.scraper_result import Result as Result
from dc_processor.PDateTimezonesHandler import PDateTimezonesHandler
from dc_processor.MediaLimitsHandler import MediaLimitsHandler
# scraper's modules used via eval()
from dc_processor.newspaper_extractor import NewspaperExtractor  # pylint: disable=W0611
from dc_processor.goose_extractor import GooseExtractor  # pylint: disable=W0611
from dc_processor.scrapy_extractor import ScrapyExtractor
from dc_processor.ml_extractor import MLExtractor  # pylint: disable=W0611
from dc_processor.base_extractor import BaseExtractor  # pylint: disable=W0611
from dc_processor.custom_extractor import CustomExtractor  # pylint: disable=W0611
from dc_crawler.DBTasksWrapper import DBTasksWrapper
import dc_crawler.Constants as CRAWLER_CONSTS

# # ScraperResultDocuments class supporting ScraperMultiItemsTask
#
class ScraperResultDocuments(object):

  # # Constructor
  #
  # @param keys - list of template names
  # @param urlId - input data urlId
  def __init__(self, keys, urlId):
    self.urlId = urlId
    self.docs = {}
    self.join = {}
    self.isExtract = {}
    self.mandatory = {}
    self.etree = {}
    for key in keys:
      self.docs[key] = []
      self.join[key] = []
      self.isExtract[key] = []
      self.mandatory[key] = []
      self.etree[key] = []


  # # Add etree for documents
  #
  # @param key - name of key
  # @param value - new value of etree
  # @return - None
  def addEtree(self, key, value):
    if key not in self.etree:
      self.docs[key] = []
      self.join[key] = []
      self.isExtract[key] = []
      self.mandatory[key] = []
      self.etree[key] = []

    self.etree[key].append(copy.deepcopy(value))


  # # Add new document
  #
  # @param key - name of key
  # @param value - new value of doc
  # @param join - join mode of the template rule
  # @param isExtract - flag of successful extraction
  # @param mandatory - mandatory flag of the template rule
  def addDoc(self, key, value, join, isExtract, mandatory):
    if key not in self.docs:
      self.docs[key] = []
      self.join[key] = []
      self.isExtract[key] = []
      self.mandatory[key] = []

    self.docs[key].append(copy.deepcopy(value))
    self.join[key].append(copy.deepcopy(join))
    self.isExtract[key].append(copy.deepcopy(isExtract))
    self.mandatory[key].append(copy.deepcopy(mandatory))


  # # Get count of documents
  #
  # @param inDict - input dictionary whose values are lists
  # @return maximum count of documents over all keys
  def getMaxCount(self, inDict):
    # variable for result
    count = 0
    for key in inDict.keys():
      count = max(count, len(inDict.get(key)))

    return count

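  # A minimal illustration (hypothetical values): for an input dictionary such
  # as {'title': [r1, r2], 'link': [r1]}, getMaxCount() returns 2, the length
  # of the longest value list.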

  # # Get list of tag names that exist in all documents
  #
  # @param - None
  # @return list of tag names present in all documents
  def getTagNamesExistAllDocs(self):
    # variable for result
    tagNames = []
    count = self.getMaxCount(self.docs)
    for key in self.docs.keys():
      size = len(self.docs.get(key))
      if count == size:
        tagNames.append(key)

    return tagNames


  # # Compare two paths and return their common part
  #
  # @param lhs - first path
  # @param rhs - second path
  # @param logger - logger instance
  # @return - common part of paths
  def getCommonPath(self, lhs, rhs, logger=None):  # pylint: disable=W0612,W0613
    # variable for result
    ret = []
    length = min(len(lhs), len(rhs))

    # if logger is not None:
    #   logger.debug('>>> lhs: ' + str(lhs))
    #   logger.debug('>>> rhs: ' + str(rhs))

    for i in range(length):
      if isinstance(lhs[i], str) and isinstance(rhs[i], str) and lhs[i] != rhs[i]:
        if i > 0:
          ret = lhs[:i]
        return ret

      # logger.info('len(lhs[' + str(i) + ']) = ' + str(len(lhs[i])) + \
      #             ' len(rhs[' + str(i) + ']) = ' + str(len(rhs[i])))

      if isinstance(lhs[i], tuple) and isinstance(rhs[i], tuple) and len(lhs[i]) == len(rhs[i]):
        for j in range(len(lhs[i])):
          # logger.info('lhs[' + str(j) + '] = ' + str(lhs[i][j] + ' rhs[' + str(j) + '] = ' + str(rhs[i][j])))
          if lhs[i][j] != rhs[i][j]:

            # logger.info('lhs[:i] = ' + str(lhs[:i]))
            if i > 0:
              ret = lhs[:i]

            # logger.debug('ret = ' + str(ret))
            return ret

    return ret

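  # A minimal illustration of getCommonPath() (hypothetical element paths, as
  # produced by get_path(), i.e. lists of (tag, index) tuples):
  #   getCommonPath([('html', 1), ('body', 1), ('div', 2)],
  #                 [('html', 1), ('body', 1), ('div', 5)])
  #   returns [('html', 1), ('body', 1)]
  # Paths that do not diverge within their common length yield [].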

  # # Calculate index path
  #
  # @param etree - etree input data
  # @param logger - logger instance
  # @return - common path derived from etree
  def calculateIndexPath(self, etree, logger=None):
    # variable for result
    ret = []
    pathDict = {}
    pathList = []

    for key in etree.keys():
      pathList.extend(etree.get(key))

    for index in range(len(pathList) - 1):
      commonPath = self.getCommonPath(pathList[index], pathList[index + 1], logger)
      commonPathCount = 0
      if str(commonPath) in pathDict:
        commonPathCount = int(pathDict.get(str(commonPath))[1])

      pathDict[str(commonPath)] = (commonPath, commonPathCount + 1)

    localpathList = []
    for elem in pathDict.values():
      localpathList.append(elem)

    localpathList.sort(key=lambda tup: tup[1], reverse=True)
    if len(localpathList) > 0:
      ret = (localpathList[0])[0]

    return ret

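  # A minimal illustration (hypothetical input): given etree paths
  #   [('body', 1), ('div', 3), ('h1', 1)] and [('body', 1), ('div', 4), ('a', 1)],
  # the single consecutive pair shares the prefix [('body', 1)], so that prefix
  # wins the frequency vote and is returned as the common index path of the items.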

  # # Get index number of path
  #
  # @param indexPath - common index path
  # @param elemPath - element path
  # @param logger - logger instance
  # @return index number (-1 if wrong indexPath), taken from the first element beyond the common prefix
  def getIndexNumberOfPath(self, indexPath, elemPath, logger=None):
    elementPath = copy.deepcopy(elemPath)
    length = min(len(indexPath), len(elementPath))

    if logger is not None:
      logger.debug('\n>>> indexPath: ' + str(indexPath))
      logger.debug('\n>>> elementPath: ' + str(elementPath))

    for i in range(length):
      if isinstance(indexPath[i], str) and isinstance(elementPath[i], str) and indexPath[i] != elementPath[i]:
        if logger is not None:
          logger.debug("Both have type 'str' and indexPath[" + str(i) + "] != elementPath[" + str(i) + "]")
        return -1

      if isinstance(indexPath[i], tuple) and isinstance(elementPath[i], tuple):
        size = min(len(indexPath[i]), len(elementPath[i]))
        for j in range(size):
          if indexPath[i][j] != elementPath[i][j]:
            if logger is not None:
              logger.debug("Both have type 'tuple' and indexPath[" + str(i) + "][" + str(j) + "] != elementPath[" + \
                           str(i) + "][" + str(j) + "]")
            return -1

    if len(elementPath) > len(indexPath):
      if logger is not None:
        logger.debug('type(elementPath[len(indexPath)])) = ' + str(type(elementPath[len(indexPath)])) + \
                     ' elementPath[' + str(len(indexPath)) + ']: ' + str(elementPath[len(indexPath)]))

      if isinstance(elementPath[len(indexPath)], tuple):
        if len(elementPath[len(indexPath)]) > 1:
          if logger is not None:
            logger.debug('>>> elementPath[' + str(len(indexPath)) + '][1] = ' + str(elementPath[len(indexPath)][1]))

          return elementPath[len(indexPath)][1]

    return -1

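  # A minimal illustration (hypothetical input): with indexPath
  # [('body', 1), ('ul', 2)] and elemPath [('body', 1), ('ul', 2), ('li', 7), ('a', 1)]
  # the element lies under the common prefix, so the method returns 7, the
  # ordinal of the <li> item; any mismatch inside the prefix returns -1.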

  # # Get all tags
  #
  # @param mandatoryTags - dictionary of mandatory properties for tags (key - tag name, value - boolean mandatory flag)
  # @param logger - logger instance
  # @return all tags
  def getAllTags(self, mandatoryTags, logger=None):
    # variable for result
    resTags = []
    count = self.getMaxCount(self.docs)

    # # Calculate index block
    indexPath = self.calculateIndexPath(self.etree, logger)
    if logger is not None:
      logger.info('Calculated indexPath: ' + str(indexPath))

    if logger is not None:
      for key in self.etree:
        logger.debug('len(self.etree.get(' + str(key) + ') = ' + str(len(self.etree.get(key))))
      for key in self.docs:
        logger.debug('len(self.docs.get(' + str(key) + ') = ' + str(len(self.docs.get(key))))

    resultList = []
    for index in range(self.getMaxCount(self.etree)):
      localRes = Result(None, self.urlId)
      resultList.append(localRes)

    if logger is not None:
      logger.debug('count = ' + str(count))
      logger.debug('len(resultList) = ' + str(len(resultList)))

    for key in self.docs.keys():
      for index in range(len(self.docs.get(key))):
        if logger is not None:
          logger.debug('==== key: ' + str(key) + ' index: ' + str(index) + ' ====')

        if len(self.etree.get(key)) > index:
          number = int(self.getIndexNumberOfPath(indexPath, self.etree.get(key)[index], logger))
          if logger is not None:
            logger.debug('number = ' + str(number) + ' self.docs.get(' + str(key) + ')[' + str(index) + '].tags: ' + \
                         varDump(self.docs.get(key)[index].tags))

          if int(number) > 0 and int(number) <= len(self.docs.get(key)):
            if key in resultList[int(number) - 1].tags:
              result = self.updateTagValue(resultList[int(number) - 1], self.docs.get(key)[index].tags, key)
              resultList[int(number) - 1].tags.update(result.tags)
            else:
              resultList[int(number) - 1].tags.update({key: self.docs.get(key)[index].tags[key]})

            if logger is not None:
              logger.debug("resultList[" + str(int(number) - 1) + "].tags.update({" + str(key) + ":self.docs.get(" + \
                           str(key) + ")[" + str(index) + "].tags[" + str(key) + "]})")

    for index in range(0, len(resultList)):
      isMandatory = True
      countSelected = 0
      for key in self.docs.keys():
        if key not in resultList[index].tags and bool(mandatoryTags[key]) is True:
          isMandatory = False
          break

        if key in resultList[index].tags:
          countSelected = countSelected + 1

      if countSelected == 0:
        isMandatory = False

      if isMandatory:
        resTags.append(resultList[index])

    if len(resTags) == 0:
      resTags.append(Result(None, self.urlId))

    return resTags


  # # Update value of tag
  #
  # @param result - instance of Result to update
  # @param tags - tags object to append to the Result object
  # @param tag_name - tag name used as key
  # @return updated Result instance
  def updateTagValue(self, result, tags, tag_name):

    data = {"extractor": "Base extractor", "data": "", "name": ""}
    data["data"] = [result.tags[tag_name]["data"][0] + tags[tag_name]["data"][0]]
    data["name"] = result.tags[tag_name]["name"]
    data["xpath"] = result.tags[tag_name]["xpath"]
    data["type"] = result.tags[tag_name]["type"]
    data["extractor"] = result.tags[tag_name]["extractor"]
    result.tags[tag_name] = data

    return result

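  # A minimal illustration (hypothetical payloads): if result.tags['title']['data']
  # is ['Foo'] and tags['title']['data'] is ['Bar'], updateTagValue() leaves
  # result.tags['title']['data'] == ['FooBar']; name, xpath, type and extractor
  # are kept from the existing tag.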

  # # Get all documents
  #
  # @param mandatoryTags - dictionary of mandatory properties for tags (key - tag name, value - boolean mandatory flag)
  # @param logger - logger instance
  # @return all documents
  def getAllDocs(self, mandatoryTags, logger=None):
    # variable for result
    resDocs = []

    resTags = self.getAllTags(mandatoryTags, logger)
    count = len(resTags)

    tagsNames = self.getTagNamesExistAllDocs()

    if len(tagsNames) > 0:
      key = tagsNames[0]

      for index in range(count):
        if len(self.join.get(key)) > index and \
           len(self.isExtract.get(key)) > index and \
           len(self.mandatory.get(key)) > index:
          resDocs.append({"obj": resTags[index],
                          "join": self.join.get(key)[index],
                          "isExtract": self.isExtract.get(key)[index],
                          "mandatory": self.mandatory.get(key)[index],
                          CONSTS.TAG_ORDER_NUMBER: len(resDocs) + 1})

    return resDocs


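# A minimal usage sketch of ScraperResultDocuments (hypothetical names: 'title'
# and 'link' stand for template keys, 'result' for a scraper_result.Result
# instance and 'path' for an element path built by get_path()):
#   docs = ScraperResultDocuments(['title', 'link'], urlId=1)
#   docs.addEtree('title', path)
#   docs.addDoc('title', result, join='concat', isExtract=True, mandatory=True)
#   items = docs.getAllDocs({'title': True, 'link': False})
# Each entry of 'items' describes one logical list item: its joined tags plus
# an order number stored under CONSTS.TAG_ORDER_NUMBER.
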
# # ScraperMultiItemsTask class implements the main scraping functionality for multi items,
# class inherits from foundation.CementApp
#
class ScraperMultiItemsTask(Scraper):  # #foundation.CementApp):

  # # Constants of error messages used in class
  MSG_ERROR_PARSE_CMD_PARAMS = "Error parse command line parameters."
  MSG_ERROR_EMPTY_CONFIG_FILE_NAME = "Config file name is empty."
  MSG_ERROR_WRONG_CONFIG_FILE_NAME = "Config file name is wrong"

  MSG_ERROR_LOAD_PROPERTIES_FROM_FILE = "Error load Scraper multi items properties from file"
  MSG_ERROR_LOAD_APP_CONFIG = "Error loading application config file."
  MSG_ERROR_READ_LOG_CONFIG = "Error read log config file."

  MSG_ERROR_READ_INPUT_DATA = "Error read input data from stdin."
  MSG_ERROR_INPUT_DATA_NONE = "Input data is none"
  MSG_ERROR_INPUT_DATA_WITHOUT_BATCH = "Input data without batch item."
  MSG_ERROR_INPUT_DATA_WITHOUT_PROPERTIES = "Input data has batch item without 'properties'."
  MSG_ERROR_GET_PROPERTIES = "Error getting properties from input data"

  MSG_ERROR_LOAD_EXTRACTORS = "Error load extractors "
  MSG_ERROR_ADJUST_PR = "Error adjust partial references. "
  MSG_ERROR_ADJUST_PUBDATE = "Error adjust PUBDATE. "
  MSG_ERROR_ADJUST_TITLE = "Error adjust title. "
  MSG_ERROR_ADJUST_LINK_URL = "Error adjust link URL. "


  # # Constants of options used from config file
  SCRAPER_MULTI_ITEMS_OPTION_LOG = "log"
  SCRAPER_MULTI_ITEMS_OPTION_PROPERTY_JSON_FILE = "property_file_name"

  # # Constants used in class
  ENV_SCRAPER_STORE_PATH = "self.ENV_SCRAPER_STORE_PATH"
  EXTENDED_NEWS_TAGS = {"description": ["//meta[@name='description']//@content"]}
  DATA_NEWS_TAGS = [CONSTS.TAG_DC_DATE]
#  WWW_PREFIX = "www."

  TAGS_DATETIME_TEMPLATE_TYPES = [CONSTS.TAG_TYPE_DATETIME]
  OPTION_SECTION_DATETIME_TEMPLATE_TYPES = 'tags_datetime_template_types'

  # Mandatory
  class Meta(object):
    label = CONSTS.SCRAPER_MULTI_ITEMS_APP_CLASS_NAME

    def __init__(self):
      pass


  # # constructor
  def __init__(self, usageModel=APP_CONSTS.APP_USAGE_MODEL_PROCESS, configFile=None, logger=None, inputData=None):
    if usageModel == APP_CONSTS.APP_USAGE_MODEL_PROCESS:
      # call base class __init__ method
      # #foundation.CementApp.__init__(self)
      Scraper.__init__(self)

    self.exitCode = APP_CONSTS.EXIT_SUCCESS
    self.usageModel = usageModel
    self.configFile = configFile
    self.logger = logger
    self.input_data = inputData
    self.properties = {}
    self.outputFormat = None
    self.output_data = None
    self.extractor = None
    self.extractors = []
    self.itr = None
    self.pubdate = None
    self.errorMask = APP_CONSTS.ERROR_OK
    self.xpathSplitString = ' '
    self.useCurrentYear = 0
    self.datetimeTemplateTypes = []
    self.dbWrapper = None
    self.mediaLimitsHandler = None


  # # setup application
  def setup(self):
    if self.usageModel == APP_CONSTS.APP_USAGE_MODEL_PROCESS:
      # call base class setup method
      foundation.CementApp.setup(self)

  # # run application
  def run(self):
    if self.usageModel == APP_CONSTS.APP_USAGE_MODEL_PROCESS:
      # call base class run method
      foundation.CementApp.run(self)
      # get input data from stdin
      self.input_data = self.__getInputData()

    # call initialization of application
    config = self.__initApp(self.configFile)

    self.process(config)

    if self.usageModel == APP_CONSTS.APP_USAGE_MODEL_PROCESS:
      # Finish logging
      self.logger.info(APP_CONSTS.LOGGER_DELIMITER_LINE)


  # # initialize application from config files
  #
  # @param configName - name of application config file
  # @return config - config parser
  def __initApp(self, configName=None):

    if configName is None:
      configName = self.pargs.config

    config, confLogFileName, scraperPropertyFileName = self.__loadAppConfig(configName)

    self.properties = self.__loadScraperProperties(scraperPropertyFileName)

    if self.logger is None:
      self.__loadLogConfig(confLogFileName)

    self.logger.info('self.properties: ' + varDump(self.properties))

    return config


  # # loads scraper properties from a json file
  #
  # @param scraperPropertyFileName - input scraper property json file
  # @return properties - extracted properties
  def __loadScraperProperties(self, scraperPropertyFileName):
    # variable for result
    properties = None
    if scraperPropertyFileName is not None:
      try:
        with open(scraperPropertyFileName, "rb") as fd:
          scraperProperties = json.loads(fd.read())
          properties = scraperProperties[self.__class__.__name__][CONSTS.PROPERTIES_KEY]
      except Exception, err:
        if self.logger is not None:
          self.logger.error(self.MSG_ERROR_LOAD_PROPERTIES_FROM_FILE + " '" + \
                            str(scraperPropertyFileName) + "': " + str(err))

    return properties

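  # A minimal sketch of the expected JSON layout (derived from the lookup
  # above; the literal value of CONSTS.PROPERTIES_KEY and the inner fields
  # are assumptions):
  #   {
  #     "ScraperMultiItemsTask": {
  #       <CONSTS.PROPERTIES_KEY>: { ... algorithm and module settings ... }
  #     }
  #   }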

  # # load application config file
  #
  # @param configName - name of application config file
  # @return config - config parser,
  # @return confLogFileName - log config file name,
  # @return scraperPropertyFileName - input scraper property file name
  def __loadAppConfig(self, configName):
    # variables for result
    confLogFileName = ''
    scraperPropertyFileName = ''
    try:
      if configName is None or configName == "":
        raise Exception(self.MSG_ERROR_EMPTY_CONFIG_FILE_NAME)

      config = ConfigParser.ConfigParser()
      config.optionxform = str

      readOk = config.read(configName)

      if len(readOk) == 0:
        raise Exception(self.MSG_ERROR_WRONG_CONFIG_FILE_NAME + ": " + configName)

      if config.has_section(APP_CONSTS.CONFIG_APPLICATION_SECTION_NAME):
        confLogFileName = Utils.getConfigParameter(config, APP_CONSTS.CONFIG_APPLICATION_SECTION_NAME, \
                                                   self.SCRAPER_MULTI_ITEMS_OPTION_LOG, '')

        scraperPropertyFileName = Utils.getConfigParameter(config, APP_CONSTS.CONFIG_APPLICATION_SECTION_NAME, \
                                                           self.SCRAPER_MULTI_ITEMS_OPTION_PROPERTY_JSON_FILE, '')

      self.useCurrentYear = config.getint("DateTimeType", "useCurrentYear")

      if config.has_section(self.OPTION_SECTION_DATETIME_TEMPLATE_TYPES):
        self.datetimeTemplateTypes = []
        for key, value in config.items(self.OPTION_SECTION_DATETIME_TEMPLATE_TYPES):
          self.datetimeTemplateTypes.append(key)
          if self.logger is not None:
            self.logger.debug('loaded from config: ' + str(key) + ' = ' + str(value))
      else:
        self.datetimeTemplateTypes = self.TAGS_DATETIME_TEMPLATE_TYPES
        if self.logger is not None:
          self.logger.debug("Config file hasn't section: " + str(self.OPTION_SECTION_DATETIME_TEMPLATE_TYPES))

      # DBWrapper initialization
      dbTaskIniConfigFileName = config.get(self.__class__.__name__, "db-task_ini")
      readOk = config.read(dbTaskIniConfigFileName)
      if len(readOk) == 0:
        raise Exception(self.MSG_ERROR_WRONG_CONFIG_FILE_NAME + ": " + dbTaskIniConfigFileName)
      self.dbWrapper = DBTasksWrapper(config)
    except Exception, err:
      raise Exception(self.MSG_ERROR_LOAD_APP_CONFIG + ' ' + str(err))

    return config, confLogFileName, scraperPropertyFileName

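  # A minimal sketch of the application config expected above (the literal
  # name of APP_CONSTS.CONFIG_APPLICATION_SECTION_NAME is an assumption,
  # shown here as [Application]; file names are hypothetical):
  #   [Application]
  #   log = scraper_multi_items_log.ini
  #   property_file_name = scraper_multi_items.json
  #
  #   [DateTimeType]
  #   useCurrentYear = 0
  #
  #   [ScraperMultiItemsTask]
  #   db-task_ini = db_task.ini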

  # # load log config file
  #
  # @param configName - name of the scraper multi items log config file
  # @return - None
  def __loadLogConfig(self, configName):
    try:
      if isinstance(configName, str) and len(configName) == 0:
        raise Exception(self.MSG_ERROR_EMPTY_CONFIG_FILE_NAME)

      logging.config.fileConfig(configName)

      # rotate log files and initialize the logger
      self.logger = Utils.MPLogger().getLogger()

    except Exception, err:
      raise Exception(self.MSG_ERROR_READ_LOG_CONFIG + ' ' + str(err))


  # # get existing extractor by extractor name
  #
  # @param extractorName - extractor name
  # @return instance of extractor or None
  def getExtractorByName(self, extractorName):
    for extractor in self.extractors:
      if extractor.__class__.__name__ == extractorName:
        return extractor
    # in case it was not found
    return None


  # # get exit code of application
  def getExitCode(self):
    return self.exitCode


  # # get input data from stdin
  #
  # @param - None
  # @return scraperInputData - input data read from stdin
  def __getInputData(self):
    # variable for result
    scraperInputData = None
    try:
      # read pickled object from stdin and extract it
      scraperInputData = pickle.loads(sys.stdin.read())
    except Exception, err:
      if self.logger is not None:
        ExceptionLog.handler(self.logger, err, self.MSG_ERROR_READ_INPUT_DATA)
      raise Exception(self.MSG_ERROR_READ_INPUT_DATA + ' ' + str(err))

    return scraperInputData

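  # A minimal sketch of how a parent process is expected to feed this method
  # (assuming a ScraperInData-like object; variable names are hypothetical):
  #   inputObject = ScraperInData(...)   # filled by the caller
  #   childProcess.stdin.write(pickle.dumps(inputObject))
  # __getInputData() simply unpickles whatever arrives on sys.stdin.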

  # # check input data
  #
  # @param inputData - input data
  # @return - None
  def __checkInputData(self, inputData):

    if inputData is None:
      raise Exception(self.MSG_ERROR_INPUT_DATA_NONE)

    if inputData.batch_item is None:
      raise Exception(self.MSG_ERROR_INPUT_DATA_WITHOUT_BATCH)

    if inputData.batch_item.properties is None:
      raise Exception(self.MSG_ERROR_INPUT_DATA_WITHOUT_PROPERTIES)



  # # fill profiler message list from input data
  #
  # @param inputData - input data
  # @return - None
  def __fillProfilerMessageList(self, inputData):

    if inputData.batch_item.urlObj is not None:
      urlString = inputData.batch_item.urlObj.url
    else:
      urlString = ""
    logMsg = "BatchItem.siteId=" + str(inputData.batch_item.siteId) + \
             ", BatchItem.urlId=" + str(inputData.batch_item.urlId) + \
             ", BatchItem.urlObj.url=" + urlString
    app.Profiler.messagesList.append(logMsg)
    self.logger.info("Incoming data: %s", logMsg)


  # # get output format from input data
  #
  # @param inputData - input data
  # @return outputFormat - output format data
  def __getOutputFormat(self, inputData):
    # variable for result
    outputFormat = None

    if inputData.output_format is not None and "name" in inputData.output_format:
      outputFormat = inputData.output_format["name"]

    if outputFormat is None and "templates" in inputData.batch_item.properties["template"] and \
       len(inputData.batch_item.properties["template"]["templates"]) > 0 and \
       "output_format" in inputData.batch_item.properties["template"]["templates"][0] and \
       "name" in inputData.batch_item.properties["template"]["templates"][0]["output_format"]:
      outputFormat = inputData.batch_item.properties["template"]["templates"][0]["output_format"]["name"]
    else:
      self.logger.debug(">>> 'output_format' is not present in the template of the input batch.")

    return outputFormat


  # # get alt tags mask as property from input data
  #
  # @param inputData - input data
  # @return altTagsMask - alt tags mask from input data
  def __getAltTagsMask(self, inputData):
    # variable for result
    altTagsMask = None
    if "TAGS_MAPPING" in inputData.batch_item.properties and \
       inputData.batch_item.properties["TAGS_MAPPING"] is not None:
      try:
        altTagsMask = json.loads(inputData.batch_item.properties["TAGS_MAPPING"])
        self.logger.debug(">>> AltTagsMask = " + str(altTagsMask))
      except Exception, err:
        ExceptionLog.handler(self.logger, err, 'Bad TAGS_MAPPING properties value:', \
                             (inputData.batch_item.properties["TAGS_MAPPING"]), \
                             {ExceptionLog.LEVEL_NAME_ERROR: ExceptionLog.LEVEL_VALUE_DEBUG})

    return altTagsMask


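  # A minimal sketch of a TAGS_MAPPING property value (the JSON shape is an
  # assumption; tag names are hypothetical):
  #   TAGS_MAPPING = '{"title": "alt_title", "pubdate": "date"}'
  # i.e. a JSON object mapping canonical tag names to alternative ones.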

  # # get properties from input data
  #
  # @param inputData - input data
  # @return properties - properties loaded from input data
  def __getPropertiesFromInputData(self, inputData):
    # variable for result
    properties = {}
    try:
      if (self.input_data is not None) and \
         inputData.processor_properties is not None:
        processor_properties = inputData.processor_properties
        self.logger.debug("Processor's properties were taken from input data: %s" % processor_properties)
        self.logger.debug("Processor's properties type: %s" % str(type(processor_properties)))
        if not isinstance(processor_properties, dict):
          processor_properties = json.loads(inputData.processor_properties)
          self.logger.debug("Processor's properties were taken from input data: %s" % processor_properties)
        properties = processor_properties

        self.logger.debug('>>> inputData.batch_item.properties: ' + varDump(inputData.batch_item.properties) + \
                          ' type: ' + str(type(inputData.batch_item.properties)))
        if isinstance(inputData.batch_item.properties, dict):
          properties.update(inputData.batch_item.properties)

    except Exception, err:
      ExceptionLog.handler(self.logger, err, self.MSG_ERROR_GET_PROPERTIES, (inputData.processor_properties))

    return properties


  # # load extractors
  #
  # @param algorithmName - name of algorithm used for extraction
  # @param config - config parser
  # @param urlHost - url of host
  # @return extractors - list of loaded extractor instances
  def __loadExtractors(self, algorithmName, config, urlHost):
    # variable for result
    extractors = []
    try:
      # modules
      modules = self.properties[CONSTS.MODULES_KEY][algorithmName]

      self.logger.debug("Algorithm name: <%s>" % (algorithmName))
      self.logger.debug("Modules: %s" % modules)

      for module in modules:
        extractor = self.__createModule(module, config, urlHost)
        # Check if module was created successfully and then insert it to extractors
        if extractor is not None:
          extractors.append(extractor)

      # Info show extractors loaded
      self.logger.debug("*******************")
      self.logger.debug("Loaded extractors:")
      for extractor in extractors:
        self.logger.debug(extractor.name)
      self.logger.debug("*******************")

    except Exception, err:
      ExceptionLog.handler(self.logger, err, self.MSG_ERROR_LOAD_EXTRACTORS)
      raise Exception(self.MSG_ERROR_LOAD_EXTRACTORS + ' ' + str(err))

    return extractors


  # # create module
  #
  # @param moduleName - module name whose instance will be created
  # @param config - config parser
  # @param urlHost - url of host
  # @return appInst - instance of created application
  def __createModule(self, moduleName, config, urlHost):
    # variable for result
    appInst = None
    try:
      appInst = (moduleName, eval(moduleName)(config, None, urlHost))[1]  # pylint: disable=W0123
      self.logger.debug("%s has been created!" % moduleName)
    except Exception, err:
      ExceptionLog.handler(self.logger, err, "Can't create module %s. Error is:" % (moduleName))

    return appInst


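  # A minimal illustration (the module name is one of those imported above for
  # use via eval()): __createModule("NewspaperExtractor", config, urlHost)
  # resolves the class object by name and instantiates it as
  # NewspaperExtractor(config, None, urlHost).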

#  # # adjust partial references
#  #
#  def checkDOMElement(self, elem):
#    ret = False
#    if re.search('<', elem):
#      self.logger.debug("Media tag contain DOM element: %s", elem)
#      ret = True
#    return ret


#  # # adjust partial references
#  #
#  def adjustPartialReferences(self, response):
#    if "links" in response.tags and isinstance(response.tags["link"], dict) and \
#        "media" in response.tags and isinstance(response.tags["media"], dict):
#      try:
#        url = None
#        if self.input_data.template and "link" in self.input_data.template:
#          self.logger.debug("url type: %s", str(type(response.tags["link"]["data"])))
#          if isinstance(response.tags["link"]["data"], str) or isinstance(response.tags["link"]["data"], unicode):
#            url = response.tags["link"]["data"]
#          else:
#            url = response.tags["link"]["data"][0]
#        else:
#          url = self.input_data.url
#        if self.input_data.template and "media" in self.input_data.template:
#          self.logger.debug("resource has template with media tag. Try to adjust media.")
#          # if type(response.tags["media"]) == str and response.tags["media"] == "": return
#          self.logger.debug("response.tags['media']: " + str(response.tags["media"]))
#          self.logger.debug("media tag in response: <<%s>>" % str(response.tags["media"]["data"]))
#          self.logger.debug("link tag in response: <<%s>>" % str(url))
#          res = []
#
#          filter_patterns, filter_types = [], []
#          if self.input_data.filters:
#            # filter_types = [filter_item["Type"] for filter_item in self.input_data.filters]
#            # filter_patterns = [re.compile(filter_item["Pattern"]) for filter_item in self.input_data.filters]
#            filter_types = [filter_item.type for filter_item in self.input_data.filters]
#            filter_patterns = [re.compile(filter_item.pattern) for filter_item in self.input_data.filters]
#            self.logger.debug("filter: %s" % (str(self.input_data.filters)))
#          for media in response.tags["media"]["data"]:
#            self.logger.debug("Media link: <<%s>>", media)
#            # instead of a pure url
#            if self.checkDOMElement(media):
#              res.append(media)
#              break
#            media = urlparse.urljoin(url, media)
#            for filter_type, filter_pattern in zip(filter_types, filter_patterns):
#              match = filter_pattern.match(media)
#              if filter_type == SiteFilter.TYPE_EXCLUDE and match:
#                break
#              if filter_type == SiteFilter.TYPE_INCLUDE and match:
#                res = self.checkMediaTag(media, res)
#                break
#            else:
#              self.logger.debug("media: %s", media)
#              self.logger.debug("url: %s", url)
#              res = self.checkMediaTag(media, res)
#
#          # If the media tag is empty after adjusting, remove it from the response
#          if not len(res):
#            self.logger.debug("media tag is empty. Remove media tag from response.")
#            del response.tags["media"]
#          else:
#            self.logger.debug("media tag is adjusted. Copy media tag to response.")
#            response.tags["media"]["data"] = res
#          # End of code block removing empty media tag
#        else:
#          self.logger.debug("resource hasn't template with media tag. adjustPartialReferences doesn't execute")
#      except Exception as err:
#        ExceptionLog.handler(self.logger, err, self.MSG_ERROR_ADJUST_PR, (err), \
#                             {ExceptionLog.LEVEL_NAME_ERROR: ExceptionLog.LEVEL_VALUE_DEBUG})
#    else:
#      self.logger.debug(">>> Response doesn't have link or media tag, no need to adjust media")


#  # # adjustTitle
#  #
#  def adjustTitle(self, response):
#    try:
#      if self.input_data.template and "title" in self.input_data.template:
#        self.logger.debug("resource has template with title tag. Try to adjust title.")
#        self.logger.debug("response.tags['title']: " + str(response.tags["title"]))
#        if not self.extractor:
#          if len(self.extractors) > 2:
#            self.extractor = self.extractors[2]
#          else:
#            raise Exception(" >>> Wrong! self.extractors list doesn't have 3'rd element (index 2)")
#        if isinstance(response.tags["title"], str):
#          self.logger.debug("response doesn't have title tag")
#          sel = Selector(text=self.input_data.raw_content)
#          title = sel.xpath("//title/text()").extract()
#          self.extractor.addTag(result=response, tag_name="title", tag_value=title, xpath="", \
#                                isDefaultTag=False, callAdjustment=True, tagType=None, allowNotFilled=True)
#        self.logger.debug("TYPE response.tags['title']['data']" + str(type(response.tags["title"]["data"])))
#      else:
#        self.logger.debug("resource hasn't template with title tag. Don't need adjust title.")
#    except Exception as err:
#      ExceptionLog.handler(self.logger, err, self.MSG_ERROR_ADJUST_TITLE, (err), \
#                           {ExceptionLog.LEVEL_NAME_ERROR: ExceptionLog.LEVEL_VALUE_DEBUG})


#  # # adjustLinkURL
#  #
#  def adjustLinkURL(self, response):
#    flag = False
#    try:
#      if response.tags and "link" in response.tags:
#        self.logger.debug("resource has template with link tag. Try to adjust link.")
#        self.logger.debug("response.tags['link']: " + str(response.tags["link"]))
#        self.logger.debug("self.extractor: %s", str(self.extractor))
#        flag = True
#        if self.extractor:
#          self.logger.debug("Extractor exists")
#          if isinstance(response.tags["link"], str):
#            self.logger.debug("response doesn't have link tag")
#            self.extractor.addTag(result=response, tag_name="link", tag_value=[self.input_data.url], xpath="", \
#                                  isDefaultTag=False, callAdjustment=True, tagType=None, allowNotFilled=True)
#          # bypass
#          else:
#            response.tags["link"]["data"] = self.input_data.url
#        else:
#          if len(self.extractors) > 2:
#            self.extractors[2].addTag(result=response, tag_name="link", tag_value=[self.input_data.url], xpath="", \
#                                      isDefaultTag=False, callAdjustment=True, tagType=None, allowNotFilled=True)
#          else:
#            self.logger.debug(">>> Wrong! self.extractors list doesn't have 3'rd element (index 2)")
#        self.logger.debug("TYPE response.tags['link']['data']" + str(type(response.tags["link"]["data"])))
#      else:
#        self.logger.debug("resource hasn't template with link tag. Don't need adjust link.")
#    except Exception as err:
#      ExceptionLog.handler(self.logger, err, self.MSG_ERROR_ADJUST_LINK_URL, (err), \
#                           {ExceptionLog.LEVEL_NAME_ERROR: ExceptionLog.LEVEL_VALUE_DEBUG})
#
#    return flag


#  # # Normalize datetime tags procedure
#  #
#  # @param response - scraper response instance
#  # @param algorithmName - algorithm name
#  # @return - 'pubdate tag value'
#  def normalizeDatetime(self, response, algorithmName):
#    ret = None
#    timezone = ''
#    try:
#      if response is not None and response.tags is not None:
#        self.logger.debug("normalizeDatetime scraper response: " + varDump(response))
#        tagNames = []
#        if self.input_data.template and algorithmName == CONSTS.PROCESS_ALGORITHM_REGULAR:
#          # template
#          for responseType in self.datetimeTemplateTypes:
#            for responseTagName in response.tags:
#              self.logger.debug("normalizeDatetime responseTagName: '" + str(responseTagName) + "'")
#              if (responseTagName in response.tags and \
#                  response.tags[responseTagName] is not None and \
#                  response.tags[responseTagName].has_key('type') and \
#                  response.tags[responseTagName]['type'] == responseType) or \
#                  (responseTagName in response.tags and response.tags[responseTagName] is not None and \
#                  responseTagName == CONSTS.TAG_PUB_DATE):
#                tagNames.append(responseTagName)
#
#        self.logger.debug('normalizeDatetime tagNames: ' + varDump(tagNames))
#        retDict = {}
#        for tagName in tagNames:
#          pubdate, tzone = self.extractPubDate(response, tagName)  # , properties, urlString)
#          if self.extractor and tagName in response.tags:
#            self.extractor.addTag(result=response, tag_name=tagName + '_normalized', tag_value=pubdate, \
#                                  xpath=response.tags[tagName]['xpath'], isDefaultTag=False, \
#                                  callAdjustment=True, tagType=None, allowNotFilled=True)
#
#          self.logger.debug('tagName: ' + str(tagName) + ' pubdate: ' + str(pubdate))
#          retDict[tagName] = pubdate
#
#          if tagName == CONSTS.TAG_PUB_DATE:
#            ret = pubdate
#            timezone = tzone
#
#        if ret is None:
#          for key, value in retDict.items():
#            if value is not None:
#              ret = value
#              self.logger.debug('set return value from ' + str(key) + ' : ' + str(value))
#              break
#
#    except Exception, err:
#      ExceptionLog.handler(self.logger, err, 'normalizeDatetime error:', (), \
#                           {ExceptionLog.LEVEL_NAME_ERROR: ExceptionLog.LEVEL_VALUE_DEBUG})
#
#    return ret, timezone


#  # # Extract pubdate
#  #
#  # @param response - response instance
#  # @param dataTagName - tag name for extracting
#  # @param properties - properties from PROCESSOR_PROPERTIES
#  # @param urlString - url string value
#  # @return pubdate if success or None
#  def extractPubDate(self, response, dataTagName):  # , properties, urlString):
#    # variable for result
#    ret = None
#    timezone = ''
#    try:
#      if response is not None and dataTagName in response.tags and response.tags[dataTagName] != "":
#
#        self.logger.debug("extractPubDate response: " + varDump(response))
#
#        if dataTagName in response.tags and response.tags[dataTagName] is not None:
#          inputData = response.tags[dataTagName]["data"]
#          self.logger.debug("extractPubDate response has '" + str(dataTagName) + "' is: " + str(inputData))
#          self.logger.debug("extractPubDate type of '" + str(dataTagName) + "' is: " + str(type(inputData)))
#
#          inputList = []
#          if isinstance(inputData, str) or isinstance(inputData, unicode):
#            inputList = [inputData]
#          elif isinstance(inputData, list):
#            inputList = inputData
#
#          pubdate = []
#          timezones = []
#          for inputElem in inputList:
#            d = DateTimeType.parse(inputElem, bool(self.useCurrentYear), self.logger, False)
#            self.logger.debug('pubdate: ' + str(d))
#
#            if d is not None:
#              d, tzone = DateTimeType.split(d)
#              pubdate.append(d.isoformat(DateTimeType.ISO_SEP))
#              timezones.append(tzone)
#
#          self.logger.debug("extractPubDate result pubdate: " + str(pubdate))
#          response.tags[dataTagName]["data"] = pubdate
#          if len(pubdate) > 0:
#            ret = pubdate[0]
#
#          if len(timezones) > 0:
#            timezone = timezones[0]
#
#    except Exception, err:
#      ExceptionLog.handler(self.logger, err, 'extractPubDate error:', (), \
#                           {ExceptionLog.LEVEL_NAME_ERROR: ExceptionLog.LEVEL_VALUE_DEBUG})
#
#    return ret, timezone


#  # # pubdate transformation using the timezone value
#  #
#  # @param rawPubdate - raw pubdate string
#  # @param rawTimezone - raw timezone string
#  # @param properties - properties from PROCESSOR_PROPERTIES
#  # @param urlString - url string value
#  # @return pubdate and timezone if success or None and empty string
#  def pubdateTransform(self, rawPubdate, rawTimezone, properties, urlString):
#    # variables for result
#    pubdate = rawPubdate
#    timezone = rawTimezone
#
#    self.logger.debug('properties: ' + varDump(properties))
#    if CONSTS.PDATE_TIMEZONES_NAME in properties:
#      propertyString = properties[CONSTS.PDATE_TIMEZONES_NAME]
#      self.logger.debug('inputted ' + CONSTS.PDATE_TIMEZONES_NAME + ':' + str(propertyString))
#
#      dt = DateTimeType.parse(rawPubdate, bool(self.useCurrentYear), self.logger, False)
#      self.logger.debug('pubdate: ' + str(dt))
#      if dt is not None:
#        # get utc offset if necessary
#        utcOffset = DateTimeType.extractUtcOffset(rawTimezone, self.logger)
#        self.logger.debug('utcOffset: ' + str(utcOffset))
#        # transformation according to PDATE_TIMEZONES properties
#        d = PDateTimezonesHandler.transform(dt, utcOffset, propertyString, urlString, self.logger)
#        if d is not None:
#          dt = d
#
#      if dt is not None:
#        d, tzone = DateTimeType.split(dt)
#        pubdate = d.isoformat(DateTimeType.ISO_SEP)
#        timezone = tzone
#
#    return pubdate, timezone


  # # refineBadDateTags deletes datetime tags with bad datetime values from the result
  #
  def refineBadDateTags(self, response):
    removeKeys = []
    for key in response.tags:
      if key in self.DATA_NEWS_TAGS:
        tagsValue = None

        if isinstance(response.tags[key], str) or isinstance(response.tags[key], unicode):
          tagsValue = response.tags[key]
        elif isinstance(response.tags[key], dict) and "data" in response.tags[key]:
          if isinstance(response.tags[key]["data"], str) or isinstance(response.tags[key]["data"], unicode):
            tagsValue = response.tags[key]["data"]
          elif isinstance(response.tags[key]["data"], list) and len(response.tags[key]["data"]) > 0 and \
              (isinstance(response.tags[key]["data"][0], str) or isinstance(response.tags[key]["data"][0], unicode)):
            tagsValue = response.tags[key]["data"][0]

        if tagsValue is not None:
          try:
            dt = parser.parse(tagsValue)
            int(time.mktime(dt.timetuple()))
          except Exception:
            removeKeys.append(key)

    for key in removeKeys:
      if key in response.tags:
        self.logger.debug(">>> Remove " + key + " element because its value is a bad datetime")
        del response.tags[key]


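  # A minimal illustration (hypothetical tag payloads): with
  # response.tags = {CONSTS.TAG_DC_DATE: {"data": ["not a date"]}} the value
  # fails dateutil parsing, so refineBadDateTags() drops the tag; a parsable
  # value such as "2015-03-01 12:00:00" is kept unchanged.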
  # # pre-parse response: normalize string tag data to one-element lists
  #
  # @param response - scraper response instance
  # @return - None
  def preparseResponse(self, response):
    self.logger.debug('>>> preparseResponse enter <<<')

    for key in response.tags:
      if response.tags[key] is not None:
        if "data" in response.tags[key]:
          if isinstance(response.tags[key]["data"], str) or isinstance(response.tags[key]["data"], unicode):
            localStr = response.tags[key]["data"]

            self.logger.debug('-----------------------------------------')
            self.logger.debug('key: ' + str(key) + ' => ' + str(localStr))
            self.logger.debug('-----------------------------------------')

            response.tags[key]["data"] = []
            response.tags[key]["data"].append(localStr)

            self.logger.debug('response.tags[key]["data"]: ' + str(response.tags[key]["data"]))
            self.logger.debug('-----------------------------------------')


  # # format a single output element according to the output format
  #
  # @param elem - element value to format
  # @param localOutputFormat - output format name ("json", "html" or "sql")
  # @return formatted element
  def formatOutpuElement(self, elem, localOutputFormat):
    ret = elem
    if localOutputFormat == "json":
      # self.logger.debug(">>> JSON HTML = " + elem)
      localStr = json.dumps(elem, ensure_ascii=False)
      if localStr[0] == '\"' or localStr[0] == '\'':
        localStr = localStr[1:]
      if localStr[-1] == '\"' or localStr[-1] == '\'':
        localStr = localStr[0:-1]
      ret = localStr
      # self.logger.debug(">>> JSON HTML = " + ret)
    elif localOutputFormat == "html":
      ret = xml.sax.saxutils.escape(elem, {"'": "&apos;", "\"": "&quot;"})
    elif localOutputFormat == "sql":
      # ret = mdb.escape_string(elem)  # pylint: disable=E1101
      ret = Utils.escape(elem)
    return ret


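  # A minimal illustration of per-format escaping (hypothetical input):
  #   formatOutpuElement('say "hi"', "json") returns the JSON-escaped body
  #   say \"hi\" with the surrounding quotes stripped;
  #   formatOutpuElement('say "hi"', "html") returns say &quot;hi&quot;.
  # For "sql" the value is passed through Utils.escape().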
  # # format all tag data of the response according to the output format
  #
  # @param response - scraper response instance
  # @param localOutputFormat - output format name
  # @return - None
  def formatOutputData(self, response, localOutputFormat):
    # result.tags[key]["data"]
    for key in response.tags:
      if response.tags[key] is not None:
        if "data" in response.tags[key]:
          if isinstance(response.tags[key]["data"], list):
            for i, elem in enumerate(response.tags[key]["data"]):
              response.tags[key]["data"][i] = self.formatOutpuElement(elem, localOutputFormat)
          elif isinstance(response.tags[key]["data"], str) or isinstance(response.tags[key]["data"], unicode):
            response.tags[key]["data"] = self.formatOutpuElement(response.tags[key]["data"], localOutputFormat)


  # # template extraction processing
  #
  # @param config - config parser
  # @param urlHost - domain name
  # @return resultsList - list of Result
  def templateExtraction(self, config, urlHost):
    self.extractor = ScrapyExtractor(config, self.input_data.template, urlHost)
    sel = Selector(text=self.input_data.raw_content)
    if isinstance(self.input_data.template, dict):
      template = self.input_data.template
    else:
      # template = ast.literal_eval(self.input_data.template)
      # TODO: strange potential backdoor for malicious code, cancelled by bgv
      pass

    # Calculate mandatory properties for existing tags
    mandatoryTags = {}
    for key, value in template.items():
      isMandatory = True
      self.logger.debug(">>> Calculate mandatory for '" + str(key) + "'")
      for elem in value:
        self.logger.debug(">>> mandatory = " + str(elem["mandatory"]) + " type: " + str(type(elem["mandatory"])))
        if bool(elem["mandatory"]) is False:
          isMandatory = False
          continue

      mandatoryTags[key] = isMandatory

    self.logger.debug(">>> Calculated mandatoryTags: " + varDump(mandatoryTags))

    scraperDocs = ScraperResultDocuments(template.keys(), self.input_data.urlId)

    # Add End
    for key in template:
      self.logger.debug(">>> Template key: " + key)
      if "state" in template[key] and not bool(int(template[key]["state"])):
        self.logger.debug(">>> Template disabled: template name = " + str(key))
        continue
      for path in template[key]:
        if not isinstance(path, dict):
          self.logger.debug(">>> WARNING path not DICT type ")
          continue

        isExtract = True
        localResult = Result(None, self.input_data.urlId)
        # Added new template format conversion
        xpath = None
        xpathValue = None

        # Logging xPath trees
        self.logger.debug(">>> Logging xPath trees for key: '" + str(key) + "'")
        etrees = sel.xpath(path['target'])
        for etree in etrees:

          self.logger.debug(">>> etree: " + varDump(etree))
          if isinstance(etree._root, basestring):  # pylint: disable=W0212
            continue

          etreeValue = self.get_path(etree._root)  # pylint: disable=W0212
          self.logger.debug('>>> etreeValue: ' + varDump(etreeValue))
          scraperDocs.addEtree(key, copy.deepcopy(etreeValue))

        # Added new template type specification
        xPathPreparing = TemplateExtractorXPathPreparing(
          self.properties[CONSTS.TAG_MARKUP_PROP_NAME] if CONSTS.TAG_MARKUP_PROP_NAME in self.properties else None)

        self.logger.debug(">>> xPathPreparing: " + varDump(xPathPreparing))
        self.logger.debug(">>> path: " + varDump(path))
        self.logger.debug(">>> sel: " + varDump(sel))

        self.logger.debug(">>> self.properties: " + varDump(self.properties))
        # Added new template type specification
        self.xpathSplitString = xPathPreparing.resolveDelimiter(path, self.properties, self.xpathSplitString)
        innerDelimiter = xPathPreparing.resolveInnerDelimiter(path, self.properties)
        self.logger.debug(">>> xpathSplitString: '" + str(self.xpathSplitString) + "'")
        self.logger.debug(">>> innerDelimiter: '" + str(innerDelimiter) + "'")
        try:
          xpath, xpathValue = xPathPreparing.process(path, sel, self.xpathSplitString, innerDelimiter,
                                                     Utils.innerTextToList)
        except Exception, err:
          ExceptionLog.handler(self.logger, err, "some rule/xpath exception:", (), \
                               {ExceptionLog.LEVEL_NAME_ERROR: ExceptionLog.LEVEL_VALUE_DEBUG})
          continue

        self.logger.debug(">>> xpathValue " + str(type(xpathValue)) + " " + str(xpathValue))
        self.logger.debug(">>> xpath: %s" % str(xpath))
        if (isinstance(xpathValue, list) and len(xpathValue) == 0) or \
           (isinstance(xpathValue, basestring) and xpathValue == ''):
          self.logger.debug(">>> set default xpathValue")
          xpathValue = []
          xpathValue.append(path["default"])
          isExtract = False

        if not isinstance(xpathValue, list):
          xpathValue = [xpathValue]

        for xpathElem in xpathValue:
          elemResult = copy.deepcopy(localResult)
          self.logger.debug("result before:\n%s", varDump(elemResult))
          self.extractor.addTag(result=elemResult, tag_name=key, tag_value=xpathElem, xpath=xpath,
                                isDefaultTag=(not isExtract), callAdjustment=False, tagType=path["type"],
                                allowNotFilled=True)

          self.logger.debug("result after:\n%s", varDump(elemResult))

          self.logger.debug(">>> tag type = " + str(type(elemResult.tags)))
          self.logger.debug(">>> tags data type = " + str(type(elemResult.tags[key]["data"])))

          if key in elemResult.tags and isinstance(elemResult.tags[key]["data"], basestring):
            self.logger.debug(">>> Convert result = " + str(key))
            localString = elemResult.tags[key]["data"]
            elemResult.tags[key]["data"] = []
            elemResult.tags[key]["data"].append(localString)

          if isExtract and "postProcessing" in path and path["postProcessing"] is not None and \
              path["postProcessing"] != "":
            self.applyPostProcessing(elemResult, key, path["postProcessing"])

          self.logger.debug("scraperDocs.addDoc key: " + str(key) + ' mandatory = ' + varDump(mandatoryTags[key]))

          scraperDocs.addDoc(key, elemResult, path["join"], isExtract,
                             (bool(path["mandatory"]) if "mandatory" in path else False))

    # for response
    resultsList = []
    resultDocs = scraperDocs.getAllDocs(mandatoryTags, self.logger)

    for elem in resultDocs:
      result = Result(None, self.input_data.urlId)
      # Add tag 'order_number'
      self.addCustomTag(result=result, tag_name=CONSTS.TAG_ORDER_NUMBER, \
                        tag_value=str(elem[CONSTS.TAG_ORDER_NUMBER]))
      # Add tag 'source_url'
      self.addCustomTag(result=result, tag_name=CONSTS.TAG_SOURCE_URL, \
                        tag_value=[self.input_data.url])

      # Prepare result
      prepareResultsList = self.prepareResults([elem])
      self.compileResults(result, prepareResultsList, key, xPathPreparing)
      result.finish = time.time()
      resultsList.append(copy.deepcopy(result))

    return resultsList


#  # # Add custom tag
#  #
#  # @param result - Scraper result instance
#  # @param tag_name - name of tag
#  # @param tag_value - value of tag
#  # @return - None
#  def addCustomTag(self, result, tag_name, tag_value):
#    data = {"extractor": "Base extractor", "data": "", "name": ""}
#    data["data"] = tag_value
#    data["name"] = tag_name
#    data["xpath"] = None
#    data["type"] = None
#    data["extractor"] = self.__class__.__name__
#    result.tags[tag_name] = data


#  def compileResults(self, result, resultsList, key, xPathPreparing=None):
#    for elem in resultsList:
#      if key in result.tags:
#        if result.tags[key] is not None:
#          if result.tags[key]["xpath"] is None:
#            result.tags[key]["xpath"] = elem["obj"].tags[key]["xpath"]
#          else:
#            result.tags[key]["xpath"] += ' '
#            result.tags[key]["xpath"] += elem["obj"].tags[key]["xpath"]
#          if result.tags[key]["data"] is None or len(result.tags[key]["data"]) == 0:
#            result.tags[key]["data"] = elem["obj"].tags[key]["data"]
#          else:
#            if xPathPreparing is not None:
#              self.xpathSplitString = xPathPreparing.resolveDelimiter(elem, self.properties, self.xpathSplitString)
#              result.tags[key]["data"][0] += self.xpathSplitString
#            else:
#              result.tags[key]["data"][0] += ' '
#            result.tags[key]["data"][0] += elem["obj"].tags[key]["data"][0]
#      else:
#        result.tags.update(elem["obj"].tags)


#  def prepareResults(self, resultsList):
#    ret = []
#    if len(resultsList) > 0:
#      localElemWeight = 0
#      firstElemWeight = 0
#      firstElem = None
#      tempList = []
#      for elem in resultsList:
#        localElemWeight = 0
#        if elem["join"] == "concat":
#          tempList.append(elem)
#        else:
#          if elem["mandatory"]:
#            # >>> Mandatory breaking block -------------
#            if not elem["isExtract"]:
#              return []
#            # -------------
#            localElemWeight = localElemWeight | CONSTS.TAGS_RULES_MASK_MANDATORY_FIELD
#          if elem["join"] == "best":
#            localElemWeight = localElemWeight | CONSTS.TAGS_RULES_MASK_RULE_PRIORITY
#          if elem["isExtract"]:
#            localElemWeight = localElemWeight | CONSTS.TAGS_RULES_MASK_DEFAULT_VALUE
#
#          self.logger.debug(">>> Rule weight = " + str(localElemWeight))
#          self.logger.debug(">>> Rule join = " + elem["join"])
#          if localElemWeight > firstElemWeight:
#            firstElemWeight = localElemWeight
#            firstElem = elem
#
#      if firstElem is not None:
#        tempList = [firstElem] + tempList
#      isExtractResults = any([elem["isExtract"] for elem in tempList])
#      if isExtractResults:
#        ret = [elem for elem in tempList if elem["isExtract"]]
#      else:
#        ret.append(tempList[0])
#    return ret


  # # apply a post-processing regular expression to the extracted tag value
  #
  # @param result - Result instance to update in place
  # @param key - tag name
  # @param postProcessingRE - regular expression applied via findall()
  # @return - None
  def applyPostProcessing(self, result, key, postProcessingRE):
    if key in result.tags and "data" in result.tags[key] and result.tags[key]["data"] is not None and \
        len(result.tags[key]["data"]) > 0:
      try:
        matchingVal = re.compile(postProcessingRE)
      except re.error as err:
        self.logger.debug(">>> RE error = " + str(err))
        self.errorMask = self.errorMask | APP_CONSTS.ERROR_RE_ERROR
      else:
        tmpStr = ""
        matchingResult = matchingVal.findall(result.tags[key]["data"][0])
        if matchingResult is not None:
          for elem in matchingResult:
            if isinstance(elem, str) or isinstance(elem, unicode):
              tmpStr += str(elem)
              tmpStr += ' '
            else:
              for innerElem in elem:
                if innerElem is not None and innerElem != '':
                  tmpStr += str(innerElem)
                  tmpStr += ' '
        tmpStr = tmpStr.strip()
        if tmpStr != "":
          self.logger.debug(">>> Replace value, prev. value is = " + result.tags[key]["data"][0])
          result.tags[key]["data"][0] = tmpStr
        else:
          # Set not detected value if no match, changed default behavior by bgv
          result.tags[key]["data"][0] = None


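  # A minimal illustration (hypothetical data): with
  # result.tags['pubdate']['data'] == ['Published 2015-03-01'] and the
  # postprocessing expression r'\d+', findall() yields ['2015', '03', '01'],
  # so the stored value becomes '2015 03 01'; when nothing matches, the value
  # is reset to None.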
  # # build the processed content structure from a Result
  #
  # @param result - Result instance
  # @return processedContent - dictionary with 'default', 'internal' and 'custom' parts
  def getProcessedContent(self, result):
    result.get()
    processedContent = {}
    processedContent["default"] = result
    processedContent["internal"] = [result]
    processedContent["custom"] = []

    if "pubdate" in result.tags and "data" in result.tags["pubdate"] and \
        len(result.tags["pubdate"]["data"]) > 0:
      self.pubdate = result.tags["pubdate"]["data"]
      self.logger.debug('>>>> Set self.pubdate = ' + str(self.pubdate))

    return processedContent


#  # # Internal method of url's domain crc calculating
#  #
#  # @param url - incoming url
#  def calcUrlDomainCrc(self, url):
#    urlHost = None
#    auth = urlparse.urlsplit(url)[1]
#    if auth is not None:
#      urlHost = (re.search('([^@]*@)?([^:]*):?(.*)', auth).groups())[1]
#      if urlHost is not None and urlHost.find(self.WWW_PREFIX) == 0:
#        urlHost = urlHost[len(self.WWW_PREFIX): len(urlHost)]
#
#    return urlHost


1451  # # The main processing of the batch object
1452  #
1453  # @param config - config parser
1454  # @return None
1455  def process(self, config):
1456 
1457  # check recieved input data accord to protocol
1458  self.__checkInputData(self.input_data)
1459 
1460  self.logger.info('Start processing on BatchItem from Batch: ' + str(self.input_data.batchId))
1461 
1462  # fill profiler message list
1463  self.__fillProfilerMessageList(self.input_data)
1464  self.logger.debug("self.inputData:\n%s", varDump(self.input_data))
1465 
1466  # get output data format
1467  self.outputFormat = self.__getOutputFormat(self.input_data)
1468 
1469  # get alt tags mask as property from input data
1470  altTagsMask = self.__getAltTagsMask(self.input_data)
1471 
1472  # get property from input data and use in valid case
1473  properties = self.__getPropertiesFromInputData(self.input_data)
1474  if properties is not None:
1475  self.properties = properties
1476 
1477  algorithmName = self.properties[CONSTS.ALGORITHM_KEY][CONSTS.ALGORITHM_NAME_KEY]
1478 
1479  self.logger.debug("Algorithm : %s" % algorithmName)
1480  if self.usageModel == APP_CONSTS.APP_USAGE_MODEL_PROCESS:
1481  Utils.storePickleOnDisk(self.input_data, self.ENV_SCRAPER_STORE_PATH, "scraper.in." + \
1482  str(self.input_data.urlId))
1483 
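 # redirect stdout to /dev/null; the original stream is restored below and, in the
 # process usage model, the pickled scraper response is printed to it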
1484  tmp = sys.stdout
1485  sys.stdout = open("/dev/null", "wb")
1486 
1487  # initialization of scraper
1488  # load scraper's modules
1489 
1490 
1491  urlHost = self.calcUrlDomainCrc(self.input_data.url)
1492  self.logger.info('urlHost: ' + str(urlHost))
1493 
1494  self.extractors = self.__loadExtractors(algorithmName, config, urlHost)
1495 
1496 
1497  # log info input data
1498  self.logger.info("input_data url: %s, urlId: %s, siteId: %s", str(self.input_data.url), str(self.input_data.urlId),
1499  str(self.input_data.siteId))
1500  # self.logger.debug("input_data:\n" + varDump(self.input_data))
1501 
1502  # self.logger.debug("Initialization pubdate from urlObj.pDate use value: %s",
1503  # str(self.input_data.batch_item.urlObj.pDate))
1504  # self.pubdate = self.input_data.batch_item.urlObj.pDate
1505 
1506  # get iterator to ranked list of extractors
1507  self.itr = iter(sorted(self.extractors, key=lambda extractor: extractor.rank, reverse=True))
1508  self.logger.debug("Extractors: %s" % varDump(self.itr))
1509 
1510  # Reconfigure processor's properties to involve only template scraper
1511  responses = self.templateExtraction(config, urlHost)
1512 
1513  if CONSTS.MEDIA_LIMITS_NAME in self.input_data.batch_item.properties:
1514  self.logger.debug("Found property '%s'", str(CONSTS.MEDIA_LIMITS_NAME))
1515  self.mediaLimitsHandler = MediaLimitsHandler(self.input_data.batch_item.properties[CONSTS.MEDIA_LIMITS_NAME])
1516 
1517  # variable for result
1518  scraperResponseList = []
1519  for response in responses:
1520  if response is not None:
1521  response.stripResult()
1522 
1523  # put extracted article to the db
1524  if algorithmName != CONSTS.PROCESS_ALGORITHM_REGULAR:
1525  self.adjustTitle(response)
1526  self.adjustLinkURL(response)
1527  self.adjustPartialReferences(response)
1528  self.logger.debug("PDate: %s" % str(self.input_data.batch_item.urlObj.pDate))
1529  self.logger.debug("PDate type: %s" % str(type(self.input_data.batch_item.urlObj.pDate)))
1530 
1531 
1532  self.preparseResponse(response)
1533 
1534  self.logger.debug('>>>>> self.properties = ' + varDump(self.properties))
1535 
1536  # Set pubdate depending on the configured source masks
1537  # default values
1538  pdateSourceMask = APP_CONSTS.PDATE_SOURCES_MASK_BIT_DEFAULT
1539  pdateSourceMaskOverwrite = APP_CONSTS.PDATE_SOURCES_MASK_OVERWRITE_DEFAULT
1540 
1541  # get value 'PDATE_SOURCES_MASK' from site properties
1542  if APP_CONSTS.PDATE_SOURCES_MASK_PROP_NAME in self.input_data.batch_item.properties:
1543  pdateSourceMask = int(self.input_data.batch_item.properties[APP_CONSTS.PDATE_SOURCES_MASK_PROP_NAME])
1544 
1545  # get value 'PDATE_SOURCES_MASK_OVERWRITE' from site properties
1546  if APP_CONSTS.PDATE_SOURCES_MASK_OVERWRITE_PROP_NAME in self.input_data.batch_item.properties:
1547  pdateSourceMaskOverwrite = \
1548  int(self.input_data.batch_item.properties[APP_CONSTS.PDATE_SOURCES_MASK_OVERWRITE_PROP_NAME])
1549 
1550  self.logger.debug('pdateSourceMask = %s, pdateSourceMaskOverwrite = %s',
1551  str(pdateSourceMask), str(pdateSourceMaskOverwrite))
1552 
1553  self.logger.debug("!!! self.input_data.batch_item.urlObj.pDate = " + str(self.input_data.batch_item.urlObj.pDate))
1554 
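 # The same selection rule applies to every pubdate source below: a source is
 # considered only when its bit is set in pdateSourceMask; it is then applied
 # unconditionally when its overwrite bit is unset, or only while no pubdate has
 # been found yet when its overwrite bit is set. E.g. (hypothetical bit values)
 # pdateSourceMask = 0b0101 with pdateSourceMaskOverwrite = 0b0100 makes the
 # bit-0 source overwrite freely and the bit-2 source only fill a missing value.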
1555  timezone = ''
1556  # URL object "pdate" field (supposedly obtained from the RSS feed)
1557  if pdateSourceMask & APP_CONSTS.PDATE_SOURCES_MASK_RSS_FEED:
1558  if (pdateSourceMaskOverwrite & APP_CONSTS.PDATE_SOURCES_MASK_RSS_FEED and self.pubdate is None) or \
1559  not pdateSourceMaskOverwrite & APP_CONSTS.PDATE_SOURCES_MASK_RSS_FEED:
1560  self.pubdate, timezone = self.extractPubdateRssFeed(self.input_data.siteId, self.input_data.url)
1561 
1562  # Normalization procedure after the scraping, based on the 'dc_date' tag for the NEWS or TEMPLATE scraping.
1563  if CONSTS.TAG_DC_DATE in response.tags and pdateSourceMask & APP_CONSTS.PDATE_SOURCES_MASK_DC_DATE:
1564  if (pdateSourceMaskOverwrite & APP_CONSTS.PDATE_SOURCES_MASK_DC_DATE and self.pubdate is None) or \
1565  not pdateSourceMaskOverwrite & APP_CONSTS.PDATE_SOURCES_MASK_DC_DATE:
1566  if CONSTS.TAG_PUB_DATE not in response.tags or \
1567  (isinstance(response.tags[CONSTS.TAG_PUB_DATE]["data"], basestring) and \
1568  response.tags[CONSTS.TAG_PUB_DATE]["data"].strip() == ""):
1569  response.tags[CONSTS.TAG_PUB_DATE] = copy.deepcopy(response.tags[CONSTS.TAG_DC_DATE])
1570  response.tags[CONSTS.TAG_PUB_DATE]["name"] = CONSTS.TAG_PUB_DATE
1571  if len(response.tags[CONSTS.TAG_PUB_DATE]["data"]) > 0 and response.tags[CONSTS.TAG_PUB_DATE]["data"][0]:
1572  self.pubdate = response.tags[CONSTS.TAG_PUB_DATE]["data"][0]
1573  self.logger.debug("Pubdate from 'dc_date': " + str(self.pubdate))
1574 
1575  # Normalization procedure after the scraping, based on the "pubdate" tag for the NEWS or TEMPLATE scraping.
1576  timezone = ''
1577  if pdateSourceMask & APP_CONSTS.PDATE_SOURCES_MASK_PUBDATE:
1578  if (pdateSourceMaskOverwrite & APP_CONSTS.PDATE_SOURCES_MASK_PUBDATE and self.pubdate is None) or \
1579  not pdateSourceMaskOverwrite & APP_CONSTS.PDATE_SOURCES_MASK_PUBDATE:
1580  pubdate, timezone = self.normalizeDatetime(response, algorithmName)
1581  if pubdate is not None:
1582  self.pubdate = pubdate
1583  self.logger.debug("Pubdate from 'pubdate': " + str(self.pubdate))
1584 
1585  # Current date (SQL NOW())
1586  if pdateSourceMask & APP_CONSTS.PDATE_SOURCES_MASK_NOW:
1587  if (pdateSourceMaskOverwrite & APP_CONSTS.PDATE_SOURCES_MASK_NOW and self.pubdate is None) or \
1588  not pdateSourceMaskOverwrite & APP_CONSTS.PDATE_SOURCES_MASK_NOW:
1589  self.pubdate = SQLExpression("NOW()")
1590  self.logger.debug("Pubdate from 'SQL NOW()': " + str(self.pubdate))
1591 
1592  # Custom SQL expression defined in the property PDATE_SOURCES_EXPRESSION
1593  if pdateSourceMask & APP_CONSTS.PDATE_SOURCES_MASK_SQL_EXPRESSION and \
1594  APP_CONSTS.PDATE_SOURCES_EXPRESSION_PROP_NAME in self.properties:
1595  if (pdateSourceMaskOverwrite & APP_CONSTS.PDATE_SOURCES_MASK_SQL_EXPRESSION and self.pubdate is None) or \
1596  not pdateSourceMaskOverwrite & APP_CONSTS.PDATE_SOURCES_MASK_SQL_EXPRESSION:
1597  self.pubdate = SQLExpression(str(self.properties[APP_CONSTS.PDATE_SOURCES_EXPRESSION_PROP_NAME]))
1598  self.logger.debug("Pubdate from 'sql expression': " + str(self.pubdate))
1599 
1600  # Apply property 'PDATE_DAY_MONTH_ORDER'
1601  self.pubdate = self.pubdateMonthOrder(self.pubdate, self.input_data.batch_item.properties, self.input_data.url)
1602 
1603  # Apply property 'PDATE_TIME'
1604  self.input_data.batch_item.urlObj.pDate = self.pubdate
1605  self.pubdate = FieldsSQLExpressionEvaluator.evaluatePDateTime(self.input_data.batch_item.properties,
1606  self.dbWrapper,
1607  self.input_data.batch_item.urlObj,
1608  self.logger,
1609  self.pubdate)
1610 
1611  # Apply property 'PDATE_TIMEZONES'
1612  self.pubdate, timezone = self.pubdateTransform(self.pubdate,
1613  timezone,
1614  self.input_data.batch_item.properties,
1615  self.input_data.url)
1616 
1617  # Add tag 'pubdate_tz'
1618  self.addCustomTag(result=response, tag_name=CONSTS.TAG_PUBDATE_TZ, tag_value=[timezone])
1619 
1620  if "pubdate" in response.tags and "data" in response.tags["pubdate"] and \
1621  len(response.tags["pubdate"]["data"]) > 0:
1622  response.tags["pubdate"]["data"][0] = self.pubdate
1623 
1624  if self.outputFormat is None:
1625  self.logger.debug(">>> Warning, can't extract output format")
1626  else:
1627  self.formatOutputData(response, self.outputFormat)
1628 
1629  response.recalcTagMaskCount(None, altTagsMask)
1630 
1631  self.logger.debug("response.tagsCount: " + str(response.tagsCount) + \
1632  " response.tagsMasks: " + str(response.tagsMask) + \
1633  "\n>>> Resp: " + varDump(response))
1634 
1635  # Get start and finish times
1636  startTime = 0
1637  if len(responses) > 0:
1638  startTime = responses[0].start
1639 
1640  finishTime = time.time()
1641  # recalculate spent time
1642  for response in responses:
1643  response.start = startTime
1644  response.finish = finishTime
1645  response.data["time"] = str(finishTime - startTime)
1646 
1647  response = self.applyHTTPRedirectLink(self.input_data.batch_item.siteId, self.input_data.batch_item.urlObj.url,
1648  self.input_data.batch_item.properties, response)
1649 
1650  # get processed content and append to list of scraper responses
1651  processedContent = self.getProcessedContent(response)
1652  scraperResponseList.append(ScraperResponse(response.tagsCount, response.tagsMask, self.pubdate, \
1653  processedContent, self.errorMask))
1654 
1655  self.logger.debug('len(scraperResponseList): ' + str(len(scraperResponseList)))
1656  self.logger.debug('maxURLsFromPage: ' + str(self.input_data.batch_item.urlObj.maxURLsFromPage))
1657 
1658  # check allowed limits
1659  if self.input_data.batch_item.urlObj.maxURLsFromPage is not None and \
1660  int(self.input_data.batch_item.urlObj.maxURLsFromPage) > 0 and \
1661  int(self.input_data.batch_item.urlObj.maxURLsFromPage) < len(scraperResponseList):
1662  self.logger.debug('>>> scraperResponseList length before truncation: ' + str(len(scraperResponseList)))
1663  scraperResponseList = scraperResponseList[0: int(self.input_data.batch_item.urlObj.maxURLsFromPage)]
1664  self.logger.debug('>>> scraperResponseList length after truncation: ' + str(len(scraperResponseList)))
1665  scraperResponseList[-1].errorMask |= APP_CONSTS.ERROR_MAX_URLS_FROM_PAGE
1666  self.logger.debug("Truncated scraper responses list because it exceeds the limit 'maxURLsFromPage' = " + \
1667  str(self.input_data.batch_item.urlObj.maxURLsFromPage) + ", set errorMask = " + \
1668  str(APP_CONSTS.ERROR_MAX_URLS_FROM_PAGE))
1669 
1670  # restore stdout to send the response
1671  sys.stdout = tmp
1672 
1673  # output result of scraping
1674  if self.usageModel == APP_CONSTS.APP_USAGE_MODEL_PROCESS:
1675  output_pickled_object = pickle.dumps(scraperResponseList)
1676  Utils.storePickleOnDisk(output_pickled_object, self.ENV_SCRAPER_STORE_PATH,
1677  "scraper.out." + str(self.input_data.urlId))
1678  print output_pickled_object
1679  sys.stdout.flush()
1680  else:
1681  self.output_data = scraperResponseList
1682  self.logger.debug('self.output_data: ' + str(varDump(self.output_data)))
1683 
1684 
1685 # # get_path returns the element's xPath as a list of (tag, index) tuples
1686  #
1687  # @param etreeElement - element of etree
1688  # @param path - accumulated path list, None on the initial call
1689  # @return - resulting path: root tag name followed by (tag, index) tuples
1690  def get_path(self, etreeElement, path=None):
1691  if path is None:
1692  rpath = []
1693  else:
1694  rpath = path
1695 
1696  p = etreeElement.getparent()
1697  if p is not None:
1698  index = p.index(etreeElement) + 1
1699  rpath.insert(0, (etreeElement.tag, str(index)))
1700  return self.get_path(p, rpath)
1701  else:
1702  rpath.insert(0, etreeElement.tag)
1703  return rpath
1704 
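# A usage sketch of get_path() on a small lxml tree (hypothetical markup, kept
# commented out): every ancestor contributes a (tag, 1-based index) tuple and the
# root contributes its bare tag name.
#
# from lxml import etree
#
# root = etree.fromstring('<html><body><div/><div/></body></html>')
# secondDiv = root[0][1]
# print scraper.get_path(secondDiv) # scraper - an assumed instance of this class
# # -> ['html', ('body', '1'), ('div', '2')]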
1705 
1706 # # # Extract pubdate rss feed from header
1707 # #
1708 # # @param siteId - Site/Project ID
1709 # # @param url - url string
1710 # # @return pubdate from rss feed
1711 # def extractPubdateRssFeed(self, siteId, url):
1712 # # variable for result
1713 # pubdate = None
1714 # timezone = ''
1715 #
1716 # self.logger.debug('!!! extractPubdateRssFeed siteId: ' + str(siteId))
1717 # self.logger.debug('!!! extractPubdateRssFeed url: ' + str(url))
1718 # headerContent = self.getHeaderContent(siteId, url)
1719 # rawPubdate = self.getVariableFromHeaderContent(headerContent, CRAWLER_CONSTS.pubdateRssFeedHeaderName)
1720 #
1721 # self.logger.debug('!!! getVariableFromHeaderContent: ' + str(rawPubdate))
1722 # if rawPubdate is not None:
1723 # try:
1724 # dt = DateTimeType.parse(rawPubdate, True, self.logger, False)
1725 # if dt is not None:
1726 # dt, timezone = DateTimeType.split(dt)
1727 # pubdate = dt.strftime("%Y-%m-%d %H:%M:%S")
1728 #
1729 # if timezone == '':
1730 # timezone = '+0000'
1731 # except Exception, err:
1732 # self.logger.debug("Unsupported date format: <%s>, error: %s", str(rawPubdate), str(err))
1733 #
1734 # return pubdate, timezone
1735 
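# A rough equivalent of the normalization above using dateutil instead of the
# project's DateTimeType helper (an assumption, kept commented out), for an
# RSS-style raw pubdate:
#
# from dateutil import parser
#
# dt = parser.parse('Sun, 08 Feb 2015 16:28:00 +0200')
# timezone = dt.strftime('%z') or '+0000'
# pubdate = dt.replace(tzinfo=None).strftime('%Y-%m-%d %H:%M:%S')
# print pubdate, timezone # 2015-02-08 16:28:00 +0200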
1736 
1737 # # # Get header content
1738 # #
1739 # # @param siteId - Site/Project ID
1740 # # @param url - url string
1741 # # @return extracted header content
1742 # def getHeaderContent(self, siteId, url):
1743 # # variable for result
1744 # headerContent = None
1745 # urlContentObj = dc_event.URLContentRequest(siteId, url, \
1746 # dc_event.URLContentRequest.CONTENT_TYPE_RAW_LAST + \
1747 # dc_event.URLContentRequest.CONTENT_TYPE_RAW + \
1748 # dc_event.URLContentRequest.CONTENT_TYPE_HEADERS)
1749 #
1750 # rawContentData = self.dbWrapper.urlContent([urlContentObj])
1751 #
1752 # if rawContentData is not None and len(rawContentData) > 0:
1753 # if rawContentData[0].headers is not None and len(rawContentData[0].headers) > 0 and \
1754 # rawContentData[0].headers[0] is not None:
1755 # headerContent = rawContentData[0].headers[0].buffer
1756 #
1757 # return headerContent
1758 #
1759 #
1760 # # #Get variable from header content
1761 # #
1762 # # @param headerContent - header content
1763 # # @param name - variable name
1764 # # @param makeDecode - boolean flag necessary decode
1765 # # @return extracted value of the named header variable
1766 # def getVariableFromHeaderContent(self, headerContent, name, makeDecode=True):
1767 # # variable for result
1768 # ret = None
1769 #
1770 # header = ''
1771 # if makeDecode and headerContent is not None:
1772 # header = base64.b64decode(headerContent)
1773 #
1774 # headerList = header.split('\r\n')
1775 # self.logger.debug("headerList: " + varDump(headerList))
1776 #
1777 # for elem in headerList:
1778 # pos = elem.find(name + ':')
1779 # if pos > -1:
1780 # ret = elem.replace(name + ':', '').strip()
1781 # self.logger.debug("Found '" + name + "' has value: " + str(ret))
1782 # break
1783 #
1784 # return ret
1785 
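# A runnable sketch of the header lookup above (hypothetical header blob, kept
# commented out): decode the base64 buffer, split on CRLF and take the value
# following the requested name:
#
# import base64
#
# headerContent = base64.b64encode('HTTP/1.1 200 OK\r\nLocation: http://example.com/\r\n')
# for elem in base64.b64decode(headerContent).split('\r\n'):
#   if elem.find('Location:') > -1:
#     print elem.replace('Location:', '').strip() # http://example.com/
#     break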
1786 
1787 # # # change month order in pubdate if necessary
1788 # #
1789 # # @param rawPubdate - raw pubdate string in iso format. sample: '2016-02-07 16:28:00'
1790 # # @param properties - properties from PROCESSOR_PROPERTIES
1791 # # @param urlString - url string value
1792 # # @return pubdate with corrected day/month order on success, otherwise the input value unchanged
1793 # def pubdateMonthOrder(self, rawPubdate, properties, urlString):
1794 # # variables for result
1795 # pubdate = rawPubdate
1796 #
1797 # self.logger.debug('pubdateMonthOrder() enter... rawPubdate: ' + str(rawPubdate))
1798 # if CONSTS.PDATE_DAY_MONTH_ORDER_NAME in properties and isinstance(rawPubdate, basestring):
1799 # propertyObj = []
1800 # try:
1801 # self.logger.debug('inputted ' + CONSTS.PDATE_DAY_MONTH_ORDER_NAME + ':' + \
1802 # str(properties[CONSTS.PDATE_DAY_MONTH_ORDER_NAME]))
1803 # propertyObj = json.loads(properties[CONSTS.PDATE_DAY_MONTH_ORDER_NAME])
1804 # except Exception, err:
1805 # self.logger.error("Fail loads '%s', error: %s", str(CONSTS.PDATE_DAY_MONTH_ORDER_NAME), str(err))
1806 #
1807 # for propertyElem in propertyObj:
1808 # try:
1809 # if "pattern" not in propertyElem:
1810 # raise Exception('Property "pattern" not found')
1811 #
1812 # if "order" not in propertyElem:
1813 # raise Exception('Property "order" not found')
1814 #
1815 # pattern = str(propertyElem["pattern"])
1816 # order = int(propertyElem["order"])
1817 #
1818 # if re.search(pattern, urlString, re.UNICODE) is not None:
1819 # self.logger.debug("Pattern '%s' found in url: %s", str(pattern), str(urlString))
1820 #
1821 # dt = None
1822 # if order == 0: # means day follows month
1823 # dt = datetime.datetime.strptime(rawPubdate, "%Y-%d-%m %H:%M:%S")
1824 # elif order == 1: # means month follows day
1825 # dt = datetime.datetime.strptime(rawPubdate, "%Y-%m-%d %H:%M:%S")
1826 # else:
1827 # raise Exception("Unsupported value of 'order' == " + str(order))
1828 #
1829 # if dt is not None:
1830 # pubdate = dt.strftime("%Y-%m-%d %H:%M:%S")
1831 #
1832 # except Exception, err:
1833 # self.logger.error("Fail execution '%s', error: %s", str(CONSTS.PDATE_DAY_MONTH_ORDER_NAME), str(err))
1834 #
1835 # self.logger.debug('pubdateMonthOrder() leave... pubdate: ' + str(pubdate))
1836 #
1837 # return pubdate
1838 
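# A minimal sketch of the day/month reinterpretation above (hypothetical input,
# kept commented out): an ISO-looking date whose day and month arrived swapped is
# re-parsed with the swapped format and re-emitted normalized.
#
# import datetime
#
# rawPubdate = '2016-02-07 16:28:00' # actually July 2nd here
# dt = datetime.datetime.strptime(rawPubdate, '%Y-%d-%m %H:%M:%S')
# print dt.strftime('%Y-%m-%d %H:%M:%S') # '2016-07-02 16:28:00'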
1839 
1840 # # # Check media tag and append to list
1841 # #
1842 # # @param urlStringMedia - url string of media tag
1843 # # @param allowedUrls - list for accumulate allowed url strings (by validator and limits)
1844 # # @return allowedUrls list already accumulated allowed url strings
1845 # def checkMediaTag(self, urlStringMedia, allowedUrls):
1846 #
1847 # mediaUrls = self.splitMediaTagString(urlStringMedia)
1848 # for media in mediaUrls:
1849 # # Check if media is binary picture
1850 # if re.search(MediaLimitsHandler.BINARY_IMAGE_SEARCH_STR, media, re.UNICODE) is not None:
1851 # self.logger.debug("Tag 'media' has binary picture...")
1852 #
1853 # if self.mediaLimitsHandler is None:
1854 # allowedUrls.append(media)
1855 # else:
1856 # if self.mediaLimitsHandler.isAllowedLimits(urlString=media, binaryType=True):
1857 # allowedUrls.append(media)
1858 # else:
1859 # self.logger.debug("Binary media tag exceeds allowed limits. Skipped...")
1860 #
1861 # # Check if media content is a valid url
1862 # elif isValidURL(media):
1863 # self.logger.debug("Tag 'media' has valid url of picture...")
1864 # if self.mediaLimitsHandler is None:
1865 # allowedUrls.append(media)
1866 # else:
1867 # if self.mediaLimitsHandler.isAllowedLimits(media):
1868 # allowedUrls.append(media)
1869 # else:
1870 # self.logger.debug("Media tag exceeds allowed limits. Skipped. Url: %s", str(media))
1871 #
1872 # # Invalid url of 'media' tag
1873 # else:
1874 # self.logger.debug("Invalid url in tag 'media'... Url: %s", str(media))
1875 #
1876 # return allowedUrls
1877 #
1878 #
1879 # # # Split media tag string
1880 # #
1881 # # @param urlStringMedia - url string of media tag
1882 # # @return list urls extracted from string of media tag
1883 # def splitMediaTagString(self, urlStringMedia):
1884 # # variable for result
1885 # urls = []
1886 # # temporary string for replace in url string
1887 # REPLACE_STR = 'base64|'
1888 # if urlStringMedia.find(MediaLimitsHandler.BINARY_IMAGE_SEARCH_STR) > -1:
1889 # urlStringMedia = urlStringMedia.replace(MediaLimitsHandler.BINARY_IMAGE_SEARCH_STR, REPLACE_STR)
1890 # urls = urlStringMedia.split(',')
1891 # self.logger.debug("!!! urls before: " + varDump(urls))
1892 # urls = [url.replace(REPLACE_STR, MediaLimitsHandler.BINARY_IMAGE_SEARCH_STR) for url in urls]
1893 # self.logger.debug("!!! urls after: " + varDump(urls))
1894 # else:
1895 # urls = urlStringMedia.split(',')
1896 #
1897 # return urls
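# A short sketch of the media tag splitting above (hypothetical tag value and an
# assumed 'base64,' value for BINARY_IMAGE_SEARCH_STR; kept commented out): the
# inline-image marker is shielded before splitting the comma separated url list
# and restored afterwards.
#
# BINARY_IMAGE_SEARCH_STR = 'base64,' # assumption, see MediaLimitsHandler
# REPLACE_STR = 'base64|'
#
# tag = 'data:image/png;base64,iVBORw0KGgo=,http://example.com/a.jpg'
# urls = tag.replace(BINARY_IMAGE_SEARCH_STR, REPLACE_STR).split(',')
# urls = [u.replace(REPLACE_STR, BINARY_IMAGE_SEARCH_STR) for u in urls]
# print urls # ['data:image/png;base64,iVBORw0KGgo=', 'http://example.com/a.jpg']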