HCE Project Python language Distributed Tasks Manager Application, Distributed Crawler Application and client API bindings.  2.0.0-chaika
Hierarchical Cluster Engine Python language binding
Scraper.py
1 """@package docstring
2  @file Scraper.py
3  @author Alexey <developers.hce@gmail.com>, scorp, bgv
4  @link http://hierarchical-cluster-engine.com/
5  @copyright Copyright &copy; 2013-2016 IOIX Ukraine
6  @license http://hierarchical-cluster-engine.com/license/
7  @package HCE project node API
8  @since 0.1
9 """
10 import re
11 import sys
12 import time
13 import json
14 import pickle
15 import logging.config
16 import ConfigParser
17 import xml.sax.saxutils
18 import urlparse
19 import copy
20 import datetime
21 import base64
22 
23 from dateutil.parser import * # pylint: disable=W0401,W0614
24 from dateutil import parser
25 from cement.core import foundation
26 
27 from app.Utils import varDump
28 from app.Utils import isValidURL
29 from app.SelectorWrapper import SelectorWrapper
30 import app.Utils as Utils # pylint: disable=F0401
31 from app.Utils import ExceptionLog
32 from app.Utils import SQLExpression
33 from app.Utils import UrlNormalizator
34 from app.Utils import urlNormalization
35 # scraper's modules
36 import app.Consts as APP_CONSTS
37 import app.Profiler
38 from app.DateTimeType import DateTimeType
39 from app.FieldsSQLExpressionEvaluator import FieldsSQLExpressionEvaluator
40 # import dc.EventObjects
41 import dc.EventObjects as dc_event
42 from dc.EventObjects import SiteFilter
43 import dc_processor.Constants as CONSTS
44 from dc_processor.scraper_resource import Resource
45 from dc_processor.scraper_result import Result as Result
46 from dc_processor.ScraperResponse import ScraperResponse
47 from dc_processor.TemplateExtractorXPathPreparing import TemplateExtractorXPathPreparing
48 from dc_processor.PDateTimezonesHandler import PDateTimezonesHandler
49 from dc_processor.AuthorType import AuthorType
50 from dc_processor.MediaLimitsHandler import MediaLimitsHandler
51 from dc_processor.ScraperLangDetector import ScraperLangDetector
52 # scraper's modules used via eval()
53 from dc_processor.newspaper_extractor import NewspaperExtractor # pylint: disable=W0611
54 from dc_processor.goose_extractor import GooseExtractor # pylint: disable=W0611
55 from dc_processor.scrapy_extractor import ScrapyExtractor
56 from dc_processor.ml_extractor import MLExtractor # pylint: disable=W0611
57 from dc_processor.custom_extractor import CustomExtractor # pylint: disable=W0611
58 from dc_crawler.DBTasksWrapper import DBTasksWrapper
59 import dc_crawler.Constants as CRAWLER_CONSTS
60 
61 APP_NAME = "scraper"
62 
63 MSG_ERROR_LOAD_CONFIG = "Error loading config file. Exiting."
64 MSG_ERROR_LOAD_LOG_CONFIG_FILE = "Error loading logging config file. Exiting."
65 MSG_ERROR_LOAD_EXTRACTORS = "Error loading extractors "
66 MSG_ERROR_TEMPLATE_EXTRACTION = "Error template extraction "
67 MSG_ERROR_DYNAMIC_EXTRACTION = "Error dynamic extraction "
68 MSG_ERROR_LOAD_DB_BACKEND = "Error loading DB backend"
69 MSG_ERROR_LOAD_OPTIONS = "Error loading options"
70 MSG_INFO_PREPARE_CONTENT = "Prepare content: "
71 MSG_ERROR_ADJUST_PR = "Error adjust partial references. "
72 MSG_ERROR_ADJUST_PUBDATE = "PUBDATE_ERROR "
73 MSG_ERROR_ADJUST_TITLE = "Can't adjust title. "
74 
75 EXIT_SUCCESS = 0
76 EXIT_FAILURE = 1
77 
78 SQLITE_TIMEOUT = 30
79 
80 ENV_SCRAPER_STORE_PATH = "ENV_SCRAPER_STORE_PATH"
81 # CONTENT_REPLACEMENT = "[\"\\n\", \"\\r\\n\", \"\\t\"]"
82 CONTENT_REPLACEMENT_LIST = ['\n', '\r\n', '\t', ' ', '<br>', '<p>', '</p>']
83 DEFAULT_TAG_REDUCE_MASK = 65535
84 
85 EXTENDED_NEWS_TAGS = {"description": ["//meta[@name='description']//@content"]}
86 LINKS_NEWS_TAGS = [CONSTS.TAG_MEDIA, CONSTS.TAG_LINK, CONSTS.TAG_MEDIA_CONTENT, "links", "href"]
87 # DATA_NEWS_TAGS = [CONSTS.TAG_DC_DATE]
88 DATA_NEWS_TAGS = []
89 
90 TAGS_DATETIME_NEWS_NAMES = [CONSTS.TAG_PUB_DATE, CONSTS.TAG_DC_DATE]
91 TAGS_DATETIME_TEMPLATE_TYPES = [CONSTS.TAG_TYPE_DATETIME]
92 
93 OPTION_SECTION_DATETIME_NEWS_NAMES = 'tags_datetime_news_names'
94 OPTION_SECTION_DATETIME_TEMPLATE_TYPES = 'tags_datetime_template_types'
95 
96 OPTION_SECTION_TAGS_TYPE = 'tagsTypes'
97 
98 OPTION_SECTION_URL_SOURCES_RULES = 'urlSourcesRules'
99 URL_SOURCES_RULE_DATA_URL = 'd_url'
100 URL_SOURCES_RULE_REDIRECT_URL = 'r_url'
101 URL_SOURCES_RULE_FEED_URL = 'f_url'
102 
103 # #Scraper
104 #
105 #
106 class Scraper(foundation.CementApp):
107 
108  MSG_ERROR_WRONG_CONFIG_FILE_NAME = "Config file name is wrong"
109  # Constants used in class
110  WWW_PREFIX = "www."
111 
112  # Mandatory
113  class Meta(object):
114  label = APP_NAME
115  def __init__(self):
116  pass
117 
118 
119  # #constructor
120  # initialize default fields
121  def __init__(self, usageModel=APP_CONSTS.APP_USAGE_MODEL_PROCESS, configFile=None, logger=None, inputData=None):
122  if usageModel == APP_CONSTS.APP_USAGE_MODEL_PROCESS:
123  # call base class __init__ method
124  foundation.CementApp.__init__(self)
125 
126  self.exitCode = EXIT_SUCCESS
127  self.itr = None
128  self.extractor = None
129  self.extractors = []
130  self.input_data = inputData
131  self.logger = logger
132  self.sqliteTimeout = SQLITE_TIMEOUT
134  self.properties = {}
135  self.algorithm_name = None
136  self.pubdate = None
137  self.message_queue = []
138  self.entry = None
139  self.article = None
140  self.outputFormat = None
141  self.errorMask = APP_CONSTS.ERROR_OK
142  self.metrics = None
143  self.altTagsMask = None
144  self.tagsCount = 0
145  self.tagsMask = 0
146  self.processedContent = None
147  self.usageModel = usageModel
148  self.configFile = configFile
149  self.output_data = None
150  self.urlHost = None
151  self.xpathSplitString = ' '
152  self.useCurrentYear = 0
155  self.tagsTypes = None
156  self.attrConditions = None
157  self.dbWrapper = None
158  self.mediaLimitsHandler = None
159  self.urlSourcesRules = None
160  self.tagReduceMask = DEFAULT_TAG_REDUCE_MASK
161  self.baseUrl = None
162 
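# Illustrative standalone construction (a sketch, not part of the original file; the config
# file name and the way inputData is prepared by the caller are assumptions):
#   scraper = Scraper(usageModel=APP_CONSTS.APP_USAGE_MODEL_PROCESS, configFile="scraper.ini")
#   scraper.setup()
#   scraper.run()   # loads config, logging, options and properties, then calls processBatch()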
163 
164  # #setup
165  # setup application
166  def setup(self):
167  if self.usageModel == APP_CONSTS.APP_USAGE_MODEL_PROCESS:
168  # call base class setup method
169  foundation.CementApp.setup(self)
170 
171 
172  # #run
173  # run application
174  def run(self):
175  if self.usageModel == APP_CONSTS.APP_USAGE_MODEL_PROCESS:
176  # call base class run method
177  foundation.CementApp.run(self)
178 
179  # config section
180  self.loadConfig()
181 
182  # load logger config file
183  self.loadLogConfigFile()
184 
185  # options
186  self.loadOptions()
187 
188  # scraper properties
189  self.loadScraperProperties()
190 
191  # Do applied algorithm's job
192  self.processBatch()
193 
194  if self.usageModel == APP_CONSTS.APP_USAGE_MODEL_PROCESS:
195  # Finish logging
196  self.logger.info(APP_CONSTS.LOGGER_DELIMITER_LINE)
197 
198 
199  # # Check DOM element
200  #
201  # @param elem - tag value that may contain a DOM element
202  # @return True if the value contains a DOM element, otherwise False
203  def checkDOMElement(self, elem):
204  ret = False
205  if re.search('<', elem):
206  self.logger.debug("Media tag contain DOM element: %s", elem)
207  ret = True
208  return ret
209 
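# Illustrative behaviour of checkDOMElement() (example values only, not from the original file):
#   checkDOMElement("<img src='1.jpg'/>")        -> True   (value already contains markup)
#   checkDOMElement("http://example.com/1.jpg")  -> False  (plain url, handled by normalization)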
210 
211  # #adjust partial references
212  # adjust partial references
213  #
214  def adjustPartialReferences(self, response):
215  # self.logger.debug("!!! response.tags: " + varDump(response.tags))
216 # self.logger.debug("!!! self.input_data.template: " + varDump(self.input_data.template))
217 # self.logger.debug("self.input_data.url: %s", varDump(self.input_data.url))
218 # self.logger.debug("self.input_data.siteId: %s", varDump(self.input_data.siteId))
219 
220  if "link" in response.tags and isinstance(response.tags["link"], dict) and \
221  "media" in response.tags and isinstance(response.tags["media"], dict):
222  try:
223  url = None
224  if self.input_data.template and "link" in self.input_data.template:
225  self.logger.debug("url type: %s", str(type(response.tags["link"]["data"])))
226  if isinstance(response.tags["link"]["data"], basestring):
227  url = response.tags["link"]["data"]
228  else:
229  url = response.tags["link"]["data"][0]
230 
231  url = urlNormalization(self.baseUrl, url)
232  response.tags["link"]["data"] = url
233 
234  else:
235  url = self.input_data.url
236 
237 # self.logger.debug("link tag in response: '%s'", str(url))
238 # self.logger.debug("response.tags['media']: %s", str(response.tags["media"]))
239 # self.logger.debug("media tag in response: %s, type: %s" , str(response.tags["media"]["data"]), str(type(response.tags["media"]["data"])))
240  res = []
241  mediaData = []
242  if isinstance(response.tags["media"]["data"], basestring):
243  mediaData = [response.tags["media"]["data"]]
244  elif isinstance(response.tags["media"]["data"], list):
245  mediaData = list(set(response.tags["media"]["data"]))
246  else:
247  self.logger.error("!!! Wrong type of tag 'media': %s", str(type(response.tags["media"]["data"])))
248 
249  filter_patterns, filter_types = [], []
250  if self.input_data.filters:
251  # filter_types = [filter_item["Type"] for filter_item in self.input_data.filters]
252  # filter_patterns = [re.compile(filter_item["Pattern"]) for filter_item in self.input_data.filters]
253  filter_types = [filter_item.type for filter_item in self.input_data.filters]
254  filter_patterns = [re.compile(filter_item.pattern) for filter_item in self.input_data.filters]
255  # self.logger.debug("filter: %s", varDump(self.input_data.filters))
256 
257  for media in mediaData:
258  self.logger.debug("Media link: '%s'", media)
259  # the media value may be a DOM element instead of a pure url
260  if self.checkDOMElement(media):
261  res.append(media)
262  break
263 # media = urlparse.urljoin(url, media)
264  media = urlNormalization(self.baseUrl, media)
265 # self.logger.debug("media 2: %s", media)
266 
267  for filter_type, filter_pattern in zip(filter_types, filter_patterns):
268  match = filter_pattern.search(media)
269  if filter_type == SiteFilter.TYPE_EXCLUDE and match:
270  break
271  if filter_type == SiteFilter.TYPE_INCLUDE and match:
272  allowedUrls = self.checkMediaTag(media)
273  if len(allowedUrls) > 0:
274  res.append(','.join(allowedUrls))
275  break
276  else:
277  self.logger.debug("media: %s", media)
278  self.logger.debug("url: %s", url)
279  allowedUrls = self.checkMediaTag(media)
280  if len(allowedUrls) > 0:
281  res.append(','.join(allowedUrls))
282 
283  # If media tag after adjusting is empty remove it from response
284  if not len(res):
285  self.logger.debug("media tag is empty. Remove media tag from response.")
286  del response.tags["media"]
287  else:
288  self.logger.debug("media tag is adjusted. Copy media tag to response.")
289  response.tags["media"]["data"] = res
290  # End code block removing empty media tag
291 # else:
292 # self.logger.debug("resource hasn't template with media tag. adjustPartialReferences doesn't execute")
293  except Exception as err:
294  ExceptionLog.handler(self.logger, err, MSG_ERROR_ADJUST_PR, (), \
295  {ExceptionLog.LEVEL_NAME_ERROR:ExceptionLog.LEVEL_VALUE_DEBUG})
296 
297  else:
298  self.logger.debug(">>> Response does not have a link or media tag, no need to adjust media")
299 
300 
301  # adjustTitle
302  #
303  def adjustTitle(self, response):
304  try:
305  if self.input_data.template and "title" in self.input_data.template and "title" in response.tags:
306  self.logger.debug("resource has template with title tag. Try to adjust title.")
307  self.logger.debug("response.tags['title']: " + str(response.tags["title"]))
308  localExtractor = self.extractor
309  if localExtractor is None:
310  if len(self.extractors) > 2:
311  localExtractor = self.extractors[2]
312  else:
313  raise Exception(">>> Wrong! self.extractors list doesn't have a 3rd element (index 2)")
314  if isinstance(response.tags["title"], basestring):
315  self.logger.debug("response does not have a title tag")
316  sel = SelectorWrapper(text=self.input_data.raw_content)
317  title = sel.xpath("//title/text()").extract()
318  localExtractor.addTag(result=response, tag_name="title", tag_value=title)
319  self.logger.debug("TYPE response.tags['title']['data']" + str(type(response.tags["title"]["data"])))
320  else:
321  self.logger.debug("resource hasn't template with title tag. Don't need adjust title.")
322  except Exception as err:
323  ExceptionLog.handler(self.logger, err, MSG_ERROR_ADJUST_TITLE, (), \
324  {ExceptionLog.LEVEL_NAME_ERROR:ExceptionLog.LEVEL_VALUE_DEBUG})
325 
326 
327  # adjustLinkURL
328  #
329  def adjustLinkURL(self, response):
330  flag = False
331  try:
332  if response.tags and "link" in response.tags:
333  self.logger.debug("resource has template with link tag. Try to adjust link.")
334  self.logger.debug("response.tags['link']: " + str(response.tags["link"]))
335  self.logger.debug("self.extractor: %s", str(self.extractor))
336  flag = True
337  if self.extractor:
338  self.logger.debug("Extractor exists")
339  if isinstance(response.tags["link"], basestring):
340  self.logger.debug("response does not have a link tag")
341  self.extractor.addTag(result=response, tag_name="link", tag_value=[self.input_data.url])
342  # bypass
343  else:
344  response.tags["link"]["data"] = self.input_data.url
345  else:
346  if len(self.extractors) > 2:
347  self.extractors[2].addTag(result=response, tag_name="link", tag_value=[self.input_data.url])
348  else:
349  self.logger.debug(">>> Wrong! self.extractors list doesn't have a 3rd element (index 2)")
350  self.logger.debug("TYPE response.tags['link']['data']" + str(type(response.tags["link"]["data"])))
351  else:
352  self.logger.debug("resource hasn't template with link tag. Don't need adjust link.")
353  except Exception as err:
354  ExceptionLog.handler(self.logger, err, MSG_ERROR_ADJUST_PR, (), \
355  {ExceptionLog.LEVEL_NAME_ERROR:ExceptionLog.LEVEL_VALUE_DEBUG})
356 
357  return flag
358 
359 
360  # # Normalize author tags procedure
361  #
362  # @param confProp - properties as JSON already read from config file
363  # @param procProp - properties as JSON from PROCESSOR_PROPERTIES
364  # @param response - scraper response instance
365  # @return - None
366  def normalizeAuthor(self, confProp, procProp, response):
367  try:
368  if response is not None and response.tags is not None:
369  # self.logger.debug("normalizeAuthor scraper response: " + varDump(response))
370 
371  if self.input_data.template and self.algorithm_name != CONSTS.PROCESS_ALGORITHM_REGULAR:
372  if AuthorType.MAIN_TAG_NAME in response.tags and response.tags[AuthorType.MAIN_TAG_NAME] is not None and \
373  "data" in response.tags[AuthorType.MAIN_TAG_NAME]:
374  inputData = response.tags[AuthorType.MAIN_TAG_NAME]["data"]
375  self.logger.debug("normalizeAuthor response has '" + str(AuthorType.MAIN_TAG_NAME) + "' is: " + \
376  str(inputData))
377  self.logger.debug("normalizeAuthor type of '" + str(AuthorType.MAIN_TAG_NAME) + "' is: " + \
378  str(type(inputData)))
379 
380  inputList = []
381  if isinstance(inputData, str) or isinstance(inputData, unicode):
382  inputList = [inputData]
383  elif isinstance(inputData, list):
384  inputList = inputData
385  else:
386  pass
387 
388  self.logger.debug("normalizeAuthor confProp: " + varDump(confProp))
389  self.logger.debug("normalizeAuthor procProp: " + varDump(procProp))
390 
391  authors = []
392  for inputElem in inputList:
393  author = AuthorType.parse(confProp, procProp, inputElem, self.logger)
394  if author is not None:
395  authors.append(author)
396 
397  self.logger.debug("normalizeAuthor result author: " + str(authors))
398  if len(authors) > 0:
399  response.tags[AuthorType.MAIN_TAG_NAME]["data"] = authors
400 
401  except Exception as err:
402  ExceptionLog.handler(self.logger, err, 'normalizeAuthor error:', (), \
403  {ExceptionLog.LEVEL_NAME_ERROR:ExceptionLog.LEVEL_VALUE_DEBUG})
404 
405 
406  # # Normalize datetime tags procedure
407  #
408  # @param response - scraper response instance
409  # @param algorithmName - algorithm name
410  # @return - 'pubdate tag value'
411  def normalizeDatetime(self, response, algorithmName):
412  ret = None
413  timezone = ''
414  try:
415  if response is not None and response.tags is not None:
416  # self.logger.debug("normalizeDatetime scraper response: " + varDump(response))
417  tagNames = []
418  if self.input_data.template and algorithmName == CONSTS.PROCESS_ALGORITHM_REGULAR:
419  # template
420  for responseType in self.datetimeTemplateTypes:
421  for responseTagName in response.tags:
422  self.logger.debug("normalizeDatetime responseTagName: '" + str(responseTagName) + "'")
423  if (response.tags.get(responseTagName) is not None and \
424  'type' in response.tags[responseTagName] and \
425  response.tags[responseTagName]['type'] == responseType) or \
426  (responseTagName == CONSTS.TAG_PUB_DATE and response.tags.get(responseTagName) is not None):
427  tagNames.append(responseTagName)
428  else:
429  # dynamic
430  tagNames = self.datetimeNewsNames
431 
432  self.logger.debug('normalizeDatetime tagNames: ' + varDump(tagNames))
433  retDict = {}
434  for tagName in tagNames:
435  pubdate, tzone = self.extractPubDate(response, tagName)
436  if self.extractor and tagName in response.tags:
437  self.extractor.addTag(result=response, tag_name=tagName + '_normalized', tag_value=pubdate, \
438  xpath=response.tags[tagName]['xpath'])
439 
440  self.logger.debug('tagName: ' + str(tagName) + ' pubdate: ' + str(pubdate))
441  retDict[tagName] = pubdate
442 
443  if tagName == CONSTS.TAG_PUB_DATE:
444  ret = pubdate
445  timezone = tzone
446  else:
447  pass
448 
449  if ret is None:
450  for key, value in retDict.items():
451  if value is not None:
452  ret = value
453  self.logger.debug('set return value from ' + str(key) + ' : ' + str(value))
454  break
455 
456  except Exception as err:
457  ExceptionLog.handler(self.logger, err, 'normalizeDatetime error:', (), \
458  {ExceptionLog.LEVEL_NAME_ERROR:ExceptionLog.LEVEL_VALUE_DEBUG})
459 
460  return ret, timezone
461 
462 
463  # # Extract pubdate
464  #
465  # @param response - response instance
466  # @param dataTagName - tag name for extracting
467  # @return pubdate if success or None
468  def extractPubDate(self, response, dataTagName):
469  # variable for result
470  ret = None
471  timezone = ''
472  try:
473  if response is not None and dataTagName in response.tags and response.tags[dataTagName] is not None:
474 
475  # self.logger.debug("extractPubDate response: " + varDump(response))
476 
477  inputData = response.tags[dataTagName]["data"]
478  self.logger.debug("extractPubDate response has '" + str(dataTagName) + "' is: " + str(inputData))
479  self.logger.debug("extractPubDate type of '" + str(dataTagName) + "' is: " + str(type(inputData)))
480 
481  inputList = []
482  if isinstance(inputData, basestring):
483  inputList = [inputData]
484  elif isinstance(inputData, list):
485  inputList = inputData
486  else:
487  pass
488 
489  pubdate = []
490  timezones = []
491  for inputElem in inputList:
492  d = DateTimeType.parse(inputElem, bool(self.useCurrentYear), self.logger, False)
493  self.logger.debug('pubdate: ' + str(d))
494 
495  if d is not None:
496  d, tzone = DateTimeType.split(d)
497  pubdate.append(d.isoformat(DateTimeType.ISO_SEP))
498  timezones.append(tzone)
499 
500  self.logger.debug("extractPubDate result pubdate: " + str(pubdate))
501  response.tags[dataTagName]["data"] = pubdate
502  if len(pubdate) > 0:
503  ret = pubdate[0]
504 
505  if len(timezones) > 0:
506  timezone = timezones[0]
507 
508  except Exception as err:
509  ExceptionLog.handler(self.logger, err, 'extractPubDate error:', (), \
510  {ExceptionLog.LEVEL_NAME_ERROR:ExceptionLog.LEVEL_VALUE_DEBUG})
511 
512  return ret, timezone
513 
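# Illustrative result of extractPubDate() (a sketch; the exact output depends on DateTimeType,
# which is defined outside this file):
#   given response.tags["pubdate"]["data"] == ["Thu, 12 May 2016 14:05:00 +0300"]
#   the tag data becomes ["2016-05-12T14:05:00"] and the method returns
#   ("2016-05-12T14:05:00", "+0300")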
514 
515  # # pubdate transformation using the timezone value
516  #
517  # @param rawPubdate - raw pubdate string
518  # @param rawTimezone - raw timezone string
519  # @param properties - properties from PROCESSOR_PROPERTIES
520  # @param urlString - url string value
521  # @return pubdate and timezone if success or None and empty string
522  def pubdateTransform(self, rawPubdate, rawTimezone, properties, urlString):
523  # variables for result
524  pubdate = rawPubdate
525  timezone = rawTimezone
526 
527  # self.logger.debug('properties: ' + varDump(properties))
528  if CONSTS.PDATE_TIMEZONES_NAME in properties:
529  propertyString = properties[CONSTS.PDATE_TIMEZONES_NAME]
530  self.logger.debug('inputted ' + CONSTS.PDATE_TIMEZONES_NAME + ':' + str(propertyString))
531 
532  dt = DateTimeType.parse(rawPubdate, bool(self.useCurrentYear), self.logger, False)
533  self.logger.debug('pubdate: ' + str(dt))
534  if dt is not None:
535  # get utc offset if necessary
536  utcOffset = DateTimeType.extractUtcOffset(rawTimezone, self.logger)
537  self.logger.debug('utcOffset: ' + str(utcOffset))
538  # transformation according to PDATE_TIMEZONES properties
539  d = PDateTimezonesHandler.transform(dt, utcOffset, propertyString, urlString, self.logger)
540  if d is not None:
541  dt = d
542 
543  if dt is not None:
544  d, tzone = DateTimeType.split(dt)
545  pubdate = d.isoformat(DateTimeType.ISO_SEP)
546  timezone = tzone
547 
548  return pubdate, timezone
549 
550 
551  # # refineBadDateTags deletes, from the result, datetime tags with bad datetime values.
552  #
553  def refineBadDateTags(self, response):
554  removeKeys = []
555  for key in response.tags:
556  if key in DATA_NEWS_TAGS:
557  tagsValue = None
558 
559  if isinstance(response.tags[key], basestring):
560  tagsValue = response.tags[key]
561  elif isinstance(response.tags[key], dict) and "data" in response.tags[key]:
562  if isinstance(response.tags[key]["data"], basestring):
563  tagsValue = response.tags[key]["data"]
564  elif isinstance(response.tags[key]["data"], list) and len(response.tags[key]["data"]) > 0 and \
565  isinstance(response.tags[key]["data"][0], basestring):
566  tagsValue = response.tags[key]["data"][0]
567 
568  if tagsValue is not None:
569  try:
570  dt = parser.parse(tagsValue)
571  int(time.mktime(dt.timetuple()))
572  except Exception:
573  removeKeys.append(key)
574 
575  for key in removeKeys:
576  if key in response.tags:
577  self.logger.debug(">>> Remove " + key + " element because it is empty")
578  del response.tags[key]
579 
580 
581  # #Internal method of url's domain crc calculating
582  #
583  # @param url - incoming url
584  def calcUrlDomainCrc(self, url):
585  urlHost = None
586  auth = urlparse.urlsplit(url)[1]
587  if auth is not None:
588  urlHost = (re.search('([^@]*@)?([^:]*):?(.*)', auth).groups())[1]
589  if urlHost is not None and urlHost.find(self.WWW_PREFIX) == 0:
590  urlHost = urlHost[len(self.WWW_PREFIX): len(urlHost)]
591 
592  return urlHost
593 
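# Illustrative behaviour of calcUrlDomainCrc() (example values only):
#   calcUrlDomainCrc("http://user@www.example.com:8080/path")  -> "example.com"
#   calcUrlDomainCrc("https://news.example.org/article.html")  -> "news.example.org"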
594 
595  # # The main processing of the batch object
596  #
597  # @param config - config parser
598  # @return None
599  def process(self, config):
600  # info input data
601  self.logger.info("input_data url: %s, urlId: %s, siteId: %s", str(self.input_data.url), str(self.input_data.urlId),
602  str(self.input_data.siteId))
603 
604  self.baseUrl = self.extractBaseUrlRssFeed(self.input_data.siteId, self.input_data.url)
605  if self.baseUrl is None:
606  self.baseUrl = self.input_data.url
607 
608  if self.input_data.template and self.algorithm_name == CONSTS.PROCESS_ALGORITHM_REGULAR:
609  # Reconfigure processor's properties to involve only template scraper
610  responses = self.templateExtraction(config, self.urlHost)
611  else:
612  # get iterator to ranked list of extractors
613  self.itr = iter(sorted(self.extractors, key=lambda extractor: 0, reverse=True))
614  self.logger.debug("Extractors: %s" % varDump(self.itr))
615  responses = self.newsExtraction()
616 
617  if CONSTS.MEDIA_LIMITS_NAME in self.input_data.batch_item.properties:
618  self.logger.debug("Found property '%s'", str(CONSTS.MEDIA_LIMITS_NAME))
619  self.mediaLimitsHandler = MediaLimitsHandler(self.input_data.batch_item.properties[CONSTS.MEDIA_LIMITS_NAME])
620 
621  for response in responses:
622  response.metricsPrecalculate()
623  response.stripResult()
624  # Add tag 'source_url'
625  self.addCustomTag(result=response, tag_name=CONSTS.TAG_SOURCE_URL, \
626  tag_value=str(self.input_data.url))
627 
628  #self.logger.debug("self.properties: %s", varDump(self.properties))
629  if CONSTS.LANG_PROP_NAME in self.properties:
630  self.logger.debug("!!! Enter '%s' !!!", str(CONSTS.LANG_PROP_NAME))
631 
632  langDetector = ScraperLangDetector(self.properties[CONSTS.LANG_PROP_NAME])
633  langDetector.process(response, self.logger)
634  langTagsDict = langDetector.getLangTags()
635  self.logger.debug("langTagsDict: %s", varDump(langTagsDict))
636 
637 # # self.logger.debug("!!! self.input_data.batch_item.properties = %s, type = %s", varDump(self.input_data.batch_item.properties), str(type(self.input_data.batch_item.properties)))
638 # #
639 # # if 'template' in self.input_data.batch_item.properties and \
640 # # 'templates' in self.input_data.batch_item.properties['template'] and \
641 # # len(self.input_data.batch_item.properties['template']['templates']) > 0 and \
642 # # 'output_format' in self.input_data.batch_item.properties['template']['templates'][0] and \
643 # # 'item' in self.input_data.batch_item.properties['template']['templates'][0]['output_format']:
644 # # itemString = self.input_data.batch_item.properties['template']['templates'][0]['output_format']['item']
645 # # self.logger.debug("itemString: %s:", str(itemString))
646 # # try:
647 # # jsonDict = json.loads(itemString, encoding='utf-8')
648 # # self.logger.debug("jsonDict: %s:", varDump(jsonDict))
649 # # for tagName, langValue in langTagsDict.items():
650 # # jsonDict[tagName] = langValue
651 # #
652 # # self.input_data.batch_item.properties['template']['templates'][0]['output_format']['item'] = \
653 # # json.dumps(jsonDict, ensure_ascii=False, encoding='utf-8')
654 # # except Exception, err:
655 # # self.logger.error(str(err))
656 # # self.logger.info(Utils.getTracebackInfo())
657 
658  # add lang tags to processed content
659  for tagName, langValue in langTagsDict.items():
660  self.addCustomTag(result=response, tag_name=tagName, tag_value=langValue)
661 
662  summaryLang = langDetector.getSummaryLang(response, self.logger)
663  self.addCustomTag(result=response, tag_name=CONSTS.TAG_SUMMARY_LANG, tag_value=summaryLang)
664  self.logger.debug("!!! Leave '%s' !!!", str(CONSTS.LANG_PROP_NAME))
665 
666  # put extracted article to the db
667 
668  if self.algorithm_name != CONSTS.PROCESS_ALGORITHM_REGULAR:
669  self.adjustTitle(response)
670  self.adjustLinkURL(response)
671  self.adjustPartialReferences(response)
672 
673  # self.logger.debug("CONSTS.TAG_PUB_DATE response: " + varDump(response))
674 
675  self.preparseResponse(response)
676 
677  # Improvement author
678  tagsTypes = None
679  if CONSTS.TAGS_TYPES_NAME in self.input_data.batch_item.properties:
680  tagsTypes = self.input_data.batch_item.properties[CONSTS.TAGS_TYPES_NAME]
681 
682  self.logger.info('=' * 50)
683  self.logger.info('self.properties: ' + varDump(self.properties))
684 
685  self.normalizeAuthor(self.tagsTypes, tagsTypes, response)
686 
687  # Setting pubdate in depend of different sources masks
688  # default values
689  pdateSourceMask = APP_CONSTS.PDATE_SOURCES_MASK_BIT_DEFAULT
690  pdateSourceMaskOverwrite = APP_CONSTS.PDATE_SOURCES_MASK_OVERWRITE_DEFAULT
691 
692  # get value 'PDATE_SOURCES_MASK' from site properties
693  if APP_CONSTS.PDATE_SOURCES_MASK_PROP_NAME in self.input_data.batch_item.properties:
694  pdateSourceMask = int(self.input_data.batch_item.properties[APP_CONSTS.PDATE_SOURCES_MASK_PROP_NAME])
695 
696  # get value 'PDATE_SOURCES_MASK_OVERWRITE' from site properties
697  if APP_CONSTS.PDATE_SOURCES_MASK_OVERWRITE_PROP_NAME in self.input_data.batch_item.properties:
698  pdateSourceMaskOverwrite = \
699  int(self.input_data.batch_item.properties[APP_CONSTS.PDATE_SOURCES_MASK_OVERWRITE_PROP_NAME])
700 
701  self.logger.debug('pdateSourceMask = %s, pdateSourceMaskOverwrite = %s',
702  str(pdateSourceMask), str(pdateSourceMaskOverwrite))
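# Illustrative mask semantics (a sketch based on the checks below; constant names come from
# APP_CONSTS): with pdateSourceMask = PDATE_SOURCES_MASK_RSS_FEED | PDATE_SOURCES_MASK_PUBDATE
# the pubdate is taken from the RSS feed and/or the scraped "pubdate" tag; for the sources
# checked after the RSS feed, if the matching bit is also set in pdateSourceMaskOverwrite,
# that source is applied only while self.pubdate is still None.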
703 
704  self.logger.debug("!!! self.input_data.batch_item.urlObj.pDate = " + str(self.input_data.batch_item.urlObj.pDate))
705 
706  timezone = ''
707  # URL object "pdate" field (supposedly obtained from the RSS feed)
708  if pdateSourceMask & APP_CONSTS.PDATE_SOURCES_MASK_RSS_FEED:
709  if (pdateSourceMaskOverwrite & APP_CONSTS.PDATE_SOURCES_MASK_RSS_FEED) or \
710  not pdateSourceMaskOverwrite & APP_CONSTS.PDATE_SOURCES_MASK_RSS_FEED:
711  self.pubdate, timezone = self.extractPubdateRssFeed(self.input_data.siteId, self.input_data.url)
712 
713  # Normalization procedure after the scraping, supposes the tag dc_date for the NEWS or TEMPLATE scraping.
714  if CONSTS.TAG_DC_DATE in response.tags and pdateSourceMask & APP_CONSTS.PDATE_SOURCES_MASK_DC_DATE:
715  if (pdateSourceMaskOverwrite & APP_CONSTS.PDATE_SOURCES_MASK_DC_DATE and self.pubdate is None) or \
716  not pdateSourceMaskOverwrite & APP_CONSTS.PDATE_SOURCES_MASK_DC_DATE:
717  if CONSTS.TAG_PUB_DATE not in response.tags or \
718  (isinstance(response.tags[CONSTS.TAG_PUB_DATE]["data"], basestring) and \
719  response.tags[CONSTS.TAG_PUB_DATE]["data"].strip() == ""):
720  response.tags[CONSTS.TAG_PUB_DATE] = copy.deepcopy(response.tags[CONSTS.TAG_DC_DATE])
721  response.tags[CONSTS.TAG_PUB_DATE]["name"] = CONSTS.TAG_PUB_DATE
722  if len(response.tags[CONSTS.TAG_PUB_DATE]["data"]) > 0 and response.tags[CONSTS.TAG_PUB_DATE]["data"][0]:
723  self.pubdate = response.tags[CONSTS.TAG_PUB_DATE]["data"][0]
724  self.logger.debug("Pubdate from 'dc_date': " + str(self.pubdate))
725  # Check format
726  d = DateTimeType.parse(self.pubdate, bool(self.useCurrentYear), self.logger, False)
727  self.logger.debug('Check format pubdate: ' + str(d))
728  if d is not None:
729  d, timezone = DateTimeType.split(d)
730  self.pubdate = d.isoformat(DateTimeType.ISO_SEP)
731  self.logger.debug("Result pubdate from 'dc_date': %s, timezone: %s", str(self.pubdate), str(timezone))
732  else:
733  self.pubdate = ''
734 
735  # Normalization procedure after the scraping, supposes the "pubdate" tag for the NEWS or TEMPLATE scraping.
736  if pdateSourceMask & APP_CONSTS.PDATE_SOURCES_MASK_PUBDATE:
737  if (pdateSourceMaskOverwrite & APP_CONSTS.PDATE_SOURCES_MASK_PUBDATE and self.pubdate is None) or \
738  not pdateSourceMaskOverwrite & APP_CONSTS.PDATE_SOURCES_MASK_PUBDATE:
739  pubdate, tzone = self.normalizeDatetime(response, self.algorithm_name)
740  if pubdate is not None:
741  self.pubdate = pubdate
742  timezone = tzone
743  self.logger.debug("Pubdate from 'pubdate': " + str(self.pubdate) + " timezone: " + str(timezone))
744 
745  # Current date (SQL NOW())
746  if pdateSourceMask & APP_CONSTS.PDATE_SOURCES_MASK_NOW:
747  if (pdateSourceMaskOverwrite & APP_CONSTS.PDATE_SOURCES_MASK_NOW and self.pubdate is None) or \
748  not pdateSourceMaskOverwrite & APP_CONSTS.PDATE_SOURCES_MASK_NOW:
749  self.pubdate = SQLExpression("NOW()") # pylint: disable=R0204
750  self.logger.debug("Pubdate from 'SQL NOW()': " + str(self.pubdate))
751 
752  # Custom SQL expression defined in the property PDATE_SOURCES_EXPRESSION
753  if pdateSourceMask & APP_CONSTS.PDATE_SOURCES_MASK_SQL_EXPRESSION and \
754  APP_CONSTS.PDATE_SOURCES_EXPRESSION_PROP_NAME in self.properties:
755  if (pdateSourceMaskOverwrite & APP_CONSTS.PDATE_SOURCES_MASK_SQL_EXPRESSION and self.pubdate is None) or \
756  not pdateSourceMaskOverwrite & APP_CONSTS.PDATE_SOURCES_MASK_SQL_EXPRESSION:
757  self.pubdate = SQLExpression(str(self.properties[APP_CONSTS.PDATE_SOURCES_EXPRESSION_PROP_NAME]))
758  self.logger.debug("Pubdate from 'sql expression': " + str(self.pubdate))
759 
760  # Apply property 'PDATE_DAY_MONTH_ORDER'
761  self.pubdate = self.pubdateMonthOrder(self.pubdate, self.input_data.batch_item.properties, self.input_data.url)
762 
763  # Apply property 'PDATE_TIME'
764 # self.input_data.batch_item.urlObj.pDate = self.pubdate
765  self.pubdate = FieldsSQLExpressionEvaluator.evaluatePDateTime(self.input_data.batch_item.properties,
766  self.dbWrapper,
767  self.input_data.batch_item.urlObj,
768  self.logger,
769  self.pubdate)
770 
771  # Apply property 'PDATE_TIMEZONES'
772  self.pubdate, timezone = self.pubdateTransform(self.pubdate,
773  timezone,
774  self.input_data.batch_item.properties,
775  self.input_data.url)
776 
777  # Add tag 'pubdate_tz'
778  self.addCustomTag(result=response, tag_name=CONSTS.TAG_PUBDATE_TZ, tag_value=[timezone])
779 
780  self.logger.debug("!!! self.pubdate: %s", str(self.pubdate))
781 # self.logger.debug("!!! response.tags: %s", varDump(response.tags))
782 
783  # apply content of 'pubdate' before formatOutputData
784  self.applyPubdate(response, self.pubdate)
785 
786  # Add tag 'feed_url'
787  feedUrl = self.extractFeedUrlRssFeed(self.input_data.siteId, self.input_data.url)
788  if feedUrl is not None:
789  self.addCustomTag(result=response, tag_name=CONSTS.TAG_FEED_URL, tag_value=[feedUrl])
790 
791  # self.logger.debug("!!! response: %s", varDump(response))
792 
793  if self.outputFormat is None:
794  self.logger.debug(">>> Warning, can't extract output format")
795  else:
796  self.formatOutputData(response, self.outputFormat)
797 
798  response.recalcTagMaskCount(None, self.altTagsMask)
799  self.tagsCount = response.tagsCount
800  self.tagsMask = response.tagsMask
801  # self.putArticleToDB({"default":response})
802  self.logger.debug("self.tagsCount: " + str(self.tagsCount) + " self.tagsMasks: " + str(self.tagsMask))
803 
804  response.finish = time.time()
805  response.data["time"] = "%s" % (response.finish - response.start)
806 
807  response = self.applyHTTPRedirectLink(self.input_data.batch_item.siteId, self.input_data.batch_item.urlObj.url,
808  self.input_data.batch_item.properties, response)
809 
810  self.getProcessedContent(responses)
811 
812 
813  # # Apply pubdate for processed content
814  #
815  # @param response - Scraper result instance for response
816  # @param pubdate - pubdate value for apply
817  # @return - None
818  def applyPubdate(self, response, pubdate):
819  if isinstance(pubdate, SQLExpression) and str(pubdate) == "NOW()":
820  pubdate = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
821  else:
822  d = DateTimeType.parse(pubdate, bool(self.useCurrentYear), self.logger, False)
823  self.logger.debug("Check pubdate: '%s'", str(d))
824  if d is not None:
825  pubdate = d.strftime("%Y-%m-%d %H:%M:%S")
826  else:
827  pubdate = ''
828 
829  if "pubdate" in response.tags and "data" not in response.tags["pubdate"]:
830  response.tags["pubdate"]["data"] = []
831 
832  if "pubdate" in response.tags and "data" in response.tags["pubdate"]:
833  if len(response.tags["pubdate"]["data"]) > 0:
834  response.tags["pubdate"]["data"][0] = pubdate
835  else:
836  response.tags["pubdate"]["data"] = [pubdate]
837 
838  if "pubdate" not in response.tags:
839  self.addCustomTag(result=response, tag_name=CONSTS.TAG_PUB_DATE, tag_value=[pubdate])
840 
841 
842  def preparseResponse(self, response):
843  for key in response.tags:
844  if "data" in response.tags[key]:
845  if isinstance(response.tags[key]["data"], basestring):
846  localStr = response.tags[key]["data"]
847  response.tags[key]["data"] = []
848  response.tags[key]["data"].append(localStr)
849 
850 
851  def formatOutpuElement(self, elem, localOutputFormat):
852  ret = elem
853  if localOutputFormat == "json":
854  localStr = json.dumps(elem, ensure_ascii=False)
855 
856  if len(localStr) > 0:
857  if localStr[0] == '\"' or localStr[0] == '\'':
858  localStr = localStr[1:]
859  if localStr[-1] == '\"' or localStr[-1] == '\'':
860  localStr = localStr[0:-1]
861 
862  ret = localStr
863  elif localOutputFormat == "html" or localOutputFormat == "xml":
864  ret = xml.sax.saxutils.escape(elem, {"'": "&apos;", "\"" : "&quot;"})
865  elif localOutputFormat == "sql":
866  # ret = mdb.escape_string(elem) # pylint: disable=E1101
867  ret = Utils.escape(elem)
868 
869  return ret
870 
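# Illustrative behaviour of formatOutpuElement() (example values only):
#   formatOutpuElement('He said "hi"', "xml")      -> 'He said &quot;hi&quot;'
#   formatOutpuElement('a "quoted" value', "json") -> 'a \"quoted\" value'  (outer quotes stripped)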
871 
872  def formatOutputData(self, response, localOutputFormat):
873  for key in response.tags:
874  if "data" in response.tags[key]:
875  if isinstance(response.tags[key]["data"], list):
876  for i, elem in enumerate(response.tags[key]["data"]):
877  if len(response.tags[key]["data"]) > i:
878  response.tags[key]["data"][i] = self.formatOutpuElement(elem, localOutputFormat)
879 
880  elif isinstance(response.tags[key]["data"], str) or isinstance(response.tags[key]["data"], unicode):
881  response.tags[key]["data"] = self.formatOutpuElement(response.tags[key]["data"], localOutputFormat)
882 
883 
884  def getTemplate(self, explicit=True):
885  if isinstance(self.input_data.template, dict):
886  template = self.input_data.template
887  else:
888  # template = ast.literal_eval(self.input_data.template)
889  # TODO:strange potential backdoor for malicious code, cancelled by bgv
890  if explicit:
891  self.logger.error("Wrong template structure: `%s` but dict expected, assumed empty!",
892  str(type(self.input_data.template)))
893  self.logger.debug("Template:\n%s", str(self.input_data.template))
894  template = {}
895 
896  return template
897 
898 
899  def postprocessing(self, result, rule, tag):
900  self.logger.debug("!!! rule: '%s'", varDump(rule))
901  if rule.get('postProcessing') is not None and rule["postProcessing"] != "":
902  self.logger.debug("Post-processing applied for tag `%s` with expression: %s",
903  str(tag), str(rule["postProcessing"]))
904  self.applyPostProcessing(result, tag, rule["postProcessing"])
905  else:
906  self.logger.debug("Post-processing is not applied for tag `%s`", str(tag))
907 
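# Illustrative post-processing step (example values only): for a rule with
#   rule["postProcessing"] = r"\d{4}-\d{2}-\d{2}"
# applyPostProcessing() runs the regular expression over the tag data and keeps only the
# matches, e.g. ["published 2016-05-12 10:00"] -> ["2016-05-12"].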
908 
909  # # template extraction processing
910  #
911  # @param config - config parser
912  # @param urlHost - domain name
913  # @return resultsList - list of Result
914  def templateExtraction(self, config, urlHost):
915  resultsList = []
916 
917  self.extractor = ScrapyExtractor(config, self.input_data.template, urlHost)
918  result = Result(None, self.input_data.urlId, self.metrics)
919  sel = SelectorWrapper(text=self.input_data.raw_content)
920  template = self.getTemplate()
921  for tag in template:
922  self.logger.debug("Template tag: " + tag)
923  if "state" in template[tag] and not bool(int(template[tag]["state"])):
924  self.logger.debug("Tag skipped because state disabled, name: %s", str(tag))
925  continue
926  xPathPreparing = TemplateExtractorXPathPreparing(self.properties[CONSTS.TAG_MARKUP_PROP_NAME] \
927  if CONSTS.TAG_MARKUP_PROP_NAME in self.properties else None)
928  for rule in template[tag]:
929  if not isinstance(rule, dict):
930  self.logger.error("Rule skipped because wrong structure - is not dict() type: %s", str(type(rule)))
931  continue
932  if "attributesExclude" in rule:
933  try:
934  if rule["attributesExclude"] != "":
935  self.attrConditions = json.loads(rule["attributesExclude"])
936  except Exception as err:
937  self.logger.error("Feature of attributesExclude ignored because wrong structure: %s", str(err))
938  self.attrConditions = None
939  else:
940  self.attrConditions = None
941  xPathPreparing.attrConditions = self.attrConditions
942  pathDict = Utils.getPairsDicts(rule)
943  isExtract = True
944  localResult = Result(None, self.input_data.urlId, self.metrics)
945  # Added new template format conversion
946  xpath = None
947  xpathValue = None
948  self.logger.debug(">>> self.properties: " + varDump(self.properties))
949  # Added new template type specification
950  self.xpathSplitString = xPathPreparing.resolveDelimiter(rule, self.properties, self.xpathSplitString)
951  innerDelimiter = xPathPreparing.resolveInnerDelimiter(rule, self.properties)
952  self.logger.debug(">>> xpathSplitString: '" + str(self.xpathSplitString) + "'")
953  self.logger.debug(">>> innerDelimiter: '" + str(innerDelimiter) + "'")
954  try:
955  xpath, xpathValue = xPathPreparing.process(rule, sel, self.xpathSplitString, innerDelimiter)
956  except Exception as excp:
957  ExceptionLog.handler(self.logger, excp, "Rule/xpath exception: ", (), \
958  {ExceptionLog.LEVEL_NAME_ERROR:ExceptionLog.LEVEL_VALUE_DEBUG})
959  continue
960  self.logger.debug("xpath: `%s`, xpathType: `%s`, xpathValue: `%s`",
961  str(xpath), str(type(xpathValue)), str(xpathValue))
962  if (isinstance(xpathValue, list) and len(xpathValue) == 0) or\
963  (isinstance(xpathValue, basestring) and xpathValue == ''):
964  self.logger.debug(">>> set default xpathValue")
965  xpathValue = []
966  xpathValue.append(rule["default"])
967  isExtract = False
968  self.logger.debug("result before:\n%s", varDump(localResult))
969  self.extractor.addTag(localResult, tag, xpathValue, xpath, not isExtract, False, rule["type"])
970  self.logger.debug("result after:\n%s", varDump(localResult))
971 
972  self.logger.debug("Tag type: `%s`, tags data type: `%s`",
973  str(type(localResult.tags)), str(type(localResult.tags[tag]["data"])))
974  if tag in localResult.tags and isinstance(localResult.tags[tag]["data"], basestring):
975  self.logger.debug("Convert result for tag: `%s`", str(tag))
976  localString = localResult.tags[tag]["data"]
977  localResult.tags[tag]["data"] = []
978  localResult.tags[tag]["data"].append(localString)
979 
980  self.formatTag(localResult, rule, tag, pathDict, isExtract)
981 
982  if isExtract:
983  self.postprocessing(localResult, rule, tag)
984 
985  localResult.finish = time.time()
986 
987  resultsList.append({"obj": localResult, "join": rule["join"], "isExtract": isExtract, "mandatory":
988  (bool(rule["mandatory"]) if "mandatory" in rule else False),
989  "delimiter": (rule["delimiter"] if "delimiter" in rule else self.xpathSplitString),
990  "type": rule["type"]})
991 
992  prepareResultsList = self.prepareResults(resultsList)
993  self.compileResults(result, prepareResultsList, tag, xPathPreparing)
994  resultsList = []
995  result.finish = time.time()
996 
997  return [result]
998 
999 
1000  # # Add custom tag
1001  #
1002  # @param result - Scraper result instance
1003  # @param tag_name - value name of tag
1004  # @param tag_value - value value of tag
1005  # @return - None
1006  def addCustomTag(self, result, tag_name, tag_value):
1007  if tag_name not in result.tags:
1008  data = {"extractor": "Base extractor", "data": "", "name": ""}
1009  data["data"] = tag_value
1010  data["name"] = tag_name
1011  data["xpath"] = None
1012  data["type"] = None
1013  data["extractor"] = self.__class__.__name__
1014  result.tags[tag_name] = data
1015 
1016 
1017 # def compileResults(self, result, resultsList, key, xPathPreparing=None):
1018 # for elem in resultsList:
1019 # if key in result.tags:
1020 # if result.tags[key]["xpath"] is None:
1021 # result.tags[key]["xpath"] = elem["obj"].tags[key]["xpath"]
1022 # else:
1023 # result.tags[key]["xpath"] += ' '
1024 # result.tags[key]["xpath"] += elem["obj"].tags[key]["xpath"]
1025 # if result.tags[key]["data"] is None or len(result.tags[key]["data"]) == 0:
1026 # result.tags[key]["data"] = elem["obj"].tags[key]["data"]
1027 # else:
1028 # if xPathPreparing is not None:
1029 # self.xpathSplitString = xPathPreparing.resolveDelimiter(elem, self.properties, self.xpathSplitString)
1030 # result.tags[key]["data"][0] += self.xpathSplitString
1031 # result.tags[key]["data"][0] += elem["obj"].tags[key]["data"][0]
1032 # else:
1033 # result.tags.update(elem["obj"].tags)
1034 
1035  def compileResults(self, result, resultsList, key, xPathPreparing=None):
1036  for elem in resultsList:
1037  if key in result.tags:
1038  if result.tags[key] is not None:
1039  if result.tags[key]["xpath"] is None:
1040  result.tags[key]["xpath"] = elem["obj"].tags[key]["xpath"]
1041  else:
1042  result.tags[key]["xpath"] += ' '
1043  result.tags[key]["xpath"] += elem["obj"].tags[key]["xpath"]
1044  if result.tags[key]["data"] is None or len(result.tags[key]["data"]) == 0:
1045  result.tags[key]["data"] = elem["obj"].tags[key]["data"]
1046  else:
1047  if xPathPreparing is not None:
1048  self.xpathSplitString = xPathPreparing.resolveDelimiter(elem, self.properties, self.xpathSplitString)
1049  result.tags[key]["data"][0] += self.xpathSplitString
1050  else:
1051  result.tags[key]["data"][0] += ' '
1052  result.tags[key]["data"][0] += elem["obj"].tags[key]["data"][0]
1053  else:
1054  result.tags.update(elem["obj"].tags)
1055 
1056 
1057  def prepareResults(self, resultsList):
1058  ret = []
1059  if len(resultsList) > 0:
1060  localElemWeight = 0
1061  firstElemWeight = 0
1062  firstElem = None
1063  tempList = []
1064  for elem in resultsList:
1065  localElemWeight = 0
1066  if elem["join"] == "concat":
1067  tempList.append(elem)
1068  else:
1069  if elem["mandatory"]:
1070  #>>> Mandatory breaking block -------------
1071  if not elem["isExtract"]:
1072  return []
1073  #-------------
1074  localElemWeight = localElemWeight | CONSTS.TAGS_RULES_MASK_MANDATORY_FIELD
1075  if elem["join"] == "best":
1076  localElemWeight = localElemWeight | CONSTS.TAGS_RULES_MASK_RULE_PRIORITY
1077  if elem["isExtract"]:
1078  localElemWeight = localElemWeight | CONSTS.TAGS_RULES_MASK_DEFAULT_VALUE
1079 
1080  self.logger.debug(">>> Rule weight = " + str(localElemWeight))
1081  self.logger.debug(">>> Rule join = " + elem["join"])
1082  if localElemWeight > firstElemWeight:
1083  firstElemWeight = localElemWeight
1084  firstElem = elem
1085 
1086  if firstElem is not None:
1087  tempList = [firstElem] + tempList
1088  isExtractResults = any([elem["isExtract"] for elem in tempList])
1089  if isExtractResults:
1090  ret = [elem for elem in tempList if elem["isExtract"]]
1091  elif len(tempList) > 0:
1092  ret.append(tempList[0])
1093  return ret
1094 
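# Illustrative rule weighting in prepareResults() (a sketch; bit names come from
# dc_processor.Constants): a non-"concat" rule gets TAGS_RULES_MASK_MANDATORY_FIELD if it is
# mandatory, TAGS_RULES_MASK_RULE_PRIORITY if its join is "best", and TAGS_RULES_MASK_DEFAULT_VALUE
# when its value was really extracted (not the default); the highest-weighted rule is placed
# before the "concat" rules, and rules that only produced default values are dropped when at
# least one real extraction exists.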
1095 
1096  # # elemUrlsCanoizator canonicalizes the incoming string buffer as a url string: splits it by firstDelim, normalizes each part and joins them with secondDelim
1097  #
1098  # @param data - incoming string/strings list buffer
1099  # @param baseUrl - base url
1100  # @return - canonicalized url string
1101  def elemUrlsCanoizator(self, data, baseUrl=None, firstDelim=' ', secondDelim=',', useAdditionEncoding=False):
1102  normMask = UrlNormalizator.NORM_NONE
1103  if "URL_NORMALIZE_MASK_PROCESSOR" in self.properties:
1104  normMask = int(self.properties["URL_NORMALIZE_MASK_PROCESSOR"])
1105 
1106  ret = data
1107  if data.strip() != "":
1108  ret = ""
1109  for elem in data.split(firstDelim):
1110  if elem.strip() != "":
1111  localUrl = elem
1112  if baseUrl is not None:
1113 # localUrl = urlparse.urljoin(baseUrl, localUrl)
1114  localUrl = urlNormalization(baseUrl, localUrl)
1115  processedUrl = dc_event.URL(0, localUrl, normalizeMask=normMask).getURL(normMask)
1116  if useAdditionEncoding:
1117  processedUrl = xml.sax.saxutils.escape(processedUrl, {})
1118  ret += processedUrl + secondDelim
1119  if ret != "" and ret[-1] == secondDelim:
1120  ret = ret[0: len(ret) - 1]
1121  return ret
1122 
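# Illustrative behaviour of elemUrlsCanoizator() (a sketch; the exact output depends on
# urlNormalization() and the URL_NORMALIZE_MASK_PROCESSOR property):
#   elemUrlsCanoizator("img/a.jpg img/b.jpg", baseUrl="http://example.com/news/")
#   -> "http://example.com/news/img/a.jpg,http://example.com/news/img/b.jpg"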
1123 
1124  # # dataUrlsCanonizator canonicalizes an incoming string or list of strings as url string(s)
1125  #
1126  # @param data - incoming string/strings list buffer
1127  # @param baseUrl - base url
1128  # @return - canonicalized url string
1129  def dataUrlsCanonizator(self, data, baseUrl=None, useAdditionEncoding=False):
1130  ret = data
1131  # self.logger.debug(">>> url canonizator = " + str(data))
1132  if isinstance(data, basestring):
1133  ret = self.elemUrlsCanoizator(data, baseUrl, useAdditionEncoding=useAdditionEncoding)
1134  elif isinstance(data, list):
1135  ret = []
1136  for elem in data:
1137  elem = self.elemUrlsCanoizator(elem, baseUrl, useAdditionEncoding=useAdditionEncoding)
1138  ret.append(elem)
1139  return ret
1140 
1141 
1142  # # formatTag
1143  #
1144  def formatTag(self, result, path, key, pathDict, isExtract):
1145  # Andrey Add
1146  self.logger.debug("Tag name: '%s', tag type: %s, tag format: '%s'",
1147  str(key), str(path["type"]), str(path["format"]))
1148  # Add End
1149  if path["type"] == "text":
1150  localText = ''
1151  for elem in result.tags[key]["data"]:
1152  localText += (elem.strip() + self.xpathSplitString)
1153  localText = localText.strip(self.xpathSplitString)
1154  localMaxCh = None
1155  if "format" in pathDict and "maxCh" in pathDict["format"]:
1156  localMaxCh = pathDict["format"]["maxCh"]
1157  self.logger.debug("!!! get localMaxCh from pathDict[\"format\"][\"maxCh\"] = %s", str(localMaxCh))
1158  else:
1159  localMaxCh = path["format"]
1160  if isinstance(localMaxCh, basestring) and localMaxCh == "":
1161  localMaxCh = 0
1162  self.logger.debug("!!! get localMaxCh from [\"format\"] = %s", str(localMaxCh))
1163 
1164  try:
1165  if localMaxCh is not None and int(localMaxCh) > 0 and len(localText) > int(localMaxCh):
1166  localText = localText[0: int(localMaxCh)]
1167  except ValueError as err:
1168  self.logger.debug("!!! Use wrong value, error: %s", str(err))
1169 
1170  result.tags[key]["data"] = []
1171  result.tags[key]["data"].append(localText)
1172  elif path["type"] == "html":
1173  # >>> html
1174  for i, elem in enumerate(result.tags[key]["data"]):
1175  result.tags[key]["data"][i] = re.sub(r"<![ \r\n\t]*(--([^\-]|[\r\n]|-[^\-])*--[ \r\n\t]*)>", "", elem)
1176  self.logger.debug(">>> After RE = " + str(result.tags[key]["data"]))
1177 # # # apply post processing algorithm
1178 # self.postprocessing(result, path, key)
1179  # >>> html END
1180  elif path["type"] == "datetime":
1181  # >>> datetime
1182  bestData = ''
1183  try:
1184  self.logger.debug("Try to convert data")
1185  # New: use the default value as a format string for the current date
1186  # New use default value as a format string for current date
1187  if len(result.tags[key]["data"][0]) > 0 and result.tags[key]["data"][0][0] == '@':
1188  localFormatStr = result.tags[key]["data"][0][1: len(result.tags[key]["data"][0])]
1189  localTm = datetime.datetime.fromtimestamp(time.time())
1190  result.tags[key]["data"][0] = datetime.datetime.strftime(localTm, localFormatStr)
1191  else:
1192  bestData = self.getBestDatatimeData(result.tags[key]["data"])
1193  self.logger.debug(">>> Time log Before = " + bestData)
1194  if path["format"] != "" and path["format"] != "FULL":
1195  result.tags[key]["data"][0] = datetime.datetime.strftime(parser.parse(bestData), path["format"])
1196  else:
1197  result.tags[key]["data"][0] = str(parser.parse(bestData))
1198  self.logger.debug(">>> Time log after = " + result.tags[key]["data"][0])
1199  except Exception as err:
1200  self.logger.debug("Can't convert data <<< " + str(result.tags) + " " + str(key) + " err = " + str(err))
1201  result.tags[key]["data"][0] = bestData
1202  if len(result.tags[key]["data"]) > 0:
1203  result.tags[key]["data"] = [result.tags[key]["data"][0]]
1204  # >>> datetime END
1205  elif path["type"] == "image":
1206  if path["format"] == "URL" and "canonicalizeURLs" in path and int(path["canonicalizeURLs"]) == 1:
1207  result.tags[key]["data"] = self.dataUrlsCanonizator(result.tags[key]["data"], self.baseUrl)
1208  elif path["type"] == "link":
1209  formatName = path["format"]
1210  if len(formatName.split(',')) > 1:
1211  formatName = formatName.split(',')[1]
1212  if formatName == "email-address" or formatName == "email-to":
1213  localText = ''
1214  if isinstance(result.tags[key]["data"], basestring):
1215  self.logger.debug(">>> mail to str type")
1216  localText = result.tags[key]["data"].strip(self.xpathSplitString)
1217  index = localText.find("mailto:")
1218  if index >= 0:
1219  localText = localText[index + len("mailto:"): len(localText)]
1220  else:
1221  localText = ""
1222  elif isinstance(result.tags[key]["data"], list):
1223  self.logger.debug(">>> mail to list type")
1224  for elem in result.tags[key]["data"]:
1225  elemText = elem.strip(self.xpathSplitString)
1226  index = elemText.find("mailto:")
1227  if index >= 0:
1228  elemText = elemText[index + len("mailto:"): len(elemText)]
1229  if formatName == "email-address":
1230  elemText = Utils.emailParse(elemText)
1231  else:
1232  elemText = Utils.emailParse(elemText, True)
1233  else:
1234  elemText = ""
1235  if elemText != "":
1236  localText += (elemText + self.xpathSplitString)
1237 
1238  result.tags[key]["data"] = []
1239  result.tags[key]["data"].append(localText)
1240  if "canonicalizeURLs" in path and int(path["canonicalizeURLs"]) == 1:
1241  result.tags[key]["data"] = self.dataUrlsCanonizator(result.tags[key]["data"], self.baseUrl)
1242  elif path["type"] == "attribute":
1243  if isExtract:
1244  localText = ''
1245  if isinstance(result.tags[key]["data"], basestring):
1246  localText = result.tags[key]["data"]
1247  elif isinstance(result.tags[key]["data"], list):
1248  localText = self.xpathSplitString.join([elem for elem in result.tags[key]["data"] if elem != ''])
1249  splittedFormatString = path["format"].split(',')
1250  if len(splittedFormatString) >= 2:
1251  try:
1252  if int(splittedFormatString[0]) < len(localText):
1253  localText = localText[0: int(splittedFormatString[0])]
1254  except Exception as err:
1255  self.logger.debug("Error: %s; Wrong path format for attribute rule, format=%s", str(err), path["format"])
1256  result.tags[key]["data"] = []
1257  result.tags[key]["data"].append(localText)
1258 
1259  localElem = ''
1260  for elem in result.tags[key]["data"]:
1261  localElem += elem
1262  localElem += self.xpathSplitString
1263  result.tags[key]["data"][0] = localElem
1264  result.tags[key]["data"][0] = result.tags[key]["data"][0].strip(self.xpathSplitString)
1265 
1266 
1267  def applyPostProcessing(self, result, key, postProcessingRE):
1268  if key in result.tags and "data" in result.tags[key] and result.tags[key]["data"] is not None and \
1269  len(result.tags[key]["data"]) > 0:
1270  try:
1271  matchingVal = re.compile(postProcessingRE) # #, re.UNICODE | re.MULTILINE)
1272  except re.error as err:
1273  self.logger.debug("Post-processing RE error: %s", str(err))
1274  self.errorMask = self.errorMask | APP_CONSTS.ERROR_RE_ERROR
1275  else:
1276  self.logger.debug("!!! type(result.tags[%s][\"data\"] = %s", str(key), type(result.tags[key]["data"]))
1277 
1278  tmpStr = ""
1279  matchingResult = []
1280  if isinstance(result.tags[key]["data"], basestring):
1281  matchingResult = matchingVal.findall(result.tags[key]["data"])
1282  elif isinstance(result.tags[key]["data"], list):
1283  # accumulate all results
1284  for tagData in result.tags[key]["data"]:
1285  self.logger.debug("!!! type(tagData) = %s, tagData: %s", str(type(tagData)), varDump(tagData))
1286  localRes = matchingVal.findall(tagData)
1287  matchingResult.extend(localRes)
1288 # match = re.search(postProcessingRE, tagData, re.U | re.M)
1289 # self.logger.debug("!!! match = %s, postProcessingRE = '%s'", str(match), str(postProcessingRE))
1290 # if match is not None:
1291 # matchingResult.append(str(match.group()))
1292 
1293  innerSplitString = '|||||'
1294  self.logger.debug("Post-processing has %s matched results!", str(len(matchingResult)))
1295  self.logger.debug("Post-processing matchingResult: %s", varDump(matchingResult))
1296  if len(matchingResult) > 0:
1297  for elem in matchingResult:
1298  if isinstance(elem, basestring):
1299  tmpStr += str(elem)
1300  tmpStr += self.xpathSplitString
1301  else:
1302  for innerElem in elem:
1303  if innerElem is not None and innerElem != '':
1304  tmpStr += str(innerElem)
1305  tmpStr += innerSplitString
1306  else:
1307  self.logger.debug("Post-processing has no matched results!")
1308 
1309  tmpStr = tmpStr.strip(self.xpathSplitString)
1310  if tmpStr != "":
1311  self.logger.debug("Post-processing matched and replaced with pieces!")
1312  self.logger.debug("!!! type(result.tags[%s][\"data\"])) = %s", str(key), str(type(result.tags[key]["data"])))
1313  self.logger.debug("!!! tmpStr: %s", varDump(tmpStr))
1314  if isinstance(result.tags[key]["data"], basestring):
1315  result.tags[key]["data"] = tmpStr
1316 # else:
1317 # result.tags[key]["data"][0] = tmpStr
1318  elif isinstance(result.tags[key]["data"], list):
1319  result.tags[key]["data"] = matchingResult # #tmpStr.split(innerSplitString)
1320  else:
1321  # Set not detected value if no match, changed default behavior by bgv
1322  self.logger.debug("Post-processing not matched, value replaced with None or empty!")
1323  if isinstance(result.tags[key]["data"], basestring):
1324  result.tags[key]["data"] = ''
1325  else:
1326  result.tags[key]["data"][0] = None
1327  else:
1328  self.logger.debug("Post-processing keys not found!")
1329 
1330 
1331  def processingHTMLData(self, htmlBuf, bufFormat):
1332  ret = htmlBuf
1333  if bufFormat.find("NO_SCRIPT") >= 0:
1334  ret = Utils.stripHTMLComments(htmlBuf, soup=None)
1335  if bufFormat.find("NO_META") >= 0:
1336  pass
1337  if bufFormat.find("NO_COMMENTS") >= 0:
1338  pass
1339  if bufFormat.find("ENTITIES_ENCODED") >= 0:
1340  pass
1341  return ret
1342 
1343 
1344  def getBestDatatimeData(self, data):
1345  ret = ""
1346  if isinstance(data, list):
1347  for elem in data:
1348  for ch in elem:
1349  if ch >= '0' and ch <= '9':
1350  ret = elem
1351  break
1352  if ret is not None:
1353  break
1354  if ret is None and len(data) > 0:
1355  ret = data[0]
1356  else:
1357  ret = data
1358  if isinstance(ret, basestring):
1359  ret = ret.replace('\n', '')
1360  ret = ret.replace('\t', '')
1361  else:
1362  ret = ""
1363  return ret
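# Example (hypothetical values): the first element containing a digit wins,
# otherwise the first element is taken; newlines and tabs are always removed:
#   self.getBestDatatimeData(["yesterday", "2016-02-07\n16:28"])  # -> "2016-02-0716:28"
#   self.getBestDatatimeData("7 Feb 2016\t")                      # -> "7 Feb 2016"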
1364 
1365 
1366  def newsExtraction(self):
1367  ret = []
1368 
1369  template = self.getTemplate(explicit=False)
1370 
1371  # get resource as dictionary
1372  resource_set = {}
1373  resource_set["url"] = self.input_data.url
1374  resource_set["resId"] = self.input_data.urlId
1375  resource_set["siteId"] = self.input_data.siteId
1376  resource_set["raw_html"] = self.input_data.raw_content
1377  resource = Resource(resource_set)
1378 
1379  collectResult = Result(self.config, self.input_data.urlId, self.metrics)
1380  blockedByXpathTags = []
1381 
1382  while True:
1383  self.extractor = self.getNextBestExtractor()
1384  self.logger.debug("Got best matching extractor: " + str(self.extractor))
1385  if self.extractor is None:
1386  self.logger.debug("No more extractors, exiting loop")
1387  break
1388 
1389  result = Result(self.config, self.input_data.urlId, self.metrics)
1390 
1391  if CONSTS.TAG_MEDIA in collectResult.tags.keys() and \
1392  not self.extractor.isTagNotFilled(collectResult, CONSTS.TAG_MEDIA):
1393  self.logger.debug("!!! Check collectResult. Tag 'media' already selected. Copy.")
1394  result.tags[CONSTS.TAG_MEDIA] = collectResult.tags[CONSTS.TAG_MEDIA]
1395 
1396  result.blockedByXpathTags = blockedByXpathTags
1397  self.logger.debug(">>> TAG BEGIN extractor = " + str(self.extractor))
1398  result = self.extractor.extractTags(resource, result)
1399 
1400  self.logger.debug(">>> TAG END")
1401  empty_tags = result.getEmptyTags()
1402  self.logger.debug("get list of empty tags from result: " + str(empty_tags))
1403  filled_tags = result.getFilledTags()
1404  self.logger.debug("get list of filled_tags from result: " + str(filled_tags))
1405 
1406  self.commonResultOperations(result)
1407  for tag in result.tags:
1408  if tag in template:
1409  for rule in template[tag]:
1410  self.postprocessing(result, rule, tag)
1411  if tag not in collectResult.tags or not collectResult.isTagFilled(tag):
1412  collectResult.tags[tag] = copy.deepcopy(result.tags[tag])
1413  blockedByXpathTags = result.blockedByXpathTags
1414  result.finish = time.time()
1415  ret.append(result)
1416 
1417  collectResult.blockedByXpathTags = blockedByXpathTags
1418  ret = [collectResult] + ret
1419 
1420  return ret
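# Result layout sketch (names illustrative): element 0 is the merged result
# collected over all extractors, the rest are the per-extractor results in the
# order the extractors were tried:
#   results = self.newsExtraction()
#   merged, perExtractor = results[0], results[1:]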
1421 
1422 
1423  def commonResultOperations(self, result):
1424  empty_tags = result.getEmptyTags()
1425  for localKey in EXTENDED_NEWS_TAGS:
1426  if localKey in empty_tags or (localKey in result.tags and result.isTagFilled(localKey) is False):
1427  self.extractAdditionTagsByScrapy(result, localKey, EXTENDED_NEWS_TAGS[localKey])
1428  for tagName in LINKS_NEWS_TAGS:
1429  if tagName in result.tags:
1430  if isinstance(result.tags[tagName], dict) and (result.tags[tagName]["xpath"] == "" or \
1431  result.tags[tagName]["xpath"].find("/@src") != -1 or result.tags[tagName]["xpath"].find("/@href") != -1):
1432  result.tags[tagName]["data"] = \
1433  self.dataUrlsCanonizator(result.tags[tagName]["data"], self.baseUrl)
1434 
1435  self.refineCommonText(CONSTS.TAG_CONTENT_UTF8_ENCODED, result)
1436  self.refineBadDateTags(result)
1437 
1438 
1439  def replaceLoopValue(self, buf, replaceFrom, replaceTo):
1440  localValue = buf
1441  replaceValue = localValue.replace(replaceFrom, replaceTo)
1442  while len(replaceValue) != len(buf):
1443  localValue = replaceValue
1444  replaceValue = localValue.replace(replaceFrom, replaceTo)
1445  return localValue
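# Example (hypothetical input): the replacement is repeated until the string
# stops shrinking, so whole runs collapse in one call:
#   self.replaceLoopValue("a,,,,b", ",,", ",")  # -> "a,b"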
1446 
1447 
1448  def refineCommonText(self, tagName, result):
1449  if tagName in result.tags:
1450  if isinstance(result.tags[tagName], dict):
1451  localValue = None
1452  if isinstance(result.tags[tagName]["data"], list) and len(result.tags[tagName]["data"]) > 0:
1453  localValue = result.tags[tagName]["data"][0]
1454  elif isinstance(result.tags[tagName]["data"], basestring):
1455  localValue = result.tags[tagName]["data"]
1456  if localValue is not None:
1457  replaceList = None
1458  if CONSTS.TAG_REDUCE_PROP_NAME in self.properties:
1459  try:
1460  replaceList = json.loads(self.properties[CONSTS.TAG_REDUCE_PROP_NAME])
1461  except Exception:
1462  self.logger.debug(">>> Bad processor_property json format, [" + CONSTS.TAG_REDUCE_PROP_NAME + "]")
1463  if replaceList is None:
1464  replaceList = CONTENT_REPLACEMENT_LIST # json.loads(CONTENT_REPLACEMENT)
1465 
1466  if CONSTS.TAG_REDUCE_MASK_PROP_NAME in self.properties:
1467  try:
1468  self.tagReduceMask = int(self.properties[CONSTS.TAG_REDUCE_MASK_PROP_NAME])
1469  except Exception:
1470  self.logger.error("Bad processor property '%s' value: '%s'", CONSTS.TAG_REDUCE_MASK_PROP_NAME,
1471  str(self.properties[CONSTS.TAG_REDUCE_MASK_PROP_NAME]))
1472 
1473  self.logger.debug("self.tagReduceMask = %s", str(self.tagReduceMask))
1474 # self.logger.debug("replaceList: %s", str(replaceList))
1475 
1476  replaceList = [replaceList[i] for i in xrange(len(replaceList)) if 1 << i & self.tagReduceMask]
1477 
1478 # if " " not in replaceList:
1479 # replaceList.append(" ")
1480 # self.logger.debug(">>> Repl list = " + str(replaceList))
1481  for elem in replaceList:
1482  # self.logger.debug(">>> Value before = " + localValue)
1483  localValue = Utils.replaceLoopValue(localValue, (elem * 2), elem)
1484  # self.logger.debug(">>> Value after = " + localValue)
1485  localValue = localValue.replace("\r", " ")
1486 
1487  if isinstance(result.tags[tagName]["data"], list) and len(result.tags[tagName]["data"]) > 0:
1488  result.tags[tagName]["data"][0] = localValue
1489  elif isinstance(result.tags[tagName]["data"], basestring):
1490  result.tags[tagName]["data"] = localValue
1491 
1492 
1493  def extractAdditionTagsByScrapy(self, localResult, key, tagsXpaths):
1494  self.logger.debug(">>> Start addition news extracting")
1495  extractor = self.getExtractorByName("ScrapyExtractor")
1496  if extractor is not None:
1497  sel = SelectorWrapper(text=self.input_data.raw_content)
1498  for tagsXpath in tagsXpaths:
1499  if tagsXpath is not None and tagsXpath != "":
1500  localXpath = sel.xpath(tagsXpath)
1501  localValue = Utils.innerText(localXpath, ' ', ' ', self.properties[CONSTS.TAG_MARKUP_PROP_NAME] \
1502  if CONSTS.TAG_MARKUP_PROP_NAME in self.properties else None, None,
1503  self.attrConditions)
1504  if localValue != "":
1505  extractor.addTag(localResult, key, localValue, tagsXpath)
1506  break
1507  else:
1508  self.logger.debug(">>> Cant extract tag=%s for xpath=%s" % (key, tagsXpath))
1509 
1510 
1511  def getNextBestExtractor(self):
1512  # return extractor with highest rank
1513  try:
1514  extractor = next(self.itr)
1515  except StopIteration:
1516  extractor = None
1517  return extractor
1518 
1519 
1520  # #getProcessedContent
1521  #
1522  def getProcessedContent(self, result):
1523  for elem in result:
1524  elem.get()
1525 
1526 # self.logger.info("!!! result[0].tags[\"content_encoded\"][\"data\"][0]: %s",
1527 # str(result[0].tags["content_encoded"]["data"][0]))
1528 
1529 # if "content_encoded" in result[0].tags and "data" in result[0].tags["content_encoded"] and \
1530 # len(result[0].tags["content_encoded"]["data"]) > 0:
1531 # result[0].tags["content_encoded"]["data"][0] = result[0].tags["content_encoded"]["data"][0].replace('\\n', '\n')
1532 
1533  self.processedContent = {}
1534  self.processedContent["default"] = result[0]
1535  self.processedContent["internal"] = result
1536  self.processedContent["custom"] = []
1537  self.tagsCount = result[0].tagsCount
1538  self.tagsMask = result[0].tagsMask
1539 
1540 # #TODO remove in future ## checked now
1541  if "pubdate" in result[0].tags and "data" in result[0].tags["pubdate"] and \
1542  len(result[0].tags["pubdate"]["data"]) > 0:
1543  self.pubdate = result[0].tags["pubdate"]["data"][0]
1544  self.logger.debug('>>>> Set self.pubdate = ' + str(self.pubdate))
1545  self.input_data.batch_item.urlObj.pDate = self.pubdate
1546 
1547 
1548  # #load extractors
1549  #
1550  def loadExtractors(self):
1551  try:
1552  # modules
1553  if CONSTS.MODULES_KEY in self.properties and self.algorithm_name in self.properties[CONSTS.MODULES_KEY]:
1554  modules = self.properties[CONSTS.MODULES_KEY][self.algorithm_name]
1555  else:
1556  self.logger.debug(">>> No moduler_key or algorithm_name in self.properties")
1557  modules = []
1558 
1559  self.logger.debug("Algorithm name: <%s>" % (self.algorithm_name))
1560  self.logger.debug("Modules: %s" % modules)
1561 
1562  self.extractors = []
1563  for module in modules:
1564  extractor = self.createModule(module)
1565  # Check if module was created successfully and then append it to extractors
1566  if extractor is not None:
1567  self.extractors.append(extractor)
1568 
1569  # Info show extractors loaded
1570  self.logger.debug("*******************")
1571  self.logger.debug("Loaded extractors:")
1572  for extractor in self.extractors:
1573  self.logger.debug(extractor.name)
1574  self.logger.debug("*******************")
1575 
1576  except Exception as err:
1577  ExceptionLog.handler(self.logger, err, MSG_ERROR_LOAD_EXTRACTORS)
1578  raise
1579 
1580 
1581  # #process batch
1582  # the main processing of the batch object
1583  def processBatch(self):
1584  # logger
1585  for entry in self.message_queue:
1586  self.logger.debug(entry)
1587 
1588  if self.usageModel == APP_CONSTS.APP_USAGE_MODEL_PROCESS:
1589  # read pickled batch object from stdin
1590  input_pickled_object = sys.stdin.read()
1591 
1592  try:
1593  if self.usageModel == APP_CONSTS.APP_USAGE_MODEL_PROCESS:
1594  scraper_in_data = pickle.loads(input_pickled_object)
1595  except Exception as err:
1596  ExceptionLog.handler(self.logger, err, 'pickle.loads() error:')
1597  self.logger.debug("input_pickled_object:\n" + str(input_pickled_object))
1598  self.exitCode = EXIT_FAILURE
1599  raise Exception(err)
1600 
1601  try:
1602  if self.usageModel == APP_CONSTS.APP_USAGE_MODEL_PROCESS:
1603  self.input_data = scraper_in_data
1604  if self.input_data.batch_item.urlObj is not None:
1605  urlString = self.input_data.batch_item.urlObj.url
1606  else:
1607  urlString = ""
1608  logMsg = "BatchItem.siteId=" + str(self.input_data.batch_item.siteId) + \
1609  ", BatchItem.urlId=" + str(self.input_data.batch_item.urlId) + \
1610  ", BatchItem.urlObj.url=" + urlString
1611  app.Profiler.messagesList.append(logMsg)
1612  self.logger.info("Incoming data: %s", logMsg)
1613  # self.logger.debug("self.input_data:\n%s", varDump(self.input_data))
1614  self.urlHost = self.calcUrlDomainCrc(self.input_data.url)
1615 
1616  if self.input_data.output_format is not None and "name" in self.input_data.output_format:
1617  self.outputFormat = self.input_data.output_format["name"]
1618 
1619  if self.outputFormat is None and "templates" in self.input_data.batch_item.properties["template"] and \
1620  len(self.input_data.batch_item.properties["template"]["templates"]) > 0 and \
1621  "output_format" in self.input_data.batch_item.properties["template"]["templates"][0] and \
1622  "name" in self.input_data.batch_item.properties["template"]["templates"][0]["output_format"]:
1623  self.outputFormat = self.input_data.batch_item.properties["template"]["templates"][0]["output_format"]["name"]
1624 
1625  if "TAGS_MAPPING" in self.input_data.batch_item.properties and \
1626  self.input_data.batch_item.properties["TAGS_MAPPING"] is not None:
1627  try:
1628  self.altTagsMask = json.loads(self.input_data.batch_item.properties["TAGS_MAPPING"])
1629  self.logger.debug(">>> AltTags = " + str(self.altTagsMask))
1630  except Exception as exp:
1631  self.logger.debug(">>> Bad TAGS_MAPPING properties value, err=" + str(exp))
1632  # check properties in input data
1633  try:
1634  if (self.input_data is not None) and (self.input_data.processor_properties is not None):
1635  processor_properties = self.input_data.processor_properties
1636  # self.logger.debug("Processor's properties was taken from input data: %s" % processor_properties)
1637  # self.logger.debug("Processor's properties type: %s" % str(type(processor_properties)))
1638  if not isinstance(processor_properties, dict):
1639  processor_properties = json.loads(self.input_data.processor_properties)
1640  self.logger.debug("Processor's properties was taken from input data: %s" % processor_properties)
1641  self.properties.update(processor_properties)
1642  except Exception as err:
1643  ExceptionLog.handler(self.logger, err, 'Error load properties from input data:')
1644 
1645  self.algorithm_name = self.properties[CONSTS.ALGORITHM_KEY][CONSTS.ALGORITHM_NAME_KEY]
1646  self.logger.debug("Algorithm : %s" % self.algorithm_name)
1647  if self.usageModel == APP_CONSTS.APP_USAGE_MODEL_PROCESS:
1648  Utils.storePickleOnDisk(input_pickled_object, ENV_SCRAPER_STORE_PATH, "scraper.in." + \
1649  str(self.input_data.urlId))
1650  if "metrics" in self.properties:
1651  try:
1652  self.metrics = json.loads(self.properties["metrics"])
1653  self.logger.debug(">>> Metrics loads = " + str(self.metrics))
1654  except Exception as excp:
1655  self.logger.debug(">>> Metrcis dumps exception = " + str(excp))
1656  # TODO main processing over every url from list of urls in the batch object
1657  tmp = sys.stdout
1658  sys.stdout = open("/dev/null", "wb")
1659 
1660  # initialization of scraper
1661  # load scraper's modules
1662  self.loadExtractors()
1663 
1664  self.logger.info("Process with extractor algorithm: " + str(self.algorithm_name))
1665  # SUPPORT METRICS ALGORITHM
1666  # if self.algorithm_name == CONSTS.PROCESS_ALGORITHM_METRIC:
1667  # self.processMetrics()
1668  # SUPPORT FEED_PARSER ALGORITHM
1669  if self.algorithm_name == CONSTS.PROCESS_ALGORITHM_FEED_PARSER:
1670  self.feedParserProcess()
1671  else:
1672  self.process(self.config)
1673 
1674  # send response to the stdout
1675  sys.stdout = tmp
1676 
1677  scraperResponse = ScraperResponse(self.tagsCount, self.tagsMask, self.pubdate, self.processedContent,
1678  self.errorMask)
1679 # self.logger.debug("scraperResponse:\n%s", varDump(scraperResponse))
1680 
1681  if self.usageModel == APP_CONSTS.APP_USAGE_MODEL_PROCESS:
1682  output_pickled_object = pickle.dumps(scraperResponse)
1683  Utils.storePickleOnDisk(output_pickled_object, ENV_SCRAPER_STORE_PATH,
1684  "scraper.out." + str(self.input_data.urlId))
1685  print output_pickled_object
1686  sys.stdout.flush()
1687  else:
1688  self.output_data = scraperResponse
1689 
1690  except Exception as err:
1691  ExceptionLog.handler(self.logger, err, 'Scraper process batch error:')
1692  self.exitCode = EXIT_FAILURE
1693  raise Exception('Scraper process batch error:' + str(err))
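# Protocol sketch for APP_USAGE_MODEL_PROCESS (the caller side shown here is
# purely illustrative): the parent process writes a pickled scraper input object
# to this process' stdin and reads a pickled ScraperResponse back from stdout:
#   child.stdin.write(pickle.dumps(scraper_in_data)); child.stdin.close()
#   scraper_response = pickle.loads(child.stdout.read())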
1694 
1695 
1696 
1697  # #load config from file
1698  # load from cli argument or default config file
1699  def loadConfig(self):
1700  try:
1701  self.config = ConfigParser.ConfigParser()
1702  self.config.optionxform = str
1703  if self.usageModel == APP_CONSTS.APP_USAGE_MODEL_PROCESS:
1704  if self.pargs.config:
1705  self.config.read(self.pargs.config)
1706  else:
1707  self.config.read(APP_NAME)
1708  else:
1709  self.config.read(self.configFile)
1710  except:
1711  print MSG_ERROR_LOAD_CONFIG
1712  raise
1713 
1714 
1715  # #load logging
1716  # load logging configuration (log file, log level, filters)
1717  #
1718  def loadLogConfigFile(self):
1719  try:
1720  if self.usageModel == APP_CONSTS.APP_USAGE_MODEL_PROCESS:
1721  log_conf_file = self.config.get("Application", "log")
1722  logging.config.fileConfig(log_conf_file)
1723  # Logger initialization
1724  self.logger = Utils.MPLogger().getLogger()
1725  except Exception, err:
1726  raise Exception(CONSTS.MSG_ERROR_LOAD_CONFIG + " : " + str(err))
1727 
1728 
1729  # #load mandatory options
1730  # load mandatory options
1731  #
1732  def loadOptions(self):
1733  try:
1734  class_name = self.__class__.__name__
1735  self.scraperPropFileName = self.config.get("Application", "property_file_name")
1736  # DBWrapper initialization
1737  dbTaskIniConfigFileName = self.config.get(self.__class__.__name__, "db-task_ini")
1738  config = ConfigParser.ConfigParser()
1739  config.optionxform = str
1740  readOk = config.read(dbTaskIniConfigFileName)
1741  if len(readOk) == 0:
1742  raise Exception(self.MSG_ERROR_WRONG_CONFIG_FILE_NAME + ": " + dbTaskIniConfigFileName)
1743  self.dbWrapper = DBTasksWrapper(config)
1744 
1745  # url sources rules initialization
1746  urlSourcesList = self.config.get(self.__class__.__name__, OPTION_SECTION_URL_SOURCES_RULES)
1747  if isinstance(urlSourcesList, basestring):
1748  self.urlSourcesRules = [urlSourcesRule.strip() for urlSourcesRule in urlSourcesList.split(',')]
1749  self.logger.debug("Initialization urlSourcesRules: %s", varDump(self.urlSourcesRules))
1750 
1751  self.sqliteTimeout = self.config.getint("sqlite", "timeout")
1752 
1753  self.useCurrentYear = self.config.getint("DateTimeType", "useCurrentYear")
1754 
1755  self.tagsTypes = self.config.get(class_name, OPTION_SECTION_TAGS_TYPE)
1756 
1757  if self.config.has_section(OPTION_SECTION_DATETIME_NEWS_NAMES):
1758  self.datetimeNewsNames = []
1759  for item in self.config.items(OPTION_SECTION_DATETIME_NEWS_NAMES):
1760  self.datetimeNewsNames.append(item[0])
1761  else:
1762  self.logger.debug("Config file hasn't section: " + str(OPTION_SECTION_DATETIME_NEWS_NAMES))
1763  self.datetimeNewsNames = TAGS_DATETIME_NEWS_NAMES
1764 
1765  if self.config.has_section(OPTION_SECTION_DATETIME_TEMPLATE_TYPES):
1766  self.datetimeTemplateTypes = []
1767  for item in self.config.items(OPTION_SECTION_DATETIME_TEMPLATE_TYPES):
1768  self.datetimeTemplateTypes.append(item[0])
1769  else:
1770  self.logger.debug("Config file hasn't section: " + str(OPTION_SECTION_DATETIME_TEMPLATE_TYPES))
1771  self.datetimeTemplateTypes = TAGS_DATETIME_TEMPLATE_TYPES
1772  except:
1773  print MSG_ERROR_LOAD_OPTIONS
1774  raise
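# Minimal config sketch for the options read above (the class options section is
# assumed to be named after the class, option names behind the OPTION_SECTION_*
# constants are omitted, and all values are placeholders):
#   [Application]
#   property_file_name = scraper-property.json
#   [Scraper]
#   db-task_ini = db-task.ini
#   [sqlite]
#   timeout = 30
#   [DateTimeType]
#   useCurrentYear = 1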
1775 
1776 
1777  # #loadScraperProperties
1778  # loadScraperProperties loads scraper properties from a json file
1779  def loadScraperProperties(self):
1780  if self.scraperPropFileName is not None:
1781  try:
1782  with open(self.scraperPropFileName, "rb") as fd:
1783  scraperProperties = json.loads(fd.read())
1784  self.properties = scraperProperties[self.__class__.__name__][CONSTS.PROPERTIES_KEY]
1785  except Exception as excp:
1786  self.logger.debug(">>> Some error with scraper property loads = " + str(excp))
1787 
1788 
1789  # #createModule
1790  # create extractor module instance by its class name
1791  #
1792  # @param module_name module class name which instance will be created
1793  # @return instance of the created module or None on error
1794  def createModule(self, module_name):
1795  appInst = None
1796  try:
1797 # appInst = (module_name, eval(module_name)(self.config, None, self.urlHost, self.properties))[1] # pylint: disable=W0123
1798  appInst = (module_name, eval(module_name)(self.config,
1799  None,
1800  self.urlHost,
1801  self.properties))[1]
1802  self.logger.debug("%s has been created!" % module_name)
1803  except Exception as err:
1804  ExceptionLog.handler(self.logger, err, "Can't create module %s. Error is:" % (module_name))
1805 
1806  return appInst
1807 
1808 
1809  # #getExtractorByName
1810  # get already loaded extractor instance by its class name
1811  #
1812  # @param extractorName extractor class name
1813  # @return extractor instance or None if it is not loaded
1814  def getExtractorByName(self, extractorName):
1815  for extractor in self.extractors:
1816  if extractor.__class__.__name__ == extractorName:
1817  return extractor
1818 
1819 
1820  # #getExitCode
1821  # @return application exit code
1822  #
1823  def getExitCode(self):
1824  return self.exitCode
1825 
1826  #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
1827  # FeedParser section
1828  #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
1829  # #feedParserProcess
1830  # main content processing for the feed parser algorithm
1831  #
1832  def feedParserProcess(self):
1833  self.logger.debug("URL: %s" % str(self.input_data.url))
1834  self.logger.debug("URLMd5: %s" % str(self.input_data.urlId))
1835  self.logger.debug("SiteId: %s" % str(self.input_data.siteId))
1836  if self.parseFeed():
1837  self.tagsCount = self.article.tagsCount
1838  self.tagsMask = self.article.tagsMask
1839  self.processedContent = self.article.get()
1840  # correct pubdate
1841  if CONSTS.PUBLISHED in self.article.tags:
1842  # self.pubdate = parse(self.article.tags[CONSTS.PUBLISHED]["data"]).strftime(CONSTS.COMMON_DATE_FORMAT)
1843  self.pubdate = DateTimeType.parse(self.article.tags[CONSTS.PUBLISHED]["data"], bool(self.useCurrentYear), \
1844  self.logger)
1845  else:
1846  self.logger.debug("Resource %s hasn't publish date" % str(self.article.tags[CONSTS.TAG_LINK]["data"]))
1847  else:
1848  self.logger.debug("Resource hasn't raw content. Exit.")
1849 
1850 
1851  # #createArticle
1852  # create article result object from the parsed feed entry
1853  #
1854  def createArticle(self):
1855  resid = self.entry["urlMd5"]
1856  self.article = Result(self.config, resid, self.metrics)
1857 
1858  for tag in self.entry["entry"]:
1859  data = {"extractor":"feedParser extractor", "data":"", "name":""}
1860  data["data"] = self.entry["entry"][tag]
1861  data["name"] = tag
1862  self.article.tags[tag] = data
1863 
1864  date_tags = ["published", "updated", "updated_parsed"]
1865  if len(set(self.entry["entry"].keys()).intersection(date_tags)) == 0:
1866  self.logger.debug("PUBDATE_ERROR: list of tags from rss feed: %s" % str(self.entry["entry"].keys()))
1867 
1868  if "pubdate" in self.entry and self.article.tags["pubdate"] == "":
1869  data = {"extractor":"feedParser extractor", "data":"", "name":""}
1870  data["data"] = self.entry["pubdate"]
1871  data["name"] = "pubdate"
1872  self.article.tags["pubdate"] = data
1873 
1874  # parent rss feed
1875  data = {"extractor":"feedParser extractor", "data":"", "name":""}
1876  data["data"] = self.entry["parent_rss_feed"]
1877  data["name"] = "parent_rss_feed"
1878  data["xpath"] = ""
1879  data["extractor"] = self.__class__.__name__
1880  self.article.tags["parent_rss_feed"] = data
1881 
1882  # parent rss feed urlMd5
1883  data = {"extractor":"feedParser extractor", "data":"", "name":""}
1884  data["data"] = self.entry["parent_rss_feed_urlMd5"]
1885  data["name"] = "parent_rss_feed_urlMd5"
1886  data["xpath"] = ""
1887  data["extractor"] = self.__class__.__name__
1888  self.article.tags["parent_rss_feed_urlMd5"] = data
1889 
1890  # tags count
1891  self.article.tagsCount = len(self.article.tags.keys())
1892 
1893 
1894  # #parseFeed
1895  # parse the feed entry from raw content and create the article
1896  #
1897  def parseFeed(self):
1898  ret = True
1899  try:
1900  self.entry = json.loads(self.input_data.raw_content)
1901  self.createArticle()
1902  self.putArticleToDB({"default":self.article}) # pylint: disable=E1101
1903  except ValueError, err:
1904  ExceptionLog.handler(self.logger, err, 'Bad raw content:', (self.input_data.raw_content), \
1905  {ExceptionLog.LEVEL_NAME_ERROR:ExceptionLog.LEVEL_VALUE_DEBUG})
1906  ret = False
1907 
1908  return ret
1909 
1910 
1911  # # Extract pubdate rss feed from header
1912  #
1913  # @param siteId - Site/Project ID
1914  # @param url - url string
1915  # @return pubdate from rss feed
1916  def extractPubdateRssFeed(self, siteId, url):
1917  # variable for result
1918  pubdate = None
1919  timezone = ''
1920 
1921  self.logger.debug("!!! siteId: %s, url: %s", str(siteId), str(url))
1922  headerContent = self.getHeaderContent(siteId, url)
1923  rawPubdate = self.getVariableFromHeaderContent(headerContent, CRAWLER_CONSTS.pubdateRssFeedHeaderName)
1924 
1925 # self.logger.debug('!!! getVariableFromHeaderContent: ' + str(rawPubdate))
1926  if rawPubdate is not None:
1927  try:
1928  dt = DateTimeType.parse(rawPubdate, True, self.logger, False)
1929  if dt is not None:
1930  dt, timezone = DateTimeType.split(dt)
1931  pubdate = dt.strftime("%Y-%m-%d %H:%M:%S")
1932 
1933  if timezone == '':
1934  timezone = '+0000'
1935  except Exception, err:
1936  self.logger.debug("Unsupported date format: '%s', error: %s", str(rawPubdate), str(err))
1937 
1938  return pubdate, timezone
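# Flow example (header value is hypothetical): a raw date taken from the stored
# HTTP headers would be normalized to an ISO date plus timezone, roughly:
#   "Sun, 07 Feb 2016 16:28:00 +0200"  ->  ("2016-02-07 16:28:00", "+0200")
# when no timezone survives the split, "+0000" is substituted.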
1939 
1940 
1941  # # Extract feed url of rss feed from header
1942  #
1943  # @param siteId - Site/Project ID
1944  # @param url - url string
1945  # @return feed url
1946  def extractFeedUrlRssFeed(self, siteId, url):
1947  # variable for result
1948  ret = None
1949 
1950  self.logger.debug("!!! siteId: %s, url: %s", str(siteId), str(url))
1951  headerContent = self.getHeaderContent(siteId, url)
1952  if headerContent is not None:
1953  ret = self.getVariableFromHeaderContent(headerContent, CRAWLER_CONSTS.rssFeedUrlHeaderName)
1954 
1955  self.logger.debug('!!! ret: ' + str(ret))
1956 
1957  return ret
1958 
1959 
1960  # # Extract base url from header
1961  #
1962  # @param siteId - Site/Project ID
1963  # @param url - url string
1964  # @return base url
1965  def extractBaseUrlRssFeed(self, siteId, url):
1966  # variable for result
1967  ret = None
1968 
1969  self.logger.debug("!!! siteId: %s, url: %s", str(siteId), str(url))
1970  headerContent = self.getHeaderContent(siteId, url)
1971  if headerContent is not None:
1972  ret = self.getVariableFromHeaderContent(headerContent, CRAWLER_CONSTS.baseUrlHeaderName)
1973 
1974  self.logger.debug('!!! ret: ' + str(ret))
1975 
1976  return ret
1977 
1978 
1979  # # Get header content
1980  #
1981  # @param siteId - Site/Project ID
1982  # @param url - url string
1983  # @return extracted header content
1984  def getHeaderContent(self, siteId, url):
1985  # variable for result
1986  headerContent = None
1987  urlContentObj = dc_event.URLContentRequest(siteId, url, \
1988  dc_event.URLContentRequest.CONTENT_TYPE_RAW_LAST + \
1989  dc_event.URLContentRequest.CONTENT_TYPE_RAW + \
1990  dc_event.URLContentRequest.CONTENT_TYPE_HEADERS)
1991 
1992  rawContentData = self.dbWrapper.urlContent([urlContentObj])
1993 
1994  if rawContentData is not None and len(rawContentData) > 0:
1995  if rawContentData[0].headers is not None and len(rawContentData[0].headers) > 0 and \
1996  rawContentData[0].headers[0] is not None:
1997  headerContent = rawContentData[0].headers[0].buffer
1998 
1999  return headerContent
2000 
2001 
2002  # #Get variable from header content
2003  #
2004  # @param headerContent - header content
2005  # @param name - variable name
2006  # @param makeDecode - boolean flag necessary decode
2007  # @return extracted value of incoming name (for sample 'Location')
2008  def getVariableFromHeaderContent(self, headerContent, name, makeDecode=True):
2009  # variable for result
2010  ret = None
2011 
2012  header = ''
2013  if isinstance(headerContent, basestring):
2014  if makeDecode:
2015  header = base64.b64decode(headerContent)
2016  else:
2017  header = headerContent
2018 
2019  headerList = header.split('\r\n')
2020  self.logger.debug("headerList: " + varDump(headerList))
2021 
2022  for elem in headerList:
2023  pos = elem.find(name + ':')
2024 # self.logger.debug("!!! name: '%s', pos = %s", str(name), str(pos))
2025  if pos > -1:
2026  ret = elem.replace(name + ':', '').strip()
2027  self.logger.debug("Found '" + name + "' has value: " + str(ret))
2028  break
2029 
2030  return ret
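# Example (decoded header buffer below is hypothetical):
#   "HTTP/1.1 301 Moved\r\nLocation: http://example.com/feed\r\n..."
#   self.getVariableFromHeaderContent(headerContent, "Location")  # -> "http://example.com/feed"
# headerContent is expected to be base64-encoded unless makeDecode is False.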
2031 
2032 
2033  # # change month order in pubdate if necessary
2034  #
2035  # @param rawPubdate - raw pubdate string in iso format. sample: '2016-02-07 16:28:00'
2036  # @param properties - properties from PROCESSOR_PROPERTIES
2037  # @param urlString - url string value
2038  # @return pubdate string, possibly with day and month reordered, or the raw value on failure
2039  def pubdateMonthOrder(self, rawPubdate, properties, urlString):
2040  # variables for result
2041  pubdate = rawPubdate
2042 
2043  self.logger.debug('pubdateMonthOrder() enter... rawPubdate: ' + str(rawPubdate))
2044  if CONSTS.PDATE_DAY_MONTH_ORDER_NAME in properties and isinstance(rawPubdate, basestring):
2045  propertyObj = []
2046  try:
2047  self.logger.debug('inputted ' + CONSTS.PDATE_DAY_MONTH_ORDER_NAME + ':' + \
2048  str(properties[CONSTS.PDATE_DAY_MONTH_ORDER_NAME]))
2049  propertyObj = json.loads(properties[CONSTS.PDATE_DAY_MONTH_ORDER_NAME])
2050  except Exception, err:
2051  self.logger.error("Fail loads '%s', error: %s", str(CONSTS.PDATE_DAY_MONTH_ORDER_NAME), str(err))
2052 
2053  for propertyElem in propertyObj:
2054  try:
2055  if "pattern" not in propertyElem:
2056  raise Exception('Property "pattern" not found')
2057 
2058  if "order" not in propertyElem:
2059  raise Exception('Property "order" not found')
2060 
2061  pattern = str(propertyElem["pattern"])
2062  order = int(propertyElem["order"])
2063 
2064  if re.search(pattern, urlString, re.UNICODE) is not None:
2065  self.logger.debug("Pattern '%' found in url: %s", str(pattern), str(urlString))
2066 
2067  dt = None
2068  if order == 0: # means day follows month
2069  dt = datetime.datetime.strptime(rawPubdate, "%Y-%d-%m %H:%M:%S")
2070  elif order == 1: # means month follows day
2071  dt = datetime.datetime.strptime(rawPubdate, "%Y-%m-%d %H:%M:%S")
2072  else:
2073  raise Exception("Unsupported value of 'order' == " + str(order))
2074 
2075  if dt is not None:
2076  pubdate = dt.strftime("%Y-%d-%m %H:%M:%S")
2077 
2078  except Exception, err:
2079  self.logger.error("Fail execution '%s', error: %s", str(CONSTS.PDATE_DAY_MONTH_ORDER_NAME), str(err))
2080 
2081  self.logger.debug('pubdateMonthOrder() leave... pubdate: ' + str(pubdate))
2082 
2083  return pubdate
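# Property sketch (JSON value is illustrative): PDATE_DAY_MONTH_ORDER holds a list
# of {"pattern", "order"} rules; the first pattern matching the url decides how
# the day and month of the incoming date are re-read:
#   [{"pattern": "example\\.com", "order": 1}]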
2084 
2085 
2086  # # Check media tag and append to list
2087  #
2088  # @param urlStringMedia - url string of media tag
2089  # @return allowedUrls list already accumulated allowed url strings
2090  def checkMediaTag(self, urlStringMedia):
2091  # variable for result
2092  allowedUrls = []
2093  # self.logger.debug("!!! urlStringMedia: %s", varDump(urlStringMedia))
2094  mediaUrls = self.splitMediaTagString(urlStringMedia)
2095  # self.logger.debug("!!! mediaUrls: %s", varDump(mediaUrls))
2096 
2097  for media in mediaUrls:
2098  # Check if media is binary picture
2099  if re.search(MediaLimitsHandler.BINARY_IMAGE_SEARCH_STR, media, re.UNICODE) is not None:
2100  self.logger.debug("Tag 'media' has binary picture...")
2101 
2102  if self.mediaLimitsHandler is None:
2103  allowedUrls.append(media)
2104  else:
2105  if self.mediaLimitsHandler.isAllowedLimits(urlString=media, binaryType=True):
2106  allowedUrls.append(media)
2107  else:
2108  self.logger.debug("Binary media tag has not allowed limits. Skipped...")
2109 
2110  # Check is media content valid url
2111  elif isValidURL(media):
2112  self.logger.debug("Tag 'media' has valid url: %s", str(media))
2113  if self.mediaLimitsHandler is None:
2114  allowedUrls.append(media)
2115  else:
2116  if self.mediaLimitsHandler.isAllowedLimits(media):
2117  allowedUrls.append(media)
2118  else:
2119  self.logger.debug("Media tag has not allowed limits. Skipped. Url: %s", str(media))
2120 
2121  # Invalid url of 'media' tag
2122  else:
2123  self.logger.debug("Invalid url in tag 'media'... Url: %s", str(media))
2124 
2125  return allowedUrls
2126 
2127 
2128  # # Split media tag string
2129  #
2130  # @param urlStringMedia - url string of media tag
2131  # @return list urls extracted from string of media tag
2132  def splitMediaTagString(self, urlStringMedia):
2133  # variable for result
2134  urls = []
2135  PROTOCOL_STR = 'http'
2136  DELIMITER_OLD = ','
2137  DELIMITER_NEW = '|||||'
2138  urlStringMedia = urlStringMedia.replace(DELIMITER_OLD + PROTOCOL_STR, DELIMITER_NEW + PROTOCOL_STR)
2139  # temporary string for replace in url string
2140  REPLACE_STR = 'base64|'
2141  if urlStringMedia.find(MediaLimitsHandler.BINARY_IMAGE_SEARCH_STR) > -1:
2142  urlStringMedia = urlStringMedia.replace(MediaLimitsHandler.BINARY_IMAGE_SEARCH_STR, REPLACE_STR)
2143  urls = urlStringMedia.split(DELIMITER_NEW)
2144  self.logger.debug("!!! urls before: " + varDump(urls))
2145  urls = [url.replace(REPLACE_STR, MediaLimitsHandler.BINARY_IMAGE_SEARCH_STR) for url in urls]
2146  self.logger.debug("!!! urls after: " + varDump(urls))
2147  else:
2148  urls = urlStringMedia.split(DELIMITER_NEW)
2149 
2150  return urls
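# Example (hypothetical media tag value): urls are comma-separated, but only a
# comma directly followed by "http" starts a new url, so base64 binary entries
# containing commas survive the split:
#   "http://a.io/1.jpg,http://a.io/2.png"  ->  ["http://a.io/1.jpg", "http://a.io/2.png"]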
2151 
2152 
2153  # # apply http redirect link
2154  #
2155  # @param siteId - Site/Project ID
2156  # @param url - url string
2157  # @param properties - properties
2158  # @param response - scraper result object
2159  # @return response - already modified if necessary
2160  def applyHTTPRedirectLink(self, siteId, url, properties, response):
2161  if CONSTS.HTTP_REDIRECT_LINK_NAME in properties:
2162  self.logger.debug("Found property '%s'", str(CONSTS.HTTP_REDIRECT_LINK_NAME))
2163  propertyValue = int(properties[CONSTS.HTTP_REDIRECT_LINK_NAME])
2164 
2165  self.logger.debug("siteId: %s, url: %s, propertyValue: %s", str(siteId), str(url), str(propertyValue))
2166 # self.logger.debug("response: %s", varDump(response))
2167 
2168  headerContent = self.getHeaderContent(siteId, url)
2169  urlValue = self.getVariableFromHeaderContent(headerContent, CONSTS.LOCATION_NAME)
2170  self.logger.debug("%s value: %s", str(CONSTS.LOCATION_NAME), str(urlValue))
2171 
2172  if propertyValue == CONSTS.HTTP_REDIRECT_LINK_VALUE_URL:
2173  self.logger.debug("!!! propertyValue & %s", str(CONSTS.HTTP_REDIRECT_LINK_VALUE_URL))
2174 
2175  if CONSTS.HTTP_REDIRECT_LINK_LINK_TAG_NAME in response.tags and \
2176  "data" in response.tags[CONSTS.HTTP_REDIRECT_LINK_LINK_TAG_NAME] and \
2177  len(response.tags[CONSTS.HTTP_REDIRECT_LINK_LINK_TAG_NAME]["data"]) > 0:
2178  response.tags[CONSTS.HTTP_REDIRECT_LINK_LINK_TAG_NAME]["data"][0] = url
2179 
2180  if urlValue is not None and propertyValue == CONSTS.HTTP_REDIRECT_LINK_VALUE_LOCATION:
2181  self.logger.debug("!!! propertyValue & %s", str(CONSTS.HTTP_REDIRECT_LINK_VALUE_LOCATION))
2182 
2183  if CONSTS.HTTP_REDIRECT_LINK_LINK_TAG_NAME in response.tags and \
2184  "data" in response.tags[CONSTS.HTTP_REDIRECT_LINK_LINK_TAG_NAME] and \
2185  len(response.tags[CONSTS.HTTP_REDIRECT_LINK_LINK_TAG_NAME]["data"]) > 0:
2186  response.tags[CONSTS.HTTP_REDIRECT_LINK_LINK_TAG_NAME]["data"][0] = str(urlValue)
2187 
2188  if urlValue is not None and propertyValue == CONSTS.HTTP_REDIRECT_LINK_VALUE_REDIRECT_URL:
2189  self.logger.debug("!!! propertyValue & %s", str(CONSTS.HTTP_REDIRECT_LINK_VALUE_REDIRECT_URL))
2190  self.addCustomTag(result=response, tag_name=CONSTS.REDIRECT_URL_NAME, tag_value=[str(urlValue)])
2191 
2192  if propertyValue == CONSTS.HTTP_REDIRECT_LINK_VALUE_SOURCE_URL:
2193  self.logger.debug("!!! propertyValue & %s", str(CONSTS.HTTP_REDIRECT_LINK_VALUE_SOURCE_URL))
2194 
2195  if urlValue is not None:
2196  self.addCustomTag(result=response, tag_name=CONSTS.REDIRECT_URL_NAME, tag_value=[str(urlValue)])
2197  else:
2198  self.addCustomTag(result=response, tag_name=CONSTS.REDIRECT_URL_NAME, tag_value=[url])
2199 
2200  return response
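# Behaviour sketch of the HTTP_REDIRECT_LINK_VALUE_* cases handled above: URL
# rewrites the link tag with the crawled url, LOCATION rewrites it with the
# Location header value, REDIRECT_URL adds a CONSTS.REDIRECT_URL_NAME custom tag
# from the header, and SOURCE_URL adds the same custom tag, falling back to the
# crawled url when no Location header is present.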
2201 
2202 
2203  # # get domains accord to url sources rules
2204  #
2205  # @param urlSourcesRules - url sources rules
2206  # @return domains - domains list accord to url sources rules
2207  def getDomainsForUrlSourcesRules(self, urlSourcesRules):
2208  self.logger.debug("Incoming value urlSourcesRules: %s", varDump(urlSourcesRules))
2209  # variable for result
2210  domains = []
2211 
2212  for urlSourcesRule in urlSourcesRules:
2213  if urlSourcesRule == URL_SOURCES_RULE_DATA_URL:
2214  self.logger.debug("dataUrl: %s", str(self.input_data.url))
2215  self.logger.debug("urlHost: %s", str(self.urlHost))
2216 
2217  domain = self.calcUrlDomainCrc(self.input_data.url)
2218  self.logger.debug("domain: %s", str(domain))
2219 
2220  if domain is not None:
2221  domains.append(domain)
2222 
2223  if urlSourcesRule == URL_SOURCES_RULE_REDIRECT_URL:
2224  headerContent = self.getHeaderContent(self.input_data.siteId, self.input_data.url)
2225  redirectUrl = self.getVariableFromHeaderContent(headerContent, CONSTS.LOCATION_NAME)
2226  self.logger.debug("redirectUrl: %s", str(redirectUrl))
2227 
2228  if isinstance(redirectUrl, basestring):
2229  domain = self.calcUrlDomainCrc(redirectUrl)
2230  self.logger.debug("domain: %s", str(domain))
2231 
2232  if domain is not None:
2233  domains.append(domain)
2234 
2235  if urlSourcesRule == URL_SOURCES_RULE_FEED_URL:
2236  feedUrl = self.extractFeedUrlRssFeed(self.input_data.siteId, self.input_data.url)
2237  self.logger.debug("feedUrl: %s", str(feedUrl))
2238 
2239  if isinstance(feedUrl, basestring):
2240  domain = self.calcUrlDomainCrc(feedUrl)
2241  self.logger.debug("domain: %s", str(domain))
2242 
2243  if domain is not None:
2244  domains.append(domain)
2245 
2246  if len(domains) == 0:
2247  domains.append(self.urlHost)
2248 
2249  self.logger.debug("return domains: %s", varDump(domains))
2250 
2251  return domains
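# Example (rule constants as used above, input urls hypothetical): with
# urlSourcesRules = [URL_SOURCES_RULE_DATA_URL, URL_SOURCES_RULE_REDIRECT_URL]
# the list holds the domain of the crawled url plus, when a Location header is
# present, the domain of the redirect target; self.urlHost is the fallback when
# nothing was collected.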