HCE Project Python language Distributed Tasks Manager Application, Distributed Crawler Application and client API bindings.  2.0.0-chaika
Hierarchical Cluster Engine Python language binding
Utils.py
1 '''
2 Created on Mar 28, 2014
3 
4 @package: app
5 @author: scorp
6 @link: http://hierarchical-cluster-engine.com/
7 @copyright: Copyright © 2013-2014 IOIX Ukraine
8 @license: http://hierarchical-cluster-engine.com/license/
9 @since: 0.1
10 '''
11 
12 
13 try:
14  import cPickle as pickle # pylint: disable=W0611
15 except ImportError:
16  import pickle # pylint: disable=W0611
17 
18 import json
19 import re
20 import sys
21 import os
22 import traceback
23 import types
24 import time
25 import hashlib
26 import ctypes
27 import zlib
28 import urllib
29 import urlparse
30 import collections
31 from subprocess import Popen
32 from subprocess import PIPE
33 from datetime import datetime
34 from decimal import Decimal
35 import logging
36 import threading
37 from stat import ST_MTIME
38 from HTMLParser import HTMLParser
39 import validators
40 
41 import app.Consts as APP_CONSTS
42 from app.Url import Url
43 from app.url_normalize import url_normalize
44 from app.ExtendInnerText import ExtendInnerText
45 from app.Exceptions import UrlParseException
46 
47 
48 # Logger initialization
49 logger = logging.getLogger(APP_CONSTS.LOGGER_NAME)
50 
51 lock = threading.Lock()
52 
53 # META_RE_0 = "<meta http-equiv=\"refresh\".*?>"
54 # META_RE_1 = "<!--(.*<meta http-equiv=\"refresh\".*|\n)*?-->"
55 # META_RE_2 = "url=(.*?)\""
56 # META_RE_3 = ";(http.*)\""
57 
58 META_REDIRECT = r"http-equiv\W*refresh.+?url\W+?(.+?)\""
59 
60 SEARCH_COMMENT_SIMPLE_PATTERN = r"<!--(.|\n)*?-->"
61 SEARCH_COMMENT_PATTERN = r"<![ \r\n\t]*(--([^\-]|[\r\n]|-[^\-])*--[ \r\n\t]*)>"
62 SEARCH_NOSCRIPT_PATTERN = r"<noscript>(.|\n)*?</noscript>"
63 
64 # #PropertiesValidator contains only one method - isValueIn, which searches for a value among the
65 # attributes of the class (classType param) whose names begin with "prefix"
66 #
67 class PropertiesValidator(object):
68 
69 
70  def __init__(self):
71  pass
72 
73 
74  @staticmethod
75  def isValueIn(classType, prefix, value):
76  retVal = False
77  for localValue in classType.__dict__:
78  if str(localValue).find(prefix) == 0:
79  if value == getattr(classType, localValue, None):
80  retVal = True
81  break
82  return retVal
83 
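# Usage sketch (illustrative, not part of the original module):
#
#   class Colors(object):
#     COLOR_RED = 1
#     COLOR_BLUE = 2
#
#   PropertiesValidator.isValueIn(Colors, "COLOR_", 2)   # -> True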
84 
85 # #getPath global function, finds and returns a value by path in a json string or dict document
86 # dictionary - incoming dictionary (optional)
87 # jsonString - incoming json string (optional)
88 # path - incoming path to find
89 # method returns a valid value or raises exceptions -
90 # [ValueError] - bad jsonString format
91 # [TypeError, KeyError, IndexError] - exceptions raised if the path is not found
92 # Warning!!! the path is not checked for correct syntax
93 def getPath(dictionary, jsonString, path):
94  if jsonString != None:
95  dictionary = json.loads(jsonString)
96  for i, p in re.findall(r'(\d+)|(\w+)', path):
97  dictionary = dictionary[p or int(i)]
98  return dictionary
99 
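# Usage sketch (illustrative, not part of the original module): the path mixes
# dict keys and list indexes:
#
#   getPath(None, '{"a": {"b": [{"c": 5}]}}', "a.b.0.c")   # -> 5
#   getPath({"x": [10, 20]}, None, "x.1")                  # -> 20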
100 
101 
102 # #Json serialization
103 #
104 class JsonSerializable(object):
105 
106  # #constructor
107  # initialize task's fields
108  #
109  def __init__(self):
110  pass
111 
112  @staticmethod
113  def json_serial(obj):
114  if isinstance(obj, datetime):
115  return obj.isoformat()
116  else:
117  if isinstance(obj, Decimal):
118  return str(obj)
119  else:
120  # if isinstance(obj, type.DictProxy):
121  if isinstance(obj, types.DictProxyType):
122  return dict(obj)
123  else:
124  return obj.__dict__
125 
126 
127  def toJSON(self):
128  # return json.dumps(self, default=lambda o: o.__dict__, sort_keys=True, indent=4)
129  # Support custom serialization of datetime
130  return json.dumps(self.__dict__, default=JsonSerializable.json_serial, sort_keys=True, indent=4)
131 
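# Usage sketch (illustrative): a subclass serializes its __dict__, with datetime
# and Decimal fields handled by json_serial():
#
#   class Task(JsonSerializable):
#     def __init__(self):
#       self.name = "crawl"
#       self.created = datetime.now()
#
#   print Task().toJSON()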
132 
133 class SQLExpression(object):
134  def __init__(self, stringExpression=""):
135  super(SQLExpression, self).__init__()
136  if stringExpression is None:
137  self.str = ""
138  else:
139  self.str = str(stringExpression)
140 
141  def __str__(self):
142  return self.str
143 
144 
145 
146 # #The PathMaker class
147 #
148 # This class is used to split a FS directory pattern into two parts
149 # @param string is a directory pattern
150 # @param subdirLen is a number of characters in sub-directory item
151 #
152 class PathMaker(object):
153 
154  SUBDIR_LEVEL1_LEN = 2
155  SUBDIR_CHAR = "/"
156 
157  def __init__(self, string, subdirLen=SUBDIR_LEVEL1_LEN):
158  super(PathMaker, self).__init__()
159 
160  self.string = string
161  self.subdirLen = subdirLen
162 
163  if "CONTENT_STORE_PATH" in os.environ and os.environ["CONTENT_STORE_PATH"] != "":
164  logger.debug("os.environ[CONTENT_STORE_PATH]: set to %s", os.environ["CONTENT_STORE_PATH"])
165  self.string += "/"
166  self.string += os.environ["CONTENT_STORE_PATH"]
167  else:
168  logger.debug("os.environ[CONTENT_STORE_PATH]: not set.")
169 
170  def getDir(self):
171  if len(self.string) > self.subdirLen:
172  return self.string[:self.subdirLen] + self.SUBDIR_CHAR + self.string[self.subdirLen:]
173  else:
174  return self.string
175 
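# Usage sketch (illustrative, assumes CONTENT_STORE_PATH is not set):
#
#   PathMaker("abcdef").getDir()       # -> "ab/cdef"
#   PathMaker("abcdef", 3).getDir()    # -> "abc/def"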
176 
177 # #The ConfigParamsList class
178 #
179 # This class wraps several options read from config as object attributes
180 # @param initial_data - input dictionaries
181 # @param kwargs - keyword arguments
182 #
183 class ConfigParamsList(object):
184  def __init__(self, *initial_data, **kwargs):
185  for dictionary in initial_data:
186  for key in dictionary:
187  setattr(self, key, dictionary[key])
188  for key in kwargs:
189  setattr(self, key, kwargs[key])
190 
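# Usage sketch (illustrative):
#
#   params = ConfigParamsList({"host": "localhost"}, port=2100)
#   print params.host, params.port   # -> localhost 2100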
191 
192 # #The function to safely get a config parameter and return the default value if it cannot be read
193 #
194 # @param parser - instance of ConfigParser class
195 # @param section - section name of parameter
196 # @param option - option name of parameter
197 # @param defValue - default value
198 # @return - if success extracted value, otherwise return default value
199 #
200 def getConfigParameter(parser, section, option, defValue):
201  ret = defValue
202 
203  if parser and parser.has_option(section, option):
204  try:
205  ret = parser.get(section, option, defValue)
206  except Exception:
207  ret = defValue
208 
209  return ret
210 
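# Usage sketch (illustrative; "app.ini" and its contents are hypothetical):
#
#   import ConfigParser
#   parser = ConfigParser.ConfigParser()
#   parser.read("app.ini")
#   timeout = getConfigParameter(parser, "main", "timeout", 30)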
211 
212 # #The function to get traceback information string prepared for logging
213 #
214 # This function collects traceback information and creates a string representation ready to be logged
215 # @param linesNumberMax max number of traceback lines to include in the collection, None means all
216 # @ret return string
217 #
218 def getTracebackInfo(linesNumberMax=None):
219  ret = ""
220  n = 0
221 
222  type_, value_, traceback_ = sys.exc_info()
223  stack = traceback.format_tb(traceback_)
224  del type_
225  del value_
226  for item in stack:
227  ret = ret + "\n" + (str(item))
228  n = n + 1
229  if linesNumberMax != None and n == linesNumberMax:
230  break
231 
232  return ret
233 
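# Usage sketch (illustrative): call inside an except block, after sys.exc_info()
# is populated:
#
#   try:
#     1 / 0
#   except Exception:
#     logger.error("failed: %s", getTracebackInfo(linesNumberMax=5))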
234 
235 
236 # #Globals used by tracefunc() to accumulate traceback information in the tracebackList variable
237 tracebackList = []
238 tracebackTimeQueue = []
239 tracebackIdent = False
240 tracebackIdentFiller = "-"
241 tracebackMessageCall = "call"
242 tracebackMessageExit = "exit"
243 tracebackmessageDelimiter = ":"
244 tracebackTimeMark = True
245 tracebackTimeMarkFormat = "%Y-%m-%d %H:%M:%S.%f"
246 tracebackTimeMarkDelimiter = " "
247 tracebackIncludeInternalCalls = False
248 tracebackIncludeLineNumber = True
249 tracebackIncludeLineNumberDelimiter = ":"
250 tracebackIncludeFileNumber = True
251 tracebackIncludeFileNumberDelimiter = ":"
252 tracebackFunctionNameDelimiter = ":"
253 tracebackExcludeModulePath = ["/usr/lib/", "/usr/local/lib/"]
254 tracebackExcludeFunctionName = ["varDump"]
255 tracebackExcludeFunctionNameStarts = ["<"]
256 tracebackIncludeExitCalls = True
257 tracebackRecursionlimit = 0
258 tracebackRecursionlimitErrorMsg = "RECURSION STACK LIMIT REACHED "
259 tracebackIncludeLocals = False
260 tracebackIncludeArg = False
261 tracebackIncludeLocalsPrefix = "\nLOCALS:\n"
262 tracebackIncludeArgPrefix = "\nARG:\n"
263 tracebackLogger = None
264 tracebackElapsedTimeDelimiter = ""
265 tracebackElapsedTimeFormat = "{:.6f}"
266 tracebackUnknownExceptionMsg = "Unknown exception!"
267 
268 #
269 # This function is a trace hook for sys.settrace() that logs call and return events,
270 # with optional indentation, timestamps, elapsed time, locals and argument dumps
271 # @param frame, event, arg - standard trace hook parameters
272 # @ret the trace function itself, to keep tracing nested scopes
273 def tracefunc(frame, event, arg, indent=None): # pylint: disable=W0613
274  if indent is None:
275  indent = [0]
276 
277  if event == "call" or event == "return":
278  lock.acquire()
279 
280  try:
281  if event == "call":
282  indent[0] += 2
283  if tracebackIdent:
284  idents = tracebackIdentFiller * indent[0]
285  else:
286  idents = ""
287  message = tracebackMessageCall
288  tracebackTimeQueue.append(time.time())
289  te = ""
290  elif event == "return":
291  if tracebackIdent:
292  idents = tracebackIdentFiller * indent[0]
293  else:
294  idents = ""
295  indent[0] -= 2
296  message = tracebackMessageExit
297  te = "{:.6f}".format(time.time() - tracebackTimeQueue.pop())
298 
299  if tracebackTimeMark:
300  # t = time.strftime(tracebackTimeMarkFormat)
301  t = datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]
302  else:
303  t = ""
304 
305  if tracebackIncludeLineNumber:
306  ln = str(frame.f_lineno)
307  else:
308  ln = ""
309 
310  if tracebackIncludeFileNumber:
311  fn = str(frame.f_code.co_filename)
312  else:
313  fn = ""
314 
315  excludedP = False
316  for item in tracebackExcludeModulePath:
317  if item in frame.f_code.co_filename:
318  excludedP = True
319  break
320 
321  excludedF = False
322  for item in tracebackExcludeFunctionName:
323  if frame.f_code.co_name == item:
324  excludedF = True
325  break
326 
327  excludedF2 = False
328  for item in tracebackExcludeFunctionNameStarts:
329  if frame.f_code.co_name.startswith(item):
330  excludedF2 = True
331  break
332 
333  if tracebackIncludeLocals or tracebackIncludeArg:
334  oldRL = sys.getrecursionlimit()
335  if oldRL < tracebackRecursionlimit:
336  sys.setrecursionlimit(tracebackRecursionlimit)
337  else:
338  oldRL = None
339 
340  if tracebackIncludeLocals:
341  localsDump = ""
342  try:
343  # localsDump = varDump(frame.f_locals)
344  localsDump = str(frame.f_locals)
345  localsDump = tracebackIncludeLocalsPrefix + localsDump
346  except: # pylint:disable=W0702
347  localsDump = tracebackRecursionlimitErrorMsg + str(tracebackRecursionlimit)
348  # pass
349  else:
350  localsDump = ""
351 
352  if tracebackIncludeArg:
353  argDump = ""
354  try:
355  # argDump = varDump(arg)
356  argDump = str(arg)
357  argDump = tracebackIncludeArgPrefix + argDump
358  except: # pylint:disable=W0702
359  argDump = tracebackRecursionlimitErrorMsg + str(tracebackRecursionlimit)
360  # pass
361  else:
362  argDump = ""
363 
364  if oldRL is not None:
365  sys.setrecursionlimit(oldRL)
366  else:
367  localsDump = ""
368  argDump = ""
369 
370  if (not (tracebackIncludeInternalCalls is False and frame.f_code.co_name.startswith("__"))) and\
371  (not (tracebackIncludeExitCalls is False and event == "return")) and\
372  (not excludedP) and (not excludedF) and (not excludedF2):
373  tmsg = idents + message + tracebackmessageDelimiter + \
374  fn + tracebackIncludeFileNumberDelimiter + \
375  ln + tracebackIncludeLineNumberDelimiter + \
376  frame.f_code.co_name + "()" + tracebackFunctionNameDelimiter + \
377  tracebackElapsedTimeDelimiter + te + localsDump + argDump
378  if tracebackLogger is None:
379  tracebackList.append(t + tracebackTimeMarkDelimiter + tmsg)
380  else:
381  tracebackLogger.debug("%s", tmsg)
382  if len(tracebackTimeQueue) == 0:
383  tracebackLogger.debug("%s", APP_CONSTS.LOGGER_DELIMITER_LINE)
384 
385  except Exception as e:
386  if tracebackLogger is None:
387  tracebackList.append("Exception: " + str(e))
388  else:
389  tracebackLogger.error("%s", str(e))
390  except: # pylint: disable=W0702
391  if tracebackLogger is None:
392  tracebackList.append(tracebackUnknownExceptionMsg)
393  else:
394  tracebackLogger.error("%s", tracebackUnknownExceptionMsg)
395 
396  lock.release()
397 
398  return tracefunc
399 
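# Usage sketch (illustrative): install as a global trace hook; messages are
# accumulated in tracebackList unless tracebackLogger is set:
#
#   sys.settrace(tracefunc)
#   doSomeWork()           # hypothetical traced call
#   sys.settrace(None)
#   print "\n".join(tracebackList)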
400 
401 
402 # #The function to get a printable representation of an object for debugging
403 #
404 #
405 # @param obj The object to print
406 # @param stringifyType - 0 - json, 1 - str
407 # @ret return string dump
408 #
409 def varDump(obj, stringify=True, strTypeMaxLen=256, strTypeCutSuffix='...', stringifyType=1, ignoreErrors=False,
410  objectsHash=None, depth=0, indent=2, ensure_ascii=False, maxDepth=10):
411  if objectsHash is None:
412  objectsHash = []
413  # print 'depth: ' + str(depth)
414  depth += 1
415  if depth < maxDepth:
416  newobj = obj
417  try:
418  if isinstance(obj, list):
419  newobj = []
420  for item in obj:
421  newobj.append(varDump(item, False, strTypeMaxLen, strTypeCutSuffix, stringifyType, ignoreErrors,
422  objectsHash, depth, indent, ensure_ascii, maxDepth))
423  elif isinstance(obj, tuple):
424  temp = []
425  for item in obj:
426  temp.append(varDump(item, False, strTypeMaxLen, strTypeCutSuffix, stringifyType, ignoreErrors,
427  objectsHash, depth, indent, ensure_ascii, maxDepth))
428  newobj = tuple(temp) # pylint: disable=R0204
429  elif isinstance(obj, set):
430  temp = []
431  for item in obj:
432  temp.append(str(varDump(item, False, strTypeMaxLen, strTypeCutSuffix, stringifyType, ignoreErrors,
433  objectsHash, depth, indent, ensure_ascii, maxDepth)))
434  newobj = set(temp)
435  elif isinstance(obj, dict):
436  newobj = {}
437  for key, value in obj.items():
438  newobj[str(varDump(key, False, strTypeMaxLen, strTypeCutSuffix))] = \
439  varDump(value, False, strTypeMaxLen, strTypeCutSuffix, stringifyType, ignoreErrors,
440  objectsHash, depth, indent, ensure_ascii, maxDepth)
441  # elif isinstance(obj, types.FunctionType):
442  # newobj = repr(obj)
443  elif '__dict__' in dir(obj):
444  newobj = {}
445  for k in obj.__dict__.keys():
446  # print 'k:' + str(k)
447  # print 'v:' + str(obj.__dict__[k])
448  if isinstance(obj.__dict__[k], basestring):
449  newobj[k] = obj.__dict__[k]
450  if strTypeMaxLen > 0 and len(newobj[k]) > strTypeMaxLen:
451  newobj[k] = newobj[k][:strTypeMaxLen] + strTypeCutSuffix
452  else:
453  if '__dict__' in dir(obj.__dict__[k]):
454  sobj = str(obj.__dict__[k])
455  if sobj in objectsHash:
456  newobj[k] = 'OBJECT RECURSION: ' + sobj
457  else:
458  objectsHash.append(sobj)
459  newobj[k] = varDump(obj.__dict__[k], False, strTypeMaxLen, strTypeCutSuffix, stringifyType,
460  ignoreErrors, objectsHash, depth, indent, ensure_ascii, maxDepth)
461  else:
462  newobj[k] = varDump(obj.__dict__[k], False, strTypeMaxLen, strTypeCutSuffix, stringifyType,
463  ignoreErrors, objectsHash, depth, indent, ensure_ascii, maxDepth)
464  sobj = str(obj)
465  if ' object at ' in sobj and '__type__' not in newobj:
466  newobj['__type__'] = sobj.replace(" object at ", " #").replace("__main__.", "")
467  else:
468  if stringifyType == 0:
469  try:
470  s = json.dumps(newobj, indent=indent, ensure_ascii=ensure_ascii)
471  del s
472  except Exception as err:
473  newobj = str(newobj)
474  except Exception as err:
475  if ignoreErrors:
476  newobj = ''
477  else:
478  newobj = 'General error: ' + str(err) + "\n" + getTracebackInfo()
479  else:
480  newobj = 'MAX OBJECTS EMBED DEPTH ' + str(maxDepth) + ' REACHED!'
481 
482  if stringify:
483  if stringifyType == 0:
484  try:
485  newobj = json.dumps(newobj, indent=indent, ensure_ascii=ensure_ascii)
486  except Exception as err:
487  if ignoreErrors:
488  newobj = ''
489  else:
490  newobj = 'To json error: ' + str(err)
491  else:
492  newobj = str(newobj)
493 
494  return newobj
495 
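# Usage sketch (illustrative):
#
#   class Point(object):
#     def __init__(self):
#       self.x, self.y = 1, 2
#
#   logger.debug("point: %s", varDump(Point()))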
496 
497 # pylint: disable=W0702
498 def memUsage(point=""):
499  import resource
500  # usage = resource.getrusage(resource.RUSAGE_SELF)
501  return '''%s: mem=%s mb
502  ''' % (point, resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1000)
503 
504 
505 
506 # #class UrlParser performs URL operations
507 #
508 class UrlParser(object):
509 
510 
511  def __init__(self):
512  pass
513 
514 
515  # #The function checks the protocol and domain name of the incoming url
516  #
517  # url - param, incoming url
518  # @ret boolean
519  @staticmethod
520  def isValidURL(url):
521  ret = False
522  parseUrl = urlparse.urlparse(url)
523  if parseUrl.scheme and parseUrl.netloc:  # urlparse returns empty strings, not None, for missing parts
524  ret = True
525  return ret
526 
527 
528  # #The function generates base url
529  #
530  # url - param, incoming url
531  # @ret base url string
532  @staticmethod
533  def generateDomainUrl(url):
534  ret = ""
535  parseUrl = urlparse.urlparse(url)
536  if UrlParser.isValidURL(url):
537  ret = parseUrl.scheme + "://" + parseUrl.netloc
538  else:
539  raise UrlParseException("Empty protocol or domain name")
540  return ret
541 
542 
543  # #The function extracts domain name
544  #
545  # url - param, incoming url
546  # @ret domain name string
547  @staticmethod
548  def getDomain(url):
549  auth = urlparse.urlsplit(url.strip())[1]
550  ret = (re.search('([^@]*@)?([^:]*):?(.*)', auth).groups())[1]
551  return ret
552 
553 
554 # # normalization of a url string using the base url
555 #
556 # @param base - base url string
557 # @param url - url string
558 # @param supportProtocols - support protocol list
559 # @param log - logger instance
560 # @return already normalized url string or None - in case of bad result normalization
561 def urlNormalization(base, url, supportProtocols=None, log=None):
562  # variable for result
563  res = None
564 
565  # Internal helper to prepare a path string before normalization
566  def prepareNormalization(path):
567  out = []
568  pathStr = path
569  replaceSimbolDict = {'\a':'/a',
570  '\b':'/b',
571  '\f':'/f',
572  '\n':'/n',
573  '\r':'/r',
574  '\t':'/t',
575  '\v':'/v',
576  '\\':'\\\\'}
577 
578  replaceStartSimbolDict = {'://': ''}
579 
580  for src, dest in replaceStartSimbolDict.items():
581  if pathStr.startswith(src):
582  pathStr = pathStr.replace(src, dest)
583 
584  for src, dest in replaceSimbolDict.items():
585  pathStr = pathStr.replace(src, dest)
586 
587  for i in range(0, 32):
588  pathStr = pathStr.replace(str(chr(i)), str('/%o' % i))
589 
590  for s in pathStr.split("\\"):
591  out.append(s)
592 
593  out = [elem for elem in out if elem != '']
594 
595  return '/'.join(out)
596 
597 
598  if isinstance(url, basestring):
599  # validate
600 # if Url(url).isValid():
601 # if log is not None:
602 # log.debug("return as valid url: %s", str(url))
603 # res = url
604 # else:
605  # set default result
606  resUrl = prepareNormalization(url)
607  if isinstance(base, basestring):
608  # normalization url
609  baseUrl = prepareNormalization(base)
610 
611  if baseUrl != resUrl:
612  resUrl = urlparse.urljoin(baseUrl, resUrl)
613 
614  if url != resUrl and log is not None:
615  log.debug('==== Urls different ====')
616  log.debug("base: %s", str(baseUrl))
617  log.debug("url: %s", str(url))
618  log.debug("res: %s", str(resUrl))
619 
620  res = resUrl
621 
622  # check support protocols
623  if isinstance(supportProtocols, list):
624  if log is not None:
625  log.debug("supportProtocols: %s, res: %s", str(supportProtocols), str(res))
626  # extract protocol schema from url
627  if isinstance(res, basestring):
628  v = urlparse.urlsplit(res)
629  if v.scheme not in supportProtocols:
630  if log is not None:
631  log.debug("Not support protocol: %s", str(v.scheme))
632  res = None
633 
634  if log is not None:
635  log.debug("before normalization res: %s", str(res))
636 
637  # normalization
638  if res is not None:
639  localUrls = res.split()
640  resUrls = []
641  if log is not None:
642  log.debug("localUrls: %s", str(localUrls))
643 
644  for localUrl in localUrls:
645  if localUrl != "":
646  resUrls.append(url_normalize(localUrl))
647 
648  if log is not None:
649  log.debug("resUrls: %s", varDump(resUrls))
650  res = ','.join(resUrls)
651  if log is not None:
652  log.debug("res: %s", str(res))
653 
654  return res
655 
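# Usage sketch (illustrative; exact output depends on url_normalize):
#
#   urlNormalization("http://example.com/a/", "../b")
#   # expected -> "http://example.com/b"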
656 
657 # #class UrlNormalizator performs URL normalization
658 #
659 class UrlNormalizator(object):
660 
661  NORM_NONE = 0
662  NORM_SKIP_WWW = 1
663  NORM_USE_VALIDATOR = 2
664  NORM_MAIN = 4
665  NORM_DEFAULT = NORM_MAIN
666  BAD_URL_PREFIX = "normalization-error://?"
667 
668 
669  def __init__(self):
670  pass
671 
672 
673  # #The function normalizes the incoming url according to RFC 3986
674  #
675  # url - param, incoming url
676  # @ret normalized url
677  @staticmethod
678  def normalize(url, supportProtocols=None, normMask=NORM_DEFAULT):
679  norm_url = url.strip()
680  if normMask != 0:
681  logger.debug("Non-zero normMask: %s", str(normMask))
682  # TODO: need to be replaced with default filter for collect URLs protocols check stage
683  if supportProtocols is not None and isinstance(supportProtocols, list):
684  colonPos = norm_url.find(':')
685  slashPos = norm_url.find('/')
686  if colonPos != -1 and (slashPos == -1 or slashPos > colonPos):
687  if len(norm_url.split(':')) > 1:
688  protocol = norm_url.split(':')[0]
689  if protocol not in supportProtocols:
690  try:
691  norm_url = UrlNormalizator.BAD_URL_PREFIX + urllib.quote(norm_url)
692  except Exception as err:
693  logger.debug(">>> urllib.quote error = " + str(err))
694  norm_url = UrlNormalizator.BAD_URL_PREFIX + norm_url
695 
696  if norm_url == url:
697  try:
698  stripWWW = True if normMask & UrlNormalizator.NORM_SKIP_WWW else False
699  useValidator = True if normMask & UrlNormalizator.NORM_USE_VALIDATOR else False
700  enableAdditionNormalize = True if normMask & UrlNormalizator.NORM_MAIN else False
701  norm_url = str(urinormpath(url.strip(), stripWWW, useValidator, enableAdditionNormalize))
702  # norm_url = str(canonicalize_url(url.strip()))
703  # logger.debug("norm_url: <%s>", norm_url)
704  # except urlnorm.InvalidUrl:
705  # logger.error("Normalization InvalidUrl")
706  # norm_url = ""
707  except Exception as e:
708  logger.error("Normalization error: " + str(e) + "\nURL: [" + url + "]\n" + str(getTracebackInfo()))
709 
710  return norm_url
711 
712 
713  # #The function checks the url for the bad url prefix
714  #
715  # url - param, incoming url
716  # @ret bool value is url valid or not
717  @staticmethod
718  def isNormalUrl(url):
719  return False if url.find(UrlNormalizator.BAD_URL_PREFIX) == 0 else True
720 
721 
722  # #The function encodes entities, e.g. "&" to "&amp;", if needed
723  #
724  # @param url to encode
725  # @param entities dict, keys are the non-encoded forms and values the encoded forms
726  # @return encoded url
727  @staticmethod
728  def entitiesEncode(url, entities=None):
729  ret = url
730  if entities is None:
731  entities = {"&": "&amp;"}
732 
733  for k in entities:
734  le = len(entities[k])
735  p = -1
736  while True:
737  l = len(ret)
738  p = ret.find(k, p + 1)
739  if p == -1:
740  break
741  else:
742  if (p + le - 1 > l) or ((p + le - 1 <= l) and (ret[p:p + le] != entities[k])):
743  ret = ret[:p] + entities[k] + ret[p + 1:]
744  else:
745  continue
746 
747  return ret
748 
749 
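# Usage sketch (illustrative):
#
#   UrlNormalizator.entitiesEncode("/p?a=1&b=2")   # -> "/p?a=1&amp;b=2"
#   UrlNormalizator.isNormalUrl("http://a.com/")   # -> True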
750 
751 # ENV_SCRAPER_STORE_PATH = "ENV_SCRAPER_STORE_PATH"
752 # # storePickleOnDisk
753 #
754 def storePickleOnDisk(input_pickled_object, env_path, file_name):
755  if env_path in os.environ and os.environ[env_path] != "":
756  logger.debug("os.environ[%s]: set to %s", env_path, os.environ[env_path])
757  open(os.environ[env_path] + file_name, "wb").write(input_pickled_object)
758  else:
759  logger.debug("os.environ[%s]: not set.", env_path)
760 
761 
762 
763 # This function was taken from the uritools module, as it has been removed from that module
764 def urinormpath(path, stripWWW=False, useValidator=False, enableAdditionNormalize=True): # pylint: disable=W0613
765  # Remove '.' and '..' path segments from a URI path.
766  # RFC 3986 5.2.4. Remove Dot Segments
767  ret = None
768  ret1 = None
769 
770  try:
771  if path is None or path == "":
772  ret1 = path
773  else:
774  out = []
775  for s in path.split('/'):
776  if s == '.':
777  continue
778  elif s != '..':
779  out.append(s)
780  elif out:
781  out.pop()
782  # Fix leading/trailing slashes
783  if path.startswith('/') and (not out or out[0]):
784  out.insert(0, '')
785  if path.endswith('/.') or path.endswith('/..'):
786  out.append('')
787  ret = '/'.join(out)
788 
789  if stripWWW:
790  splitPath = path.split("?")
791  if len(splitPath) > 0:
792  splitPath[0] = splitPath[0].replace("://www.", "://")
793  localPath = splitPath[0]
794  for elem in splitPath[1:]:
795  localPath += "?"
796  localPath += elem
797  else:
798  localPath = path
799 
800  if enableAdditionNormalize:
801  resultUrlDict = Url(localPath)
802  if useValidator and not Url.GetStats([resultUrlDict])[0]["valid"]:
803  raise Exception(path + " NOT VALIDATE!")
804  ret1 = Url.GetStats([resultUrlDict])[0]["canonicalized"]
805  else:
806  ret1 = localPath
807 
808  if ret is not None and ret1 is not None and ret != ret1:
809  logger.debug("--->>>> URLS DIFFERENT <<<<---")
810  logger.debug(ret)
811  logger.debug(ret1)
812  except Exception as e:
813  logger.error("Normalization error: " + str(e) + "\npath: [" + path + "]\n" + str(getTracebackInfo()))
814 
815  return ret1
816 
817 
818 
819 # #Logger file name generator
820 #
821 #
822 class LoggerFileName(object):
823 
824 
825  def __init__(self, loggerInst=None):
826  self.loggerInst = loggerInst
827 
828 
829  def getFreeProcInstanceNumber(self, sock_prefix="module", min_number=0, max_number=32):
830  ret = ""
831 
832  for i in range(min_number, max_number):
833  try:
834  import socket
835 
836  global lock_socket # pylint: disable=W0601
837  lock_socket = socket.socket(socket.AF_UNIX, socket.SOCK_DGRAM)
838 
839  # Create an abstract socket
840  lock_socket.bind('\0' + "dc_process_lock_" + sock_prefix + str(i))
841  ret = str(i)
842  break
843  except socket.error:
844  # Socket already exists
845  continue
846 
847  return ret
848 
849 
850  def findReplace(self, newFile=None, rollover=True):
851  log_file = None
852 
853  if self.loggerInst is None:
854  lg = logging.getLogger('')
855  else:
856  lg = self.loggerInst
857  for h in lg.__dict__['handlers']:
858  if h.__class__.__name__ == 'FileHandler':
859  log_file = h.baseFilename
860  if newFile is not None:
861  h.baseFilename = newFile
862  if h.stream:
863  h.stream.close()
864  h.stream = None
865  h.stream = h._open() # pylint: disable=W0212
866  if os.path.exists(newFile):
867  t = os.stat(newFile)[ST_MTIME]
868  else:
869  t = int(time.time())
870  h.rolloverAt = h.computeRollover(t)
871  if rollover and h.shouldRollover(''):
872  h.doRollover()
873  break
874  elif h.__class__.__name__ == 'TimedRotatingFileHandler':
875  log_file = h.baseFilename
876  if newFile is not None:
877  h.baseFilename = newFile
878  if h.stream:
879  h.stream.close()
880  h.stream = None
881  h.stream = h._open() # pylint: disable=W0212
882  if rollover and h.shouldRollover(''):
883  h.doRollover()
884  break
885 
886  return log_file
887 
888 
889 
890 # #Flush logger's file type handlers
891 #
892 # @param loggerObj
893 def loggerFlush(loggerObj):
894  for h in loggerObj.handlers:
895  if h.__class__.__name__ == 'FileHandler' or h.__class__.__name__ == 'TimedRotatingFileHandler':
896  h.flush()
897 
898 
899 
900 # #accumulateSubstrings accumulates a substring list into one string and returns it, adding prefixes between
901 # substrings in the resulting string. substrList and prefixes must be List[str] type with equal length
902 # @param substrList - substrings list
903 # @param prefixes - prefixes list
904 # @returns - accumulated string
905 def accumulateSubstrings(substrList, prefixes):
906  ret = ""
907  if substrList is None or not isinstance(substrList, list): # # type(substrList) is not types.ListType:
908  raise Exception(">>> error substrList is None or not List type")
909  if prefixes is None or not isinstance(prefixes, list): # # type(prefixes) is not types.ListType:
910  raise Exception(">>> error prefixes is None or not List type")
911  if len(substrList) != len(prefixes):
912  raise Exception(">>> error substrList and prefixes lists have different lengths")
913  i = 0
914  for substr in substrList:
915  if isinstance(substr, str) or isinstance(substr, unicode):
916  if isinstance(prefixes[i], str) or isinstance(prefixes[i], unicode):
917  ret += str(prefixes[i])
918  ret += str(substr)
919  i += 1
920  return ret
921 
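# Usage sketch (illustrative):
#
#   accumulateSubstrings(["a", "b"], ["<", ">"])   # -> "<a>b"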
922 
923 
924 class DataReplacementConstants(object):
925 
926  CUR_YEAR_FULL = "@CUR_YEAR_FULL"
927  CUR_YEAR_SHORT = "@CUR_YEAR_SHORT"
928  CUR_MONTH = "@CUR_MONTH"
929  CUR_DAY = "@CUR_DAY"
930 
931 
932 # #generateReplacementDict method generates and returns a replacement dict with the current datetime
933 # @returns - replacement dict with the current datetime
934 def generateReplacementDict():
935  ret = {}
936  ret[DataReplacementConstants.CUR_YEAR_FULL] = datetime.now().strftime("%Y")
937  ret[DataReplacementConstants.CUR_YEAR_SHORT] = datetime.now().strftime("%y")
938  ret[DataReplacementConstants.CUR_MONTH] = datetime.now().strftime("%m")
939  ret[DataReplacementConstants.CUR_DAY] = datetime.now().strftime("%d")
940  return ret
941 
942 
943 # #parseHost parse the root host name from url
944 # for example: the result of http://s1.y1.example.com/path/to is example.com
945 # @param url the full url
946 # @return host of the url, eg: example.com
947 def parseHost(url):
948  host = None
949  if urlparse.urlparse(url).hostname:
950  host = '.'.join(urlparse.urlparse(url).hostname.split('.')[-2:])
951  return host
952 
953 
954 # # convert date str to HTTP header format
955 # 2014-07-29 20:31:50 (GMT+8) to Tue, 29 Jul 2014 12:31:50 GMT
956 # @param date_str datetime object, e.g. 2014-07-29 20:31:50
957 # @return HTTP header formatted date str : Tue, 29 Jul 2014 12:31:50 GMT
958 def convertToHttpDateFmt(date_str):
959  stamp = time.mktime(date_str.timetuple())
960  # stamp = time.mktime(time.strptime(date_str, '%Y-%m-%d %H:%M:%S'))
961  return time.strftime('%a, %d %b %Y %H:%M:%S GMT', time.gmtime(stamp))
962 
963 
964 # # method returns siteId, substitutes to "0" value if incoming siteId is None
965 # @param siteId - ID of site
966 # @param log - logger instance for log usage
967 def autoFillSiteId(siteId, log):
968  ret = siteId
969  if siteId is None:
970  ret = "0"
971  if log is not None:
972  log.debug("set siteId = '0' from 'autoFillSiteId'")
973 
974  return ret
975 
976 
977 # # method strips incoming html from html comments
978 # @param htmlBuf incoming content in string format
979 # @param soup incoming content as bs object
980 # @param hType - type of handler
981 # @return clean html buff
982 def stripHTMLComments(htmlBuf=None, soup=None, hType=3):
983  from bs4 import Comment
984 
985  ret = htmlBuf
986  if soup is not None and hType == 0:
987  for elem in soup.findAll(text=lambda text: isinstance(text, Comment)):
988  elem.extract()
989  elif htmlBuf is not None and hType == 1:
990  ret = re.sub(SEARCH_COMMENT_PATTERN, "", htmlBuf)
991  logger.debug("!!! use pattern: %s", str(SEARCH_COMMENT_PATTERN))
992  elif htmlBuf is not None and hType == 2:
993  ret = re.sub(SEARCH_COMMENT_SIMPLE_PATTERN, "", htmlBuf)
994  logger.debug("!!! use pattern: %s", str(SEARCH_COMMENT_SIMPLE_PATTERN))
995  elif htmlBuf is not None and hType == 3:
996  ret = cutSubstringEntrances(htmlBuf, behaveMask=2)
997 
998  return ret
999 
1000 
1001 # Cuts entrances of a substring, delimited by start and finish strings, from the source buffer
1002 #
1003 # @param buf - source buffer
1004 # @param startStr - start string
1005 # @param finishStr - finish string
1006 # @param behaveMask - bit set mask defines a behavior in case of finishStr not found, 0 - do nothing,
1007 # 1 - cut up to finishDefault or end of buffer if no end of line found, 2 - cut up to end of buffer
1008 # @param greediness - max cutting number, 0 - means unlimited
1009 # @param finishDefault - default finish string used if behaveMask == 1 and finishStr is not found
1010 # @return resulted string
1011 def cutSubstringEntrances(buf, startStr='<!--', finishStr='-->', behaveMask=0, greediness=0, finishDefault='\n'):
1012  ret = buf
1013  i = 0
1014  while True:
1015  i += 1
1016  replaced = False
1017  if ret.find(startStr) != -1:
1018  p = ret.index(startStr)
1019  if p is not None:
1020  p1 = None
1021  if ret.find(finishStr, p) != -1:
1022  p1 = ret.index(finishStr, p) + len(finishStr)
1023  else:
1024  if behaveMask == 1:
1025  if ret.find(finishDefault, p) != -1:
1026  p1 = ret.index(finishDefault, p) + len(finishDefault)
1027  else:
1028  p1 = len(ret)
1029  if behaveMask == 2:
1030  p1 = len(ret)
1031  if p1 is not None:
1032  ret = ret[0:p] + ret[p1:]
1033  # print ret
1034  replaced = True
1035  if greediness > 0 and i == greediness:
1036  break
1037  if not replaced:
1038  break
1039 
1040  return ret
1041 
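# Usage sketch (illustrative):
#
#   cutSubstringEntrances("a<!--hidden-->b")            # -> "ab"
#   cutSubstringEntrances("a<!--open", behaveMask=2)    # -> "a"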
1042 
1043 # # method erases noscript blocks from the incoming html
1044 # @param htmlBuf - incoming content in string format
1045 # @return clean html buff
1046 def eraseNoScript(htmlBuf=None):
1047  ret = htmlBuf
1048  if htmlBuf is not None:
1049 # ret = re.sub(SEARCH_NOSCRIPT_PATTERN, "", htmlBuf)
1050 # logger.debug("!!! use pattern: %s", str(SEARCH_NOSCRIPT_PATTERN))
1051  ret = cutSubstringEntrances(htmlBuf, startStr='<noscript>', finishStr='</noscript>', behaveMask=2)
1052 # logger.debug("!!! htmlBuf: %s", varDump(htmlBuf, strTypeMaxLen=10))
1053 # logger.debug("!!! ret: %s", varDump(ret, strTypeMaxLen=10))
1054 
1055  return ret
1056 
1057 
1058 # Strips all HTML tags using a set of different methods
1059 # @param htmlTxt input content
1060 # @param method 0 - by BeautifulSoup, 1 - with RE 1, 2 - RE 2, 3 - HTML parser, 4 - clear Python w/o lib, 5 - xml lib
1061 # @param joinGlue - the glue string to join parts
1062 # @param regExp - the custom re for the method 1 or 2
1063 # @return cleared content
1064 def stripHTMLTags(htmlTxt, method=0, joinGlue=' ', regExp=None):
1065  ret = ''
1066 
1067  if htmlTxt is not None and htmlTxt.strip() != '':
1068  if method == 0:
1069  from bs4 import BeautifulSoup
1070  ret = joinGlue.join(BeautifulSoup(htmlTxt, 'lxml').findAll(text=True))
1071  elif method == 1 or method == 2:
1072  if regExp is not None:
1073  r = regExp
1074  else:
1075  if method == 1:
1076  r = r'<[^<]+?>'
1077  else:
1078  r = r'(<!--.*?-->|<[^>]*>)'
1079  ret = re.sub(r, joinGlue, htmlTxt)
1080  elif method == 3:
1081  ret = MLStripper() # pylint: disable=R0204
1082  ret.feed(htmlTxt)
1083  ret = ret.get_data()
1084  elif method == 4:
1085  tag = False
1086  quote = False
1087  for c in htmlTxt:
1088  if c == '<' and not quote:
1089  tag = True
1090  elif c == '>' and not quote:
1091  tag = False
1092  elif (c == '"' or c == "'") and tag:
1093  quote = not quote
1094  elif not tag:
1095  ret = ret + joinGlue + c
1096  elif method == 5:
1097  import xml.etree.ElementTree
1098  ret = joinGlue.join(xml.etree.ElementTree.fromstring(htmlTxt).itertext())
1099 
1100  if method == 1 or method == 2:
1101  import cgi
1102  ret = cgi.escape(ret)
1103  ret = re.sub('[<>]', '', ret)
1104 
1105  return ret.strip()
1106 
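# Usage sketch (illustrative; method=1 uses the simple RE and needs no bs4):
#
#   stripHTMLTags("<p>Hello <b>world</b></p>", method=1)   # -> "Hello  world"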
1107 
1108 
1109 class MLStripper(object, HTMLParser):
1110  def __init__(self, joinGlue=' '):
1111  super(MLStripper, self).__init__()
1112  self.reset()
1113  self.fed = []
1114  self.joinGlue = joinGlue
1115 
1116 
1117  def handle_data(self, d):
1118  self.fed.append(d)
1119 
1120 
1121  def get_data(self):
1122  return self.joinGlue.join(self.fed)
1123 
1124 
1125 # # function extracts text from html content, strips all tags
1126 # @param htmlBuf incoming raw html
1127 # @param stripComment is strip comment or not
1128 # @param stripScript is strip script or not
1129 # @return inner html text
1130 def innerHTMLText(htmlBuf, stripComment=True, stripScript=True):
1131  from bs4 import BeautifulSoup
1132 
1133  soup = BeautifulSoup(htmlBuf, 'lxml')
1134 
1135  if stripScript:
1136  for elem in soup.findAll(name='script'):
1137  elem.extract()
1138  if stripComment:
1139  stripHTMLComments(htmlBuf=None, soup=soup)
1140 
1141  return ''.join(soup.findAll(text=True))
1142 
1143 
1144 # # function concatenates all HTML tags from the extractor and also strips elements
1145 # @param selectorList incoming Selector
1146 # @return inner text from incoming selector
1147 def innerText(selectorList, delimiter=' ', innerDelimiter=' ', tagReplacers=None, REconditions=None,
1148  attrConditions=None, keepAttributes=None, baseUrl=None, closeVoid=None, excludeNodes=None):
1149  extendInnerText = ExtendInnerText(tagReplacers, delimiter, innerDelimiter, REconditions, attrConditions,
1150  keepAttributes, baseUrl, closeVoid, excludeNodes)
1151  extendInnerText.innerText(None, selectorList, None)
1152  ret = extendInnerText.stripHtml
1153  return ret
1154 
1155 
1156 # # function concatenates all HTML tags from the extractor and also strips elements
1157 # @param selectorList incoming Selector
1158 # @return list of inner text from incoming selector
1159 def innerTextToList(selectorList, delimiter=' ', innerDelimiter=' ', tagReplacers=None, REconditions=None,
1160  attrConditions=None, keepAttributes=None, baseUrl=None, closeVoid=None, excludeNodes=None):
1161  extendInnerText = ExtendInnerText(tagReplacers, delimiter, innerDelimiter, REconditions, attrConditions,
1162  keepAttributes, baseUrl, closeVoid, excludeNodes)
1163  extendInnerText.innerTextToList(None, selectorList, None)
1164  ret = extendInnerText.stripHtmlList
1165  return ret
1166 
1167 
1168 # # function finds the first non-empty extracted XPath from subXPathes, using subXPathPattern to create the real xpath
1169 # @param xpath - incoming root xpath
1170 # @param sel - incoming selector
1171 # @param subXPathPattern - subXPath creation pattern
1172 # @param subXPathes - list of subXPathes
1173 # @return retXPath and retXPathValue values
1174 def getFirstNotEmptySubXPath(xpath, sel, subXPathPattern, subXPathes):
1175  retXPath = None
1176  retXPathValue = None
1177  for subXPath in subXPathes:
1178  retXPath = xpath + (subXPathPattern % subXPath)
1179  try:
1180  retXPathValue = sel.xpath(retXPath).extract()
1181  except Exception as excp:
1182  logger.info(">>> Common xPath extractor exception, = " + retXPath + " excp=" + str(excp))
1183  retXPathValue = None
1184  continue
1185  if len(retXPathValue) > 0 and ''.join(retXPathValue).strip() != '':
1186  break
1187  return retXPath, retXPathValue
1188 
1189 
1190 # # function calls splitPairs for each element in incomeDict and fills the return dict
1191 # @param incomeDict incoming dict
1192 # @param splitters incoming splitters
1193 # @return result dict
1194 def getPairsDicts(incomeDict, splitters=','):
1195  ret = {}
1196  if isinstance(incomeDict, dict):
1197  for key in incomeDict:
1198  if isinstance(incomeDict[key], str) or isinstance(incomeDict[key], unicode):
1199  ret[key] = splitPairs(incomeDict[key], splitters)
1200  return ret
1201 
1202 
1203 # # function splits the incoming string by splitters into a dict of name=value pairs
1204 # @param buf incoming text buf
1205 # @param splitters incoming splitters
1206 # @return result dict
1207 def splitPairs(buf, splitters=','):
1208  ret = {}
1209  splitStr = buf.split(splitters)
1210  for elem in splitStr:
1211  localStr = elem.split('=')
1212  if isinstance(localStr, list) and len(localStr) >= 2:
1213  ret[localStr[0]] = localStr[1]
1214  return ret
1215 
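# Usage sketch (illustrative):
#
#   splitPairs("a=1,b=2")   # -> {'a': '1', 'b': '2'}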
1216 
1217 # # function checks whether str2 is a tail substring of str1
1218 # @param str1 main string
1219 # @param str2 searching tail substring
1220 # @return False or True
1221 def isTailSubstr(str1, str2):
1222  ret = False
1223  if str1.find(str2) > 0 and ((len(str1) - str1.find(str2)) == len(str2)):
1224  ret = True
1225  return ret
1226 
1227 
1228 # # function repeats a string replacement until the result stops changing
1229 # @param buf incoming text buf
1230 # @param replaceFrom substring for replacement from
1231 # @param replaceTo substring for replacement to
1232 # @return replacement string
1233 def replaceLoopValue(buf, replaceFrom, replaceTo):
1234  localValue = buf
1235  replaceValue = localValue.replace(replaceFrom, replaceTo)
1236  while len(replaceValue) != len(localValue):
1237  localValue = replaceValue
1238  replaceValue = localValue.replace(replaceFrom, replaceTo)
1239  return localValue
1240 
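# Usage sketch (illustrative): the replacement is repeated until stable:
#
#   replaceLoopValue("aaaab", "aa", "a")   # -> "ab"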
1241 
1242 # # # function extract html redirect link from meta
1243 # # @param utf8Buff incoming buff of html page
1244 # # @param log - logger instance
1245 # # @return html redirect link
1246 # def extractHTMLRedirectFromMeta(utf8Buff, log):
1247 # # variable for result
1248 # ret = None
1249 #
1250 # localREList = re.findall(META_RE_0, utf8Buff, re.I)
1251 # if len(localREList) > 0:
1252 # log.debug("!!! Found pattern: '%s' - HTML redirect is exist...", str(META_RE_0))
1253 # match = re.search(META_RE_1, utf8Buff, re.I | re.U)
1254 # if match is not None:
1255 # log.debug("!!! Found pattern: '%s' - HTML redirect blocked by comment...", str(META_RE_1))
1256 # else:
1257 # for bodyStr in localREList:
1258 # match = re.search(META_RE_2, bodyStr, re.I | re.U)
1259 # log.debug("!!! bodyStr: %s, pattern: '%s', match: %s", str(bodyStr), str(META_RE_2), varDump(match))
1260 # if match is not None:
1261 # ret = match.group(1)
1262 # else:
1263 # match = re.search(META_RE_3, bodyStr, re.I | re.U)
1264 # log.debug("!!! bodyStr: %s, pattern: '%s', match: %s", str(bodyStr), str(META_RE_3), varDump(match))
1265 # if match is not None:
1266 # ret = match.group(1)
1267 #
1268 # if ret is not None:
1269 # break
1270 #
1271 # return ret
1272 
1273 
1274 # # extract html redirect link from meta
1275 # @param buff - raw content of html page
1276 # @param log - logger instance
1277 # @return - html redirect link
1278 def getHTMLRedirectUrl(buff, log):
1279  # variable for result
1280  ret = None
1281  resUrl = ''
1282 
1283  match = re.search(META_REDIRECT, stripHTMLComments(buff), re.I | re.U)
1284  if match is not None:
1285  resUrl = match.groups()[0].strip()
1286 
1287  log.debug('resUrl: ' + str(resUrl))
1288  urlObj = Url(resUrl)
1289  if urlObj.isValid():
1290  ret = resUrl
1291 
1292  log.debug('ret: ' + str(ret))
1293 
1294  return ret
1295 
1296 
1297 # # function parses the incoming email address
1298 # @param href - incoming email href
1299 # @param onlyName - extract email names instead of full addresses
1300 # @param defaultSeparator - default separator between email elements
1301 # @return parsed email
1302 def emailParse(href, onlyName=False, defaultSeparator=' '): # pylint: disable=W0613
1303  ret = href
1304  splitHref = href.split('?')
1305  if splitHref is not None and len(splitHref) > 0:
1306  adresses = splitHref[0]
1307  adresses = adresses.split(',')
1308  if onlyName:
1309  names = []
1310  for adress in adresses:
1311  adress = adress.split('@')
1312  if adress is not None and len(adress) > 0:
1313  names.append(adress[0])
1314  adresses = names
1315  ret = ''
1316  for adress in adresses:
1317  ret += adress
1318  ret += ' '
1319  ret = ret.strip()
1320  return ret
1321 
1322 
1323 
1324 # #Multi process logger
1325 #
1326 class MPLogger(object):
1327 
1328  ROTATED_ATTRIBUTE_NAME = '__rotated'
1329 
1330  # #initialization
1331  #
1332  def __init__(self):
1333  super(MPLogger, self).__init__()
1334  self.fnameOld = ''
1335 
1336 
1337 # #Get the regular logger, rotating its log files as needed
1338  #
1339  # @param loggerName - name of logger
1340  # @return - logger instance
1341  def getLogger(self, loggerName=None, fileNameSuffix='', restore=False):
1342  if loggerName is None:
1343  ln = APP_CONSTS.LOGGER_NAME
1344  else:
1345  ln = loggerName
1346 
1347  if fileNameSuffix != '' or restore is True:
1348  rollover = False
1349  else:
1350  rollover = True
1351 
1352  try:
1353  # Get regular logger
1354  lg = logging.getLogger(ln)
1355  # Replace logger name for processes instances
1356  lfn = LoggerFileName(lg)
1357  fname = lfn.findReplace()
1358  if fname is not None and fname != '':
1359  if restore is False:
1360  if not hasattr(lg, self.ROTATED_ATTRIBUTE_NAME) or not getattr(lg, self.ROTATED_ATTRIBUTE_NAME):
1361  self.fnameOld = fname
1362  fname += fileNameSuffix
1363  pin = lfn.getFreeProcInstanceNumber(os.path.basename(fname))
1364  if pin != '' and ((pin != '0' and fileNameSuffix == '') or (pin == '0' and fileNameSuffix != '')):
1365  pin = '.' + pin + '.log'
1366  fname = lfn.findReplace(fname + pin, rollover=rollover)
1367  setattr(lg, self.ROTATED_ATTRIBUTE_NAME, True)
1368  lg = logging.getLogger(ln)
1369  setattr(lg, self.ROTATED_ATTRIBUTE_NAME, True)
1370  else:
1371  if self.fnameOld != '':
1372  fname = lfn.findReplace(self.fnameOld, rollover=rollover)
1373  lg = logging.getLogger(ln)
1374  return lg
1375  except Exception, err:
1376  raise Exception('Logger initialization error:' + str(err) + "\n" + getTracebackInfo())
1377 
1378 
1379 def strToUnicode(inputStr):
1380  ret = inputStr
1381 
1382  if isinstance(inputStr, str):
1383  ret = inputStr.decode('utf-8')
1384 
1385  return ret
1386 
1387 
1388 # Splits a string, removes duplicated pieces and joins it back
1389 # @param inStr - input string
1390 # @param delimiter - splitter delimiter
1391 # @param joingGlue - optional glue string to join with, if None or omitted - the delimiter is used
1392 # @param trimMode - pieces trim mode: 0 - not trimmed, 1 - trimmed left, 2 - trimmed right, 3 - trimmed both
1393 # @return string with duplicated pieces removed
1394 def removeDuplicated(inStr, delimiter="\n", joingGlue=None, trimMode=1, skipEmpty=False):
1395  ret = inStr.split(delimiter)
1396 
1397  if joingGlue is None:
1398  glue = delimiter
1399  else:
1400  glue = joingGlue
1401 
1402  prev = None
1403  new = []
1404  for item in ret:
1405  if trimMode > 0:
1406  if trimMode == 1:
1407  item = item.lstrip()
1408  elif trimMode == 2:
1409  item = item.rstrip()
1410  else:
1411  item = item.strip()
1412  if skipEmpty and item == '':
1413  continue
1414  if item != prev:
1415  new.append(item)
1416  prev = item
1417  ret = new
1418 
1419  return glue.join(ret).strip()
1420 
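# Usage sketch (illustrative): consecutive duplicates are collapsed:
#
#   removeDuplicated("a\na\nb\nb\na", delimiter="\n")   # -> "a\nb\na"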
1421 
1422 # Checks whether the input content possibly contains CSS markup, e.g. in-line STYLE tag innerHTML
1423 #
1424 # @param content - to analyse
1425 # @return zero if presence of the CSS markup is not detected or number of the detected fragments
1426 def getContentCSSMarkupEntrancesNumber(content):
1427  return len(re.findall(r'\{.+?\}', content))
1428 
1429 
1430 # Class ExceptionLog for logging of the exception common way
1431 class ExceptionLog(object):
1432 
1433  # #Constans used in class
1434  LEVEL_NAME_ERROR = 'error'
1435  LEVEL_NAME_INFO = 'info'
1436  LEVEL_NAME_DEBUG = 'debug'
1437 
1438  LEVEL_VALUE_ERROR = logging.ERROR
1439  LEVEL_VALUE_INFO = logging.INFO
1440  LEVEL_VALUE_DEBUG = logging.DEBUG
1441 
1442  # #Constructor
1443  #
1444  # @param log - logger instance
1445  # @param error - Exception inherited object instance
1446  # @param message - error message string
1447  # @param objects - objects tuple to dump
1448  def __init__(self, log, error, message, objects):
1449  super(ExceptionLog, self).__init__()
1450  self.logger = log
1451  self.error = error
1452  self.message = message
1453  self.objects = objects
1454 
1455 
1456  # # Static handler for logging of the exception
1457  #
1458  # @param log - logger instance
1459  # @param error - Exception inherited object instance
1460  # @param message - error message string
1461  # @param objects - objects tuple to dump
1462  @staticmethod
1463  def handler(log, error, message, objects=(), levels={}): # pylint: disable=W0102
1464  # dictionary with default values
1465  levelsDict = { \
1466  ExceptionLog.LEVEL_NAME_ERROR:ExceptionLog.LEVEL_VALUE_ERROR, \
1467  ExceptionLog.LEVEL_NAME_INFO:ExceptionLog.LEVEL_VALUE_INFO, \
1468  ExceptionLog.LEVEL_NAME_DEBUG:ExceptionLog.LEVEL_VALUE_DEBUG \
1469  }
1470 
1471  # filling levelDict necessary log level values
1472  for name, level in levels.items():
1473  if levelsDict.has_key(name):
1474  levelsDict[name] = level
1475 
1476  errorMsg = ''
1477  try:
1478  if isinstance(str(error), str) or isinstance(str(error), unicode):
1479  errorMsg = str(error)
1480  except Exception, err:
1481  log.log(levelsDict[ExceptionLog.LEVEL_NAME_DEBUG], 'Try make str(err) return error: ' + str(err))
1482 
1483  # Log the error message and Exception object with the ERROR level
1484  log.log(levelsDict[ExceptionLog.LEVEL_NAME_ERROR], message + ' ' + errorMsg)
1485 
1486  # Log the traceback with INFO level.
1487  log.log(levelsDict[ExceptionLog.LEVEL_NAME_INFO], getTracebackInfo())
1488 
1489  # Log the objects dumps with DEBUG level.
1490  if isinstance(objects, tuple):
1491  for obj in objects:
1492  log.log(levelsDict[ExceptionLog.LEVEL_NAME_DEBUG], varDump(obj))
1493 
1494 
1495  # # Dump log of the exception
1496  #
1497  # @param - None
1498  # @return - None
1499  def dump(self):
1500  ExceptionLog.handler(self.logger, self.error, self.message, self.objects)
1501 
1502 
1503 class InterruptableThread(threading.Thread):
1504  ERROR_CODE_OK = 0
1505  ERROR_CODE_GENERAL_EXCEPTION = 1
1506  ERROR_CODE_APPLIED_EXCEPTION = 2
1507 
1508  def __init__(self, func, args, kwargs, default, log):
1509  threading.Thread.__init__(self)
1510  self.function = func
1511  self.args = args
1512  self.kwargs = kwargs
1513  self.result = default
1514  self.logger = log
1515  self.errorCode = self.ERROR_CODE_OK  # default error code
1516  self.errorMessage = ''
1517  self.errorException = Exception('Dummy exception')
1518  def run(self):
1519  try:
1520  self.result = self.function(*self.args, **self.kwargs)
1521  except Exception, err:
1522  if self.logger is not None:
1523  self.logger.error("Error of execution of thread class InterruptableThread(): %s\nargs: %s",
1524  str(err), str(self.args))
1525  self.errorCode = self.ERROR_CODE_APPLIED_EXCEPTION
1526  self.errorMessage = str(err)
1527  self.errorException = err
1528  raise err
1529  except:
1530  self.errorCode = self.ERROR_CODE_GENERAL_EXCEPTION
1531  self.errorMessage = 'Undefined error of execution of thread class InterruptableThread(), args: ' + str(self.args)
1532  if self.logger is not None:
1533  self.logger.error(self.errorMessage)
1534 
1535 
1536 # #The function to execute another function in a thread with limited time to run
1537 #
1538 # @param func to execute
1539 # @param args
1540 # @param kwargs
1541 # @param timeout - limit of execution time floating point, sec
1542 # @param default - value to return if execution time limit reached
1543 # @ret return value or default value
1544 def executeWithTimeout(func, args=None, kwargs=None, timeout=1, default=None, log=None):
1545  if args is None:
1546  args = ()
1547  # import threading
1548  if kwargs is None:
1549  kwargs = {}
1550 
1551  it = InterruptableThread(func, args, kwargs, default, log)
1552  it.start()
1553  it.join(timeout)
1554  if it.isAlive():
1555  try:
1556  it._Thread__stop() # pylint: disable=W0212
1557  time.sleep(1)
1558  except:
1559  if log is not None:
1560  log.error("Can not stop thread with _Thread__stop()!")
1561  if it.isAlive():
1562  try:
1563  it.__stop() # pylint: disable=W0212
1564  time.sleep(1)
1565  except:
1566  if log is not None:
1567  log.error("Can not stop thread with __stop()!")
1568  if it.isAlive():
1569  try:
1570  it._Thread__delete() # pylint: disable=W0212
1571  time.sleep(1)
1572  except:
1573  if log is not None:
1574  log.error("Can not stop thread with _Thread__delete()!")
1575 
1576  if it.errorCode == it.ERROR_CODE_APPLIED_EXCEPTION:
1577  if log is not None:
1578  log.error("Error1 code %s, exception: %s", str(it.errorCode), str(it.errorException))
1579  raise it.errorException
1580  return default
1581  else:
1582  if it.errorCode == it.ERROR_CODE_APPLIED_EXCEPTION:
1583  if log is not None:
1584  log.error("Error2 code %s, exception: %s", str(it.errorCode), str(it.errorException))
1585  raise it.errorException
1586  return it.result
1587 
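# Usage sketch (illustrative; slowFetch is a hypothetical long-running function):
#
#   def slowFetch(url):
#     time.sleep(10)
#     return url
#
#   res = executeWithTimeout(slowFetch, args=("http://a.com",), timeout=2,
#                            default="timed out", log=logger)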
1588 
1589 # #Load file data by protocoled reference
1590 #
1591 # @param fileReference - reference string (e.g. file://...); initString - default value returned unchanged
1592 # @param protocolPrefix
1593 # @param loggerObj
1594 # @return initString unchanged, value from file loaded by link or empty string if load error
1595 def loadFromFileByReference(fileReference, initString=None, protocolPrefix='file://', loggerObj=None):
1596  ret = initString
1597 
1598  if fileReference.startswith(protocolPrefix):
1599  try:
1600  f = fileReference[len(protocolPrefix):]
1601  ret = readFile(f)
1602  except Exception, err:
1603  if loggerObj is not None:
1604  loggerObj.error("Error load from file `%s` by reference: %s", f, str(err))
1605 
1606  return ret
1607 
1608 
1609 # #Read file
1610 #
1611 # @param inFile - name of file to read
1612 # @param decodeUTF8 - decode utf8 or not after read from file
1613 # @return - the buffer
1614 def readFile(inFile, decodeUTF8=True):
1615  with open(inFile, 'r') as f:
1616  ret = f.read()
1617 
1618  if decodeUTF8:
1619  ret = ret.decode('utf8')
1620 
1621  return ret
1622 
1623 
1624 # #Escape string value
1625 #
1626 # @param string
1627 # @return escaped string
1628 def escape(string):
1629  return string.replace("\\", "\\\\").replace('"', '\\\"').replace("'", "\\\'").replace("\n", "\\n").\
1630  replace("\r", "\\r").replace("\0", "\\0")
1631 
1632 
1633 # #Validate URL string
1634 #
1635 # @param url - url string
1636 # @return True if valid or otherwise False
1637 def isValidURL(url):
1638  return False if isinstance(validators.url(url), validators.ValidationFailure) else True
1639 
1640 
1641 # #Get some hash of a string limited bit size
1642 #
1643 # @param strBuf - string buffer
1644 # @param binSize - binary value size bits, supported values 32, 64 and 128
1645 # @param digestType - 0 - md5, 1 - sha1
1646 # @param fixedMode - 0 digests play, 1 - crc32 to uint32, 2 - crc32 to ulong
1647 # @param valLimit - limit of a value useful to fix a DB type size (MySQL 8 bytes BIGINT(20))
1648 # @return True if valid or otherwise False
1649 def getHash(strBuf, binSize=32, digestType=0, fixedMode=0, valLimit=18446744073709552000L):
1650 
1651  if fixedMode == 0:
1652  if digestType == 0:
1653  d = hashlib.md5(strBuf)
1654  else:
1655  d = hashlib.sha1(strBuf) # pylint: disable=R0204
1656  if binSize == 32:
1657  s = 8
1658  elif binSize == 64:
1659  s = 16
1660  else:
1661  s = 32
1662  h = d.hexdigest()
1663  v = int(h[:s], 16)
1664  if v > valLimit:
1665  for i in xrange(1, s - 1):
1666  v = int(h[:s - i], 16)
1667  if v < valLimit:
1668  break
1669  elif fixedMode == 1:
1670  v = ctypes.c_uint32(zlib.crc32(strBuf, int(time.time()))).value
1671  else:
1672  v = ctypes.c_ulong(zlib.crc32(strBuf, int(time.time()))).value
1673 
1674  return v
1675 
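# Usage sketch (illustrative): a 64-bit md5-based integer hash, e.g. for DB keys:
#
#   h = getHash("http://example.com/page", binSize=64)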
1676 
1677 # # Convert string to float
1678 # @param val - input value as string
1679 # @param defaultValue - default value for result
1680 # @param log - logger instance
1681 # @param positivePrefixes - positive prefixes dictionary
1682 # @return result float value
1683 def strToFloat(val, defaultValue=0.0, log=None, positivePrefixes=None):
1684  # variable for result
1685  ret = defaultValue
1686  if positivePrefixes is None:
1687  posPrefixes = {'K':'1E3', 'M':'1E6', 'G':'1E9', 'T':'1E12', 'P':'1E15', 'E':'1E18', 'Z':'1E21', 'Y':'1E24'}
1688  else:
1689  posPrefixes = positivePrefixes
1690 
1691  try:
1692  val = val.upper()
1693  if val[-1] in posPrefixes.keys():
1694  v = Decimal(val[:-1])
1695  ret = float(v * Decimal(posPrefixes[val[-1]]))
1696  else:
1697  ret = float(val)
1698  except Exception, err:
1699  if log is not None:
1700  log.debug(str(err))
1701 
1702  return ret
1703 
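# Usage sketch (illustrative):
#
#   strToFloat("1.5K")                       # -> 1500.0
#   strToFloat("oops", defaultValue=-1.0)    # -> -1.0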
1704 
1705 # #Convert string to proxy tuple (proxy_type, proxy_host, proxy_port, proxy_user, proxy_passwd)
1706 #
1707 # @param proxyString - proxy string
1708 # @param log - logger instance
1709 # @return proxy tuple if success or None otherwise
1710 def strToProxy(proxyString, log=None, defaultProxyType='http'):
1711  # variables for result
1712  ret = None
1713  proxy_type = proxy_host = proxy_port = proxy_user = proxy_passwd = None
1714  if isinstance(proxyString, basestring) and proxyString != "":
1715  try:
1716  pattern = '(.*)://(.*):(.*)@(.*):(.*)'
1717  match = re.search(pattern, proxyString, re.I + re.U)
1718  if match is not None:
1719  proxy_type, proxy_user, proxy_passwd, proxy_host, proxy_port = match.groups()
1720 
1721  else:
1722  pattern = '(.*)://(.*):(.*)'
1723  match = re.search(pattern, proxyString, re.I + re.U)
1724  if match is not None:
1725  proxy_type, proxy_host, proxy_port = match.groups()
1726  else:
1727  pattern = '(.*):(.*)'
1728  match = re.search(pattern, proxyString, re.I + re.U)
1729  if match is not None:
1730  proxy_host, proxy_port = match.groups()
1731  proxy_type = defaultProxyType
1732 
1733  ret = (proxy_type, proxy_host, proxy_port, proxy_user, proxy_passwd)
1734  except Exception, err:
1735  if log is not None:
1736  log.error("Error: %s", str(err))
1737 
1738  return ret
1739 
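# Usage sketch (illustrative):
#
#   strToProxy("http://user:pw@10.0.0.1:3128")
#   # -> ('http', '10.0.0.1', '3128', 'user', 'pw')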
1740 
1741 # # execute command line command
1742 #
1743 # @param cmd - command line string
1744 # @param inputStream - input stream to popen
1745 # @param log - logger instance
1746 # @return result named tuple with support names: 'stdout', 'stderr', 'exitCode'
1747 def executeCommand(cmd, inputStream='', log=None):
1748  # variables for result tuple
1749  output = ''
1750  errMsg = ''
1751  exitCode = APP_CONSTS.EXIT_FAILURE
1752  try:
1753  if log is not None:
1754  log.debug("Popen: %s", str(cmd))
1755 
1756  process = Popen(cmd, stdout=PIPE, stdin=PIPE, stderr=PIPE, shell=True, close_fds=True, executable='/bin/bash')
1757  if log is not None:
1758  log.debug("len(inputStream)= %s", str(len(inputStream)))
1759 
1760  (output, errMsg) = process.communicate(input=inputStream)
1761  exitCode = process.wait()
1762 
1763  if log is not None:
1764  log.debug("Process response has exitCode = %s, stdout len = %s, stderr: %s",
1765  str(exitCode), str(len(output)), str(errMsg))
1766 
1767  except Exception, err:
1768  if log is not None:
1769  log.error("Popen execution error: %s", str(err))
1770 
1771  # make result tuple
1772  PopenResult = collections.namedtuple('PopenResult', ['stdout', 'stderr', 'exitCode'])
1773  popenResult = PopenResult(stdout=output, stderr=errMsg, exitCode=exitCode)
1774 
1775  return popenResult
1776 
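A usage sketch with an illustrative shell command (import path as assumed above); the named tuple keeps the call site readable:

from app.Utils import executeCommand

res = executeCommand("wc -c", inputStream="hello")
if res.exitCode == 0:
    print res.stdout   # "5" plus a trailing newline
else:
    print res.stderr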
1777 
1778 # # Parse json and return the parsed object if okay or the default value if not
1779 #
1780 # @param jsonString - json string (or already parsed object) to parse
# @param default - value returned on parse failure
1781 # @param log - logger instance
1782 # @return resulting object or the default value
1783 def jsonLoadsSafe(jsonString, default=None, log=None):
1784  # variable for result
1785  ret = default
1786  try:
1787  if jsonString is not None and jsonString != '':
1788  if isinstance(jsonString, basestring):
1789  ret = json.loads(jsonString)
1790  else:
1791  ret = jsonString
1792  if log is not None:
1793  log.debug("Input object type is: %s", type(jsonString))
1794  except Exception, err:
1795  if log is not None:
1796  log.error("Error parsing json: %s; source string:\n%s", str(err), jsonString)
1797 
1798  return ret
1799 
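Illustrative calls (inputs are made up; import path as assumed above) showing the three branches:

from app.Utils import jsonLoadsSafe

jsonLoadsSafe('{"a": 1}')               # {u'a': 1}
jsonLoadsSafe('not json', default={})   # {} - parse error falls back to default
jsonLoadsSafe({'a': 1})                 # {'a': 1} - non-string passes through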
1800 
1801 # simple regexp match check for a search word definition
1802 #
1803 # @param word - word to search for; a leading '/' marks it as a regular expression
1804 # @param buff - buffer to search in
1805 # @param log - logger instance
1806 # @return True if a match exists or False otherwise
1807 def reMatch(word, buff, log=None):
1808  # variable for result
1809  ret = False
1810  if isinstance(word, basestring) and isinstance(buff, basestring):
1811  try:
1812  if word.startswith(u'/'):
1813  word = word[1:]
1814  if re.search(pattern=word, string=buff, flags=re.U + re.I + re.M) is not None:
1815  ret = True
1816  else:
1817  ret = (word.upper() == buff.upper())
1818 
1819  except Exception, err:
1820  if log is not None:
1821  log.error("Expression: %s, Error: %s", str(word), str(err))
1822 
1823  return ret
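Illustrative calls (import path as assumed above): a leading '/' switches the word into regexp mode, otherwise a case-insensitive exact comparison is used:

from app.Utils import reMatch

reMatch(u"/^error", u"Error: disk full")  # True, regexp mode
reMatch(u"status", u"STATUS")             # True, case-insensitive compare
reMatch(u"status", u"status code")        # False, exact compare only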