HCE Project Python language Distributed Tasks Manager Application, Distributed Crawler Application and client API bindings.  2.0.0-chaika
Hierarchical Cluster Engine Python language binding
UrlNormalize.py
Go to the documentation of this file.
1 # coding: utf-8
2 """
3 HCE project, Python bindings, Distributed Tasks Manager application.
4 The UrlNormalize class contains the main functionality that supports the URL_NORMALIZE site property.
5 
6 @package: app
7 @file UrlNormalize.py
8 @author Alexander Vybornyh <alexander.hce.cluster@gmail.com>
9 @link: http://hierarchical-cluster-engine.com/
10 @copyright: Copyright &copy; 2013-2017 IOIX Ukraine
11 @license: http://hierarchical-cluster-engine.com/license/
12 @since: 0.1
13 """
14 
15 import re
16 
17 import app.Consts as APP_CONSTS
18 import app.Utils as Utils
19 
20 
class UrlNormalize(object):
    # # Support of the URL_NORMALIZE site property.
    #
    # The property value is read as a dictionary of options, given either as an already
    # parsed dict or as a JSON string. Two options are used by this class:
    #   'mask'    - value converted to int and returned by getNormalizeMask()
    #   'replace' - list of {pattern: replacement} dicts applied to the url by execute()

    # Constants used in class
    PROPERTY_OPTIONS_MASK = 'mask'
    PROPERTY_OPTIONS_REPLACE = 'replace'

    # Constants of error messages
    ERROR_MSG_FAILED_REPLACE = "Operation replace failed. Error: %s"

    # Initialization
    def __init__(self):
        pass


    # # load the URL_NORMALIZE options as a dictionary
    #
    # Shared by getNormalizeMask() and execute() so both accept the same forms of the
    # property value (parsed dict or JSON string).
    #
    # @param siteProperties - site properties
    # @param log - logger instance, passed through to Utils.jsonLoadsSafe
    # @return dict of options, empty dict if the property is absent or has an unsupported type
    @staticmethod
    def _loadOptions(siteProperties, log=None):
        # variable for result
        ret = {}

        if isinstance(siteProperties, dict) and APP_CONSTS.URL_NORMALIZE in siteProperties:
            rawValue = siteProperties[APP_CONSTS.URL_NORMALIZE]
            if isinstance(rawValue, dict):
                ret = rawValue
            elif isinstance(rawValue, basestring):  # Python 2 code base: covers str and unicode
                loaded = Utils.jsonLoadsSafe(jsonString=rawValue, default={}, log=log)
                # the JSON document may be valid but not an object, keep the empty dict then
                if isinstance(loaded, dict):
                    ret = loaded

        return ret


    # # apply one replace rule to the url string
    #
    # @param pattern - regular expression pattern
    # @param repl - replacement passed to re.sub
    # @param url - url string
    # @param log - logger instance
    # @return url after replacement, or the unchanged url if re.sub failed
    @staticmethod
    def _applyReplace(pattern, repl, url, log=None):
        # Values are handed to the logger as lazy arguments instead of str(...): in Python 2
        # str() of a non-ASCII unicode url raises UnicodeEncodeError, which used to be caught
        # below and made the rule silently skipped.
        if log is not None:
            log.debug("!!! pattern: %s, url: %s", pattern, url)

        try:
            url = re.sub(pattern=pattern, repl=repl, string=url, flags=re.U | re.I)
        except Exception as err:
            # deliberately broad: a bad rule must not break processing of the url
            if log is not None:
                log.error(UrlNormalize.ERROR_MSG_FAILED_REPLACE, err)
        else:
            if log is not None:
                log.debug("!!! res url: %s", url)

        return url


    # # get normalize mask
    #
    # @param siteProperties - site properties
    # @param defaultValue - value returned when no 'mask' option is found
    # @return int value of the 'mask' option or defaultValue
    # @note int() is not guarded: a 'mask' value that cannot be converted raises
    #       ValueError or TypeError to the caller
    @staticmethod
    def getNormalizeMask(siteProperties, defaultValue=Utils.UrlNormalizator.NORM_DEFAULT):
        # variable for result
        ret = defaultValue

        options = UrlNormalize._loadOptions(siteProperties)
        if UrlNormalize.PROPERTY_OPTIONS_MASK in options:
            ret = int(options[UrlNormalize.PROPERTY_OPTIONS_MASK])

        return ret


    # # execute normalization url string use base url
    #
    # Applies the 'replace' rules of the URL_NORMALIZE property (if any) to the url in
    # list order and then passes the result to Utils.urlNormalization.
    #
    # @param siteProperties - site properties
    # @param base - base url string
    # @param url - url string
    # @param supportProtocols - support protocol list
    # @param log - logger instance
    # @return result of Utils.urlNormalization for the (possibly replaced) url
    @staticmethod
    def execute(siteProperties, base, url, supportProtocols=None, log=None):

        # check site property for exist replace rule
        if isinstance(siteProperties, dict) and APP_CONSTS.URL_NORMALIZE in siteProperties:
            if log is not None:
                # lazy arguments: no str() on a possibly non-ASCII unicode property value
                log.info("!!! siteProperties['%s']: '%s', type: %s", APP_CONSTS.URL_NORMALIZE,
                         siteProperties[APP_CONSTS.URL_NORMALIZE],
                         type(siteProperties[APP_CONSTS.URL_NORMALIZE]))

            options = UrlNormalize._loadOptions(siteProperties, log)
            replaceList = options.get(UrlNormalize.PROPERTY_OPTIONS_REPLACE, [])

            if log is not None:
                log.debug("!!! replaceList: %s", replaceList)

            # anything that is not a list of dicts is ignored
            if isinstance(replaceList, list):
                for replaceElem in replaceList:
                    if isinstance(replaceElem, dict):
                        for pattern, repl in replaceElem.items():
                            url = UrlNormalize._applyReplace(pattern, repl, url, log)

        return Utils.urlNormalization(base=base, url=url, supportProtocols=supportProtocols, log=log)
def execute(siteProperties, base, url, supportProtocols=None, log=None)
Definition: UrlNormalize.py:61
def getNormalizeMask(siteProperties, defaultValue=Utils.UrlNormalizator.NORM_DEFAULT)
get normalize mask
Definition: UrlNormalize.py:41