HCE Project Python language Distributed Tasks Manager Application, Distributed Crawler Application and client API bindings.  2.0.0-chaika
Hierarchical Cluster Engine Python language binding
app.Utils.UrlNormalizator Class Reference
Inheritance diagram for app.Utils.UrlNormalizator:
Collaboration diagram for app.Utils.UrlNormalizator:

Public Member Functions

def __init__ (self)
 

Static Public Member Functions

def normalize (url, supportProtocols=None, normMask=NORM_DEFAULT)
 
def isNormalUrl (url)
 
def entitiesEncode (url, entities=None)
 

Static Public Attributes

int NORM_NONE = 0
 
int NORM_SKIP_WWW = 1
 
int NORM_USE_VALIDATOR = 2
 
int NORM_MAIN = 4
 
int NORM_DEFAULT = NORM_MAIN
 
string BAD_URL_PREFIX = "normalization-error://?"
 

Detailed Description

Definition at line 659 of file Utils.py.

Constructor & Destructor Documentation

◆ __init__()

def app.Utils.UrlNormalizator.__init__ (   self)

Definition at line 669 of file Utils.py.

669  def __init__(self):
670  pass
671 
672 
def __init__(self)
constructor
Definition: UIDGenerator.py:19

Member Function Documentation

◆ entitiesEncode()

def app.Utils.UrlNormalizator.entitiesEncode (   url,
  entities = None 
)
static

Definition at line 728 of file Utils.py.

728  def entitiesEncode(url, entities=None):
729  ret = url
730  if entities is None:
731  entities = {"&": "&amp;"}
732 
733  for k in entities:
734  le = len(entities[k])
735  p = -1
736  while True:
737  l = len(ret)
738  p = ret.find(k, p + 1)
739  if p == -1:
740  break
741  else:
742  if (p + le - 1 > l) or ((p + le - 1 <= l) and (ret[p:p + le] != entities[k])):
743  ret = ret[:p] + entities[k] + ret[p + 1:]
744  else:
745  continue
746 
747  return ret
748 
749 
750 
751 # ENV_SCRAPER_STORE_PATH = "ENV_SCRAPER_STORE_PATH"
752 # # storePickleOnDisk
753 #

◆ isNormalUrl()

def app.Utils.UrlNormalizator.isNormalUrl (   url)
static

Definition at line 718 of file Utils.py.

718  def isNormalUrl(url):
719  return False if url.find(UrlNormalizator.BAD_URL_PREFIX) == 0 else True
720 
721 

◆ normalize()

def app.Utils.UrlNormalizator.normalize (   url,
  supportProtocols = None,
  normMask = NORM_DEFAULT 
)
static

Definition at line 678 of file Utils.py.

678  def normalize(url, supportProtocols=None, normMask=NORM_DEFAULT):
679  norm_url = url.strip()
680  if normMask != 0:
681  logger.debug("None zero normMask: %s", str(normMask))
682  # TODO: need to be replaced with default filter for collect URLs protocols check stage
683  if supportProtocols is not None and isinstance(supportProtocols, list):
684  colonPos = norm_url.find(':')
685  slashPos = norm_url.find('/')
686  if colonPos != -1 and (slashPos == -1 or slashPos > colonPos):
687  if len(norm_url.split(':')) > 1:
688  protocol = norm_url.split(':')[0]
689  if protocol not in supportProtocols:
690  try:
691  norm_url = UrlNormalizator.BAD_URL_PREFIX + urllib.quote(norm_url)
692  except Exception as err:
693  logger.debug(">>> urllib.quote error = " + str(err))
694  norm_url = UrlNormalizator.BAD_URL_PREFIX + norm_url
695 
696  if norm_url == url:
697  try:
698  stripWWW = True if normMask & UrlNormalizator.NORM_SKIP_WWW else False
699  useValidator = True if normMask & UrlNormalizator.NORM_USE_VALIDATOR else False
700  enableAdditionNormalize = True if normMask & UrlNormalizator.NORM_MAIN else False
701  norm_url = str(urinormpath(url.strip(), stripWWW, useValidator, enableAdditionNormalize))
702  # norm_url = str(canonicalize_url(url.strip()))
703  # logger.debug("norm_url: <%s>", norm_url)
704  # except urlnorm.InvalidUrl:
705  # logger.error("Normalization InvalidUrl")
706  # norm_url = ""
707  except Exception as e:
708  logger.error("Normalization error: " + str(e) + "\nURL: [" + url + "]\n" + str(getTracebackInfo()))
709 
710  return norm_url
711 
712 
def urinormpath(path, stripWWW=False, useValidator=False, enableAdditionNormalize=True)
Definition: Utils.py:764
def getTracebackInfo(linesNumberMax=None)
Definition: Utils.py:218
Here is the call graph for this function:

Member Data Documentation

◆ BAD_URL_PREFIX

string app.Utils.UrlNormalizator.BAD_URL_PREFIX = "normalization-error://?"
static

Definition at line 666 of file Utils.py.

◆ NORM_DEFAULT

int app.Utils.UrlNormalizator.NORM_DEFAULT = NORM_MAIN
static

Definition at line 665 of file Utils.py.

◆ NORM_MAIN

int app.Utils.UrlNormalizator.NORM_MAIN = 4
static

Definition at line 664 of file Utils.py.

◆ NORM_NONE

int app.Utils.UrlNormalizator.NORM_NONE = 0
static

Definition at line 661 of file Utils.py.

◆ NORM_SKIP_WWW

int app.Utils.UrlNormalizator.NORM_SKIP_WWW = 1
static

Definition at line 662 of file Utils.py.

◆ NORM_USE_VALIDATOR

int app.Utils.UrlNormalizator.NORM_USE_VALIDATOR = 2
static

Definition at line 663 of file Utils.py.


The documentation for this class was generated from the following file: