HCE Project Python language Distributed Tasks Manager Application, Distributed Crawler Application and client API bindings.  2.0.0-chaika
Hierarchical Cluster Engine Python language binding
Url.py
Go to the documentation of this file.
1 
2 import re
3 from app.url_normalize import url_normalize
4 
5 
class Url(object):
    """Thin wrapper around a URL string: validation, normalization, ordering.

    Instances compare, order and hash by their raw ``url`` string, so two
    ``Url`` objects built from the same text are interchangeable in
    comparisons, sets and dict keys.
    """

    # Pattern used by isValid(). It is anchored at the start only (no "$"),
    # and everything after the host part is optional, so it effectively
    # checks that the string *begins* like a URL.
    _URL_PATTERN = re.compile(
        r'^((ht|f)tp(s?)\:\/\/|~/|/)?([\w]+:\w+@)?([a-zA-Z]{1}([\w\-]+\.)+([\w]{2,5}))(:[\d]{1,5})?/?(\w+\.[\w]{3,4})?((\?\w+=\w+)?(&\w+=\w+)*)?',  # pylint: disable=C0301
        re.IGNORECASE)

    # Characters allowed in the two positions following a "%".
    _HEX_DIGITS = frozenset("0123456789abcdefABCDEF")

    def __init__(self, url):
        """Store the raw URL string.

        Args:
            url: the URL text; kept as-is in ``self.url``.
        """
        self.url = url

    def checkUrlCodeValid(self):
        """Check that every "%" in the URL starts a complete percent escape.

        A "%" must be followed by exactly two hexadecimal digits. A "%"
        followed by a non-hex character, or a "%" too close to the end of
        the string to have two digits after it, makes the URL invalid.

        Returns:
            True if all percent escapes are well formed (or there are none),
            False otherwise.
        """
        pending = 0  # hex digits still owed to the most recent "%"
        for char in self.url:
            if pending:
                if char not in self._HEX_DIGITS:
                    return False
                pending -= 1
            elif char == "%":
                pending = 2
        # A non-zero count here means the string ended mid-escape ("...%4").
        return pending == 0

    def isValid(self):
        """Tell whether the URL looks valid.

        The URL must match the class URL pattern from its first character
        and must contain only well-formed percent escapes.

        Returns:
            True if both checks pass, False otherwise.
        """
        if self._URL_PATTERN.match(self.url) is None:
            return False
        return self.checkUrlCodeValid()

    def getNormalized(self):
        """Return the result of ``url_normalize`` applied to the raw URL."""
        return url_normalize(self.url)

    # Comparisons delegate to the raw URL strings. When the other operand has
    # no ``url`` attribute they return NotImplemented so that Python can try
    # the reflected operation or fall back to its default, instead of failing
    # with AttributeError (e.g. on ``Url(...) == None``).

    def __lt__(self, other):
        """Order by raw URL string."""
        if not hasattr(other, "url"):
            return NotImplemented
        return self.url < other.url

    def __le__(self, other):
        """Order by raw URL string."""
        if not hasattr(other, "url"):
            return NotImplemented
        return self.url <= other.url

    def __gt__(self, other):
        """Order by raw URL string."""
        if not hasattr(other, "url"):
            return NotImplemented
        return self.url > other.url

    def __ge__(self, other):
        """Order by raw URL string."""
        if not hasattr(other, "url"):
            return NotImplemented
        return self.url >= other.url

    def __eq__(self, other):
        """Two Url objects are equal when their raw URL strings are equal."""
        if not hasattr(other, "url"):
            return NotImplemented
        return self.url == other.url

    def __ne__(self, other):
        """Inverse of __eq__ (Python 2 does not derive it automatically)."""
        if not hasattr(other, "url"):
            return NotImplemented
        return not self.url == other.url

    def __hash__(self):
        """Hash by raw URL string, consistent with __eq__.

        Do not reassign ``self.url`` while the instance is stored in a set
        or used as a dict key.
        """
        return hash(self.url)

    @staticmethod
    def GetStats(urls):
        """Build per-URL statistics for a list of Url objects.

        Args:
            urls: a list of Url instances to report on.

        Returns:
            A list with one dict per input URL, in input order, with keys:
            'source' (raw URL string), 'canonicalized' (normalized form),
            'valid' (result of isValid()), 'source_unique' (True when no
            other entry has the same raw URL) and 'canonicalized_unique'
            (True when no other entry has the same normalized form).
        """
        # Function-scope import keeps this block self-contained.
        from collections import Counter

        # Normalize each URL once and count occurrences up front, so the
        # uniqueness checks below are dictionary lookups rather than a
        # full list scan per URL.
        # NOTE(review): assumes normalized values are hashable (strings).
        canonicalized_list = [url.getNormalized() for url in urls]
        source_counts = Counter(url.url for url in urls)
        canonicalized_counts = Counter(canonicalized_list)

        return [
            {
                'source': url.url,
                'canonicalized': canonicalized,
                'valid': url.isValid(),
                'source_unique': source_counts[url.url] == 1,
                'canonicalized_unique':
                    canonicalized_counts[canonicalized] == 1,
            }
            for url, canonicalized in zip(urls, canonicalized_list)
        ]
85 
def __init__(self, url)
Definition: Url.py:7
def __le__(self, other)
Definition: Url.py:47
def __eq__(self, other)
Definition: Url.py:56
def url_normalize(url, charset='utf-8')
def GetStats(urls)
Parameters: urls - a list of Url instance objects to display stats for Returns: A list of stat mappin...
Definition: Url.py:71
def __ge__(self, other)
Definition: Url.py:53
def __gt__(self, other)
Definition: Url.py:50
def getNormalized(self)
Definition: Url.py:41
def checkUrlCodeValid(self)
Definition: Url.py:11
def isValid(self)
Definition: Url.py:31
def __lt__(self, other)
Definition: Url.py:44