HCE Project Python language Distributed Tasks Manager Application, Distributed Crawler Application and client API bindings.  2.0.0-chaika
Hierarchical Cluster Engine Python language binding
app.url_normalize Namespace Reference

Functions

def url_normalize (url, charset='utf-8')
 
def testcase1 (expected, value)
 
def testcase2 (original, normalized)
 

Variables

string __license__ = "Python"
 
float __version__ = 1.1
 
int MAX_ALLOWED_LABEL_LENGTH = 63
 
 suite = unittest.TestSuite()
 
list tests1
 
dictionary tests2
 

Function Documentation

◆ testcase1()

def app.url_normalize.testcase1 (   expected,
  value 
)

Definition at line 193 of file url_normalize.py.

193  def testcase1(expected, value):
194 
195  class test(unittest.TestCase):
196 
197  def runTest(self):
198  assert (url_normalize(value) == value) == expected, (expected, value, url_normalize(value))
199  return test()
200 
def url_normalize(url, charset='utf-8')
def testcase1(expected, value)
Here is the call graph for this function:

◆ testcase2()

def app.url_normalize.testcase2 (   original,
  normalized 
)

Definition at line 276 of file url_normalize.py.

276  def testcase2(original, normalized):
277 
278  class test(unittest.TestCase):
279 
280  def runTest(self):
281  assert url_normalize(original) == normalized, (original, normalized, url_normalize(original))
282  return test()
283 
def url_normalize(url, charset='utf-8')
def testcase2(original, normalized)
Here is the call graph for this function:

◆ url_normalize()

def app.url_normalize.url_normalize (   url,
  charset = 'utf-8' 
)
Sometimes you get a URL from a user that just isn't a real
URL because it contains unsafe characters like ' ' and so on.  This
function can fix some of the problems in a similar way to how browsers
handle data entered by the user:

>>> url_fix(u'http://de.wikipedia.org/wiki/Elf (Begriffsklärung)')
'http://de.wikipedia.org/wiki/Elf%20%28Begriffskl%C3%A4rung%29'

:param charset: The target charset for the URL if the url was
        given as a unicode string.

Definition at line 40 of file url_normalize.py.

40 def url_normalize(url, charset='utf-8'):
41  """
42  Sometimes you get an URL by a user that just isn't a real
43  URL because it contains unsafe characters like ' ' and so on. This
44  function can fix some of the problems in a similar way browsers
45  handle data entered by the user:
46 
47  >>> url_fix(u'http://de.wikipedia.org/wiki/Elf (Begriffsklärung)')
48  'http://de.wikipedia.org/wiki/Elf%20%28Begriffskl%C3%A4rung%29'
49 
50  :param charset: The target charset for the URL if the url was
51  given as unicode string.
52  """
53 
54  def _clean(string):
55  string = unicode(string, 'utf-8', 'replace')
56  return unicodedata.normalize('NFC', string).encode('utf-8')
57 
58  default_port = {
59  'ftp': 21,
60  'telnet': 23,
61  'http': 80,
62  'gopher': 70,
63  'news': 119,
64  'nntp': 119,
65  'prospero': 191,
66  'https': 443,
67  'snews': 563,
68  'snntp': 563,
69  }
70  if isinstance(url, unicode):
71  url = url.encode(charset, 'ignore')
72 
73  # if there is no scheme use http as default scheme
74  if url[0] not in ['/', '-'] and ':' not in url[:7]:
75  url = 'http://' + url
76 
77  # shebang urls support
78  url = url.replace('#!', '?_escaped_fragment_=')
79 
80  # splitting url to useful parts
81  scheme, auth, path, query, fragment = urlparse.urlsplit(url.strip())
82  (userinfo, host, port) = re.search('([^@]*@)?([^:]*):?(.*)', auth).groups()
83 
84  # Always provide the URI scheme in lowercase characters.
85  scheme = scheme.lower()
86 
87  # Always provide the host, if any, in lowercase characters.
88  host = host.lower()
89  if host and host[-1] == '.':
90  host = host[:-1]
91 
92  if (len(host) <= MAX_ALLOWED_LABEL_LENGTH):
93  # take care about IDN domains
94  host = host.decode(charset).encode('idna') # IDN -> ACE
95 
96  # Only perform percent-encoding where it is essential.
97  # Always use uppercase A-through-F characters when percent-encoding.
98  # All portions of the URI must be utf-8 encoded NFC from Unicode strings
99  path = quote(_clean(path), "~:/?#[]@!$&'()*+,;=%")
100  fragment = quote(_clean(fragment), "~")
101 
102  # note care must be taken to only encode & and = characters as values
103  query = "&".join(["=".join([quote(_clean(t), "~:/?#[]@!$'()*+,;=%") for t in q.split("=", 1)]) for q in query.split("&")])
104 
105  # Prevent dot-segments appearing in non-relative URI paths.
106  if scheme in ["", "http", "https", "ftp", "file"]:
107  output = []
108  for part in path.split('/'):
109  if part == "":
110  if not output:
111  output.append(part)
112  elif part == ".":
113  pass
114  elif part == "..":
115  if len(output) > 1:
116  output.pop()
117  else:
118  output.append(part)
119  if part in ["", ".", ".."]:
120  output.append("")
121  path = '/'.join(output)
122 
123  # For schemes that define a default authority, use an empty authority if
124  # the default is desired.
125  if userinfo in ["@", ":@"]:
126  userinfo = ""
127 
128  # For schemes that define an empty path to be equivalent to a path of "/",
129  # use "/".
130  if path == "" and scheme in ["http", "https", "ftp", "file"]:
131  path = "/"
132 
133  # For schemes that define a port, use an empty port if the default is
134  # desired
135  if port and scheme in default_port.keys():
136  if port.isdigit():
137  port = str(int(port))
138  if int(port) == default_port[scheme]:
139  port = ''
140 
141  # Put it all back together again
142  auth = (userinfo or "") + host
143  if port:
144  auth += ":" + port
145  if url.endswith("#") and query == "" and fragment == "":
146  path += "#"
147  return urlparse.urlunsplit((scheme, auth, path, query, ''))
148 
def url_normalize(url, charset='utf-8')
Definition: join.py:1
Here is the call graph for this function:
Here is the caller graph for this function:

Variable Documentation

◆ __license__

string app.url_normalize.__license__ = "Python"
private

Definition at line 29 of file url_normalize.py.

◆ __version__

float app.url_normalize.__version__ = 1.1
private

Definition at line 30 of file url_normalize.py.

◆ MAX_ALLOWED_LABEL_LENGTH

int app.url_normalize.MAX_ALLOWED_LABEL_LENGTH = 63

Definition at line 38 of file url_normalize.py.

◆ suite

app.url_normalize.suite = unittest.TestSuite()

Definition at line 151 of file url_normalize.py.

◆ tests1

list app.url_normalize.tests1

Definition at line 154 of file url_normalize.py.

◆ tests2

dictionary app.url_normalize.tests2

Definition at line 205 of file url_normalize.py.