HCE Project Python language Distributed Tasks Manager Application, Distributed Crawler Application and client API bindings.  2.0.0-chaika
Hierarchical Cluster Engine Python language binding
url_normalize.py
Go to the documentation of this file.
1 #pylint: skip-file
2 # -*- coding: utf-8 -*-
3 """
4 URI Normalization function:
5  * Take care of IDN domains.
6  * Always provide the URI scheme in lowercase characters.
7  * Always provide the host, if any, in lowercase characters.
8  * Only perform percent-encoding where it is essential.
9  * Always use uppercase A-through-F characters when percent-encoding.
10  * Prevent dot-segments appearing in non-relative URI paths.
11  * For schemes that define a default authority, use an empty authority if the
12  default is desired.
13  * For schemes that define an empty path to be equivalent to a path of "/",
14  use "/".
15  * For schemes that define a port, use an empty port if the default is desired
16  * All portions of the URI must be utf-8 encoded NFC from Unicode strings
17 
18 Inspired by Sam Ruby's urlnorm.py: http://intertwingly.net/blog/2004/08/04/Urlnorm
19 This fork author: Nikolay Panov (<pythoneer@niksite.ru>)
20 
21 History:
22  * 10 Feb 2010: support for shebang (#!) urls
23  * 28 Feb 2010: using 'http' scheme by default when appropriate
24  * 28 Feb 2010: added handling of IDN domains
25  * 28 Feb 2010: code pep8-zation
26  * 27 Feb 2010: forked from Sam Ruby's urlnorm.py
27 """
28 
# Module metadata: license identifier and version number (the version is
# stored as a float, not a string).
__license__ = "Python"
__version__ = 1.1
31 
32 import re
33 import unicodedata
34 import urlparse
35 from urllib import quote, unquote
36 
37 
# Length limit consulted by url_normalize() before it IDNA-encodes the host;
# 63 is the maximum length of a single DNS label.
MAX_ALLOWED_LABEL_LENGTH = 63
39 
def url_normalize(url, charset='utf-8'):
    """
    Normalize a URL so that equivalent spellings yield the same string.

    The steps, in order:

      * a unicode ``url`` is encoded to ``charset`` (unencodable characters
        are dropped) and surrounding whitespace is stripped;
      * ``http://`` is prepended when the URL does not start with ``/`` or
        ``-`` and has no ``:`` within its first seven characters;
      * every ``#!`` (shebang) is rewritten to ``?_escaped_fragment_=``;
      * scheme and host are lowercased, one trailing dot is removed from the
        host and the host is IDNA (ACE) encoded;
      * path and query are NFC-normalized and percent-encoded where needed;
        percent-escapes already present are left untouched because ``%`` is
        in the safe set;
      * for http, https, ftp, file and scheme-less URLs, ``.``, ``..`` and
        empty path segments are removed;
      * an empty userinfo (``@`` or ``:@``) and a port equal to the scheme's
        default are dropped;
      * the fragment is discarded; a bare trailing ``#`` is kept only when
        the URL has neither query nor fragment text.

    >>> url_normalize('http://www.foo.com:80/foo')
    'http://www.foo.com/foo'

    :param url: The URL to normalize, as a byte string or unicode string.
    :param charset: The target charset for the URL if the url was
                    given as unicode string; also used to decode the host
                    before IDNA encoding.
    :returns: The normalized URL as a byte string. An empty or
              whitespace-only URL is returned as an empty string.
    :raises UnicodeError: if the host cannot be decoded with ``charset`` or
                          is rejected by the IDNA codec (for example an
                          empty label).
    """

    def _clean(string):
        # Decode leniently as utf-8, normalize to NFC, re-encode to bytes.
        string = unicode(string, 'utf-8', 'replace')
        return unicodedata.normalize('NFC', string).encode('utf-8')

    default_port = {
        'ftp': 21,
        'telnet': 23,
        'http': 80,
        'gopher': 70,
        'news': 119,
        'nntp': 119,
        'prospero': 191,
        'https': 443,
        'snews': 563,
        'snntp': 563,
    }
    if isinstance(url, unicode):
        url = url.encode(charset, 'ignore')

    # Strip first so that the scheme detection and the trailing '#' check
    # below look at the real first/last characters of the URL.
    url = url.strip()
    if not url:
        # Nothing to normalize; indexing url[0] below would fail.
        return url

    # if there is no scheme use http as default scheme
    if url[0] not in ['/', '-'] and ':' not in url[:7]:
        url = 'http://' + url

    # shebang urls support
    url = url.replace('#!', '?_escaped_fragment_=')

    # splitting url to useful parts
    scheme, auth, path, query, fragment = urlparse.urlsplit(url)
    userinfo, host, port = re.search('([^@]*@)?([^:]*):?(.*)', auth).groups()

    # Always provide the URI scheme in lowercase characters.
    scheme = scheme.lower()

    # Always provide the host, if any, in lowercase characters.
    host = host.lower()
    if host and host[-1] == '.':
        host = host[:-1]

    # take care about IDN domains
    # The length limit applies to each dot-separated label, not to the whole
    # host name, so a long host made of short labels is still converted.
    # A host containing an over-long label is left as it is.
    if all(len(label) <= MAX_ALLOWED_LABEL_LENGTH
           for label in host.split('.')):
        host = host.decode(charset).encode('idna')  # IDN -> ACE

    # Only perform percent-encoding where it is essential.
    # Always use uppercase A-through-F characters when percent-encoding.
    # All portions of the URI must be utf-8 encoded NFC from Unicode strings
    path = quote(_clean(path), "~:/?#[]@!$&'()*+,;=%")

    # note care must be taken to only encode & and = characters as values
    query = "&".join(
        "=".join(quote(_clean(t), "~:/?#[]@!$'()*+,;=%")
                 for t in q.split("=", 1))
        for q in query.split("&"))

    # Prevent dot-segments appearing in non-relative URI paths.
    if scheme in ["", "http", "https", "ftp", "file"]:
        segments = path.split('/')
        output = []
        for part in segments:
            if part == "":
                # Keep only the leading empty segment (the root slash);
                # this collapses runs of slashes.
                if not output:
                    output.append(part)
            elif part == ".":
                pass
            elif part == "..":
                # Never pop the leading root segment.
                if len(output) > 1:
                    output.pop()
            else:
                output.append(part)
        # A path ending in '/', '/.' or '/..' keeps its trailing slash.
        if segments[-1] in ["", ".", ".."]:
            output.append("")
        path = '/'.join(output)

    # For schemes that define a default authority, use an empty authority if
    # the default is desired.
    if userinfo in ["@", ":@"]:
        userinfo = ""

    # For schemes that define an empty path to be equivalent to a path of "/",
    # use "/".
    if path == "" and scheme in ["http", "https", "ftp", "file"]:
        path = "/"

    # For schemes that define a port, use an empty port if the default is
    # desired
    if port and scheme in default_port and port.isdigit():
        port = str(int(port))  # drops leading zeros
        if int(port) == default_port[scheme]:
            port = ''

    # Put it all back together again
    auth = (userinfo or "") + host
    if port:
        auth += ":" + port
    # The fragment is always discarded, but a bare trailing '#' survives.
    if url.endswith("#") and query == "" and fragment == "":
        path += "#"
    return urlparse.urlunsplit((scheme, auth, path, query, ''))
148 
if __name__ == "__main__":
    # Self-test, executed only when the module is run as a script. Results
    # are printed by unittest.TextTestRunner.
    #
    # NOTE(review): url_normalize keeps '%' in its safe set and never
    # unquotes, so the cases below that expect an existing percent-escape to
    # be decoded or re-cased (e.g. %7E -> ~, %5c -> %5C) look like they no
    # longer match the implementation - confirm by running this block.
    import unittest
    suite = unittest.TestSuite()

    # from http://www.intertwingly.net/wiki/pie/PaceCanonicalIds
    # Each entry is (expected, value): expected tells whether value is
    # already in normalized form, i.e. url_normalize(value) == value.
    tests1 = [
        (False, "http://:@example.com/"),
        (False, "http://@example.com/"),
        (False, "http://example.com"),
        (False, "HTTP://example.com/"),
        (False, "http://EXAMPLE.COM/"),
        (False, "http://example.com/%7Ejane"),
        (False, "http://example.com/?q=%C7"),
        (False, "http://example.com/?q=%5c"),
        (False, "http://example.com/?q=C%CC%A7"),
        (False, "http://example.com/a/../a/b"),
        (False, "http://example.com/a/./b"),
        (False, "http://example.com:80/"),
        (True, "http://example.com/"),
        (True, "http://example.com/?q=%C3%87"),
        (True, "http://example.com/?q=%E2%85%A0"),
        (True, "http://example.com/?q=%5C"),
        (True, "http://example.com/~jane"),
        (True, "http://example.com/a/b"),
        (True, "http://example.com:8080/"),
        (True, "http://user:password@example.com/"),

        # from rfc2396bis
        (True, "ftp://ftp.is.co.za/rfc/rfc1808.txt"),
        (True, "http://www.ietf.org/rfc/rfc2396.txt"),
        (True, "ldap://[2001:db8::7]/c=GB?objectClass?one"),
        (True, "mailto:John.Doe@example.com"),
        (True, "news:comp.infosystems.www.servers.unix"),
        (True, "tel:+1-816-555-1212"),
        (True, "telnet://192.0.2.16:80/"),
        (True, "urn:oasis:names:specification:docbook:dtd:xml:4.1.2"),

        # other
        (True, "http://127.0.0.1/"),
        (False, "http://127.0.0.1:80/"),
        (True, "http://www.w3.org/2000/01/rdf-schema#"),
        (False, "http://example.com:081/"),
    ]

    def testcase1(expected, value):
        """Build a test case checking whether value is already normalized."""

        class test(unittest.TestCase):

            def runTest(self):
                actual = url_normalize(value)
                # assertEqual is used instead of a bare assert so the check
                # still runs under ``python -O``.
                self.assertEqual(actual == value, expected,
                                 repr((expected, value, actual)))
        return test()

    for (expected, value) in tests1:
        suite.addTest(testcase1(expected, value))

    # mnot test suite; three tests updated for rfc2396bis.
    # Maps an input URL to the normalized form expected for it.
    tests2 = {
        '/foo/bar/.':
            '/foo/bar/',
        '/foo/bar/./':
            '/foo/bar/',
        '/foo/bar/..':
            '/foo/',
        '/foo/bar/../':
            '/foo/',
        '/foo/bar/../baz':
            '/foo/baz',
        '/foo/bar/../..':
            '/',
        '/foo/bar/../../':
            '/',
        '/foo/bar/../../baz':
            '/baz',
        '/foo/bar/../../../baz':
            '/baz',  # was: '/../baz',
        '/foo/bar/../../../../baz':
            '/baz',
        '/./foo':
            '/foo',
        '/../foo':
            '/foo',  # was: '/../foo',
        '/foo.':
            '/foo.',
        '/.foo':
            '/.foo',
        '/foo..':
            '/foo..',
        '/..foo':
            '/..foo',
        '/./../foo':
            '/foo',  # was: '/../foo',
        '/./foo/.':
            '/foo/',
        '/foo/./bar':
            '/foo/bar',
        '/foo/../bar':
            '/bar',
        '/foo//':
            '/foo/',
        '/foo///bar//':
            '/foo/bar/',
        'http://www.foo.com:80/foo':
            'http://www.foo.com/foo',
        'http://www.foo.com:8000/foo':
            'http://www.foo.com:8000/foo',
        'http://www.foo.com./foo/bar.html':
            'http://www.foo.com/foo/bar.html',
        'http://www.foo.com.:81/foo':
            'http://www.foo.com:81/foo',
        'http://www.foo.com/%7ebar':
            'http://www.foo.com/~bar',
        'http://www.foo.com/%7Ebar':
            'http://www.foo.com/~bar',
        'ftp://user:pass@ftp.foo.net/foo/bar':
            'ftp://user:pass@ftp.foo.net/foo/bar',
        'http://USER:pass@www.Example.COM/foo/bar':
            'http://USER:pass@www.example.com/foo/bar',
        'http://www.example.com./':
            'http://www.example.com/',
        '-':
            '-',
        'пример.испытание/Служебная:Search/Test':
            'http://xn--e1afmkfd.xn--80akhbyknj4f/%D0%A1%D0%BB%D1%83%D0%B6%D0%B5%D0%B1%D0%BD%D0%B0%D1%8F:Search/Test',
        'http://lifehacker.com/#!5753509/hello-world-this-is-the-new-lifehacker':
            'http://lifehacker.com/?_escaped_fragment_=5753509/hello-world-this-is-the-new-lifehacker',
    }

    def testcase2(original, normalized):
        """Build a test case checking that original normalizes to normalized."""

        class test(unittest.TestCase):

            def runTest(self):
                actual = url_normalize(original)
                self.assertEqual(actual, normalized,
                                 repr((original, normalized, actual)))
        return test()

    for (original, normalized) in tests2.items():
        suite.addTest(testcase2(original, normalized))

    # execute tests
    unittest.TextTestRunner().run(suite)
def url_normalize(url, charset='utf-8')
def testcase1(expected, value)
def testcase2(original, normalized)
Definition: join.py:1