4 URI Normalization function: 5 * Take care of IDN domains. 6 * Always provide the URI scheme in lowercase characters. 7 * Always provide the host, if any, in lowercase characters. 8 * Only perform percent-encoding where it is essential. 9 * Always use uppercase A-through-F characters when percent-encoding. 10 * Prevent dot-segments appearing in non-relative URI paths. 11 * For schemes that define a default authority, use an empty authority if the 13 * For schemes that define an empty path to be equivalent to a path of "/", 15 * For schemes that define a port, use an empty port if the default is desired 16 * All portions of the URI must be utf-8 encoded NFC from Unicode strings 18 Inspired by Sam Ruby's urlnorm.py: http://intertwingly.net/blog/2004/08/04/Urlnorm 19 This fork author: Nikolay Panov (<pythoneer@niksite.ru>) 22 * 10 Feb 2010: support for shebang (#!) urls 23 * 28 Feb 2010: using 'http' schema by default when appropriate 24 * 28 Feb 2010: added handling of IDN domains 25 * 28 Feb 2010: code pep8-zation 26 * 27 Feb 2010: forked from Sam Ruby's urlnorm.py 29 __license__ =
"Python" 35 from urllib
import quote, unquote
38 MAX_ALLOWED_LABEL_LENGTH = 63
42 Sometimes you get an URL by a user that just isn't a real 43 URL because it contains unsafe characters like ' ' and so on. This 44 function can fix some of the problems in a similar way browsers 45 handle data entered by the user: 47 >>> url_fix(u'http://de.wikipedia.org/wiki/Elf (Begriffsklärung)') 48 'http://de.wikipedia.org/wiki/Elf%20%28Begriffskl%C3%A4rung%29' 50 :param charset: The target charset for the URL if the url was 51 given as unicode string. 55 string = unicode(string,
'utf-8',
'replace')
56 return unicodedata.normalize(
'NFC', string).
encode(
'utf-8')
70 if isinstance(url, unicode):
71 url = url.encode(charset,
'ignore')
74 if url[0]
not in [
'/',
'-']
and ':' not in url[:7]:
78 url = url.replace(
'#!',
'?_escaped_fragment_=')
81 scheme, auth, path, query, fragment = urlparse.urlsplit(url.strip())
82 (userinfo, host, port) = re.search(
'([^@]*@)?([^:]*):?(.*)', auth).groups()
85 scheme = scheme.lower()
89 if host
and host[-1] ==
'.':
92 if (len(host) <= MAX_ALLOWED_LABEL_LENGTH):
94 host = host.decode(charset).
encode(
'idna')
99 path = quote(_clean(path),
"~:/?#[]@!$&'()*+,;=%")
100 fragment = quote(_clean(fragment),
"~")
103 query =
"&".
join([
"=".
join([quote(_clean(t),
"~:/?#[]@!$'()*+,;=%")
for t
in q.split(
"=", 1)])
for q
in query.split(
"&")])
106 if scheme
in [
"",
"http",
"https",
"ftp",
"file"]:
108 for part
in path.split(
'/'):
119 if part
in [
"",
".",
".."]:
121 path =
'/'.
join(output)
125 if userinfo
in [
"@",
":@"]:
130 if path ==
"" and scheme
in [
"http",
"https",
"ftp",
"file"]:
135 if port
and scheme
in default_port.keys():
137 port = str(int(port))
138 if int(port) == default_port[scheme]:
142 auth = (userinfo
or "") + host
145 if url.endswith(
"#")
and query ==
"" and fragment ==
"":
147 return urlparse.urlunsplit((scheme, auth, path, query,
''))
149 if __name__ ==
"__main__":
151 suite = unittest.TestSuite()
153 """ from http://www.intertwingly.net/wiki/pie/PaceCanonicalIds """ 155 (
False,
"http://:@example.com/"),
156 (
False,
"http://@example.com/"),
157 (
False,
"http://example.com"),
158 (
False,
"HTTP://example.com/"),
159 (
False,
"http://EXAMPLE.COM/"),
160 (
False,
"http://example.com/%7Ejane"),
161 (
False,
"http://example.com/?q=%C7"),
162 (
False,
"http://example.com/?q=%5c"),
163 (
False,
"http://example.com/?q=C%CC%A7"),
164 (
False,
"http://example.com/a/../a/b"),
165 (
False,
"http://example.com/a/./b"),
166 (
False,
"http://example.com:80/"),
167 (
True,
"http://example.com/"),
168 (
True,
"http://example.com/?q=%C3%87"),
169 (
True,
"http://example.com/?q=%E2%85%A0"),
170 (
True,
"http://example.com/?q=%5C"),
171 (
True,
"http://example.com/~jane"),
172 (
True,
"http://example.com/a/b"),
173 (
True,
"http://example.com:8080/"),
174 (
True,
"http://user:password@example.com/"),
177 (
True,
"ftp://ftp.is.co.za/rfc/rfc1808.txt"),
178 (
True,
"http://www.ietf.org/rfc/rfc2396.txt"),
179 (
True,
"ldap://[2001:db8::7]/c=GB?objectClass?one"),
180 (
True,
"mailto:John.Doe@example.com"),
181 (
True,
"news:comp.infosystems.www.servers.unix"),
182 (
True,
"tel:+1-816-555-1212"),
183 (
True,
"telnet://192.0.2.16:80/"),
184 (
True,
"urn:oasis:names:specification:docbook:dtd:xml:4.1.2"),
187 (
True,
"http://127.0.0.1/"),
188 (
False,
"http://127.0.0.1:80/"),
189 (
True,
"http://www.w3.org/2000/01/rdf-schema#"),
190 (
False,
"http://example.com:081/"),
195 class test(unittest.TestCase):
201 for (expected, value)
in tests1:
202 suite.addTest(
testcase1(expected, value))
204 """ mnot test suite; three tests updated for rfc2396bis. """ 220 '/foo/bar/../../baz':
222 '/foo/bar/../../../baz':
224 '/foo/bar/../../../../baz':
250 'http://www.foo.com:80/foo':
251 'http://www.foo.com/foo',
252 'http://www.foo.com:8000/foo':
253 'http://www.foo.com:8000/foo',
254 'http://www.foo.com./foo/bar.html':
255 'http://www.foo.com/foo/bar.html',
256 'http://www.foo.com.:81/foo':
257 'http://www.foo.com:81/foo',
258 'http://www.foo.com/%7ebar':
259 'http://www.foo.com/~bar',
260 'http://www.foo.com/%7Ebar':
261 'http://www.foo.com/~bar',
262 'ftp://user:pass@ftp.foo.net/foo/bar':
263 'ftp://user:pass@ftp.foo.net/foo/bar',
264 'http://USER:pass@www.Example.COM/foo/bar':
265 'http://USER:pass@www.example.com/foo/bar',
266 'http://www.example.com./':
267 'http://www.example.com/',
270 'пример.испытание/Служебная:Search/Test':
271 'http://xn--e1afmkfd.xn--80akhbyknj4f/%D0%A1%D0%BB%D1%83%D0%B6%D0%B5%D0%B1%D0%BD%D0%B0%D1%8F:Search/Test',
272 'http://lifehacker.com/#!5753509/hello-world-this-is-the-new-lifehacker':
273 'http://lifehacker.com/?_escaped_fragment_=5753509/hello-world-this-is-the-new-lifehacker',
278 class test(unittest.TestCase):
284 for (original, normalized)
in tests2.items():
285 suite.addTest(
testcase2(original, normalized))
287 """ execute tests """ 288 unittest.TextTestRunner().
run(suite)
def url_normalize(url, charset='utf-8')
def testcase1(expected, value)
def testcase2(original, normalized)