42 Sometimes you get a URL from a user that just isn't a real 43 URL because it contains unsafe characters like ' ' and so on. This 44 function can fix some of the problems in a similar way to how browsers 45 handle data entered by the user: 47 >>> url_fix(u'http://de.wikipedia.org/wiki/Elf (Begriffsklärung)') 48 'http://de.wikipedia.org/wiki/Elf%20%28Begriffskl%C3%A4rung%29' 50 :param charset: The target charset for the URL if the URL was 51 given as a unicode string. 55 string = unicode(string,
'utf-8',
'replace')
56 return unicodedata.normalize(
'NFC', string).
encode(
'utf-8')
70 if isinstance(url, unicode):
71 url = url.encode(charset,
'ignore')
74 if url[0]
not in [
'/',
'-']
and ':' not in url[:7]:
78 url = url.replace(
'#!',
'?_escaped_fragment_=')
81 scheme, auth, path, query, fragment = urlparse.urlsplit(url.strip())
82 (userinfo, host, port) = re.search(
'([^@]*@)?([^:]*):?(.*)', auth).groups()
85 scheme = scheme.lower()
89 if host
and host[-1] ==
'.':
92 if (len(host) <= MAX_ALLOWED_LABEL_LENGTH):
94 host = host.decode(charset).
encode(
'idna')
99 path = quote(_clean(path),
"~:/?#[]@!$&'()*+,;=%")
100 fragment = quote(_clean(fragment),
"~")
103 query =
"&".
join([
"=".
join([quote(_clean(t),
"~:/?#[]@!$'()*+,;=%")
for t
in q.split(
"=", 1)])
for q
in query.split(
"&")])
106 if scheme
in [
"",
"http",
"https",
"ftp",
"file"]:
108 for part
in path.split(
'/'):
119 if part
in [
"",
".",
".."]:
121 path =
'/'.
join(output)
125 if userinfo
in [
"@",
":@"]:
130 if path ==
"" and scheme
in [
"http",
"https",
"ftp",
"file"]:
135 if port
and scheme
in default_port.keys():
137 port = str(int(port))
138 if int(port) == default_port[scheme]:
142 auth = (userinfo
or "") + host
145 if url.endswith(
"#")
and query ==
"" and fragment ==
"":
147 return urlparse.urlunsplit((scheme, auth, path, query,
''))
def url_normalize(url, charset='utf-8')