I\'m wondering what\'s the best way -- or if there\'s a simple way with the standard library -- to convert a URL with Unicode chars in the domain name and path to the equiva
You might use urlparse.urlsplit instead, but otherwise you seem to have a very straightforward solution, there.
protocol, domain, path, query, fragment = urlparse.urlsplit(url)
(You can access the domain and port separately by accessing the returned value's named properties, but as port syntax is always in ASCII it is unaffected by the IDNA encoding process.)
Okay, with these comments and some bug-fixing in my own code (it didn't handle fragments at all), I've come up with the following canonurl()
function -- returns a canonical, ASCII form of the URL:
import re
import urllib
import urlparse
def canonurl(url):
r"""Return the canonical, ASCII-encoded form of a UTF-8 encoded URL, or ''
if the URL looks invalid.
>>> canonurl(' ')
''
>>> canonurl('www.google.com')
'http://www.google.com/'
>>> canonurl('bad-utf8.com/path\xff/file')
''
>>> canonurl('svn://blah.com/path/file')
'svn://blah.com/path/file'
>>> canonurl('1234://badscheme.com')
''
>>> canonurl('bad$scheme://google.com')
''
>>> canonurl('site.badtopleveldomain')
''
>>> canonurl('site.com:badport')
''
>>> canonurl('http://123.24.8.240/blah')
'http://123.24.8.240/blah'
>>> canonurl('http://123.24.8.240:1234/blah?q#f')
'http://123.24.8.240:1234/blah?q#f'
>>> canonurl('\xe2\x9e\xa1.ws') # tinyarro.ws
'http://xn--hgi.ws/'
>>> canonurl(' http://www.google.com:80/path/file;params?query#fragment ')
'http://www.google.com:80/path/file;params?query#fragment'
>>> canonurl('http://\xe2\x9e\xa1.ws/\xe2\x99\xa5')
'http://xn--hgi.ws/%E2%99%A5'
>>> canonurl('http://\xe2\x9e\xa1.ws/\xe2\x99\xa5/pa%2Fth')
'http://xn--hgi.ws/%E2%99%A5/pa/th'
>>> canonurl('http://\xe2\x9e\xa1.ws/\xe2\x99\xa5/pa%2Fth;par%2Fams?que%2Fry=a&b=c')
'http://xn--hgi.ws/%E2%99%A5/pa/th;par/ams?que/ry=a&b=c'
>>> canonurl('http://\xe2\x9e\xa1.ws/\xe2\x99\xa5?\xe2\x99\xa5#\xe2\x99\xa5')
'http://xn--hgi.ws/%E2%99%A5?%E2%99%A5#%E2%99%A5'
>>> canonurl('http://\xe2\x9e\xa1.ws/%e2%99%a5?%E2%99%A5#%E2%99%A5')
'http://xn--hgi.ws/%E2%99%A5?%E2%99%A5#%E2%99%A5'
>>> canonurl('http://badutf8pcokay.com/%FF?%FE#%FF')
'http://badutf8pcokay.com/%FF?%FE#%FF'
>>> len(canonurl('google.com/' + 'a' * 16384))
4096
"""
# strip spaces at the ends and ensure it's prefixed with 'scheme://'
url = url.strip()
if not url:
return ''
if not urlparse.urlsplit(url).scheme:
url = 'http://' + url
# turn it into Unicode
try:
url = unicode(url, 'utf-8')
except UnicodeDecodeError:
return '' # bad UTF-8 chars in URL
# parse the URL into its components
parsed = urlparse.urlsplit(url)
scheme, netloc, path, query, fragment = parsed
# ensure scheme is a letter followed by letters, digits, and '+-.' chars
if not re.match(r'[a-z][-+.a-z0-9]*$', scheme, flags=re.I):
return ''
scheme = str(scheme)
# ensure domain and port are valid, eg: sub.domain.<1-to-6-TLD-chars>[:port]
match = re.match(r'(.+\.[a-z0-9]{1,6})(:\d{1,5})?$', netloc, flags=re.I)
if not match:
return ''
domain, port = match.groups()
netloc = domain + (port if port else '')
netloc = netloc.encode('idna')
# ensure path is valid and convert Unicode chars to %-encoded
if not path:
path = '/' # eg: 'http://google.com' -> 'http://google.com/'
path = urllib.quote(urllib.unquote(path.encode('utf-8')), safe='/;')
# ensure query is valid
query = urllib.quote(urllib.unquote(query.encode('utf-8')), safe='=&?/')
# ensure fragment is valid
fragment = urllib.quote(urllib.unquote(fragment.encode('utf-8')))
# piece it all back together, truncating it to a maximum of 4KB
url = urlparse.urlunsplit((scheme, netloc, path, query, fragment))
return url[:4096]
if __name__ == '__main__':
import doctest
doctest.testmod()
the code given by MizardX isnt 100% correct. This example wont work:
example.com/folder/?page=2
check out django.utils.encoding.iri_to_uri() to convert unicode URL to ASCII urls.
http://docs.djangoproject.com/en/dev/ref/unicode/
import urlparse, urllib
def fixurl(url):
# turn string into unicode
if not isinstance(url,unicode):
url = url.decode('utf8')
# parse it
parsed = urlparse.urlsplit(url)
# divide the netloc further
userpass,at,hostport = parsed.netloc.rpartition('@')
user,colon1,pass_ = userpass.partition(':')
host,colon2,port = hostport.partition(':')
# encode each component
scheme = parsed.scheme.encode('utf8')
user = urllib.quote(user.encode('utf8'))
colon1 = colon1.encode('utf8')
pass_ = urllib.quote(pass_.encode('utf8'))
at = at.encode('utf8')
host = host.encode('idna')
colon2 = colon2.encode('utf8')
port = port.encode('utf8')
path = '/'.join( # could be encoded slashes!
urllib.quote(urllib.unquote(pce).encode('utf8'),'')
for pce in parsed.path.split('/')
)
query = urllib.quote(urllib.unquote(parsed.query).encode('utf8'),'=&?/')
fragment = urllib.quote(urllib.unquote(parsed.fragment).encode('utf8'))
# put it back together
netloc = ''.join((user,colon1,pass_,at,host,colon2,port))
return urlparse.urlunsplit((scheme,netloc,path,query,fragment))
print fixurl('http://\xe2\x9e\xa1.ws/\xe2\x99\xa5')
print fixurl('http://\xe2\x9e\xa1.ws/\xe2\x99\xa5/%2F')
print fixurl(u'http://Åsa:abc123@➡.ws:81/admin')
print fixurl(u'http://➡.ws/admin')
http://xn--hgi.ws/%E2%99%A5
http://xn--hgi.ws/%E2%99%A5/%2F
http://%C3%85sa:abc123@xn--hgi.ws:81/admin
http://xn--hgi.ws/admin
urlparse
/urlunparse
to urlsplit
/urlunsplit
.there's some RFC-3896 url parsing work underway (e.g. as part of the Summer Of Code) but nothing in the standard library yet AFAIK -- and nothing much on the uri encoding side of things either, again AFAIK. So you might as well go with MizardX's elegant approach.