1
0
mirror of https://github.com/django/django.git synced 2025-10-25 06:36:07 +00:00

Fixed #19508 -- Implemented uri_to_iri as per RFC.

Thanks Loic Bistuer for helping in shaping the patch and Claude Paroz
for the review.
This commit is contained in:
Anubhav Joshi
2014-07-22 17:55:22 +05:30
committed by Loic Bistuer
parent 3af5af1a61
commit 10b17a22be
9 changed files with 189 additions and 42 deletions

View File

@@ -1,3 +1,4 @@
# -*- encoding: utf-8 -*-
from __future__ import unicode_literals
import codecs
@@ -7,7 +8,9 @@ import locale
from django.utils.functional import Promise
from django.utils import six
from django.utils.six.moves.urllib.parse import quote
from django.utils.six.moves.urllib.parse import quote, unquote
if six.PY3:
from urllib.parse import unquote_to_bytes
class DjangoUnicodeDecodeError(UnicodeDecodeError):
@@ -185,7 +188,9 @@ def iri_to_uri(iri):
assuming input is either UTF-8 or unicode already, we can simplify things a
little from the full method.
Returns an ASCII string containing the encoded result.
Takes an IRI in UTF-8 bytes (e.g. '/I \xe2\x99\xa5 Django/') or unicode
(e.g. '/I ♥ Django/') and returns ASCII bytes containing the encoded result
(e.g. '/I%20%E2%99%A5%20Django/').
"""
# The list of safe characters here is constructed from the "reserved" and
# "unreserved" characters specified in sections 2.2 and 2.3 of RFC 3986:
@@ -204,6 +209,38 @@ def iri_to_uri(iri):
return quote(force_bytes(iri), safe=b"/#%[]=:;$&()+,!?*@'~")
def uri_to_iri(uri):
"""
Converts a Uniform Resource Identifier(URI) into an Internationalized
Resource Identifier(IRI).
This is the algorithm from section 3.2 of RFC 3987.
Takes an URI in ASCII bytes (e.g. '/I%20%E2%99%A5%20Django/') and returns
unicode containing the encoded result (e.g. '/I \xe2\x99\xa5 Django/').
"""
if uri is None:
return uri
uri = force_bytes(uri)
iri = unquote_to_bytes(uri) if six.PY3 else unquote(uri)
return repercent_broken_unicode(iri).decode('utf-8')
def repercent_broken_unicode(path):
"""
As per section 3.2 of RFC 3987, step three of converting a URI into an IRI,
we need to re-percent-encode any octet produced that is not part of a
strictly legal UTF-8 octet sequence.
"""
try:
path.decode('utf-8')
except UnicodeDecodeError as e:
repercent = quote(path[e.start:e.end], safe=b"/#%[]=:;$&()+,!?*@'~")
path = repercent_broken_unicode(
path[:e.start] + force_bytes(repercent) + path[e.end:])
return path
def filepath_to_uri(path):
"""Convert a file system path to a URI portion that is suitable for
inclusion in a URL.