Fixed #26005 -- Fixed some percent decoding cases in uri_to_iri().

2025-10-25 06:36:07 +00:00 · 2017-02-07 14:55:44 +01:00
parent 500532c95d
commit 03281d8fe7
4 changed files with 55 additions and 17 deletions
--- a/django/utils/encoding.py
+++ b/django/utils/encoding.py
@@ -2,7 +2,7 @@ import codecs
 import datetime
 import locale
 from decimal import Decimal
-from urllib.parse import quote, unquote_to_bytes
+from urllib.parse import quote

 from django.utils import six
 from django.utils.functional import Promise
@@ -151,20 +151,57 @@ def iri_to_uri(iri):
    return quote(iri, safe="/#%[]=:;$&()+,!?*@'~")


+# List of byte values that uri_to_iri() decodes from percent encoding.
+# First, the unreserved characters from RFC 3986:
+_ascii_ranges = [[45, 46, 95, 126], range(65, 91), range(97, 123)]
+_hextobyte = {
+    (fmt % char).encode(): bytes((char,))
+    for ascii_range in _ascii_ranges
+    for char in ascii_range
+    for fmt in ['%02x', '%02X']
+}
+# And then everything above 128, because bytes ≥ 128 are part of multibyte
+# unicode characters.
+_hexdig = '0123456789ABCDEFabcdef'
+_hextobyte.update({
+    (a + b).encode(): bytes.fromhex(a + b)
+    for a in _hexdig[8:] for b in _hexdig
+})
+
+
 def uri_to_iri(uri):
    """
    Converts a Uniform Resource Identifier(URI) into an Internationalized
    Resource Identifier(IRI).

-    This is the algorithm from section 3.2 of RFC 3987.
+    This is the algorithm from section 3.2 of RFC 3987, excluding step 4.

    Takes an URI in ASCII bytes (e.g. '/I%20%E2%99%A5%20Django/') and returns
-    a string containing the encoded result (e.g. '/I \xe2\x99\xa5 Django/').
+    a string containing the encoded result (e.g. '/I%20♥%20Django/').
    """
    if uri is None:
        return uri
    uri = force_bytes(uri)
-    iri = unquote_to_bytes(uri)
+    # Fast selective unqote: First, split on '%' and then starting with the
+    # second block, decode the first 2 bytes if they represent a hex code to
+    # decode. The rest of the block is the part after '%AB', not containing
+    # any '%'. Add that to the output without further processing.
+    bits = uri.split(b'%')
+    if len(bits) == 1:
+        iri = uri
+    else:
+        parts = [bits[0]]
+        append = parts.append
+        hextobyte = _hextobyte
+        for item in bits[1:]:
+            hex = item[:2]
+            if hex in hextobyte:
+                append(hextobyte[item[:2]])
+                append(item[2:])
+            else:
+                append(b'%')
+                append(item)
+        iri = b''.join(parts)
    return repercent_broken_unicode(iri).decode()