mirror of
				https://github.com/django/django.git
				synced 2025-10-24 22:26:08 +00:00 
			
		
		
		
	Fixed #26005 -- Fixed some percent decoding cases in uri_to_iri().
This commit is contained in:
		| @@ -6,7 +6,7 @@ import sys | |||||||
| from copy import copy | from copy import copy | ||||||
| from importlib import import_module | from importlib import import_module | ||||||
| from io import BytesIO | from io import BytesIO | ||||||
| from urllib.parse import urljoin, urlparse, urlsplit | from urllib.parse import unquote_to_bytes, urljoin, urlparse, urlsplit | ||||||
|  |  | ||||||
| from django.conf import settings | from django.conf import settings | ||||||
| from django.core.handlers.base import BaseHandler | from django.core.handlers.base import BaseHandler | ||||||
| @@ -20,7 +20,7 @@ from django.template import TemplateDoesNotExist | |||||||
| from django.test import signals | from django.test import signals | ||||||
| from django.test.utils import ContextList | from django.test.utils import ContextList | ||||||
| from django.urls import resolve | from django.urls import resolve | ||||||
| from django.utils.encoding import force_bytes, uri_to_iri | from django.utils.encoding import force_bytes | ||||||
| from django.utils.functional import SimpleLazyObject, curry | from django.utils.functional import SimpleLazyObject, curry | ||||||
| from django.utils.http import urlencode | from django.utils.http import urlencode | ||||||
| from django.utils.itercompat import is_iterable | from django.utils.itercompat import is_iterable | ||||||
| @@ -320,7 +320,7 @@ class RequestFactory: | |||||||
|         # If there are parameters, add them |         # If there are parameters, add them | ||||||
|         if parsed.params: |         if parsed.params: | ||||||
|             path += ";" + parsed.params |             path += ";" + parsed.params | ||||||
|         path = uri_to_iri(path).encode() |         path = unquote_to_bytes(path) | ||||||
|         # Replace the behavior where non-ASCII values in the WSGI environ are |         # Replace the behavior where non-ASCII values in the WSGI environ are | ||||||
|         # arbitrarily decoded with ISO-8859-1. |         # arbitrarily decoded with ISO-8859-1. | ||||||
|         # Refs comment in `get_bytes_from_wsgi()`. |         # Refs comment in `get_bytes_from_wsgi()`. | ||||||
|   | |||||||
| @@ -2,7 +2,7 @@ import codecs | |||||||
| import datetime | import datetime | ||||||
| import locale | import locale | ||||||
| from decimal import Decimal | from decimal import Decimal | ||||||
| from urllib.parse import quote, unquote_to_bytes | from urllib.parse import quote | ||||||
|  |  | ||||||
| from django.utils import six | from django.utils import six | ||||||
| from django.utils.functional import Promise | from django.utils.functional import Promise | ||||||
| @@ -151,20 +151,57 @@ def iri_to_uri(iri): | |||||||
|     return quote(iri, safe="/#%[]=:;$&()+,!?*@'~") |     return quote(iri, safe="/#%[]=:;$&()+,!?*@'~") | ||||||
|  |  | ||||||
|  |  | ||||||
|  | # List of byte values that uri_to_iri() decodes from percent encoding. | ||||||
|  | # First, the unreserved characters from RFC 3986: | ||||||
|  | _ascii_ranges = [[45, 46, 95, 126], range(65, 91), range(97, 123)] | ||||||
|  | _hextobyte = { | ||||||
|  |     (fmt % char).encode(): bytes((char,)) | ||||||
|  |     for ascii_range in _ascii_ranges | ||||||
|  |     for char in ascii_range | ||||||
|  |     for fmt in ['%02x', '%02X'] | ||||||
|  | } | ||||||
|  | # And then everything above 128, because bytes ≥ 128 are part of multibyte | ||||||
|  | # unicode characters. | ||||||
|  | _hexdig = '0123456789ABCDEFabcdef' | ||||||
|  | _hextobyte.update({ | ||||||
|  |     (a + b).encode(): bytes.fromhex(a + b) | ||||||
|  |     for a in _hexdig[8:] for b in _hexdig | ||||||
|  | }) | ||||||
|  |  | ||||||
|  |  | ||||||
| def uri_to_iri(uri): | def uri_to_iri(uri): | ||||||
|     """ |     """ | ||||||
|     Converts a Uniform Resource Identifier(URI) into an Internationalized |     Converts a Uniform Resource Identifier(URI) into an Internationalized | ||||||
|     Resource Identifier(IRI). |     Resource Identifier(IRI). | ||||||
|  |  | ||||||
|     This is the algorithm from section 3.2 of RFC 3987. |     This is the algorithm from section 3.2 of RFC 3987, excluding step 4. | ||||||
|  |  | ||||||
|     Takes a URI in ASCII bytes (e.g. '/I%20%E2%99%A5%20Django/') and returns |     Takes a URI in ASCII bytes (e.g. '/I%20%E2%99%A5%20Django/') and returns | ||||||
|     a string containing the encoded result (e.g. '/I \xe2\x99\xa5 Django/'). |     a string containing the encoded result (e.g. '/I%20♥%20Django/'). | ||||||
|     """ |     """ | ||||||
|     if uri is None: |     if uri is None: | ||||||
|         return uri |         return uri | ||||||
|     uri = force_bytes(uri) |     uri = force_bytes(uri) | ||||||
|     iri = unquote_to_bytes(uri) |     # Fast selective unquote: First, split on '%' and then starting with the | ||||||
|  |     # second block, decode the first 2 bytes if they represent a hex code to | ||||||
|  |     # decode. The rest of the block is the part after '%AB', not containing | ||||||
|  |     # any '%'. Add that to the output without further processing. | ||||||
|  |     bits = uri.split(b'%') | ||||||
|  |     if len(bits) == 1: | ||||||
|  |         iri = uri | ||||||
|  |     else: | ||||||
|  |         parts = [bits[0]] | ||||||
|  |         append = parts.append | ||||||
|  |         hextobyte = _hextobyte | ||||||
|  |         for item in bits[1:]: | ||||||
|  |             hex = item[:2] | ||||||
|  |             if hex in hextobyte: | ||||||
|  |                 append(hextobyte[item[:2]]) | ||||||
|  |                 append(item[2:]) | ||||||
|  |             else: | ||||||
|  |                 append(b'%') | ||||||
|  |                 append(item) | ||||||
|  |         iri = b''.join(parts) | ||||||
|     return repercent_broken_unicode(iri).decode() |     return repercent_broken_unicode(iri).decode() | ||||||
|  |  | ||||||
|  |  | ||||||
|   | |||||||
| @@ -195,19 +195,17 @@ result. | |||||||
|  |  | ||||||
| Similarly, Django provides :func:`django.utils.encoding.uri_to_iri()` which | Similarly, Django provides :func:`django.utils.encoding.uri_to_iri()` which | ||||||
| implements the conversion from URI to IRI as per :rfc:`3987#section-3.2`. | implements the conversion from URI to IRI as per :rfc:`3987#section-3.2`. | ||||||
| It decodes all percent-encodings except those that don't represent a valid |  | ||||||
| UTF-8 sequence. |  | ||||||
|  |  | ||||||
| An example to demonstrate:: | An example to demonstrate:: | ||||||
|  |  | ||||||
|     >>> uri_to_iri('/%E2%99%A5%E2%99%A5/?utf8=%E2%9C%93') |     >>> uri_to_iri('/%E2%99%A5%E2%99%A5/?utf8=%E2%9C%93') | ||||||
|     '/♥♥/?utf8=✓' |     '/♥♥/?utf8=✓' | ||||||
|     >>> uri_to_iri('%A9helloworld') |     >>> uri_to_iri('%A9hello%3Fworld') | ||||||
|     '%A9helloworld' |     '%A9hello%3Fworld' | ||||||
|  |  | ||||||
| In the first example, the UTF-8 characters and reserved characters are | In the first example, the UTF-8 characters are unquoted. In the second, the | ||||||
| unquoted. In the second, the percent-encoding remains unchanged because it | percent-encodings remain unchanged because they lie outside the valid UTF-8 | ||||||
| lies outside the valid UTF-8 range. | range or represent a reserved character. | ||||||
|  |  | ||||||
| Both ``iri_to_uri()`` and ``uri_to_iri()`` functions are idempotent, which means the | Both ``iri_to_uri()`` and ``uri_to_iri()`` functions are idempotent, which means the | ||||||
| following is always true:: | following is always true:: | ||||||
|   | |||||||
| @@ -93,9 +93,11 @@ class TestRFC3987IEncodingUtils(unittest.TestCase): | |||||||
|     def test_uri_to_iri(self): |     def test_uri_to_iri(self): | ||||||
|         cases = [ |         cases = [ | ||||||
|             # Valid UTF-8 sequences are decoded. |             # Valid UTF-8 sequences are decoded. | ||||||
|             ('/%E2%99%A5%E2%99%A5/', '/♥♥/'), |             ('/%e2%89%Ab%E2%99%a5%E2%89%aB/', '/≫♥≫/'), | ||||||
|             ('/%E2%99%A5%E2%99%A5/?utf8=%E2%9C%93', '/♥♥/?utf8=✓'), |             ('/%E2%99%A5%E2%99%A5/?utf8=%E2%9C%93', '/♥♥/?utf8=✓'), | ||||||
|  |             ('/%41%5a%6B/', '/AZk/'), | ||||||
|  |             # Reserved and non-URL valid ASCII chars are not decoded. | ||||||
|  |             ('/%25%20%02%41%7b/', '/%25%20%02A%7b/'), | ||||||
|             # Broken UTF-8 sequences remain escaped. |             # Broken UTF-8 sequences remain escaped. | ||||||
|             ('/%AAd%AAj%AAa%AAn%AAg%AAo%AA/', '/%AAd%AAj%AAa%AAn%AAg%AAo%AA/'), |             ('/%AAd%AAj%AAa%AAn%AAg%AAo%AA/', '/%AAd%AAj%AAa%AAn%AAg%AAo%AA/'), | ||||||
|             ('/%E2%99%A5%E2%E2%99%A5/', '/♥%E2♥/'), |             ('/%E2%99%A5%E2%E2%99%A5/', '/♥%E2♥/'), | ||||||
| @@ -112,11 +114,12 @@ class TestRFC3987IEncodingUtils(unittest.TestCase): | |||||||
|  |  | ||||||
|     def test_complementarity(self): |     def test_complementarity(self): | ||||||
|         cases = [ |         cases = [ | ||||||
|             ('/blog/for/J%C3%BCrgen%20M%C3%BCnster/', '/blog/for/J\xfcrgen M\xfcnster/'), |             ('/blog/for/J%C3%BCrgen%20M%C3%BCnster/', '/blog/for/J\xfcrgen%20M\xfcnster/'), | ||||||
|             ('%&', '%&'), |             ('%&', '%&'), | ||||||
|             ('red&%E2%99%A5ros%#red', 'red&♥ros%#red'), |             ('red&%E2%99%A5ros%#red', 'red&♥ros%#red'), | ||||||
|             ('/%E2%99%A5%E2%99%A5/', '/♥♥/'), |             ('/%E2%99%A5%E2%99%A5/', '/♥♥/'), | ||||||
|             ('/%E2%99%A5%E2%99%A5/?utf8=%E2%9C%93', '/♥♥/?utf8=✓'), |             ('/%E2%99%A5%E2%99%A5/?utf8=%E2%9C%93', '/♥♥/?utf8=✓'), | ||||||
|  |             ('/%25%20%02%7b/', '/%25%20%02%7b/'), | ||||||
|             ('/%AAd%AAj%AAa%AAn%AAg%AAo%AA/', '/%AAd%AAj%AAa%AAn%AAg%AAo%AA/'), |             ('/%AAd%AAj%AAa%AAn%AAg%AAo%AA/', '/%AAd%AAj%AAa%AAn%AAg%AAo%AA/'), | ||||||
|             ('/%E2%99%A5%E2%E2%99%A5/', '/♥%E2♥/'), |             ('/%E2%99%A5%E2%E2%99%A5/', '/♥%E2♥/'), | ||||||
|             ('/%E2%99%A5%E2%99%E2%99%A5/', '/♥%E2%99♥/'), |             ('/%E2%99%A5%E2%99%E2%99%A5/', '/♥%E2%99♥/'), | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user