mirror of
				https://github.com/django/django.git
				synced 2025-10-25 22:56:12 +00:00 
			
		
		
		
	Fixed #19237 -- Used HTML parser to strip tags
The regex method used until now for the strip_tags utility is fast, but subject to flaws and security issues. Consensus and good practice lead us to use a slower but safer method.
This commit is contained in:
		| @@ -16,6 +16,9 @@ from django.utils.functional import allow_lazy | |||||||
| from django.utils import six | from django.utils import six | ||||||
| from django.utils.text import normalize_newlines | from django.utils.text import normalize_newlines | ||||||
|  |  | ||||||
|  | from .html_parser import HTMLParser | ||||||
|  |  | ||||||
|  |  | ||||||
| # Configuration for urlize() function. | # Configuration for urlize() function. | ||||||
| TRAILING_PUNCTUATION = ['.', ',', ':', ';', '.)'] | TRAILING_PUNCTUATION = ['.', ',', ':', ';', '.)'] | ||||||
| WRAPPING_PUNCTUATION = [('(', ')'), ('<', '>'), ('[', ']'), ('<', '>')] | WRAPPING_PUNCTUATION = [('(', ')'), ('<', '>'), ('[', ']'), ('<', '>')] | ||||||
| @@ -33,7 +36,6 @@ link_target_attribute_re = re.compile(r'(<a [^>]*?)target=[^\s>]+') | |||||||
| html_gunk_re = re.compile(r'(?:<br clear="all">|<i><\/i>|<b><\/b>|<em><\/em>|<strong><\/strong>|<\/?smallcaps>|<\/?uppercase>)', re.IGNORECASE) | html_gunk_re = re.compile(r'(?:<br clear="all">|<i><\/i>|<b><\/b>|<em><\/em>|<strong><\/strong>|<\/?smallcaps>|<\/?uppercase>)', re.IGNORECASE) | ||||||
| hard_coded_bullets_re = re.compile(r'((?:<p>(?:%s).*?[a-zA-Z].*?</p>\s*)+)' % '|'.join([re.escape(x) for x in DOTS]), re.DOTALL) | hard_coded_bullets_re = re.compile(r'((?:<p>(?:%s).*?[a-zA-Z].*?</p>\s*)+)' % '|'.join([re.escape(x) for x in DOTS]), re.DOTALL) | ||||||
| trailing_empty_content_re = re.compile(r'(?:<p>(?: |\s|<br \/>)*?</p>\s*)+\Z') | trailing_empty_content_re = re.compile(r'(?:<p>(?: |\s|<br \/>)*?</p>\s*)+\Z') | ||||||
| strip_tags_re = re.compile(r'</?\S([^=>]*=(\s*"[^"]*"|\s*\'[^\']*\'|\S*)|[^>])*?>', re.IGNORECASE) |  | ||||||
|  |  | ||||||
|  |  | ||||||
| def escape(text): | def escape(text): | ||||||
| @@ -116,9 +118,31 @@ def linebreaks(value, autoescape=False): | |||||||
|     return '\n\n'.join(paras) |     return '\n\n'.join(paras) | ||||||
| linebreaks = allow_lazy(linebreaks, six.text_type) | linebreaks = allow_lazy(linebreaks, six.text_type) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | class MLStripper(HTMLParser): | ||||||
|  |     def __init__(self): | ||||||
|  |         HTMLParser.__init__(self) | ||||||
|  |         self.reset() | ||||||
|  |         self.fed = [] | ||||||
|  |     def handle_data(self, d): | ||||||
|  |         self.fed.append(d) | ||||||
|  |     def handle_entityref(self, name): | ||||||
|  |         self.fed.append('&%s;' % name) | ||||||
|  |     def handle_charref(self, name): | ||||||
|  |         self.fed.append('&#%s;' % name) | ||||||
|  |     def get_data(self): | ||||||
|  |         return ''.join(self.fed) | ||||||
|  |  | ||||||
| def strip_tags(value): | def strip_tags(value): | ||||||
|     """Returns the given HTML with all tags stripped.""" |     """Returns the given HTML with all tags stripped.""" | ||||||
|     return strip_tags_re.sub('', force_text(value)) |     s = MLStripper() | ||||||
|  |     s.feed(value) | ||||||
|  |     data = s.get_data() | ||||||
|  |     try: | ||||||
|  |         res = s.close() | ||||||
|  |     except Exception as e: | ||||||
|  |         data += s.rawdata | ||||||
|  |     return data | ||||||
| strip_tags = allow_lazy(strip_tags) | strip_tags = allow_lazy(strip_tags) | ||||||
|  |  | ||||||
| def remove_tags(html, tags): | def remove_tags(html, tags): | ||||||
|   | |||||||
| @@ -5,6 +5,7 @@ import os | |||||||
|  |  | ||||||
| from django.utils import html | from django.utils import html | ||||||
| from django.utils._os import upath | from django.utils._os import upath | ||||||
|  | from django.utils.encoding import force_text | ||||||
| from django.utils.unittest import TestCase | from django.utils.unittest import TestCase | ||||||
|  |  | ||||||
|  |  | ||||||
| @@ -63,10 +64,12 @@ class TestUtilsHtml(TestCase): | |||||||
|     def test_strip_tags(self): |     def test_strip_tags(self): | ||||||
|         f = html.strip_tags |         f = html.strip_tags | ||||||
|         items = ( |         items = ( | ||||||
|  |             ('<p>See: 'é is an apostrophe followed by e acute</p>', | ||||||
|  |              'See: 'é is an apostrophe followed by e acute'), | ||||||
|             ('<adf>a', 'a'), |             ('<adf>a', 'a'), | ||||||
|             ('</adf>a', 'a'), |             ('</adf>a', 'a'), | ||||||
|             ('<asdf><asdf>e', 'e'), |             ('<asdf><asdf>e', 'e'), | ||||||
|             ('<f', '<f'), |             ('hi, <f x', 'hi, <f x'), | ||||||
|             ('</fe', '</fe'), |             ('</fe', '</fe'), | ||||||
|             ('<x>b<y>', 'b'), |             ('<x>b<y>', 'b'), | ||||||
|             ('a<p onclick="alert(\'<test>\')">b</p>c', 'abc'), |             ('a<p onclick="alert(\'<test>\')">b</p>c', 'abc'), | ||||||
| @@ -81,8 +84,9 @@ class TestUtilsHtml(TestCase): | |||||||
|         for filename in ('strip_tags1.html', 'strip_tags2.txt'): |         for filename in ('strip_tags1.html', 'strip_tags2.txt'): | ||||||
|             path = os.path.join(os.path.dirname(upath(__file__)), 'files', filename) |             path = os.path.join(os.path.dirname(upath(__file__)), 'files', filename) | ||||||
|             with open(path, 'r') as fp: |             with open(path, 'r') as fp: | ||||||
|  |                 content = force_text(fp.read()) | ||||||
|                 start = datetime.now() |                 start = datetime.now() | ||||||
|                 stripped = html.strip_tags(fp.read()) |                 stripped = html.strip_tags(content) | ||||||
|                 elapsed = datetime.now() - start |                 elapsed = datetime.now() - start | ||||||
|             self.assertEqual(elapsed.seconds, 0) |             self.assertEqual(elapsed.seconds, 0) | ||||||
|             self.assertIn("Please try again.", stripped) |             self.assertIn("Please try again.", stripped) | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user