mirror of
				https://github.com/django/django.git
				synced 2025-10-31 09:41:08 +00:00 
			
		
		
		
	Fixed #7267 - UnicodeDecodeError in clean_html
Thanks to Nikolay for the report, and gav and aaugustin for the patch. git-svn-id: http://code.djangoproject.com/svn/django/trunk@16118 bcc190cf-cafb-0310-a4f2-bffc1f526a37
This commit is contained in:
		| @@ -13,7 +13,7 @@ LEADING_PUNCTUATION  = ['(', '<', '&lt;'] | ||||
| TRAILING_PUNCTUATION = ['.', ',', ')', '>', '\n', '&gt;'] | ||||
|  | ||||
| # List of possible strings used for bullets in bulleted lists. | ||||
| DOTS = ['&middot;', '*', '\xe2\x80\xa2', '&#149;', '&bull;', '&#8226;'] | ||||
| DOTS = [u'&middot;', u'*', u'\u2022', u'&#149;', u'&bull;', u'&#8226;'] | ||||
|  | ||||
| unencoded_ampersands_re = re.compile(r'&(?!(\w+|#\d+);)') | ||||
| word_split_re = re.compile(r'(\s+)') | ||||
| @@ -180,13 +180,13 @@ def clean_html(text): | ||||
|     text = html_gunk_re.sub('', text) | ||||
|     # Convert hard-coded bullets into HTML unordered lists. | ||||
|     def replace_p_tags(match): | ||||
|         s = match.group().replace('</p>', '</li>') | ||||
|         s = match.group().replace(u'</p>', u'</li>') | ||||
|         for d in DOTS: | ||||
|             s = s.replace('<p>%s' % d, '<li>') | ||||
|             s = s.replace(u'<p>%s' % d, u'<li>') | ||||
|         return u'<ul>\n%s\n</ul>' % s | ||||
|     text = hard_coded_bullets_re.sub(replace_p_tags, text) | ||||
|     # Remove stuff like "<p>  </p>", but only if it's at the bottom | ||||
|     # of the text. | ||||
|     text = trailing_empty_content_re.sub('', text) | ||||
|     text = trailing_empty_content_re.sub(u'', text) | ||||
|     return text | ||||
| clean_html = allow_lazy(clean_html, unicode) | ||||
|   | ||||
| @@ -121,3 +121,15 @@ class TestUtilsHtml(unittest.TestCase): | ||||
|         ) | ||||
|         for value, output in items: | ||||
|             self.check_output(f, value, output) | ||||
|  | ||||
|     def test_clean_html(self): | ||||
|         f = html.clean_html | ||||
|         items = ( | ||||
|             (u'<p>I <i>believe</i> in <b>semantic markup</b>!</p>', u'<p>I <em>believe</em> in <strong>semantic markup</strong>!</p>'), | ||||
|             (u'I escape & I don\'t <a href="#" target="_blank">target</a>', u'I escape &amp; I don\'t <a href="#" >target</a>'), | ||||
|             (u'<p>I kill whitespace</p><br clear="all"><p> </p>', u'<p>I kill whitespace</p>'), | ||||
|             # also a regression test for #7267: this used to raise a UnicodeDecodeError | ||||
|             (u'<p>* foo</p><p>* bar</p>', u'<ul>\n<li> foo</li><li> bar</li>\n</ul>'), | ||||
|         ) | ||||
|         for value, output in items: | ||||
|             self.check_output(f, value, output) | ||||
|   | ||||
		Reference in New Issue
	
	Block a user