Ticket #429: html.py.2.patch
File html.py.2.patch, 4.5 KB (added by , 19 years ago) |
---|
-
django_src/django/utils/html.py
1 "Useful HTML utilities suitable for global use by World Online projects." 1 """ 2 HTML utilities suitable for global use. 3 """ 2 4 3 5 import re, string 4 6 … … 9 11 # list of possible strings used for bullets in bulleted lists 10 12 DOTS = ['&middot;', '*', '\xe2\x80\xa2', '&#149;', '&bull;', '&#8226;'] 11 13 12 UNENCODED_AMPERSANDS_RE= re.compile(r'&(?!(\w+|#\d+);)')13 WORD_SPLIT_RE= re.compile(r'(\s+)')14 PUNCTUATION_RE= re.compile('^(?P<lead>(?:%s)*)(?P<middle>.*?)(?P<trail>(?:%s)*)$' % \14 unencoded_ampersands_re = re.compile(r'&(?!(\w+|#\d+);)') 15 word_split_re = re.compile(r'(\s+)') 16 punctuation_re = re.compile('^(?P<lead>(?:%s)*)(?P<middle>.*?)(?P<trail>(?:%s)*)$' % \ 15 17 ('|'.join([re.escape(p) for p in LEADING_PUNCTUATION]), 16 18 '|'.join([re.escape(p) for p in TRAILING_PUNCTUATION]))) 17 SIMPLE_EMAIL_RE= re.compile(r'^\S+@[a-zA-Z0-9._-]+\.[a-zA-Z0-9._-]+$')18 LINK_TARGET_ATTRIBUTE= re.compile(r'(<a [^>]*?)target=[^\s>]+')19 HTML_GUNK= re.compile(r'(?:<br clear="all">|<i><\/i>|<b><\/b>|<em><\/em>|<strong><\/strong>|<\/?smallcaps>|<\/?uppercase>)', re.IGNORECASE)20 HARD_CODED_BULLETS= re.compile(r'((?:<p>(?:%s).*?[a-zA-Z].*?</p>\s*)+)' % '|'.join([re.escape(d) for d in DOTS]), re.DOTALL)21 TRAILING_EMPTY_CONTENT= re.compile(r'(?:<p>(?:&nbsp;|\s|<br \/>)*?</p>\s*)+\Z')19 simple_email_re = re.compile(r'^\S+@[a-zA-Z0-9._-]+\.[a-zA-Z0-9._-]+$') 20 link_target_attribute_re = re.compile(r'(<a [^>]*?)target=[^\s>]+') 21 html_gunk_re = re.compile(r'(?:<br clear="all">|<i><\/i>|<b><\/b>|<em><\/em>|<strong><\/strong>|<\/?smallcaps>|<\/?uppercase>)', re.IGNORECASE) 22 hard_coded_bullets_re = re.compile(r'((?:<p>(?:%s).*?[a-zA-Z].*?</p>\s*)+)' % '|'.join([re.escape(d) for d in DOTS]), re.DOTALL) 23 trailing_empty_content_re = re.compile(r'(?:<p>(?:&nbsp;|\s|<br \/>)*?</p>\s*)+\Z') 22 24 23 25 def escape(html): 24 26 "Returns the given HTML with ampersands, quotes and carets encoded" … … 43 45 44 46 def fix_ampersands(value): 45 47 "Returns the given HTML with all unencoded ampersands encoded 
correctly" 46 return UNENCODED_AMPERSANDS_RE.sub('&amp;', value)48 return unencoded_ampersands_re.sub('&amp;', value) 47 49 48 50 def urlize(text, trim_url_limit=None, nofollow=False): 49 51 """ … … 57 59 If nofollow is True, the URLs in link text will get a rel="nofollow" attribute. 58 60 """ 59 61 trim_url = lambda x, limit=trim_url_limit: limit is not None and (x[:limit] + (len(x) >=limit and '...' or '')) or x 60 words = WORD_SPLIT_RE.split(text)62 words = word_split_re.split(text) 61 63 nofollow_attr = nofollow and ' rel="nofollow"' or '' 62 64 for i, word in enumerate(words): 63 match = PUNCTUATION_RE.match(word)65 match = punctuation_re.match(word) 64 66 if match: 65 67 lead, middle, trail = match.groups() 66 68 if middle.startswith('www.') or ('@' not in middle and not middle.startswith('http://') and \ … … 70 72 if middle.startswith('http://') or middle.startswith('https://'): 71 73 middle = '<a href="%s"%s>%s</a>' % (middle, nofollow_attr, trim_url(middle)) 72 74 if '@' in middle and not middle.startswith('www.') and not ':' in middle \ 73 and SIMPLE_EMAIL_RE.match(middle):75 and simple_email_re.match(middle): 74 76 middle = '<a href="mailto:%s">%s</a>' % (middle, middle) 75 77 if lead + middle + trail != word: 76 78 words[i] = lead + middle + trail … … 94 96 text = re.sub(r'<(/?)\s*i\s*>', '<\\1em>', text) 95 97 text = fix_ampersands(text) 96 98 # Remove all target="" attributes from <a> tags. 97 text = LINK_TARGET_ATTRIBUTE.sub('\\1', text)99 text = link_target_attribute_re.sub('\\1', text) 98 100 # Trim stupid HTML such as <br clear="all">. 99 text = HTML_GUNK.sub('', text)101 text = html_gunk_re.sub('', text) 100 102 # Convert hard-coded bullets into HTML unordered lists. 
101 103 def replace_p_tags(match): 102 104 s = match.group().replace('</p>', '</li>') 103 105 for d in DOTS: 104 106 s = s.replace('<p>%s' % d, '<li>') 105 107 return '<ul>\n%s\n</ul>' % s 106 text = HARD_CODED_BULLETS.sub(replace_p_tags, text)108 text = hard_coded_bullets_re.sub(replace_p_tags, text) 107 109 # Remove stuff like "<p> </p>", but only if it's at the bottom of the text. 108 text = TRAILING_EMPTY_CONTENT.sub('', text)110 text = trailing_empty_content_re.sub('', text) 109 111 return text 110 112