marcus: handle html entities properly when stripping html

This commit is contained in:
Damien Elmes 2010-10-27 20:12:00 +09:00
parent 8ce0ff5b8b
commit 34d6efe1df

View file

@ -8,7 +8,7 @@ Miscellaneous utilities
"""
__docformat__ = 'restructuredtext'
import re, os, random, time, types, math
import re, os, random, time, types, math, htmlentitydefs
try:
import hashlib
@ -130,8 +130,7 @@ def stripHTML(s):
s = re.sub("(?s)<style.*?>.*?</style>", "", s)
s = re.sub("(?s)<script.*?>.*?</script>", "", s)
s = re.sub("<.*?>", "", s)
s = s.replace("&lt;", "<")
s = s.replace("&gt;", ">")
s = entsToTxt(s)
return s
def stripHTMLMedia(s):
@ -160,6 +159,27 @@ def tidyHTML(html):
html = re.sub(u"^<table><tr><td style=\"border: none;\">(.*)<br></td></tr></table>$", u"\\1", html)
return html
def entsToTxt(html):
    """Replace HTML character references and named entities with text.

    Numeric references (``&#65;``, ``&#x41;``, ``&#X41;``) and named
    entities (``&amp;``) are converted to the character they denote.
    Anything that cannot be decoded -- an unknown entity name, malformed
    digits, or a code point outside the supported range -- is left in the
    output unchanged.

    :param html: the string to scan for ``&...;`` sequences.
    :returns: ``html`` with every decodable entity replaced.
    """
    # Resolve the interpreter-specific names locally so the function works
    # with both the Python 2 (htmlentitydefs/unichr) and the Python 3
    # (html.entities/chr) spellings.
    try:
        from htmlentitydefs import name2codepoint
    except ImportError:
        from html.entities import name2codepoint
    try:
        toChar = unichr
    except NameError:
        toChar = chr

    def fixup(m):
        text = m.group(0)
        try:
            if text[:2] == "&#":
                # numeric character reference; HTML accepts both "x" and
                # "X" as the hexadecimal marker
                if text[2:3] in ("x", "X"):
                    return toChar(int(text[3:-1], 16))
                return toChar(int(text[2:-1]))
            # named entity
            return toChar(name2codepoint[text[1:-1]])
        except (KeyError, ValueError, OverflowError):
            # KeyError: unknown entity name.
            # ValueError: malformed digits or code point out of range.
            # OverflowError: number too large for the character builtin.
            return text  # leave as is

    return re.sub(r"&#?\w+;", fixup, html)
# IDs
##############################################################################