marcus: handle html entities properly when stripping html

This commit is contained in:
Damien Elmes 2010-10-27 20:12:00 +09:00
parent 8ce0ff5b8b
commit 34d6efe1df

View file

@ -8,7 +8,7 @@ Miscellaneous utilities
""" """
__docformat__ = 'restructuredtext' __docformat__ = 'restructuredtext'
import re, os, random, time, types, math import re, os, random, time, types, math, htmlentitydefs
try: try:
import hashlib import hashlib
@ -130,8 +130,7 @@ def stripHTML(s):
s = re.sub("(?s)<style.*?>.*?</style>", "", s) s = re.sub("(?s)<style.*?>.*?</style>", "", s)
s = re.sub("(?s)<script.*?>.*?</script>", "", s) s = re.sub("(?s)<script.*?>.*?</script>", "", s)
s = re.sub("<.*?>", "", s) s = re.sub("<.*?>", "", s)
s = s.replace("&lt;", "<") s = entsToTxt(s)
s = s.replace("&gt;", ">")
return s return s
def stripHTMLMedia(s): def stripHTMLMedia(s):
@ -160,6 +159,27 @@ def tidyHTML(html):
html = re.sub(u"^<table><tr><td style=\"border: none;\">(.*)<br></td></tr></table>$", u"\\1", html) html = re.sub(u"^<table><tr><td style=\"border: none;\">(.*)<br></td></tr></table>$", u"\\1", html)
return html return html
def entsToTxt(html):
    """Replace HTML character references and named entities with text.

    Decimal references (``&#65;``), hexadecimal references (``&#x41;`` or
    ``&#X41;``) and named entities known to ``htmlentitydefs`` (``&amp;``)
    are replaced by the corresponding unicode character.  Any match that
    cannot be resolved -- malformed digits, a code point ``unichr()``
    rejects, or an unknown entity name -- is left in the output unchanged.

    :param html: the string to scan for ``&...;`` sequences
    :return: the string with every resolvable entity replaced
    """
    def fixup(m):
        text = m.group(0)
        if text[:2] == "&#":
            # character reference; the hex marker may be upper or lower case
            try:
                if text[2:3] in ("x", "X"):
                    return unichr(int(text[3:-1], 16))
                return unichr(int(text[2:-1]))
            except (ValueError, OverflowError):
                # ValueError: digits did not parse, or the code point is
                # outside the range unichr() supports.
                # OverflowError: the number is too large for unichr() to
                # accept at all; treat it like any other bad reference.
                pass
        else:
            # named entity
            try:
                return unichr(htmlentitydefs.name2codepoint[text[1:-1]])
            except KeyError:
                pass
        return text  # leave as is
    return re.sub(r"&#?\w+;", fixup, html)
# IDs # IDs
############################################################################## ##############################################################################