marcus: handle html entities properly when stripping html

2025-12-31 15:52:58 -05:00 · 2010-10-27 20:12:00 +09:00 · 2010-10-27 20:12:00 +09:00 · 34d6efe1df
commit 34d6efe1df
parent 8ce0ff5b8b
1 changed files with 23 additions and 3 deletions
--- a/anki/utils.py
+++ b/anki/utils.py
@@ -8,7 +8,7 @@ Miscellaneous utilities
 """
 __docformat__ = 'restructuredtext'
-import re, os, random, time, types, math
+import re, os, random, time, types, math, htmlentitydefs
 try:
    import hashlib
@@ -130,8 +130,7 @@ def stripHTML(s):
    s = re.sub("(?s)<style.*?>.*?</style>", "", s)
    s = re.sub("(?s)<script.*?>.*?</script>", "", s)
    s = re.sub("<.*?>", "", s)
-    s = s.replace("&lt;", "<")
+    s = entsToTxt(s)
    s = s.replace("&gt;", ">")
    return s
 def stripHTMLMedia(s):
@@ -160,6 +159,27 @@ def tidyHTML(html):
    html = re.sub(u"^<table><tr><td style=\"border: none;\">(.*)<br></td></tr></table>$", u"\\1", html)
    return html
 def entsToTxt(html):
    """Replace HTML entities and character references in `html` with characters.

    Every ``&name;``, ``&#NNN;`` and ``&#xHHH;`` sequence matched by the
    pattern at the bottom of this function is handed to `fixup`, which
    returns the decoded character.  Sequences that cannot be decoded (an
    entity name missing from ``htmlentitydefs.name2codepoint``, or a numeric
    reference that raises ``ValueError``) are left in the output unchanged.

    Returns the result of ``re.sub`` over `html`.
    """
    def fixup(m):
        # The whole matched sequence, including the leading "&" and the
        # trailing ";".
        text = m.group(0)
        if text[:2] == "&#":
            # character reference
            try:
                # Only a lowercase "x" selects the hexadecimal branch; a
                # reference written as "&#X..." goes to the decimal branch,
                # fails int() and is therefore left as is.
                if text[:3] == "&#x":
                    # Strip "&#x" and ";" and parse the rest as base 16.
                    return unichr(int(text[3:-1], 16))
                else:
                    # Strip "&#" and ";" and parse the rest as base 10.
                    return unichr(int(text[2:-1]))
            except ValueError:
                # The digits could not be parsed (presumably unichr() can
                # also reject an out-of-range code point this way); fall
                # through and return the original text.
                pass
        else:
            # named entity
            try:
                # Strip "&" and ";" to get the bare entity name.
                text = unichr(htmlentitydefs.name2codepoint[text[1:-1]])
            except KeyError:
                # Unknown entity name: keep the original text.
                pass
        return text # leave as is
    # The "#" is optional so one pattern covers both named entities and
    # numeric references; the terminating ";" is required for a match.
    return re.sub("&#?\w+;", fixup, html)
 # IDs
 ##############################################################################