marcus: handle html entities properly when stripping html

2025-12-31 15:52:58 -05:00 · 2010-10-27 20:12:00 +09:00 · 2010-10-27 20:12:00 +09:00 · 34d6efe1df
commit 34d6efe1df
parent 8ce0ff5b8b
1 changed files with 23 additions and 3 deletions
--- a/anki/utils.py
+++ b/anki/utils.py
@@ -8,7 +8,7 @@ Miscellaneous utilities
 """
 __docformat__ = 'restructuredtext'

-import re, os, random, time, types, math
+import re, os, random, time, types, math, htmlentitydefs

 try:
    import hashlib
@@ -130,8 +130,7 @@ def stripHTML(s):
    s = re.sub("(?s)<style.*?>.*?</style>", "", s)
    s = re.sub("(?s)<script.*?>.*?</script>", "", s)
    s = re.sub("<.*?>", "", s)
-    s = s.replace("&lt;", "<")
-    s = s.replace("&gt;", ">")
+    s = entsToTxt(s)
    return s

 def stripHTMLMedia(s):
@@ -160,6 +159,27 @@ def tidyHTML(html):
    html = re.sub(u"^<table><tr><td style=\"border: none;\">(.*)<br></td></tr></table>$", u"\\1", html)
    return html

+def entsToTxt(html):  # return html with "&name;" / "&#N;" / "&#xH;" tokens replaced by their characters
+    def fixup(m):  # re.sub callback; m matches a single "&...;" token
+        text = m.group(0)
+        if text[:2] == "&#":
+            # numeric character reference: "&#x...;" is parsed as hex, otherwise as decimal
+            try:
+                if text[:3] == "&#x":
+                    return unichr(int(text[3:-1], 16))  # digits between "&#x" and ";"
+                else:
+                    return unichr(int(text[2:-1]))  # digits between "&#" and ";"
+            except ValueError:
+                pass  # conversion failed; fall through and return the token unchanged
+        else:
+            # named entity: look the name (without "&" and ";") up in htmlentitydefs.name2codepoint
+            try:
+                text = unichr(htmlentitydefs.name2codepoint[text[1:-1]])
+            except KeyError:
+                pass  # unknown entity name; keep the original token
+        return text # the converted named entity, or the original token if nothing was converted
+    return re.sub("&#?\w+;", fixup, html)  # apply fixup to every "&", optional "#", word chars, ";" token
+
 # IDs
 ##############################################################################