diff --git a/anki/utils.py b/anki/utils.py index 7e7805411..c3fffe826 100644 --- a/anki/utils.py +++ b/anki/utils.py @@ -8,7 +8,7 @@ Miscellaneous utilities """ __docformat__ = 'restructuredtext' -import re, os, random, time, types, math +import re, os, random, time, types, math, htmlentitydefs try: import hashlib @@ -130,8 +130,7 @@ def stripHTML(s): s = re.sub("(?s).*?", "", s) s = re.sub("(?s).*?", "", s) s = re.sub("<.*?>", "", s) - s = s.replace("<", "<") - s = s.replace(">", ">") + s = entsToTxt(s) return s def stripHTMLMedia(s): @@ -160,6 +159,27 @@ def tidyHTML(html): html = re.sub(u"^
(.*)
def entsToTxt(html):
    """Replace HTML character references and named entities with text.

    Handles decimal references (``&#65;``), hexadecimal references with
    either prefix case (``&#x41;`` / ``&#X41;``) and named entities
    (``&amp;``, ``&lt;``, ...).

    :param html: the string to convert.
    :returns: ``html`` with every recognised reference replaced by the
        character it denotes. Unknown names, malformed numbers and code
        points that cannot be turned into a character are left untouched.
    """
    # Resolve the entity table and the code-point-to-character function
    # locally so the function works on both Python 2 and Python 3.
    try:
        from htmlentitydefs import name2codepoint  # Python 2
    except ImportError:
        from html.entities import name2codepoint  # Python 3
    try:
        toChar = unichr  # Python 2
    except NameError:
        toChar = chr  # Python 3

    def fixup(m):
        text = m.group(0)
        if text[:2] == "&#":
            # numeric character reference: strip the "&#" and the ";"
            digits = text[2:-1]
            try:
                if digits[:1] in ("x", "X"):
                    return toChar(int(digits[1:], 16))
                return toChar(int(digits))
            except (ValueError, OverflowError):
                # ValueError: not a number, or outside the valid range.
                # OverflowError: number too large for the C-level
                # conversion. Either way fall through and keep the text.
                pass
        else:
            # named entity: strip the "&" and the ";"
            try:
                return toChar(name2codepoint[text[1:-1]])
            except KeyError:
                pass
        return text  # leave as is

    return re.sub(r"&#?\w+;", fixup, html)