mirror of
https://github.com/ankitects/anki.git
synced 2025-09-19 06:22:22 -04:00
marcus: handle html entities properly when stripping html
This commit is contained in:
parent
8ce0ff5b8b
commit
34d6efe1df
1 changed files with 23 additions and 3 deletions
|
@ -8,7 +8,7 @@ Miscellaneous utilities
|
|||
"""
|
||||
__docformat__ = 'restructuredtext'
|
||||
|
||||
import re, os, random, time, types, math
|
||||
import re, os, random, time, types, math, htmlentitydefs
|
||||
|
||||
try:
|
||||
import hashlib
|
||||
|
@ -130,8 +130,7 @@ def stripHTML(s):
|
|||
s = re.sub("(?s)<style.*?>.*?</style>", "", s)
|
||||
s = re.sub("(?s)<script.*?>.*?</script>", "", s)
|
||||
s = re.sub("<.*?>", "", s)
|
||||
s = s.replace("<", "<")
|
||||
s = s.replace(">", ">")
|
||||
s = entsToTxt(s)
|
||||
return s
|
||||
|
||||
def stripHTMLMedia(s):
|
||||
|
@ -160,6 +159,27 @@ def tidyHTML(html):
|
|||
html = re.sub(u"^<table><tr><td style=\"border: none;\">(.*)<br></td></tr></table>$", u"\\1", html)
|
||||
return html
|
||||
|
||||
def entsToTxt(html):
    """Replace HTML character references and named entities with text.

    Decimal references (``&#65;``), hexadecimal references (``&#x41;`` or
    ``&#X41;``) and named entities listed in
    ``htmlentitydefs.name2codepoint`` (``&amp;``, ``&nbsp;``, ...) are
    converted to the corresponding character. Any reference that cannot
    be decoded (unknown name, malformed or out-of-range number) is left
    in the string unchanged.

    :param html: the string to scan for ``&...;`` sequences.
    :returns: the string with every decodable reference substituted.
    """
    def fixup(m):
        text = m.group(0)
        if text[:2] == "&#":
            # character reference
            try:
                if text[:3] in ("&#x", "&#X"):
                    return unichr(int(text[3:-1], 16))
                return unichr(int(text[2:-1]))
            except (ValueError, OverflowError):
                # ValueError: digits are malformed or the code point is
                # outside the range unichr accepts. OverflowError: the
                # number is too large to be passed to unichr at all.
                # Either way, fall through and keep the original text.
                pass
        else:
            # named entity
            try:
                return unichr(htmlentitydefs.name2codepoint[text[1:-1]])
            except KeyError:
                # not a known entity name
                pass
        return text  # leave as is
    return re.sub(r"&#?\w+;", fixup, html)
|
||||
|
||||
# IDs
|
||||
##############################################################################
|
||||
|
||||
|
|
Loading…
Reference in a new issue