mirror of
https://github.com/ankitects/anki.git
synced 2025-09-19 06:22:22 -04:00
marcus: handle html entities properly when stripping html
This commit is contained in:
parent
8ce0ff5b8b
commit
34d6efe1df
1 changed files with 23 additions and 3 deletions
|
@ -8,7 +8,7 @@ Miscellaneous utilities
|
||||||
"""
|
"""
|
||||||
__docformat__ = 'restructuredtext'
|
__docformat__ = 'restructuredtext'
|
||||||
|
|
||||||
import re, os, random, time, types, math
|
import re, os, random, time, types, math, htmlentitydefs
|
||||||
|
|
||||||
try:
|
try:
|
||||||
import hashlib
|
import hashlib
|
||||||
|
@ -130,8 +130,7 @@ def stripHTML(s):
|
||||||
s = re.sub("(?s)<style.*?>.*?</style>", "", s)
|
s = re.sub("(?s)<style.*?>.*?</style>", "", s)
|
||||||
s = re.sub("(?s)<script.*?>.*?</script>", "", s)
|
s = re.sub("(?s)<script.*?>.*?</script>", "", s)
|
||||||
s = re.sub("<.*?>", "", s)
|
s = re.sub("<.*?>", "", s)
|
||||||
s = s.replace("&lt;", "<")
|
s = entsToTxt(s)
|
||||||
s = s.replace("&gt;", ">")
|
|
||||||
return s
|
return s
|
||||||
|
|
||||||
def stripHTMLMedia(s):
|
def stripHTMLMedia(s):
|
||||||
|
@ -160,6 +159,27 @@ def tidyHTML(html):
|
||||||
html = re.sub(u"^<table><tr><td style=\"border: none;\">(.*)<br></td></tr></table>$", u"\\1", html)
|
html = re.sub(u"^<table><tr><td style=\"border: none;\">(.*)<br></td></tr></table>$", u"\\1", html)
|
||||||
return html
|
return html
|
||||||
|
|
||||||
|
def entsToTxt(html):
    """Replace HTML character references and named entities with text.

    Numeric references (``&#65;``, ``&#x41;``, ``&#X41;``) are converted
    with ``unichr``; named entities (``&amp;``) are looked up in
    ``htmlentitydefs.name2codepoint``.  Anything that cannot be decoded --
    an unknown entity name, digits that do not parse, or a number that
    ``unichr`` rejects -- is left in the result exactly as it appeared.

    :param html: the string to scan for ``&...;`` sequences.
    :return: *html* with every decodable entity replaced.
    """
    def fixup(m):
        text = m.group(0)
        if text[:2] == "&#":
            # character reference
            try:
                # the hex marker may be written as either 'x' or 'X'
                if text[:3].lower() == "&#x":
                    return unichr(int(text[3:-1], 16))
                else:
                    return unichr(int(text[2:-1]))
            except (ValueError, OverflowError):
                # ValueError: unparsable digits or out-of-range code point;
                # OverflowError: a number too large for unichr to accept.
                # Either way the input is malformed, so keep it untouched.
                pass
        else:
            # named entity
            try:
                text = unichr(htmlentitydefs.name2codepoint[text[1:-1]])
            except KeyError:
                pass
        return text # leave as is
    return re.sub(r"&#?\w+;", fixup, html)
|
||||||
|
|
||||||
# IDs
|
# IDs
|
||||||
##############################################################################
|
##############################################################################
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue