mirror of
https://github.com/ankitects/anki.git
synced 2025-09-18 22:12:21 -04:00
Pre-compile HTML-stripping regexes.
This commit is contained in:
parent
1a9ef3734e
commit
0ce829b6b3
1 changed file with 10 additions and 5 deletions
|
@ -122,17 +122,22 @@ def fmtFloat(float_value, point=1):
|
||||||
|
|
||||||
# HTML
|
# HTML
|
||||||
##############################################################################
|
##############################################################################
|
||||||
|
reStyle = re.compile("(?s)<style.*?>.*?</style>")
|
||||||
|
reScript = re.compile("(?s)<script.*?>.*?</script>")
|
||||||
|
reTag = re.compile("<.*?>")
|
||||||
|
reEnts = re.compile("&#?\w+;")
|
||||||
|
reMedia = re.compile("<img[^>]+src=[\"']?([^\"'>]+)[\"']?[^>]*>")
|
||||||
|
|
||||||
def stripHTML(s):
|
def stripHTML(s):
|
||||||
s = re.sub("(?s)<style.*?>.*?</style>", "", s)
|
s = reStyle.sub("", s)
|
||||||
s = re.sub("(?s)<script.*?>.*?</script>", "", s)
|
s = reScript.sub("", s)
|
||||||
s = re.sub("<.*?>", "", s)
|
s = reTag.sub("", s)
|
||||||
s = entsToTxt(s)
|
s = entsToTxt(s)
|
||||||
return s
|
return s
|
||||||
|
|
||||||
def stripHTMLMedia(s):
|
def stripHTMLMedia(s):
|
||||||
"Strip HTML but keep media filenames"
|
"Strip HTML but keep media filenames"
|
||||||
s = re.sub("<img[^>]+src=[\"']?([^\"'>]+)[\"']?[^>]*>", " \\1 ", s)
|
s = reMedia.sub(" \\1 ", s)
|
||||||
return stripHTML(s)
|
return stripHTML(s)
|
||||||
|
|
||||||
def minimizeHTML(s):
|
def minimizeHTML(s):
|
||||||
|
@ -164,7 +169,7 @@ def entsToTxt(html):
|
||||||
except KeyError:
|
except KeyError:
|
||||||
pass
|
pass
|
||||||
return text # leave as is
|
return text # leave as is
|
||||||
return re.sub("&#?\w+;", fixup, html)
|
return reEnts.sub(fixup, html)
|
||||||
|
|
||||||
# IDs
|
# IDs
|
||||||
##############################################################################
|
##############################################################################
|
||||||
|
|
Loading…
Reference in a new issue