From 0ce829b6b347b03a98d4dc714ecff9e18fa507ee Mon Sep 17 00:00:00 2001
From: Houssam Salem
Date: Sun, 23 Jun 2013 17:09:10 +1000
Subject: [PATCH] Pre-compile HTML-stripping regexes.

---
 anki/utils.py | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/anki/utils.py b/anki/utils.py
index 3315225fd..34b943e95 100644
--- a/anki/utils.py
+++ b/anki/utils.py
@@ -122,17 +122,22 @@ def fmtFloat(float_value, point=1):
 
 # HTML
 ##############################################################################
+reStyle = re.compile("(?s)<style.*?>.*?</style>")
+reScript = re.compile("(?s)<script.*?>.*?</script>")
+reTag = re.compile("<.*?>")
+reEnts = re.compile("&#?\w+;")
+reMedia = re.compile("<img[^>]+src=[\"']?([^\"'>]+)[\"']?[^>]*>")
 
 def stripHTML(s):
-    s = re.sub("(?s)<style.*?>.*?</style>", "", s)
-    s = re.sub("(?s)<script.*?>.*?</script>", "", s)
-    s = re.sub("<.*?>", "", s)
+    s = reStyle.sub("", s)
+    s = reScript.sub("", s)
+    s = reTag.sub("", s)
     s = entsToTxt(s)
     return s
 
 def stripHTMLMedia(s):
     "Strip HTML but keep media filenames"
-    s = re.sub("<img[^>]+src=[\"']?([^\"'>]+)[\"']?[^>]*>", " \\1 ", s)
+    s = reMedia.sub(" \\1 ", s)
     return stripHTML(s)
 
 def minimizeHTML(s):
@@ -164,7 +169,7 @@ def entsToTxt(html):
             except KeyError:
                 pass
         return text # leave as is
-    return re.sub("&#?\w+;", fixup, html)
+    return reEnts.sub(fixup, html)
 
 # IDs
 ##############################################################################