Pre-compile HTML-stripping regexes.

This commit is contained in:
Houssam Salem 2013-06-23 17:09:10 +10:00
parent 1a9ef3734e
commit 0ce829b6b3

View file

@@ -122,17 +122,22 @@ def fmtFloat(float_value, point=1):
# HTML # HTML
############################################################################## ##############################################################################
reStyle = re.compile("(?s)<style.*?>.*?</style>")
reScript = re.compile("(?s)<script.*?>.*?</script>")
reTag = re.compile("<.*?>")
reEnts = re.compile("&#?\w+;")
reMedia = re.compile("<img[^>]+src=[\"']?([^\"'>]+)[\"']?[^>]*>")
def stripHTML(s): def stripHTML(s):
s = re.sub("(?s)<style.*?>.*?</style>", "", s) s = reStyle.sub("", s)
s = re.sub("(?s)<script.*?>.*?</script>", "", s) s = reScript.sub("", s)
s = re.sub("<.*?>", "", s) s = reTag.sub("", s)
s = entsToTxt(s) s = entsToTxt(s)
return s return s
def stripHTMLMedia(s): def stripHTMLMedia(s):
"Strip HTML but keep media filenames" "Strip HTML but keep media filenames"
s = re.sub("<img[^>]+src=[\"']?([^\"'>]+)[\"']?[^>]*>", " \\1 ", s) s = reMedia.sub(" \\1 ", s)
return stripHTML(s) return stripHTML(s)
def minimizeHTML(s): def minimizeHTML(s):
@@ -164,7 +169,7 @@ def entsToTxt(html):
except KeyError: except KeyError:
pass pass
return text # leave as is return text # leave as is
return re.sub("&#?\w+;", fixup, html) return reEnts.sub(fixup, html)
# IDs # IDs
############################################################################## ##############################################################################