stripHTML() handles entity replacement, which should be done post-strip

This commit is contained in:
Damien Elmes 2012-09-07 04:54:59 +09:00
parent 59cb45eda6
commit b66d1479c2

View file

@@ -64,9 +64,6 @@ def _latexFromHtml(col, latex):
# entitydefs defines nbsp as \xa0 instead of a standard space, so we # entitydefs defines nbsp as \xa0 instead of a standard space, so we
# replace it first # replace it first
latex = latex.replace("&nbsp;", " ") latex = latex.replace("&nbsp;", " ")
for match in re.compile("&([a-z]+);", re.IGNORECASE).finditer(latex):
if match.group(1) in entitydefs:
latex = latex.replace(match.group(), entitydefs[match.group(1)])
latex = re.sub("<br( /)?>", "\n", latex) latex = re.sub("<br( /)?>", "\n", latex)
# replace <div> etc with spaces # replace <div> etc with spaces
latex = re.sub("<.+?>", " ", latex) latex = re.sub("<.+?>", " ", latex)