replace the old stripHTML() methods with the backend implementation

Python's regex engine performs pathologically on regexes like
'<!--.*?-->' when fed a large string of repeating '<!--' clauses.
Thanks to JaimeSlome / security@huntr.dev for the report; closes #1380.

Solved by switching to the Rust implementation, which does not suffer
from this issue.

entsToTxt(), minimizeHTML(), and the old regex constants have been
removed; they do not appear to be used by any add-ons.
This commit is contained in:
Damien Elmes 2021-10-01 23:07:45 +10:00
parent a28df026b8
commit 0bb273a0ed
6 changed files with 41 additions and 53 deletions

View file

@ -23,6 +23,7 @@ service CardRenderingService {
rpc RenderMarkdown(RenderMarkdownRequest) returns (generic.String);
rpc EncodeIriPaths(generic.String) returns (generic.String);
rpc DecodeIriPaths(generic.String) returns (generic.String);
rpc StripHtml(StripHtmlRequest) returns (generic.String);
}
message ExtractAVTagsRequest {
@ -119,3 +120,13 @@ message RenderMarkdownRequest {
string markdown = 1;
bool sanitize = 2;
}
// Request payload for the StripHtml RPC: the text to process and the
// stripping variant to apply.
message StripHtmlRequest {
// Selects which stripping variant the backend runs.
enum Mode {
// Plain HTML stripping.
NORMAL = 0;
// Strip HTML but keep media filenames.
PRESERVE_MEDIA_FILENAMES = 1;
}
// The HTML text to strip.
string text = 1;
// The stripping variant to use for `text`.
Mode mode = 2;
}

View file

@ -16,7 +16,8 @@ ignored-classes=
BackendError,
SetDeckCollapsedRequest,
ConfigKey,
HelpPageLinkRequest
HelpPageLinkRequest,
StripHtmlRequest
[REPORTS]
output-format=colorized

View file

@ -32,6 +32,7 @@ OpChangesWithId = collection_pb2.OpChangesWithId
OpChangesAfterUndo = collection_pb2.OpChangesAfterUndo
BrowserRow = search_pb2.BrowserRow
BrowserColumns = search_pb2.BrowserColumns
StripHtmlMode = card_rendering_pb2.StripHtmlRequest
import copy
import os

View file

@ -17,8 +17,7 @@ import time
import traceback
from contextlib import contextmanager
from hashlib import sha1
from html.entities import name2codepoint
from typing import Any, Iterable, Iterator, List, Match, Optional, Union
from typing import Any, Iterable, Iterator, List, Optional, Union
from anki.dbproxy import DBProxy
@ -55,37 +54,23 @@ def intTime(scale: int = 1) -> int:
# HTML
##############################################################################
reComment = re.compile("(?s)<!--.*?-->")
reStyle = re.compile("(?si)<style.*?>.*?</style>")
reScript = re.compile("(?si)<script.*?>.*?</script>")
reTag = re.compile("(?s)<.*?>")
reEnts = re.compile(r"&#?\w+;")
reMedia = re.compile("(?i)<img[^>]+src=[\"']?([^\"'>]+)[\"']?[^>]*>")
def stripHTML(s: str) -> str:
s = reComment.sub("", s)
s = reStyle.sub("", s)
s = reScript.sub("", s)
s = reTag.sub("", s)
s = entsToTxt(s)
return s
import anki.lang
from anki.collection import StripHtmlMode
return anki.lang.current_i18n.strip_html(text=s, mode=StripHtmlMode.NORMAL)
def stripHTMLMedia(s: str) -> str:
"Strip HTML but keep media filenames"
s = reMedia.sub(" \\1 ", s)
return stripHTML(s)
import anki.lang
from anki.collection import StripHtmlMode
def minimizeHTML(s: str) -> str:
"Correct Qt's verbose bold/underline/etc."
s = re.sub('<span style="font-weight:600;">(.*?)</span>', "<b>\\1</b>", s)
s = re.sub('<span style="font-style:italic;">(.*?)</span>', "<i>\\1</i>", s)
s = re.sub(
'<span style="text-decoration: underline;">(.*?)</span>', "<u>\\1</u>", s
return anki.lang.current_i18n.strip_html(
text=s, mode=StripHtmlMode.PRESERVE_MEDIA_FILENAMES
)
return s
def htmlToTextLine(s: str) -> str:
@ -100,33 +85,6 @@ def htmlToTextLine(s: str) -> str:
return s
def entsToTxt(html: str) -> str:
# entitydefs defines nbsp as \xa0 instead of a standard space, so we
# replace it first
html = html.replace("&nbsp;", " ")
def fixup(m: Match) -> str:
text = m.group(0)
if text[:2] == "&#":
# character reference
try:
if text[:3] == "&#x":
return chr(int(text[3:-1], 16))
else:
return chr(int(text[2:-1]))
except ValueError:
pass
else:
# named entity
try:
text = chr(name2codepoint[text[1:-1]])
except KeyError:
pass
return text # leave as is
return reEnts.sub(fixup, html)
# IDs
##############################################################################

View file

@ -0,0 +1,6 @@
# Copyright: Ankitects Pty Ltd and contributors
# License: GNU AGPL, version 3 or later; http://www.gnu.org/licenses/agpl.html
from anki.lang import set_lang
# Select the en_US language at import time.
# NOTE(review): presumably this initializes anki.lang.current_i18n, which
# stripHTML()/stripHTMLMedia() now call into - confirm.
set_lang("en_US")

View file

@ -12,7 +12,7 @@ use crate::{
template::RenderedNode,
text::{
decode_iri_paths, encode_iri_paths, extract_av_tags, sanitize_html_no_images,
strip_av_tags, AvTag,
strip_av_tags, strip_html, strip_html_preserving_media_filenames, AvTag,
},
};
@ -161,6 +161,17 @@ impl CardRenderingService for Backend {
/// Run `decode_iri_paths` over the request's string value and return the
/// result as a protobuf string.
fn decode_iri_paths(&self, input: pb::String) -> Result<pb::String> {
    let decoded = decode_iri_paths(&input.val);
    Ok(decoded.to_string().into())
}
/// Strip HTML from `input.text` using the helper selected by the request's
/// mode, and return the result as a protobuf string.
///
/// `Mode::Normal` calls `strip_html`; `Mode::PreserveMediaFilenames` calls
/// `strip_html_preserving_media_filenames`.
fn strip_html(&self, input: pb::StripHtmlRequest) -> Result<pb::String> {
    let text = &input.text;
    // Both helpers are the free functions imported from `crate::text`,
    // not this trait method.
    let stripped = match input.mode() {
        pb::strip_html_request::Mode::Normal => strip_html(text),
        pb::strip_html_request::Mode::PreserveMediaFilenames => {
            strip_html_preserving_media_filenames(text)
        }
    };
    Ok(stripped.to_string().into())
}
}
fn rendered_nodes_to_proto(nodes: Vec<RenderedNode>) -> Vec<pb::RenderedTemplateNode> {