replace the old stripHTML() methods with the backend implementation

Python's regex engine performs pathologically on regexes like
'<!--.*?-->' when fed a large string of repeating '<!--' clauses.
Thanks to JaimeSlome / security@huntr.dev for the report; closes #1380.

Solved by switching to the Rust implementation, which does not suffer
from this issue.

entsToTxt(), minimizeHTML(), and the old regex constants have been
removed; they do not appear to be used by any add-ons.
This commit is contained in:
Damien Elmes 2021-10-01 23:07:45 +10:00
parent a28df026b8
commit 0bb273a0ed
6 changed files with 41 additions and 53 deletions

View file

@ -23,6 +23,7 @@ service CardRenderingService {
rpc RenderMarkdown(RenderMarkdownRequest) returns (generic.String); rpc RenderMarkdown(RenderMarkdownRequest) returns (generic.String);
rpc EncodeIriPaths(generic.String) returns (generic.String); rpc EncodeIriPaths(generic.String) returns (generic.String);
rpc DecodeIriPaths(generic.String) returns (generic.String); rpc DecodeIriPaths(generic.String) returns (generic.String);
rpc StripHtml(StripHtmlRequest) returns (generic.String);
} }
message ExtractAVTagsRequest { message ExtractAVTagsRequest {
@ -119,3 +120,13 @@ message RenderMarkdownRequest {
string markdown = 1; string markdown = 1;
bool sanitize = 2; bool sanitize = 2;
} }
// Request for the StripHtml RPC: the text to strip plus the stripping mode.
message StripHtmlRequest {
// Selects which backend stripping routine is applied to `text`.
enum Mode {
// Dispatched to the backend's strip_html().
NORMAL = 0;
// Dispatched to the backend's strip_html_preserving_media_filenames();
// used by the Python stripHTMLMedia() helper ("keep media filenames").
PRESERVE_MEDIA_FILENAMES = 1;
}
// The HTML text to be stripped.
string text = 1;
// How `text` should be stripped; see Mode.
Mode mode = 2;
}

View file

@ -16,7 +16,8 @@ ignored-classes=
BackendError, BackendError,
SetDeckCollapsedRequest, SetDeckCollapsedRequest,
ConfigKey, ConfigKey,
HelpPageLinkRequest HelpPageLinkRequest,
StripHtmlRequest
[REPORTS] [REPORTS]
output-format=colorized output-format=colorized

View file

@ -32,6 +32,7 @@ OpChangesWithId = collection_pb2.OpChangesWithId
OpChangesAfterUndo = collection_pb2.OpChangesAfterUndo OpChangesAfterUndo = collection_pb2.OpChangesAfterUndo
BrowserRow = search_pb2.BrowserRow BrowserRow = search_pb2.BrowserRow
BrowserColumns = search_pb2.BrowserColumns BrowserColumns = search_pb2.BrowserColumns
StripHtmlMode = card_rendering_pb2.StripHtmlRequest
import copy import copy
import os import os

View file

@ -17,8 +17,7 @@ import time
import traceback import traceback
from contextlib import contextmanager from contextlib import contextmanager
from hashlib import sha1 from hashlib import sha1
from html.entities import name2codepoint from typing import Any, Iterable, Iterator, List, Optional, Union
from typing import Any, Iterable, Iterator, List, Match, Optional, Union
from anki.dbproxy import DBProxy from anki.dbproxy import DBProxy
@ -55,37 +54,23 @@ def intTime(scale: int = 1) -> int:
# HTML # HTML
############################################################################## ##############################################################################
reComment = re.compile("(?s)<!--.*?-->")
reStyle = re.compile("(?si)<style.*?>.*?</style>")
reScript = re.compile("(?si)<script.*?>.*?</script>")
reTag = re.compile("(?s)<.*?>")
reEnts = re.compile(r"&#?\w+;")
reMedia = re.compile("(?i)<img[^>]+src=[\"']?([^\"'>]+)[\"']?[^>]*>")
def stripHTML(s: str) -> str: def stripHTML(s: str) -> str:
s = reComment.sub("", s) import anki.lang
s = reStyle.sub("", s) from anki.collection import StripHtmlMode
s = reScript.sub("", s)
s = reTag.sub("", s) return anki.lang.current_i18n.strip_html(text=s, mode=StripHtmlMode.NORMAL)
s = entsToTxt(s)
return s
def stripHTMLMedia(s: str) -> str: def stripHTMLMedia(s: str) -> str:
"Strip HTML but keep media filenames" "Strip HTML but keep media filenames"
s = reMedia.sub(" \\1 ", s) import anki.lang
return stripHTML(s) from anki.collection import StripHtmlMode
return anki.lang.current_i18n.strip_html(
def minimizeHTML(s: str) -> str: text=s, mode=StripHtmlMode.PRESERVE_MEDIA_FILENAMES
"Correct Qt's verbose bold/underline/etc."
s = re.sub('<span style="font-weight:600;">(.*?)</span>', "<b>\\1</b>", s)
s = re.sub('<span style="font-style:italic;">(.*?)</span>', "<i>\\1</i>", s)
s = re.sub(
'<span style="text-decoration: underline;">(.*?)</span>', "<u>\\1</u>", s
) )
return s
def htmlToTextLine(s: str) -> str: def htmlToTextLine(s: str) -> str:
@ -100,33 +85,6 @@ def htmlToTextLine(s: str) -> str:
return s return s
def entsToTxt(html: str) -> str:
# entitydefs defines nbsp as \xa0 instead of a standard space, so we
# replace it first
html = html.replace("&nbsp;", " ")
def fixup(m: Match) -> str:
text = m.group(0)
if text[:2] == "&#":
# character reference
try:
if text[:3] == "&#x":
return chr(int(text[3:-1], 16))
else:
return chr(int(text[2:-1]))
except ValueError:
pass
else:
# named entity
try:
text = chr(name2codepoint[text[1:-1]])
except KeyError:
pass
return text # leave as is
return reEnts.sub(fixup, html)
# IDs # IDs
############################################################################## ##############################################################################

View file

@ -0,0 +1,6 @@
# Copyright: Ankitects Pty Ltd and contributors
# License: GNU AGPL, version 3 or later; http://www.gnu.org/licenses/agpl.html
from anki.lang import set_lang
# NOTE(review): stripHTML()/stripHTMLMedia() now call
# anki.lang.current_i18n.strip_html(), so presumably a language must be set
# before those helpers can be used here — confirm against anki.lang.
set_lang("en_US")

View file

@ -12,7 +12,7 @@ use crate::{
template::RenderedNode, template::RenderedNode,
text::{ text::{
decode_iri_paths, encode_iri_paths, extract_av_tags, sanitize_html_no_images, decode_iri_paths, encode_iri_paths, extract_av_tags, sanitize_html_no_images,
strip_av_tags, AvTag, strip_av_tags, strip_html, strip_html_preserving_media_filenames, AvTag,
}, },
}; };
@ -161,6 +161,17 @@ impl CardRenderingService for Backend {
fn decode_iri_paths(&self, input: pb::String) -> Result<pb::String> { fn decode_iri_paths(&self, input: pb::String) -> Result<pb::String> {
Ok(decode_iri_paths(&input.val).to_string().into()) Ok(decode_iri_paths(&input.val).to_string().into())
} }
/// Strips HTML from the request's text and returns the result as a
/// protobuf string.
///
/// The request's mode picks the routine: `Normal` uses `strip_html`, while
/// `PreserveMediaFilenames` uses `strip_html_preserving_media_filenames`.
fn strip_html(&self, input: pb::StripHtmlRequest) -> Result<pb::String> {
    let html = input.text.as_str();
    // Inside this method body the bare `strip_html` name refers to the free
    // function imported from `crate::text`, not to this trait method.
    let stripped = match input.mode() {
        pb::strip_html_request::Mode::PreserveMediaFilenames => {
            strip_html_preserving_media_filenames(html)
        }
        pb::strip_html_request::Mode::Normal => strip_html(html),
    };
    Ok(stripped.to_string().into())
}
} }
fn rendered_nodes_to_proto(nodes: Vec<RenderedNode>) -> Vec<pb::RenderedTemplateNode> { fn rendered_nodes_to_proto(nodes: Vec<RenderedNode>) -> Vec<pb::RenderedTemplateNode> {