mirror of
https://github.com/ankitects/anki.git
synced 2025-09-18 14:02:21 -04:00
replace the old stripHTML() methods with the backend implementation
Python's regex engine performs pathologically on regexes like '<!--.*?-->' when fed a large string of repeating '<!--' clauses. Thanks to JaimeSlome / security@huntr.dev for the report; closes #1380. Solved by switching to the Rust implementation, which does not suffer from this issue. entsToTxt(), minimizeHTML(), and the old regex constants have been removed; they do not appear to be used by any add-ons.
This commit is contained in:
parent
a28df026b8
commit
0bb273a0ed
6 changed files with 41 additions and 53 deletions
|
@ -23,6 +23,7 @@ service CardRenderingService {
|
|||
rpc RenderMarkdown(RenderMarkdownRequest) returns (generic.String);
|
||||
rpc EncodeIriPaths(generic.String) returns (generic.String);
|
||||
rpc DecodeIriPaths(generic.String) returns (generic.String);
|
||||
rpc StripHtml(StripHtmlRequest) returns (generic.String);
|
||||
}
|
||||
|
||||
message ExtractAVTagsRequest {
|
||||
|
@ -119,3 +120,13 @@ message RenderMarkdownRequest {
|
|||
string markdown = 1;
|
||||
bool sanitize = 2;
|
||||
}
|
||||
|
||||
// Request message for the StripHtml RPC: the text to strip plus the
// stripping mode the backend should apply.
message StripHtmlRequest {
|
||||
// Selects which backend stripping routine is used.
enum Mode {
|
||||
// Plain HTML stripping (the zero value, so it is the default mode).
NORMAL = 0;
|
||||
// Strip HTML but keep media filenames.
PRESERVE_MEDIA_FILENAMES = 1;
|
||||
}
|
||||
|
||||
// Input text that may contain HTML.
string text = 1;
|
||||
// How the text should be stripped; see Mode.
Mode mode = 2;
|
||||
}
|
||||
|
|
|
@ -16,7 +16,8 @@ ignored-classes=
|
|||
BackendError,
|
||||
SetDeckCollapsedRequest,
|
||||
ConfigKey,
|
||||
HelpPageLinkRequest
|
||||
HelpPageLinkRequest,
|
||||
StripHtmlRequest
|
||||
|
||||
[REPORTS]
|
||||
output-format=colorized
|
||||
|
|
|
@ -32,6 +32,7 @@ OpChangesWithId = collection_pb2.OpChangesWithId
|
|||
OpChangesAfterUndo = collection_pb2.OpChangesAfterUndo
|
||||
BrowserRow = search_pb2.BrowserRow
|
||||
BrowserColumns = search_pb2.BrowserColumns
|
||||
StripHtmlMode = card_rendering_pb2.StripHtmlRequest
|
||||
|
||||
import copy
|
||||
import os
|
||||
|
|
|
@ -17,8 +17,7 @@ import time
|
|||
import traceback
|
||||
from contextlib import contextmanager
|
||||
from hashlib import sha1
|
||||
from html.entities import name2codepoint
|
||||
from typing import Any, Iterable, Iterator, List, Match, Optional, Union
|
||||
from typing import Any, Iterable, Iterator, List, Optional, Union
|
||||
|
||||
from anki.dbproxy import DBProxy
|
||||
|
||||
|
@ -55,37 +54,23 @@ def intTime(scale: int = 1) -> int:
|
|||
|
||||
# HTML
|
||||
##############################################################################
|
||||
reComment = re.compile("(?s)<!--.*?-->")
|
||||
reStyle = re.compile("(?si)<style.*?>.*?</style>")
|
||||
reScript = re.compile("(?si)<script.*?>.*?</script>")
|
||||
reTag = re.compile("(?s)<.*?>")
|
||||
reEnts = re.compile(r"&#?\w+;")
|
||||
reMedia = re.compile("(?i)<img[^>]+src=[\"']?([^\"'>]+)[\"']?[^>]*>")
|
||||
|
||||
|
||||
def stripHTML(s: str) -> str:
|
||||
s = reComment.sub("", s)
|
||||
s = reStyle.sub("", s)
|
||||
s = reScript.sub("", s)
|
||||
s = reTag.sub("", s)
|
||||
s = entsToTxt(s)
|
||||
return s
|
||||
import anki.lang
|
||||
from anki.collection import StripHtmlMode
|
||||
|
||||
return anki.lang.current_i18n.strip_html(text=s, mode=StripHtmlMode.NORMAL)
|
||||
|
||||
|
||||
def stripHTMLMedia(s: str) -> str:
|
||||
"Strip HTML but keep media filenames"
|
||||
s = reMedia.sub(" \\1 ", s)
|
||||
return stripHTML(s)
|
||||
import anki.lang
|
||||
from anki.collection import StripHtmlMode
|
||||
|
||||
|
||||
def minimizeHTML(s: str) -> str:
|
||||
"Correct Qt's verbose bold/underline/etc."
|
||||
s = re.sub('<span style="font-weight:600;">(.*?)</span>', "<b>\\1</b>", s)
|
||||
s = re.sub('<span style="font-style:italic;">(.*?)</span>', "<i>\\1</i>", s)
|
||||
s = re.sub(
|
||||
'<span style="text-decoration: underline;">(.*?)</span>', "<u>\\1</u>", s
|
||||
return anki.lang.current_i18n.strip_html(
|
||||
text=s, mode=StripHtmlMode.PRESERVE_MEDIA_FILENAMES
|
||||
)
|
||||
return s
|
||||
|
||||
|
||||
def htmlToTextLine(s: str) -> str:
|
||||
|
@ -100,33 +85,6 @@ def htmlToTextLine(s: str) -> str:
|
|||
return s
|
||||
|
||||
|
||||
def entsToTxt(html: str) -> str:
|
||||
# entitydefs defines nbsp as \xa0 instead of a standard space, so we
|
||||
# replace it first
|
||||
html = html.replace(" ", " ")
|
||||
|
||||
def fixup(m: Match) -> str:
|
||||
text = m.group(0)
|
||||
if text[:2] == "&#":
|
||||
# character reference
|
||||
try:
|
||||
if text[:3] == "&#x":
|
||||
return chr(int(text[3:-1], 16))
|
||||
else:
|
||||
return chr(int(text[2:-1]))
|
||||
except ValueError:
|
||||
pass
|
||||
else:
|
||||
# named entity
|
||||
try:
|
||||
text = chr(name2codepoint[text[1:-1]])
|
||||
except KeyError:
|
||||
pass
|
||||
return text # leave as is
|
||||
|
||||
return reEnts.sub(fixup, html)
|
||||
|
||||
|
||||
# IDs
|
||||
##############################################################################
|
||||
|
||||
|
|
|
@ -0,0 +1,6 @@
|
|||
# Copyright: Ankitects Pty Ltd and contributors
|
||||
# License: GNU AGPL, version 3 or later; http://www.gnu.org/licenses/agpl.html
|
||||
|
||||
from anki.lang import set_lang
|
||||
|
||||
set_lang("en_US")
|
|
@ -12,7 +12,7 @@ use crate::{
|
|||
template::RenderedNode,
|
||||
text::{
|
||||
decode_iri_paths, encode_iri_paths, extract_av_tags, sanitize_html_no_images,
|
||||
strip_av_tags, AvTag,
|
||||
strip_av_tags, strip_html, strip_html_preserving_media_filenames, AvTag,
|
||||
},
|
||||
};
|
||||
|
||||
|
@ -161,6 +161,17 @@ impl CardRenderingService for Backend {
|
|||
fn decode_iri_paths(&self, input: pb::String) -> Result<pb::String> {
|
||||
Ok(decode_iri_paths(&input.val).to_string().into())
|
||||
}
|
||||
|
||||
/// Strips HTML from `input.text` according to `input.mode()`.
///
/// `Mode::Normal` delegates to `strip_html`, and
/// `Mode::PreserveMediaFilenames` delegates to
/// `strip_html_preserving_media_filenames`. The result is converted to an
/// owned string and wrapped in the protobuf string message.
///
/// No error path is visible in this method; it always returns `Ok`.
fn strip_html(&self, input: pb::StripHtmlRequest) -> Result<pb::String> {
|
||||
// The match is exhaustive over the two request modes, so adding a new
// mode to the proto enum will require a new arm here.
Ok(match input.mode() {
|
||||
pb::strip_html_request::Mode::Normal => strip_html(&input.text),
|
||||
pb::strip_html_request::Mode::PreserveMediaFilenames => {
|
||||
strip_html_preserving_media_filenames(&input.text)
|
||||
}
|
||||
}
|
||||
// Convert whatever the text helpers return into an owned string before
// turning it into the protobuf response type.
.to_string()
|
||||
.into())
|
||||
}
|
||||
}
|
||||
|
||||
fn rendered_nodes_to_proto(nodes: Vec<RenderedNode>) -> Vec<pb::RenderedTemplateNode> {
|
||||
|
|
Loading…
Reference in a new issue