mirror of
https://github.com/ankitects/anki.git
synced 2025-09-20 15:02:21 -04:00
replace the old stripHTML() methods with the backend implementation
Python's regex engine performs pathologically on regexes like '<!--.*?-->' when fed a large string of repeating '<!--' clauses. Thanks to JaimeSlome / security@huntr.dev for the report; closes #1380. Solved by switching to the Rust implementation, which does not suffer from this issue. entsToTxt(), minimizeHTML(), and the old regex constants have been removed; they do not appear to be used by any add-ons.
This commit is contained in:
parent
a28df026b8
commit
0bb273a0ed
6 changed files with 41 additions and 53 deletions
|
@ -23,6 +23,7 @@ service CardRenderingService {
|
||||||
rpc RenderMarkdown(RenderMarkdownRequest) returns (generic.String);
|
rpc RenderMarkdown(RenderMarkdownRequest) returns (generic.String);
|
||||||
rpc EncodeIriPaths(generic.String) returns (generic.String);
|
rpc EncodeIriPaths(generic.String) returns (generic.String);
|
||||||
rpc DecodeIriPaths(generic.String) returns (generic.String);
|
rpc DecodeIriPaths(generic.String) returns (generic.String);
|
||||||
|
rpc StripHtml(StripHtmlRequest) returns (generic.String);
|
||||||
}
|
}
|
||||||
|
|
||||||
message ExtractAVTagsRequest {
|
message ExtractAVTagsRequest {
|
||||||
|
@ -119,3 +120,13 @@ message RenderMarkdownRequest {
|
||||||
string markdown = 1;
|
string markdown = 1;
|
||||||
bool sanitize = 2;
|
bool sanitize = 2;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
message StripHtmlRequest {
|
||||||
|
enum Mode {
|
||||||
|
NORMAL = 0;
|
||||||
|
PRESERVE_MEDIA_FILENAMES = 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
string text = 1;
|
||||||
|
Mode mode = 2;
|
||||||
|
}
|
||||||
|
|
|
@ -16,7 +16,8 @@ ignored-classes=
|
||||||
BackendError,
|
BackendError,
|
||||||
SetDeckCollapsedRequest,
|
SetDeckCollapsedRequest,
|
||||||
ConfigKey,
|
ConfigKey,
|
||||||
HelpPageLinkRequest
|
HelpPageLinkRequest,
|
||||||
|
StripHtmlRequest
|
||||||
|
|
||||||
[REPORTS]
|
[REPORTS]
|
||||||
output-format=colorized
|
output-format=colorized
|
||||||
|
|
|
@ -32,6 +32,7 @@ OpChangesWithId = collection_pb2.OpChangesWithId
|
||||||
OpChangesAfterUndo = collection_pb2.OpChangesAfterUndo
|
OpChangesAfterUndo = collection_pb2.OpChangesAfterUndo
|
||||||
BrowserRow = search_pb2.BrowserRow
|
BrowserRow = search_pb2.BrowserRow
|
||||||
BrowserColumns = search_pb2.BrowserColumns
|
BrowserColumns = search_pb2.BrowserColumns
|
||||||
|
StripHtmlMode = card_rendering_pb2.StripHtmlRequest
|
||||||
|
|
||||||
import copy
|
import copy
|
||||||
import os
|
import os
|
||||||
|
|
|
@ -17,8 +17,7 @@ import time
|
||||||
import traceback
|
import traceback
|
||||||
from contextlib import contextmanager
|
from contextlib import contextmanager
|
||||||
from hashlib import sha1
|
from hashlib import sha1
|
||||||
from html.entities import name2codepoint
|
from typing import Any, Iterable, Iterator, List, Optional, Union
|
||||||
from typing import Any, Iterable, Iterator, List, Match, Optional, Union
|
|
||||||
|
|
||||||
from anki.dbproxy import DBProxy
|
from anki.dbproxy import DBProxy
|
||||||
|
|
||||||
|
@ -55,37 +54,23 @@ def intTime(scale: int = 1) -> int:
|
||||||
|
|
||||||
# HTML
|
# HTML
|
||||||
##############################################################################
|
##############################################################################
|
||||||
reComment = re.compile("(?s)<!--.*?-->")
|
|
||||||
reStyle = re.compile("(?si)<style.*?>.*?</style>")
|
|
||||||
reScript = re.compile("(?si)<script.*?>.*?</script>")
|
|
||||||
reTag = re.compile("(?s)<.*?>")
|
|
||||||
reEnts = re.compile(r"&#?\w+;")
|
|
||||||
reMedia = re.compile("(?i)<img[^>]+src=[\"']?([^\"'>]+)[\"']?[^>]*>")
|
|
||||||
|
|
||||||
|
|
||||||
def stripHTML(s: str) -> str:
|
def stripHTML(s: str) -> str:
|
||||||
s = reComment.sub("", s)
|
import anki.lang
|
||||||
s = reStyle.sub("", s)
|
from anki.collection import StripHtmlMode
|
||||||
s = reScript.sub("", s)
|
|
||||||
s = reTag.sub("", s)
|
return anki.lang.current_i18n.strip_html(text=s, mode=StripHtmlMode.NORMAL)
|
||||||
s = entsToTxt(s)
|
|
||||||
return s
|
|
||||||
|
|
||||||
|
|
||||||
def stripHTMLMedia(s: str) -> str:
|
def stripHTMLMedia(s: str) -> str:
|
||||||
"Strip HTML but keep media filenames"
|
"Strip HTML but keep media filenames"
|
||||||
s = reMedia.sub(" \\1 ", s)
|
import anki.lang
|
||||||
return stripHTML(s)
|
from anki.collection import StripHtmlMode
|
||||||
|
|
||||||
|
return anki.lang.current_i18n.strip_html(
|
||||||
def minimizeHTML(s: str) -> str:
|
text=s, mode=StripHtmlMode.PRESERVE_MEDIA_FILENAMES
|
||||||
"Correct Qt's verbose bold/underline/etc."
|
|
||||||
s = re.sub('<span style="font-weight:600;">(.*?)</span>', "<b>\\1</b>", s)
|
|
||||||
s = re.sub('<span style="font-style:italic;">(.*?)</span>', "<i>\\1</i>", s)
|
|
||||||
s = re.sub(
|
|
||||||
'<span style="text-decoration: underline;">(.*?)</span>', "<u>\\1</u>", s
|
|
||||||
)
|
)
|
||||||
return s
|
|
||||||
|
|
||||||
|
|
||||||
def htmlToTextLine(s: str) -> str:
|
def htmlToTextLine(s: str) -> str:
|
||||||
|
@ -100,33 +85,6 @@ def htmlToTextLine(s: str) -> str:
|
||||||
return s
|
return s
|
||||||
|
|
||||||
|
|
||||||
def entsToTxt(html: str) -> str:
|
|
||||||
# entitydefs defines nbsp as \xa0 instead of a standard space, so we
|
|
||||||
# replace it first
|
|
||||||
html = html.replace("&nbsp;", " ")
|
|
||||||
|
|
||||||
def fixup(m: Match) -> str:
|
|
||||||
text = m.group(0)
|
|
||||||
if text[:2] == "&#":
|
|
||||||
# character reference
|
|
||||||
try:
|
|
||||||
if text[:3] == "&#x":
|
|
||||||
return chr(int(text[3:-1], 16))
|
|
||||||
else:
|
|
||||||
return chr(int(text[2:-1]))
|
|
||||||
except ValueError:
|
|
||||||
pass
|
|
||||||
else:
|
|
||||||
# named entity
|
|
||||||
try:
|
|
||||||
text = chr(name2codepoint[text[1:-1]])
|
|
||||||
except KeyError:
|
|
||||||
pass
|
|
||||||
return text # leave as is
|
|
||||||
|
|
||||||
return reEnts.sub(fixup, html)
|
|
||||||
|
|
||||||
|
|
||||||
# IDs
|
# IDs
|
||||||
##############################################################################
|
##############################################################################
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,6 @@
|
||||||
|
# Copyright: Ankitects Pty Ltd and contributors
|
||||||
|
# License: GNU AGPL, version 3 or later; http://www.gnu.org/licenses/agpl.html
|
||||||
|
|
||||||
|
from anki.lang import set_lang
|
||||||
|
|
||||||
|
set_lang("en_US")
|
|
@ -12,7 +12,7 @@ use crate::{
|
||||||
template::RenderedNode,
|
template::RenderedNode,
|
||||||
text::{
|
text::{
|
||||||
decode_iri_paths, encode_iri_paths, extract_av_tags, sanitize_html_no_images,
|
decode_iri_paths, encode_iri_paths, extract_av_tags, sanitize_html_no_images,
|
||||||
strip_av_tags, AvTag,
|
strip_av_tags, strip_html, strip_html_preserving_media_filenames, AvTag,
|
||||||
},
|
},
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -161,6 +161,17 @@ impl CardRenderingService for Backend {
|
||||||
fn decode_iri_paths(&self, input: pb::String) -> Result<pb::String> {
|
fn decode_iri_paths(&self, input: pb::String) -> Result<pb::String> {
|
||||||
Ok(decode_iri_paths(&input.val).to_string().into())
|
Ok(decode_iri_paths(&input.val).to_string().into())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn strip_html(&self, input: pb::StripHtmlRequest) -> Result<pb::String> {
|
||||||
|
Ok(match input.mode() {
|
||||||
|
pb::strip_html_request::Mode::Normal => strip_html(&input.text),
|
||||||
|
pb::strip_html_request::Mode::PreserveMediaFilenames => {
|
||||||
|
strip_html_preserving_media_filenames(&input.text)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
.to_string()
|
||||||
|
.into())
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn rendered_nodes_to_proto(nodes: Vec<RenderedNode>) -> Vec<pb::RenderedTemplateNode> {
|
fn rendered_nodes_to_proto(nodes: Vec<RenderedNode>) -> Vec<pb::RenderedTemplateNode> {
|
||||||
|
|
Loading…
Reference in a new issue