From 0bb273a0ed90d7548052d5c9b10467ea577fc1e0 Mon Sep 17 00:00:00 2001 From: Damien Elmes Date: Fri, 1 Oct 2021 23:07:45 +1000 Subject: [PATCH] replace the old stripHTML() methods with the backend implementation Python's regex engine performs pathologically on regexes like '' when fed a large string of repeating '") -reStyle = re.compile("(?si).*?") -reScript = re.compile("(?si).*?") -reTag = re.compile("(?s)<.*?>") -reEnts = re.compile(r"&#?\w+;") -reMedia = re.compile("(?i)]+src=[\"']?([^\"'>]+)[\"']?[^>]*>") def stripHTML(s: str) -> str: - s = reComment.sub("", s) - s = reStyle.sub("", s) - s = reScript.sub("", s) - s = reTag.sub("", s) - s = entsToTxt(s) - return s + import anki.lang + from anki.collection import StripHtmlMode + + return anki.lang.current_i18n.strip_html(text=s, mode=StripHtmlMode.NORMAL) def stripHTMLMedia(s: str) -> str: "Strip HTML but keep media filenames" - s = reMedia.sub(" \\1 ", s) - return stripHTML(s) + import anki.lang + from anki.collection import StripHtmlMode - -def minimizeHTML(s: str) -> str: - "Correct Qt's verbose bold/underline/etc." - s = re.sub('(.*?)', "\\1", s) - s = re.sub('(.*?)', "\\1", s) - s = re.sub( - '(.*?)', "\\1", s + return anki.lang.current_i18n.strip_html( + text=s, mode=StripHtmlMode.PRESERVE_MEDIA_FILENAMES ) - return s def htmlToTextLine(s: str) -> str: @@ -100,33 +85,6 @@ def htmlToTextLine(s: str) -> str: return s -def entsToTxt(html: str) -> str: - # entitydefs defines nbsp as \xa0 instead of a standard space, so we - # replace it first - html = html.replace(" ", " ") - - def fixup(m: Match) -> str: - text = m.group(0) - if text[:2] == "&#": - # character reference - try: - if text[:3] == "&#x": - return chr(int(text[3:-1], 16)) - else: - return chr(int(text[2:-1])) - except ValueError: - pass - else: - # named entity - try: - text = chr(name2codepoint[text[1:-1]]) - except KeyError: - pass - return text # leave as is - - return reEnts.sub(fixup, html) - - # IDs ############################################################################## diff --git a/pylib/tests/__init__.py b/pylib/tests/__init__.py index e69de29bb..7069f4fa7 100644 --- a/pylib/tests/__init__.py +++ b/pylib/tests/__init__.py @@ -0,0 +1,6 @@ +# Copyright: Ankitects Pty Ltd and contributors +# License: GNU AGPL, version 3 or later; http://www.gnu.org/licenses/agpl.html + +from anki.lang import set_lang + +set_lang("en_US") diff --git a/rslib/src/backend/cardrendering.rs b/rslib/src/backend/cardrendering.rs index 0d501e224..6606c7ed1 100644 --- a/rslib/src/backend/cardrendering.rs +++ b/rslib/src/backend/cardrendering.rs @@ -12,7 +12,7 @@ use crate::{ template::RenderedNode, text::{ decode_iri_paths, encode_iri_paths, extract_av_tags, sanitize_html_no_images, - strip_av_tags, AvTag, + strip_av_tags, strip_html, strip_html_preserving_media_filenames, AvTag, }, }; @@ -161,6 +161,17 @@ impl CardRenderingService for Backend { fn decode_iri_paths(&self, input: pb::String) -> Result { Ok(decode_iri_paths(&input.val).to_string().into()) } + + fn strip_html(&self, input: pb::StripHtmlRequest) -> Result { + Ok(match input.mode() { + pb::strip_html_request::Mode::Normal => strip_html(&input.text), + pb::strip_html_request::Mode::PreserveMediaFilenames => { + strip_html_preserving_media_filenames(&input.text) + } + } + .to_string() + .into()) + } } fn rendered_nodes_to_proto(nodes: Vec) -> Vec {