replace the old stripHTML() methods with the backend implementation

Python's regex engine performs pathologically on regexes like '<!--.*?-->' when fed a large string of repeating '<!--' clauses. Thanks to JaimeSlome / security@huntr.dev for the report; closes #1380. Solved by switching to the Rust implementation, which does not suffer from this issue. entsToTxt(), minimizeHTML(), and the old regex constants have been removed; they do not appear to be used by any add-ons.
2025-12-25 21:03:17 -05:00 · 2021-10-01 23:07:45 +10:00 · 2021-10-01 23:07:45 +10:00 · 0bb273a0ed
commit 0bb273a0ed
parent a28df026b8
6 changed files with 41 additions and 53 deletions
--- a/proto/anki/card_rendering.proto
+++ b/proto/anki/card_rendering.proto
@@ -23,6 +23,7 @@ service CardRenderingService {
  rpc RenderMarkdown(RenderMarkdownRequest) returns (generic.String);
  rpc EncodeIriPaths(generic.String) returns (generic.String);
  rpc DecodeIriPaths(generic.String) returns (generic.String);
+  rpc StripHtml(StripHtmlRequest) returns (generic.String);
 }

 message ExtractAVTagsRequest {
@@ -119,3 +120,13 @@ message RenderMarkdownRequest {
  string markdown = 1;
  bool sanitize = 2;
 }
+
+message StripHtmlRequest {
+  enum Mode {
+    NORMAL = 0;
+    PRESERVE_MEDIA_FILENAMES = 1;
+  }
+
+  string text = 1;
+  Mode mode = 2;
+}
--- a/pylib/.pylintrc
+++ b/pylib/.pylintrc
@@ -16,7 +16,8 @@ ignored-classes=
  BackendError,
  SetDeckCollapsedRequest,
  ConfigKey,
-  HelpPageLinkRequest
+  HelpPageLinkRequest,
+  StripHtmlRequest

 [REPORTS]
 output-format=colorized
--- a/pylib/anki/collection.py
+++ b/pylib/anki/collection.py
@@ -32,6 +32,7 @@ OpChangesWithId = collection_pb2.OpChangesWithId
 OpChangesAfterUndo = collection_pb2.OpChangesAfterUndo
 BrowserRow = search_pb2.BrowserRow
 BrowserColumns = search_pb2.BrowserColumns
+StripHtmlMode = card_rendering_pb2.StripHtmlRequest

 import copy
 import os
--- a/pylib/anki/utils.py
+++ b/pylib/anki/utils.py
@@ -17,8 +17,7 @@ import time
 import traceback
 from contextlib import contextmanager
 from hashlib import sha1
-from html.entities import name2codepoint
-from typing import Any, Iterable, Iterator, List, Match, Optional, Union
+from typing import Any, Iterable, Iterator, List, Optional, Union

 from anki.dbproxy import DBProxy

@@ -55,37 +54,23 @@ def intTime(scale: int = 1) -> int:

 # HTML
 ##############################################################################
-reComment = re.compile("(?s)<!--.*?-->")
-reStyle = re.compile("(?si)<style.*?>.*?</style>")
-reScript = re.compile("(?si)<script.*?>.*?</script>")
-reTag = re.compile("(?s)<.*?>")
-reEnts = re.compile(r"&#?\w+;")
-reMedia = re.compile("(?i)<img[^>]+src=[\"']?([^\"'>]+)[\"']?[^>]*>")


 def stripHTML(s: str) -> str:
-    s = reComment.sub("", s)
-    s = reStyle.sub("", s)
-    s = reScript.sub("", s)
-    s = reTag.sub("", s)
-    s = entsToTxt(s)
-    return s
+    import anki.lang
+    from anki.collection import StripHtmlMode
+
+    return anki.lang.current_i18n.strip_html(text=s, mode=StripHtmlMode.NORMAL)


 def stripHTMLMedia(s: str) -> str:
    "Strip HTML but keep media filenames"
-    s = reMedia.sub(" \\1 ", s)
-    return stripHTML(s)
+    import anki.lang
+    from anki.collection import StripHtmlMode

-
-def minimizeHTML(s: str) -> str:
-    "Correct Qt's verbose bold/underline/etc."
-    s = re.sub('<span style="font-weight:600;">(.*?)</span>', "<b>\\1</b>", s)
-    s = re.sub('<span style="font-style:italic;">(.*?)</span>', "<i>\\1</i>", s)
-    s = re.sub(
-        '<span style="text-decoration: underline;">(.*?)</span>', "<u>\\1</u>", s
+    return anki.lang.current_i18n.strip_html(
+        text=s, mode=StripHtmlMode.PRESERVE_MEDIA_FILENAMES
    )
-    return s


 def htmlToTextLine(s: str) -> str:
@@ -100,33 +85,6 @@ def htmlToTextLine(s: str) -> str:
    return s


-def entsToTxt(html: str) -> str:
-    # entitydefs defines nbsp as \xa0 instead of a standard space, so we
-    # replace it first
-    html = html.replace("&nbsp;", " ")
-
-    def fixup(m: Match) -> str:
-        text = m.group(0)
-        if text[:2] == "&#":
-            # character reference
-            try:
-                if text[:3] == "&#x":
-                    return chr(int(text[3:-1], 16))
-                else:
-                    return chr(int(text[2:-1]))
-            except ValueError:
-                pass
-        else:
-            # named entity
-            try:
-                text = chr(name2codepoint[text[1:-1]])
-            except KeyError:
-                pass
-        return text  # leave as is
-
-    return reEnts.sub(fixup, html)
-
-
 # IDs
 ##############################################################################

--- a/pylib/tests/__init__.py
+++ b/pylib/tests/__init__.py
@@ -0,0 +1,6 @@
+# Copyright: Ankitects Pty Ltd and contributors
+# License: GNU AGPL, version 3 or later; http://www.gnu.org/licenses/agpl.html
+
+from anki.lang import set_lang
+
+set_lang("en_US")
--- a/rslib/src/backend/cardrendering.rs
+++ b/rslib/src/backend/cardrendering.rs
@@ -12,7 +12,7 @@ use crate::{
    template::RenderedNode,
    text::{
        decode_iri_paths, encode_iri_paths, extract_av_tags, sanitize_html_no_images,
-        strip_av_tags, AvTag,
+        strip_av_tags, strip_html, strip_html_preserving_media_filenames, AvTag,
    },
 };

@@ -161,6 +161,17 @@ impl CardRenderingService for Backend {
    fn decode_iri_paths(&self, input: pb::String) -> Result<pb::String> {
        Ok(decode_iri_paths(&input.val).to_string().into())
    }
+
+    fn strip_html(&self, input: pb::StripHtmlRequest) -> Result<pb::String> {
+        Ok(match input.mode() {
+            pb::strip_html_request::Mode::Normal => strip_html(&input.text),
+            pb::strip_html_request::Mode::PreserveMediaFilenames => {
+                strip_html_preserving_media_filenames(&input.text)
+            }
+        }
+        .to_string()
+        .into())
+    }
 }

 fn rendered_nodes_to_proto(nodes: Vec<RenderedNode>) -> Vec<pb::RenderedTemplateNode> {