replace the old stripHTML() methods with the backend implementation

Python's regex engine performs pathologically on regexes like '<!--.*?-->' when fed a large string of repeating '<!--' clauses. Thanks to JaimeSlome / security@huntr.dev for the report; closes #1380. Solved by switching to the Rust implementation, which does not suffer from this issue. entsToTxt(), minimizeHTML(), and the old regex constants have been removed; they do not appear to be used by any add-ons.
2025-11-06 12:47:11 -05:00 · 2021-10-01 23:07:45 +10:00 · 2021-10-01 23:07:45 +10:00 · 0bb273a0ed
commit 0bb273a0ed
parent a28df026b8
6 changed files with 41 additions and 53 deletions
--- a/proto/anki/card_rendering.proto
+++ b/proto/anki/card_rendering.proto
@ -23,6 +23,7 @@ service CardRenderingService {
  rpc RenderMarkdown(RenderMarkdownRequest) returns (generic.String);
  rpc EncodeIriPaths(generic.String) returns (generic.String);
  rpc DecodeIriPaths(generic.String) returns (generic.String);
  rpc StripHtml(StripHtmlRequest) returns (generic.String);
 }
 message ExtractAVTagsRequest {
@ -119,3 +120,13 @@ message RenderMarkdownRequest {
  string markdown = 1;
  bool sanitize = 2;
 }
 // Request payload for the StripHtml RPC: the text to process and the
 // stripping mode to apply to it.
 message StripHtmlRequest {
  // Selects which stripping routine the backend runs on `text`.
  enum Mode {
    // Plain HTML stripping.
    NORMAL = 0;
    // Strip HTML but keep media filenames.
    PRESERVE_MEDIA_FILENAMES = 1;
  }
  // The HTML text to strip.
  string text = 1;
  // How `text` should be stripped; see Mode.
  Mode mode = 2;
 }
--- a/pylib/.pylintrc
+++ b/pylib/.pylintrc
@ -16,7 +16,8 @@ ignored-classes=
  BackendError,
  SetDeckCollapsedRequest,
  ConfigKey,
-  HelpPageLinkRequest
+  HelpPageLinkRequest,
  StripHtmlRequest
 [REPORTS]
 output-format=colorized
--- a/pylib/anki/collection.py
+++ b/pylib/anki/collection.py
@ -32,6 +32,7 @@ OpChangesWithId = collection_pb2.OpChangesWithId
 OpChangesAfterUndo = collection_pb2.OpChangesAfterUndo
 BrowserRow = search_pb2.BrowserRow
 BrowserColumns = search_pb2.BrowserColumns
 StripHtmlMode = card_rendering_pb2.StripHtmlRequest
 import copy
 import os
--- a/pylib/anki/utils.py
+++ b/pylib/anki/utils.py
@ -17,8 +17,7 @@ import time
 import traceback
 from contextlib import contextmanager
 from hashlib import sha1
-from html.entities import name2codepoint
+from typing import Any, Iterable, Iterator, List, Optional, Union
 from typing import Any, Iterable, Iterator, List, Match, Optional, Union
 from anki.dbproxy import DBProxy
@ -55,37 +54,23 @@ def intTime(scale: int = 1) -> int:
 # HTML
 ##############################################################################
 reComment = re.compile("(?s)<!--.*?-->")
 reStyle = re.compile("(?si)<style.*?>.*?</style>")
 reScript = re.compile("(?si)<script.*?>.*?</script>")
 reTag = re.compile("(?s)<.*?>")
 reEnts = re.compile(r"&#?\w+;")
 reMedia = re.compile("(?i)<img[^>]+src=[\"']?([^\"'>]+)[\"']?[^>]*>")
 def stripHTML(s: str) -> str:
-    s = reComment.sub("", s)
+    import anki.lang
-    s = reStyle.sub("", s)
+    from anki.collection import StripHtmlMode
-    s = reScript.sub("", s)
+
-    s = reTag.sub("", s)
+    return anki.lang.current_i18n.strip_html(text=s, mode=StripHtmlMode.NORMAL)
    s = entsToTxt(s)
    return s
 def stripHTMLMedia(s: str) -> str:
    "Strip HTML but keep media filenames"
-    s = reMedia.sub(" \\1 ", s)
+    import anki.lang
-    return stripHTML(s)
+    from anki.collection import StripHtmlMode
-
+    return anki.lang.current_i18n.strip_html(
-def minimizeHTML(s: str) -> str:
+        text=s, mode=StripHtmlMode.PRESERVE_MEDIA_FILENAMES
    "Correct Qt's verbose bold/underline/etc."
    s = re.sub('<span style="font-weight:600;">(.*?)</span>', "<b>\\1</b>", s)
    s = re.sub('<span style="font-style:italic;">(.*?)</span>', "<i>\\1</i>", s)
    s = re.sub(
        '<span style="text-decoration: underline;">(.*?)</span>', "<u>\\1</u>", s
    )
    return s
 def htmlToTextLine(s: str) -> str:
@ -100,33 +85,6 @@ def htmlToTextLine(s: str) -> str:
    return s
 def entsToTxt(html: str) -> str:
    # entitydefs defines nbsp as \xa0 instead of a standard space, so we
    # replace it first
    html = html.replace("&nbsp;", " ")
    def fixup(m: Match) -> str:
        text = m.group(0)
        if text[:2] == "&#":
            # character reference
            try:
                if text[:3] == "&#x":
                    return chr(int(text[3:-1], 16))
                else:
                    return chr(int(text[2:-1]))
            except ValueError:
                pass
        else:
            # named entity
            try:
                text = chr(name2codepoint[text[1:-1]])
            except KeyError:
                pass
        return text  # leave as is
    return reEnts.sub(fixup, html)
 # IDs
 ##############################################################################
--- a/pylib/tests/init.py
+++ b/pylib/tests/init.py
@ -0,0 +1,6 @@
 # Copyright: Ankitects Pty Ltd and contributors
 # License: GNU AGPL, version 3 or later; http://www.gnu.org/licenses/agpl.html
 from anki.lang import set_lang
 set_lang("en_US")
--- a/rslib/src/backend/cardrendering.rs
+++ b/rslib/src/backend/cardrendering.rs
@ -12,7 +12,7 @@ use crate::{
    template::RenderedNode,
    text::{
        decode_iri_paths, encode_iri_paths, extract_av_tags, sanitize_html_no_images,
-        strip_av_tags, AvTag,
+        strip_av_tags, strip_html, strip_html_preserving_media_filenames, AvTag,
    },
 };
@ -161,6 +161,17 @@ impl CardRenderingService for Backend {
    fn decode_iri_paths(&self, input: pb::String) -> Result<pb::String> {
        Ok(decode_iri_paths(&input.val).to_string().into())
    }
    /// Strips HTML from the request's text and returns the result as a
    /// protobuf string.
    ///
    /// The request's mode picks the routine: `Normal` calls `strip_html`,
    /// while `PreserveMediaFilenames` calls
    /// `strip_html_preserving_media_filenames`.
    fn strip_html(&self, input: pb::StripHtmlRequest) -> Result<pb::String> {
        let text = &input.text;
        let stripped = match input.mode() {
            pb::strip_html_request::Mode::PreserveMediaFilenames => {
                strip_html_preserving_media_filenames(text)
            }
            pb::strip_html_request::Mode::Normal => strip_html(text),
        };
        Ok(stripped.to_string().into())
    }
 }
 fn rendered_nodes_to_proto(nodes: Vec<RenderedNode>) -> Vec<pb::RenderedTemplateNode> {