diff --git a/proto/anki/card_rendering.proto b/proto/anki/card_rendering.proto index 744332e19..aa16e4a91 100644 --- a/proto/anki/card_rendering.proto +++ b/proto/anki/card_rendering.proto @@ -23,6 +23,7 @@ service CardRenderingService { rpc RenderMarkdown(RenderMarkdownRequest) returns (generic.String); rpc EncodeIriPaths(generic.String) returns (generic.String); rpc DecodeIriPaths(generic.String) returns (generic.String); + rpc StripHtml(StripHtmlRequest) returns (generic.String); } message ExtractAVTagsRequest { @@ -119,3 +120,13 @@ message RenderMarkdownRequest { string markdown = 1; bool sanitize = 2; } + +message StripHtmlRequest { + enum Mode { + NORMAL = 0; + PRESERVE_MEDIA_FILENAMES = 1; + } + + string text = 1; + Mode mode = 2; +} diff --git a/pylib/.pylintrc b/pylib/.pylintrc index 76d60e6fb..094f99b89 100644 --- a/pylib/.pylintrc +++ b/pylib/.pylintrc @@ -16,7 +16,8 @@ ignored-classes= BackendError, SetDeckCollapsedRequest, ConfigKey, - HelpPageLinkRequest + HelpPageLinkRequest, + StripHtmlRequest [REPORTS] output-format=colorized diff --git a/pylib/anki/collection.py b/pylib/anki/collection.py index 349734036..770b9b30f 100644 --- a/pylib/anki/collection.py +++ b/pylib/anki/collection.py @@ -32,6 +32,7 @@ OpChangesWithId = collection_pb2.OpChangesWithId OpChangesAfterUndo = collection_pb2.OpChangesAfterUndo BrowserRow = search_pb2.BrowserRow BrowserColumns = search_pb2.BrowserColumns +StripHtmlMode = card_rendering_pb2.StripHtmlRequest import copy import os diff --git a/pylib/anki/utils.py b/pylib/anki/utils.py index 330797c0f..53a573c7a 100644 --- a/pylib/anki/utils.py +++ b/pylib/anki/utils.py @@ -17,8 +17,7 @@ import time import traceback from contextlib import contextmanager from hashlib import sha1 -from html.entities import name2codepoint -from typing import Any, Iterable, Iterator, List, Match, Optional, Union +from typing import Any, Iterable, Iterator, List, Optional, Union from anki.dbproxy import DBProxy @@ -55,37 +54,23 @@ def intTime(scale: int = 1) -> int: # HTML ############################################################################## -reComment = re.compile("(?s)") -reStyle = re.compile("(?si).*?") -reScript = re.compile("(?si).*?") -reTag = re.compile("(?s)<.*?>") -reEnts = re.compile(r"&#?\w+;") -reMedia = re.compile("(?i)]+src=[\"']?([^\"'>]+)[\"']?[^>]*>") def stripHTML(s: str) -> str: - s = reComment.sub("", s) - s = reStyle.sub("", s) - s = reScript.sub("", s) - s = reTag.sub("", s) - s = entsToTxt(s) - return s + import anki.lang + from anki.collection import StripHtmlMode + + return anki.lang.current_i18n.strip_html(text=s, mode=StripHtmlMode.NORMAL) def stripHTMLMedia(s: str) -> str: "Strip HTML but keep media filenames" - s = reMedia.sub(" \\1 ", s) - return stripHTML(s) + import anki.lang + from anki.collection import StripHtmlMode - -def minimizeHTML(s: str) -> str: - "Correct Qt's verbose bold/underline/etc." - s = re.sub('(.*?)', "\\1", s) - s = re.sub('(.*?)', "\\1", s) - s = re.sub( - '(.*?)', "\\1", s + return anki.lang.current_i18n.strip_html( + text=s, mode=StripHtmlMode.PRESERVE_MEDIA_FILENAMES ) - return s def htmlToTextLine(s: str) -> str: @@ -100,33 +85,6 @@ def htmlToTextLine(s: str) -> str: return s -def entsToTxt(html: str) -> str: - # entitydefs defines nbsp as \xa0 instead of a standard space, so we - # replace it first - html = html.replace(" ", " ") - - def fixup(m: Match) -> str: - text = m.group(0) - if text[:2] == "&#": - # character reference - try: - if text[:3] == "&#x": - return chr(int(text[3:-1], 16)) - else: - return chr(int(text[2:-1])) - except ValueError: - pass - else: - # named entity - try: - text = chr(name2codepoint[text[1:-1]]) - except KeyError: - pass - return text # leave as is - - return reEnts.sub(fixup, html) - - # IDs ############################################################################## diff --git a/pylib/tests/__init__.py b/pylib/tests/__init__.py index e69de29bb..7069f4fa7 100644 --- a/pylib/tests/__init__.py +++ b/pylib/tests/__init__.py @@ -0,0 +1,6 @@ +# Copyright: Ankitects Pty Ltd and contributors +# License: GNU AGPL, version 3 or later; http://www.gnu.org/licenses/agpl.html + +from anki.lang import set_lang + +set_lang("en_US") diff --git a/rslib/src/backend/cardrendering.rs b/rslib/src/backend/cardrendering.rs index 0d501e224..6606c7ed1 100644 --- a/rslib/src/backend/cardrendering.rs +++ b/rslib/src/backend/cardrendering.rs @@ -12,7 +12,7 @@ use crate::{ template::RenderedNode, text::{ decode_iri_paths, encode_iri_paths, extract_av_tags, sanitize_html_no_images, - strip_av_tags, AvTag, + strip_av_tags, strip_html, strip_html_preserving_media_filenames, AvTag, }, }; @@ -161,6 +161,17 @@ impl CardRenderingService for Backend { fn decode_iri_paths(&self, input: pb::String) -> Result { Ok(decode_iri_paths(&input.val).to_string().into()) } + + fn strip_html(&self, input: pb::StripHtmlRequest) -> Result { + Ok(match input.mode() { + pb::strip_html_request::Mode::Normal => strip_html(&input.text), + pb::strip_html_request::Mode::PreserveMediaFilenames => { + strip_html_preserving_media_filenames(&input.text) + } + } + .to_string() + .into()) + } } fn rendered_nodes_to_proto(nodes: Vec) -> Vec {