diff --git a/Cargo.lock b/Cargo.lock index 5d2be794d..ee8d408ed 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -55,6 +55,7 @@ dependencies = [ "coarsetime", "criterion", "csv 1.1.6 (git+https://github.com/ankitects/rust-csv.git?rev=1c9d3aab6f79a7d815c69f925a46a4590c115f90)", + "dissimilar", "env_logger", "flate2", "fluent", @@ -586,6 +587,12 @@ dependencies = [ "winapi", ] +[[package]] +name = "dissimilar" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8c97b9233581d84b8e1e689cdd3a47b6f69770084fc246e86a7f78b0d9c1d4a5" + [[package]] name = "dtoa" version = "0.4.8" diff --git a/cargo/BUILD.bazel b/cargo/BUILD.bazel index 49f56ae98..dba36a62a 100644 --- a/cargo/BUILD.bazel +++ b/cargo/BUILD.bazel @@ -75,6 +75,15 @@ alias( ], ) +alias( + name = "dissimilar", + actual = "@raze__dissimilar__1_0_4//:dissimilar", + tags = [ + "cargo-raze", + "manual", + ], +) + alias( name = "env_logger", actual = "@raze__env_logger__0_9_0//:env_logger", diff --git a/cargo/crates.bzl b/cargo/crates.bzl index 5b7822884..b64c1cdf7 100644 --- a/cargo/crates.bzl +++ b/cargo/crates.bzl @@ -431,6 +431,16 @@ def raze_fetch_remote_crates(): build_file = Label("//cargo/remote:BUILD.dirs-sys-next-0.1.2.bazel"), ) + maybe( + http_archive, + name = "raze__dissimilar__1_0_4", + url = "https://crates.io/api/v1/crates/dissimilar/1.0.4/download", + type = "tar.gz", + sha256 = "8c97b9233581d84b8e1e689cdd3a47b6f69770084fc246e86a7f78b0d9c1d4a5", + strip_prefix = "dissimilar-1.0.4", + build_file = Label("//cargo/remote:BUILD.dissimilar-1.0.4.bazel"), + ) + maybe( http_archive, name = "raze__dtoa__0_4_8", diff --git a/cargo/licenses.json b/cargo/licenses.json index 9b6f7031c..cf0c31233 100644 --- a/cargo/licenses.json +++ b/cargo/licenses.json @@ -341,6 +341,15 @@ "license_file": null, "description": "system-level helper functions for the dirs and directories crates" }, + { + "name": "dissimilar", + "version": "1.0.4", + "authors": "David Tolnay ", + "repository": "https://github.com/dtolnay/dissimilar", + "license": "Apache-2.0", + "license_file": null, + "description": "Diff library with semantic cleanup, based on Google's diff-match-patch" + }, { "name": "either", "version": "1.6.1", diff --git a/cargo/remote/BUILD.dissimilar-1.0.4.bazel b/cargo/remote/BUILD.dissimilar-1.0.4.bazel new file mode 100644 index 000000000..0452b9835 --- /dev/null +++ b/cargo/remote/BUILD.dissimilar-1.0.4.bazel @@ -0,0 +1,58 @@ +""" +@generated +cargo-raze crate build file. + +DO NOT EDIT! Replaced on runs of cargo-raze +""" + +# buildifier: disable=load +load("@bazel_skylib//lib:selects.bzl", "selects") + +# buildifier: disable=load +load( + "@rules_rust//rust:defs.bzl", + "rust_binary", + "rust_library", + "rust_proc_macro", + "rust_test", +) + +package(default_visibility = [ + # Public for visibility by "@raze__crate__version//" targets. + # + # Prefer access through "//cargo", which limits external + # visibility to explicit Cargo.toml dependencies. + "//visibility:public", +]) + +licenses([ + "notice", # Apache-2.0 from expression "Apache-2.0" +]) + +# Generated Targets + +# Unsupported target "bench" with type "bench" omitted + +rust_library( + name = "dissimilar", + srcs = glob(["**/*.rs"]), + crate_features = [ + ], + crate_root = "src/lib.rs", + data = [], + edition = "2018", + rustc_flags = [ + "--cap-lints=allow", + ], + tags = [ + "cargo-raze", + "crate-name=dissimilar", + "manual", + ], + version = "1.0.4", + # buildifier: leave-alone + deps = [ + ], +) + +# Unsupported target "test" with type "test" omitted diff --git a/proto/anki/card_rendering.proto b/proto/anki/card_rendering.proto index 6bf4ed7ff..ab9dc6a44 100644 --- a/proto/anki/card_rendering.proto +++ b/proto/anki/card_rendering.proto @@ -26,6 +26,7 @@ service CardRenderingService { rpc EncodeIriPaths(generic.String) returns (generic.String); rpc DecodeIriPaths(generic.String) returns (generic.String); rpc StripHtml(StripHtmlRequest) returns (generic.String); + rpc CompareAnswer(CompareAnswerRequest) returns (generic.String); } message ExtractAVTagsRequest { @@ -132,3 +133,8 @@ message StripHtmlRequest { string text = 1; Mode mode = 2; } + +message CompareAnswerRequest { + string expected = 1; + string provided = 2; +} diff --git a/pylib/anki/collection.py b/pylib/anki/collection.py index 4a95bedad..43e9b8add 100644 --- a/pylib/anki/collection.py +++ b/pylib/anki/collection.py @@ -1213,6 +1213,9 @@ class Collection(DeprecatedNamesMixin): "Not intended for public consumption at this time." return self._backend.render_markdown(markdown=text, sanitize=sanitize) + def compare_answer(self, expected: str, provided: str) -> str: + return self._backend.compare_answer(expected=expected, provided=provided) + # Timeboxing ########################################################################## # fixme: there doesn't seem to be a good reason why this code is in main.py diff --git a/pylib/rsbridge/cargo/BUILD.bazel b/pylib/rsbridge/cargo/BUILD.bazel index 0c6a08966..3fca9bf94 100644 --- a/pylib/rsbridge/cargo/BUILD.bazel +++ b/pylib/rsbridge/cargo/BUILD.bazel @@ -75,6 +75,15 @@ alias( ], ) +alias( + name = "dissimilar", + actual = "@raze__dissimilar__1_0_4//:dissimilar", + tags = [ + "cargo-raze", + "manual", + ], +) + alias( name = "env_logger", actual = "@raze__env_logger__0_9_0//:env_logger", diff --git a/qt/aqt/clayout.py b/qt/aqt/clayout.py index 1919afa98..be9d50160 100644 --- a/qt/aqt/clayout.py +++ b/qt/aqt/clayout.py @@ -571,7 +571,7 @@ class CardLayout(QDialog): hadHR = origLen != len(txt) def answerRepl(match: Match) -> str: - res = self.mw.reviewer.correct("example", "sample") + res = self.mw.col.compare_answer("example", "sample") if hadHR: res = f"
{res}" return res diff --git a/qt/aqt/reviewer.py b/qt/aqt/reviewer.py index dbfc5fddb..1369df884 100644 --- a/qt/aqt/reviewer.py +++ b/qt/aqt/reviewer.py @@ -3,12 +3,9 @@ from __future__ import annotations -import difflib -import html import json import random import re -import unicodedata as ucd from dataclasses import dataclass from enum import Enum, auto from typing import Any, Callable, Literal, Match, Sequence, cast @@ -24,7 +21,6 @@ from anki.scheduler.v3 import CardAnswer, NextStates, QueuedCards from anki.scheduler.v3 import Scheduler as V3Scheduler from anki.tags import MARKED_TAG from anki.types import assert_exhaustive -from anki.utils import strip_html from aqt import AnkiQt, gui_hooks from aqt.browser.card_info import PreviousReviewerCardInfo, ReviewerCardInfo from aqt.deckoptions import confirm_deck_then_display_options @@ -597,17 +593,10 @@ class Reviewer: buf = buf.replace("
", "") hadHR = len(buf) != origSize # munge correct value - cor = self.mw.col.media.strip(self.typeCorrect) - cor = re.sub("(\n|
|)+", " ", cor) - cor = strip_html(cor) - # ensure we don't chomp multiple whitespace - cor = cor.replace(" ", " ") - cor = html.unescape(cor) - cor = cor.replace("\xa0", " ") - cor = cor.strip() - given = self.typedAnswer + expected = self.typeCorrect + provided = self.typedAnswer # compare with typed answer - res = self.correct(given, cor, showBad=False) + output = self.mw.col.compare_answer(expected, provided) # and update the type answer area def repl(match: Match) -> str: # can't pass a string in directly, and can't use re.escape as it @@ -616,7 +605,7 @@ class Reviewer: {}""".format( self.typeFont, self.typeSize, - res, + output, ) if hadHR: # a hack to ensure the q/a separator falls before the answer @@ -644,84 +633,6 @@ class Reviewer: txt = ", ".join(matches) return txt - def tokenizeComparison( - self, given: str, correct: str - ) -> tuple[list[tuple[bool, str]], list[tuple[bool, str]]]: - # compare in NFC form so accents appear correct - given = ucd.normalize("NFC", given) - correct = ucd.normalize("NFC", correct) - s = difflib.SequenceMatcher(None, given, correct, autojunk=False) - givenElems: list[tuple[bool, str]] = [] - correctElems: list[tuple[bool, str]] = [] - givenPoint = 0 - correctPoint = 0 - offby = 0 - - def logBad(old: int, new: int, s: str, array: list[tuple[bool, str]]) -> None: - if old != new: - array.append((False, s[old:new])) - - def logGood( - start: int, cnt: int, s: str, array: list[tuple[bool, str]] - ) -> None: - if cnt: - array.append((True, s[start : start + cnt])) - - for x, y, cnt in s.get_matching_blocks(): - # if anything was missed in correct, pad given - if cnt and y - offby > x: - givenElems.append((False, "-" * (y - x - offby))) - offby = y - x - # log any proceeding bad elems - logBad(givenPoint, x, given, givenElems) - logBad(correctPoint, y, correct, correctElems) - givenPoint = x + cnt - correctPoint = y + cnt - # log the match - logGood(x, cnt, given, givenElems) - logGood(y, cnt, correct, correctElems) - return givenElems, correctElems - - def correct(self, given: str, correct: str, showBad: bool = True) -> str: - "Diff-corrects the typed-in answer." - givenElems, correctElems = self.tokenizeComparison(given, correct) - - def good(s: str) -> str: - return f"{html.escape(s)}" - - def bad(s: str) -> str: - return f"{html.escape(s)}" - - def missed(s: str) -> str: - return f"{html.escape(s)}" - - if given == correct: - res = good(given) - else: - res = "" - for ok, txt in givenElems: - txt = self._noLoneMarks(txt) - if ok: - res += good(txt) - else: - res += bad(txt) - res += "

" - for ok, txt in correctElems: - txt = self._noLoneMarks(txt) - if ok: - res += good(txt) - else: - res += missed(txt) - res = f"
{res}
" - return res - - def _noLoneMarks(self, s: str) -> str: - # ensure a combining character at the start does not join to - # previous text - if s and ucd.category(s[0]).startswith("M"): - return f"\xa0{s}" - return s - def _getTypedAnswer(self) -> None: self.web.evalWithCallback("getTypedAnswer();", self._onTypedAnswer) diff --git a/rslib/BUILD.bazel b/rslib/BUILD.bazel index 78f84aaf0..2c027a0b8 100644 --- a/rslib/BUILD.bazel +++ b/rslib/BUILD.bazel @@ -77,6 +77,7 @@ rust_library( "//rslib/cargo:chrono", "//rslib/cargo:coarsetime", "//rslib/cargo:csv", + "//rslib/cargo:dissimilar", "//rslib/cargo:flate2", "//rslib/cargo:fluent", "//rslib/cargo:fnv", diff --git a/rslib/Cargo.toml b/rslib/Cargo.toml index 8d7b1ecc1..58e8e8e6e 100644 --- a/rslib/Cargo.toml +++ b/rslib/Cargo.toml @@ -101,3 +101,4 @@ id_tree = "1.8.0" zstd = { version="0.10.0", features=["zstdmt"] } num_cpus = "1.13.1" csv = { git="https://github.com/ankitects/rust-csv.git", rev="1c9d3aab6f79a7d815c69f925a46a4590c115f90" } +dissimilar = "1.0.4" diff --git a/rslib/cargo/BUILD.bazel b/rslib/cargo/BUILD.bazel index 0c6a08966..3fca9bf94 100644 --- a/rslib/cargo/BUILD.bazel +++ b/rslib/cargo/BUILD.bazel @@ -75,6 +75,15 @@ alias( ], ) +alias( + name = "dissimilar", + actual = "@raze__dissimilar__1_0_4//:dissimilar", + tags = [ + "cargo-raze", + "manual", + ], +) + alias( name = "env_logger", actual = "@raze__env_logger__0_9_0//:env_logger", diff --git a/rslib/i18n/cargo/BUILD.bazel b/rslib/i18n/cargo/BUILD.bazel index 0c6a08966..3fca9bf94 100644 --- a/rslib/i18n/cargo/BUILD.bazel +++ b/rslib/i18n/cargo/BUILD.bazel @@ -75,6 +75,15 @@ alias( ], ) +alias( + name = "dissimilar", + actual = "@raze__dissimilar__1_0_4//:dissimilar", + tags = [ + "cargo-raze", + "manual", + ], +) + alias( name = "env_logger", actual = "@raze__env_logger__0_9_0//:env_logger", diff --git a/rslib/i18n_helpers/cargo/BUILD.bazel b/rslib/i18n_helpers/cargo/BUILD.bazel index 0c6a08966..3fca9bf94 100644 --- a/rslib/i18n_helpers/cargo/BUILD.bazel +++ b/rslib/i18n_helpers/cargo/BUILD.bazel @@ -75,6 +75,15 @@ alias( ], ) +alias( + name = "dissimilar", + actual = "@raze__dissimilar__1_0_4//:dissimilar", + tags = [ + "cargo-raze", + "manual", + ], +) + alias( name = "env_logger", actual = "@raze__env_logger__0_9_0//:env_logger", diff --git a/rslib/linkchecker/cargo/BUILD.bazel b/rslib/linkchecker/cargo/BUILD.bazel index 0c6a08966..3fca9bf94 100644 --- a/rslib/linkchecker/cargo/BUILD.bazel +++ b/rslib/linkchecker/cargo/BUILD.bazel @@ -75,6 +75,15 @@ alias( ], ) +alias( + name = "dissimilar", + actual = "@raze__dissimilar__1_0_4//:dissimilar", + tags = [ + "cargo-raze", + "manual", + ], +) + alias( name = "env_logger", actual = "@raze__env_logger__0_9_0//:env_logger", diff --git a/rslib/src/backend/cardrendering.rs b/rslib/src/backend/cardrendering.rs index 3e4697154..6b7177f43 100644 --- a/rslib/src/backend/cardrendering.rs +++ b/rslib/src/backend/cardrendering.rs @@ -15,6 +15,7 @@ use crate::{ decode_iri_paths, encode_iri_paths, sanitize_html_no_images, strip_html, strip_html_preserving_media_filenames, }, + typeanswer::compare_answer, }; impl CardRenderingService for Backend { @@ -147,6 +148,10 @@ impl CardRenderingService for Backend { .to_string() .into()) } + + fn compare_answer(&self, input: pb::CompareAnswerRequest) -> Result { + Ok(compare_answer(&input.expected, &input.provided).into()) + } } fn rendered_nodes_to_proto(nodes: Vec) -> Vec { diff --git a/rslib/src/lib.rs b/rslib/src/lib.rs index 134d1dfc8..9ef072d26 100644 --- a/rslib/src/lib.rs +++ b/rslib/src/lib.rs @@ -43,6 +43,7 @@ pub mod template_filters; pub(crate) mod tests; pub mod text; pub mod timestamp; +mod typeanswer; pub mod types; pub mod undo; pub mod version; diff --git a/rslib/src/typeanswer.rs b/rslib/src/typeanswer.rs new file mode 100644 index 000000000..007756a26 --- /dev/null +++ b/rslib/src/typeanswer.rs @@ -0,0 +1,233 @@ +// Copyright: Ankitects Pty Ltd and contributors +// License: GNU AGPL, version 3 or later; http://www.gnu.org/licenses/agpl.html + +// FIXME: space to nbsp in output, or pre-wrap + +use std::borrow::Cow; + +use dissimilar::Chunk; +use lazy_static::lazy_static; +use regex::Regex; +use unic_ucd_category::GeneralCategory; + +use crate::{ + card_rendering::strip_av_tags, + text::{normalize_to_nfc, strip_html}, +}; + +lazy_static! { + static ref LINEBREAKS: Regex = Regex::new( + r#"(?six) + ( + \n + | + + | + + )+ + "# + ) + .unwrap(); +} + +struct DiffContext { + expected: String, + provided: String, +} + +impl DiffContext { + fn new(expected: &str, provided: &str) -> Self { + DiffContext { + expected: prepare_expected(expected), + provided: prepare_provided(provided), + } + } + + fn to_tokens(&self) -> DiffOutput<'_> { + let chunks = dissimilar::diff(&self.provided, &self.expected); + let mut provided = vec![]; + let mut expected = vec![]; + for chunk in chunks { + match chunk { + Chunk::Equal(text) => { + provided.push(DiffToken { + kind: DiffTokenKind::Good, + text: text.into(), + }); + expected.push(DiffToken { + kind: DiffTokenKind::Good, + text: text.into(), + }); + } + Chunk::Delete(text) => { + provided.push(DiffToken { + kind: DiffTokenKind::Bad, + text: text.into(), + }); + } + Chunk::Insert(text) => { + // If the proceeding text was correct, indicate text was missing + if provided + .last() + .map(|v| v.kind == DiffTokenKind::Good) + .unwrap_or_default() + { + provided.push(DiffToken { + kind: DiffTokenKind::Missing, + text: text.into(), + }); + } + expected.push(DiffToken { + kind: DiffTokenKind::Missing, + text: text.into(), + }); + } + } + } + DiffOutput { provided, expected } + } + + fn to_html(&self) -> String { + let output = self.to_tokens(); + let provided = render_tokens(&output.provided); + let expected = render_tokens(&output.expected); + format!( + "
{}
", + if no_mistakes(&output.expected) { + provided + } else { + format!("{provided}

{expected}") + } + ) + } +} + +fn no_mistakes(tokens: &[DiffToken]) -> bool { + tokens.iter().all(|v| v.kind == DiffTokenKind::Good) +} + +fn prepare_expected(expected: &str) -> String { + let without_av = strip_av_tags(expected); + let without_newlines = LINEBREAKS.replace_all(&without_av, " "); + let without_html = strip_html(&without_newlines); + normalize_to_nfc(&without_html).into() +} + +fn prepare_provided(provided: &str) -> String { + normalize_to_nfc(provided).into() +} + +#[derive(Debug, PartialEq)] +enum DiffTokenKind { + Good, + Bad, + Missing, +} + +#[derive(Debug, PartialEq)] +struct DiffToken<'a> { + kind: DiffTokenKind, + text: Cow<'a, str>, +} + +#[derive(Debug, PartialEq)] +struct DiffOutput<'a> { + provided: Vec>, + expected: Vec>, +} + +pub fn compare_answer(expected: &str, provided: &str) -> String { + DiffContext::new(expected, provided).to_html() +} + +fn render_tokens(tokens: &[DiffToken]) -> String { + let text_tokens: Vec<_> = tokens + .iter() + .map(|token| { + let text = with_isolated_leading_mark(&token.text); + let encoded = htmlescape::encode_minimal(&text); + let class = match token.kind { + DiffTokenKind::Good => "typeGood", + DiffTokenKind::Bad => "typeBad", + DiffTokenKind::Missing => "typeMissed", + }; + format!("{encoded}") + }) + .collect(); + text_tokens.join("") +} + +/// If text begins with a mark character, prefix it with a non-breaking +/// space to prevent the mark from joining to the previous token. +fn with_isolated_leading_mark(text: &str) -> Cow { + if let Some(ch) = text.chars().next() { + if GeneralCategory::of(ch).is_mark() { + return format!("\u{a0}{text}").into(); + } + } + text.into() +} + +#[cfg(test)] +mod test { + use DiffTokenKind::*; + + use super::*; + + macro_rules! token { + ($kind:ident, $text:expr) => { + DiffToken { + kind: $kind, + text: $text.into(), + } + }; + } + + #[test] + fn tokens() { + let ctx = DiffContext::new("¿Y ahora qué vamos a hacer?", "y ahora qe vamosa hacer"); + let output = ctx.to_tokens(); + assert_eq!( + output.provided, + vec![ + token!(Bad, "y"), + token!(Good, " ahora q"), + token!(Bad, "e"), + token!(Good, " vamos"), + token!(Missing, " "), + token!(Good, "a hacer"), + token!(Missing, "?"), + ] + ); + assert_eq!( + output.expected, + vec![ + token!(Missing, "¿Y"), + token!(Good, " ahora q"), + token!(Missing, "ué"), + token!(Good, " vamos"), + token!(Missing, " "), + token!(Good, "a hacer"), + token!(Missing, "?"), + ] + ); + } + + #[test] + fn html_and_media() { + let ctx = DiffContext::new("[sound:foo.mp3]1  2", "1 2"); + // the spacing is handled by wrapping html output in white-space: pre-wrap + assert_eq!(ctx.to_tokens().expected, &[token!(Good, "1 2")]); + } + + #[test] + fn missed_chars_only_shown_in_provided_when_after_good() { + let ctx = DiffContext::new("1", "23"); + assert_eq!(ctx.to_tokens().provided, &[token!(Bad, "23")]); + let ctx = DiffContext::new("12", "1"); + assert_eq!( + ctx.to_tokens().provided, + &[token!(Good, "1"), token!(Missing, "2"),] + ); + } +}