From e2124cd790e6852ab32abae869bc19b280ffcc74 Mon Sep 17 00:00:00 2001 From: "a.r" <887320+twwn@users.noreply.github.com> Date: Mon, 30 Sep 2024 15:11:51 +0200 Subject: [PATCH] =?UTF-8?q?=20typeanswer:=20[type:nc]=20=E2=80=93=20ignore?= =?UTF-8?q?s=20combining=20characters=20(#3422)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * typeanswer: fix cleanup Fix: Add prepare_expected back in for the 'nothing typed' & 'correctly typed' cases. This also makes expected_original redundant again. Style: %s/provided/typed/g Style: rename one ch → c Testcase: whitespace_is_trimmed: added a check for the "correctly typed" path and renamed it to tags_removed (there's no whitespace?) Testcase: empty_input_shows_as_code: changed to also check that tags get trimmed * [type:nc] – ignores combining characters Adds a comparison variant to [type] which ignores when combining characters of the expected field are missing from the provided input. It still shows these characters in the 'expected' line for reference. It's useful for languages with e.g. diacritics that are required for reference (such as in dictionaries), but rarely actually learned or used in everyday writing. Among these languages: Arabic, Hebrew, Persian, Urdu. The bool 'combining' controls it as new final parameter of both relevant compare_answer functions. On the Python side, it's set to true by default. Use on the note templates: [type:nc:field] (only the front needs to include :nc) This also removes the need to have both variants of words/sentences present as separate fields, to show them redundantly, etc. * typeanswer: simplify by using nfkd throughout Requires adjusting two testcases, but both render exactly the same in Anki itself. On NFC vs. NFKD: https://stackoverflow.com/a/77432079 * typeanswer: 'simplify' by removing normalize_typed (requiring a bool parameter) I'd prefer to keep this extra method. 
* typeanswer: micro-optimize vectors Should get rid of most relocations, at the expense of over-allocating. On Vec's (String's) behavior: https://stackoverflow.com/a/72787776 * Mark `combining` as private typeCorrect is not marked as private either, but we can at least do the right thing for newly-added code. * Revert "typeanswer: micro-optimize vectors" This reverts commit 9fbacbfd196f159cbcf339aab6d8aa0830a455a5. * Revert "typeanswer: 'simplify' by removing normalize_typed (requiring a bool parameter)" This reverts commit df2dd3394eedab4b6a53acfac44db52d5728105f. --- proto/anki/card_rendering.proto | 1 + pylib/anki/collection.py | 8 +- qt/aqt/reviewer.py | 7 +- rslib/src/card_rendering/service.rs | 2 +- rslib/src/template_filters.rs | 12 ++ rslib/src/text.rs | 15 +- rslib/src/typeanswer.rs | 213 +++++++++++++++++++++------- 7 files changed, 196 insertions(+), 62 deletions(-) diff --git a/proto/anki/card_rendering.proto b/proto/anki/card_rendering.proto index 145e4b0db..4035ae68b 100644 --- a/proto/anki/card_rendering.proto +++ b/proto/anki/card_rendering.proto @@ -165,6 +165,7 @@ message HtmlToTextLineRequest { message CompareAnswerRequest { string expected = 1; string provided = 2; + bool combining = 3; } message ExtractClozeForTypingRequest { diff --git a/pylib/anki/collection.py b/pylib/anki/collection.py index 6ae37befe..66b2fb618 100644 --- a/pylib/anki/collection.py +++ b/pylib/anki/collection.py @@ -1152,8 +1152,12 @@ class Collection(DeprecatedNamesMixin): "Not intended for public consumption at this time." 
return self._backend.render_markdown(markdown=text, sanitize=sanitize) - def compare_answer(self, expected: str, provided: str) -> str: - return self._backend.compare_answer(expected=expected, provided=provided) + def compare_answer( + self, expected: str, provided: str, combining: bool = True + ) -> str: + return self._backend.compare_answer( + expected=expected, provided=provided, combining=combining + ) def extract_cloze_for_typing(self, text: str, ordinal: int) -> str: return self._backend.extract_cloze_for_typing(text=text, ordinal=ordinal) diff --git a/qt/aqt/reviewer.py b/qt/aqt/reviewer.py index 4a16f7b47..f64f82208 100644 --- a/qt/aqt/reviewer.py +++ b/qt/aqt/reviewer.py @@ -152,6 +152,7 @@ class Reviewer: self.previous_card: Card | None = None self._answeredIds: list[CardId] = [] self._recordedAudio: str | None = None + self._combining: bool = True self.typeCorrect: str | None = None # web init happens before this is set self.state: Literal["question", "answer", "transition"] | None = None self._refresh_needed: RefreshNeeded | None = None @@ -699,6 +700,7 @@ class Reviewer: return self.typeAnsAnswerFilter(buf) def typeAnsQuestionFilter(self, buf: str) -> str: + self._combining = True self.typeCorrect = None clozeIdx = None m = re.search(self.typeAnsPat, buf) @@ -711,6 +713,9 @@ class Reviewer: clozeIdx = self.card.ord + 1 fld = fld.split(":")[1] # loop through fields for a match + if fld.startswith("nc:"): + self._combining = False + fld = fld.split(":")[1] for f in self.card.note_type()["flds"]: if f["name"] == fld: self.typeCorrect = self.card.note()[f["name"]] @@ -750,7 +755,7 @@ class Reviewer: hadHR = len(buf) != origSize expected = self.typeCorrect provided = self.typedAnswer - output = self.mw.col.compare_answer(expected, provided) + output = self.mw.col.compare_answer(expected, provided, self._combining) # and update the type answer area def repl(match: Match) -> str: diff --git a/rslib/src/card_rendering/service.rs 
b/rslib/src/card_rendering/service.rs index 7e0f9ba67..8d1585725 100644 --- a/rslib/src/card_rendering/service.rs +++ b/rslib/src/card_rendering/service.rs @@ -167,7 +167,7 @@ impl crate::services::CardRenderingService for Collection { &mut self, input: anki_proto::card_rendering::CompareAnswerRequest, ) -> Result { - Ok(compare_answer(&input.expected, &input.provided).into()) + Ok(compare_answer(&input.expected, &input.provided, input.combining).into()) } fn extract_cloze_for_typing( diff --git a/rslib/src/template_filters.rs b/rslib/src/template_filters.rs index b6408d965..f55d45862 100644 --- a/rslib/src/template_filters.rs +++ b/rslib/src/template_filters.rs @@ -33,6 +33,8 @@ pub(crate) fn apply_filters<'a>( // type:cloze is handled specially let filters = if filters == ["cloze", "type"] { &["type-cloze"] + } else if filters == ["nc", "type"] { + &["type-nc"] } else { filters }; @@ -80,6 +82,7 @@ fn apply_filter( "kana" => kana_filter(text), "type" => type_filter(field_name), "type-cloze" => type_cloze_filter(field_name), + "type-nc" => type_nc_filter(field_name), "hint" => hint_filter(text, field_name), "cloze" => cloze_filter(text, context), "cloze-only" => cloze_only_filter(text, context), @@ -171,6 +174,10 @@ fn type_cloze_filter<'a>(field_name: &str) -> Cow<'a, str> { format!("[[type:cloze:{}]]", field_name).into() } +fn type_nc_filter<'a>(field_name: &str) -> Cow<'a, str> { + format!("[[type:nc:{}]]", field_name).into() +} + fn hint_filter<'a>(text: &'a str, field_name: &str) -> Cow<'a, str> { if text.trim().is_empty() { return text.into(); @@ -238,6 +245,7 @@ field fn typing() { assert_eq!(type_filter("Front"), "[[type:Front]]"); assert_eq!(type_cloze_filter("Front"), "[[type:cloze:Front]]"); + assert_eq!(type_nc_filter("Front"), "[[type:nc:Front]]"); let ctx = RenderContext { fields: &Default::default(), nonempty_fields: &Default::default(), @@ -249,6 +257,10 @@ field apply_filters("ignored", &["cloze", "type"], "Text", &ctx), 
("[[type:cloze:Text]]".into(), vec![]) ); + assert_eq!( + apply_filters("ignored", &["nc", "type"], "Text", &ctx), + ("[[type:nc:Text]]".into(), vec![]) + ); } #[test] diff --git a/rslib/src/text.rs b/rslib/src/text.rs index b32ef45c1..7f741540c 100644 --- a/rslib/src/text.rs +++ b/rslib/src/text.rs @@ -13,6 +13,7 @@ use regex::Regex; use unicase::eq as uni_eq; use unicode_normalization::char::is_combining_mark; use unicode_normalization::is_nfc; +use unicode_normalization::is_nfkd; use unicode_normalization::is_nfkd_quick; use unicode_normalization::IsNormalized; use unicode_normalization::UnicodeNormalization; @@ -367,10 +368,9 @@ pub(crate) fn sanitize_html_no_images(html: &str) -> String { } pub(crate) fn normalize_to_nfc(s: &str) -> Cow { - if !is_nfc(s) { - s.chars().nfc().collect::().into() - } else { - s.into() + match is_nfc(s) { + false => s.chars().nfc().collect::().into(), + true => s.into(), } } @@ -380,6 +380,13 @@ pub(crate) fn ensure_string_in_nfc(s: &mut String) { } } +pub(crate) fn normalize_to_nfkd(s: &str) -> Cow { + match is_nfkd(s) { + false => s.chars().nfkd().collect::().into(), + true => s.into(), + } +} + static EXTRA_NO_COMBINING_REPLACEMENTS: phf::Map = phf::phf_map! { '€' => "E", 'Æ' => "AE", diff --git a/rslib/src/typeanswer.rs b/rslib/src/typeanswer.rs index b8d8d4b9a..c1d6547fb 100644 --- a/rslib/src/typeanswer.rs +++ b/rslib/src/typeanswer.rs @@ -9,7 +9,7 @@ use regex::Regex; use unic_ucd_category::GeneralCategory; use crate::card_rendering::strip_av_tags; -use crate::text::normalize_to_nfc; +use crate::text::normalize_to_nfkd; use crate::text::strip_html; static LINEBREAKS: Lazy = Lazy::new(|| { @@ -33,85 +33,85 @@ macro_rules! 
format_typeans { } // Public API -pub fn compare_answer(expected: &str, provided: &str) -> String { - if provided.is_empty() { - format_typeans!(htmlescape::encode_minimal(expected)) +pub fn compare_answer(expected: &str, typed: &str, combining: bool) -> String { + if typed.is_empty() { + format_typeans!(htmlescape::encode_minimal(&prepare_expected(expected))) + } else if combining { + Diff::new(expected, typed).to_html() } else { - Diff::new(expected, provided).to_html() + DiffNonCombining::new(expected, typed).to_html() } } -struct Diff { - provided: Vec, - expected: Vec, - expected_original: String, -} +// Core Logic +trait DiffTrait { + fn get_typed(&self) -> &[char]; + fn get_expected(&self) -> &[char]; + fn get_expected_original(&self) -> Cow; -impl Diff { - fn new(expected: &str, provided: &str) -> Self { - Self { - provided: normalize_to_nfc(provided).chars().collect(), - expected: normalize_to_nfc(&prepare_expected(expected)) - .chars() - .collect(), - expected_original: expected.to_string(), - } - } + fn new(expected: &str, typed: &str) -> Self; + fn normalize_typed(typed: &str) -> Vec; // Entry Point fn to_html(&self) -> String { - if self.provided == self.expected { + if self.get_typed() == self.get_expected() { format_typeans!(format!( "{}", - self.expected_original + self.get_expected_original() )) } else { let output = self.to_tokens(); - let provided_html = render_tokens(&output.provided_tokens); - let expected_html = render_tokens(&output.expected_tokens); + let typed_html = render_tokens(&output.typed_tokens); + let expected_html = self.render_expected_tokens(&output.expected_tokens); format_typeans!(format!( - "{provided_html}

{expected_html}" + "{typed_html}

{expected_html}" )) } } fn to_tokens(&self) -> DiffTokens { - let mut matcher = SequenceMatcher::new(&self.provided, &self.expected); - let mut provided_tokens = Vec::new(); + let mut matcher = SequenceMatcher::new(self.get_typed(), self.get_expected()); + let mut typed_tokens = Vec::new(); let mut expected_tokens = Vec::new(); for opcode in matcher.get_opcodes() { - let provided_slice = slice(&self.provided, opcode.first_start, opcode.first_end); - let expected_slice = slice(&self.expected, opcode.second_start, opcode.second_end); + let typed_slice = slice(self.get_typed(), opcode.first_start, opcode.first_end); + let expected_slice = slice(self.get_expected(), opcode.second_start, opcode.second_end); match opcode.tag.as_str() { "equal" => { - provided_tokens.push(DiffToken::good(provided_slice)); + typed_tokens.push(DiffToken::good(typed_slice)); expected_tokens.push(DiffToken::good(expected_slice)); } - "delete" => provided_tokens.push(DiffToken::bad(provided_slice)), + "delete" => typed_tokens.push(DiffToken::bad(typed_slice)), "insert" => { - provided_tokens.push(DiffToken::missing( + typed_tokens.push(DiffToken::missing( "-".repeat(expected_slice.chars().count()), )); expected_tokens.push(DiffToken::missing(expected_slice)); } "replace" => { - provided_tokens.push(DiffToken::bad(provided_slice)); + typed_tokens.push(DiffToken::bad(typed_slice)); expected_tokens.push(DiffToken::missing(expected_slice)); } _ => unreachable!(), } } DiffTokens { - provided_tokens, + typed_tokens, expected_tokens, } } + + fn render_expected_tokens(&self, tokens: &[DiffToken]) -> String; } // Utility Functions +fn normalize(string: &str) -> Vec { + normalize_to_nfkd(string).chars().collect() +} + fn slice(chars: &[char], start: usize, end: usize) -> String { chars[start..end].iter().collect() } @@ -139,17 +139,118 @@ fn isolate_leading_mark(text: &str) -> Cow { if text .chars() .next() - .map_or(false, |ch| GeneralCategory::of(ch).is_mark()) + .map_or(false, |c| 
GeneralCategory::of(c).is_mark()) { - format!("\u{a0}{text}").into() + Cow::Owned(format!("\u{a0}{text}")) } else { - text.into() + Cow::Borrowed(text) } } +// Default Comparison +struct Diff { + typed: Vec, + expected: Vec, +} + +impl DiffTrait for Diff { + fn get_typed(&self) -> &[char] { + &self.typed + } + fn get_expected(&self) -> &[char] { + &self.expected + } + fn get_expected_original(&self) -> Cow { + Cow::Owned(self.get_expected().iter().collect::()) + } + + fn new(expected: &str, typed: &str) -> Self { + Self { + typed: Self::normalize_typed(typed), + expected: normalize(&prepare_expected(expected)), + } + } + fn normalize_typed(typed: &str) -> Vec { + normalize(typed) + } + + fn render_expected_tokens(&self, tokens: &[DiffToken]) -> String { + render_tokens(tokens) + } +} + +// Non-Combining Comparison +struct DiffNonCombining { + base: Diff, + expected_split: Vec, + expected_original: String, +} + +impl DiffTrait for DiffNonCombining { + fn get_typed(&self) -> &[char] { + &self.base.typed + } + fn get_expected(&self) -> &[char] { + &self.base.expected + } + fn get_expected_original(&self) -> Cow { + Cow::Borrowed(&self.expected_original) + } + + fn new(expected: &str, typed: &str) -> Self { + // filter out combining elements + let mut expected_stripped = String::new(); + // tokenized into "char+combining" for final rendering + let mut expected_split: Vec = Vec::new(); + for c in normalize(&prepare_expected(expected)) { + if unicode_normalization::char::is_combining_mark(c) { + if let Some(last) = expected_split.last_mut() { + last.push(c); + } + } else { + expected_stripped.push(c); + expected_split.push(c.to_string()); + } + } + + Self { + base: Diff { + typed: Self::normalize_typed(typed), + expected: expected_stripped.chars().collect(), + }, + expected_split, + expected_original: prepare_expected(expected), + } + } + + fn normalize_typed(typed: &str) -> Vec { + normalize_to_nfkd(typed) + .chars() + .filter(|c| 
!unicode_normalization::char::is_combining_mark(*c)) + .collect() + } + + // Since the combining characters are still required learning content, use + // expected_split to show them directly in the "expected" line, rather than + // having to otherwise e.g. include their field twice in the note template. + fn render_expected_tokens(&self, tokens: &[DiffToken]) -> String { + let mut idx = 0; + tokens.iter().fold(String::new(), |mut acc, token| { + let end = idx + token.text.chars().count(); + let txt = self.expected_split[idx..end].concat(); + idx = end; + let encoded_text = htmlescape::encode_minimal(&txt); + let class = token.to_class(); + acc.push_str(&format!("{encoded_text}")); + acc + }) + } +} + +// Utility Items #[derive(Debug, PartialEq, Eq)] struct DiffTokens { - provided_tokens: Vec, + typed_tokens: Vec, expected_tokens: Vec, } @@ -170,19 +271,15 @@ impl DiffToken { fn new(kind: DiffTokenKind, text: String) -> Self { Self { kind, text } } - fn good(text: String) -> Self { Self::new(DiffTokenKind::Good, text) } - fn bad(text: String) -> Self { Self::new(DiffTokenKind::Bad, text) } - fn missing(text: String) -> Self { Self::new(DiffTokenKind::Missing, text) } - fn to_class(&self) -> &'static str { match self.kind { DiffTokenKind::Good => "typeGood", @@ -212,11 +309,13 @@ mod test { let ctx = Diff::new("¿Y ahora qué vamos a hacer?", "y ahora qe vamosa hacer"); let output = ctx.to_tokens(); assert_eq!( - output.provided_tokens, + output.typed_tokens, vec![ bad("y"), good(" ahora q"), - bad("e"), + missing("-"), + good("e"), + missing("-"), good(" vamos"), missing("-"), good("a hacer"), @@ -228,7 +327,9 @@ mod test { vec![ missing("¿Y"), good(" ahora q"), - missing("ué"), + missing("u"), + good("e"), + missing("́"), good(" vamos"), missing(" "), good("a hacer"), @@ -245,18 +346,18 @@ mod test { } #[test] - fn missed_chars_only_shown_in_provided_when_after_good() { + fn missed_chars_only_shown_in_typed_when_after_good() { let ctx = Diff::new("1", "23"); - 
assert_eq!(ctx.to_tokens().provided_tokens, &[bad("23")]); + assert_eq!(ctx.to_tokens().typed_tokens, &[bad("23")]); let ctx = Diff::new("12", "1"); - assert_eq!(ctx.to_tokens().provided_tokens, &[good("1"), missing("-"),]); + assert_eq!(ctx.to_tokens().typed_tokens, &[good("1"), missing("-"),]); } #[test] fn missed_chars_counted_correctly() { let ctx = Diff::new("нос", "нс"); assert_eq!( - ctx.to_tokens().provided_tokens, + ctx.to_tokens().typed_tokens, &[good("н"), missing("-"), good("с")] ); } @@ -266,8 +367,8 @@ mod test { // this was not parsed as expected with dissimilar 1.0.4 let ctx = Diff::new("쓰다듬다", "스다뜸다"); assert_eq!( - ctx.to_tokens().provided_tokens, - &[bad("스"), good("다"), bad("뜸"), good("다"),] + ctx.to_tokens().typed_tokens, + &[bad("ᄉ"), good("ᅳ다"), bad("ᄄ"), good("ᅳᆷ다"),] ); } @@ -285,13 +386,17 @@ mod test { } #[test] - fn whitespace_is_trimmed() { - assert_eq!(prepare_expected("
foo
"), "foo"); + fn tags_removed() { + assert_eq!(prepare_expected("
123
"), "123"); + assert_eq!( + Diff::new("
123
", "123").to_html(), + "123" + ); } #[test] fn empty_input_shows_as_code() { - let ctx = compare_answer("123", ""); + let ctx = compare_answer("
123
", "", true); assert_eq!(ctx, "123"); }