typeanswer: [type:nc] – ignores combining characters (#3422)

* typeanswer: fix cleanup

Fix: Add prepare_expected back in for the 'nothing typed' & 'correctly typed' cases. This also makes expected_original redundant again.

Style: %s/provided/typed/g

Style: rename one ch → c

Testcase: whitespace_is_trimmed: added a check for the "correctly typed" path and renamed it to tags_removed (there's no whitespace?)

Testcase: empty_input_shows_as_code: changed to also check that tags get trimmed

* [type:nc] – ignores combining characters

Adds a comparison variant to [type] which ignores when combining characters of the expected field are missing from the provided input. It still shows these characters in the 'expected' line for reference.

It's useful for languages with e.g. diacritics that are required for reference (such as in dictionaries), but rarely actually learned or used in everyday writing. Among these languages: Arabic, Hebrew, Persian, Urdu.

The bool 'combining' controls it as a new final parameter of both relevant compare_answer functions. On the Python side, it defaults to true.

Use on the note templates: [type:nc:field] (only the front needs to include :nc)

This also removes the need to have both variants of words/sentences present as separate fields, to show them redundantly, etc.

* typeanswer: simplify by using nfkd throughout

Requires adjusting two testcases, but both render exactly the same in Anki itself.

On NFC vs. NFKD: https://stackoverflow.com/a/77432079

* typeanswer: 'simplify' by removing normalize_typed (requiring a bool parameter)

I'd prefer to keep this extra method.

* typeanswer: micro-optimize vectors

Should get rid of most relocations, at the expense of over-allocating.

On Vec's (String's) behavior: https://stackoverflow.com/a/72787776

* Mark `combining` as private

typeCorrect is not marked as private either, but we can at least do
the right thing for newly-added code.

* Revert "typeanswer: micro-optimize vectors"

This reverts commit 9fbacbfd19.

* Revert "typeanswer: 'simplify' by removing normalize_typed (requiring a bool parameter)"

This reverts commit df2dd3394e.
This commit is contained in:
a.r 2024-09-30 15:11:51 +02:00 committed by GitHub
parent 981b37e44d
commit e2124cd790
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
7 changed files with 196 additions and 62 deletions

View file

@ -165,6 +165,7 @@ message HtmlToTextLineRequest {
message CompareAnswerRequest { message CompareAnswerRequest {
string expected = 1; string expected = 1;
string provided = 2; string provided = 2;
bool combining = 3;
} }
message ExtractClozeForTypingRequest { message ExtractClozeForTypingRequest {

View file

@ -1152,8 +1152,12 @@ class Collection(DeprecatedNamesMixin):
"Not intended for public consumption at this time." "Not intended for public consumption at this time."
return self._backend.render_markdown(markdown=text, sanitize=sanitize) return self._backend.render_markdown(markdown=text, sanitize=sanitize)
def compare_answer(self, expected: str, provided: str) -> str: def compare_answer(
return self._backend.compare_answer(expected=expected, provided=provided) self, expected: str, provided: str, combining: bool = True
) -> str:
return self._backend.compare_answer(
expected=expected, provided=provided, combining=combining
)
def extract_cloze_for_typing(self, text: str, ordinal: int) -> str: def extract_cloze_for_typing(self, text: str, ordinal: int) -> str:
return self._backend.extract_cloze_for_typing(text=text, ordinal=ordinal) return self._backend.extract_cloze_for_typing(text=text, ordinal=ordinal)

View file

@ -152,6 +152,7 @@ class Reviewer:
self.previous_card: Card | None = None self.previous_card: Card | None = None
self._answeredIds: list[CardId] = [] self._answeredIds: list[CardId] = []
self._recordedAudio: str | None = None self._recordedAudio: str | None = None
self._combining: bool = True
self.typeCorrect: str | None = None # web init happens before this is set self.typeCorrect: str | None = None # web init happens before this is set
self.state: Literal["question", "answer", "transition"] | None = None self.state: Literal["question", "answer", "transition"] | None = None
self._refresh_needed: RefreshNeeded | None = None self._refresh_needed: RefreshNeeded | None = None
@ -699,6 +700,7 @@ class Reviewer:
return self.typeAnsAnswerFilter(buf) return self.typeAnsAnswerFilter(buf)
def typeAnsQuestionFilter(self, buf: str) -> str: def typeAnsQuestionFilter(self, buf: str) -> str:
self._combining = True
self.typeCorrect = None self.typeCorrect = None
clozeIdx = None clozeIdx = None
m = re.search(self.typeAnsPat, buf) m = re.search(self.typeAnsPat, buf)
@ -711,6 +713,9 @@ class Reviewer:
clozeIdx = self.card.ord + 1 clozeIdx = self.card.ord + 1
fld = fld.split(":")[1] fld = fld.split(":")[1]
# loop through fields for a match # loop through fields for a match
if fld.startswith("nc:"):
self._combining = False
fld = fld.split(":")[1]
for f in self.card.note_type()["flds"]: for f in self.card.note_type()["flds"]:
if f["name"] == fld: if f["name"] == fld:
self.typeCorrect = self.card.note()[f["name"]] self.typeCorrect = self.card.note()[f["name"]]
@ -750,7 +755,7 @@ class Reviewer:
hadHR = len(buf) != origSize hadHR = len(buf) != origSize
expected = self.typeCorrect expected = self.typeCorrect
provided = self.typedAnswer provided = self.typedAnswer
output = self.mw.col.compare_answer(expected, provided) output = self.mw.col.compare_answer(expected, provided, self._combining)
# and update the type answer area # and update the type answer area
def repl(match: Match) -> str: def repl(match: Match) -> str:

View file

@ -167,7 +167,7 @@ impl crate::services::CardRenderingService for Collection {
&mut self, &mut self,
input: anki_proto::card_rendering::CompareAnswerRequest, input: anki_proto::card_rendering::CompareAnswerRequest,
) -> Result<generic::String> { ) -> Result<generic::String> {
Ok(compare_answer(&input.expected, &input.provided).into()) Ok(compare_answer(&input.expected, &input.provided, input.combining).into())
} }
fn extract_cloze_for_typing( fn extract_cloze_for_typing(

View file

@ -33,6 +33,8 @@ pub(crate) fn apply_filters<'a>(
// type:cloze is handled specially // type:cloze is handled specially
let filters = if filters == ["cloze", "type"] { let filters = if filters == ["cloze", "type"] {
&["type-cloze"] &["type-cloze"]
} else if filters == ["nc", "type"] {
&["type-nc"]
} else { } else {
filters filters
}; };
@ -80,6 +82,7 @@ fn apply_filter(
"kana" => kana_filter(text), "kana" => kana_filter(text),
"type" => type_filter(field_name), "type" => type_filter(field_name),
"type-cloze" => type_cloze_filter(field_name), "type-cloze" => type_cloze_filter(field_name),
"type-nc" => type_nc_filter(field_name),
"hint" => hint_filter(text, field_name), "hint" => hint_filter(text, field_name),
"cloze" => cloze_filter(text, context), "cloze" => cloze_filter(text, context),
"cloze-only" => cloze_only_filter(text, context), "cloze-only" => cloze_only_filter(text, context),
@ -171,6 +174,10 @@ fn type_cloze_filter<'a>(field_name: &str) -> Cow<'a, str> {
format!("[[type:cloze:{}]]", field_name).into() format!("[[type:cloze:{}]]", field_name).into()
} }
fn type_nc_filter<'a>(field_name: &str) -> Cow<'a, str> {
format!("[[type:nc:{}]]", field_name).into()
}
fn hint_filter<'a>(text: &'a str, field_name: &str) -> Cow<'a, str> { fn hint_filter<'a>(text: &'a str, field_name: &str) -> Cow<'a, str> {
if text.trim().is_empty() { if text.trim().is_empty() {
return text.into(); return text.into();
@ -238,6 +245,7 @@ field</a>
fn typing() { fn typing() {
assert_eq!(type_filter("Front"), "[[type:Front]]"); assert_eq!(type_filter("Front"), "[[type:Front]]");
assert_eq!(type_cloze_filter("Front"), "[[type:cloze:Front]]"); assert_eq!(type_cloze_filter("Front"), "[[type:cloze:Front]]");
assert_eq!(type_nc_filter("Front"), "[[type:nc:Front]]");
let ctx = RenderContext { let ctx = RenderContext {
fields: &Default::default(), fields: &Default::default(),
nonempty_fields: &Default::default(), nonempty_fields: &Default::default(),
@ -249,6 +257,10 @@ field</a>
apply_filters("ignored", &["cloze", "type"], "Text", &ctx), apply_filters("ignored", &["cloze", "type"], "Text", &ctx),
("[[type:cloze:Text]]".into(), vec![]) ("[[type:cloze:Text]]".into(), vec![])
); );
assert_eq!(
apply_filters("ignored", &["nc", "type"], "Text", &ctx),
("[[type:nc:Text]]".into(), vec![])
);
} }
#[test] #[test]

View file

@ -13,6 +13,7 @@ use regex::Regex;
use unicase::eq as uni_eq; use unicase::eq as uni_eq;
use unicode_normalization::char::is_combining_mark; use unicode_normalization::char::is_combining_mark;
use unicode_normalization::is_nfc; use unicode_normalization::is_nfc;
use unicode_normalization::is_nfkd;
use unicode_normalization::is_nfkd_quick; use unicode_normalization::is_nfkd_quick;
use unicode_normalization::IsNormalized; use unicode_normalization::IsNormalized;
use unicode_normalization::UnicodeNormalization; use unicode_normalization::UnicodeNormalization;
@ -367,10 +368,9 @@ pub(crate) fn sanitize_html_no_images(html: &str) -> String {
} }
pub(crate) fn normalize_to_nfc(s: &str) -> Cow<str> { pub(crate) fn normalize_to_nfc(s: &str) -> Cow<str> {
if !is_nfc(s) { match is_nfc(s) {
s.chars().nfc().collect::<String>().into() false => s.chars().nfc().collect::<String>().into(),
} else { true => s.into(),
s.into()
} }
} }
@ -380,6 +380,13 @@ pub(crate) fn ensure_string_in_nfc(s: &mut String) {
} }
} }
pub(crate) fn normalize_to_nfkd(s: &str) -> Cow<str> {
match is_nfkd(s) {
false => s.chars().nfkd().collect::<String>().into(),
true => s.into(),
}
}
static EXTRA_NO_COMBINING_REPLACEMENTS: phf::Map<char, &str> = phf::phf_map! { static EXTRA_NO_COMBINING_REPLACEMENTS: phf::Map<char, &str> = phf::phf_map! {
'€' => "E", '€' => "E",
'Æ' => "AE", 'Æ' => "AE",

View file

@ -9,7 +9,7 @@ use regex::Regex;
use unic_ucd_category::GeneralCategory; use unic_ucd_category::GeneralCategory;
use crate::card_rendering::strip_av_tags; use crate::card_rendering::strip_av_tags;
use crate::text::normalize_to_nfc; use crate::text::normalize_to_nfkd;
use crate::text::strip_html; use crate::text::strip_html;
static LINEBREAKS: Lazy<Regex> = Lazy::new(|| { static LINEBREAKS: Lazy<Regex> = Lazy::new(|| {
@ -33,85 +33,85 @@ macro_rules! format_typeans {
} }
// Public API // Public API
pub fn compare_answer(expected: &str, provided: &str) -> String { pub fn compare_answer(expected: &str, typed: &str, combining: bool) -> String {
if provided.is_empty() { if typed.is_empty() {
format_typeans!(htmlescape::encode_minimal(expected)) format_typeans!(htmlescape::encode_minimal(&prepare_expected(expected)))
} else if combining {
Diff::new(expected, typed).to_html()
} else { } else {
Diff::new(expected, provided).to_html() DiffNonCombining::new(expected, typed).to_html()
} }
} }
struct Diff { // Core Logic
provided: Vec<char>, trait DiffTrait {
expected: Vec<char>, fn get_typed(&self) -> &[char];
expected_original: String, fn get_expected(&self) -> &[char];
} fn get_expected_original(&self) -> Cow<str>;
impl Diff { fn new(expected: &str, typed: &str) -> Self;
fn new(expected: &str, provided: &str) -> Self { fn normalize_typed(typed: &str) -> Vec<char>;
Self {
provided: normalize_to_nfc(provided).chars().collect(),
expected: normalize_to_nfc(&prepare_expected(expected))
.chars()
.collect(),
expected_original: expected.to_string(),
}
}
// Entry Point // Entry Point
fn to_html(&self) -> String { fn to_html(&self) -> String {
if self.provided == self.expected { if self.get_typed() == self.get_expected() {
format_typeans!(format!( format_typeans!(format!(
"<span class=typeGood>{}</span>", "<span class=typeGood>{}</span>",
self.expected_original self.get_expected_original()
)) ))
} else { } else {
let output = self.to_tokens(); let output = self.to_tokens();
let provided_html = render_tokens(&output.provided_tokens); let typed_html = render_tokens(&output.typed_tokens);
let expected_html = render_tokens(&output.expected_tokens); let expected_html = self.render_expected_tokens(&output.expected_tokens);
format_typeans!(format!( format_typeans!(format!(
"{provided_html}<br><span id=typearrow>&darr;</span><br>{expected_html}" "{typed_html}<br><span id=typearrow>&darr;</span><br>{expected_html}"
)) ))
} }
} }
fn to_tokens(&self) -> DiffTokens { fn to_tokens(&self) -> DiffTokens {
let mut matcher = SequenceMatcher::new(&self.provided, &self.expected); let mut matcher = SequenceMatcher::new(self.get_typed(), self.get_expected());
let mut provided_tokens = Vec::new(); let mut typed_tokens = Vec::new();
let mut expected_tokens = Vec::new(); let mut expected_tokens = Vec::new();
for opcode in matcher.get_opcodes() { for opcode in matcher.get_opcodes() {
let provided_slice = slice(&self.provided, opcode.first_start, opcode.first_end); let typed_slice = slice(self.get_typed(), opcode.first_start, opcode.first_end);
let expected_slice = slice(&self.expected, opcode.second_start, opcode.second_end); let expected_slice = slice(self.get_expected(), opcode.second_start, opcode.second_end);
match opcode.tag.as_str() { match opcode.tag.as_str() {
"equal" => { "equal" => {
provided_tokens.push(DiffToken::good(provided_slice)); typed_tokens.push(DiffToken::good(typed_slice));
expected_tokens.push(DiffToken::good(expected_slice)); expected_tokens.push(DiffToken::good(expected_slice));
} }
"delete" => provided_tokens.push(DiffToken::bad(provided_slice)), "delete" => typed_tokens.push(DiffToken::bad(typed_slice)),
"insert" => { "insert" => {
provided_tokens.push(DiffToken::missing( typed_tokens.push(DiffToken::missing(
"-".repeat(expected_slice.chars().count()), "-".repeat(expected_slice.chars().count()),
)); ));
expected_tokens.push(DiffToken::missing(expected_slice)); expected_tokens.push(DiffToken::missing(expected_slice));
} }
"replace" => { "replace" => {
provided_tokens.push(DiffToken::bad(provided_slice)); typed_tokens.push(DiffToken::bad(typed_slice));
expected_tokens.push(DiffToken::missing(expected_slice)); expected_tokens.push(DiffToken::missing(expected_slice));
} }
_ => unreachable!(), _ => unreachable!(),
} }
} }
DiffTokens { DiffTokens {
provided_tokens, typed_tokens,
expected_tokens, expected_tokens,
} }
} }
fn render_expected_tokens(&self, tokens: &[DiffToken]) -> String;
} }
// Utility Functions // Utility Functions
fn normalize(string: &str) -> Vec<char> {
normalize_to_nfkd(string).chars().collect()
}
fn slice(chars: &[char], start: usize, end: usize) -> String { fn slice(chars: &[char], start: usize, end: usize) -> String {
chars[start..end].iter().collect() chars[start..end].iter().collect()
} }
@ -139,17 +139,118 @@ fn isolate_leading_mark(text: &str) -> Cow<str> {
if text if text
.chars() .chars()
.next() .next()
.map_or(false, |ch| GeneralCategory::of(ch).is_mark()) .map_or(false, |c| GeneralCategory::of(c).is_mark())
{ {
format!("\u{a0}{text}").into() Cow::Owned(format!("\u{a0}{text}"))
} else { } else {
text.into() Cow::Borrowed(text)
} }
} }
// Default Comparison
struct Diff {
typed: Vec<char>,
expected: Vec<char>,
}
impl DiffTrait for Diff {
fn get_typed(&self) -> &[char] {
&self.typed
}
fn get_expected(&self) -> &[char] {
&self.expected
}
fn get_expected_original(&self) -> Cow<str> {
Cow::Owned(self.get_expected().iter().collect::<String>())
}
fn new(expected: &str, typed: &str) -> Self {
Self {
typed: Self::normalize_typed(typed),
expected: normalize(&prepare_expected(expected)),
}
}
fn normalize_typed(typed: &str) -> Vec<char> {
normalize(typed)
}
fn render_expected_tokens(&self, tokens: &[DiffToken]) -> String {
render_tokens(tokens)
}
}
// Non-Combining Comparison
struct DiffNonCombining {
base: Diff,
expected_split: Vec<String>,
expected_original: String,
}
impl DiffTrait for DiffNonCombining {
fn get_typed(&self) -> &[char] {
&self.base.typed
}
fn get_expected(&self) -> &[char] {
&self.base.expected
}
fn get_expected_original(&self) -> Cow<str> {
Cow::Borrowed(&self.expected_original)
}
fn new(expected: &str, typed: &str) -> Self {
// filter out combining elements
let mut expected_stripped = String::new();
// tokenized into "char+combining" for final rendering
let mut expected_split: Vec<String> = Vec::new();
for c in normalize(&prepare_expected(expected)) {
if unicode_normalization::char::is_combining_mark(c) {
if let Some(last) = expected_split.last_mut() {
last.push(c);
}
} else {
expected_stripped.push(c);
expected_split.push(c.to_string());
}
}
Self {
base: Diff {
typed: Self::normalize_typed(typed),
expected: expected_stripped.chars().collect(),
},
expected_split,
expected_original: prepare_expected(expected),
}
}
fn normalize_typed(typed: &str) -> Vec<char> {
normalize_to_nfkd(typed)
.chars()
.filter(|c| !unicode_normalization::char::is_combining_mark(*c))
.collect()
}
// Since the combining characters are still required learning content, use
// expected_split to show them directly in the "expected" line, rather than
// having to otherwise e.g. include their field twice in the note template.
fn render_expected_tokens(&self, tokens: &[DiffToken]) -> String {
let mut idx = 0;
tokens.iter().fold(String::new(), |mut acc, token| {
let end = idx + token.text.chars().count();
let txt = self.expected_split[idx..end].concat();
idx = end;
let encoded_text = htmlescape::encode_minimal(&txt);
let class = token.to_class();
acc.push_str(&format!("<span class={class}>{encoded_text}</span>"));
acc
})
}
}
// Utility Items
#[derive(Debug, PartialEq, Eq)] #[derive(Debug, PartialEq, Eq)]
struct DiffTokens { struct DiffTokens {
provided_tokens: Vec<DiffToken>, typed_tokens: Vec<DiffToken>,
expected_tokens: Vec<DiffToken>, expected_tokens: Vec<DiffToken>,
} }
@ -170,19 +271,15 @@ impl DiffToken {
fn new(kind: DiffTokenKind, text: String) -> Self { fn new(kind: DiffTokenKind, text: String) -> Self {
Self { kind, text } Self { kind, text }
} }
fn good(text: String) -> Self { fn good(text: String) -> Self {
Self::new(DiffTokenKind::Good, text) Self::new(DiffTokenKind::Good, text)
} }
fn bad(text: String) -> Self { fn bad(text: String) -> Self {
Self::new(DiffTokenKind::Bad, text) Self::new(DiffTokenKind::Bad, text)
} }
fn missing(text: String) -> Self { fn missing(text: String) -> Self {
Self::new(DiffTokenKind::Missing, text) Self::new(DiffTokenKind::Missing, text)
} }
fn to_class(&self) -> &'static str { fn to_class(&self) -> &'static str {
match self.kind { match self.kind {
DiffTokenKind::Good => "typeGood", DiffTokenKind::Good => "typeGood",
@ -212,11 +309,13 @@ mod test {
let ctx = Diff::new("¿Y ahora qué vamos a hacer?", "y ahora qe vamosa hacer"); let ctx = Diff::new("¿Y ahora qué vamos a hacer?", "y ahora qe vamosa hacer");
let output = ctx.to_tokens(); let output = ctx.to_tokens();
assert_eq!( assert_eq!(
output.provided_tokens, output.typed_tokens,
vec![ vec![
bad("y"), bad("y"),
good(" ahora q"), good(" ahora q"),
bad("e"), missing("-"),
good("e"),
missing("-"),
good(" vamos"), good(" vamos"),
missing("-"), missing("-"),
good("a hacer"), good("a hacer"),
@ -228,7 +327,9 @@ mod test {
vec![ vec![
missing("¿Y"), missing("¿Y"),
good(" ahora q"), good(" ahora q"),
missing(""), missing("u"),
good("e"),
missing("́"),
good(" vamos"), good(" vamos"),
missing(" "), missing(" "),
good("a hacer"), good("a hacer"),
@ -245,18 +346,18 @@ mod test {
} }
#[test] #[test]
fn missed_chars_only_shown_in_provided_when_after_good() { fn missed_chars_only_shown_in_typed_when_after_good() {
let ctx = Diff::new("1", "23"); let ctx = Diff::new("1", "23");
assert_eq!(ctx.to_tokens().provided_tokens, &[bad("23")]); assert_eq!(ctx.to_tokens().typed_tokens, &[bad("23")]);
let ctx = Diff::new("12", "1"); let ctx = Diff::new("12", "1");
assert_eq!(ctx.to_tokens().provided_tokens, &[good("1"), missing("-"),]); assert_eq!(ctx.to_tokens().typed_tokens, &[good("1"), missing("-"),]);
} }
#[test] #[test]
fn missed_chars_counted_correctly() { fn missed_chars_counted_correctly() {
let ctx = Diff::new("нос", "нс"); let ctx = Diff::new("нос", "нс");
assert_eq!( assert_eq!(
ctx.to_tokens().provided_tokens, ctx.to_tokens().typed_tokens,
&[good("н"), missing("-"), good("с")] &[good("н"), missing("-"), good("с")]
); );
} }
@ -266,8 +367,8 @@ mod test {
// this was not parsed as expected with dissimilar 1.0.4 // this was not parsed as expected with dissimilar 1.0.4
let ctx = Diff::new("쓰다듬다", "스다뜸다"); let ctx = Diff::new("쓰다듬다", "스다뜸다");
assert_eq!( assert_eq!(
ctx.to_tokens().provided_tokens, ctx.to_tokens().typed_tokens,
&[bad(""), good(""), bad(""), good(""),] &[bad(""), good("ᅳ다"), bad(""), good("ᅳᆷ다"),]
); );
} }
@ -285,13 +386,17 @@ mod test {
} }
#[test] #[test]
fn whitespace_is_trimmed() { fn tags_removed() {
assert_eq!(prepare_expected("<div>foo</div>"), "foo"); assert_eq!(prepare_expected("<div>123</div>"), "123");
assert_eq!(
Diff::new("<div>123</div>", "123").to_html(),
"<code id=typeans><span class=typeGood>123</span></code>"
);
} }
#[test] #[test]
fn empty_input_shows_as_code() { fn empty_input_shows_as_code() {
let ctx = compare_answer("123", ""); let ctx = compare_answer("<div>123</div>", "", true);
assert_eq!(ctx, "<code id=typeans>123</code>"); assert_eq!(ctx, "<code id=typeans>123</code>");
} }