From 9901ae428a6623f3ed393e9faa3b17c69d9748d6 Mon Sep 17 00:00:00 2001 From: TRIAEIOU <94647023+TRIAEIOU@users.noreply.github.com> Date: Mon, 19 Dec 2022 03:03:15 +0100 Subject: [PATCH] Nested clozes and increased cloze meta data (#2141) * Nested clozes and increased cloze meta data * Update contributors * This reverts commit 3423df73f89f04a606b1bff3542a68a49ca52e9f. * Update CONTRIBUTORS * Formating * Formating * Formating * Formating * Formating * Formating * Formating * Formating * Code refactor * Formating * Formating * Formating * Formating and dead code * Correct test case * Remove Hint and Close storage of token string * Update * Formating * Formating * Formating * Use write! instead of .push_str(&format). * Formating --- CONTRIBUTORS | 1 + pylib/tests/test_models.py | 54 ++-- rslib/src/cloze.rs | 448 ++++++++++++++++++++++++++-------- rslib/src/template_filters.rs | 2 +- 4 files changed, 384 insertions(+), 121 deletions(-) diff --git a/CONTRIBUTORS b/CONTRIBUTORS index 7d1960ae6..5f886fd76 100644 --- a/CONTRIBUTORS +++ b/CONTRIBUTORS @@ -106,6 +106,7 @@ Bart Louwers Sam Penny Yutsuten Zoom +TRIAEIOU Stefan Kangas ******************** diff --git a/pylib/tests/test_models.py b/pylib/tests/test_models.py index 5e45e35bd..f9450c387 100644 --- a/pylib/tests/test_models.py +++ b/pylib/tests/test_models.py @@ -185,41 +185,53 @@ def test_cloze(): note["Text"] = "hello {{c1::world}}" assert col.addNote(note) == 1 assert ( - f'hello [...]' + f'hello [...]' in note.cards()[0].question() ) - assert 'hello world' in note.cards()[0].answer() + assert ( + 'hello world' + in note.cards()[0].answer() + ) # and with a comment note = col.new_note(m) note["Text"] = "hello {{c1::world::typical}}" assert col.addNote(note) == 1 assert ( - f'[typical]' + f'[typical]' in note.cards()[0].question() ) - assert 'world' in note.cards()[0].answer() + assert ( + 'world' in note.cards()[0].answer() + ) # and with 2 clozes note = col.new_note(m) note["Text"] = "hello {{c1::world}} {{c2::bar}}" assert col.addNote(note) == 2 (c1, c2) = note.cards() assert ( - f'[...] bar' + f'[...] bar' in c1.question() ) - assert 'world bar' in c1.answer() assert ( - f'world [...]' + 'world bar' + in c1.answer() + ) + assert ( + f'world [...]' in c2.question() ) - assert 'world bar' in c2.answer() + assert ( + 'world bar' + in c2.answer() + ) # if there are multiple answers for a single cloze, they are given in a # list note = col.new_note(m) note["Text"] = "a {{c1::b}} {{c1::c}}" assert col.addNote(note) == 1 - assert 'b c' in ( - note.cards()[0].answer() + assert ( + 'b c' + in (note.cards()[0].answer()) ) # if we add another cloze, a card should be generated cnt = col.card_count() @@ -280,7 +292,9 @@ def test_cloze_mathjax(): assert ( note.cards()[0] .question() - .endswith(r'\(a\) [...] \[ [...] \]') + .endswith( + r'\(a\) [...] \[ [...] \]' + ) ) @@ -310,26 +324,26 @@ def test_chained_mods(): col.models.update(m) note = col.newNote() - q1 = 'phrase' - a1 = "sentence" - q2 = 'en chaine' - a2 = "chained" + a1 = 'phrase' + h1 = "sentence" + a2 = 'en chaine' + h2 = "chained" note[ "Text" ] = "This {{{{c1::{}::{}}}}} demonstrates {{{{c1::{}::{}}}}} clozes.".format( - q1, a1, - q2, + h1, a2, + h2, ) assert col.addNote(note) == 1 assert ( - f'This [sentence]' - f' demonstrates [chained] clozes.' + 'This [sentence]' + f' demonstrates [chained] clozes.' in note.cards()[0].question() ) assert ( - f'This phrase demonstrates en chaine clozes.' + 'This phrase demonstrates en chaine clozes.' in note.cards()[0].answer() ) diff --git a/rslib/src/cloze.rs b/rslib/src/cloze.rs index 0612f7795..84190c9ab 100644 --- a/rslib/src/cloze.rs +++ b/rslib/src/cloze.rs @@ -1,129 +1,300 @@ // Copyright: Ankitects Pty Ltd and contributors // License: GNU AGPL, version 3 or later; http://www.gnu.org/licenses/agpl.html -use std::{borrow::Cow, collections::HashSet}; - +use htmlescape::encode_attribute; use lazy_static::lazy_static; +use nom::{ + branch::alt, + bytes::complete::{tag, take_while}, + combinator::map, + IResult, +}; use regex::{Captures, Regex}; +use std::{borrow::Cow, collections::HashSet, fmt::Write}; use crate::{latex::contains_latex, template::RenderContext, text::strip_html_preserving_entities}; lazy_static! { - static ref CLOZE: Regex = Regex::new( - r#"(?xsi) - \{\{ - c(\d+):: # 1 = cloze number - (.*?) # 2 = clozed text - (?: - ::(.*?) # 3 = optional hint - )? - \}\} - "# - ) - .unwrap(); static ref MATHJAX: Regex = Regex::new( r#"(?xsi) (\\[(\[]) # 1 = mathjax opening tag (.*?) # 2 = inner content - (\\[])]) # 3 = mathjax closing tag + (\\[])]) # 3 = mathjax closing tag "# ) .unwrap(); } -mod cloze_caps { - // cloze ordinal - pub const ORD: usize = 1; - // the occluded text - pub const TEXT: usize = 2; - // optional hint - pub const HINT: usize = 3; -} - mod mathjax_caps { pub const OPENING_TAG: usize = 1; pub const INNER_TEXT: usize = 2; pub const CLOSING_TAG: usize = 3; } -pub fn reveal_cloze_text(text: &str, cloze_ord: u16, question: bool) -> Cow { - let mut cloze_ord_was_in_text = false; +#[derive(Debug)] +enum Token<'a> { + OpenCloze(u16), + Text(&'a str), + CloseCloze, +} - let output = CLOZE.replace_all(text, |caps: &Captures| { - let captured_ord = caps - .get(cloze_caps::ORD) - .unwrap() - .as_str() - .parse() - .unwrap_or(0); - - let text = caps.get(cloze_caps::TEXT).unwrap().as_str().to_owned(); - if captured_ord != cloze_ord { - // other cloze deletions are unchanged - return text; - } else { - cloze_ord_was_in_text = true; - } - - let text_attr; - let replacement; - if question { - text_attr = format!(r#" data-cloze="{}""#, htmlescape::encode_attribute(&text)); - // hint provided? - if let Some(hint) = caps.get(cloze_caps::HINT) { - replacement = format!("[{}]", hint.as_str()); - } else { - replacement = "[...]".to_string(); +/// Tokenize string +fn tokenize(mut text: &str) -> impl Iterator { + fn open_cloze(text: &str) -> IResult<&str, Token> { + // opening brackets and 'c' + let (text, _opening_brackets_and_c) = tag("{{c")(text)?; + // following number + let (text, digits) = take_while(|c: char| c.is_ascii_digit())(text)?; + let digits: u16 = match digits.parse() { + Ok(digits) => digits, + Err(_) => { + // not a valid number; fail to recognize + return Err(nom::Err::Error(nom::error::make_error( + text, + nom::error::ErrorKind::Digit, + ))); } - } else { - text_attr = "".to_string(); - replacement = text; - } - - format!(r#"{}"#, text_attr, replacement) - }); - - if !cloze_ord_was_in_text { - return "".into(); + }; + // :: + let (text, _colons) = tag("::")(text)?; + Ok((text, Token::OpenCloze(digits))) } - // if no cloze deletions are found, Anki returns an empty string - match output { - Cow::Borrowed(_) => "".into(), - other => other, + fn close_cloze(text: &str) -> IResult<&str, Token> { + map(tag("}}"), |_| Token::CloseCloze)(text) + } + + /// Match a run of text until an open/close marker is encountered. + fn normal_text(text: &str) -> IResult<&str, Token> { + if text.is_empty() { + return Err(nom::Err::Error(nom::error::make_error( + text, + nom::error::ErrorKind::Eof, + ))); + } + let mut index = 0; + let mut other_token = alt((open_cloze, close_cloze)); + while other_token(&text[index..]).is_err() && index < text.len() { + index += 1; + } + Ok((&text[index..], Token::Text(&text[0..index]))) + } + + std::iter::from_fn(move || { + if text.is_empty() { + None + } else { + let (remaining_text, token) = + alt((open_cloze, close_cloze, normal_text))(text).unwrap(); + text = remaining_text; + Some(token) + } + }) +} + +#[derive(Debug)] +enum TextOrCloze<'a> { + Text(&'a str), + Cloze(ExtractedCloze<'a>), +} + +#[derive(Debug)] +struct ExtractedCloze<'a> { + ordinal: u16, + nodes: Vec>, + hint: Option<&'a str>, +} + +impl ExtractedCloze<'_> { + /// Return the cloze's hint, or "..." if none was provided. + fn hint(&self) -> &str { + self.hint.unwrap_or("...") + } + + fn clozed_text(&self) -> Cow { + // happy efficient path? + if self.nodes.len() == 1 { + if let TextOrCloze::Text(text) = self.nodes.last().unwrap() { + return (*text).into(); + } + } + + let mut buf = String::new(); + for node in &self.nodes { + match node { + TextOrCloze::Text(text) => buf.push_str(text), + TextOrCloze::Cloze(cloze) => buf.push_str(&cloze.clozed_text()), + } + } + + buf.into() + } +} + +fn parse_text_with_clozes(text: &str) -> Vec> { + let mut open_clozes: Vec = vec![]; + let mut output = vec![]; + for token in tokenize(text) { + match token { + Token::OpenCloze(ordinal) => open_clozes.push(ExtractedCloze { + ordinal, + nodes: Vec::with_capacity(1), // common case + hint: None, + }), + Token::Text(mut text) => { + if let Some(cloze) = open_clozes.last_mut() { + // extract hint if found + if let Some((head, tail)) = text.split_once("::") { + text = head; + cloze.hint = Some(tail); + } + cloze.nodes.push(TextOrCloze::Text(text)); + } else { + output.push(TextOrCloze::Text(text)); + } + } + Token::CloseCloze => { + // take the currently active cloze + if let Some(cloze) = open_clozes.pop() { + let target = if let Some(outer_cloze) = open_clozes.last_mut() { + // and place it into the cloze layer above + &mut outer_cloze.nodes + } else { + // or the top level if no other clozes active + &mut output + }; + target.push(TextOrCloze::Cloze(cloze)); + } else { + // closing marker outside of any clozes + output.push(TextOrCloze::Text("}}")) + } + } + } + } + output +} + +fn reveal_cloze_text_in_nodes( + node: &TextOrCloze, + cloze_ord: u16, + question: bool, + output: &mut Vec, +) { + if let TextOrCloze::Cloze(cloze) = node { + if cloze.ordinal == cloze_ord { + if question { + output.push(cloze.hint().into()) + } else { + output.push(cloze.clozed_text().into()) + } + } + for node in &cloze.nodes { + reveal_cloze_text_in_nodes(node, cloze_ord, question, output); + } + } +} + +fn reveal_cloze( + cloze: &ExtractedCloze, + cloze_ord: u16, + question: bool, + active_cloze_found_in_text: &mut bool, + buf: &mut String, +) { + let active = cloze.ordinal == cloze_ord; + *active_cloze_found_in_text |= active; + match (question, active) { + (true, true) => { + // question side with active cloze; all inner content is elided + let mut content_buf = String::new(); + for node in &cloze.nodes { + match node { + TextOrCloze::Text(text) => content_buf.push_str(text), + TextOrCloze::Cloze(cloze) => reveal_cloze( + cloze, + cloze_ord, + question, + active_cloze_found_in_text, + &mut content_buf, + ), + } + } + write!( + buf, + r#"[{}]"#, + encode_attribute(&content_buf), + cloze.ordinal, + cloze.hint() + ) + .unwrap(); + } + (false, true) => { + write!( + buf, + r#""#, + cloze.ordinal + ) + .unwrap(); + for node in &cloze.nodes { + match node { + TextOrCloze::Text(text) => buf.push_str(text), + TextOrCloze::Cloze(cloze) => { + reveal_cloze(cloze, cloze_ord, question, active_cloze_found_in_text, buf) + } + } + } + buf.push_str(""); + } + (_, false) => { + // question or answer side inactive cloze; text shown, children may be active + write!( + buf, + r#""#, + cloze.ordinal + ) + .unwrap(); + for node in &cloze.nodes { + match node { + TextOrCloze::Text(text) => buf.push_str(text), + TextOrCloze::Cloze(cloze) => { + reveal_cloze(cloze, cloze_ord, question, active_cloze_found_in_text, buf) + } + } + } + buf.push_str("") + } + } +} + +pub fn reveal_cloze_text(text: &str, cloze_ord: u16, question: bool) -> Cow { + let mut buf = String::new(); + let mut active_cloze_found_in_text = false; + for node in &parse_text_with_clozes(text) { + match node { + // top-level text is indiscriminately added + TextOrCloze::Text(text) => buf.push_str(text), + TextOrCloze::Cloze(cloze) => reveal_cloze( + cloze, + cloze_ord, + question, + &mut active_cloze_found_in_text, + &mut buf, + ), + } + } + if active_cloze_found_in_text { + buf.into() + } else { + Cow::from("") } } pub fn reveal_cloze_text_only(text: &str, cloze_ord: u16, question: bool) -> Cow { - CLOZE - .captures_iter(text) - .filter(|caps| { - let captured_ord = caps - .get(cloze_caps::ORD) - .unwrap() - .as_str() - .parse() - .unwrap_or(0); - - captured_ord == cloze_ord - }) - .map(|caps| { - let cloze = if question { - // hint provided? - if let Some(hint) = caps.get(cloze_caps::HINT) { - hint.as_str() - } else { - "..." - } - } else { - caps.get(cloze_caps::TEXT).unwrap().as_str() - }; - - cloze - }) - .collect::>() - .join(", ") - .into() + let mut output = Vec::new(); + for node in &parse_text_with_clozes(text) { + reveal_cloze_text_in_nodes(node, cloze_ord, question, &mut output); + } + output.join(", ").into() } /// If text contains any LaTeX tags, render the front and back @@ -144,7 +315,9 @@ pub fn expand_clozes_to_reveal_latex(text: &str) -> String { } pub(crate) fn contains_cloze(text: &str) -> bool { - CLOZE.is_match(text) + parse_text_with_clozes(text) + .iter() + .any(|node| matches!(node, TextOrCloze::Cloze(_))) } pub fn cloze_numbers_in_string(html: &str) -> HashSet { @@ -153,15 +326,20 @@ pub fn cloze_numbers_in_string(html: &str) -> HashSet { set } -#[allow(clippy::implicit_hasher)] -pub fn add_cloze_numbers_in_string(field: &str, set: &mut HashSet) { - for cap in CLOZE.captures_iter(field) { - if let Ok(n) = cap[1].parse() { - set.insert(n); +fn add_cloze_numbers_in_text_with_clozes(nodes: &[TextOrCloze], set: &mut HashSet) { + for node in nodes { + if let TextOrCloze::Cloze(cloze) = node { + set.insert(cloze.ordinal); + add_cloze_numbers_in_text_with_clozes(&cloze.nodes, set); } } } +#[allow(clippy::implicit_hasher)] +pub fn add_cloze_numbers_in_string(field: &str, set: &mut HashSet) { + add_cloze_numbers_in_text_with_clozes(&parse_text_with_clozes(field), set) +} + fn strip_html_inside_mathjax(text: &str) -> Cow { MATHJAX.replace_all(text, |caps: &Captures| -> String { format!( @@ -232,6 +410,76 @@ mod test { ); } + #[test] + fn nested_cloze_plain_text() { + assert_eq!( + strip_html(reveal_cloze_text("foo {{c1::bar {{c2::baz}}}}", 1, true).as_ref()), + "foo [...]" + ); + assert_eq!( + strip_html(reveal_cloze_text("foo {{c1::bar {{c2::baz}}}}", 1, false).as_ref()), + "foo bar baz" + ); + assert_eq!( + strip_html(reveal_cloze_text("foo {{c1::bar {{c2::baz}}::qux}}", 2, true).as_ref()), + "foo bar [...]" + ); + assert_eq!( + strip_html(reveal_cloze_text("foo {{c1::bar {{c2::baz}}::qux}}", 2, false).as_ref()), + "foo bar baz" + ); + assert_eq!( + strip_html(reveal_cloze_text("foo {{c1::bar {{c2::baz}}::qux}}", 1, true).as_ref()), + "foo [qux]" + ); + assert_eq!( + strip_html(reveal_cloze_text("foo {{c1::bar {{c2::baz}}::qux}}", 1, false).as_ref()), + "foo bar baz" + ); + } + + #[test] + fn nested_cloze_html() { + assert_eq!( + cloze_numbers_in_string("{{c2::te{{c1::s}}}}t{{"), + vec![1, 2].into_iter().collect::>() + ); + assert_eq!( + reveal_cloze_text("foo {{c1::bar {{c2::baz}}}}", 1, true), + format!( + r#"foo [...]"#, + htmlescape::encode_attribute( + r#"bar baz"# + ) + ) + ); + assert_eq!( + reveal_cloze_text("foo {{c1::bar {{c2::baz}}}}", 1, false), + r#"foo bar baz"# + ); + assert_eq!( + reveal_cloze_text("foo {{c1::bar {{c2::baz}}::qux}}", 2, true), + r#"foo bar [...]"# + ); + assert_eq!( + reveal_cloze_text("foo {{c1::bar {{c2::baz}}::qux}}", 2, false), + r#"foo bar baz"# + ); + assert_eq!( + reveal_cloze_text("foo {{c1::bar {{c2::baz}}::qux}}", 1, true), + format!( + r#"foo [qux]"#, + htmlescape::encode_attribute( + r#"bar baz"# + ) + ) + ); + assert_eq!( + reveal_cloze_text("foo {{c1::bar {{c2::baz}}::qux}}", 1, false), + r#"foo bar baz"# + ); + } + #[test] fn mathjax_html() { // escaped angle brackets should be preserved diff --git a/rslib/src/template_filters.rs b/rslib/src/template_filters.rs index 21b5b99ce..ddd8814ed 100644 --- a/rslib/src/template_filters.rs +++ b/rslib/src/template_filters.rs @@ -256,7 +256,7 @@ field assert_eq!(strip_html(&cloze_filter(text, &ctx)).as_ref(), "[...] two"); assert_eq!( cloze_filter(text, &ctx), - r#"[...] two"# + r#"[...] two"# ); ctx.card_ord = 1;