Anki/rslib/src/card_rendering/parser.rs
llama aa5684638b
Improve performance of card rendering parser (#3886)
* refactor parser

* update test

* add tests

* refactor CardNodes

* Increase nested cloze limit to underlying protobuf limit (dae)
2025-03-31 17:38:46 +07:00

313 lines
9.5 KiB
Rust

// Copyright: Ankitects Pty Ltd and contributors
// License: GNU AGPL, version 3 or later; http://www.gnu.org/licenses/agpl.html
use std::collections::HashMap;
use nom::branch::alt;
use nom::bytes::complete::is_not;
use nom::bytes::complete::tag;
use nom::character::complete::anychar;
use nom::character::complete::multispace0;
use nom::combinator::map;
use nom::combinator::not;
use nom::combinator::recognize;
use nom::combinator::rest;
use nom::combinator::success;
use nom::combinator::value;
use nom::multi::fold_many0;
use nom::multi::many0;
use nom::sequence::delimited;
use nom::sequence::pair;
use nom::sequence::preceded;
use nom::sequence::separated_pair;
use nom::sequence::terminated;
use nom::sequence::tuple;
use super::CardNodes;
use super::Directive;
use super::Node;
use super::OtherDirective;
use super::TtsDirective;
type IResult<'a, O> = nom::IResult<&'a str, O>;
impl<'a> CardNodes<'a> {
    /// Split `txt` into a flat sequence of nodes.
    ///
    /// Nodes are read from the front of `txt` until the node parser reports
    /// an error, which includes reaching the end of the input. `text_only`
    /// is set when every collected node is a [`Node::Text`], and is therefore
    /// also set for empty input.
    pub(super) fn parse(mut txt: &'a str) -> Self {
        let nodes: Vec<_> = std::iter::from_fn(|| {
            // Stop at the first parse error; otherwise advance past the node.
            let (remaining, parsed) = node(txt).ok()?;
            txt = remaining;
            Some(parsed)
        })
        .collect();
        let text_only = nodes
            .iter()
            .all(|parsed| matches!(parsed, Node::Text(_)));
        Self { nodes, text_only }
    }
}
impl<'a> Directive<'a> {
    /// Build a directive from the parts of a parsed tag.
    ///
    /// A tag named `tts` becomes a [`TtsDirective`]: the `lang`, `voices`
    /// (comma-separated), `speed` and `cloze_blank` options fill the
    /// dedicated fields, and all remaining options are kept in `options`.
    /// A `speed` value that does not parse as a number yields `1.0`. If a
    /// key occurs more than once, the last occurrence wins.
    ///
    /// Any other tag name becomes an [`OtherDirective`] carrying all options.
    fn new(name: &'a str, options: Vec<(&'a str, &'a str)>, content: &'a str) -> Self {
        if name != "tts" {
            return Self::Other(OtherDirective {
                name,
                content,
                options: options.into_iter().collect(),
            });
        }

        // Start from the defaults and let the options override them.
        let mut tts = TtsDirective {
            content,
            lang: "",
            voices: vec![],
            speed: 1.0,
            blank: None,
            options: HashMap::new(),
        };
        for (key, value) in options {
            match key {
                "lang" => tts.lang = value,
                "voices" => tts.voices = value.split(',').collect(),
                "speed" => tts.speed = value.parse().unwrap_or(1.0),
                "cloze_blank" => tts.blank = Some(value),
                _ => {
                    tts.options.insert(key, value);
                }
            }
        }
        Self::Tts(tts)
    }
}
/// Consume 0 or more of anything in " \t\r\n" after `parser`.
fn trailing_whitespace0<'parser, 's, P, O>(parser: P) -> impl FnMut(&'s str) -> IResult<'s, O>
where
P: FnMut(&'s str) -> IResult<'s, O> + 'parser,
{
terminated(parser, multispace0)
}
/// Match the longest prefix that contains no char from `arr`. Always
/// succeeds: if no such non-empty prefix exists, an empty string is returned
/// and no input is consumed.
fn is_not0<'parser, 'arr: 'parser, 's: 'parser>(
    arr: &'arr str,
) -> impl FnMut(&'s str) -> IResult<'s, &'s str> + 'parser {
    // `is_not` alone rejects an empty match, hence the infallible fallback.
    let non_empty_run = is_not(arr);
    let empty_match = success("");
    alt((non_empty_run, empty_match))
}
/// Parse a single node from the start of `s`: a sound tag if possible,
/// otherwise an Anki tag, otherwise plain text.
fn node(s: &str) -> IResult<Node> {
    let mut any_node = alt((sound_node, tag_node, text_node));
    any_node(s)
}
/// A sound tag `[sound:resource]`, where `resource` is pointing to a sound or
/// video file. `resource` must be non-empty and ends at the first `]`.
fn sound_node(s: &str) -> IResult<Node> {
    let (remaining, resource) = delimited(tag("[sound:"), is_not("]"), tag("]"))(s)?;
    Ok((remaining, Node::SoundOrVideo(resource)))
}
/// Consume the first char unconditionally, then everything up to (but not
/// including) the next `[`, or the rest of the input if there is none.
/// Fails on empty input.
fn take_till_potential_tag_start(s: &str) -> IResult<&str> {
    // The first char may itself be a '[' that was not part of a node, so it is
    // skipped before searching. On empty input `anychar` fails, ending the parse.
    let (tail, first) = anychar(s)?;
    match tail.find('[') {
        Some(pos) => {
            let (taken, remaining) = s.split_at(first.len_utf8() + pos);
            Ok((remaining, taken))
        }
        None => rest(s),
    }
}
/// An Anki tag `[anki:tag...]...[/anki:tag]`.
///
/// The opening tag consists of a name, optionally followed by whitespace and
/// whitespace-separated `key=val` options. Everything up to the first
/// matching closing tag is the content. On success, a `Node::Directive`
/// built with `Directive::new` is returned; if any part (including the
/// closing tag) is missing, the parser fails.
fn tag_node(s: &str) -> IResult<Node> {
/// Match the start of an opening tag and return its name, i.e. the non-empty
/// run of chars after `[anki:` up to the first `]` or whitespace char.
fn name(s: &str) -> IResult<&str> {
preceded(tag("[anki:"), is_not("] \t\r\n"))(s)
}
/// Return a parser to match an opening `name` tag and return its options.
/// Whitespace between the name and the first option is skipped.
fn opening_parser<'name, 's: 'name>(
name: &'name str,
) -> impl FnMut(&'s str) -> IResult<'s, Vec<(&'s str, &'s str)>> + 'name {
/// List of whitespace-separated `key=val` tuples, where `val` may be
/// empty.
fn options(s: &str) -> IResult<Vec<(&str, &str)>> {
// A key is a non-empty run of chars other than `]`, whitespace and `=`.
fn key(s: &str) -> IResult<&str> {
is_not("] \t\r\n=")(s)
}
// A value is either enclosed in double quotes (and may then contain
// whitespace and `]`), or an unquoted, possibly empty run of chars that
// ends at `]`, whitespace or `"`.
fn val(s: &str) -> IResult<&str> {
alt((
delimited(tag("\""), is_not0("\""), tag("\"")),
is_not0("] \t\r\n\""),
))(s)
}
// Whitespace after each pair is consumed, so the next pair (or the
// closing `]`) directly follows.
many0(trailing_whitespace0(separated_pair(key, tag("="), val)))(s)
}
delimited(
pair(tag("[anki:"), trailing_whitespace0(tag(name))),
options,
tag("]"),
)
}
/// Return a parser to match a closing `name` tag, `[/anki:name]`.
fn closing_parser<'parser, 'name: 'parser, 's: 'parser>(
name: &'name str,
) -> impl FnMut(&'s str) -> IResult<'s, ()> + 'parser {
value((), tuple((tag("[/anki:"), tag(name), tag("]"))))
}
/// Return a parser to match and return anything until a closing `name` tag
/// is found. If there is no closing tag, the parser runs to the end of the
/// input.
fn content_parser<'parser, 'name: 'parser, 's: 'parser>(
name: &'name str,
) -> impl FnMut(&'s str) -> IResult<'s, &'s str> + 'parser {
// Repeat: unless the closing tag starts here, skip ahead to the next `[`.
// `recognize` then yields the whole skipped span as one slice.
recognize(fold_many0(
pair(not(closing_parser(name)), take_till_potential_tag_start),
// we don't need to accumulate anything
|| (),
|_, _| (),
))
}
// Only peek at the name here: the remaining input is discarded and the
// whole tag is parsed from the start of `s` again below.
let (_, tag_name) = name(s)?;
map(
terminated(
pair(opening_parser(tag_name), content_parser(tag_name)),
closing_parser(tag_name),
),
|(options, content)| Node::Directive(Directive::new(tag_name, options, content)),
)(s)
}
fn text_node(s: &str) -> IResult<Node> {
map(take_till_potential_tag_start, Node::Text)(s)
}
#[cfg(test)]
mod test {
use super::*;
// Assert that parsing `$txt` yields exactly the listed nodes, in order.
macro_rules! assert_parsed_nodes {
($txt:expr $(, $node:expr)*) => {
assert_eq!(CardNodes::parse($txt).nodes, vec![$($node),*]);
}
}
/// Exercise the parser on plain text, malformed tags, sound tags, generic
/// Anki tags with options, and tts tags.
#[test]
fn parsing() {
use Node::*;
// empty input produces no nodes
assert_parsed_nodes!("");
// plain text becomes a single text node
assert_parsed_nodes!("foo", Text("foo"));
// broken sound/tags are just text as well; a new text node starts at
// every `[`, as each one could be the start of a tag
assert_parsed_nodes!("[sound:]", Text("[sound:]"));
assert_parsed_nodes!("[anki:][/anki:]", Text("[anki:]"), Text("[/anki:]"));
// mismatched opening and closing names
assert_parsed_nodes!(
"[anki:foo][/anki:bar]",
Text("[anki:foo]"),
Text("[/anki:bar]")
);
assert_parsed_nodes!(
"abc[anki:foo]def[/anki:bar]ghi][[anki:bar][",
Text("abc"),
Text("[anki:foo]def"),
Text("[/anki:bar]ghi]"),
Text("["),
Text("[anki:bar]"),
Text("[")
);
// sound
assert_parsed_nodes!("[sound:foo]", SoundOrVideo("foo"));
assert_parsed_nodes!(
"foo [sound:bar] baz",
Text("foo "),
SoundOrVideo("bar"),
Text(" baz")
);
assert_parsed_nodes!(
"[sound:foo][sound:bar]",
SoundOrVideo("foo"),
SoundOrVideo("bar")
);
// tags
assert_parsed_nodes!(
"[anki:foo]bar[/anki:foo]",
Directive(super::Directive::Other(OtherDirective {
name: "foo",
content: "bar",
options: HashMap::new()
}))
);
// stray brackets inside the content are kept as content
assert_parsed_nodes!(
"[anki:foo]]bar[[/anki:foo]",
Directive(super::Directive::Other(OtherDirective {
name: "foo",
content: "]bar[",
options: HashMap::new()
}))
);
// a single option and empty content
assert_parsed_nodes!(
"[anki:foo bar=baz][/anki:foo]",
Directive(super::Directive::Other(OtherDirective {
name: "foo",
content: "",
options: [("bar", "baz")].into_iter().collect(),
}))
);
// unquoted white space separates options, "]" terminates
assert_parsed_nodes!(
"[anki:foo\na=b\tc=d e=f][/anki:foo]",
Directive(super::Directive::Other(OtherDirective {
name: "foo",
content: "",
options: [("a", "b"), ("c", "d"), ("e", "f")].into_iter().collect(),
}))
);
// a quoted value may contain white space and "]"
assert_parsed_nodes!(
"[anki:foo a=\"b \t\n c ]\"][/anki:foo]",
Directive(super::Directive::Other(OtherDirective {
name: "foo",
content: "",
options: [("a", "b \t\n c ]")].into_iter().collect(),
}))
);
// tts tags: known options fill dedicated fields, unknown ones are kept in
// `options`; an empty `cloze_blank=` value is `Some("")`
assert_parsed_nodes!(
"[anki:tts lang=jp_JP voices=Alice,Bob speed=0.5 cloze_blank= bar=baz][/anki:tts]",
Directive(super::Directive::Tts(TtsDirective {
content: "",
lang: "jp_JP",
voices: vec!["Alice", "Bob"],
speed: 0.5,
blank: Some(""),
options: [("bar", "baz")].into_iter().collect(),
}))
);
// an unparsable speed falls back to 1.0; missing options keep their defaults
assert_parsed_nodes!(
"[anki:tts speed=foo][/anki:tts]",
Directive(super::Directive::Tts(TtsDirective {
content: "",
lang: "",
voices: vec![],
speed: 1.0,
blank: None,
options: HashMap::new(),
}))
);
}
}