Improve performance of card rendering parser (#3886)

* refactor parser

* update test

* add tests

* refactor CardNodes

* Increase nested cloze limit to underlying protobuf limit (dae)
llama 2025-03-31 18:38:46 +08:00 committed by GitHub
parent 52781aaab8
commit aa5684638b
3 changed files with 55 additions and 19 deletions

@@ -33,24 +33,24 @@ pub fn prettify_av_tags<S: Into<String> + AsRef<str>>(txt: S) -> String {
 }
 
 /// Parse `txt` into [CardNodes] and return the result,
-/// or [None] if it is only a text node.
+/// or [None] if it only contains text nodes.
 fn nodes_or_text_only(txt: &str) -> Option<CardNodes> {
     let nodes = CardNodes::parse(txt);
-    match nodes.0[..] {
-        [Node::Text(_)] => None,
-        _ => Some(nodes),
-    }
+    (!nodes.text_only).then_some(nodes)
 }
 
 #[derive(Debug, PartialEq)]
-struct CardNodes<'a>(Vec<Node<'a>>);
+struct CardNodes<'a> {
+    nodes: Vec<Node<'a>>,
+    text_only: bool,
+}
 
 impl<'iter, 'nodes> IntoIterator for &'iter CardNodes<'nodes> {
     type Item = &'iter Node<'nodes>;
     type IntoIter = std::slice::Iter<'iter, Node<'nodes>>;
 
     fn into_iter(self) -> Self::IntoIter {
-        self.0.iter()
+        self.nodes.iter()
     }
 }
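Note: the hunk above replaces the tuple struct with named fields and records, while parsing, whether every node is plain text. `nodes_or_text_only` then becomes a single `then_some` call instead of pattern-matching the node list, which also stays correct now that the parser can emit several consecutive `Text` nodes for what used to be one. A minimal standalone sketch of the same pattern, with simplified stand-in types rather than the commit's actual code:

// Sketch: track `text_only` while collecting nodes, then use bool::then_some
// to skip the structured path when the input contained nothing but text.
#[derive(Debug, PartialEq)]
enum Node<'a> {
    Text(&'a str),
    SoundOrVideo(&'a str),
}

#[derive(Debug, PartialEq)]
struct CardNodes<'a> {
    nodes: Vec<Node<'a>>,
    text_only: bool,
}

fn collect<'a>(parsed: Vec<Node<'a>>) -> CardNodes<'a> {
    let mut nodes = Vec::new();
    let mut text_only = true;
    for node in parsed {
        // one cheap check per node instead of inspecting the finished list
        text_only &= matches!(node, Node::Text(_));
        nodes.push(node);
    }
    CardNodes { nodes, text_only }
}

fn nodes_or_text_only(parsed: Vec<Node<'_>>) -> Option<CardNodes<'_>> {
    let nodes = collect(parsed);
    (!nodes.text_only).then_some(nodes)
}

fn main() {
    // several text nodes still count as "text only"
    assert!(nodes_or_text_only(vec![Node::Text("a"), Node::Text("[b")]).is_none());
    // any non-text node flips the flag
    assert!(nodes_or_text_only(vec![Node::Text("a"), Node::SoundOrVideo("x.mp3")]).is_some());
}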

@@ -11,10 +11,11 @@ use nom::character::complete::multispace0;
 use nom::combinator::map;
 use nom::combinator::not;
 use nom::combinator::recognize;
+use nom::combinator::rest;
 use nom::combinator::success;
 use nom::combinator::value;
+use nom::multi::fold_many0;
 use nom::multi::many0;
-use nom::multi::many1;
 use nom::sequence::delimited;
 use nom::sequence::pair;
 use nom::sequence::preceded;
@@ -33,12 +34,14 @@ type IResult<'a, O> = nom::IResult<&'a str, O>;
 
 impl<'a> CardNodes<'a> {
     pub(super) fn parse(mut txt: &'a str) -> Self {
         let mut nodes = Vec::new();
+        let mut text_only = true;
         while let Ok((remaining, node)) = node(txt) {
+            text_only &= matches!(node, Node::Text(_));
             txt = remaining;
             nodes.push(node);
         }
 
-        Self(nodes)
+        Self { nodes, text_only }
     }
 }
@@ -98,7 +101,7 @@ fn is_not0<'parser, 'arr: 'parser, 's: 'parser>(
 }
 
 fn node(s: &str) -> IResult<Node> {
-    alt((text_node, sound_node, tag_node))(s)
+    alt((sound_node, tag_node, text_node))(s)
 }
 
 /// A sound tag `[sound:resource]`, where `resource` is pointing to a sound or
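Note: the reordering matters because `alt` tries its alternatives left to right and returns the first that succeeds; with `text_node` about to become a plain "take until the next possible tag" fallback (see below), it has to be tried last. A generic nom illustration of that ordering rule, using toy parsers and assuming nom 7.x rather than this file's code:

use nom::branch::alt;
use nom::bytes::complete::{tag, take};
use nom::IResult;

// a structured token we want to recognise first
fn bracket(s: &str) -> IResult<&str, &str> {
    tag("[x]")(s)
}

// catch-all fallback: consume a single character
fn fallback(s: &str) -> IResult<&str, &str> {
    take(1usize)(s)
}

fn node(s: &str) -> IResult<&str, &str> {
    // structured parsers first, fallback last, mirroring the new ordering above
    alt((bracket, fallback))(s)
}

fn main() {
    assert_eq!(node("[x]rest"), Ok(("rest", "[x]")));
    assert_eq!(node("abc"), Ok(("bc", "a")));
}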
@@ -110,6 +113,16 @@ fn sound_node(s: &str) -> IResult<Node> {
     )(s)
 }
 
+fn take_till_potential_tag_start(s: &str) -> IResult<&str> {
+    use nom::InputTake;
+    // first char could be '[', but wasn't part of a node, so skip (eof ends parse)
+    let (after, offset) = anychar(s).map(|(s, c)| (s, c.len_utf8()))?;
+    Ok(match after.find('[') {
+        Some(pos) => s.take_split(offset + pos),
+        _ => rest(s)?,
+    })
+}
+
 /// An Anki tag `[anki:tag...]...[/anki:tag]`.
 fn tag_node(s: &str) -> IResult<Node> {
     /// Match the start of an opening tag and return its name.
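Note: this new helper is central to the change. Instead of testing every single character against the tag parsers, plain text is skipped in one jump to the next '[' (the only character that can open a sound or Anki tag), and at least one character is always consumed so the surrounding loop keeps making progress even when a '[' turns out not to start a real tag. The same function lifted out of the diff, with the crate-local `IResult` alias expanded and a few illustrative assertions, assuming nom 7.x (where `InputTake::take_split` and `rest` live):

use nom::character::complete::anychar;
use nom::combinator::rest;
use nom::IResult;

fn take_till_potential_tag_start(s: &str) -> IResult<&str, &str> {
    use nom::InputTake;
    // first char could be '[', but wasn't part of a node, so skip (eof ends parse)
    let (after, offset) = anychar(s).map(|(s, c)| (s, c.len_utf8()))?;
    Ok(match after.find('[') {
        Some(pos) => s.take_split(offset + pos),
        _ => rest(s)?,
    })
}

fn main() {
    // consumes up to, but not including, the next '['
    assert_eq!(take_till_potential_tag_start("abc[sound:x]"), Ok(("[sound:x]", "abc")));
    // a leading '[' is swallowed so the parser always makes progress
    assert_eq!(take_till_potential_tag_start("[not-a-tag["), Ok(("[", "[not-a-tag")));
    // no further '[': the rest of the input is returned
    assert_eq!(take_till_potential_tag_start("plain text"), Ok(("", "plain text")));
}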
@@ -157,7 +170,12 @@ fn tag_node(s: &str) -> IResult<Node> {
     fn content_parser<'parser, 'name: 'parser, 's: 'parser>(
         name: &'name str,
     ) -> impl FnMut(&'s str) -> IResult<'s, &'s str> + 'parser {
-        recognize(many0(pair(not(closing_parser(name)), anychar)))
+        recognize(fold_many0(
+            pair(not(closing_parser(name)), take_till_potential_tag_start),
+            // we don't need to accumulate anything
+            || (),
+            |_, _| (),
+        ))
     }
 
     let (_, tag_name) = name(s)?;
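Note: `recognize(many0(...))` collects every sub-parser output into a Vec that `recognize` then throws away; `fold_many0` with a unit accumulator consumes the same input without allocating. A tiny isolated example of that combination, using a toy parser and assuming nom 7.x rather than this file's code:

use nom::bytes::complete::tag;
use nom::combinator::recognize;
use nom::multi::fold_many0;
use nom::IResult;

// consume zero or more "ab" tokens and return the consumed slice,
// discarding each sub-result instead of collecting them into a Vec
fn abs(s: &str) -> IResult<&str, &str> {
    recognize(fold_many0(tag("ab"), || (), |_, _| ()))(s)
}

fn main() {
    assert_eq!(abs("ababX"), Ok(("X", "abab")));
    assert_eq!(abs("X"), Ok(("X", "")));
}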
@@ -171,10 +189,7 @@
 }
 
 fn text_node(s: &str) -> IResult<Node> {
-    map(
-        recognize(many1(pair(not(alt((sound_node, tag_node))), anychar))),
-        Node::Text,
-    )(s)
+    map(take_till_potential_tag_start, Node::Text)(s)
 }
 
 #[cfg(test)]
@@ -183,7 +198,7 @@ mod test {
     macro_rules! assert_parsed_nodes {
         ($txt:expr $(, $node:expr)*) => {
-            assert_eq!(CardNodes::parse($txt), CardNodes(vec![$($node),*]));
+            assert_eq!(CardNodes::parse($txt).nodes, vec![$($node),*]);
         }
     }
@@ -198,8 +213,21 @@ mod test {
         assert_parsed_nodes!("foo", Text("foo"));
         // broken sound/tags are just text as well
         assert_parsed_nodes!("[sound:]", Text("[sound:]"));
-        assert_parsed_nodes!("[anki:][/anki:]", Text("[anki:][/anki:]"));
-        assert_parsed_nodes!("[anki:foo][/anki:bar]", Text("[anki:foo][/anki:bar]"));
+        assert_parsed_nodes!("[anki:][/anki:]", Text("[anki:]"), Text("[/anki:]"));
+        assert_parsed_nodes!(
+            "[anki:foo][/anki:bar]",
+            Text("[anki:foo]"),
+            Text("[/anki:bar]")
+        );
+        assert_parsed_nodes!(
+            "abc[anki:foo]def[/anki:bar]ghi][[anki:bar][",
+            Text("abc"),
+            Text("[anki:foo]def"),
+            Text("[/anki:bar]ghi]"),
+            Text("["),
+            Text("[anki:bar]"),
+            Text("[")
+        );
 
         // sound
         assert_parsed_nodes!("[sound:foo]", SoundOrVideo("foo"));
@@ -224,6 +252,14 @@ mod test {
                 options: HashMap::new()
             }))
         );
+        assert_parsed_nodes!(
+            "[anki:foo]]bar[[/anki:foo]",
+            Directive(super::Directive::Other(OtherDirective {
+                name: "foo",
+                content: "]bar[",
+                options: HashMap::new()
+            }))
+        );
         assert_parsed_nodes!(
             "[anki:foo bar=baz][/anki:foo]",
             Directive(super::Directive::Other(OtherDirective {

@@ -159,7 +159,7 @@ fn parse_text_with_clozes(text: &str) -> Vec<TextOrCloze<'_>> {
     for token in tokenize(text) {
         match token {
             Token::OpenCloze(ordinal) => {
-                if open_clozes.len() < 8 {
+                if open_clozes.len() < 10 {
                     open_clozes.push(ExtractedCloze {
                         ordinal,
                         nodes: Vec::with_capacity(1), // common case
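Note: the guard above caps how many clozes may be open at once, raised from 8 to 10 to match the limit of the underlying protobuf (per the commit message). An illustrative helper, hypothetical and not part of the commit, that builds a cloze string nested to a given depth:

// Build a cloze string nested `depth` levels deep,
// e.g. depth 2 -> "{{c1::{{c2::x}}}}".
fn nested_cloze(depth: u32) -> String {
    let mut s = String::new();
    for i in 1..=depth {
        s.push_str(&format!("{{{{c{}::", i));
    }
    s.push('x');
    for _ in 0..depth {
        s.push_str("}}");
    }
    s
}

fn main() {
    assert_eq!(nested_cloze(2), "{{c1::{{c2::x}}}}");
    // depth 10 now stays within the open-cloze limit checked above
    println!("{}", nested_cloze(10));
}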