From 1f3751d191fa34fec655f5122dcb4a6f6e98ffa4 Mon Sep 17 00:00:00 2001 From: Shaun Ren Date: Tue, 23 Feb 2021 13:16:47 -0500 Subject: [PATCH] Fix extraneous whitespaces from strip_html_for_tts --- CONTRIBUTORS | 1 + rslib/src/text.rs | 36 +++++++++++++++++++++++++++++------- 2 files changed, 30 insertions(+), 7 deletions(-) diff --git a/CONTRIBUTORS b/CONTRIBUTORS index c443d2878..fca2aa1b0 100644 --- a/CONTRIBUTORS +++ b/CONTRIBUTORS @@ -76,6 +76,7 @@ Daniel Wallgren Kerrick Staley Maksim Abramchuk Benjamin Kulnik +Shaun Ren ******************** diff --git a/rslib/src/text.rs b/rslib/src/text.rs index 7af9635a1..2142a553f 100644 --- a/rslib/src/text.rs +++ b/rslib/src/text.rs @@ -52,6 +52,19 @@ lazy_static! { )) .unwrap(); + static ref HTML_LINEBREAK_TAGS: Regex = Regex::new( + r#"(?xsi) + + "# + ).unwrap(); + static ref HTML_MEDIA_TAGS: Regex = Regex::new( r#"(?xsi) # the start of the image, audio, or object tag @@ -148,10 +161,17 @@ pub fn decode_entities(html: &str) -> Cow { } pub fn strip_html_for_tts(html: &str) -> Cow { - match HTML.replace_all(html, " ") { - Cow::Borrowed(_) => decode_entities(html), - Cow::Owned(s) => decode_entities(&s).to_string().into(), + let mut out: Cow = html.into(); + + if let Cow::Owned(o) = HTML_LINEBREAK_TAGS.replace_all(html, " ") { + out = o.into(); } + + if let Cow::Owned(o) = strip_html(out.as_ref()) { + out = o.into(); + } + + out } pub fn strip_av_tags(text: &str) -> Cow { @@ -419,8 +439,10 @@ mod test { #[test] fn audio() { - let s = - "abc[sound:fo&o.mp3]def[anki:tts][en_US voices=Bob,Jane speed=1.2]foo
1>2[/anki:tts]gh"; + let s = concat!( + "abc[sound:fo&obar.mp3]def[anki:tts][en_US voices=Bob,Jane speed=1.2]", + "foo bar
1>2[/anki:tts]gh", + ); assert_eq!(strip_av_tags(s), "abcdefgh"); let (text, tags) = extract_av_tags(s, true); @@ -429,9 +451,9 @@ mod test { assert_eq!( tags, vec![ - AVTag::SoundOrVideo("fo&o.mp3".into()), + AVTag::SoundOrVideo("fo&obar.mp3".into()), AVTag::TextToSpeech { - field_text: "foo 1>2".into(), + field_text: "foo bar 1>2".into(), lang: "en_US".into(), voices: vec!["Bob".into(), "Jane".into()], other_args: vec![],