From 873c4e617d7e031c1e04352dfb9896bd77675f64 Mon Sep 17 00:00:00 2001 From: llama <100429699+iamllama@users.noreply.github.com> Date: Sat, 25 Jan 2025 15:16:24 +0800 Subject: [PATCH] Recognise and check for media referenced in <source> tags (#3763) * recognise and check <source> tags for media * add test --- pylib/anki/media.py | 4 ++-- rslib/src/media/check.rs | 23 +++++++++++++++++++++++ rslib/src/text.rs | 4 ++-- 3 files changed, 27 insertions(+), 4 deletions(-) diff --git a/pylib/anki/media.py b/pylib/anki/media.py index b76d7116c..5d8653a97 100644 --- a/pylib/anki/media.py +++ b/pylib/anki/media.py @@ -33,9 +33,9 @@ class MediaManager(DeprecatedNamesMixin): sound_regexps = [r"(?i)(\[sound:(?P<fname>[^]]+)\])"] html_media_regexps = [ # src element quoted case - r"(?i)(<(?:img|audio)\b[^>]* src=(?P<str>[\"'])(?P<fname>[^>]+?)(?P=str)[^>]*>)", + r"(?i)(<(?:img|audio|source)\b[^>]* src=(?P<str>[\"'])(?P<fname>[^>]+?)(?P=str)[^>]*>)", # unquoted case - r"(?i)(<(?:img|audio)\b[^>]* src=(?!['\"])(?P<fname>[^ >]+)[^>]*?>)", + r"(?i)(<(?:img|audio|source)\b[^>]* src=(?!['\"])(?P<fname>[^ >]+)[^>]*?>)", # src element quoted case r"(?i)(<object\b[^>]* data=(?P<str>[\"'])(?P<fname>[^>]+?)(?P=str)[^>]*>)", # unquoted case diff --git a/rslib/src/media/check.rs b/rslib/src/media/check.rs index bc85b177b..20d684826 100644 --- a/rslib/src/media/check.rs +++ b/rslib/src/media/check.rs @@ -846,4 +846,27 @@ Unused: unused.jpg Ok(()) } + + #[test] + fn source_tags() -> Result<()> { + let (_dir, _mgr, mut col) = common_setup()?; + let mut checker = col.media_checker()?; + + let field = ""; + let seen = normalize_and_maybe_rename_files_helper(&mut checker, field); + assert!(seen.contains("foo-ss.mp3")); + assert!(seen.contains("bar-ss.ogg")); + + let field = r#" + + + fancy jif + + "#; + let seen = normalize_and_maybe_rename_files_helper(&mut checker, field); + assert!(seen.contains("foo-dd.webp")); + assert!(seen.contains("bar-dd.gif")); + + Ok(()) + } } diff --git a/rslib/src/text.rs b/rslib/src/text.rs index aebdb3caf..f83332ff8 100644 --- a/rslib/src/text.rs +++ 
b/rslib/src/text.rs @@ -107,8 +107,8 @@ static HTML_LINEBREAK_TAGS: LazyLock<Regex> = LazyLock::new(|| { pub static HTML_MEDIA_TAGS: LazyLock<Regex> = LazyLock::new(|| { Regex::new( r#"(?xsi) - # the start of the image, audio, or object tag - <\b(?:img|audio|video|object)\b + # the start of the image, audio, object, or source tag + <\b(?:img|audio|video|object|source)\b # any non-`>`, except inside `"` or `'` (?: