Fix regex skipping over all src except the last (#3021)

Adapting HTML_MEDIA_TAGS to allow `>` inside single ('') and double ("")
quotes led to multiple images inside a field sometimes being disregarded
and marked as "unused". This seems to have been caused by a missing lazy
quantifier (`?`) on the part of the regex that skips over non-`>` characters.
This commit is contained in:
Viktor Ricci 2024-02-24 09:27:38 +01:00 committed by GitHub
parent 1a7f8b4fdf
commit 68b374e65f
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 23 additions and 1 deletions

View file

@ -816,6 +816,28 @@ Unused: unused.jpg
let seen = normalize_and_maybe_rename_files_helper(&mut checker, field);
assert!(!seen.contains("illegal.jpg"));
Ok(())
}
/// Regression test: every image referenced in a single field must be
/// reported as seen, not only the last one.
///
/// Covers fields whose tags use single-quoted attributes, double-quoted
/// attributes, and a mix of both quoting styles.
#[test]
fn multiple_images() -> Result<()> {
    let (_dir, _mgr, mut col) = common_setup()?;
    let mut checker = col.media_checker()?;

    // Each case pairs a field's HTML with the file names it references.
    let cases: [(&str, [&str; 2]); 3] = [
        (
            "<img alt='foo' src='foo-ss.jpg'><img alt='bar' src='bar-ss.jpg'>",
            ["foo-ss.jpg", "bar-ss.jpg"],
        ),
        (
            "<img alt=\"foo\" src=\"foo-dd.jpg\"><img alt=\"bar\" src=\"bar-dd.jpg\">",
            ["foo-dd.jpg", "bar-dd.jpg"],
        ),
        (
            "<img alt='foo' src='foo-sd.jpg'><img alt=\"bar\" src=\"bar-sd.jpg\">",
            ["foo-sd.jpg", "bar-sd.jpg"],
        ),
    ];

    // The same checker is reused across all cases, in order.
    for (field, expected) in cases {
        let seen = normalize_and_maybe_rename_files_helper(&mut checker, field);
        for name in expected {
            assert!(seen.contains(name));
        }
    }

    Ok(())
}
}

View file

@ -113,7 +113,7 @@ lazy_static! {
"[^"]+?"
|
'[^']+?'
)+
)+?
# capture `src` or `data` attribute
\b(?:src|data)\b=