Extract inline images as part of media check

We also need to get to the bottom of what's causing this:
https://forums.ankiweb.net/t/anki-browse-extremely-laggy/32533
Damien Elmes 2023-07-31 12:02:51 +10:00
parent 7a34f83d40
commit a35c1a058d
6 changed files with 170 additions and 90 deletions
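
In short: the media check now rewrites base64 data: URIs in image references into regular files in the media folder, named after the content hash. Below is a minimal standalone sketch of that step, assuming the same regex and paste-<sha1>.<ext> naming scheme as the diff; the function name extract_inline_image and the use of the sha1/hex crates (instead of Anki's internal sha1_of_data helper) are illustrative, not part of this commit:

    use data_encoding::BASE64;
    use regex::Regex;
    use sha1::{Digest, Sha1};

    /// Turn a data:image/...;base64 URI into (filename, decoded bytes),
    /// or return None if the source is not an inline image.
    fn extract_inline_image(src: &str) -> Option<(String, Vec<u8>)> {
        let re = Regex::new("(?i)^data:image/(jpg|jpeg|png|gif|webp);base64,(.+)$").unwrap();
        let caps = re.captures(src)?;
        let data = BASE64.decode(caps[2].trim().as_bytes()).ok()?;
        // The file is named after the SHA-1 of the decoded bytes, so pasting
        // the same image twice maps to a single file on disk.
        let name = format!("paste-{}.{}", hex::encode(Sha1::digest(&data)), &caps[1]);
        Some((name, data))
    }

For example, "Zm9v" is base64 for b'foo', whose SHA-1 is 0beec7b5ea3f0fdbc95d0dd47f3c5bc275da8a33, so <img src='data:image/jpg;base64,Zm9v'> is rewritten to <img src='paste-0beec7b5ea3f0fdbc95d0dd47f3c5bc275da8a33.jpg'> with the decoded bytes written to that file; see the inlined_images test at the end of the diff.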

Cargo.lock (generated)

@@ -108,6 +108,7 @@ dependencies = [
  "convert_case",
  "criterion",
  "csv",
+ "data-encoding",
  "difflib",
  "dirs",
  "envy",
@@ -934,6 +935,12 @@ dependencies = [
  "memchr",
 ]
 
+[[package]]
+name = "data-encoding"
+version = "2.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c2e66c9d817f1720209181c316d28635c050fa304f9c79e47a520882661b7308"
+
 [[package]]
 name = "deadpool"
 version = "0.9.5"

Cargo.toml

@@ -61,6 +61,7 @@ clap = { version = "4.3.10", features = ["derive"] }
 coarsetime = "0.1.23"
 convert_case = "0.6.0"
 criterion = { version = "0.5.1" }
+data-encoding = "2.4.0"
 difflib = "0.4.0"
 flate2 = "1.0.26"
 fluent = "0.16.0"

cargo/licenses.json

@@ -494,6 +494,15 @@
     "license_file": null,
     "description": "Bare bones CSV parsing with no_std support."
   },
+  {
+    "name": "data-encoding",
+    "version": "2.4.0",
+    "authors": "Julien Cretin <git@ia0.eu>",
+    "repository": "https://github.com/ia0/data-encoding",
+    "license": "MIT",
+    "license_file": null,
+    "description": "Efficient and customizable data-encoding functions like base64, base32, and hex"
+  },
   {
     "name": "deadpool",
     "version": "0.9.5",

ftl/core/media-check.ftl

@@ -14,6 +14,7 @@ media-check-unused-count = Unused files: { $count }
 media-check-renamed-count = Renamed files: { $count }
 media-check-oversize-count = Over 100MB: { $count }
 media-check-subfolder-count = Subfolders: { $count }
+media-check-extracted-count = Extracted images: { $count }
 
 ## Shown at the top of each section

rslib/Cargo.toml

@@ -52,6 +52,7 @@ chrono.workspace = true
 coarsetime.workspace = true
 convert_case.workspace = true
 csv.workspace = true
+data-encoding.workspace = true
 difflib.workspace = true
 dirs.workspace = true
 envy.workspace = true

rslib/src/media/check.rs

@@ -6,16 +6,21 @@ use std::collections::HashMap;
 use std::collections::HashSet;
 use std::fs;
 use std::io;
-use std::path::Path;
 
 use anki_i18n::without_unicode_isolation;
+use anki_io::write_file;
+use data_encoding::BASE64;
+use once_cell::sync::Lazy;
+use regex::Regex;
 use tracing::debug;
+use tracing::info;
 
 use crate::error::DbErrorKind;
 use crate::latex::extract_latex_expanding_clozes;
 use crate::media::files::data_for_file;
 use crate::media::files::filename_if_normalized;
 use crate::media::files::normalize_nfc_filename;
+use crate::media::files::sha1_of_data;
 use crate::media::files::trash_folder;
 use crate::media::MediaManager;
 use crate::prelude::*;
@@ -24,6 +29,7 @@ use crate::sync::media::progress::MediaCheckProgress;
 use crate::sync::media::MAX_INDIVIDUAL_MEDIA_FILE_SIZE;
 use crate::text::extract_media_refs;
 use crate::text::normalize_to_nfc;
+use crate::text::CowMapping;
 use crate::text::MediaRef;
 use crate::text::REMOTE_FILENAME;
@@ -37,6 +43,7 @@ pub struct MediaCheckOutput {
     pub oversize: Vec<String>,
     pub trash_count: u64,
     pub trash_bytes: u64,
+    pub inlined_image_count: u64,
 }
 
 #[derive(Debug, PartialEq, Eq, Default)]
@@ -57,6 +64,7 @@ pub struct MediaChecker<'a> {
     col: &'a mut Collection,
     media: MediaManager,
     progress: ThrottlingProgressHandler<MediaCheckProgress>,
+    inlined_image_count: u64,
 }
 
 impl MediaChecker<'_> {
@@ -65,6 +73,7 @@ impl MediaChecker<'_> {
             media: col.media()?,
             progress: col.new_progress_handler(),
             col,
+            inlined_image_count: 0,
         })
     }
@@ -82,6 +91,7 @@ impl MediaChecker<'_> {
             oversize: folder_check.oversize,
             trash_count,
             trash_bytes,
+            inlined_image_count: self.inlined_image_count,
         })
     }
@@ -102,6 +112,11 @@ impl MediaChecker<'_> {
         buf += &tr.media_check_unused_count(output.unused.len());
         buf.push('\n');
 
+        if output.inlined_image_count > 0 {
+            buf += &tr.media_check_extracted_count(output.inlined_image_count);
+            buf.push('\n');
+        }
+
         if !output.renamed.is_empty() {
             buf += &tr.media_check_renamed_count(output.renamed.len());
             buf.push('\n');
@@ -344,12 +359,7 @@ impl MediaChecker<'_> {
                 .or_insert_with(Vec::new)
                 .push(nid)
         };
-        if fix_and_extract_media_refs(
-            &mut note,
-            &mut tracker,
-            renamed,
-            &self.media.media_folder,
-        )? {
+        if self.fix_and_extract_media_refs(&mut note, &mut tracker, renamed)? {
             // note was modified, needs saving
             note.prepare_for_update(nt, false)?;
             note.set_modified(usn);
@@ -368,24 +378,19 @@ impl MediaChecker<'_> {
         Ok(referenced_files)
     }
-}
 
     /// Returns true if note was modified.
     fn fix_and_extract_media_refs(
+        &mut self,
         note: &mut Note,
         mut tracker: impl FnMut(String),
         renamed: &HashMap<String, String>,
-        media_folder: &Path,
     ) -> Result<bool> {
         let mut updated = false;
         for idx in 0..note.fields().len() {
-            let field = normalize_and_maybe_rename_files(
-                &note.fields()[idx],
-                renamed,
-                &mut tracker,
-                media_folder,
-            );
+            let field =
+                self.normalize_and_maybe_rename_files(&note.fields()[idx], renamed, &mut tracker)?;
             if let Cow::Owned(field) = field {
                 // field was modified, need to save
                 note.set_field(idx, field)?;
@@ -394,16 +399,16 @@ fn fix_and_extract_media_refs(
         }
         Ok(updated)
     }
 
     /// Convert any filenames that are not in NFC form into NFC,
     /// and update any files that were renamed on disk.
     fn normalize_and_maybe_rename_files<'a>(
+        &mut self,
         field: &'a str,
         renamed: &HashMap<String, String>,
         mut tracker: impl FnMut(String),
-        media_folder: &Path,
-    ) -> Cow<'a, str> {
+    ) -> Result<Cow<'a, str>> {
         let refs = extract_media_refs(field);
         let mut field: Cow<str> = field.into();
@@ -413,8 +418,10 @@ fn normalize_and_maybe_rename_files<'a>(
             continue;
         }
 
+        let mut fname = self.maybe_extract_inline_image(&media_ref.fname_decoded)?;
+
         // normalize fname into NFC
-        let mut fname = normalize_to_nfc(&media_ref.fname_decoded);
+        fname = fname.map_cow(normalize_to_nfc);
         // and look it up to see if it's been renamed
         if let Some(new_name) = renamed.get(fname.as_ref()) {
             fname = new_name.to_owned().into();
@@ -427,7 +434,7 @@ fn normalize_and_maybe_rename_files<'a>(
         // files stored on AnkiWeb are in normalized form.
         if matches!(fname, Cow::Borrowed(_)) {
             if let Cow::Owned(normname) = normalize_nfc_filename(fname.as_ref().into()) {
-                let path = media_folder.join(&normname);
+                let path = self.media.media_folder.join(&normname);
                 if path.exists() {
                     fname = normname.into();
                 }
@@ -441,7 +448,32 @@ fn normalize_and_maybe_rename_files<'a>(
             tracker(fname.into_owned());
         }
 
-    field
+        Ok(field)
+    }
+
+    fn maybe_extract_inline_image<'a>(&mut self, fname_decoded: &'a str) -> Result<Cow<'a, str>> {
+        static BASE64_IMG: Lazy<Regex> = Lazy::new(|| {
+            Regex::new("(?i)^data:image/(jpg|jpeg|png|gif|webp);base64,(.+)$").unwrap()
+        });
+
+        let Some(caps) = BASE64_IMG.captures(fname_decoded) else {
+            return Ok(fname_decoded.into());
+        };
+        let (_all, [ext, data]) = caps.extract();
+        let data = data.trim();
+        let data = match BASE64.decode(data.as_bytes()) {
+            Ok(data) => data,
+            Err(err) => {
+                info!("invalid base64: {}", err);
+                return Ok(fname_decoded.into());
+            }
+        };
+        let checksum = hex::encode(sha1_of_data(&data));
+        let external_fname = format!("paste-{checksum}.{ext}");
+        write_file(self.media.media_folder.join(&external_fname), data)?;
+        self.inlined_image_count += 1;
+
+        Ok(external_fname.into())
+    }
 }
 
 fn rename_media_ref_in_field(field: &str, media_ref: &MediaRef, new_name: &str) -> String {
@@ -502,8 +534,10 @@ pub(crate) mod test {
     include_bytes!("../../tests/support/mediacheck.anki2");
 
     use std::collections::HashMap;
-    use std::path::Path;
 
     use anki_io::create_dir;
+    use anki_io::read_to_string;
     use anki_io::write_file;
     use tempfile::tempdir;
     use tempfile::TempDir;
@@ -558,7 +592,8 @@ pub(crate) mod test {
                 dirs: vec!["folder".to_string()],
                 oversize: vec![],
                 trash_count: 0,
-                trash_bytes: 0
+                trash_bytes: 0,
+                inlined_image_count: 0,
             }
         );
@@ -675,7 +710,8 @@ Unused: unused.jpg
                 dirs: vec![],
                 oversize: vec![],
                 trash_count: 0,
-                trash_bytes: 0
+                trash_bytes: 0,
+                inlined_image_count: 0,
             }
         );
 
         assert!(fs::metadata(mgr.media_folder.join("ぱぱ.jpg")).is_ok());
@@ -693,7 +729,8 @@ Unused: unused.jpg
                 dirs: vec![],
                 oversize: vec![],
                 trash_count: 0,
-                trash_bytes: 0
+                trash_bytes: 0,
+                inlined_image_count: 0,
             }
         );
 
         assert!(fs::metadata(mgr.media_folder.join("ぱぱ.jpg")).is_err());
@@ -703,31 +740,55 @@ Unused: unused.jpg
         Ok(())
     }
 
-    fn normalize_and_maybe_rename_files_helper(field: &str) -> HashSet<String> {
+    fn normalize_and_maybe_rename_files_helper(
+        checker: &mut MediaChecker,
+        field: &str,
+    ) -> HashSet<String> {
         let mut seen = HashSet::new();
-        normalize_and_maybe_rename_files(
-            field,
-            &HashMap::new(),
-            |fname| {
-                seen.insert(fname);
-            },
-            Path::new("/tmp"),
-        );
+        checker
+            .normalize_and_maybe_rename_files(field, &HashMap::new(), |fname| {
+                seen.insert(fname);
+            })
+            .unwrap();
         seen
     }
 
     #[test]
-    fn html_encoding() {
+    fn html_encoding() -> Result<()> {
+        let (_dir, _mgr, mut col) = common_setup()?;
+        let mut checker = col.media_checker()?;
+
         let mut field = "[sound:a &amp; b.mp3]";
-        let seen = normalize_and_maybe_rename_files_helper(field);
+        let seen = normalize_and_maybe_rename_files_helper(&mut checker, field);
         assert!(seen.contains("a & b.mp3"));
 
         field = r#"<img src="a&b.jpg">"#;
-        let seen = normalize_and_maybe_rename_files_helper(field);
+        let seen = normalize_and_maybe_rename_files_helper(&mut checker, field);
         assert!(seen.contains("a&b.jpg"));
 
         field = r#"<img src="a&amp;b.jpg">"#;
-        let seen = normalize_and_maybe_rename_files_helper(field);
+        let seen = normalize_and_maybe_rename_files_helper(&mut checker, field);
         assert!(seen.contains("a&b.jpg"));
+
+        Ok(())
+    }
+
+    #[test]
+    fn inlined_images() -> Result<()> {
+        let (_dir, mgr, mut col) = common_setup()?;
+        NoteAdder::basic(&mut col)
+            // b'foo'
+            .fields(&["foo", "<img src='data:image/jpg;base64,Zm9v'>"])
+            .add(&mut col);
+
+        let mut checker = col.media_checker()?;
+        let output = checker.check()?;
+        assert_eq!(output.inlined_image_count, 1);
+        assert_eq!(
+            &read_to_string(
+                mgr.media_folder
+                    .join("paste-0beec7b5ea3f0fdbc95d0dd47f3c5bc275da8a33.jpg")
            )?,
+            "foo"
+        );
+
+        Ok(())
+    }
 }