diff --git a/rslib/Cargo.toml b/rslib/Cargo.toml index b0443ae8f..9f5b18ceb 100644 --- a/rslib/Cargo.toml +++ b/rslib/Cargo.toml @@ -29,6 +29,7 @@ log = "0.4.8" serde_tuple = "0.4.0" coarsetime = "0.1.12" utime = "0.2.1" +serde-aux = "0.6.1" [target.'cfg(target_vendor="apple")'.dependencies] rusqlite = { version = "0.21.0", features = ["trace"] } diff --git a/rslib/src/lib.rs b/rslib/src/lib.rs index d3ff6d8a1..a888f812b 100644 --- a/rslib/src/lib.rs +++ b/rslib/src/lib.rs @@ -17,3 +17,5 @@ pub mod sched; pub mod template; pub mod template_filters; pub mod text; +pub mod time; +pub mod types; diff --git a/rslib/src/media/check.rs b/rslib/src/media/check.rs index 193cc7e95..126e0c6ad 100644 --- a/rslib/src/media/check.rs +++ b/rslib/src/media/check.rs @@ -2,29 +2,37 @@ // License: GNU AGPL, version 3 or later; http://www.gnu.org/licenses/agpl.html use crate::err::{AnkiError, Result}; +use crate::media::col::{ + for_every_note, get_note_types, mark_collection_modified, open_or_create_collection_db, + set_note, Note, +}; use crate::media::database::MediaDatabaseContext; use crate::media::files::{ data_for_file, filename_if_normalized, remove_files, trash_folder, MEDIA_SYNC_FILESIZE_LIMIT, }; -use crate::media::MediaManager; +use crate::text::{normalize_to_nfc, MediaRef}; +use crate::{media::MediaManager, text::extract_media_refs}; use coarsetime::Instant; use log::debug; +use std::collections::{HashMap, HashSet}; +use std::path::Path; use std::{borrow::Cow, fs, time}; #[derive(Debug, PartialEq)] pub struct MediaCheckOutput { - files: Vec, - renamed: Vec, + unused: Vec, + missing: Vec, + renamed: HashMap, dirs: Vec, oversize: Vec, } -/// A file that was renamed due to invalid chars or non-NFC encoding. -/// On Apple computers, files in NFD format are not renamed. -#[derive(Debug, PartialEq)] -pub struct RenamedFile { - current_fname: String, - original_fname: String, +#[derive(Debug, PartialEq, Default)] +struct MediaFolderCheck { + files: Vec, + renamed: HashMap, + dirs: Vec, + oversize: Vec, } pub struct MediaChecker<'a, P> @@ -32,6 +40,7 @@ where P: FnMut(usize) -> bool, { mgr: &'a MediaManager, + col_path: &'a Path, progress_cb: P, checked: usize, progress_updated: Instant, @@ -41,9 +50,14 @@ impl

MediaChecker<'_, P> where P: FnMut(usize) -> bool, { - pub fn new(mgr: &MediaManager, progress_cb: P) -> MediaChecker<'_, P> { + pub fn new<'a>( + mgr: &'a MediaManager, + col_path: &'a Path, + progress_cb: P, + ) -> MediaChecker<'a, P> { MediaChecker { mgr, + col_path, progress_cb, checked: 0, progress_updated: Instant::now(), @@ -53,12 +67,28 @@ where pub fn check(&mut self) -> Result { self.expire_old_trash()?; - // loop through on-disk files - let mut dirs = vec![]; - let mut oversize = vec![]; - let mut all_files = vec![]; - let mut renamed_files = vec![]; let mut ctx = self.mgr.dbctx(); + + let folder_check = self.check_media_folder(&mut ctx)?; + let referenced_files = self.check_media_references(&folder_check.renamed)?; + let (unused, missing) = find_unused_and_missing(folder_check.files, referenced_files); + + Ok(MediaCheckOutput { + unused, + missing, + renamed: folder_check.renamed, + dirs: folder_check.dirs, + oversize: folder_check.oversize, + }) + } + + /// Check all the files in the media folder. + /// + /// - Renames files with invalid names + /// - Notes folders/oversized files + /// - Gathers a list of all files + fn check_media_folder(&mut self, ctx: &mut MediaDatabaseContext) -> Result { + let mut out = MediaFolderCheck::default(); for dentry in self.mgr.media_folder.read_dir()? { let dentry = dentry?; @@ -76,14 +106,14 @@ where // skip folders if dentry.file_type()?.is_dir() { - dirs.push(disk_fname.to_string()); + out.dirs.push(disk_fname.to_string()); continue; } // ignore large files and zero byte files let metadata = dentry.metadata()?; if metadata.len() > MEDIA_SYNC_FILESIZE_LIMIT as u64 { - oversize.push(disk_fname.to_string()); + out.oversize.push(disk_fname.to_string()); continue; } if metadata.len() == 0 { @@ -91,23 +121,21 @@ where } // rename if required - let (norm_name, renamed) = self.normalize_and_maybe_rename(&mut ctx, &disk_fname)?; + let (norm_name, renamed) = self.normalize_and_maybe_rename(ctx, &disk_fname)?; if renamed { - renamed_files.push(RenamedFile { - current_fname: norm_name.to_string(), - original_fname: disk_fname.to_string(), - }) + let orig_as_nfc = normalize_to_nfc(&disk_fname); + // if the only difference is the unicode normalization, + // we don't mark the file as a renamed file + if orig_as_nfc.as_ref() != norm_name.as_ref() { + out.renamed + .insert(orig_as_nfc.to_string(), norm_name.to_string()); + } } - all_files.push(norm_name.into_owned()); + out.files.push(norm_name.into_owned()); } - Ok(MediaCheckOutput { - files: all_files, - renamed: renamed_files, - dirs, - oversize, - }) + Ok(out) } /// Returns (normalized_form, needs_rename) @@ -182,30 +210,149 @@ where Ok(()) } + + /// Find all media references in notes, fixing as necessary. + fn check_media_references( + &mut self, + renamed: &HashMap, + ) -> Result> { + let mut db = open_or_create_collection_db(self.col_path)?; + let trx = db.transaction()?; + + let mut referenced_files = HashSet::new(); + let note_types = get_note_types(&trx)?; + let mut collection_modified = false; + + for_every_note(&trx, |note| { + self.checked += 1; + if self.checked % 10 == 0 { + self.maybe_fire_progress_cb()?; + } + if fix_and_extract_media_refs(note, &mut referenced_files, renamed)? { + // note was modified, needs saving + set_note( + &trx, + note, + note_types + .get(¬e.mid) + .ok_or_else(|| AnkiError::DBError { + info: "missing note type".to_string(), + })?, + )?; + collection_modified = true; + } + + Ok(()) + })?; + + if collection_modified { + mark_collection_modified(&trx)?; + trx.commit()?; + } + + Ok(referenced_files) + } +} + +/// Returns true if note was modified. +fn fix_and_extract_media_refs( + note: &mut Note, + seen_files: &mut HashSet, + renamed: &HashMap, +) -> Result { + let mut updated = false; + + for idx in 0..note.fields().len() { + let field = normalize_and_maybe_rename_files(¬e.fields()[idx], renamed, seen_files); + if let Cow::Owned(field) = field { + // field was modified, need to save + note.set_field(idx, field)?; + updated = true; + } + } + + Ok(updated) +} + +/// Convert any filenames that are not in NFC form into NFC, +/// and update any files that were renamed on disk. +fn normalize_and_maybe_rename_files<'a>( + field: &'a str, + renamed: &HashMap, + seen_files: &mut HashSet, +) -> Cow<'a, str> { + let refs = extract_media_refs(field); + let mut field: Cow = field.into(); + + for media_ref in refs { + // normalize fname into NFC + let mut fname = normalize_to_nfc(media_ref.fname); + // and look it up to see if it's been renamed + if let Some(new_name) = renamed.get(fname.as_ref()) { + fname = new_name.to_owned().into(); + } + // if it was not in NFC or was renamed, update the field + if let Cow::Owned(ref new_name) = fname { + field = rename_media_ref_in_field(field.as_ref(), &media_ref, new_name).into(); + } + // and mark this filename as having been referenced + seen_files.insert(fname.into_owned()); + } + + field +} + +fn rename_media_ref_in_field(field: &str, media_ref: &MediaRef, new_name: &str) -> String { + let updated_tag = media_ref.full_ref.replace(media_ref.fname, new_name); + field.replace(media_ref.full_ref, &updated_tag) +} + +/// Returns (unused, missing) +fn find_unused_and_missing( + files: Vec, + mut references: HashSet, +) -> (Vec, Vec) { + let mut unused = vec![]; + + for file in files { + if !references.contains(&file) { + unused.push(file); + } else { + references.remove(&file); + } + } + + (unused, references.into_iter().collect()) } #[cfg(test)] mod test { use crate::err::Result; - use crate::media::check::{MediaCheckOutput, MediaChecker, RenamedFile}; + use crate::media::check::{MediaCheckOutput, MediaChecker}; use crate::media::MediaManager; use std::fs; + use std::path::PathBuf; use tempfile::{tempdir, TempDir}; - fn common_setup() -> Result<(TempDir, MediaManager)> { + fn common_setup() -> Result<(TempDir, MediaManager, PathBuf)> { let dir = tempdir()?; let media_dir = dir.path().join("media"); fs::create_dir(&media_dir)?; let media_db = dir.path().join("media.db"); + let col_path = dir.path().join("col.anki2"); + fs::write( + &col_path, + &include_bytes!("../../tests/support/mediacheck.anki2")[..], + )?; let mgr = MediaManager::new(&media_dir, media_db)?; - Ok((dir, mgr)) + Ok((dir, mgr, col_path)) } #[test] fn test_media_check() -> Result<()> { - let (_dir, mgr) = common_setup()?; + let (_dir, mgr, col_path) = common_setup()?; // add some test files fs::write(&mgr.media_folder.join("zerobytes"), "")?; @@ -214,18 +361,17 @@ mod test { fs::write(&mgr.media_folder.join("foo[.jpg"), "foo")?; let progress = |_n| true; - let mut checker = MediaChecker::new(&mgr, progress); - let mut output = checker.check()?; - output.files.sort(); + let mut checker = MediaChecker::new(&mgr, &col_path, progress); + let output = checker.check()?; assert_eq!( output, MediaCheckOutput { - files: vec!["foo.jpg".to_string(), "normal.jpg".to_string()], - renamed: vec![RenamedFile { - current_fname: "foo.jpg".to_string(), - original_fname: "foo[.jpg".to_string() - }], + unused: vec![], + missing: vec!["ぱぱ.jpg".into()], + renamed: vec![("foo[.jpg".into(), "foo.jpg".into())] + .into_iter() + .collect(), dirs: vec!["folder".to_string()], oversize: vec![] } @@ -239,13 +385,14 @@ mod test { #[test] fn test_unicode_normalization() -> Result<()> { - let (_dir, mgr) = common_setup()?; + let (_dir, mgr, col_path) = common_setup()?; fs::write(&mgr.media_folder.join("ぱぱ.jpg"), "nfd encoding")?; let progress = |_n| true; - let mut checker = MediaChecker::new(&mgr, progress); - let output = checker.check()?; + let mut checker = MediaChecker::new(&mgr, &col_path, progress); + let mut output = checker.check()?; + output.missing.sort(); if cfg!(target_vendor = "apple") { // on a Mac, the file should not have been renamed, but the returned name @@ -253,8 +400,9 @@ mod test { assert_eq!( output, MediaCheckOutput { - files: vec!["ぱぱ.jpg".to_string()], - renamed: vec![], + unused: vec![], + missing: vec!["foo[.jpg".into(), "normal.jpg".into()], + renamed: Default::default(), dirs: vec![], oversize: vec![] } @@ -265,11 +413,11 @@ mod test { assert_eq!( output, MediaCheckOutput { - files: vec!["ぱぱ.jpg".to_string()], - renamed: vec![RenamedFile { - current_fname: "ぱぱ.jpg".to_string(), - original_fname: "ぱぱ.jpg".to_string() - }], + unused: vec![], + missing: vec!["foo[.jpg".into(), "normal.jpg".into()], + renamed: vec![("ぱぱ.jpg".into(), "ぱぱ.jpg".into())] + .into_iter() + .collect(), dirs: vec![], oversize: vec![] } diff --git a/rslib/src/media/col.rs b/rslib/src/media/col.rs new file mode 100644 index 000000000..51bbb8796 --- /dev/null +++ b/rslib/src/media/col.rs @@ -0,0 +1,153 @@ +// Copyright: Ankitects Pty Ltd and contributors +// License: GNU AGPL, version 3 or later; http://www.gnu.org/licenses/agpl.html + +/// Basic note reading/updating functionality for the media DB check. +use crate::err::{AnkiError, Result}; +use crate::text::strip_html_preserving_image_filenames; +use crate::time::i64_unix_timestamp; +use crate::types::{ObjID, Timestamp, Usn}; +use rusqlite::{params, Connection, Row, NO_PARAMS}; +use serde_aux::field_attributes::deserialize_number_from_string; +use serde_derive::Deserialize; +use std::collections::HashMap; +use std::convert::TryInto; +use std::path::Path; + +#[derive(Debug)] +pub(super) struct Note { + pub id: ObjID, + pub mid: ObjID, + pub mtime_secs: Timestamp, + pub usn: Usn, + fields: Vec, +} + +impl Note { + pub fn fields(&self) -> &Vec { + &self.fields + } + + pub fn set_field(&mut self, idx: usize, text: impl Into) -> Result<()> { + if idx >= self.fields.len() { + return Err(AnkiError::invalid_input( + "field idx out of range".to_string(), + )); + } + + self.fields[idx] = text.into(); + + Ok(()) + } +} + +fn field_checksum(text: &str) -> u32 { + let digest = sha1::Sha1::from(text).digest().bytes(); + u32::from_be_bytes(digest[..4].try_into().unwrap()) +} + +pub(super) fn open_or_create_collection_db(path: &Path) -> Result { + let db = Connection::open(path)?; + + db.pragma_update(None, "locking_mode", &"exclusive")?; + db.pragma_update(None, "page_size", &4096)?; + db.pragma_update(None, "cache_size", &(-40 * 1024))?; + db.pragma_update(None, "legacy_file_format", &false)?; + db.pragma_update(None, "journal", &"wal")?; + db.set_prepared_statement_cache_capacity(5); + + Ok(db) +} + +#[derive(Deserialize, Debug)] +pub(super) struct NoteType { + #[serde(deserialize_with = "deserialize_number_from_string")] + id: ObjID, + #[serde(rename = "sortf")] + sort_field_idx: u16, +} + +pub(super) fn get_note_types(db: &Connection) -> Result> { + let mut stmt = db.prepare("select models from col")?; + let note_types = stmt + .query_and_then(NO_PARAMS, |row| -> Result> { + let v: HashMap = serde_json::from_str(row.get_raw(0).as_str()?)?; + Ok(v) + })? + .next() + .ok_or_else(|| AnkiError::DBError { + info: "col table empty".to_string(), + })??; + Ok(note_types) +} + +#[allow(dead_code)] +fn get_note(db: &Connection, nid: ObjID) -> Result> { + let mut stmt = db.prepare_cached("select id, mid, mod, usn, flds from notes where id=?")?; + let note = stmt.query_and_then(params![nid], row_to_note)?.next(); + + note.transpose() +} + +pub(super) fn for_every_note Result<()>>( + db: &Connection, + mut func: F, +) -> Result<()> { + let mut stmt = db.prepare("select id, mid, mod, usn, flds from notes")?; + for result in stmt.query_and_then(NO_PARAMS, |row| { + let mut note = row_to_note(row)?; + func(&mut note) + })? { + result?; + } + Ok(()) +} + +fn row_to_note(row: &Row) -> Result { + Ok(Note { + id: row.get(0)?, + mid: row.get(1)?, + mtime_secs: row.get(2)?, + usn: row.get(3)?, + fields: row + .get_raw(4) + .as_str()? + .split('\x1f') + .map(|s| s.to_string()) + .collect(), + }) +} + +pub(super) fn set_note(db: &Connection, note: &mut Note, note_type: &NoteType) -> Result<()> { + note.mtime_secs = i64_unix_timestamp(); + // hard-coded for now + note.usn = -1; + let csum = field_checksum(¬e.fields()[0]); + let sort_field = strip_html_preserving_image_filenames( + note.fields() + .get(note_type.sort_field_idx as usize) + .ok_or_else(|| AnkiError::DBError { + info: "sort field out of range".to_string(), + })?, + ); + + let mut stmt = + db.prepare_cached("update notes set mod=?,usn=?,flds=?,sfld=?,csum=? where id=?")?; + stmt.execute(params![ + note.mtime_secs, + note.usn, + note.fields().join("\x1f"), + sort_field, + csum, + note.id, + ])?; + + Ok(()) +} + +pub(super) fn mark_collection_modified(db: &Connection) -> Result<()> { + db.execute( + "update col set usn=-1, mod=?", + params![i64_unix_timestamp()], + )?; + Ok(()) +} diff --git a/rslib/src/media/mod.rs b/rslib/src/media/mod.rs index 911921c5e..51eaab001 100644 --- a/rslib/src/media/mod.rs +++ b/rslib/src/media/mod.rs @@ -11,6 +11,7 @@ use std::path::{Path, PathBuf}; pub mod changetracker; pub mod check; +pub mod col; pub mod database; pub mod files; pub mod sync; diff --git a/rslib/src/text.rs b/rslib/src/text.rs index a612632f2..ae910577d 100644 --- a/rslib/src/text.rs +++ b/rslib/src/text.rs @@ -31,8 +31,33 @@ lazy_static! { .unwrap(); static ref IMG_TAG: Regex = Regex::new( - // group 1 is filename - r#"(?i)]+src=["']?([^"'>]+)["']?[^>]*>"# + r#"(?xsi) + # the start of the image tag + ]+src= + (?: + # 1: double-quoted filename + " + ([^"]+?) + " + [^>]*> + | + # 2: single-quoted filename + ' + ([^']+?) + ' + [^>]*> + | + # 3: unquoted filename + ([^ >]+?) + (?: + # then either a space and the rest + \x20[^>]*> + | + # or the tag immediately ends + > + ) + ) + "# ).unwrap(); // videos are also in sound tags @@ -106,6 +131,39 @@ pub fn extract_av_tags<'a>(text: &'a str, question_side: bool) -> (Cow<'a, str>, (replaced_text, tags) } +#[derive(Debug)] +pub(crate) struct MediaRef<'a> { + pub full_ref: &'a str, + pub fname: &'a str, +} + +pub(crate) fn extract_media_refs(text: &str) -> Vec { + let mut out = vec![]; + + for caps in IMG_TAG.captures_iter(text) { + out.push(MediaRef { + full_ref: caps.get(0).unwrap().as_str(), + fname: caps + .get(1) + .or_else(|| caps.get(2)) + .or_else(|| caps.get(3)) + .unwrap() + .as_str(), + }); + } + + for caps in AV_TAGS.captures_iter(text) { + if let Some(m) = caps.get(1) { + out.push(MediaRef { + full_ref: caps.get(0).unwrap().as_str(), + fname: m.as_str(), + }); + } + } + + out +} + fn tts_tag_from_string<'a>(field_text: &'a str, args: &'a str) -> AVTag { let mut other_args = vec![]; let mut split_args = args.split_ascii_whitespace(); @@ -141,7 +199,7 @@ fn tts_tag_from_string<'a>(field_text: &'a str, args: &'a str) -> AVTag { } pub fn strip_html_preserving_image_filenames(html: &str) -> Cow { - let without_fnames = IMG_TAG.replace_all(html, r" $1 "); + let without_fnames = IMG_TAG.replace_all(html, r" ${1}${2}${3} "); let without_html = HTML.replace_all(&without_fnames, ""); // no changes? if let Cow::Borrowed(b) = without_html { @@ -157,7 +215,6 @@ pub(crate) fn contains_latex(text: &str) -> bool { LATEX.is_match(text) } -#[allow(dead_code)] pub(crate) fn normalize_to_nfc(s: &str) -> Cow { if !is_nfc(s) { s.chars().nfc().collect::().into() diff --git a/rslib/src/time.rs b/rslib/src/time.rs new file mode 100644 index 000000000..fcdede0d2 --- /dev/null +++ b/rslib/src/time.rs @@ -0,0 +1,11 @@ +// Copyright: Ankitects Pty Ltd and contributors +// License: GNU AGPL, version 3 or later; http://www.gnu.org/licenses/agpl.html + +use std::time; + +pub(crate) fn i64_unix_timestamp() -> i64 { + time::SystemTime::now() + .duration_since(time::SystemTime::UNIX_EPOCH) + .unwrap() + .as_secs() as i64 +} diff --git a/rslib/src/types.rs b/rslib/src/types.rs new file mode 100644 index 000000000..0ae6e2a83 --- /dev/null +++ b/rslib/src/types.rs @@ -0,0 +1,9 @@ +// Copyright: Ankitects Pty Ltd and contributors +// License: GNU AGPL, version 3 or later; http://www.gnu.org/licenses/agpl.html + +// while Anki tends to only use positive numbers, sqlite only supports +// signed integers, so these numbers are signed as well. + +pub type ObjID = i64; +pub type Usn = i32; +pub type Timestamp = i64; diff --git a/rslib/tests/support/mediacheck.anki2 b/rslib/tests/support/mediacheck.anki2 new file mode 100644 index 000000000..5580db4ef Binary files /dev/null and b/rslib/tests/support/mediacheck.anki2 differ