From 7583a7e6b36f955027e0e767ac079e3671c2272f Mon Sep 17 00:00:00 2001 From: RumovZ Date: Thu, 7 Apr 2022 12:45:02 +0200 Subject: [PATCH] Prepare media based on checksums - Ensure all existing media files are hashed. - Hash incoming files during preparation to detect conflicts. - Uniquify names of conflicting files with hash (not notetype id). - Mark media files as used while importing notes. - Finally copy used media. --- .../src/import_export/package/apkg/import.rs | 157 +++++++++--------- rslib/src/import_export/package/media.rs | 20 ++- rslib/src/media/database.rs | 19 ++- rslib/src/media/files.rs | 9 +- rslib/src/media/mod.rs | 14 +- 5 files changed, 136 insertions(+), 83 deletions(-) diff --git a/rslib/src/import_export/package/apkg/import.rs b/rslib/src/import_export/package/apkg/import.rs index 05e691b20..bb2fe6853 100644 --- a/rslib/src/import_export/package/apkg/import.rs +++ b/rslib/src/import_export/package/apkg/import.rs @@ -2,11 +2,12 @@ // License: GNU AGPL, version 3 or later; http://www.gnu.org/licenses/agpl.html use std::{ + borrow::Cow, collections::{HashMap, HashSet}, - fs::{self, File}, - io::{self, Write}, + fs::File, + io::{self}, mem, - path::{Path, PathBuf}, + path::Path, sync::Arc, }; @@ -18,14 +19,19 @@ use crate::{ collection::CollectionBuilder, import_export::{ gather::ExchangeData, - package::{media::extract_media_entries, Meta}, + package::{ + media::{extract_media_entries, safe_normalized_file_name, SafeMediaEntry}, + Meta, + }, + }, + media::{ + files::{add_hash_suffix_to_file_stem, sha1_of_reader}, + MediaManager, }, - io::{atomic_rename, tempfile_in_parent_of}, prelude::*, text::replace_media_refs, }; -#[derive(Debug)] struct Context { archive: ZipArchive, guid_map: HashMap, @@ -34,8 +40,11 @@ struct Context { existing_notetypes: HashSet, data: ExchangeData, usn: Usn, - media_map: HashMap, - target_media_folder: PathBuf, + /// Map of source media files, that do not already exist in the target. 
+ /// + /// original, normalized file name → (referenced in imported material, + /// entry with possibly remapped file name) + used_media_entries: HashMap<String, (bool, SafeMediaEntry)>, conflicting_notes: HashSet<String>, } @@ -60,6 +69,22 @@ impl NoteMeta { } } +impl SafeMediaEntry { + fn with_hash_from_archive(&mut self, archive: &mut ZipArchive) -> Result<()> { + if self.sha1 == [0; 20] { + let mut reader = self.fetch_file(archive)?; + self.sha1 = sha1_of_reader(&mut reader)?; + } + Ok(()) + } + + /// Requires sha1 to be set. Returns old file name. + fn uniquify_name(&mut self) -> String { + let new_name = add_hash_suffix_to_file_stem(&self.name, &self.sha1); + mem::replace(&mut self.name, new_name) + } +} + impl Collection { pub fn import_apkg( &mut self, @@ -71,18 +96,19 @@ impl Collection { let archive = ZipArchive::new(file)?; let mut ctx = Context::new(archive, self, search, with_scheduling)?; + ctx.prepare_media(self)?; ctx.prepare_notetypes(self)?; ctx.prepare_notes()?; - self.insert_data(&ctx.data) + self.insert_data(&ctx.data)?; + ctx.copy_media(&self.media_folder)?; + Ok(()) } -} -fn build_media_map(archive: &mut ZipArchive) -> Result> { - Ok(extract_media_entries(&Meta::new_legacy(), archive)?
- .into_iter() - .map(|entry| (entry.name, entry.index.to_string())) - .collect()) + fn all_existing_sha1s(&mut self) -> Result> { + let mgr = MediaManager::new(&self.media_folder, &self.media_db)?; + mgr.all_checksums(|_| true, &self.log) + } } impl ExchangeData { @@ -111,21 +137,37 @@ impl Context { with_scheduling: bool, ) -> Result { let data = ExchangeData::gather_from_archive(&mut archive, search, with_scheduling)?; - let media_map = build_media_map(&mut archive)?; Ok(Self { archive, data, guid_map: target_col.storage.note_guid_map()?, existing_notes: target_col.storage.get_all_note_ids()?, existing_notetypes: target_col.storage.get_all_notetype_ids()?, - media_map, - target_media_folder: target_col.media_folder.clone(), usn: target_col.usn()?, conflicting_notes: HashSet::new(), remapped_notetypes: HashMap::new(), + used_media_entries: HashMap::new(), }) } + fn prepare_media(&mut self, target_col: &mut Collection) -> Result<()> { + let existing_sha1s = target_col.all_existing_sha1s()?; + for mut entry in extract_media_entries(&Meta::new_legacy(), &mut self.archive)? { + if let Some(other_sha1) = existing_sha1s.get(&entry.name) { + entry.with_hash_from_archive(&mut self.archive)?; + if entry.sha1 != *other_sha1 { + let original_name = entry.uniquify_name(); + self.used_media_entries + .insert(original_name, (false, entry)); + } + } else { + self.used_media_entries + .insert(entry.name.clone(), (false, entry)); + } + } + Ok(()) + } + fn prepare_notetypes(&mut self, target_col: &mut Collection) -> Result<()> { for notetype in mem::take(&mut self.data.notetypes) { if let Some(existing) = target_col.get_notetype(notetype.id)? { @@ -229,60 +271,28 @@ impl Context { } fn munge_media(&mut self, note: &mut Note) -> Result<()> { - let notetype_id = note.notetype_id; for field in note.fields_mut() { - if let Some(new_field) = self.replace_media_refs_fallible(field, notetype_id)? 
{ + if let Some(new_field) = self.replace_media_refs(field) { *field = new_field; }; } Ok(()) } - fn replace_media_refs_fallible( - &mut self, - field: &mut String, - notetype_id: NotetypeId, - ) -> Result> { - let mut res = Ok(()); - let out = replace_media_refs(field, |name| { - if res.is_err() { - None - } else { - self.merge_media_maybe_renaming(name, notetype_id) - .unwrap_or_else(|err| { - res = Err(err); - None - }) - } - }); - res.map(|_| out) - } - - fn merge_media_maybe_renaming( - &mut self, - name: &str, - notetype: NotetypeId, - ) -> Result> { - Ok(if let Some(zip_name) = self.media_map.get(name) { - let alternate_name = alternate_media_name(name, notetype); - let alternate_path = self.target_media_folder.join(&alternate_name); - if alternate_path.exists() { - Some(alternate_name) - } else { - let mut data = Vec::new(); - io::copy(&mut self.archive.by_name(zip_name)?, &mut data)?; - let target_path = self.target_media_folder.join(name); - if !target_path.exists() { - write_data_atomically(&data, &target_path)?; - None - } else if data == fs::read(target_path)? 
{ - None - } else { - write_data_atomically(&data, &alternate_path)?; - Some(alternate_name) + fn replace_media_refs(&mut self, field: &mut String) -> Option { + replace_media_refs(field, |name| { + if let Ok(normalized) = safe_normalized_file_name(name) { + if let Some((used, entry)) = self.used_media_entries.get_mut(normalized.as_ref()) { + *used = true; + if entry.name != name { + // name is not normalized, and/or remapped + return Some(entry.name.clone()); + } + } else if let Cow::Owned(s) = normalized { + // no entry; might be a reference to an existing file, so ensure normalization + return Some(s); } } - } else { None }) } @@ -292,20 +302,15 @@ impl Context { note_id.0 += 999; } } -} -fn write_data_atomically(data: &[u8], path: &Path) -> Result<()> { - let mut tempfile = tempfile_in_parent_of(path)?; - tempfile.write_all(data)?; - atomic_rename(tempfile, path, false) -} - -fn alternate_media_name(name: &str, notetype_id: NotetypeId) -> String { - let (stem, dot, extension) = name - .rsplit_once('.') - .map(|(stem, ext)| (stem, ".", ext)) - .unwrap_or((name, "", "")); - format!("{stem}_{notetype_id}{dot}{extension}") + fn copy_media(&mut self, media_folder: &Path) -> Result<()> { + for (used, entry) in self.used_media_entries.values() { + if *used { + entry.copy_from_archive(&mut self.archive, media_folder)?; + } + } + Ok(()) + } } impl Notetype { diff --git a/rslib/src/import_export/package/media.rs b/rslib/src/import_export/package/media.rs index be25d1c77..03f5b8615 100644 --- a/rslib/src/import_export/package/media.rs +++ b/rslib/src/import_export/package/media.rs @@ -10,19 +10,22 @@ use std::{ }; use prost::Message; +use tempfile::NamedTempFile; use zip::{read::ZipFile, ZipArchive}; use zstd::stream::copy_decode; use super::{MediaEntries, MediaEntry, Meta}; use crate::{ - error::ImportError, io::filename_is_safe, media::files::normalize_filename, prelude::*, + error::ImportError, + io::{atomic_rename, filename_is_safe}, + 
media::files::normalize_filename, + prelude::*, }; /// Like [MediaEntry], but with a safe filename and set zip filename. pub(super) struct SafeMediaEntry { pub(super) name: String, pub(super) size: u32, - #[allow(dead_code)] pub(super) sha1: [u8; 20], pub(super) index: usize, } @@ -98,6 +101,17 @@ impl SafeMediaEntry { .map(|metadata| metadata.len() as u64 == self_size) .unwrap_or_default() } + + pub(super) fn copy_from_archive( + &self, + archive: &mut ZipArchive, + target_folder: &Path, + ) -> Result<()> { + let mut file = self.fetch_file(archive)?; + let mut tempfile = NamedTempFile::new_in(target_folder)?; + io::copy(&mut file, &mut tempfile)?; + atomic_rename(tempfile, &self.file_path(target_folder), false) + } } pub(super) fn extract_media_entries( @@ -113,7 +127,7 @@ pub(super) fn extract_media_entries( } } -fn safe_normalized_file_name(name: &str) -> Result> { +pub(super) fn safe_normalized_file_name(name: &str) -> Result> { if !filename_is_safe(name) { Err(AnkiError::ImportError(ImportError::Corrupt)) } else { diff --git a/rslib/src/media/database.rs b/rslib/src/media/database.rs index 858035c33..d4ed3a20b 100644 --- a/rslib/src/media/database.rs +++ b/rslib/src/media/database.rs @@ -5,7 +5,7 @@ use std::{collections::HashMap, path::Path}; use rusqlite::{params, Connection, OptionalExtension, Row, Statement}; -use crate::error::Result; +use crate::prelude::*; fn trace(s: &str) { println!("sql: {}", s) @@ -222,6 +222,14 @@ delete from media where fname=?" Ok(map?) } + /// Error if any checksums are missing or broken. + pub(super) fn all_checksums(&mut self) -> Result> { + self.db + .prepare("SELECT fname, csum FROM media")? + .query_and_then([], row_to_name_and_checksum)? 
+ .collect() + } + pub(super) fn force_resync(&mut self) -> Result<()> { self.db .execute_batch("delete from media; update meta set lastUsn = 0, dirMod = 0") @@ -250,6 +258,15 @@ fn row_to_entry(row: &Row) -> rusqlite::Result { }) } +fn row_to_name_and_checksum(row: &Row) -> Result<(String, [u8; 20])> { + let file_name = row.get(0)?; + let sha1_str: String = row.get(1)?; + let mut sha1 = [0; 20]; + hex::decode_to_slice(sha1_str, &mut sha1) + .map_err(|_| AnkiError::invalid_input(format!("bad media checksum: {file_name}")))?; + Ok((file_name, sha1)) +} + #[cfg(test)] mod test { use tempfile::NamedTempFile; diff --git a/rslib/src/media/files.rs b/rslib/src/media/files.rs index b56b1baa3..12ef08145 100644 --- a/rslib/src/media/files.rs +++ b/rslib/src/media/files.rs @@ -194,7 +194,7 @@ where } /// Convert foo.jpg into foo-abcde12345679.jpg -fn add_hash_suffix_to_file_stem(fname: &str, hash: &[u8; 20]) -> String { +pub(crate) fn add_hash_suffix_to_file_stem(fname: &str, hash: &[u8; 20]) -> String { // when appending a hash to make unique, it will be 40 bytes plus the hyphen. let max_len = MAX_FILENAME_LENGTH - 40 - 1; @@ -283,10 +283,15 @@ fn existing_file_sha1(path: &Path) -> io::Result> { /// Return the SHA1 of a file, failing if it doesn't exist. pub(crate) fn sha1_of_file(path: &Path) -> io::Result<[u8; 20]> { let mut file = fs::File::open(path)?; + sha1_of_reader(&mut file) +} + +/// Return the SHA1 of a stream. 
+pub(crate) fn sha1_of_reader(reader: &mut impl Read) -> io::Result<[u8; 20]> { let mut hasher = Sha1::new(); let mut buf = [0; 64 * 1024]; loop { - match file.read(&mut buf) { + match reader.read(&mut buf) { Ok(0) => break, Ok(n) => hasher.update(&buf[0..n]), Err(e) => { diff --git a/rslib/src/media/mod.rs b/rslib/src/media/mod.rs index 351befa1f..c18d5882e 100644 --- a/rslib/src/media/mod.rs +++ b/rslib/src/media/mod.rs @@ -3,19 +3,21 @@ use std::{ borrow::Cow, + collections::HashMap, path::{Path, PathBuf}, }; use rusqlite::Connection; use slog::Logger; +use self::changetracker::ChangeTracker; use crate::{ - error::Result, media::{ database::{open_or_create, MediaDatabaseContext, MediaEntry}, files::{add_data_to_folder_uniquely, mtime_as_i64, remove_files, sha1_of_data}, sync::{MediaSyncProgress, MediaSyncer}, }, + prelude::*, }; pub mod changetracker; @@ -153,4 +155,14 @@ impl MediaManager { pub fn dbctx(&self) -> MediaDatabaseContext { MediaDatabaseContext::new(&self.db) } + + pub fn all_checksums( + &self, + progress: impl FnMut(usize) -> bool, + log: &Logger, + ) -> Result> { + let mut dbctx = self.dbctx(); + ChangeTracker::new(&self.media_folder, progress, log).register_changes(&mut dbctx)?; + dbctx.all_checksums() + } }