Prepare media based on checksums

- Ensure all existing media files are hashed.
- Hash incoming files during preparation to detect conflicts.
- Uniquify names of conflicting files with hash (not notetype id).
- Mark media files as used while importing notes.
- Finally, copy used media (see the sketch below).
RumovZ 2022-04-07 12:45:02 +02:00
parent a0085e7fd4
commit 7583a7e6b3
5 changed files with 136 additions and 83 deletions
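Taken together, the bullets describe a three-phase import: hash, remap, copy. Below is a minimal sketch of that flow using simplified stand-in types — `MediaEntry`, `add_hash_suffix` and `prepare_media` are illustrative names, not Anki's actual API:

use std::collections::HashMap;

// Simplified stand-in for SafeMediaEntry.
#[derive(Debug)]
struct MediaEntry {
    name: String,
    sha1: [u8; 20],
}

// Hex-suffix the file stem, e.g. foo.jpg -> foo-<40 hex chars>.jpg.
fn add_hash_suffix(name: &str, sha1: &[u8; 20]) -> String {
    let hex: String = sha1.iter().map(|b| format!("{b:02x}")).collect();
    match name.rsplit_once('.') {
        Some((stem, ext)) => format!("{stem}-{hex}.{ext}"),
        None => format!("{name}-{hex}"),
    }
}

// Build the map of files that may need copying. The bool marks whether an
// imported note actually references the file; it starts out false and is
// flipped during note import, so only used media gets copied at the end.
fn prepare_media(
    existing_sha1s: &HashMap<String, [u8; 20]>,
    incoming: Vec<MediaEntry>,
) -> HashMap<String, (bool, MediaEntry)> {
    let mut used_media_entries = HashMap::new();
    for mut entry in incoming {
        match existing_sha1s.get(&entry.name) {
            // Same name, same content: the target already has this file.
            Some(sha1) if *sha1 == entry.sha1 => {}
            // Same name, different content: uniquify with the hash.
            Some(_) => {
                let new_name = add_hash_suffix(&entry.name, &entry.sha1);
                let original_name = std::mem::replace(&mut entry.name, new_name);
                used_media_entries.insert(original_name, (false, entry));
            }
            // Unknown name: import under the original name.
            None => {
                used_media_entries.insert(entry.name.clone(), (false, entry));
            }
        }
    }
    used_media_entries
}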

View file

@@ -2,11 +2,12 @@
 // License: GNU AGPL, version 3 or later; http://www.gnu.org/licenses/agpl.html
 use std::{
+    borrow::Cow,
     collections::{HashMap, HashSet},
-    fs::{self, File},
-    io::{self, Write},
+    fs::File,
+    io::{self},
     mem,
-    path::{Path, PathBuf},
+    path::Path,
     sync::Arc,
 };
@@ -18,14 +19,19 @@ use crate::{
     collection::CollectionBuilder,
     import_export::{
         gather::ExchangeData,
-        package::{media::extract_media_entries, Meta},
+        package::{
+            media::{extract_media_entries, safe_normalized_file_name, SafeMediaEntry},
+            Meta,
+        },
+    },
+    media::{
+        files::{add_hash_suffix_to_file_stem, sha1_of_reader},
+        MediaManager,
     },
-    io::{atomic_rename, tempfile_in_parent_of},
     prelude::*,
     text::replace_media_refs,
 };
 
+#[derive(Debug)]
 struct Context {
     archive: ZipArchive<File>,
     guid_map: HashMap<String, NoteMeta>,
@@ -34,8 +40,11 @@ struct Context {
     existing_notetypes: HashSet<NotetypeId>,
     data: ExchangeData,
     usn: Usn,
-    media_map: HashMap<String, String>,
-    target_media_folder: PathBuf,
+    /// Map of source media files that do not already exist in the target.
+    ///
+    /// original, normalized file name → (referenced by imported material,
+    /// entry with possibly remapped file name)
+    used_media_entries: HashMap<String, (bool, SafeMediaEntry)>,
     conflicting_notes: HashSet<String>,
 }
@@ -60,6 +69,22 @@ impl NoteMeta {
     }
 }
 
+impl SafeMediaEntry {
+    fn with_hash_from_archive(&mut self, archive: &mut ZipArchive<File>) -> Result<()> {
+        if self.sha1 == [0; 20] {
+            let mut reader = self.fetch_file(archive)?;
+            self.sha1 = sha1_of_reader(&mut reader)?;
+        }
+        Ok(())
+    }
+
+    /// Requires sha1 to be set. Returns old file name.
+    fn uniquify_name(&mut self) -> String {
+        let new_name = add_hash_suffix_to_file_stem(&self.name, &self.sha1);
+        mem::replace(&mut self.name, new_name)
+    }
+}
+
 impl Collection {
     pub fn import_apkg(
         &mut self,
@@ -71,18 +96,19 @@ impl Collection {
         let archive = ZipArchive::new(file)?;
         let mut ctx = Context::new(archive, self, search, with_scheduling)?;
 
+        ctx.prepare_media(self)?;
         ctx.prepare_notetypes(self)?;
         ctx.prepare_notes()?;
-        self.insert_data(&ctx.data)
+        self.insert_data(&ctx.data)?;
+        ctx.copy_media(&self.media_folder)?;
+
+        Ok(())
     }
-}
 
-fn build_media_map(archive: &mut ZipArchive<File>) -> Result<HashMap<String, String>> {
-    Ok(extract_media_entries(&Meta::new_legacy(), archive)?
-        .into_iter()
-        .map(|entry| (entry.name, entry.index.to_string()))
-        .collect())
+    fn all_existing_sha1s(&mut self) -> Result<HashMap<String, [u8; 20]>> {
+        let mgr = MediaManager::new(&self.media_folder, &self.media_db)?;
+        mgr.all_checksums(|_| true, &self.log)
+    }
 }
 
 impl ExchangeData {
@@ -111,21 +137,37 @@ impl Context {
         with_scheduling: bool,
     ) -> Result<Self> {
         let data = ExchangeData::gather_from_archive(&mut archive, search, with_scheduling)?;
-        let media_map = build_media_map(&mut archive)?;
 
         Ok(Self {
             archive,
             data,
             guid_map: target_col.storage.note_guid_map()?,
             existing_notes: target_col.storage.get_all_note_ids()?,
             existing_notetypes: target_col.storage.get_all_notetype_ids()?,
-            media_map,
-            target_media_folder: target_col.media_folder.clone(),
             usn: target_col.usn()?,
             conflicting_notes: HashSet::new(),
             remapped_notetypes: HashMap::new(),
+            used_media_entries: HashMap::new(),
         })
     }
 
+    fn prepare_media(&mut self, target_col: &mut Collection) -> Result<()> {
+        let existing_sha1s = target_col.all_existing_sha1s()?;
+        for mut entry in extract_media_entries(&Meta::new_legacy(), &mut self.archive)? {
+            if let Some(other_sha1) = existing_sha1s.get(&entry.name) {
+                entry.with_hash_from_archive(&mut self.archive)?;
+                if entry.sha1 != *other_sha1 {
+                    let original_name = entry.uniquify_name();
+                    self.used_media_entries
+                        .insert(original_name, (false, entry));
+                }
+            } else {
+                self.used_media_entries
+                    .insert(entry.name.clone(), (false, entry));
+            }
+        }
+        Ok(())
+    }
+
     fn prepare_notetypes(&mut self, target_col: &mut Collection) -> Result<()> {
         for notetype in mem::take(&mut self.data.notetypes) {
             if let Some(existing) = target_col.get_notetype(notetype.id)? {
@@ -229,60 +271,28 @@
     }
 
     fn munge_media(&mut self, note: &mut Note) -> Result<()> {
-        let notetype_id = note.notetype_id;
         for field in note.fields_mut() {
-            if let Some(new_field) = self.replace_media_refs_fallible(field, notetype_id)? {
+            if let Some(new_field) = self.replace_media_refs(field) {
                 *field = new_field;
             };
         }
         Ok(())
     }
 
-    fn replace_media_refs_fallible(
-        &mut self,
-        field: &mut String,
-        notetype_id: NotetypeId,
-    ) -> Result<Option<String>> {
-        let mut res = Ok(());
-        let out = replace_media_refs(field, |name| {
-            if res.is_err() {
-                None
-            } else {
-                self.merge_media_maybe_renaming(name, notetype_id)
-                    .unwrap_or_else(|err| {
-                        res = Err(err);
-                        None
-                    })
-            }
-        });
-        res.map(|_| out)
-    }
-
-    fn merge_media_maybe_renaming(
-        &mut self,
-        name: &str,
-        notetype: NotetypeId,
-    ) -> Result<Option<String>> {
-        Ok(if let Some(zip_name) = self.media_map.get(name) {
-            let alternate_name = alternate_media_name(name, notetype);
-            let alternate_path = self.target_media_folder.join(&alternate_name);
-            if alternate_path.exists() {
-                Some(alternate_name)
-            } else {
-                let mut data = Vec::new();
-                io::copy(&mut self.archive.by_name(zip_name)?, &mut data)?;
-                let target_path = self.target_media_folder.join(name);
-                if !target_path.exists() {
-                    write_data_atomically(&data, &target_path)?;
-                    None
-                } else if data == fs::read(target_path)? {
-                    None
-                } else {
-                    write_data_atomically(&data, &alternate_path)?;
-                    Some(alternate_name)
-                }
+    fn replace_media_refs(&mut self, field: &mut String) -> Option<String> {
+        replace_media_refs(field, |name| {
+            if let Ok(normalized) = safe_normalized_file_name(name) {
+                if let Some((used, entry)) = self.used_media_entries.get_mut(normalized.as_ref()) {
+                    *used = true;
+                    if entry.name != name {
+                        // name is not normalized, and/or remapped
+                        return Some(entry.name.clone());
+                    }
+                } else if let Cow::Owned(s) = normalized {
+                    // no entry; might be a reference to an existing file, so ensure normalization
+                    return Some(s);
+                }
             }
-        } else {
             None
         })
     }
@@ -292,20 +302,15 @@ impl Context {
             note_id.0 += 999;
         }
     }
-}
 
-fn write_data_atomically(data: &[u8], path: &Path) -> Result<()> {
-    let mut tempfile = tempfile_in_parent_of(path)?;
-    tempfile.write_all(data)?;
-    atomic_rename(tempfile, path, false)
-}
+    fn copy_media(&mut self, media_folder: &Path) -> Result<()> {
+        for (used, entry) in self.used_media_entries.values() {
+            if *used {
+                entry.copy_from_archive(&mut self.archive, media_folder)?;
+            }
+        }
+        Ok(())
+    }
 
-fn alternate_media_name(name: &str, notetype_id: NotetypeId) -> String {
-    let (stem, dot, extension) = name
-        .rsplit_once('.')
-        .map(|(stem, ext)| (stem, ".", ext))
-        .unwrap_or((name, "", ""));
-    format!("{stem}_{notetype_id}{dot}{extension}")
 }
 
 impl Notetype {

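The replacement closure above no longer needs to do any I/O, which is why the fallible plumbing (replace_media_refs_fallible, merge_media_maybe_renaming) disappears: all hashing and renaming now happens up front in prepare_media, and the closure only consults the prebuilt map. The text::replace_media_refs helper itself is not part of this diff; a simplified model of the callback contract it implies — regex-based and limited to the <img> form, both assumptions — might look like:

use regex::{Captures, Regex};

// Hypothetical, reduced model: the closure returns Some(new_name) to
// rewrite a media reference, or None to leave it untouched.
fn replace_media_refs(
    field: &str,
    mut replacer: impl FnMut(&str) -> Option<String>,
) -> Option<String> {
    let re = Regex::new(r#"<img src="([^"]+)">"#).unwrap();
    let mut changed = false;
    let out = re.replace_all(field, |caps: &Captures| {
        match replacer(&caps[1]) {
            // Some(new_name): rewrite the reference in place.
            Some(new_name) => {
                changed = true;
                format!(r#"<img src="{new_name}">"#)
            }
            // None: keep the original reference untouched.
            None => caps[0].to_string(),
        }
    });
    // Mirror the Option-returning contract: None means "field unchanged".
    changed.then(|| out.into_owned())
}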
View file

@@ -10,19 +10,22 @@ use std::{
 };
 
 use prost::Message;
+use tempfile::NamedTempFile;
 use zip::{read::ZipFile, ZipArchive};
 use zstd::stream::copy_decode;
 
 use super::{MediaEntries, MediaEntry, Meta};
 use crate::{
-    error::ImportError, io::filename_is_safe, media::files::normalize_filename, prelude::*,
+    error::ImportError,
+    io::{atomic_rename, filename_is_safe},
+    media::files::normalize_filename,
+    prelude::*,
 };
 
 /// Like [MediaEntry], but with a safe filename and set zip filename.
 pub(super) struct SafeMediaEntry {
     pub(super) name: String,
     pub(super) size: u32,
-    #[allow(dead_code)]
     pub(super) sha1: [u8; 20],
     pub(super) index: usize,
 }
@@ -98,6 +101,17 @@ impl SafeMediaEntry {
             .map(|metadata| metadata.len() as u64 == self_size)
             .unwrap_or_default()
     }
+
+    pub(super) fn copy_from_archive(
+        &self,
+        archive: &mut ZipArchive<File>,
+        target_folder: &Path,
+    ) -> Result<()> {
+        let mut file = self.fetch_file(archive)?;
+        let mut tempfile = NamedTempFile::new_in(target_folder)?;
+        io::copy(&mut file, &mut tempfile)?;
+        atomic_rename(tempfile, &self.file_path(target_folder), false)
+    }
 }
 
 pub(super) fn extract_media_entries(
@@ -113,7 +127,7 @@ pub(super) fn extract_media_entries(
     }
 }
 
-fn safe_normalized_file_name(name: &str) -> Result<Cow<str>> {
+pub(super) fn safe_normalized_file_name(name: &str) -> Result<Cow<str>> {
     if !filename_is_safe(name) {
         Err(AnkiError::ImportError(ImportError::Corrupt))
     } else {
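copy_from_archive streams the zip entry into a NamedTempFile created in the target folder and then hands it to atomic_rename, whose definition is not shown in this diff. Assuming that helper reduces to a same-filesystem rename, the underlying pattern can be sketched with the tempfile crate alone (write_atomically is a hypothetical name):

use std::{io, io::Write, path::Path};
use tempfile::NamedTempFile;

fn write_atomically(data: &[u8], target: &Path) -> io::Result<()> {
    // Create the temp file next to the target so the final rename stays on
    // one filesystem, which is what makes it atomic.
    let dir = target.parent().expect("target has a parent directory");
    let mut tempfile = NamedTempFile::new_in(dir)?;
    tempfile.write_all(data)?;
    // persist() renames the temp file over the target in a single step.
    tempfile.persist(target).map_err(|e| e.error)?;
    Ok(())
}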

View file

@@ -5,7 +5,7 @@ use std::{collections::HashMap, path::Path};
 
 use rusqlite::{params, Connection, OptionalExtension, Row, Statement};
 
-use crate::error::Result;
+use crate::prelude::*;
 
 fn trace(s: &str) {
     println!("sql: {}", s)
@@ -222,6 +222,14 @@ delete from media where fname=?"
         Ok(map?)
     }
 
+    /// Error if any checksums are missing or broken.
+    pub(super) fn all_checksums(&mut self) -> Result<HashMap<String, [u8; 20]>> {
+        self.db
+            .prepare("SELECT fname, csum FROM media")?
+            .query_and_then([], row_to_name_and_checksum)?
+            .collect()
+    }
+
     pub(super) fn force_resync(&mut self) -> Result<()> {
         self.db
             .execute_batch("delete from media; update meta set lastUsn = 0, dirMod = 0")
@@ -250,6 +258,15 @@ fn row_to_entry(row: &Row) -> rusqlite::Result<MediaEntry> {
     })
 }
 
+fn row_to_name_and_checksum(row: &Row) -> Result<(String, [u8; 20])> {
+    let file_name = row.get(0)?;
+    let sha1_str: String = row.get(1)?;
+    let mut sha1 = [0; 20];
+    hex::decode_to_slice(sha1_str, &mut sha1)
+        .map_err(|_| AnkiError::invalid_input(format!("bad media checksum: {file_name}")))?;
+    Ok((file_name, sha1))
+}
+
 #[cfg(test)]
 mod test {
     use tempfile::NamedTempFile;
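For reference, the fixed-size decode in row_to_name_and_checksum rejects anything that is not exactly 40 hex characters. A standalone illustration with the hex crate (the checksum shown is the SHA-1 of "hello world"):

fn main() {
    let csum = "2aae6c35c94fcfb415dbe95f408b9ce91ee846ed"; // sha1 of "hello world"
    let mut sha1 = [0u8; 20];
    // Fails on wrong length or non-hex input, which is what the
    // invalid_input error above guards against.
    hex::decode_to_slice(csum, &mut sha1).expect("valid 40-char hex");
    assert_eq!(hex::encode(sha1), csum);
}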

View file

@@ -194,7 +194,7 @@ where
 }
 
 /// Convert foo.jpg into foo-abcde12345679.jpg
-fn add_hash_suffix_to_file_stem(fname: &str, hash: &[u8; 20]) -> String {
+pub(crate) fn add_hash_suffix_to_file_stem(fname: &str, hash: &[u8; 20]) -> String {
     // when appending a hash to make unique, it will be 40 bytes plus the hyphen.
     let max_len = MAX_FILENAME_LENGTH - 40 - 1;
@@ -283,10 +283,15 @@ fn existing_file_sha1(path: &Path) -> io::Result<Option<[u8; 20]>> {
 /// Return the SHA1 of a file, failing if it doesn't exist.
 pub(crate) fn sha1_of_file(path: &Path) -> io::Result<[u8; 20]> {
     let mut file = fs::File::open(path)?;
+    sha1_of_reader(&mut file)
+}
+
+/// Return the SHA1 of a stream.
+pub(crate) fn sha1_of_reader(reader: &mut impl Read) -> io::Result<[u8; 20]> {
     let mut hasher = Sha1::new();
     let mut buf = [0; 64 * 1024];
     loop {
-        match file.read(&mut buf) {
+        match reader.read(&mut buf) {
             Ok(0) => break,
             Ok(n) => hasher.update(&buf[0..n]),
             Err(e) => {
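Factoring the hashing loop out of sha1_of_file means checksums can now be computed from any reader — e.g. a zip entry — without a temporary file. A small usage sketch, where Cursor stands in for the archive stream and checksum_in_memory is a hypothetical caller:

use std::io::Cursor;

fn checksum_in_memory() -> std::io::Result<()> {
    // Cursor stands in for e.g. a decompressed zip entry.
    let mut reader = Cursor::new(b"hello world".to_vec());
    let digest = sha1_of_reader(&mut reader)?;
    assert_eq!(hex::encode(digest), "2aae6c35c94fcfb415dbe95f408b9ce91ee846ed");
    Ok(())
}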

View file

@@ -3,19 +3,21 @@
 use std::{
     borrow::Cow,
+    collections::HashMap,
     path::{Path, PathBuf},
 };
 
 use rusqlite::Connection;
 use slog::Logger;
 
+use self::changetracker::ChangeTracker;
 use crate::{
-    error::Result,
     media::{
         database::{open_or_create, MediaDatabaseContext, MediaEntry},
         files::{add_data_to_folder_uniquely, mtime_as_i64, remove_files, sha1_of_data},
         sync::{MediaSyncProgress, MediaSyncer},
     },
+    prelude::*,
 };
 
 pub mod changetracker;
@@ -153,4 +155,14 @@ impl MediaManager {
     pub fn dbctx(&self) -> MediaDatabaseContext {
         MediaDatabaseContext::new(&self.db)
     }
+
+    pub fn all_checksums(
+        &self,
+        progress: impl FnMut(usize) -> bool,
+        log: &Logger,
+    ) -> Result<HashMap<String, [u8; 20]>> {
+        let mut dbctx = self.dbctx();
+        ChangeTracker::new(&self.media_folder, progress, log).register_changes(&mut dbctx)?;
+        dbctx.all_checksums()
+    }
 }
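all_checksums first lets the ChangeTracker re-register any files that changed on disk, then reads the now-accurate checksums out of the media DB; the progress closure receives a running file count and returns whether to continue, which is why the import code above passes |_| true. A hypothetical call site (paths, logger and the MediaManager::new signature are assumptions here):

use std::{collections::HashMap, path::Path};
use slog::Logger;

fn hash_media(
    media_folder: &Path,
    media_db: &Path,
    log: &Logger,
) -> Result<HashMap<String, [u8; 20]>> {
    let mgr = MediaManager::new(media_folder, media_db)?;
    mgr.all_checksums(
        |count| {
            // Progress callback: count of files processed so far.
            // Returning false would abort the folder scan.
            println!("hashed {count} media files");
            true
        },
        log,
    )
}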