Prepare media based on checksums

- Ensure all existing media files are hashed.
- Hash incoming files during preparation to detect conflicts.
- Uniquify names of conflicting files with hash (not notetype id).
- Mark media files as used while importing notes.
- Finally copy used media.
RumovZ 2022-04-07 12:45:02 +02:00
parent a0085e7fd4
commit 7583a7e6b3
5 changed files with 136 additions and 83 deletions
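Taken together, the bullets above amount to a three-way decision per incoming file. A minimal standalone sketch of that rule (not the commit's actual code; `Disposition`, `decide` and `add_hash_suffix` are illustrative stand-ins for the logic in `prepare_media()` and `add_hash_suffix_to_file_stem()` in the diffs below):

use std::collections::HashMap;

/// Illustrative stand-in for the per-file decision in `prepare_media()`.
enum Disposition {
    /// No target file with this name: import under the original name.
    ImportAsIs,
    /// An identical file already exists in the target: nothing to copy.
    AlreadyPresent,
    /// A different target file has this name: import under a hash-suffixed
    /// name and remap references while importing notes.
    ImportRenamed(String),
}

fn decide(existing: &HashMap<String, [u8; 20]>, name: &str, sha1: [u8; 20]) -> Disposition {
    match existing.get(name) {
        None => Disposition::ImportAsIs,
        Some(target) if *target == sha1 => Disposition::AlreadyPresent,
        Some(_) => Disposition::ImportRenamed(add_hash_suffix(name, &sha1)),
    }
}

/// Simplified version of `add_hash_suffix_to_file_stem()` (no length capping).
fn add_hash_suffix(name: &str, sha1: &[u8; 20]) -> String {
    let hex: String = sha1.iter().map(|b| format!("{b:02x}")).collect();
    match name.rsplit_once('.') {
        Some((stem, ext)) => format!("{stem}-{hex}.{ext}"),
        None => format!("{name}-{hex}"),
    }
}

fn main() {
    let existing = HashMap::from([("cat.jpg".to_string(), [0u8; 20])]);
    // An incoming "cat.jpg" with a different checksum is renamed, not clobbered.
    if let Disposition::ImportRenamed(new_name) = decide(&existing, "cat.jpg", [1; 20]) {
        println!("{new_name}");
    }
}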

View file

@@ -2,11 +2,12 @@
// License: GNU AGPL, version 3 or later; http://www.gnu.org/licenses/agpl.html
use std::{
borrow::Cow,
collections::{HashMap, HashSet},
fs::{self, File},
io::{self, Write},
fs::File,
io::{self},
mem,
path::{Path, PathBuf},
path::Path,
sync::Arc,
};
@@ -18,14 +19,19 @@ use crate::{
collection::CollectionBuilder,
import_export::{
gather::ExchangeData,
package::{media::extract_media_entries, Meta},
package::{
media::{extract_media_entries, safe_normalized_file_name, SafeMediaEntry},
Meta,
},
},
media::{
files::{add_hash_suffix_to_file_stem, sha1_of_reader},
MediaManager,
},
io::{atomic_rename, tempfile_in_parent_of},
prelude::*,
text::replace_media_refs,
};
#[derive(Debug)]
struct Context {
archive: ZipArchive<File>,
guid_map: HashMap<String, NoteMeta>,
@@ -34,8 +40,11 @@ struct Context {
existing_notetypes: HashSet<NotetypeId>,
data: ExchangeData,
usn: Usn,
media_map: HashMap<String, String>,
target_media_folder: PathBuf,
/// Map of source media files that do not already exist in the target.
///
/// original, normalized file name → (referenced by imported material,
/// entry with possibly remapped file name)
used_media_entries: HashMap<String, (bool, SafeMediaEntry)>,
conflicting_notes: HashSet<String>,
}
@@ -60,6 +69,22 @@ impl NoteMeta {
}
}
impl SafeMediaEntry {
fn with_hash_from_archive(&mut self, archive: &mut ZipArchive<File>) -> Result<()> {
if self.sha1 == [0; 20] {
let mut reader = self.fetch_file(archive)?;
self.sha1 = sha1_of_reader(&mut reader)?;
}
Ok(())
}
/// Requires sha1 to be set. Returns old file name.
fn uniquify_name(&mut self) -> String {
let new_name = add_hash_suffix_to_file_stem(&self.name, &self.sha1);
mem::replace(&mut self.name, new_name)
}
}
impl Collection {
pub fn import_apkg(
&mut self,
@@ -71,18 +96,19 @@ impl Collection {
let archive = ZipArchive::new(file)?;
let mut ctx = Context::new(archive, self, search, with_scheduling)?;
ctx.prepare_media(self)?;
ctx.prepare_notetypes(self)?;
ctx.prepare_notes()?;
self.insert_data(&ctx.data)
self.insert_data(&ctx.data)?;
ctx.copy_media(&self.media_folder)?;
Ok(())
}
}
fn build_media_map(archive: &mut ZipArchive<File>) -> Result<HashMap<String, String>> {
Ok(extract_media_entries(&Meta::new_legacy(), archive)?
.into_iter()
.map(|entry| (entry.name, entry.index.to_string()))
.collect())
fn all_existing_sha1s(&mut self) -> Result<HashMap<String, [u8; 20]>> {
let mgr = MediaManager::new(&self.media_folder, &self.media_db)?;
mgr.all_checksums(|_| true, &self.log)
}
}
impl ExchangeData {
@@ -111,21 +137,37 @@ impl Context {
with_scheduling: bool,
) -> Result<Self> {
let data = ExchangeData::gather_from_archive(&mut archive, search, with_scheduling)?;
let media_map = build_media_map(&mut archive)?;
Ok(Self {
archive,
data,
guid_map: target_col.storage.note_guid_map()?,
existing_notes: target_col.storage.get_all_note_ids()?,
existing_notetypes: target_col.storage.get_all_notetype_ids()?,
media_map,
target_media_folder: target_col.media_folder.clone(),
usn: target_col.usn()?,
conflicting_notes: HashSet::new(),
remapped_notetypes: HashMap::new(),
used_media_entries: HashMap::new(),
})
}
fn prepare_media(&mut self, target_col: &mut Collection) -> Result<()> {
let existing_sha1s = target_col.all_existing_sha1s()?;
for mut entry in extract_media_entries(&Meta::new_legacy(), &mut self.archive)? {
if let Some(other_sha1) = existing_sha1s.get(&entry.name) {
entry.with_hash_from_archive(&mut self.archive)?;
if entry.sha1 != *other_sha1 {
let original_name = entry.uniquify_name();
self.used_media_entries
.insert(original_name, (false, entry));
}
} else {
self.used_media_entries
.insert(entry.name.clone(), (false, entry));
}
}
Ok(())
}
fn prepare_notetypes(&mut self, target_col: &mut Collection) -> Result<()> {
for notetype in mem::take(&mut self.data.notetypes) {
if let Some(existing) = target_col.get_notetype(notetype.id)? {
@@ -229,60 +271,28 @@ impl Context {
}
fn munge_media(&mut self, note: &mut Note) -> Result<()> {
let notetype_id = note.notetype_id;
for field in note.fields_mut() {
if let Some(new_field) = self.replace_media_refs_fallible(field, notetype_id)? {
if let Some(new_field) = self.replace_media_refs(field) {
*field = new_field;
};
}
Ok(())
}
fn replace_media_refs_fallible(
&mut self,
field: &mut String,
notetype_id: NotetypeId,
) -> Result<Option<String>> {
let mut res = Ok(());
let out = replace_media_refs(field, |name| {
if res.is_err() {
None
} else {
self.merge_media_maybe_renaming(name, notetype_id)
.unwrap_or_else(|err| {
res = Err(err);
None
})
fn replace_media_refs(&mut self, field: &mut String) -> Option<String> {
replace_media_refs(field, |name| {
if let Ok(normalized) = safe_normalized_file_name(name) {
if let Some((used, entry)) = self.used_media_entries.get_mut(normalized.as_ref()) {
*used = true;
if entry.name != name {
// name is not normalized, and/or remapped
return Some(entry.name.clone());
}
});
res.map(|_| out)
}
fn merge_media_maybe_renaming(
&mut self,
name: &str,
notetype: NotetypeId,
) -> Result<Option<String>> {
Ok(if let Some(zip_name) = self.media_map.get(name) {
let alternate_name = alternate_media_name(name, notetype);
let alternate_path = self.target_media_folder.join(&alternate_name);
if alternate_path.exists() {
Some(alternate_name)
} else {
let mut data = Vec::new();
io::copy(&mut self.archive.by_name(zip_name)?, &mut data)?;
let target_path = self.target_media_folder.join(name);
if !target_path.exists() {
write_data_atomically(&data, &target_path)?;
None
} else if data == fs::read(target_path)? {
None
} else {
write_data_atomically(&data, &alternate_path)?;
Some(alternate_name)
} else if let Cow::Owned(s) = normalized {
// no entry; might be a reference to an existing file, so ensure normalization
return Some(s);
}
}
} else {
None
})
}
@@ -292,20 +302,15 @@ impl Context {
note_id.0 += 999;
}
}
}
fn write_data_atomically(data: &[u8], path: &Path) -> Result<()> {
let mut tempfile = tempfile_in_parent_of(path)?;
tempfile.write_all(data)?;
atomic_rename(tempfile, path, false)
}
fn alternate_media_name(name: &str, notetype_id: NotetypeId) -> String {
let (stem, dot, extension) = name
.rsplit_once('.')
.map(|(stem, ext)| (stem, ".", ext))
.unwrap_or((name, "", ""));
format!("{stem}_{notetype_id}{dot}{extension}")
fn copy_media(&mut self, media_folder: &Path) -> Result<()> {
for (used, entry) in self.used_media_entries.values() {
if *used {
entry.copy_from_archive(&mut self.archive, media_folder)?;
}
}
Ok(())
}
}
impl Notetype {

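The rewritten `replace_media_refs()` closure above concentrates this file's reference-remapping policy into three branches. A self-contained approximation, assuming a toy `normalize()` that merely strips whitespace where the real `safe_normalized_file_name()` NFC-normalizes and rejects unsafe names:

use std::borrow::Cow;
use std::collections::HashMap;

/// Toy stand-in for `safe_normalized_file_name()`.
fn normalize(name: &str) -> Cow<str> {
    if name.contains(char::is_whitespace) {
        Cow::Owned(name.split_whitespace().collect())
    } else {
        Cow::Borrowed(name)
    }
}

/// Mirrors the closure passed to `replace_media_refs()`: returns a
/// replacement for a referenced file name, or `None` to leave it unchanged.
fn remap_ref(
    // normalized source name → (used flag, possibly remapped final name)
    entries: &mut HashMap<String, (bool, String)>,
    name: &str,
) -> Option<String> {
    let normalized = normalize(name);
    if let Some((used, final_name)) = entries.get_mut(normalized.as_ref()) {
        // mark as used, so only referenced files get copied at the end
        *used = true;
        if final_name.as_str() != name {
            // reference was not normalized, and/or the file was renamed
            return Some(final_name.clone());
        }
    } else if let Cow::Owned(s) = normalized {
        // no entry; may point at an existing target file, so fix normalization
        return Some(s);
    }
    None
}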
View file

@@ -10,19 +10,22 @@ use std::{
};
use prost::Message;
use tempfile::NamedTempFile;
use zip::{read::ZipFile, ZipArchive};
use zstd::stream::copy_decode;
use super::{MediaEntries, MediaEntry, Meta};
use crate::{
error::ImportError, io::filename_is_safe, media::files::normalize_filename, prelude::*,
error::ImportError,
io::{atomic_rename, filename_is_safe},
media::files::normalize_filename,
prelude::*,
};
/// Like [MediaEntry], but with a safe filename and set zip filename.
pub(super) struct SafeMediaEntry {
pub(super) name: String,
pub(super) size: u32,
#[allow(dead_code)]
pub(super) sha1: [u8; 20],
pub(super) index: usize,
}
@@ -98,6 +101,17 @@ impl SafeMediaEntry {
.map(|metadata| metadata.len() as u64 == self_size)
.unwrap_or_default()
}
pub(super) fn copy_from_archive(
&self,
archive: &mut ZipArchive<File>,
target_folder: &Path,
) -> Result<()> {
let mut file = self.fetch_file(archive)?;
let mut tempfile = NamedTempFile::new_in(target_folder)?;
io::copy(&mut file, &mut tempfile)?;
atomic_rename(tempfile, &self.file_path(target_folder), false)
}
}
pub(super) fn extract_media_entries(
@@ -113,7 +127,7 @@ pub(super) fn extract_media_entries(
}
}
fn safe_normalized_file_name(name: &str) -> Result<Cow<str>> {
pub(super) fn safe_normalized_file_name(name: &str) -> Result<Cow<str>> {
if !filename_is_safe(name) {
Err(AnkiError::ImportError(ImportError::Corrupt))
} else {

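The new `copy_from_archive()` above uses the write-to-temp-then-rename pattern, so a crash can never leave a truncated media file under its final name. A minimal sketch of the same pattern using only the `tempfile` crate, with `persist()` standing in for Anki's `atomic_rename` helper:

use std::{io, path::Path};
use tempfile::NamedTempFile;

/// Stream `src` into `target` atomically: write a temp file in the same
/// directory (so the final rename stays on one filesystem), then rename.
fn copy_atomically(src: &mut impl io::Read, target: &Path) -> io::Result<()> {
    let dir = target.parent().expect("target has a parent directory");
    let mut temp = NamedTempFile::new_in(dir)?;
    io::copy(src, &mut temp)?;
    temp.persist(target).map_err(|e| e.error)?;
    Ok(())
}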
View file

@@ -5,7 +5,7 @@ use std::{collections::HashMap, path::Path};
use rusqlite::{params, Connection, OptionalExtension, Row, Statement};
use crate::error::Result;
use crate::prelude::*;
fn trace(s: &str) {
println!("sql: {}", s)
@@ -222,6 +222,14 @@ delete from media where fname=?"
Ok(map?)
}
/// Errors if any checksums are missing or broken.
pub(super) fn all_checksums(&mut self) -> Result<HashMap<String, [u8; 20]>> {
self.db
.prepare("SELECT fname, csum FROM media")?
.query_and_then([], row_to_name_and_checksum)?
.collect()
}
pub(super) fn force_resync(&mut self) -> Result<()> {
self.db
.execute_batch("delete from media; update meta set lastUsn = 0, dirMod = 0")
@@ -250,6 +258,15 @@ fn row_to_entry(row: &Row) -> rusqlite::Result<MediaEntry> {
})
}
fn row_to_name_and_checksum(row: &Row) -> Result<(String, [u8; 20])> {
let file_name = row.get(0)?;
let sha1_str: String = row.get(1)?;
let mut sha1 = [0; 20];
hex::decode_to_slice(sha1_str, &mut sha1)
.map_err(|_| AnkiError::invalid_input(format!("bad media checksum: {file_name}")))?;
Ok((file_name, sha1))
}
#[cfg(test)]
mod test {
use tempfile::NamedTempFile;

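`row_to_name_and_checksum()` above turns the 40-character hex string stored in the `media` table into a fixed-size array, treating missing or malformed checksums as hard errors, in line with the commit's goal of being able to rely on checksums. A standalone version of just the parsing step, assuming the `hex` crate:

/// Decode a 40-char hex checksum into a fixed array; error out on bad input
/// rather than silently skipping the file.
fn parse_checksum(file_name: &str, hex_str: &str) -> Result<[u8; 20], String> {
    let mut sha1 = [0u8; 20];
    hex::decode_to_slice(hex_str, &mut sha1)
        .map_err(|_| format!("bad media checksum: {file_name}"))?;
    Ok(sha1)
}

fn main() {
    assert!(parse_checksum("a.jpg", "2fd4e1c67a2d28fced849ee1bb76e7391b93eb12").is_ok());
    assert!(parse_checksum("b.jpg", "not-a-checksum").is_err()); // bad length/chars
}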
View file

@@ -194,7 +194,7 @@ where
}
/// Convert foo.jpg into foo-abcde12345679.jpg
fn add_hash_suffix_to_file_stem(fname: &str, hash: &[u8; 20]) -> String {
pub(crate) fn add_hash_suffix_to_file_stem(fname: &str, hash: &[u8; 20]) -> String {
// when appending a hash to make unique, it will be 40 bytes plus the hyphen.
let max_len = MAX_FILENAME_LENGTH - 40 - 1;
@@ -283,10 +283,15 @@ fn existing_file_sha1(path: &Path) -> io::Result<Option<[u8; 20]>> {
/// Return the SHA1 of a file, failing if it doesn't exist.
pub(crate) fn sha1_of_file(path: &Path) -> io::Result<[u8; 20]> {
let mut file = fs::File::open(path)?;
sha1_of_reader(&mut file)
}
/// Return the SHA1 of a stream.
pub(crate) fn sha1_of_reader(reader: &mut impl Read) -> io::Result<[u8; 20]> {
let mut hasher = Sha1::new();
let mut buf = [0; 64 * 1024];
loop {
match file.read(&mut buf) {
match reader.read(&mut buf) {
Ok(0) => break,
Ok(n) => hasher.update(&buf[0..n]),
Err(e) => {

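The refactor above splits `sha1_of_file()` so the new `sha1_of_reader()` can hash zip entries directly, without extracting them to disk first. A self-contained equivalent using the RustCrypto `sha1` crate (the error arm truncated in the diff presumably retries on interrupted reads; this sketch simply propagates all errors with `?`):

use std::io::{self, Read};
use sha1::{Digest, Sha1};

/// Hash any reader in 64 KiB chunks, e.g. a zip entry streamed from the
/// archive rather than a file on disk.
fn sha1_of_reader(reader: &mut impl Read) -> io::Result<[u8; 20]> {
    let mut hasher = Sha1::new();
    let mut buf = [0u8; 64 * 1024];
    loop {
        match reader.read(&mut buf)? {
            0 => break,
            n => hasher.update(&buf[..n]),
        }
    }
    Ok(hasher.finalize().into())
}

fn main() -> io::Result<()> {
    let mut data: &[u8] = b"hello world";
    let digest = sha1_of_reader(&mut data)?;
    println!("{}", digest.map(|b| format!("{b:02x}")).concat());
    Ok(())
}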
View file

@@ -3,19 +3,21 @@
use std::{
borrow::Cow,
collections::HashMap,
path::{Path, PathBuf},
};
use rusqlite::Connection;
use slog::Logger;
use self::changetracker::ChangeTracker;
use crate::{
error::Result,
media::{
database::{open_or_create, MediaDatabaseContext, MediaEntry},
files::{add_data_to_folder_uniquely, mtime_as_i64, remove_files, sha1_of_data},
sync::{MediaSyncProgress, MediaSyncer},
},
prelude::*,
};
pub mod changetracker;
@@ -153,4 +155,14 @@ impl MediaManager {
pub fn dbctx(&self) -> MediaDatabaseContext {
MediaDatabaseContext::new(&self.db)
}
pub fn all_checksums(
&self,
progress: impl FnMut(usize) -> bool,
log: &Logger,
) -> Result<HashMap<String, [u8; 20]>> {
let mut dbctx = self.dbctx();
ChangeTracker::new(&self.media_folder, progress, log).register_changes(&mut dbctx)?;
dbctx.all_checksums()
}
}
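`all_checksums()` above deliberately runs `ChangeTracker` first, so unchanged files are served from the media DB instead of being re-hashed on every import. For contrast, a naive standalone version that walks the folder and hashes everything each time (the cost the DB-backed version avoids), again assuming the RustCrypto `sha1` crate:

use std::{collections::HashMap, fs, io::Read, path::Path};
use sha1::{Digest, Sha1};

/// Build a name → sha1 map by re-hashing every file; correct, but O(total
/// media size) on every call.
fn checksums_by_walking(folder: &Path) -> std::io::Result<HashMap<String, [u8; 20]>> {
    let mut map = HashMap::new();
    for dir_entry in fs::read_dir(folder)? {
        let dir_entry = dir_entry?;
        if !dir_entry.file_type()?.is_file() {
            continue;
        }
        let mut file = fs::File::open(dir_entry.path())?;
        let mut hasher = Sha1::new();
        let mut buf = [0u8; 64 * 1024];
        loop {
            match file.read(&mut buf)? {
                0 => break,
                n => hasher.update(&buf[..n]),
            }
        }
        map.insert(
            dir_entry.file_name().to_string_lossy().into_owned(),
            hasher.finalize().into(),
        );
    }
    Ok(map)
}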