Prepare media based on checksums

- Ensure all existing media files are hashed.
- Hash incoming files during preparation to detect conflicts.
- Uniquify names of conflicting files with hash (not notetype id).
- Mark media files as used while importing notes.
- Finally copy used media.
parent a0085e7fd4
commit 7583a7e6b3

5 changed files with 136 additions and 83 deletions
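The commit message describes a five-step pipeline. A rough standalone sketch of that flow (illustrative Rust only, not code from this commit: std's DefaultHasher stands in for the 20-byte SHA-1, and every name here is invented):

use std::collections::{hash_map::DefaultHasher, HashMap};
use std::hash::Hasher;

// Stand-in for the 20-byte SHA-1 the real code uses.
fn checksum(data: &[u8]) -> u64 {
    let mut h = DefaultHasher::new();
    h.write(data);
    h.finish()
}

// foo.jpg -> foo-<hash>.jpg, in the spirit of add_hash_suffix_to_file_stem.
fn uniquify(name: &str, hash: u64) -> String {
    match name.rsplit_once('.') {
        Some((stem, ext)) => format!("{stem}-{hash:016x}.{ext}"),
        None => format!("{name}-{hash:016x}"),
    }
}

fn main() {
    // 1. Checksums of everything already in the target media folder.
    let existing = HashMap::from([("a.jpg", checksum(b"old"))]);
    // 2. Hash incoming archive files to detect conflicts up front.
    let incoming = [("a.jpg", b"new".as_slice()), ("b.jpg", b"x".as_slice())];

    // original name -> (used flag, name to copy under)
    let mut entries: HashMap<String, (bool, String)> = HashMap::new();
    for (name, data) in incoming {
        match existing.get(name) {
            // Same name and content: nothing to copy, references stay valid.
            Some(&sum) if sum == checksum(data) => {}
            // 3. Same name, different content: store under a hash-suffixed name.
            Some(_) => {
                entries.insert(name.into(), (false, uniquify(name, checksum(data))));
            }
            // Entirely new file: keep its name.
            None => {
                entries.insert(name.into(), (false, name.into()));
            }
        }
    }

    // 4. While notes are imported, referenced files get flagged as used.
    entries.get_mut("a.jpg").unwrap().0 = true;

    // 5. Finally, copy only the used entries.
    for (used, final_name) in entries.values() {
        if *used {
            println!("copy archive data to {final_name}");
        }
    }
}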
@@ -2,11 +2,12 @@
 // License: GNU AGPL, version 3 or later; http://www.gnu.org/licenses/agpl.html

 use std::{
+    borrow::Cow,
     collections::{HashMap, HashSet},
-    fs::{self, File},
-    io::{self, Write},
+    fs::File,
+    io::{self},
     mem,
-    path::{Path, PathBuf},
+    path::Path,
     sync::Arc,
 };
@@ -18,14 +19,19 @@ use crate::{
     collection::CollectionBuilder,
     import_export::{
         gather::ExchangeData,
-        package::{media::extract_media_entries, Meta},
+        package::{
+            media::{extract_media_entries, safe_normalized_file_name, SafeMediaEntry},
+            Meta,
+        },
     },
-    io::{atomic_rename, tempfile_in_parent_of},
+    media::{
+        files::{add_hash_suffix_to_file_stem, sha1_of_reader},
+        MediaManager,
+    },
     prelude::*,
     text::replace_media_refs,
 };

 #[derive(Debug)]
 struct Context {
     archive: ZipArchive<File>,
     guid_map: HashMap<String, NoteMeta>,
@@ -34,8 +40,11 @@ struct Context {
     existing_notetypes: HashSet<NotetypeId>,
     data: ExchangeData,
     usn: Usn,
-    media_map: HashMap<String, String>,
-    target_media_folder: PathBuf,
+    /// Map of source media files that do not already exist in the target.
+    ///
+    /// original, normalized file name → (referenced on import,
+    /// entry with possibly remapped file name)
+    used_media_entries: HashMap<String, (bool, SafeMediaEntry)>,
     conflicting_notes: HashSet<String>,
 }
@@ -60,6 +69,22 @@ impl NoteMeta {
     }
 }

+impl SafeMediaEntry {
+    fn with_hash_from_archive(&mut self, archive: &mut ZipArchive<File>) -> Result<()> {
+        if self.sha1 == [0; 20] {
+            let mut reader = self.fetch_file(archive)?;
+            self.sha1 = sha1_of_reader(&mut reader)?;
+        }
+        Ok(())
+    }
+
+    /// Requires sha1 to be set. Returns old file name.
+    fn uniquify_name(&mut self) -> String {
+        let new_name = add_hash_suffix_to_file_stem(&self.name, &self.sha1);
+        mem::replace(&mut self.name, new_name)
+    }
+}
+
 impl Collection {
     pub fn import_apkg(
         &mut self,
@@ -71,18 +96,19 @@
         let archive = ZipArchive::new(file)?;

         let mut ctx = Context::new(archive, self, search, with_scheduling)?;
+        ctx.prepare_media(self)?;
         ctx.prepare_notetypes(self)?;
         ctx.prepare_notes()?;

-        self.insert_data(&ctx.data)
+        self.insert_data(&ctx.data)?;
+        ctx.copy_media(&self.media_folder)?;
+        Ok(())
     }
-}
-
-fn build_media_map(archive: &mut ZipArchive<File>) -> Result<HashMap<String, String>> {
-    Ok(extract_media_entries(&Meta::new_legacy(), archive)?
-        .into_iter()
-        .map(|entry| (entry.name, entry.index.to_string()))
-        .collect())
+
+    fn all_existing_sha1s(&mut self) -> Result<HashMap<String, [u8; 20]>> {
+        let mgr = MediaManager::new(&self.media_folder, &self.media_db)?;
+        mgr.all_checksums(|_| true, &self.log)
+    }
 }

 impl ExchangeData {
@@ -111,21 +137,37 @@ impl Context {
         with_scheduling: bool,
     ) -> Result<Self> {
         let data = ExchangeData::gather_from_archive(&mut archive, search, with_scheduling)?;
-        let media_map = build_media_map(&mut archive)?;
         Ok(Self {
             archive,
             data,
             guid_map: target_col.storage.note_guid_map()?,
             existing_notes: target_col.storage.get_all_note_ids()?,
             existing_notetypes: target_col.storage.get_all_notetype_ids()?,
-            media_map,
-            target_media_folder: target_col.media_folder.clone(),
             usn: target_col.usn()?,
             conflicting_notes: HashSet::new(),
             remapped_notetypes: HashMap::new(),
+            used_media_entries: HashMap::new(),
         })
     }

+    fn prepare_media(&mut self, target_col: &mut Collection) -> Result<()> {
+        let existing_sha1s = target_col.all_existing_sha1s()?;
+        for mut entry in extract_media_entries(&Meta::new_legacy(), &mut self.archive)? {
+            if let Some(other_sha1) = existing_sha1s.get(&entry.name) {
+                entry.with_hash_from_archive(&mut self.archive)?;
+                if entry.sha1 != *other_sha1 {
+                    let original_name = entry.uniquify_name();
+                    self.used_media_entries
+                        .insert(original_name, (false, entry));
+                }
+            } else {
+                self.used_media_entries
+                    .insert(entry.name.clone(), (false, entry));
+            }
+        }
+        Ok(())
+    }
+
     fn prepare_notetypes(&mut self, target_col: &mut Collection) -> Result<()> {
         for notetype in mem::take(&mut self.data.notetypes) {
             if let Some(existing) = target_col.get_notetype(notetype.id)? {
@@ -229,60 +271,28 @@ impl Context {
     }

     fn munge_media(&mut self, note: &mut Note) -> Result<()> {
-        let notetype_id = note.notetype_id;
         for field in note.fields_mut() {
-            if let Some(new_field) = self.replace_media_refs_fallible(field, notetype_id)? {
+            if let Some(new_field) = self.replace_media_refs(field) {
                 *field = new_field;
             };
         }
         Ok(())
     }

-    fn replace_media_refs_fallible(
-        &mut self,
-        field: &mut String,
-        notetype_id: NotetypeId,
-    ) -> Result<Option<String>> {
-        let mut res = Ok(());
-        let out = replace_media_refs(field, |name| {
-            if res.is_err() {
-                None
-            } else {
-                self.merge_media_maybe_renaming(name, notetype_id)
-                    .unwrap_or_else(|err| {
-                        res = Err(err);
-                        None
-                    })
-            }
-        });
-        res.map(|_| out)
-    }
-
-    fn merge_media_maybe_renaming(
-        &mut self,
-        name: &str,
-        notetype: NotetypeId,
-    ) -> Result<Option<String>> {
-        Ok(if let Some(zip_name) = self.media_map.get(name) {
-            let alternate_name = alternate_media_name(name, notetype);
-            let alternate_path = self.target_media_folder.join(&alternate_name);
-            if alternate_path.exists() {
-                Some(alternate_name)
-            } else {
-                let mut data = Vec::new();
-                io::copy(&mut self.archive.by_name(zip_name)?, &mut data)?;
-                let target_path = self.target_media_folder.join(name);
-                if !target_path.exists() {
-                    write_data_atomically(&data, &target_path)?;
-                    None
-                } else if data == fs::read(target_path)? {
-                    None
-                } else {
-                    write_data_atomically(&data, &alternate_path)?;
-                    Some(alternate_name)
-                }
-            }
-        } else {
-            None
-        })
-    }
+    fn replace_media_refs(&mut self, field: &mut String) -> Option<String> {
+        replace_media_refs(field, |name| {
+            if let Ok(normalized) = safe_normalized_file_name(name) {
+                if let Some((used, entry)) = self.used_media_entries.get_mut(normalized.as_ref()) {
+                    *used = true;
+                    if entry.name != name {
+                        // name is not normalized, and/or remapped
+                        return Some(entry.name.clone());
+                    }
+                } else if let Cow::Owned(s) = normalized {
+                    // no entry; might be a reference to an existing file, so ensure normalization
+                    return Some(s);
+                }
+            }
+            None
+        })
+    }
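The rewritten replace_media_refs leans on the callback contract of crate::text::replace_media_refs: the closure returns Some(new_name) when a reference must be rewritten, None to leave it untouched, and the function as a whole yields Some only if anything changed. A toy analogue of that contract (invented helper; it splits on spaces instead of parsing real media references):

fn replace_refs(field: &str, mut map: impl FnMut(&str) -> Option<String>) -> Option<String> {
    let mut changed = false;
    let words: Vec<String> = field
        .split(' ')
        .map(|w| match map(w) {
            Some(new) => {
                changed = true;
                new
            }
            None => w.to_string(),
        })
        .collect();
    // Only report a new field value if some reference was rewritten.
    changed.then(|| words.join(" "))
}

fn main() {
    // Rename a.jpg the way a conflicting, hash-suffixed entry would be.
    let out = replace_refs("a.jpg b.jpg", |name| {
        (name == "a.jpg").then(|| "a-1234.jpg".to_string())
    });
    assert_eq!(out.as_deref(), Some("a-1234.jpg b.jpg"));
    // Nothing referenced changes: the whole field reports unchanged.
    assert_eq!(replace_refs("c.jpg", |_| None), None);
}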
@@ -292,20 +302,15 @@ impl Context {
             note_id.0 += 999;
         }
     }
-}
-
-fn write_data_atomically(data: &[u8], path: &Path) -> Result<()> {
-    let mut tempfile = tempfile_in_parent_of(path)?;
-    tempfile.write_all(data)?;
-    atomic_rename(tempfile, path, false)
-}
-
-fn alternate_media_name(name: &str, notetype_id: NotetypeId) -> String {
-    let (stem, dot, extension) = name
-        .rsplit_once('.')
-        .map(|(stem, ext)| (stem, ".", ext))
-        .unwrap_or((name, "", ""));
-    format!("{stem}_{notetype_id}{dot}{extension}")
-}
+
+    fn copy_media(&mut self, media_folder: &Path) -> Result<()> {
+        for (used, entry) in self.used_media_entries.values() {
+            if *used {
+                entry.copy_from_archive(&mut self.archive, media_folder)?;
+            }
+        }
+        Ok(())
+    }
+}

 impl Notetype {
@@ -10,19 +10,22 @@ use std::{
 };

 use prost::Message;
+use tempfile::NamedTempFile;
 use zip::{read::ZipFile, ZipArchive};
 use zstd::stream::copy_decode;

 use super::{MediaEntries, MediaEntry, Meta};
 use crate::{
-    error::ImportError, io::filename_is_safe, media::files::normalize_filename, prelude::*,
+    error::ImportError,
+    io::{atomic_rename, filename_is_safe},
+    media::files::normalize_filename,
+    prelude::*,
 };

 /// Like [MediaEntry], but with a safe filename and set zip filename.
 pub(super) struct SafeMediaEntry {
     pub(super) name: String,
     pub(super) size: u32,
-    #[allow(dead_code)]
     pub(super) sha1: [u8; 20],
     pub(super) index: usize,
 }
@@ -98,6 +101,17 @@ impl SafeMediaEntry {
             .map(|metadata| metadata.len() as u64 == self_size)
             .unwrap_or_default()
     }
+
+    pub(super) fn copy_from_archive(
+        &self,
+        archive: &mut ZipArchive<File>,
+        target_folder: &Path,
+    ) -> Result<()> {
+        let mut file = self.fetch_file(archive)?;
+        let mut tempfile = NamedTempFile::new_in(target_folder)?;
+        io::copy(&mut file, &mut tempfile)?;
+        atomic_rename(tempfile, &self.file_path(target_folder), false)
+    }
 }

 pub(super) fn extract_media_entries(
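copy_from_archive stages the data in a NamedTempFile inside the target folder and then renames it into place, so a crash never leaves a half-written media file under its final name. A self-contained sketch of the same pattern, with the tempfile crate's persist() standing in for this repo's atomic_rename:

use std::io::{self, Write};
use std::path::Path;

use tempfile::NamedTempFile;

fn write_atomically(data: &[u8], target: &Path) -> io::Result<()> {
    // Create the temp file next to the target so the final rename
    // stays on one filesystem and is atomic.
    let dir = target.parent().unwrap_or_else(|| Path::new("."));
    let mut tmp = NamedTempFile::new_in(dir)?;
    tmp.write_all(data)?;
    // persist() renames the temp file over the target path.
    tmp.persist(target).map_err(|e| e.error)?;
    Ok(())
}

fn main() -> io::Result<()> {
    write_atomically(b"hello", Path::new("out.txt"))
}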
@@ -113,7 +127,7 @@ pub(super) fn extract_media_entries(
     }
 }

-fn safe_normalized_file_name(name: &str) -> Result<Cow<str>> {
+pub(super) fn safe_normalized_file_name(name: &str) -> Result<Cow<str>> {
     if !filename_is_safe(name) {
         Err(AnkiError::ImportError(ImportError::Corrupt))
     } else {
@@ -5,7 +5,7 @@ use std::{collections::HashMap, path::Path};

 use rusqlite::{params, Connection, OptionalExtension, Row, Statement};

-use crate::error::Result;
+use crate::prelude::*;

 fn trace(s: &str) {
     println!("sql: {}", s)
@@ -222,6 +222,14 @@ delete from media where fname=?"
         Ok(map?)
     }

+    /// Error if any checksums are missing or broken.
+    pub(super) fn all_checksums(&mut self) -> Result<HashMap<String, [u8; 20]>> {
+        self.db
+            .prepare("SELECT fname, csum FROM media")?
+            .query_and_then([], row_to_name_and_checksum)?
+            .collect()
+    }
+
     pub(super) fn force_resync(&mut self) -> Result<()> {
         self.db
             .execute_batch("delete from media; update meta set lastUsn = 0, dirMod = 0")
@@ -250,6 +258,15 @@ fn row_to_entry(row: &Row) -> rusqlite::Result<MediaEntry> {
     })
 }

+fn row_to_name_and_checksum(row: &Row) -> Result<(String, [u8; 20])> {
+    let file_name = row.get(0)?;
+    let sha1_str: String = row.get(1)?;
+    let mut sha1 = [0; 20];
+    hex::decode_to_slice(sha1_str, &mut sha1)
+        .map_err(|_| AnkiError::invalid_input(format!("bad media checksum: {file_name}")))?;
+    Ok((file_name, sha1))
+}
+
 #[cfg(test)]
 mod test {
     use tempfile::NamedTempFile;
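The csum column stores each SHA-1 as 40 hex characters; decode_to_slice converts them back into 20 raw bytes and rejects malformed values. A minimal standalone demonstration (assumes the hex crate; the error type is simplified to String):

// `hex` crate assumed as a dependency; errors reduced to String.
fn decode_checksum(file_name: &str, sha1_str: &str) -> Result<[u8; 20], String> {
    let mut sha1 = [0u8; 20];
    hex::decode_to_slice(sha1_str, &mut sha1)
        .map_err(|_| format!("bad media checksum: {file_name}"))?;
    Ok(sha1)
}

fn main() {
    // 40 hex characters decode into exactly 20 bytes...
    assert_eq!(decode_checksum("a.jpg", &"ab".repeat(20)), Ok([0xab; 20]));
    // ...anything else is rejected.
    assert!(decode_checksum("b.jpg", "not-hex").is_err());
}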
@@ -194,7 +194,7 @@ where
 }

 /// Convert foo.jpg into foo-abcde12345679.jpg
-fn add_hash_suffix_to_file_stem(fname: &str, hash: &[u8; 20]) -> String {
+pub(crate) fn add_hash_suffix_to_file_stem(fname: &str, hash: &[u8; 20]) -> String {
     // when appending a hash to make unique, it will be 40 bytes plus the hyphen.
     let max_len = MAX_FILENAME_LENGTH - 40 - 1;
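With the hash suffix applied, a conflicting foo.jpg becomes foo-<40 hex chars>.jpg. A rough standalone approximation of add_hash_suffix_to_file_stem (illustrative only; the real helper also truncates long stems to respect the MAX_FILENAME_LENGTH budget mentioned in the comment above):

fn add_hash_suffix(fname: &str, hash: &[u8; 20]) -> String {
    // Render the 20 hash bytes as 40 lowercase hex characters.
    let hex: String = hash.iter().map(|b| format!("{b:02x}")).collect();
    match fname.rsplit_once('.') {
        Some((stem, ext)) => format!("{stem}-{hex}.{ext}"),
        None => format!("{fname}-{hex}"),
    }
}

fn main() {
    assert_eq!(
        add_hash_suffix("foo.jpg", &[0xab; 20]),
        format!("foo-{}.jpg", "ab".repeat(20)),
    );
}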
@@ -283,10 +283,15 @@ fn existing_file_sha1(path: &Path) -> io::Result<Option<[u8; 20]>> {
 /// Return the SHA1 of a file, failing if it doesn't exist.
 pub(crate) fn sha1_of_file(path: &Path) -> io::Result<[u8; 20]> {
     let mut file = fs::File::open(path)?;
+    sha1_of_reader(&mut file)
+}
+
+/// Return the SHA1 of a stream.
+pub(crate) fn sha1_of_reader(reader: &mut impl Read) -> io::Result<[u8; 20]> {
     let mut hasher = Sha1::new();
     let mut buf = [0; 64 * 1024];
     loop {
-        match file.read(&mut buf) {
+        match reader.read(&mut buf) {
             Ok(0) => break,
             Ok(n) => hasher.update(&buf[0..n]),
             Err(e) => {
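The hunk is cut off inside the Err arm, but the loop is the standard buffered-read hashing pattern. A self-contained sketch using the sha1 crate (an assumed dependency here), with an interrupted-read retry in the Err arm as one plausible completion:

use std::io::{self, Read};

use sha1::{Digest, Sha1};

fn sha1_of_reader(reader: &mut impl Read) -> io::Result<[u8; 20]> {
    let mut hasher = Sha1::new();
    let mut buf = [0u8; 64 * 1024];
    loop {
        match reader.read(&mut buf) {
            Ok(0) => break,
            Ok(n) => hasher.update(&buf[..n]),
            // Retrying on interruption is an assumption about the elided arm.
            Err(e) if e.kind() == io::ErrorKind::Interrupted => continue,
            Err(e) => return Err(e),
        }
    }
    Ok(hasher.finalize().into())
}

fn main() -> io::Result<()> {
    let digest = sha1_of_reader(&mut &b"hello"[..])?;
    // SHA-1("hello") starts with aa f4 c6 ...
    assert_eq!(digest[0], 0xaa);
    Ok(())
}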
@@ -3,19 +3,21 @@

 use std::{
     borrow::Cow,
     collections::HashMap,
     path::{Path, PathBuf},
 };

 use rusqlite::Connection;
 use slog::Logger;

+use self::changetracker::ChangeTracker;
 use crate::{
     error::Result,
     media::{
         database::{open_or_create, MediaDatabaseContext, MediaEntry},
         files::{add_data_to_folder_uniquely, mtime_as_i64, remove_files, sha1_of_data},
         sync::{MediaSyncProgress, MediaSyncer},
     },
+    prelude::*,
 };

 pub mod changetracker;
@@ -153,4 +155,14 @@ impl MediaManager {
     pub fn dbctx(&self) -> MediaDatabaseContext {
         MediaDatabaseContext::new(&self.db)
     }
+
+    pub fn all_checksums(
+        &self,
+        progress: impl FnMut(usize) -> bool,
+        log: &Logger,
+    ) -> Result<HashMap<String, [u8; 20]>> {
+        let mut dbctx = self.dbctx();
+        ChangeTracker::new(&self.media_folder, progress, log).register_changes(&mut dbctx)?;
+        dbctx.all_checksums()
+    }
 }