Prepare media based on checksums

- Ensure all existing media files are hashed.
- Hash incoming files during preparation to detect conflicts.
- Uniquify names of conflicting files with hash (not notetype id).
- Mark media files as used while importing notes.
- Finally copy used media.
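The steps above can be illustrated with a minimal, self-contained sketch. This is not Anki's actual API: every name below is hypothetical, and std's DefaultHasher stands in for the SHA-1 checksums the real importer uses.

// Sketch of checksum-based media preparation (hypothetical names, DefaultHasher in
// place of SHA-1). Hash existing target files, hash incoming files, rename conflicting
// incoming files by appending their hash, mark entries as used while "importing", then
// copy only the used entries.
use std::collections::hash_map::DefaultHasher;
use std::collections::HashMap;
use std::hash::{Hash, Hasher};

// Stand-in checksum for illustration only.
fn checksum(data: &[u8]) -> u64 {
    let mut hasher = DefaultHasher::new();
    data.hash(&mut hasher);
    hasher.finish()
}

// Append the checksum to the file stem, e.g. "a.jpg" -> "a-<hash>.jpg".
fn uniquified_name(name: &str, hash: u64) -> String {
    match name.rsplit_once('.') {
        Some((stem, ext)) => format!("{stem}-{hash:x}.{ext}"),
        None => format!("{name}-{hash:x}"),
    }
}

fn main() {
    // 1. Checksums of files already present in the target media folder.
    let existing: HashMap<&str, u64> = HashMap::from([("a.jpg", checksum(b"old contents"))]);

    // 2. Incoming files from the archive, hashed during preparation.
    let incoming = vec![("a.jpg", b"new contents".to_vec()), ("b.jpg", b"b".to_vec())];

    // original name -> (used, final name, data)
    let mut entries: HashMap<String, (bool, String, Vec<u8>)> = HashMap::new();
    for (name, data) in incoming {
        let hash = checksum(&data);
        let final_name = match existing.get(name) {
            // 3. Same name but different contents: uniquify the name with the hash.
            Some(other) if *other != hash => uniquified_name(name, hash),
            // Identical file already exists; nothing needs to be copied.
            Some(_) => continue,
            None => name.to_string(),
        };
        entries.insert(name.to_string(), (false, final_name, data));
    }

    // 4. While importing notes, references are resolved by original name and marked used.
    for referenced in ["a.jpg", "b.jpg", "missing.png"] {
        if let Some((used, final_name, _)) = entries.get_mut(referenced) {
            *used = true;
            println!("note field now references {final_name}");
        }
    }

    // 5. Finally, copy only the used entries (printed here instead of written to disk).
    for (used, final_name, data) in entries.values() {
        if *used {
            println!("copy {} bytes to {final_name}", data.len());
        }
    }
}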
Parent: a0085e7fd4
Commit: 7583a7e6b3
5 changed files with 136 additions and 83 deletions
@@ -2,11 +2,12 @@
 // License: GNU AGPL, version 3 or later; http://www.gnu.org/licenses/agpl.html
 
 use std::{
+    borrow::Cow,
     collections::{HashMap, HashSet},
-    fs::{self, File},
-    io::{self, Write},
+    fs::File,
+    io::{self},
     mem,
-    path::{Path, PathBuf},
+    path::Path,
     sync::Arc,
 };
 
@@ -18,14 +19,19 @@ use crate::{
     collection::CollectionBuilder,
     import_export::{
         gather::ExchangeData,
-        package::{media::extract_media_entries, Meta},
+        package::{
+            media::{extract_media_entries, safe_normalized_file_name, SafeMediaEntry},
+            Meta,
+        },
+    },
+    media::{
+        files::{add_hash_suffix_to_file_stem, sha1_of_reader},
+        MediaManager,
     },
-    io::{atomic_rename, tempfile_in_parent_of},
     prelude::*,
     text::replace_media_refs,
 };
 
-#[derive(Debug)]
 struct Context {
     archive: ZipArchive<File>,
     guid_map: HashMap<String, NoteMeta>,
@@ -34,8 +40,11 @@ struct Context {
     existing_notetypes: HashSet<NotetypeId>,
     data: ExchangeData,
     usn: Usn,
-    media_map: HashMap<String, String>,
-    target_media_folder: PathBuf,
+    /// Map of source media files that do not already exist in the target.
+    ///
+    /// original, normalized file name → (referenced by imported material,
+    /// entry with possibly remapped file name)
+    used_media_entries: HashMap<String, (bool, SafeMediaEntry)>,
     conflicting_notes: HashSet<String>,
 }
 
@@ -60,6 +69,22 @@ impl NoteMeta {
     }
 }
 
+impl SafeMediaEntry {
+    fn with_hash_from_archive(&mut self, archive: &mut ZipArchive<File>) -> Result<()> {
+        if self.sha1 == [0; 20] {
+            let mut reader = self.fetch_file(archive)?;
+            self.sha1 = sha1_of_reader(&mut reader)?;
+        }
+        Ok(())
+    }
+
+    /// Requires sha1 to be set. Returns old file name.
+    fn uniquify_name(&mut self) -> String {
+        let new_name = add_hash_suffix_to_file_stem(&self.name, &self.sha1);
+        mem::replace(&mut self.name, new_name)
+    }
+}
+
 impl Collection {
     pub fn import_apkg(
         &mut self,
@@ -71,18 +96,19 @@ impl Collection {
         let archive = ZipArchive::new(file)?;
 
         let mut ctx = Context::new(archive, self, search, with_scheduling)?;
+        ctx.prepare_media(self)?;
         ctx.prepare_notetypes(self)?;
         ctx.prepare_notes()?;
 
-        self.insert_data(&ctx.data)
+        self.insert_data(&ctx.data)?;
+        ctx.copy_media(&self.media_folder)?;
+        Ok(())
     }
-}
 
-fn build_media_map(archive: &mut ZipArchive<File>) -> Result<HashMap<String, String>> {
-    Ok(extract_media_entries(&Meta::new_legacy(), archive)?
-        .into_iter()
-        .map(|entry| (entry.name, entry.index.to_string()))
-        .collect())
+    fn all_existing_sha1s(&mut self) -> Result<HashMap<String, [u8; 20]>> {
+        let mgr = MediaManager::new(&self.media_folder, &self.media_db)?;
+        mgr.all_checksums(|_| true, &self.log)
+    }
 }
 
 impl ExchangeData {
@@ -111,21 +137,37 @@ impl Context {
         with_scheduling: bool,
     ) -> Result<Self> {
         let data = ExchangeData::gather_from_archive(&mut archive, search, with_scheduling)?;
-        let media_map = build_media_map(&mut archive)?;
         Ok(Self {
             archive,
             data,
             guid_map: target_col.storage.note_guid_map()?,
             existing_notes: target_col.storage.get_all_note_ids()?,
             existing_notetypes: target_col.storage.get_all_notetype_ids()?,
-            media_map,
-            target_media_folder: target_col.media_folder.clone(),
             usn: target_col.usn()?,
             conflicting_notes: HashSet::new(),
             remapped_notetypes: HashMap::new(),
+            used_media_entries: HashMap::new(),
         })
     }
 
+    fn prepare_media(&mut self, target_col: &mut Collection) -> Result<()> {
+        let existing_sha1s = target_col.all_existing_sha1s()?;
+        for mut entry in extract_media_entries(&Meta::new_legacy(), &mut self.archive)? {
+            if let Some(other_sha1) = existing_sha1s.get(&entry.name) {
+                entry.with_hash_from_archive(&mut self.archive)?;
+                if entry.sha1 != *other_sha1 {
+                    let original_name = entry.uniquify_name();
+                    self.used_media_entries
+                        .insert(original_name, (false, entry));
+                }
+            } else {
+                self.used_media_entries
+                    .insert(entry.name.clone(), (false, entry));
+            }
+        }
+        Ok(())
+    }
+
     fn prepare_notetypes(&mut self, target_col: &mut Collection) -> Result<()> {
         for notetype in mem::take(&mut self.data.notetypes) {
             if let Some(existing) = target_col.get_notetype(notetype.id)? {
@@ -229,60 +271,28 @@
     }
 
     fn munge_media(&mut self, note: &mut Note) -> Result<()> {
-        let notetype_id = note.notetype_id;
         for field in note.fields_mut() {
-            if let Some(new_field) = self.replace_media_refs_fallible(field, notetype_id)? {
+            if let Some(new_field) = self.replace_media_refs(field) {
                 *field = new_field;
             };
         }
        Ok(())
     }
 
-    fn replace_media_refs_fallible(
-        &mut self,
-        field: &mut String,
-        notetype_id: NotetypeId,
-    ) -> Result<Option<String>> {
-        let mut res = Ok(());
-        let out = replace_media_refs(field, |name| {
-            if res.is_err() {
-                None
-            } else {
-                self.merge_media_maybe_renaming(name, notetype_id)
-                    .unwrap_or_else(|err| {
-                        res = Err(err);
-                        None
-                    })
-            }
-        });
-        res.map(|_| out)
-    }
-
-    fn merge_media_maybe_renaming(
-        &mut self,
-        name: &str,
-        notetype: NotetypeId,
-    ) -> Result<Option<String>> {
-        Ok(if let Some(zip_name) = self.media_map.get(name) {
-            let alternate_name = alternate_media_name(name, notetype);
-            let alternate_path = self.target_media_folder.join(&alternate_name);
-            if alternate_path.exists() {
-                Some(alternate_name)
-            } else {
-                let mut data = Vec::new();
-                io::copy(&mut self.archive.by_name(zip_name)?, &mut data)?;
-                let target_path = self.target_media_folder.join(name);
-                if !target_path.exists() {
-                    write_data_atomically(&data, &target_path)?;
-                    None
-                } else if data == fs::read(target_path)? {
-                    None
-                } else {
-                    write_data_atomically(&data, &alternate_path)?;
-                    Some(alternate_name)
-                }
+    fn replace_media_refs(&mut self, field: &mut String) -> Option<String> {
+        replace_media_refs(field, |name| {
+            if let Ok(normalized) = safe_normalized_file_name(name) {
+                if let Some((used, entry)) = self.used_media_entries.get_mut(normalized.as_ref()) {
+                    *used = true;
+                    if entry.name != name {
+                        // name is not normalized, and/or remapped
+                        return Some(entry.name.clone());
+                    }
+                } else if let Cow::Owned(s) = normalized {
+                    // no entry; might be a reference to an existing file, so ensure normalization
+                    return Some(s);
+                }
             }
-        } else {
             None
         })
     }
@@ -292,20 +302,15 @@ impl Context {
             note_id.0 += 999;
         }
     }
-}
 
-fn write_data_atomically(data: &[u8], path: &Path) -> Result<()> {
-    let mut tempfile = tempfile_in_parent_of(path)?;
-    tempfile.write_all(data)?;
-    atomic_rename(tempfile, path, false)
-}
-
-fn alternate_media_name(name: &str, notetype_id: NotetypeId) -> String {
-    let (stem, dot, extension) = name
-        .rsplit_once('.')
-        .map(|(stem, ext)| (stem, ".", ext))
-        .unwrap_or((name, "", ""));
-    format!("{stem}_{notetype_id}{dot}{extension}")
+    fn copy_media(&mut self, media_folder: &Path) -> Result<()> {
+        for (used, entry) in self.used_media_entries.values() {
+            if *used {
+                entry.copy_from_archive(&mut self.archive, media_folder)?;
+            }
+        }
+        Ok(())
+    }
 }
 
 impl Notetype {
@@ -10,19 +10,22 @@ use std::{
 };
 
 use prost::Message;
+use tempfile::NamedTempFile;
 use zip::{read::ZipFile, ZipArchive};
 use zstd::stream::copy_decode;
 
 use super::{MediaEntries, MediaEntry, Meta};
 use crate::{
-    error::ImportError, io::filename_is_safe, media::files::normalize_filename, prelude::*,
+    error::ImportError,
+    io::{atomic_rename, filename_is_safe},
+    media::files::normalize_filename,
+    prelude::*,
 };
 
 /// Like [MediaEntry], but with a safe filename and set zip filename.
 pub(super) struct SafeMediaEntry {
     pub(super) name: String,
     pub(super) size: u32,
-    #[allow(dead_code)]
     pub(super) sha1: [u8; 20],
     pub(super) index: usize,
 }
@@ -98,6 +101,17 @@ impl SafeMediaEntry {
             .map(|metadata| metadata.len() as u64 == self_size)
             .unwrap_or_default()
     }
+
+    pub(super) fn copy_from_archive(
+        &self,
+        archive: &mut ZipArchive<File>,
+        target_folder: &Path,
+    ) -> Result<()> {
+        let mut file = self.fetch_file(archive)?;
+        let mut tempfile = NamedTempFile::new_in(target_folder)?;
+        io::copy(&mut file, &mut tempfile)?;
+        atomic_rename(tempfile, &self.file_path(target_folder), false)
+    }
 }
 
 pub(super) fn extract_media_entries(
@@ -113,7 +127,7 @@ pub(super) fn extract_media_entries(
     }
 }
 
-fn safe_normalized_file_name(name: &str) -> Result<Cow<str>> {
+pub(super) fn safe_normalized_file_name(name: &str) -> Result<Cow<str>> {
     if !filename_is_safe(name) {
         Err(AnkiError::ImportError(ImportError::Corrupt))
     } else {
@@ -5,7 +5,7 @@ use std::{collections::HashMap, path::Path};
 
 use rusqlite::{params, Connection, OptionalExtension, Row, Statement};
 
-use crate::error::Result;
+use crate::prelude::*;
 
 fn trace(s: &str) {
     println!("sql: {}", s)
@@ -222,6 +222,14 @@ delete from media where fname=?"
         Ok(map?)
     }
 
+    /// Error if any checksums are missing or broken.
+    pub(super) fn all_checksums(&mut self) -> Result<HashMap<String, [u8; 20]>> {
+        self.db
+            .prepare("SELECT fname, csum FROM media")?
+            .query_and_then([], row_to_name_and_checksum)?
+            .collect()
+    }
+
     pub(super) fn force_resync(&mut self) -> Result<()> {
         self.db
             .execute_batch("delete from media; update meta set lastUsn = 0, dirMod = 0")
@@ -250,6 +258,15 @@ fn row_to_entry(row: &Row) -> rusqlite::Result<MediaEntry> {
     })
 }
 
+fn row_to_name_and_checksum(row: &Row) -> Result<(String, [u8; 20])> {
+    let file_name = row.get(0)?;
+    let sha1_str: String = row.get(1)?;
+    let mut sha1 = [0; 20];
+    hex::decode_to_slice(sha1_str, &mut sha1)
+        .map_err(|_| AnkiError::invalid_input(format!("bad media checksum: {file_name}")))?;
+    Ok((file_name, sha1))
+}
+
 #[cfg(test)]
 mod test {
     use tempfile::NamedTempFile;
@@ -194,7 +194,7 @@ where
 }
 
 /// Convert foo.jpg into foo-abcde12345679.jpg
-fn add_hash_suffix_to_file_stem(fname: &str, hash: &[u8; 20]) -> String {
+pub(crate) fn add_hash_suffix_to_file_stem(fname: &str, hash: &[u8; 20]) -> String {
     // when appending a hash to make unique, it will be 40 bytes plus the hyphen.
     let max_len = MAX_FILENAME_LENGTH - 40 - 1;
 
@@ -283,10 +283,15 @@ fn existing_file_sha1(path: &Path) -> io::Result<Option<[u8; 20]>> {
 /// Return the SHA1 of a file, failing if it doesn't exist.
 pub(crate) fn sha1_of_file(path: &Path) -> io::Result<[u8; 20]> {
     let mut file = fs::File::open(path)?;
+    sha1_of_reader(&mut file)
+}
+
+/// Return the SHA1 of a stream.
+pub(crate) fn sha1_of_reader(reader: &mut impl Read) -> io::Result<[u8; 20]> {
     let mut hasher = Sha1::new();
     let mut buf = [0; 64 * 1024];
     loop {
-        match file.read(&mut buf) {
+        match reader.read(&mut buf) {
             Ok(0) => break,
             Ok(n) => hasher.update(&buf[0..n]),
             Err(e) => {
@@ -3,19 +3,21 @@
 
 use std::{
     borrow::Cow,
+    collections::HashMap,
     path::{Path, PathBuf},
 };
 
 use rusqlite::Connection;
 use slog::Logger;
 
+use self::changetracker::ChangeTracker;
 use crate::{
-    error::Result,
     media::{
         database::{open_or_create, MediaDatabaseContext, MediaEntry},
         files::{add_data_to_folder_uniquely, mtime_as_i64, remove_files, sha1_of_data},
         sync::{MediaSyncProgress, MediaSyncer},
     },
+    prelude::*,
 };
 
 pub mod changetracker;
@@ -153,4 +155,14 @@ impl MediaManager {
     pub fn dbctx(&self) -> MediaDatabaseContext {
         MediaDatabaseContext::new(&self.db)
     }
+
+    pub fn all_checksums(
+        &self,
+        progress: impl FnMut(usize) -> bool,
+        log: &Logger,
+    ) -> Result<HashMap<String, [u8; 20]>> {
+        let mut dbctx = self.dbctx();
+        ChangeTracker::new(&self.media_folder, progress, log).register_changes(&mut dbctx)?;
+        dbctx.all_checksums()
+    }
 }