From d5772ac43a792780bc1622abf3a90e23839b9b3b Mon Sep 17 00:00:00 2001 From: Damien Elmes Date: Wed, 1 Feb 2023 21:09:41 +1000 Subject: [PATCH] Refactor MediaIter Removes the dependency on a local path, and allows the unicode checks to be skipped if appropriate. --- .../src/import_export/package/apkg/export.rs | 2 +- .../package/apkg/import/media.rs | 2 +- .../import_export/package/colpkg/export.rs | 116 +----------- rslib/src/import_export/package/media.rs | 170 +++++++++++++++++- rslib/src/import_export/package/mod.rs | 3 + 5 files changed, 183 insertions(+), 110 deletions(-) diff --git a/rslib/src/import_export/package/apkg/export.rs b/rslib/src/import_export/package/apkg/export.rs index 3be69e3f3..e20d5d762 100644 --- a/rslib/src/import_export/package/apkg/export.rs +++ b/rslib/src/import_export/package/apkg/export.rs @@ -8,7 +8,7 @@ use std::path::PathBuf; use crate::collection::CollectionBuilder; use crate::import_export::gather::ExchangeData; use crate::import_export::package::colpkg::export::export_collection; -use crate::import_export::package::colpkg::export::MediaIter; +use crate::import_export::package::media::MediaIter; use crate::import_export::package::Meta; use crate::import_export::ExportProgress; use crate::import_export::IncrementableProgress; diff --git a/rslib/src/import_export/package/apkg/import/media.rs b/rslib/src/import_export/package/apkg/import/media.rs index 72eb00500..35f405713 100644 --- a/rslib/src/import_export/package/apkg/import/media.rs +++ b/rslib/src/import_export/package/apkg/import/media.rs @@ -10,8 +10,8 @@ use zip::ZipArchive; use super::Context; use crate::error::FileIoSnafu; use crate::error::FileOp; -use crate::import_export::package::colpkg::export::MediaCopier; use crate::import_export::package::media::extract_media_entries; +use crate::import_export::package::media::MediaCopier; use crate::import_export::package::media::SafeMediaEntry; use crate::import_export::ImportProgress; use crate::import_export::IncrementableProgress; diff --git a/rslib/src/import_export/package/colpkg/export.rs b/rslib/src/import_export/package/colpkg/export.rs index fa7b7f4e6..be23499f0 100644 --- a/rslib/src/import_export/package/colpkg/export.rs +++ b/rslib/src/import_export/package/colpkg/export.rs @@ -1,9 +1,7 @@ // Copyright: Ankitects Pty Ltd and contributors // License: GNU AGPL, version 3 or later; http://www.gnu.org/licenses/agpl.html -use std::borrow::Cow; use std::collections::HashMap; -use std::ffi::OsStr; use std::fs::File; use std::io; use std::io::Read; @@ -12,8 +10,6 @@ use std::path::Path; use std::path::PathBuf; use prost::Message; -use sha1::Digest; -use sha1::Sha1; use tempfile::NamedTempFile; use zip::write::FileOptions; use zip::CompressionMethod; @@ -27,14 +23,14 @@ use super::super::MediaEntry; use super::super::Meta; use super::super::Version; use crate::collection::CollectionBuilder; +use crate::import_export::package::media::MediaCopier; +use crate::import_export::package::media::MediaIter; use crate::import_export::ExportProgress; use crate::import_export::IncrementableProgress; use crate::io::atomic_rename; use crate::io::new_tempfile; use crate::io::new_tempfile_in_parent_of; use crate::io::open_file; -use crate::io::read_dir_files; -use crate::media::files::filename_if_normalized; use crate::prelude::*; use crate::storage::SchemaVersion; @@ -82,36 +78,6 @@ impl Collection { } } -pub struct MediaIter(Box>>); - -impl MediaIter { - /// Iterator over all files in the given path, without traversing - /// subfolders. - pub fn from_folder(path: &Path) -> Result { - Ok(Self(Box::new( - read_dir_files(path)?.map(|res| res.map(|entry| entry.path())), - ))) - } - - /// Iterator over all given files in the given folder. - /// Missing files are silently ignored. - pub fn from_file_list( - list: impl IntoIterator + 'static, - folder: PathBuf, - ) -> Self { - Self(Box::new( - list.into_iter() - .map(move |file| folder.join(file)) - .filter(|path| path.exists()) - .map(Ok), - )) - } - - pub fn empty() -> Self { - Self(Box::new(std::iter::empty())) - } -} - fn export_collection_file( out_path: impl AsRef, col_path: impl AsRef, @@ -298,88 +264,24 @@ fn write_media_files( let mut incrementor = progress.incrementor(ExportProgress::Media); for (index, res) in media.0.enumerate() { incrementor.increment()?; - let path = res?; + let mut entry = res?; zip.start_file(index.to_string(), file_options_stored())?; - let mut file = open_file(&path)?; - let file_name = path.file_name().or_invalid("not a file path")?; - let name = normalized_unicode_file_name(file_name)?; - - let (size, sha1) = copier.copy(&mut file, zip)?; - media_entries.push(MediaEntry::new(name, size, sha1)); + let (size, sha1) = copier.copy(&mut entry.data, zip)?; + media_entries.push(MediaEntry::new(entry.nfc_filename, size, sha1)); } Ok(()) } -fn normalized_unicode_file_name(filename: &OsStr) -> Result { - let filename = filename.to_str().or_invalid("non-unicode filename")?; - filename_if_normalized(filename) - .map(Cow::into_owned) - .ok_or(AnkiError::MediaCheckRequired) -} - -/// Copies and hashes while optionally encoding. -/// If compressing, the encoder is reused to optimize for repeated calls. -pub(crate) struct MediaCopier { - encoding: bool, - encoder: Option>, - buf: [u8; 64 * 1024], -} - -impl MediaCopier { - pub(crate) fn new(encoding: bool) -> Self { - Self { - encoding, - encoder: None, - buf: [0; 64 * 1024], - } - } - - fn encoder(&mut self) -> Option> { - self.encoding.then(|| { - self.encoder - .take() - .unwrap_or_else(|| RawEncoder::with_dictionary(0, &[]).unwrap()) - }) - } - - /// Returns size and sha1 hash of the copied data. - pub(crate) fn copy( - &mut self, - reader: &mut impl Read, - writer: &mut impl Write, - ) -> Result<(usize, Sha1Hash)> { - let mut size = 0; - let mut hasher = Sha1::new(); - self.buf = [0; 64 * 1024]; - let mut wrapped_writer = MaybeEncodedWriter::new(writer, self.encoder()); - - loop { - let count = match reader.read(&mut self.buf) { - Ok(0) => break, - Err(e) if e.kind() == io::ErrorKind::Interrupted => continue, - result => result?, - }; - size += count; - hasher.update(&self.buf[..count]); - wrapped_writer.write(&self.buf[..count])?; - } - - self.encoder = wrapped_writer.finish()?; - - Ok((size, hasher.finalize().into())) - } -} - -enum MaybeEncodedWriter<'a, W: Write> { +pub(crate) enum MaybeEncodedWriter<'a, W: Write> { Stored(&'a mut W), Encoded(zio::Writer<&'a mut W, RawEncoder<'static>>), } impl<'a, W: Write> MaybeEncodedWriter<'a, W> { - fn new(writer: &'a mut W, encoder: Option>) -> Self { + pub fn new(writer: &'a mut W, encoder: Option>) -> Self { if let Some(encoder) = encoder { Self::Encoded(zio::Writer::new(writer, encoder)) } else { @@ -387,7 +289,7 @@ impl<'a, W: Write> MaybeEncodedWriter<'a, W> { } } - fn write(&mut self, buf: &[u8]) -> Result<()> { + pub fn write(&mut self, buf: &[u8]) -> Result<()> { match self { Self::Stored(writer) => writer.write_all(buf)?, Self::Encoded(writer) => writer.write_all(buf)?, @@ -395,7 +297,7 @@ impl<'a, W: Write> MaybeEncodedWriter<'a, W> { Ok(()) } - fn finish(self) -> Result>> { + pub fn finish(self) -> Result>> { Ok(match self { Self::Stored(_) => None, Self::Encoded(mut writer) => { diff --git a/rslib/src/import_export/package/media.rs b/rslib/src/import_export/package/media.rs index 104270d77..52e6ef02d 100644 --- a/rslib/src/import_export/package/media.rs +++ b/rslib/src/import_export/package/media.rs @@ -3,25 +3,36 @@ use std::borrow::Cow; use std::collections::HashMap; +use std::ffi::OsString; use std::fs; use std::fs::File; use std::io; +use std::io::Read; +use std::io::Write; use std::path::Path; use std::path::PathBuf; use prost::Message; +use sha1::Digest; +use sha1::Sha1; use zip::read::ZipFile; use zip::ZipArchive; use zstd::stream::copy_decode; +use zstd::stream::raw::Encoder as RawEncoder; -use super::colpkg::export::MediaCopier; use super::MediaEntries; use super::MediaEntry; use super::Meta; +use crate::error::FileIoError; +use crate::error::FileOp; use crate::error::ImportError; +use crate::error::InvalidInputError; +use crate::import_export::package::colpkg::export::MaybeEncodedWriter; use crate::io::atomic_rename; use crate::io::filename_is_safe; use crate::io::new_tempfile_in; +use crate::io::read_dir_files; +use crate::media::files::filename_if_normalized; use crate::media::files::normalize_filename; use crate::prelude::*; @@ -171,6 +182,163 @@ impl MediaEntries { } } +pub struct MediaIterEntry { + pub nfc_filename: String, + pub data: Box, +} + +#[derive(Debug)] +pub enum MediaIterError { + InvalidFilename { + filename: OsString, + }, + IoError { + filename: String, + source: io::Error, + }, + Other { + source: Box, + }, +} + +impl TryFrom<&Path> for MediaIterEntry { + type Error = MediaIterError; + + fn try_from(value: &Path) -> std::result::Result { + let nfc_filename: String = value + .file_name() + .and_then(|s| s.to_str()) + .and_then(filename_if_normalized) + .ok_or_else(|| MediaIterError::InvalidFilename { + filename: value.as_os_str().to_owned(), + })? + .into(); + let file = File::open(value).map_err(|err| MediaIterError::IoError { + filename: nfc_filename.clone(), + source: err, + })?; + Ok(MediaIterEntry { + nfc_filename, + data: Box::new(file) as _, + }) + } +} + +impl From for AnkiError { + fn from(err: MediaIterError) -> Self { + match err { + MediaIterError::InvalidFilename { .. } => AnkiError::MediaCheckRequired, + MediaIterError::IoError { filename, source } => FileIoError { + path: filename.into(), + op: FileOp::Read, + source, + } + .into(), + MediaIterError::Other { source } => InvalidInputError { + message: "".to_string(), + source: Some(source), + backtrace: None, + } + .into(), + } + } +} + +pub struct MediaIter(pub Box>>); + +impl MediaIter { + pub fn new(iter: I) -> Self + where + I: Iterator> + 'static, + { + Self(Box::new(iter)) + } + + /// Iterator over all files in the given path, without traversing + /// subfolders. + pub fn from_folder(path: &Path) -> Result { + let path2 = path.to_owned(); + Ok(Self::new(read_dir_files(path)?.map(move |res| match res { + Ok(entry) => MediaIterEntry::try_from(entry.path().as_path()), + Err(err) => Err(MediaIterError::IoError { + filename: path2.to_string_lossy().into(), + source: err, + }), + }))) + } + + /// Iterator over all given files in the given folder. + /// Missing files are silently ignored. + pub fn from_file_list( + list: impl IntoIterator + 'static, + folder: PathBuf, + ) -> Self { + Self::new( + list.into_iter() + .map(move |file| folder.join(file)) + .filter(|path| path.exists()) + .map(|path| MediaIterEntry::try_from(path.as_path())), + ) + } + + pub fn empty() -> Self { + Self::new([].into_iter()) + } +} + +/// Copies and hashes while optionally encoding. +/// If compressing, the encoder is reused to optimize for repeated calls. +pub(crate) struct MediaCopier { + encoding: bool, + encoder: Option>, + buf: [u8; 64 * 1024], +} + +impl MediaCopier { + pub(crate) fn new(encoding: bool) -> Self { + Self { + encoding, + encoder: None, + buf: [0; 64 * 1024], + } + } + + fn encoder(&mut self) -> Option> { + self.encoding.then(|| { + self.encoder + .take() + .unwrap_or_else(|| RawEncoder::with_dictionary(0, &[]).unwrap()) + }) + } + + /// Returns size and sha1 hash of the copied data. + pub(crate) fn copy( + &mut self, + reader: &mut impl Read, + writer: &mut impl Write, + ) -> Result<(usize, Sha1Hash)> { + let mut size = 0; + let mut hasher = Sha1::new(); + self.buf = [0; 64 * 1024]; + let mut wrapped_writer = MaybeEncodedWriter::new(writer, self.encoder()); + + loop { + let count = match reader.read(&mut self.buf) { + Ok(0) => break, + Err(e) if e.kind() == io::ErrorKind::Interrupted => continue, + result => result?, + }; + size += count; + hasher.update(&self.buf[..count]); + wrapped_writer.write(&self.buf[..count])?; + } + + self.encoder = wrapped_writer.finish()?; + + Ok((size, hasher.finalize().into())) + } +} + #[cfg(test)] mod test { use super::*; diff --git a/rslib/src/import_export/package/mod.rs b/rslib/src/import_export/package/mod.rs index 68f3d2b0e..b9da55fe0 100644 --- a/rslib/src/import_export/package/mod.rs +++ b/rslib/src/import_export/package/mod.rs @@ -9,6 +9,9 @@ mod meta; pub(crate) use apkg::NoteMeta; pub(crate) use colpkg::export::export_colpkg_from_data; pub use colpkg::import::import_colpkg; +pub use media::MediaIter; +pub use media::MediaIterEntry; +pub use media::MediaIterError; pub(self) use meta::Meta; pub(self) use meta::Version;