diff --git a/rslib/Cargo.toml b/rslib/Cargo.toml
index 222b677d1..9d36ef51f 100644
--- a/rslib/Cargo.toml
+++ b/rslib/Cargo.toml
@@ -16,6 +16,9 @@ regex = "1.3.3"
 hex = "0.4.0"
 blake3 = "0.1.0"
 htmlescape = "0.3.1"
+sha1 = "0.6.0"
+unicode-normalization = "0.1.12"
+tempfile = "3.1.0"
 
 [build-dependencies]
 prost-build = "0.5.0"
diff --git a/rslib/src/lib.rs b/rslib/src/lib.rs
index eb2150b6b..b55f7890e 100644
--- a/rslib/src/lib.rs
+++ b/rslib/src/lib.rs
@@ -6,6 +6,7 @@ mod backend_proto;
 pub mod backend;
 pub mod cloze;
 pub mod err;
+pub mod media;
 pub mod sched;
 pub mod template;
 pub mod template_filters;
diff --git a/rslib/src/media.rs b/rslib/src/media.rs
new file mode 100644
index 000000000..8b5346324
--- /dev/null
+++ b/rslib/src/media.rs
@@ -0,0 +1,312 @@
+// Copyright: Ankitects Pty Ltd and contributors
+// License: GNU AGPL, version 3 or later; http://www.gnu.org/licenses/agpl.html
+
+use lazy_static::lazy_static;
+use regex::Regex;
+use sha1::Sha1;
+use std::borrow::Cow;
+use std::io::Read;
+use std::path::Path;
+use std::{fs, io};
+use unicode_normalization::{is_nfc_quick, IsNormalized, UnicodeNormalization};
+
+/// The maximum length we allow a filename to be. When combined
+/// with the rest of the path, the full path needs to be under ~240 chars
+/// on some platforms, and some filesystems like eCryptFS will increase
+/// the length of the filename.
+static MAX_FILENAME_LENGTH: usize = 120;
+
+lazy_static! {
+    static ref WINDOWS_DEVICE_NAME: Regex = Regex::new(
+        r#"(?xi)
+            # starting with one of the following names
+            ^
+            (
+                CON | PRN | AUX | NUL | COM[1-9] | LPT[1-9]
+            )
+            # either followed by a dot, or no extension
+            (
+                \. | $
+            )
+        "#
+    )
+    .unwrap();
+}
+
+/// True if character may cause problems on one or more platforms.
+fn disallowed_char(char: char) -> bool {
+    match char {
+        '[' | ']' | '<' | '>' | ':' | '"' | '/' | '?' | '*' | '^' | '\\' | '|' => true,
+        c if c.is_ascii_control() => true,
+        _ => false,
+    }
+}
+
+/// Adjust filename into the format Anki expects.
+///
+/// - The filename is normalized to NFC.
+/// - Any problem characters are removed.
+/// - Windows device names like CON and PRN have '_' appended
+/// - The filename is limited to 120 bytes.
+fn normalize_filename(fname: &str) -> Cow<str> {
+    let mut output = Cow::Borrowed(fname);
+
+    if is_nfc_quick(output.chars()) != IsNormalized::Yes {
+        output = output.chars().nfc().collect::<String>().into();
+    }
+
+    if output.chars().any(disallowed_char) {
+        output = output.replace(disallowed_char, "").into()
+    }
+
+    if let Cow::Owned(o) = WINDOWS_DEVICE_NAME.replace_all(output.as_ref(), "${1}_${2}") {
+        output = o.into();
+    }
+
+    if let Cow::Owned(o) = truncate_filename(output.as_ref(), MAX_FILENAME_LENGTH) {
+        output = o.into();
+    }
+
+    output
+}
+
+/// Write desired_name into folder, renaming if existing file has different content.
+/// Returns the used filename.
+pub fn add_data_to_folder_uniquely<'a, P>(
+    folder: P,
+    desired_name: &'a str,
+    data: &[u8],
+) -> io::Result<Cow<'a, str>>
+where
+    P: AsRef<Path>,
+{
+    let normalized_name = normalize_filename(desired_name);
+
+    let mut target_path = folder.as_ref().join(normalized_name.as_ref());
+
+    let existing_file_hash = match existing_file_sha1(&target_path)? {
+        Some(hash) => hash,
+        None => {
+            // no file with that name exists yet
+            fs::write(&target_path, data)?;
+            return Ok(normalized_name);
+        }
+    };
+
+    let data_hash = sha1_of_data(data);
+    if existing_file_hash == data_hash {
+        // existing file has same checksum, nothing to do
+        return Ok(normalized_name);
+    }
+
+    // give it a unique name based on its hash
+    let hashed_name = add_hash_suffix_to_file_stem(normalized_name.as_ref(), &data_hash);
+    target_path.set_file_name(&hashed_name);
+
+    fs::write(&target_path, data)?;
+    Ok(hashed_name.into())
+}
+
+/// Convert foo.jpg into foo-abcde12345679.jpg
+///
+/// The stem is shortened as required so that the result does not exceed
+/// MAX_FILENAME_LENGTH. Names without an extension do not gain a trailing dot.
+fn add_hash_suffix_to_file_stem(fname: &str, hash: &[u8; 20]) -> String {
+    // the hash is appended hex-encoded, so it takes two bytes per hash
+    // byte, plus one for the hyphen.
+    let max_len = MAX_FILENAME_LENGTH - hash.len() * 2 - 1;
+
+    let (stem, ext) = split_and_truncate_filename(fname, max_len);
+
+    if ext.is_empty() {
+        format!("{}-{}", stem, hex::encode(hash))
+    } else {
+        format!("{}-{}.{}", stem, hex::encode(hash), ext)
+    }
+}
+
+/// If filename is longer than max_bytes, truncate it.
+/// Names without an extension do not gain a trailing dot.
+fn truncate_filename(fname: &str, max_bytes: usize) -> Cow<str> {
+    if fname.len() <= max_bytes {
+        return Cow::Borrowed(fname);
+    }
+
+    let (stem, ext) = split_and_truncate_filename(fname, max_bytes);
+
+    if ext.is_empty() {
+        stem.to_owned().into()
+    } else {
+        format!("{}.{}", stem, ext).into()
+    }
+}
+
+/// Split filename into stem and extension, and trim both so the
+/// resulting filename would be at most max_bytes long.
+/// Returns (stem, extension)
+fn split_and_truncate_filename(fname: &str, max_bytes: usize) -> (&str, &str) {
+    // the code assumes the length will be at least 11
+    debug_assert!(max_bytes > 10);
+
+    let mut iter = fname.rsplitn(2, '.');
+    let mut ext = iter.next().unwrap();
+    let mut stem = if let Some(s) = iter.next() {
+        s
+    } else {
+        // no extension, so ext holds the full filename
+        let ext_tmp = ext;
+        ext = "";
+        ext_tmp
+    };
+
+    // cap extension to 10 bytes so stem_len can't be negative
+    ext = truncate_to_char_boundary(ext, 10);
+
+    // cap stem, allowing for the .
+    let stem_len = max_bytes - ext.len() - 1;
+    stem = truncate_to_char_boundary(stem, stem_len);
+
+    (stem, ext)
+}
+
+/// Trim a string on a valid UTF8 boundary.
+/// Based on a function in the Rust stdlib.
+fn truncate_to_char_boundary(s: &str, mut max: usize) -> &str {
+    if max >= s.len() {
+        s
+    } else {
+        while !s.is_char_boundary(max) {
+            max -= 1;
+        }
+        &s[..max]
+    }
+}
+
+/// Return the SHA1 of a file if it exists, or None.
+fn existing_file_sha1(path: &Path) -> io::Result<Option<[u8; 20]>> {
+    match sha1_of_file(path) {
+        Ok(o) => Ok(Some(o)),
+        Err(e) => {
+            if e.kind() == io::ErrorKind::NotFound {
+                Ok(None)
+            } else {
+                Err(e)
+            }
+        }
+    }
+}
+
+/// Return the SHA1 of a file, failing if it doesn't exist.
+fn sha1_of_file(path: &Path) -> io::Result<[u8; 20]> {
+    let mut file = fs::File::open(path)?;
+    let mut hasher = Sha1::new();
+    let mut buf = [0; 64 * 1024];
+    loop {
+        match file.read(&mut buf) {
+            Ok(0) => break,
+            Ok(n) => hasher.update(&buf[0..n]),
+            Err(e) => {
+                if e.kind() == io::ErrorKind::Interrupted {
+                    continue;
+                } else {
+                    return Err(e);
+                }
+            }
+        };
+    }
+    Ok(hasher.digest().bytes())
+}
+
+/// Return the SHA1 of provided data.
+fn sha1_of_data(data: &[u8]) -> [u8; 20] {
+    let mut hasher = Sha1::new();
+    hasher.update(data);
+    hasher.digest().bytes()
+}
+
+#[cfg(test)]
+mod test {
+    use crate::media::{
+        add_data_to_folder_uniquely, add_hash_suffix_to_file_stem, normalize_filename,
+        sha1_of_data, MAX_FILENAME_LENGTH,
+    };
+    use std::borrow::Cow;
+    use tempfile::tempdir;
+
+    #[test]
+    fn test_normalize() {
+        assert_eq!(normalize_filename("foo.jpg"), Cow::Borrowed("foo.jpg"));
+        assert_eq!(
+            normalize_filename("con.jpg[]><:\"/?*^\\|\0\r\n").as_ref(),
+            "con_.jpg"
+        );
+
+        let expected_stem_len = MAX_FILENAME_LENGTH - ".jpg".len();
+        assert_eq!(
+            normalize_filename(&format!("{}.jpg", "x".repeat(MAX_FILENAME_LENGTH * 2))),
+            "x".repeat(expected_stem_len) + ".jpg"
+        );
+
+        // names without an extension must not gain a trailing dot
+        assert_eq!(
+            normalize_filename(&"x".repeat(MAX_FILENAME_LENGTH * 2)),
+            "x".repeat(MAX_FILENAME_LENGTH - 1)
+        );
+    }
+
+    #[test]
+    fn test_add_hash_suffix() {
+        let hash = sha1_of_data("hello".as_bytes());
+        assert_eq!(
+            add_hash_suffix_to_file_stem("test.jpg", &hash).as_str(),
+            "test-aaf4c61ddcc5e8a2dabede0f3b482cd9aea9434d.jpg"
+        );
+
+        // no extension: no trailing dot
+        assert_eq!(
+            add_hash_suffix_to_file_stem("test", &hash).as_str(),
+            "test-aaf4c61ddcc5e8a2dabede0f3b482cd9aea9434d"
+        );
+
+        // the hex-encoded hash must still fit within the length limit
+        let long_name = format!("{}.jpg", "x".repeat(MAX_FILENAME_LENGTH - ".jpg".len()));
+        assert!(add_hash_suffix_to_file_stem(&long_name, &hash).len() <= MAX_FILENAME_LENGTH);
+    }
+
+    #[test]
+    fn test_adding() {
+        let dir = tempdir().unwrap();
+        let dpath = dir.path();
+
+        // no existing file case
+        assert_eq!(
+            add_data_to_folder_uniquely(dpath, "test.mp3", "hello".as_bytes()).unwrap(),
+            "test.mp3"
+        );
+
+        // same contents case
+        assert_eq!(
+            add_data_to_folder_uniquely(dpath, "test.mp3", "hello".as_bytes()).unwrap(),
+            "test.mp3"
+        );
+
+        // different contents
+        assert_eq!(
+            add_data_to_folder_uniquely(dpath, "test.mp3", "hello1".as_bytes()).unwrap(),
+            "test-88fdd585121a4ccb3d1540527aee53a77c77abb8.mp3"
+        );
+
+        let mut written_files = std::fs::read_dir(dpath)
+            .unwrap()
+            .map(|d| d.unwrap().file_name().to_string_lossy().into_owned())
+            .collect::<Vec<_>>();
+        written_files.sort();
+        assert_eq!(
+            written_files,
+            vec![
+                "test-88fdd585121a4ccb3d1540527aee53a77c77abb8.mp3",
+                "test.mp3",
+            ]
+        );
+    }
+}