gather field references in Rust; media check now mostly complete

Damien Elmes 2020-02-10 14:19:39 +10:00
parent aa832e9117
commit fabfcb0338
9 changed files with 437 additions and 55 deletions

rslib/Cargo.toml

@@ -29,6 +29,7 @@ log = "0.4.8"
serde_tuple = "0.4.0"
coarsetime = "0.1.12"
utime = "0.2.1"
serde-aux = "0.6.1"
[target.'cfg(target_vendor="apple")'.dependencies]
rusqlite = { version = "0.21.0", features = ["trace"] }

rslib/src/lib.rs

@@ -17,3 +17,5 @@ pub mod sched;
pub mod template;
pub mod template_filters;
pub mod text;
pub mod time;
pub mod types;

rslib/src/media/check.rs

@@ -2,29 +2,37 @@
// License: GNU AGPL, version 3 or later; http://www.gnu.org/licenses/agpl.html
use crate::err::{AnkiError, Result};
use crate::media::col::{
for_every_note, get_note_types, mark_collection_modified, open_or_create_collection_db,
set_note, Note,
};
use crate::media::database::MediaDatabaseContext;
use crate::media::files::{
data_for_file, filename_if_normalized, remove_files, trash_folder, MEDIA_SYNC_FILESIZE_LIMIT,
};
-use crate::media::MediaManager;
+use crate::text::{normalize_to_nfc, MediaRef};
+use crate::{media::MediaManager, text::extract_media_refs};
use coarsetime::Instant;
use log::debug;
use std::collections::{HashMap, HashSet};
use std::path::Path;
use std::{borrow::Cow, fs, time};
#[derive(Debug, PartialEq)]
pub struct MediaCheckOutput {
-files: Vec<String>,
-renamed: Vec<RenamedFile>,
+unused: Vec<String>,
+missing: Vec<String>,
+renamed: HashMap<String, String>,
dirs: Vec<String>,
oversize: Vec<String>,
}
-/// A file that was renamed due to invalid chars or non-NFC encoding.
-/// On Apple computers, files in NFD format are not renamed.
-#[derive(Debug, PartialEq)]
-pub struct RenamedFile {
-current_fname: String,
-original_fname: String,
+#[derive(Debug, PartialEq, Default)]
+struct MediaFolderCheck {
+files: Vec<String>,
+renamed: HashMap<String, String>,
+dirs: Vec<String>,
+oversize: Vec<String>,
}
pub struct MediaChecker<'a, P>
@@ -32,6 +40,7 @@ where
P: FnMut(usize) -> bool,
{
mgr: &'a MediaManager,
col_path: &'a Path,
progress_cb: P,
checked: usize,
progress_updated: Instant,
@@ -41,9 +50,14 @@ impl<P> MediaChecker<'_, P>
where
P: FnMut(usize) -> bool,
{
-pub fn new(mgr: &MediaManager, progress_cb: P) -> MediaChecker<'_, P> {
+pub fn new<'a>(
+mgr: &'a MediaManager,
+col_path: &'a Path,
+progress_cb: P,
+) -> MediaChecker<'a, P> {
MediaChecker {
mgr,
col_path,
progress_cb,
checked: 0,
progress_updated: Instant::now(),
@@ -53,12 +67,28 @@ where
pub fn check(&mut self) -> Result<MediaCheckOutput> {
self.expire_old_trash()?;
-// loop through on-disk files
-let mut dirs = vec![];
-let mut oversize = vec![];
-let mut all_files = vec![];
-let mut renamed_files = vec![];
+let mut ctx = self.mgr.dbctx();
+let folder_check = self.check_media_folder(&mut ctx)?;
+let referenced_files = self.check_media_references(&folder_check.renamed)?;
+let (unused, missing) = find_unused_and_missing(folder_check.files, referenced_files);
+Ok(MediaCheckOutput {
+unused,
+missing,
+renamed: folder_check.renamed,
+dirs: folder_check.dirs,
+oversize: folder_check.oversize,
+})
+}
+/// Check all the files in the media folder.
+///
+/// - Renames files with invalid names
+/// - Notes folders/oversized files
+/// - Gathers a list of all files
+fn check_media_folder(&mut self, ctx: &mut MediaDatabaseContext) -> Result<MediaFolderCheck> {
+let mut out = MediaFolderCheck::default();
for dentry in self.mgr.media_folder.read_dir()? {
let dentry = dentry?;
@@ -76,14 +106,14 @@ where
// skip folders
if dentry.file_type()?.is_dir() {
-dirs.push(disk_fname.to_string());
+out.dirs.push(disk_fname.to_string());
continue;
}
// ignore large files and zero byte files
let metadata = dentry.metadata()?;
if metadata.len() > MEDIA_SYNC_FILESIZE_LIMIT as u64 {
-oversize.push(disk_fname.to_string());
+out.oversize.push(disk_fname.to_string());
continue;
}
if metadata.len() == 0 {
@@ -91,23 +121,21 @@
}
// rename if required
-let (norm_name, renamed) = self.normalize_and_maybe_rename(&mut ctx, &disk_fname)?;
+let (norm_name, renamed) = self.normalize_and_maybe_rename(ctx, &disk_fname)?;
if renamed {
-renamed_files.push(RenamedFile {
-current_fname: norm_name.to_string(),
-original_fname: disk_fname.to_string(),
-})
+let orig_as_nfc = normalize_to_nfc(&disk_fname);
+// if the only difference is the unicode normalization,
+// we don't mark the file as a renamed file
+if orig_as_nfc.as_ref() != norm_name.as_ref() {
+out.renamed
+.insert(orig_as_nfc.to_string(), norm_name.to_string());
+}
}
-all_files.push(norm_name.into_owned());
+out.files.push(norm_name.into_owned());
}
-Ok(MediaCheckOutput {
-files: all_files,
-renamed: renamed_files,
-dirs,
-oversize,
-})
+Ok(out)
}
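The rename bookkeeping above hinges on Unicode normalization: the same visible filename can be stored precomposed (NFC) or decomposed (NFD), and only a genuine rename should land in the renamed map. A small illustrative snippet, not part of the commit, using the unicode-normalization crate rslib already depends on:

use unicode_normalization::UnicodeNormalization;

// same glyph ぱ, two byte representations
let precomposed = "\u{3071}"; // NFC form
let decomposed = "\u{306F}\u{309A}"; // は plus combining handakuten (NFD form)
assert_ne!(precomposed, decomposed);
// normalizing the decomposed form yields the precomposed bytes
assert_eq!(decomposed.nfc().collect::<String>(), precomposed);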
/// Returns (normalized_form, needs_rename)
@@ -182,30 +210,149 @@ where
Ok(())
}
/// Find all media references in notes, fixing as necessary.
fn check_media_references(
&mut self,
renamed: &HashMap<String, String>,
) -> Result<HashSet<String>> {
let mut db = open_or_create_collection_db(self.col_path)?;
let trx = db.transaction()?;
let mut referenced_files = HashSet::new();
let note_types = get_note_types(&trx)?;
let mut collection_modified = false;
for_every_note(&trx, |note| {
self.checked += 1;
if self.checked % 10 == 0 {
self.maybe_fire_progress_cb()?;
}
if fix_and_extract_media_refs(note, &mut referenced_files, renamed)? {
// note was modified, needs saving
set_note(
&trx,
note,
note_types
.get(&note.mid)
.ok_or_else(|| AnkiError::DBError {
info: "missing note type".to_string(),
})?,
)?;
collection_modified = true;
}
Ok(())
})?;
if collection_modified {
mark_collection_modified(&trx)?;
trx.commit()?;
}
Ok(referenced_files)
}
}
/// Returns true if note was modified.
fn fix_and_extract_media_refs(
note: &mut Note,
seen_files: &mut HashSet<String>,
renamed: &HashMap<String, String>,
) -> Result<bool> {
let mut updated = false;
for idx in 0..note.fields().len() {
let field = normalize_and_maybe_rename_files(&note.fields()[idx], renamed, seen_files);
if let Cow::Owned(field) = field {
// field was modified, need to save
note.set_field(idx, field)?;
updated = true;
}
}
Ok(updated)
}
/// Convert any filenames that are not in NFC form into NFC,
/// and update any files that were renamed on disk.
fn normalize_and_maybe_rename_files<'a>(
field: &'a str,
renamed: &HashMap<String, String>,
seen_files: &mut HashSet<String>,
) -> Cow<'a, str> {
let refs = extract_media_refs(field);
let mut field: Cow<str> = field.into();
for media_ref in refs {
// normalize fname into NFC
let mut fname = normalize_to_nfc(media_ref.fname);
// and look it up to see if it's been renamed
if let Some(new_name) = renamed.get(fname.as_ref()) {
fname = new_name.to_owned().into();
}
// if it was not in NFC or was renamed, update the field
if let Cow::Owned(ref new_name) = fname {
field = rename_media_ref_in_field(field.as_ref(), &media_ref, new_name).into();
}
// and mark this filename as having been referenced
seen_files.insert(fname.into_owned());
}
field
}
fn rename_media_ref_in_field(field: &str, media_ref: &MediaRef, new_name: &str) -> String {
let updated_tag = media_ref.full_ref.replace(media_ref.fname, new_name);
field.replace(media_ref.full_ref, &updated_tag)
}
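Because these helpers return a Cow, a field is only reallocated when one of its references actually changes; untouched fields stay borrowed. A hypothetical illustration of both paths (the filenames are invented):

use std::borrow::Cow;
use std::collections::{HashMap, HashSet};

let renamed: HashMap<String, String> =
    vec![("foo[.jpg".to_string(), "foo.jpg".to_string())]
        .into_iter()
        .collect();
let mut seen = HashSet::new();

// a reference to a renamed file is rewritten, and the new name recorded
let field = normalize_and_maybe_rename_files(r#"<img src="foo[.jpg">"#, &renamed, &mut seen);
assert_eq!(field.as_ref(), r#"<img src="foo.jpg">"#);
assert!(seen.contains("foo.jpg"));

// a field without media references comes back borrowed, with no allocation
let untouched = normalize_and_maybe_rename_files("plain text", &renamed, &mut seen);
assert!(matches!(untouched, Cow::Borrowed(_)));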
/// Returns (unused, missing)
fn find_unused_and_missing(
files: Vec<String>,
mut references: HashSet<String>,
) -> (Vec<String>, Vec<String>) {
let mut unused = vec![];
for file in files {
if !references.contains(&file) {
unused.push(file);
} else {
references.remove(&file);
}
}
(unused, references.into_iter().collect())
}
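The split is a set difference computed in both directions: files with no reference are unused, and references left over after the sweep are missing. A quick hypothetical check:

use std::collections::HashSet;

let files = vec!["a.jpg".to_string(), "b.jpg".to_string()];
let refs: HashSet<String> = vec!["b.jpg".to_string(), "c.jpg".to_string()]
    .into_iter()
    .collect();
let (unused, missing) = find_unused_and_missing(files, refs);
// a.jpg is on disk but never referenced; c.jpg is referenced but absent
assert_eq!(unused, vec!["a.jpg".to_string()]);
assert_eq!(missing, vec!["c.jpg".to_string()]);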
#[cfg(test)]
mod test {
use crate::err::Result;
-use crate::media::check::{MediaCheckOutput, MediaChecker, RenamedFile};
+use crate::media::check::{MediaCheckOutput, MediaChecker};
use crate::media::MediaManager;
use std::fs;
use std::path::PathBuf;
use tempfile::{tempdir, TempDir};
-fn common_setup() -> Result<(TempDir, MediaManager)> {
+fn common_setup() -> Result<(TempDir, MediaManager, PathBuf)> {
let dir = tempdir()?;
let media_dir = dir.path().join("media");
fs::create_dir(&media_dir)?;
let media_db = dir.path().join("media.db");
let col_path = dir.path().join("col.anki2");
fs::write(
&col_path,
&include_bytes!("../../tests/support/mediacheck.anki2")[..],
)?;
let mgr = MediaManager::new(&media_dir, media_db)?;
-Ok((dir, mgr))
+Ok((dir, mgr, col_path))
}
#[test]
fn test_media_check() -> Result<()> {
-let (_dir, mgr) = common_setup()?;
+let (_dir, mgr, col_path) = common_setup()?;
// add some test files
fs::write(&mgr.media_folder.join("zerobytes"), "")?;
@@ -214,18 +361,17 @@ mod test {
fs::write(&mgr.media_folder.join("foo[.jpg"), "foo")?;
let progress = |_n| true;
-let mut checker = MediaChecker::new(&mgr, progress);
-let mut output = checker.check()?;
-output.files.sort();
+let mut checker = MediaChecker::new(&mgr, &col_path, progress);
+let output = checker.check()?;
assert_eq!(
output,
MediaCheckOutput {
files: vec!["foo.jpg".to_string(), "normal.jpg".to_string()],
renamed: vec![RenamedFile {
current_fname: "foo.jpg".to_string(),
original_fname: "foo[.jpg".to_string()
}],
unused: vec![],
missing: vec!["ぱぱ.jpg".into()],
renamed: vec![("foo[.jpg".into(), "foo.jpg".into())]
.into_iter()
.collect(),
dirs: vec!["folder".to_string()],
oversize: vec![]
}
@@ -239,13 +385,14 @@
#[test]
fn test_unicode_normalization() -> Result<()> {
-let (_dir, mgr) = common_setup()?;
+let (_dir, mgr, col_path) = common_setup()?;
fs::write(&mgr.media_folder.join("ぱぱ.jpg"), "nfd encoding")?;
let progress = |_n| true;
-let mut checker = MediaChecker::new(&mgr, progress);
-let output = checker.check()?;
+let mut checker = MediaChecker::new(&mgr, &col_path, progress);
+let mut output = checker.check()?;
+output.missing.sort();
if cfg!(target_vendor = "apple") {
// on a Mac, the file should not have been renamed, but the returned name
@@ -253,8 +400,9 @@
assert_eq!(
output,
MediaCheckOutput {
files: vec!["ぱぱ.jpg".to_string()],
renamed: vec![],
unused: vec![],
missing: vec!["foo[.jpg".into(), "normal.jpg".into()],
renamed: Default::default(),
dirs: vec![],
oversize: vec![]
}
@@ -265,11 +413,11 @@
assert_eq!(
output,
MediaCheckOutput {
files: vec!["ぱぱ.jpg".to_string()],
renamed: vec![RenamedFile {
current_fname: "ぱぱ.jpg".to_string(),
original_fname: "ぱぱ.jpg".to_string()
}],
unused: vec![],
missing: vec!["foo[.jpg".into(), "normal.jpg".into()],
renamed: vec![("ぱぱ.jpg".into(), "ぱぱ.jpg".into())]
.into_iter()
.collect(),
dirs: vec![],
oversize: vec![]
}

rslib/src/media/col.rs (new file, 153 lines)

@@ -0,0 +1,153 @@
// Copyright: Ankitects Pty Ltd and contributors
// License: GNU AGPL, version 3 or later; http://www.gnu.org/licenses/agpl.html
/// Basic note reading/updating functionality for the media DB check.
use crate::err::{AnkiError, Result};
use crate::text::strip_html_preserving_image_filenames;
use crate::time::i64_unix_timestamp;
use crate::types::{ObjID, Timestamp, Usn};
use rusqlite::{params, Connection, Row, NO_PARAMS};
use serde_aux::field_attributes::deserialize_number_from_string;
use serde_derive::Deserialize;
use std::collections::HashMap;
use std::convert::TryInto;
use std::path::Path;
#[derive(Debug)]
pub(super) struct Note {
pub id: ObjID,
pub mid: ObjID,
pub mtime_secs: Timestamp,
pub usn: Usn,
fields: Vec<String>,
}
impl Note {
pub fn fields(&self) -> &Vec<String> {
&self.fields
}
pub fn set_field(&mut self, idx: usize, text: impl Into<String>) -> Result<()> {
if idx >= self.fields.len() {
return Err(AnkiError::invalid_input(
"field idx out of range".to_string(),
));
}
self.fields[idx] = text.into();
Ok(())
}
}
fn field_checksum(text: &str) -> u32 {
let digest = sha1::Sha1::from(text).digest().bytes();
u32::from_be_bytes(digest[..4].try_into().unwrap())
}
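In other words, the checksum is the first eight hex digits of the field's SHA1, read as a big-endian u32, matching the format the collection's csum column already uses. As a worked example (illustrative only), SHA1 of the empty string starts with da39a3ee, so:

assert_eq!(field_checksum(""), 0xda39a3ee);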
pub(super) fn open_or_create_collection_db(path: &Path) -> Result<Connection> {
let db = Connection::open(path)?;
db.pragma_update(None, "locking_mode", &"exclusive")?;
db.pragma_update(None, "page_size", &4096)?;
db.pragma_update(None, "cache_size", &(-40 * 1024))?;
db.pragma_update(None, "legacy_file_format", &false)?;
db.pragma_update(None, "journal", &"wal")?;
db.set_prepared_statement_cache_capacity(5);
Ok(db)
}
#[derive(Deserialize, Debug)]
pub(super) struct NoteType {
#[serde(deserialize_with = "deserialize_number_from_string")]
id: ObjID,
#[serde(rename = "sortf")]
sort_field_idx: u16,
}
pub(super) fn get_note_types(db: &Connection) -> Result<HashMap<ObjID, NoteType>> {
let mut stmt = db.prepare("select models from col")?;
let note_types = stmt
.query_and_then(NO_PARAMS, |row| -> Result<HashMap<ObjID, NoteType>> {
let v: HashMap<ObjID, NoteType> = serde_json::from_str(row.get_raw(0).as_str()?)?;
Ok(v)
})?
.next()
.ok_or_else(|| AnkiError::DBError {
info: "col table empty".to_string(),
})??;
Ok(note_types)
}
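The new serde-aux dependency exists for the deserialize_number_from_string attribute used above: ids inside the models JSON may appear as either numbers or strings in older collections, and the attribute accepts both. A self-contained sketch (the id value is invented):

use serde_aux::field_attributes::deserialize_number_from_string;
use serde_derive::Deserialize;

#[derive(Deserialize, Debug)]
struct NoteType {
    #[serde(deserialize_with = "deserialize_number_from_string")]
    id: i64,
    #[serde(rename = "sortf")]
    sort_field_idx: u16,
}

// id encoded as a JSON string...
let nt: NoteType = serde_json::from_str(r#"{"id": "1581300000000", "sortf": 0}"#).unwrap();
assert_eq!(nt.id, 1581300000000);
// ...and as a plain number; both forms deserialize identically
let nt: NoteType = serde_json::from_str(r#"{"id": 1581300000000, "sortf": 0}"#).unwrap();
assert_eq!(nt.id, 1581300000000);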
#[allow(dead_code)]
fn get_note(db: &Connection, nid: ObjID) -> Result<Option<Note>> {
let mut stmt = db.prepare_cached("select id, mid, mod, usn, flds from notes where id=?")?;
let note = stmt.query_and_then(params![nid], row_to_note)?.next();
note.transpose()
}
pub(super) fn for_every_note<F: FnMut(&mut Note) -> Result<()>>(
db: &Connection,
mut func: F,
) -> Result<()> {
let mut stmt = db.prepare("select id, mid, mod, usn, flds from notes")?;
for result in stmt.query_and_then(NO_PARAMS, |row| {
let mut note = row_to_note(row)?;
func(&mut note)
})? {
result?;
}
Ok(())
}
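for_every_note streams each row through the callback instead of materializing the whole notes table, and a callback error aborts the iteration. Hypothetical usage inside a fallible function, assuming db is an open collection Connection:

let mut with_images = 0;
for_every_note(&db, |note| {
    // count notes whose fields embed an image tag
    if note.fields().iter().any(|f| f.contains("<img")) {
        with_images += 1;
    }
    Ok(())
})?;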
fn row_to_note(row: &Row) -> Result<Note> {
Ok(Note {
id: row.get(0)?,
mid: row.get(1)?,
mtime_secs: row.get(2)?,
usn: row.get(3)?,
fields: row
.get_raw(4)
.as_str()?
.split('\x1f')
.map(|s| s.to_string())
.collect(),
})
}
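All of a note's fields are packed into the single flds column, separated by the ASCII unit separator 0x1f; row_to_note splits on that byte and set_note joins with it again. A round-trip sketch:

let flds = "front side\x1fback side";
let fields: Vec<String> = flds.split('\x1f').map(|s| s.to_string()).collect();
assert_eq!(fields, vec!["front side".to_string(), "back side".to_string()]);
assert_eq!(fields.join("\x1f"), flds);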
pub(super) fn set_note(db: &Connection, note: &mut Note, note_type: &NoteType) -> Result<()> {
note.mtime_secs = i64_unix_timestamp();
// hard-coded for now
note.usn = -1;
let csum = field_checksum(&note.fields()[0]);
let sort_field = strip_html_preserving_image_filenames(
note.fields()
.get(note_type.sort_field_idx as usize)
.ok_or_else(|| AnkiError::DBError {
info: "sort field out of range".to_string(),
})?,
);
let mut stmt =
db.prepare_cached("update notes set mod=?,usn=?,flds=?,sfld=?,csum=? where id=?")?;
stmt.execute(params![
note.mtime_secs,
note.usn,
note.fields().join("\x1f"),
sort_field,
csum,
note.id,
])?;
Ok(())
}
pub(super) fn mark_collection_modified(db: &Connection) -> Result<()> {
db.execute(
"update col set usn=-1, mod=?",
params![i64_unix_timestamp()],
)?;
Ok(())
}

rslib/src/media/mod.rs

@@ -11,6 +11,7 @@ use std::path::{Path, PathBuf};
pub mod changetracker;
pub mod check;
pub mod col;
pub mod database;
pub mod files;
pub mod sync;

rslib/src/text.rs

@@ -31,8 +31,33 @@ lazy_static! {
.unwrap();
static ref IMG_TAG: Regex = Regex::new(
// group 1 is filename
r#"(?i)<img[^>]+src=["']?([^"'>]+)["']?[^>]*>"#
r#"(?xsi)
# the start of the image tag
<img[^>]+src=
(?:
# 1: double-quoted filename
"
([^"]+?)
"
[^>]*>
|
# 2: single-quoted filename
'
([^']+?)
'
[^>]*>
|
# 3: unquoted filename
([^ >]+?)
(?:
# then either a space and the rest
\x20[^>]*>
|
# or the tag immediately ends
>
)
)
"#
).unwrap();
// videos are also in sound tags
@@ -106,6 +131,39 @@ pub fn extract_av_tags<'a>(text: &'a str, question_side: bool) -> (Cow<'a, str>,
(replaced_text, tags)
}
#[derive(Debug)]
pub(crate) struct MediaRef<'a> {
pub full_ref: &'a str,
pub fname: &'a str,
}
pub(crate) fn extract_media_refs(text: &str) -> Vec<MediaRef> {
let mut out = vec![];
for caps in IMG_TAG.captures_iter(text) {
out.push(MediaRef {
full_ref: caps.get(0).unwrap().as_str(),
fname: caps
.get(1)
.or_else(|| caps.get(2))
.or_else(|| caps.get(3))
.unwrap()
.as_str(),
});
}
for caps in AV_TAGS.captures_iter(text) {
if let Some(m) = caps.get(1) {
out.push(MediaRef {
full_ref: caps.get(0).unwrap().as_str(),
fname: m.as_str(),
});
}
}
out
}
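With the rewritten IMG_TAG regex, extract_media_refs picks up double-quoted, single-quoted and unquoted src attributes alike. A hypothetical check (filenames invented):

let refs = extract_media_refs(
    r#"<img src="a.jpg"><img src='b.jpg'><img src=c.jpg class=big>"#,
);
let names: Vec<_> = refs.iter().map(|r| r.fname).collect();
assert_eq!(names, vec!["a.jpg", "b.jpg", "c.jpg"]);
assert_eq!(refs[0].full_ref, r#"<img src="a.jpg">"#);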
fn tts_tag_from_string<'a>(field_text: &'a str, args: &'a str) -> AVTag {
let mut other_args = vec![];
let mut split_args = args.split_ascii_whitespace();
@@ -141,7 +199,7 @@ fn tts_tag_from_string<'a>(field_text: &'a str, args: &'a str) -> AVTag {
}
pub fn strip_html_preserving_image_filenames(html: &str) -> Cow<str> {
-let without_fnames = IMG_TAG.replace_all(html, r" $1 ");
+let without_fnames = IMG_TAG.replace_all(html, r" ${1}${2}${3} ");
let without_html = HTML.replace_all(&without_fnames, "");
// no changes?
if let Cow::Borrowed(b) = without_html {
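The replacement string changes because the rewritten regex has three alternated capture groups, of which exactly one matches any given tag; in the regex crate an unmatched group expands to the empty string, so naming all three keeps whichever one fired, and the braced ${n} form keeps the group references unambiguous. A tiny illustrative example:

let re = regex::Regex::new(r"a(1)|b(2)").unwrap();
// group 2 is empty for the first match, group 1 for the second
assert_eq!(re.replace_all("a1 b2", "${1}${2}"), "1 2");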
@@ -157,7 +215,6 @@ pub(crate) fn contains_latex(text: &str) -> bool {
LATEX.is_match(text)
}
-#[allow(dead_code)]
pub(crate) fn normalize_to_nfc(s: &str) -> Cow<str> {
if !is_nfc(s) {
s.chars().nfc().collect::<String>().into()

rslib/src/time.rs (new file, 11 lines)

@@ -0,0 +1,11 @@
// Copyright: Ankitects Pty Ltd and contributors
// License: GNU AGPL, version 3 or later; http://www.gnu.org/licenses/agpl.html
use std::time;
pub(crate) fn i64_unix_timestamp() -> i64 {
time::SystemTime::now()
.duration_since(time::SystemTime::UNIX_EPOCH)
.unwrap()
.as_secs() as i64
}

rslib/src/types.rs (new file, 9 lines)

@@ -0,0 +1,9 @@
// Copyright: Ankitects Pty Ltd and contributors
// License: GNU AGPL, version 3 or later; http://www.gnu.org/licenses/agpl.html
// while Anki tends to only use positive numbers, sqlite only supports
// signed integers, so these numbers are signed as well.
pub type ObjID = i64;
pub type Usn = i32;
pub type Timestamp = i64;

rslib/tests/support/mediacheck.anki2 (new binary file, not shown)