Deck-scoped dupe check (#2372)

* Support limiting dupe check to deck

* Expose deck-limited dupe check on frontend

* Make CSV dupe options configurable with headers

* Rename duplicate file headers

* Change dupe check limit to enum
RumovZ 2023-02-16 08:53:36 +01:00 committed by GitHub
parent b4290fbe44
commit 5a53da23ca
16 changed files with 275 additions and 40 deletions

View file

@@ -106,6 +106,8 @@ importing-update = Update
importing-tag-all-notes = Tag all notes
importing-tag-updated-notes = Tag updated notes
importing-file = File
importing-match-scope = Match scope
importing-notetype-and-deck = Notetype and deck
## NO NEED TO TRANSLATE. This text is no longer used by Anki, and will be removed in the future.

View file

@@ -122,8 +122,8 @@ message CsvMetadataRequest {
message CsvMetadata {
enum DupeResolution {
UPDATE = 0;
IGNORE = 1;
ADD = 2;
PRESERVE = 1;
DUPLICATE = 2;
// UPDATE_IF_NEWER = 3;
}
// Order roughly in ascending expected frequency in note text, because the
@@ -161,6 +161,10 @@ message CsvMetadata {
// One-based. 0 means n/a.
uint32 notetype_column = 9;
}
enum MatchScope {
NOTETYPE = 0;
NOTETYPE_AND_DECK = 1;
}
// One-based. 0 means n/a.
uint32 tags_column = 10;
bool force_delimiter = 11;
@@ -168,6 +172,7 @@ message CsvMetadata {
repeated generic.StringList preview = 13;
uint32 guid_column = 14;
DupeResolution dupe_resolution = 15;
MatchScope match_scope = 16;
}
message ExportCardCsvRequest {
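
On the Rust side these proto changes generate plain i32 fields with enum accessors; a construction sketch (not part of the commit, field and accessor names as generated by prost):

    let metadata = CsvMetadata {
        dupe_resolution: DupeResolution::Preserve as i32,
        match_scope: MatchScope::NotetypeAndDeck as i32,
        ..Default::default()
    };
    assert_eq!(metadata.match_scope(), MatchScope::NotetypeAndDeck);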

View file

@@ -273,10 +273,10 @@ class LogQueue:
def first_field_queue(log: ImportLogWithChanges.Log) -> LogQueue:
if log.dupe_resolution == DupeResolution.ADD:
if log.dupe_resolution == DupeResolution.DUPLICATE:
summary_template = tr.importing_added_duplicate_with_first_field
action_string = tr.adding_added()
elif log.dupe_resolution == DupeResolution.IGNORE:
elif log.dupe_resolution == DupeResolution.PRESERVE:
summary_template = tr.importing_first_field_matched
action_string = tr.importing_skipped()
else:

View file

@@ -118,6 +118,10 @@ pub fn write_backend_proto_rs() {
"CsvMetadata.DupeResolution",
"#[derive(serde_derive::Deserialize, serde_derive::Serialize)]",
)
.type_attribute(
"CsvMetadata.MatchScope",
"#[derive(serde_derive::Deserialize, serde_derive::Serialize)]",
)
.compile_protos(paths.as_slice(), &[proto_dir])
.unwrap();
}

View file

@@ -9,6 +9,7 @@ use crate::prelude::*;
#[strum(serialize_all = "camelCase")]
pub enum I32ConfigKey {
CsvDuplicateResolution,
MatchScope,
}
impl Collection {
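
The new key is read and written through the existing i32 config helpers; a round-trip sketch (not part of the commit, using only calls that appear elsewhere in this diff):

    col.set_config_i32_inner(I32ConfigKey::MatchScope, MatchScope::NotetypeAndDeck as i32)?;
    let scope = MatchScope::from_i32(col.get_config_i32(I32ConfigKey::MatchScope))
        .unwrap_or_default(); // unrecognised values fall back to MatchScope::Notetype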

View file

@@ -28,21 +28,28 @@ impl Collection {
progress_fn: impl 'static + FnMut(ImportProgress, bool) -> bool,
) -> Result<OpOutput<NoteLog>> {
let file = open_file(path)?;
let default_deck = metadata.deck()?.name_or_id();
let default_notetype = metadata.notetype()?.name_or_id();
let mut ctx = ColumnContext::new(&metadata)?;
let notes = ctx.deserialize_csv(file, metadata.delimiter())?;
let mut data = ForeignData::from(metadata);
data.notes = notes;
data.import(self, progress_fn)
}
}
impl From<CsvMetadata> for ForeignData {
fn from(metadata: CsvMetadata) -> Self {
ForeignData {
dupe_resolution: metadata.dupe_resolution(),
default_deck,
default_notetype,
notes,
match_scope: metadata.match_scope(),
default_deck: metadata.deck().map(|d| d.name_or_id()).unwrap_or_default(),
default_notetype: metadata
.notetype()
.map(|nt| nt.name_or_id())
.unwrap_or_default(),
global_tags: metadata.global_tags,
updated_tags: metadata.updated_tags,
..Default::default()
}
.import(self, progress_fn)
}
}
@@ -289,6 +296,7 @@ mod test {
})),
preview: Vec::new(),
dupe_resolution: 0,
match_scope: 0,
}
}
}

View file

@@ -23,6 +23,7 @@ pub use crate::pb::import_export::csv_metadata::Deck as CsvDeck;
pub use crate::pb::import_export::csv_metadata::Delimiter;
pub use crate::pb::import_export::csv_metadata::DupeResolution;
pub use crate::pb::import_export::csv_metadata::MappedNotetype;
pub use crate::pb::import_export::csv_metadata::MatchScope;
pub use crate::pb::import_export::csv_metadata::Notetype as CsvNotetype;
pub use crate::pb::import_export::CsvMetadata;
use crate::prelude::*;
@@ -54,14 +55,7 @@ impl Collection {
notetype_id: Option<NotetypeId>,
is_html: Option<bool>,
) -> Result<CsvMetadata> {
let dupe_resolution =
DupeResolution::from_i32(self.get_config_i32(I32ConfigKey::CsvDuplicateResolution))
.map(|r| r as i32)
.unwrap_or_default();
let mut metadata = CsvMetadata {
dupe_resolution,
..Default::default()
};
let mut metadata = CsvMetadata::from_config(self);
let meta_len = self.parse_meta_lines(&mut reader, &mut metadata)? as u64;
maybe_set_fallback_delimiter(delimiter, &mut metadata, &mut reader, meta_len)?;
let records = collect_preview_records(&mut metadata, reader)?;
@@ -168,6 +162,16 @@ impl Collection {
metadata.guid_column = n;
}
}
"match scope" => {
if let Some(scope) = MatchScope::from_text(value) {
metadata.match_scope = scope as i32;
}
}
"if matches" => {
if let Some(resolution) = DupeResolution::from_text(value) {
metadata.dupe_resolution = resolution as i32;
}
}
_ => (),
}
}
@@ -244,6 +248,46 @@ impl Collection {
}
}
impl CsvMetadata {
/// Defaults with config values filled in.
fn from_config(col: &Collection) -> Self {
Self {
dupe_resolution: DupeResolution::from_config(col) as i32,
match_scope: MatchScope::from_config(col) as i32,
..Default::default()
}
}
}
impl DupeResolution {
fn from_config(col: &Collection) -> Self {
Self::from_i32(col.get_config_i32(I32ConfigKey::CsvDuplicateResolution)).unwrap_or_default()
}
fn from_text(text: &str) -> Option<Self> {
match text {
"update current" => Some(Self::Update),
"keep current" => Some(Self::Preserve),
"keep both" => Some(Self::Duplicate),
_ => None,
}
}
}
impl MatchScope {
fn from_config(col: &Collection) -> Self {
Self::from_i32(col.get_config_i32(I32ConfigKey::MatchScope)).unwrap_or_default()
}
fn from_text(text: &str) -> Option<Self> {
match text {
"notetype" => Some(Self::Notetype),
"notetype + deck" => Some(Self::NotetypeAndDeck),
_ => None,
}
}
}
fn parse_columns(line: &str, delimiter: Delimiter) -> Result<Vec<String>> {
map_single_record(line, delimiter, |record| {
record.iter().map(ToString::to_string).collect()
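
These parsers back the two new file headers; assuming the usual #key:value metadata-line syntax handled by parse_meta_lines, a line like "#match scope:notetype + deck" or "#if matches:keep current" maps as in this sketch:

    assert_eq!(MatchScope::from_text("notetype + deck"), Some(MatchScope::NotetypeAndDeck));
    assert_eq!(DupeResolution::from_text("keep current"), Some(DupeResolution::Preserve));
    // unrecognised values return None and leave the config-derived default untouched
    assert_eq!(DupeResolution::from_text("something else"), None);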

View file

@@ -16,6 +16,7 @@ use crate::import_export::text::ForeignData;
use crate::import_export::text::ForeignNote;
use crate::import_export::text::ForeignNotetype;
use crate::import_export::text::ForeignTemplate;
use crate::import_export::text::MatchScope;
use crate::import_export::ImportProgress;
use crate::import_export::IncrementableProgress;
use crate::import_export::NoteLog;
@@ -37,10 +38,7 @@ impl ForeignData {
let mut progress = IncrementableProgress::new(progress_fn);
progress.call(ImportProgress::File)?;
col.transact(Op::Import, |col| {
col.set_config_i32_inner(
I32ConfigKey::CsvDuplicateResolution,
self.dupe_resolution as i32,
)?;
self.update_config(col)?;
let mut ctx = Context::new(&self, col)?;
ctx.import_foreign_notetypes(self.notetypes)?;
ctx.import_foreign_notes(
@@ -51,6 +49,15 @@ impl ForeignData {
)
})
}
fn update_config(&self, col: &mut Collection) -> Result<()> {
col.set_config_i32_inner(
I32ConfigKey::CsvDuplicateResolution,
self.dupe_resolution as i32,
)?;
col.set_config_i32_inner(I32ConfigKey::MatchScope, self.match_scope as i32)?;
Ok(())
}
}
impl NoteLog {
@@ -73,7 +80,7 @@ struct Context<'a> {
today: u32,
dupe_resolution: DupeResolution,
card_gen_ctxs: HashMap<(NotetypeId, DeckId), CardGenContext<Arc<Notetype>>>,
existing_checksums: HashMap<(NotetypeId, u32), Vec<NoteId>>,
existing_checksums: ExistingChecksums,
existing_guids: HashMap<String, NoteId>,
}
@@ -83,6 +90,37 @@ struct DeckIdsByNameOrId {
default: Option<DeckId>,
}
/// Notes in the collection indexed by notetype, checksum and optionally deck.
/// With deck, a note will be included in as many entries as its cards
/// have different original decks.
#[derive(Debug)]
enum ExistingChecksums {
ByNotetype(HashMap<(NotetypeId, u32), Vec<NoteId>>),
ByNotetypeAndDeck(HashMap<(NotetypeId, u32, DeckId), Vec<NoteId>>),
}
impl ExistingChecksums {
fn new(col: &mut Collection, match_scope: MatchScope) -> Result<Self> {
match match_scope {
MatchScope::Notetype => col
.storage
.all_notes_by_type_and_checksum()
.map(Self::ByNotetype),
MatchScope::NotetypeAndDeck => col
.storage
.all_notes_by_type_checksum_and_deck()
.map(Self::ByNotetypeAndDeck),
}
}
fn get(&self, notetype: NotetypeId, checksum: u32, deck: DeckId) -> Option<&Vec<NoteId>> {
match self {
Self::ByNotetype(map) => map.get(&(notetype, checksum)),
Self::ByNotetypeAndDeck(map) => map.get(&(notetype, checksum, deck)),
}
}
}
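
// Sketch, not part of this commit: what the two scopes change in practice. With
// MatchScope::Notetype the deck id passed to get() is ignored; with
// NotetypeAndDeck it is part of the key, so an identical first field in another
// deck is no longer reported as a duplicate.
fn has_dupe_candidates(
    existing: &ExistingChecksums,
    notetype: NotetypeId,
    checksum: u32,
    deck: DeckId,
) -> bool {
    // hypothetical helper mirroring how find_duplicates() calls get() below
    existing.get(notetype, checksum, deck).map_or(false, |nids| !nids.is_empty())
}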
struct NoteContext<'a> {
note: ForeignNote,
dupes: Vec<Duplicate>,
@@ -152,7 +190,7 @@ impl<'a> Context<'a> {
col.notetype_by_name_or_id(&data.default_notetype)?,
);
let deck_ids = DeckIdsByNameOrId::new(col, &data.default_deck)?;
let existing_checksums = col.storage.all_notes_by_type_and_checksum()?;
let existing_checksums = ExistingChecksums::new(col, data.match_scope)?;
let existing_guids = col.storage.all_notes_by_guid()?;
Ok(Self {
@@ -247,7 +285,7 @@ impl<'a> Context<'a> {
updated_tags: &'tags [String],
) -> Result<NoteContext<'tags>> {
self.prepare_foreign_note(&mut note)?;
let dupes = self.find_duplicates(&notetype, &note)?;
let dupes = self.find_duplicates(&notetype, &note, deck_id)?;
Ok(NoteContext {
note,
dupes,
@@ -263,11 +301,16 @@ impl<'a> Context<'a> {
self.col.canonify_foreign_tags(note, self.usn)
}
fn find_duplicates(&self, notetype: &Notetype, note: &ForeignNote) -> Result<Vec<Duplicate>> {
fn find_duplicates(
&self,
notetype: &Notetype,
note: &ForeignNote,
deck_id: DeckId,
) -> Result<Vec<Duplicate>> {
if note.guid.is_empty() {
if let Some(nids) = note
.checksum()
.and_then(|csum| self.existing_checksums.get(&(notetype.id, csum)))
.and_then(|csum| self.existing_checksums.get(notetype.id, csum, deck_id))
{
return self.get_first_field_dupes(note, nids);
}
@@ -297,15 +340,15 @@ impl<'a> Context<'a> {
fn import_note(&mut self, ctx: NoteContext, log: &mut NoteLog) -> Result<()> {
match self.dupe_resolution {
_ if !ctx.is_dupe() => self.add_note(ctx, log)?,
DupeResolution::Add if ctx.is_guid_dupe() => {
DupeResolution::Duplicate if ctx.is_guid_dupe() => {
log.duplicate.push(ctx.note.into_log_note())
}
DupeResolution::Add if !ctx.has_first_field() => {
DupeResolution::Duplicate if !ctx.has_first_field() => {
log.empty_first_field.push(ctx.note.into_log_note())
}
DupeResolution::Add => self.add_note(ctx, log)?,
DupeResolution::Duplicate => self.add_note(ctx, log)?,
DupeResolution::Update => self.update_with_note(ctx, log)?,
DupeResolution::Ignore => log.first_field_match.push(ctx.note.into_log_note()),
DupeResolution::Preserve => log.first_field_match.push(ctx.note.into_log_note()),
}
Ok(())
}
@@ -588,6 +631,8 @@ impl ForeignTemplate {
mod test {
use super::*;
use crate::collection::open_test_collection;
use crate::tests::DeckAdder;
use crate::tests::NoteAdder;
impl ForeignData {
fn with_defaults() -> Self {
@@ -611,7 +656,7 @@ mod test {
let mut col = open_test_collection();
let mut data = ForeignData::with_defaults();
data.add_note(&["same", "old"]);
data.dupe_resolution = DupeResolution::Add;
data.dupe_resolution = DupeResolution::Duplicate;
data.clone().import(&mut col, |_, _| true).unwrap();
data.import(&mut col, |_, _| true).unwrap();
@@ -623,7 +668,7 @@ mod test {
let mut col = open_test_collection();
let mut data = ForeignData::with_defaults();
data.add_note(&["same", "old"]);
data.dupe_resolution = DupeResolution::Ignore;
data.dupe_resolution = DupeResolution::Preserve;
data.clone().import(&mut col, |_, _| true).unwrap();
assert_eq!(col.storage.notes_table_len(), 1);
@@ -711,4 +756,27 @@ mod test {
data.import(&mut col, |_, _| true).unwrap();
assert_eq!(col.storage.get_all_notes()[0].tags, ["bar", "baz"]);
}
#[test]
fn should_only_update_duplicates_in_same_deck_if_limit_is_enabled() {
let mut col = open_test_collection();
let other_deck_id = DeckAdder::new("other").add(&mut col).id;
NoteAdder::basic(&mut col)
.fields(&["foo", "old"])
.add(&mut col);
NoteAdder::basic(&mut col)
.fields(&["foo", "old"])
.deck(other_deck_id)
.add(&mut col);
let mut data = ForeignData::with_defaults();
data.match_scope = MatchScope::NotetypeAndDeck;
data.add_note(&["foo", "new"]);
data.import(&mut col, |_, _| true).unwrap();
let notes = col.storage.get_all_notes();
// same deck, should be updated
assert_eq!(notes[0].fields()[1], "new");
// other deck, should be unchanged
assert_eq!(notes[1].fields()[1], "old");
}
}
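
Given the same setup as the test above, a Preserve variant would be expected to leave both existing notes untouched (sketch, not part of this commit):

    let mut data = ForeignData::with_defaults();
    data.dupe_resolution = DupeResolution::Preserve;
    data.match_scope = MatchScope::NotetypeAndDeck;
    data.add_note(&["foo", "new"]);
    data.import(&mut col, |_, _| true).unwrap();
    // the in-deck match is logged under first_field_match instead of being updated,
    // and the note in the other deck is never treated as a duplicate at all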

View file

@@ -10,11 +10,13 @@ use serde_derive::Serialize;
use super::LogNote;
use crate::pb::import_export::csv_metadata::DupeResolution;
use crate::pb::import_export::csv_metadata::MatchScope;
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
#[serde(default)]
pub struct ForeignData {
dupe_resolution: DupeResolution,
match_scope: MatchScope,
default_deck: NameOrId,
default_notetype: NameOrId,
notes: Vec<ForeignNote>,
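
Since the struct carries #[serde(default)], serialized payloads that predate the new field still deserialize; match_scope then falls back to the enum default, i.e. the old deck-agnostic behaviour (sketch):

    assert_eq!(ForeignData::default().match_scope, MatchScope::Notetype);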

View file

@@ -7,15 +7,11 @@ use std::collections::HashSet;
use rusqlite::params;
use rusqlite::Row;
use crate::error::Result;
use crate::import_export::package::NoteMeta;
use crate::notes::Note;
use crate::notes::NoteId;
use crate::notes::NoteTags;
use crate::notetype::NotetypeId;
use crate::prelude::*;
use crate::tags::join_tags;
use crate::tags::split_tags;
use crate::timestamp::TimestampMillis;
pub(crate) fn split_fields(fields: &str) -> Vec<String> {
fields.split('\x1f').map(Into::into).collect()
@@ -195,6 +191,22 @@ impl super::SqliteStorage {
Ok(map)
}
pub(crate) fn all_notes_by_type_checksum_and_deck(
&self,
) -> Result<HashMap<(NotetypeId, u32, DeckId), Vec<NoteId>>> {
let mut map = HashMap::new();
let mut stmt = self
.db
.prepare(include_str!("notes_types_checksums_decks.sql"))?;
let mut rows = stmt.query([])?;
while let Some(row) = rows.next()? {
map.entry((row.get(1)?, row.get(2)?, row.get(3)?))
.or_insert_with(Vec::new)
.push(row.get(0)?);
}
Ok(map)
}
/// Return total number of notes. Slow.
pub(crate) fn total_notes(&self) -> Result<u32> {
self.db

View file

@@ -0,0 +1,9 @@
SELECT DISTINCT notes.id,
notes.mid,
notes.csum,
CASE
WHEN cards.odid = 0 THEN cards.did
ELSE cards.odid
END AS did
FROM notes
JOIN cards ON notes.id = cards.nid
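
The CASE folds cards that currently sit in a filtered deck back to their home deck (cards.odid holds the original deck id and is 0 otherwise); in Rust terms the choice is roughly this sketch:

    // sketch only: the deck a card is matched under for dupe checking
    fn match_deck(did: DeckId, odid: DeckId) -> DeckId {
        if odid.0 == 0 {
            did
        } else {
            odid
        }
    }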

View file

@@ -173,3 +173,37 @@ impl DeckAdder {
deck
}
}
#[derive(Debug, Clone)]
pub(crate) struct NoteAdder {
note: Note,
deck: DeckId,
}
impl NoteAdder {
pub(crate) fn new(col: &mut Collection, notetype: &str) -> Self {
Self {
note: col.new_note(notetype),
deck: DeckId(1),
}
}
pub(crate) fn basic(col: &mut Collection) -> Self {
Self::new(col, "basic")
}
pub(crate) fn fields(mut self, fields: &[&str]) -> Self {
*self.note.fields_mut() = fields.iter().map(ToString::to_string).collect();
self
}
pub(crate) fn deck(mut self, deck: DeckId) -> Self {
self.deck = deck;
self
}
pub(crate) fn add(mut self, col: &mut Collection) -> Note {
col.add_note(&mut self.note, self.deck).unwrap();
self.note
}
}
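
Typical usage in a test, mirroring the deck-scoped import test above:

    let other_deck_id = DeckAdder::new("other").add(&mut col).id;
    let note = NoteAdder::basic(&mut col)
        .fields(&["front", "back"])
        .deck(other_deck_id)
        .add(&mut col);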

View file

@@ -0,0 +1,41 @@
<!--
Copyright: Ankitects Pty Ltd and contributors
License: GNU AGPL, version 3 or later; http://www.gnu.org/licenses/agpl.html
-->
<script lang="ts">
import * as tr from "@tslib/ftl";
import { ImportExport } from "@tslib/proto";
import Col from "../components/Col.svelte";
import Row from "../components/Row.svelte";
import Select from "../components/Select.svelte";
import SelectOption from "../components/SelectOption.svelte";
export let matchScope: ImportExport.CsvMetadata.MatchScope;
const matchScopes = [
{
value: ImportExport.CsvMetadata.MatchScope.NOTETYPE,
label: tr.notetypesNotetype(),
},
{
value: ImportExport.CsvMetadata.MatchScope.NOTETYPE_AND_DECK,
label: tr.importingNotetypeAndDeck(),
},
];
$: label = matchScopes.find((r) => r.value === matchScope)?.label;
</script>
<Row --cols={2}>
<Col --col-size={1}>
{tr.importingMatchScope()}
</Col>
<Col --col-size={1}>
<Select bind:value={matchScope} {label}>
{#each matchScopes as { label, value }}
<SelectOption {value}>{label}</SelectOption>
{/each}
</Select>
</Col>
</Row>

View file

@@ -19,11 +19,11 @@ License: GNU AGPL, version 3 or later; http://www.gnu.org/licenses/agpl.html
label: tr.importingUpdate(),
},
{
value: ImportExport.CsvMetadata.DupeResolution.ADD,
value: ImportExport.CsvMetadata.DupeResolution.DUPLICATE,
label: tr.importingDuplicate(),
},
{
value: ImportExport.CsvMetadata.DupeResolution.IGNORE,
value: ImportExport.CsvMetadata.DupeResolution.PRESERVE,
label: tr.importingPreserve(),
},
];

View file

@@ -11,6 +11,7 @@ License: GNU AGPL, version 3 or later; http://www.gnu.org/licenses/agpl.html
import Container from "../components/Container.svelte";
import Row from "../components/Row.svelte";
import Spacer from "../components/Spacer.svelte";
import DeckDupeCheckSwitch from "./DeckDupeCheckSwitch.svelte";
import DeckSelector from "./DeckSelector.svelte";
import DelimiterSelector from "./DelimiterSelector.svelte";
import DupeResolutionSelector from "./DupeResolutionSelector.svelte";
@@ -27,6 +28,7 @@ License: GNU AGPL, version 3 or later; http://www.gnu.org/licenses/agpl.html
export let notetypeNameIds: Notetypes.NotetypeNameId[];
export let deckNameIds: Decks.DeckNameId[];
export let dupeResolution: ImportExport.CsvMetadata.DupeResolution;
export let matchScope: ImportExport.CsvMetadata.MatchScope;
export let delimiter: ImportExport.CsvMetadata.Delimiter;
export let forceDelimiter: boolean;
@@ -73,6 +75,7 @@ License: GNU AGPL, version 3 or later; http://www.gnu.org/licenses/agpl.html
path,
metadata: ImportExport.CsvMetadata.create({
dupeResolution,
matchScope,
delimiter,
forceDelimiter,
isHtml,
@@ -119,6 +122,7 @@ License: GNU AGPL, version 3 or later; http://www.gnu.org/licenses/agpl.html
<DeckSelector {deckNameIds} bind:deckId />
{/if}
<DupeResolutionSelector bind:dupeResolution />
<DeckDupeCheckSwitch bind:matchScope />
<Tags bind:globalTags bind:updatedTags />
</Container>
</Col>

View file

@@ -48,6 +48,7 @@ export async function setupImportCsvPage(path: string): Promise<ImportCsvPage> {
deckNameIds: decks.entries,
notetypeNameIds: notetypes.entries,
dupeResolution: metadata.dupeResolution,
matchScope: metadata.matchScope,
delimiter: metadata.delimiter,
forceDelimiter: metadata.forceDelimiter,
isHtml: metadata.isHtml,