Map columns to fields, not the other way around

This commit is contained in:
RumovZ 2022-05-14 11:01:49 +02:00
parent 0556696b47
commit 95f1734feb
6 changed files with 298 additions and 177 deletions

View file

@ -97,22 +97,20 @@ message MediaEntries {
}
message ImportCsvRequest {
message CsvColumn {
enum Other {
IGNORE = 0;
TAGS = 1;
}
oneof variant {
uint32 field = 1;
Other other = 2;
}
// Source column indices for note data. Negative values mean n/a.
message Columns {
repeated int32 fields = 1;
int32 tags = 2;
int32 deck = 3;
int32 notetype = 4;
}
string path = 1;
int64 deck_id = 2;
int64 notetype_id = 3;
repeated CsvColumn columns = 4;
CsvMetadata.Delimiter delimiter = 5;
bool is_html = 6;
CsvMetadata.Delimiter delimiter = 4;
bool is_html = 5;
Columns columns = 6;
repeated string column_names = 7;
}
message CsvMetadataRequest {

View file

@ -6,14 +6,8 @@ use std::path::Path;
use super::{progress::Progress, Backend};
pub(super) use crate::backend_proto::importexport_service::Service as ImportExportService;
use crate::{
backend_proto::{
self as pb,
export_anki_package_request::Selector,
import_csv_request::{csv_column, CsvColumn},
},
import_export::{
package::import_colpkg, text::csv::Column, ExportProgress, ImportProgress, NoteLog,
},
backend_proto::{self as pb, export_anki_package_request::Selector},
import_export::{package::import_colpkg, ExportProgress, ImportProgress, NoteLog},
prelude::*,
search::SearchNode,
};
@ -93,9 +87,12 @@ impl ImportExportService for Backend {
&input.path,
input.deck_id.into(),
input.notetype_id.into(),
input.columns.into_iter().map(Into::into).collect(),
delimiter,
input.is_html,
input
.columns
.ok_or_else(|| AnkiError::invalid_input("missing value"))?,
input.column_names,
)
})
.map(Into::into)
@ -142,17 +139,3 @@ impl From<OpOutput<NoteLog>> for pb::ImportResponse {
}
}
}
impl From<CsvColumn> for Column {
fn from(column: CsvColumn) -> Self {
match column.variant.unwrap_or(csv_column::Variant::Other(0)) {
csv_column::Variant::Field(idx) => Column::Field(idx as usize),
csv_column::Variant::Other(i) => {
match csv_column::Other::from_i32(i).unwrap_or_default() {
csv_column::Other::Tags => Column::Tags,
csv_column::Other::Ignore => Column::Ignore,
}
}
}
}
}

View file

@ -2,35 +2,35 @@
// License: GNU AGPL, version 3 or later; http://www.gnu.org/licenses/agpl.html
use std::{
collections::HashMap,
fs::File,
io::{BufRead, BufReader, Read, Seek, SeekFrom},
};
use crate::{
backend_proto::import_export::import_csv_request::Columns as ProtoColumns,
import_export::{
text::{
csv::{metadata::Delimiter, Column},
ForeignData, ForeignNote,
},
text::{csv::metadata::Delimiter, import::NotetypeForString, ForeignData, ForeignNote},
NoteLog,
},
prelude::*,
};
impl Collection {
#[allow(clippy::too_many_arguments)]
pub fn import_csv(
&mut self,
path: &str,
deck_id: DeckId,
notetype_id: NotetypeId,
columns: Vec<Column>,
delimiter: Delimiter,
is_html: bool,
columns: ProtoColumns,
column_names: Vec<String>,
) -> Result<OpOutput<NoteLog>> {
let notetype = self.get_notetype(notetype_id)?.ok_or(AnkiError::NotFound)?;
let fields_len = notetype.fields.len();
let file = File::open(path)?;
let notes = deserialize_csv(file, &columns, fields_len, delimiter, is_html)?;
let mut ctx = ColumnContext::new(columns, column_names, is_html, self);
let notes = ctx.deserialize_csv(file, delimiter)?;
ForeignData {
// TODO: refactor to allow passing ids directly
@ -43,21 +43,170 @@ impl Collection {
}
}
fn deserialize_csv(
mut reader: impl Read + Seek,
columns: &[Column],
fields_len: usize,
delimiter: Delimiter,
is_html: bool,
) -> Result<Vec<ForeignNote>> {
remove_tags_line_from_reader(&mut reader)?;
let mut csv_reader = csv::ReaderBuilder::new()
.has_headers(false)
.flexible(true)
.comment(Some(b'#'))
.delimiter(delimiter.byte())
.from_reader(reader);
deserialize_csv_reader(&mut csv_reader, columns, fields_len, is_html)
/// Column indices for the fields of a notetype.
type FieldSourceColumns = Vec<Option<usize>>;
struct ColumnContext<'a, C: NotetypeForString> {
tags_column: Option<usize>,
deck_column: Option<usize>,
notetype_column: Option<usize>,
/// Source column indices for the fields of a notetype, identified by its
/// name or id as string. The empty string corresponds to the default notetype.
notetype_fields: HashMap<String, FieldSourceColumns>,
/// CSV column labels. Used to identify the source columns for the fields
/// of a notetype.
column_names: HashMap<String, usize>,
/// How fields are converted to strings. Used for escaping HTML if appropriate.
stringify: fn(&str) -> String,
col: &'a mut C,
}
impl<'a, C: NotetypeForString> ColumnContext<'a, C> {
fn new(
columns: ProtoColumns,
column_names: Vec<String>,
is_html: bool,
col: &'a mut C,
) -> Self {
Self {
tags_column: columns.tags.try_into().ok(),
deck_column: columns.deck.try_into().ok(),
notetype_column: columns.notetype.try_into().ok(),
notetype_fields: notetype_fields_map(&columns.fields),
column_names: column_names_map(column_names),
stringify: stringify_fn(is_html),
col,
}
}
fn deserialize_csv(
&mut self,
mut reader: impl Read + Seek,
delimiter: Delimiter,
) -> Result<Vec<ForeignNote>> {
remove_tags_line_from_reader(&mut reader)?;
let mut csv_reader = csv::ReaderBuilder::new()
.has_headers(false)
.flexible(true)
.comment(Some(b'#'))
.delimiter(delimiter.byte())
.from_reader(reader);
self.deserialize_csv_reader(&mut csv_reader)
}
fn deserialize_csv_reader(
&mut self,
reader: &mut csv::Reader<impl Read>,
) -> Result<Vec<ForeignNote>> {
reader
.records()
.into_iter()
.map(|res| {
res.map_err(Into::into)
.and_then(|record| self.foreign_note_from_record(&record))
})
.collect()
}
fn foreign_note_from_record(&mut self, record: &csv::StringRecord) -> Result<ForeignNote> {
let notetype = self.gather_notetype(record);
let deck = self.gather_deck(record);
let tags = self.gather_tags(record);
let fields = self.gather_note_fields(record, &notetype)?;
Ok(ForeignNote {
notetype,
fields,
tags,
deck,
..Default::default()
})
}
fn gather_notetype(&self, record: &csv::StringRecord) -> String {
self.notetype_column
.and_then(|i| record.get(i))
.unwrap_or_default()
.to_string()
}
fn gather_deck(&self, record: &csv::StringRecord) -> String {
self.deck_column
.and_then(|i| record.get(i))
.unwrap_or_default()
.to_string()
}
fn gather_tags(&self, record: &csv::StringRecord) -> Vec<String> {
self.tags_column
.and_then(|i| record.get(i))
.unwrap_or_default()
.split_whitespace()
.filter(|s| !s.is_empty())
.map(ToString::to_string)
.collect()
}
fn gather_note_fields(
&mut self,
record: &csv::StringRecord,
notetype: &str,
) -> Result<Vec<String>> {
let stringify = self.stringify;
Ok(self
.get_notetype_fields(notetype)?
.iter()
.map(|opt| opt.and_then(|idx| record.get(idx)).unwrap_or_default())
.map(stringify)
.collect())
}
fn get_notetype_fields(&mut self, notetype: &str) -> Result<&FieldSourceColumns> {
Ok(if self.notetype_fields.contains_key(notetype) {
// borrow checker doesn't allow to use `if let` here
// https://users.rust-lang.org/t/solved-borrow-doesnt-drop-returning-this-value-requires-that/24182
self.notetype_fields.get(notetype).unwrap()
} else {
// TODO: more specific errors
let nt = self
.col
.notetype_for_string(notetype)?
.ok_or(AnkiError::NotFound)?;
let map = self.build_notetype_fields_map(&nt);
if map[0].is_none() {
return Err(AnkiError::NotFound);
}
self.notetype_fields
.entry(notetype.to_string())
.or_insert(map)
})
}
fn build_notetype_fields_map(&mut self, notetype: &Notetype) -> FieldSourceColumns {
notetype
.fields
.iter()
.map(|f| self.column_names.get(&f.name).copied())
.collect()
}
}
fn column_names_map(column_names: Vec<String>) -> HashMap<String, usize> {
HashMap::from_iter(column_names.into_iter().enumerate().map(|(i, s)| (s, i)))
}
fn notetype_fields_map(default_fields: &[i32]) -> HashMap<String, FieldSourceColumns> {
let default_fields = default_fields.iter().map(|&i| i.try_into().ok()).collect();
let mut notetype_fields = HashMap::new();
notetype_fields.insert(String::new(), default_fields);
notetype_fields
}
fn stringify_fn(is_html: bool) -> fn(&str) -> String {
if is_html {
ToString::to_string
} else {
htmlescape::encode_minimal
}
}
/// If the reader's first line starts with "tags:", which is allowed for historic
@ -77,161 +226,150 @@ fn remove_tags_line_from_reader(reader: &mut (impl Read + Seek)) -> Result<()> {
Ok(())
}
fn deserialize_csv_reader(
reader: &mut csv::Reader<impl Read>,
columns: &[Column],
fields_len: usize,
is_html: bool,
) -> Result<Vec<ForeignNote>> {
reader
.records()
.into_iter()
.map(|res| {
res.map(|record| ForeignNote::from_record(record, columns, fields_len, is_html))
.map_err(Into::into)
})
.collect()
}
impl ForeignNote {
fn from_record(
record: csv::StringRecord,
columns: &[Column],
fields_len: usize,
is_html: bool,
) -> Self {
let mut note = Self {
fields: vec!["".to_string(); fields_len],
..Default::default()
};
for (&column, value) in columns.iter().zip(record.iter()) {
note.add_column_value(column, value, is_html);
}
note
}
fn add_column_value(&mut self, column: Column, value: &str, is_html: bool) {
match column {
Column::Ignore => (),
Column::Field(idx) => {
if let Some(field) = self.fields.get_mut(idx) {
*field = if is_html {
value.to_string()
} else {
htmlescape::encode_minimal(value)
};
}
}
Column::Tags => self.tags.extend(value.split(' ').map(ToString::to_string)),
}
}
}
#[cfg(test)]
mod test {
use std::io::Cursor;
use std::{io::Cursor, sync::Arc};
use super::*;
use crate::notetype::all_stock_notetypes;
macro_rules! import {
($options:expr, $csv:expr) => {{
let reader = Cursor::new($csv);
let delimiter = $options.delimiter;
let mut ctx = $options.ctx();
ctx.deserialize_csv(reader, delimiter).unwrap()
}};
}
macro_rules! assert_imported_fields {
($options:expr,$csv:expr, $expected:expr) => {
let reader = Cursor::new($csv);
let notes = deserialize_csv(
reader,
&$options.columns,
$options.fields_len,
$options.delimiter,
$options.is_html,
)
.unwrap();
($options:expr, $csv:expr, $expected:expr) => {
let notes = import!($options, $csv);
let fields: Vec<_> = notes.into_iter().map(|note| note.fields).collect();
assert_eq!(fields, $expected);
};
}
struct MockNotetypeForString(HashMap<String, Arc<Notetype>>);
impl NotetypeForString for MockNotetypeForString {
fn notetype_for_string(&mut self, name_or_id: &str) -> Result<Option<Arc<Notetype>>> {
Ok(self.0.get(name_or_id).cloned())
}
}
impl MockNotetypeForString {
fn new() -> Self {
Self(HashMap::from_iter(
all_stock_notetypes(&I18n::template_only())
.into_iter()
.map(|nt| (nt.name.clone(), Arc::new(nt))),
))
}
}
struct CsvOptions {
columns: Vec<Column>,
fields_len: usize,
delimiter: Delimiter,
is_html: bool,
columns: ProtoColumns,
column_names: Vec<String>,
mock_col: MockNotetypeForString,
}
impl CsvOptions {
fn new() -> Self {
Self {
columns: vec![Column::Field(0), Column::Field(1)],
fields_len: 2,
delimiter: Delimiter::Comma,
is_html: false,
columns: ProtoColumns {
tags: -1,
deck: -1,
notetype: -1,
fields: vec![0, 1],
},
column_names: vec![],
mock_col: MockNotetypeForString::new(),
}
}
fn add_column(mut self, column: Column) -> Self {
self.columns.push(column);
self
}
fn columns(mut self, columns: Vec<Column>) -> Self {
self.columns = columns;
self
}
fn fields_len(mut self, fields_len: usize) -> Self {
self.fields_len = fields_len;
self
}
fn delimiter(mut self, delimiter: Delimiter) -> Self {
self.delimiter = delimiter;
self
}
#[allow(clippy::wrong_self_convention)]
fn is_html(mut self, is_html: bool) -> Self {
self.is_html = is_html;
self
fn ctx(&mut self) -> ColumnContext<MockNotetypeForString> {
ColumnContext::new(
self.columns.clone(),
std::mem::take(&mut self.column_names),
self.is_html,
&mut self.mock_col,
)
}
}
#[test]
fn should_allow_missing_columns() {
let options = CsvOptions::new().add_column(Column::Field(2)).fields_len(4);
assert_imported_fields!(
options,
"front,back\nfoo\n",
&[&["front", "back", "", ""], &["foo", "", "", ""]]
);
let mut options = CsvOptions::new();
assert_imported_fields!(options, "foo\n", &[&["foo", ""]]);
}
#[test]
fn should_respect_custom_delimiter() {
let options = CsvOptions::new().delimiter(Delimiter::Pipe);
let mut options = CsvOptions::new();
options.delimiter = Delimiter::Pipe;
assert_imported_fields!(options, "fr,ont|ba,ck\n", &[&["fr,ont", "ba,ck"]]);
}
#[test]
fn should_ignore_first_line_starting_with_tags() {
let options = CsvOptions::new();
let mut options = CsvOptions::new();
assert_imported_fields!(options, "tags:foo\nfront,back\n", &[&["front", "back"]]);
}
#[test]
fn should_respect_column_remapping() {
let options =
CsvOptions::new().columns(vec![Column::Field(1), Column::Ignore, Column::Field(0)]);
let mut options = CsvOptions::new();
options.columns.fields = vec![2, 0];
assert_imported_fields!(options, "front,foo,back\n", &[&["back", "front"]]);
}
#[test]
fn should_ignore_lines_starting_with_number_sign() {
let options = CsvOptions::new();
let mut options = CsvOptions::new();
assert_imported_fields!(options, "#foo\nfront,back\n#bar\n", &[&["front", "back"]]);
}
#[test]
fn should_escape_html_entities_if_csv_is_html() {
assert_imported_fields!(CsvOptions::new(), "<hr>\n", &[&["&lt;hr&gt;", ""]]);
let with_html = CsvOptions::new().is_html(true);
assert_imported_fields!(with_html, "<hr>\n", &[&["<hr>", ""]]);
let mut options = CsvOptions::new();
assert_imported_fields!(options, "<hr>\n", &[&["&lt;hr&gt;", ""]]);
options.is_html = true;
assert_imported_fields!(options, "<hr>\n", &[&["<hr>", ""]]);
}
#[test]
fn should_parse_tag_column() {
let mut options = CsvOptions::new();
options.columns.tags = 2;
let notes = import!(options, "front,back,foo bar\n");
assert_eq!(notes[0].tags, &["foo", "bar"]);
}
#[test]
fn should_parse_deck_column() {
let mut options = CsvOptions::new();
options.columns.deck = 2;
let notes = import!(options, "front,back,foo bar\n");
assert_eq!(notes[0].deck, "foo bar");
}
#[test]
fn should_parse_notes_according_to_their_respective_notetypes() {
let mut options = CsvOptions::new();
options.columns.notetype = 3;
options.column_names = ["Front", "Back", "Text", "notetype"]
.iter()
.map(ToString::to_string)
.collect();
assert_imported_fields!(
options,
"front,back,Basic (and reversed card)\n,,foo,Cloze\n",
&[&["front", "back"], &["foo", ""]]
);
}
}

View file

@ -9,7 +9,10 @@ use std::{
use strum::IntoEnumIterator;
pub use crate::backend_proto::csv_metadata::Delimiter;
use crate::{backend_proto::CsvMetadata, error::ImportError, prelude::*};
use crate::{
backend_proto::CsvMetadata, error::ImportError, import_export::text::import::NotetypeForString,
prelude::*,
};
impl Collection {
pub fn get_csv_metadata(

View file

@ -3,10 +3,3 @@
mod import;
mod metadata;
#[derive(Debug, Clone, Copy)]
pub enum Column {
Field(usize),
Ignore,
Tags,
}

View file

@ -143,6 +143,20 @@ impl<'a> Context<'a> {
}
}
pub(super) trait NotetypeForString {
fn notetype_for_string(&mut self, name_or_id: &str) -> Result<Option<Arc<Notetype>>>;
}
impl NotetypeForString for Collection {
fn notetype_for_string(&mut self, name_or_id: &str) -> Result<Option<Arc<Notetype>>> {
if let Some(nt) = self.get_notetype_for_id_string(name_or_id)? {
Ok(Some(nt))
} else {
self.get_notetype_by_name(name_or_id)
}
}
}
impl Collection {
pub(super) fn deck_id_for_string(&mut self, deck: &str) -> Result<Option<DeckId>> {
if let Ok(did) = deck.parse::<DeckId>() {
@ -153,14 +167,6 @@ impl Collection {
self.get_deck_id(deck)
}
pub(super) fn notetype_for_string(&mut self, notetype: &str) -> Result<Option<Arc<Notetype>>> {
if let Some(nt) = self.get_notetype_for_id_string(notetype)? {
Ok(Some(nt))
} else {
self.get_notetype_by_name(notetype)
}
}
fn get_notetype_for_id_string(&mut self, notetype: &str) -> Result<Option<Arc<Notetype>>> {
notetype
.parse::<NotetypeId>()