diff --git a/ftl/core/importing.ftl b/ftl/core/importing.ftl index 6323ca602..30cbe5c8d 100644 --- a/ftl/core/importing.ftl +++ b/ftl/core/importing.ftl @@ -8,6 +8,7 @@ importing-appeared-twice-in-file = Appeared twice in file: { $val } importing-by-default-anki-will-detect-the = By default, Anki will detect the character between fields, such as a tab, comma, and so on. If Anki is detecting the character incorrectly, you can enter it here. Use \t to represent tab. importing-change = Change importing-colon = Colon +importing-column = Column { $val } importing-comma = Comma importing-empty-first-field = Empty first field: { $val } importing-field-mapping = Field mapping diff --git a/proto/anki/import_export.proto b/proto/anki/import_export.proto index 54738c113..793338163 100644 --- a/proto/anki/import_export.proto +++ b/proto/anki/import_export.proto @@ -16,6 +16,7 @@ service ImportExportService { returns (generic.Empty); rpc ImportAnkiPackage(ImportAnkiPackageRequest) returns (ImportResponse); rpc ExportAnkiPackage(ExportAnkiPackageRequest) returns (generic.UInt32); + rpc GetCsvMetadata(CsvMetadataRequest) returns (CsvMetadata); rpc ImportCsv(ImportCsvRequest) returns (ImportResponse); rpc ImportJson(generic.String) returns (ImportResponse); } @@ -109,6 +110,20 @@ message ImportCsvRequest { int64 deck_id = 2; int64 notetype_id = 3; repeated CsvColumn columns = 4; - string delimiter = 5; + uint32 delimiter = 5; bool allow_html = 6; } + +message CsvMetadataRequest { + string path = 1; + optional uint32 delimiter = 2; +} + +message CsvMetadata { + uint32 delimiter = 1; + string tags = 2; + repeated string columns = 3; + int64 deck_id = 4; + int64 notetype_id = 5; + optional bool html = 6; +} diff --git a/pylib/anki/collection.py b/pylib/anki/collection.py index f39911f11..aefd50b35 100644 --- a/pylib/anki/collection.py +++ b/pylib/anki/collection.py @@ -410,7 +410,7 @@ class Collection(DeprecatedNamesMixin): deck_id: DeckId, notetype_id: NotetypeId, columns: list[CsvColumn], - delimiter: str, + delimiter: int, allow_html: bool, ) -> ImportLogWithChanges: return self._backend.import_csv( diff --git a/qt/aqt/import_export/import_dialog.py b/qt/aqt/import_export/import_dialog.py index 186b0d541..9e2e6056a 100644 --- a/qt/aqt/import_export/import_dialog.py +++ b/qt/aqt/import_export/import_dialog.py @@ -154,7 +154,7 @@ class ImportDialog(QDialog): d = repr(d) txt = tr.importing_fields_separated_by(val=d) self.frm.autoDetect.setText(txt) - self.delim = d + self.delim = ord(d) def accept(self) -> None: # self.mw.pm.profile["importMode"] = self.importer.importMode diff --git a/rslib/src/backend/import_export.rs b/rslib/src/backend/import_export.rs index 105deb0ca..86c13e8ef 100644 --- a/rslib/src/backend/import_export.rs +++ b/rslib/src/backend/import_export.rs @@ -81,6 +81,11 @@ impl ImportExportService for Backend { .map(Into::into) } + fn get_csv_metadata(&self, input: pb::CsvMetadataRequest) -> Result { + let delimiter = input.delimiter.map(try_into_byte).transpose()?; + self.with_col(|col| col.get_csv_metadata(&input.path, delimiter)) + } + fn import_csv(&self, input: pb::ImportCsvRequest) -> Result { self.with_col(|col| { col.import_csv( @@ -88,7 +93,7 @@ impl ImportExportService for Backend { input.deck_id.into(), input.notetype_id.into(), input.columns.into_iter().map(Into::into).collect(), - byte_from_string(&input.delimiter)?, + try_into_byte(input.delimiter)?, //input.allow_html, ) }) @@ -146,8 +151,7 @@ impl From for Column { } } -fn byte_from_string(s: &str) -> Result { - s.bytes() - .next() - .ok_or_else(|| AnkiError::invalid_input("empty string")) +fn try_into_byte(u: impl TryInto) -> Result { + u.try_into() + .map_err(|_| AnkiError::invalid_input("expected single byte")) } diff --git a/rslib/src/import_export/text/csv.rs b/rslib/src/import_export/text/csv/import.rs similarity index 96% rename from rslib/src/import_export/text/csv.rs rename to rslib/src/import_export/text/csv/import.rs index ee7e2b06f..25bfe2fba 100644 --- a/rslib/src/import_export/text/csv.rs +++ b/rslib/src/import_export/text/csv/import.rs @@ -8,19 +8,12 @@ use std::{ use crate::{ import_export::{ - text::{ForeignData, ForeignNote}, + text::{csv::Column, ForeignData, ForeignNote}, NoteLog, }, prelude::*, }; -#[derive(Debug, Clone, Copy)] -pub enum Column { - Field(usize), - Ignore, - Tags, -} - impl Collection { pub fn import_csv( &mut self, @@ -66,6 +59,8 @@ fn deserialize_csv( /// Returns a reader with the first line stripped if it starts with "tags:", /// which is allowed for historic reasons. fn reader_without_tags_line(reader: impl Read + Seek) -> Result { + // FIXME: shouldn't pass a buffered reader to csv + // https://docs.rs/csv/latest/csv/struct.ReaderBuilder.html#method.from_reader let mut buf_reader = BufReader::new(reader); let mut first_line = String::new(); buf_reader.read_line(&mut first_line)?; diff --git a/rslib/src/import_export/text/csv/metadata.rs b/rslib/src/import_export/text/csv/metadata.rs new file mode 100644 index 000000000..1bc951a4f --- /dev/null +++ b/rslib/src/import_export/text/csv/metadata.rs @@ -0,0 +1,302 @@ +// Copyright: Ankitects Pty Ltd and contributors +// License: GNU AGPL, version 3 or later; http://www.gnu.org/licenses/agpl.html + +use std::{ + fs::File, + io::{BufRead, BufReader}, +}; + +use crate::{backend_proto::CsvMetadata, error::ImportError, prelude::*}; + +impl Collection { + pub fn get_csv_metadata(&mut self, path: &str, delimiter: Option) -> Result { + let reader = BufReader::new(File::open(path)?); + self.get_reader_metadata(reader, delimiter) + } + + fn get_reader_metadata( + &mut self, + reader: impl BufRead, + delimiter: Option, + ) -> Result { + let mut metadata = CsvMetadata::default(); + let line = self.parse_meta_lines(reader, &mut metadata)?; + set_delimiter(delimiter, &mut metadata, &line); + set_columns(&mut metadata, &line, &self.tr)?; + Ok(metadata) + } + + /// Parses the meta head of the file, and returns the first content line. + fn parse_meta_lines( + &mut self, + mut reader: impl BufRead, + metadata: &mut CsvMetadata, + ) -> Result { + let mut line = String::new(); + reader.read_line(&mut line)?; + if self.parse_first_line(&line, metadata) { + line.clear(); + reader.read_line(&mut line)?; + while self.parse_line(&line, metadata) { + line.clear(); + reader.read_line(&mut line)?; + } + } + Ok(line) + } + + /// True if the line is a meta line, i.e. a comment, or starting with 'tags:'. + fn parse_first_line(&mut self, line: &str, metadata: &mut CsvMetadata) -> bool { + if let Some(tags) = line.strip_prefix("tags:") { + metadata.tags = tags.trim().to_owned(); + true + } else { + self.parse_line(line, metadata) + } + } + + /// True if the line is a comment. + fn parse_line(&mut self, line: &str, metadata: &mut CsvMetadata) -> bool { + if let Some(l) = line.strip_prefix('#') { + if let Some((key, value)) = l.split_once(':') { + self.parse_meta_value(key, strip_line_ending(value), metadata); + } + true + } else { + false + } + } + + fn parse_meta_value(&mut self, key: &str, value: &str, metadata: &mut CsvMetadata) { + match key.trim().to_ascii_lowercase().as_str() { + "delimiter" => { + if let Some(delimiter) = delimter_from_value(value) { + metadata.delimiter = delimiter as u32; + } + } + "tags" => metadata.tags = value.trim().to_owned(), + "columns" => { + if let Ok(columns) = self.parse_columns(value, metadata) { + metadata.columns = columns; + } + } + "deck" => { + if let Ok(Some(did)) = self.deck_id_for_string(value) { + metadata.deck_id = did.0; + } + } + "notetype" => { + if let Ok(Some(nt)) = self.notetype_for_string(value) { + metadata.notetype_id = nt.id.0; + } + } + "html" => metadata.html = value.to_lowercase().parse::().ok(), + + _ => (), + } + } + + fn parse_columns(&mut self, line: &str, metadata: &mut CsvMetadata) -> Result> { + let delimiter = if metadata.delimiter != 0 { + metadata.delimiter as u8 + } else { + delimiter_from_line(line) + }; + map_single_record(line, delimiter, |record| { + record + .iter() + .enumerate() + .map(|(idx, s)| self.column_label(idx, s)) + .collect() + }) + } + + fn column_label(&self, idx: usize, column: &str) -> String { + match column.trim() { + "" => self.tr.importing_column(idx + 1).to_string(), + "tags" => self.tr.editing_tags().to_string(), + s => s.to_string(), + } + } +} + +fn set_columns(metadata: &mut CsvMetadata, line: &str, tr: &I18n) -> Result<()> { + if metadata.columns.is_empty() { + let columns = map_single_record(line, metadata.delimiter as u8, |r| r.len())?; + metadata.columns = (0..columns) + .map(|i| tr.importing_column(i + 1).to_string()) + .collect(); + } + Ok(()) +} + +fn set_delimiter(delimiter: Option, metadata: &mut CsvMetadata, line: &str) { + if let Some(delim) = delimiter { + metadata.delimiter = delim as u32; + } else if metadata.delimiter == 0 { + // XXX: should '#delimiter:[NUL]' be supported? + metadata.delimiter = delimiter_from_line(line) as u32; + } +} + +fn delimter_from_value(value: &str) -> Option { + // FIXME: bytes like '\n', '#' and '"' will likely cause issues + Some(if value.as_bytes().len() == 1 { + value.as_bytes()[0] + } else { + match value.trim().to_ascii_lowercase().as_str() { + "tab" | "\\t" => b'\t', + "semicolon" => b';', + "comma" => b',', + "space" => b' ', + _ => return None, + } + }) +} + +fn delimiter_from_line(line: &str) -> u8 { + // TODO: use smarter heuristic + for byte in [b'\t', b';', b','] { + if line.contains(byte as char) { + return byte; + } + } + b' ' +} + +fn map_single_record( + line: &str, + delimiter: u8, + op: impl FnOnce(&csv::StringRecord) -> T, +) -> Result { + csv::ReaderBuilder::new() + .delimiter(delimiter) + .from_reader(line.as_bytes()) + .headers() + .map_err(|_| AnkiError::ImportError(ImportError::Corrupt)) + .map(op) +} + +fn strip_line_ending(line: &str) -> &str { + line.strip_suffix("\r\n") + .unwrap_or_else(|| line.strip_suffix('\n').unwrap_or(line)) +} + +#[cfg(test)] +mod test { + use super::*; + use crate::collection::open_test_collection; + + macro_rules! metadata { + ($col:expr,$csv:expr) => { + metadata!($col, $csv, None) + }; + ($col:expr,$csv:expr, $delim:expr) => { + $col.get_reader_metadata(BufReader::new($csv.as_bytes()), $delim) + .unwrap() + }; + } + + #[test] + fn should_detect_deck_by_name_or_id() { + let mut col = open_test_collection(); + assert_eq!(metadata!(col, "#deck:Default\n").deck_id, 1); + assert_eq!(metadata!(col, "#deck:1\n").deck_id, 1); + } + + #[test] + fn should_detect_notetype_by_name_or_id() { + let mut col = open_test_collection(); + let basic_id = col.get_notetype_by_name("Basic").unwrap().unwrap().id.0; + assert_eq!(metadata!(col, "#notetype:Basic\n").notetype_id, basic_id); + assert_eq!( + metadata!(col, &format!("#notetype:{basic_id}\n")).notetype_id, + basic_id + ); + } + + #[test] + fn should_detect_valid_delimiters() { + let mut col = open_test_collection(); + assert_eq!(metadata!(col, "#delimiter: \n").delimiter, ' ' as u32); + assert_eq!(metadata!(col, "#delimiter:space\n").delimiter, ' ' as u32); + assert_eq!(metadata!(col, "#delimiter:\t\n").delimiter, '\t' as u32); + assert_eq!(metadata!(col, "#delimiter:Tab\n").delimiter, '\t' as u32); + assert_eq!(metadata!(col, "#delimiter:;\n").delimiter, ';' as u32); + assert_eq!( + metadata!(col, "#delimiter:SEMICOLON\n").delimiter, + ';' as u32 + ); + assert_eq!(metadata!(col, "#delimiter:,\n").delimiter, ',' as u32); + assert_eq!(metadata!(col, "#delimiter:comma\n").delimiter, ',' as u32); + assert_eq!(metadata!(col, "#delimiter:|\n").delimiter, '|' as u32); + // fallback + assert_eq!(metadata!(col, "#delimiter:foo\n").delimiter, ' ' as u32); + assert_eq!(metadata!(col, "#delimiter:♥\n").delimiter, ' ' as u32); + // pick up from first line + assert_eq!(metadata!(col, "foo\tbar\n").delimiter, '\t' as u32); + // override with provided + assert_eq!( + metadata!(col, "#delimiter: \nfoo\tbar\n", Some(b'|')).delimiter, + '|' as u32 + ); + } + + #[test] + fn should_detect_valid_html_toggle() { + let mut col = open_test_collection(); + assert_eq!(metadata!(col, "#html:true\n").html, Some(true)); + assert_eq!(metadata!(col, "#html:FALSE\n").html, Some(false)); + assert_eq!(metadata!(col, "#html:maybe\n").html, None); + } + + #[test] + fn should_detect_old_and_new_style_tags() { + let mut col = open_test_collection(); + assert_eq!(&metadata!(col, "tags:foo bar\n").tags, "foo bar"); + assert_eq!(&metadata!(col, "#tags:foo bar\n").tags, "foo bar"); + // only in head + assert_eq!(&metadata!(col, "#\n#tags:foo bar\n").tags, "foo bar"); + assert_eq!(&metadata!(col, "\n#tags:foo bar\n").tags, ""); + // only on very first line + assert_eq!(&metadata!(col, "#\ntags:foo bar\n").tags, ""); + } + + #[test] + fn should_detect_column_number_and_names() { + let mut col = open_test_collection(); + // detect from line + assert_eq!( + metadata!(col, "foo;bar\n").columns, + ["Column 1", "Column 2"] + ); + // detect encoded + assert_eq!( + metadata!(col, "#delimiter:,\nfoo;bar\n").columns, + ["Column 1"] + ); + assert_eq!( + metadata!(col, "#delimiter:|\nfoo|bar\n").columns, + ["Column 1", "Column 2"] + ); + // override + assert_eq!( + metadata!(col, "#delimiter:;\nfoo;bar\n", Some(b'|')).columns, + ["Column 1"] + ); + + // custom names + assert_eq!(metadata!(col, "#columns:one,two\n").columns, ["one", "two"]); + assert_eq!( + metadata!(col, "#delimiter:|\n#columns:one|two\n").columns, + ["one", "two"] + ); + // fill in missing + assert_eq!( + metadata!(col, "#columns:one, ,two,\n").columns, + ["one", "Column 2", "two", "Column 4"] + ); + // fill in special names + assert_eq!(metadata!(col, "#columns:tags\n").columns, ["Tags"]); + } +} diff --git a/rslib/src/import_export/text/csv/mod.rs b/rslib/src/import_export/text/csv/mod.rs new file mode 100644 index 000000000..d3a6fb0dd --- /dev/null +++ b/rslib/src/import_export/text/csv/mod.rs @@ -0,0 +1,12 @@ +// Copyright: Ankitects Pty Ltd and contributors +// License: GNU AGPL, version 3 or later; http://www.gnu.org/licenses/agpl.html + +mod import; +mod metadata; + +#[derive(Debug, Clone, Copy)] +pub enum Column { + Field(usize), + Ignore, + Tags, +} diff --git a/rslib/src/import_export/text/import.rs b/rslib/src/import_export/text/import.rs index 65558451b..c517b2c8c 100644 --- a/rslib/src/import_export/text/import.rs +++ b/rslib/src/import_export/text/import.rs @@ -38,7 +38,7 @@ impl<'a> Context<'a> { let mut notetypes = HashMap::new(); notetypes.insert( String::new(), - col.get_notetype_for_string(&data.default_notetype)?, + col.notetype_for_string(&data.default_notetype)?, ); let mut deck_ids = HashMap::new(); deck_ids.insert(String::new(), col.deck_id_for_string(&data.default_deck)?); @@ -64,7 +64,7 @@ impl<'a> Context<'a> { Ok(if let Some(nt) = self.notetypes.get(¬e.notetype) { nt.clone() } else { - let nt = self.col.get_notetype_for_string(¬e.notetype)?; + let nt = self.col.notetype_for_string(¬e.notetype)?; self.notetypes.insert(note.notetype.clone(), nt.clone()); nt }) @@ -134,7 +134,7 @@ impl<'a> Context<'a> { } impl Collection { - fn deck_id_for_string(&mut self, deck: &str) -> Result> { + pub(super) fn deck_id_for_string(&mut self, deck: &str) -> Result> { if let Ok(did) = deck.parse::() { if self.get_deck(did)?.is_some() { return Ok(Some(did)); @@ -143,7 +143,7 @@ impl Collection { self.get_deck_id(deck) } - fn get_notetype_for_string(&mut self, notetype: &str) -> Result>> { + pub(super) fn notetype_for_string(&mut self, notetype: &str) -> Result>> { if let Some(nt) = self.get_notetype_for_id_string(notetype)? { Ok(Some(nt)) } else {