From 74ce5511555888ca7b5e936aa340e5c7125af388 Mon Sep 17 00:00:00 2001 From: RumovZ Date: Wed, 11 May 2022 09:49:19 +0200 Subject: [PATCH] Only allow fixed set of CSV delimiters --- proto/anki/import_export.proto | 17 ++- pylib/anki/collection.py | 5 +- qt/aqt/import_export/import_dialog.py | 57 +++++---- rslib/build/protobuf.rs | 1 + rslib/src/backend/import_export.rs | 10 +- rslib/src/import_export/text/csv/import.rs | 19 +-- rslib/src/import_export/text/csv/metadata.rs | 125 ++++++++++++------- rslib/src/import_export/text/mod.rs | 2 +- 8 files changed, 139 insertions(+), 97 deletions(-) diff --git a/proto/anki/import_export.proto b/proto/anki/import_export.proto index 186a7ee61..553dfb2bc 100644 --- a/proto/anki/import_export.proto +++ b/proto/anki/import_export.proto @@ -110,17 +110,28 @@ message ImportCsvRequest { int64 deck_id = 2; int64 notetype_id = 3; repeated CsvColumn columns = 4; - uint32 delimiter = 5; + CsvMetadata.Delimiter delimiter = 5; bool is_html = 6; } message CsvMetadataRequest { string path = 1; - optional uint32 delimiter = 2; + optional CsvMetadata.Delimiter delimiter = 2; } message CsvMetadata { - uint32 delimiter = 1; + // Order roughly in ascending expected frequency in note text, because the + // delimiter detection algorithm is stupidly picking the first one it + // encounters. + enum Delimiter { + TAB = 0; + PIPE = 1; + SEMICOLON = 2; + COLON = 3; + COMMA = 4; + SPACE = 5; + } + Delimiter delimiter = 1; string tags = 2; repeated string columns = 3; int64 deck_id = 4; diff --git a/pylib/anki/collection.py b/pylib/anki/collection.py index ea2cc049c..86795387b 100644 --- a/pylib/anki/collection.py +++ b/pylib/anki/collection.py @@ -36,6 +36,7 @@ StripHtmlMode = card_rendering_pb2.StripHtmlRequest ImportLogWithChanges = import_export_pb2.ImportResponse CsvColumn = import_export_pb2.ImportCsvRequest.CsvColumn CsvMetadata = import_export_pb2.CsvMetadata +Delimiter = import_export_pb2.CsvMetadata.Delimiter import copy import os @@ -405,7 +406,7 @@ class Collection(DeprecatedNamesMixin): request.whole_collection.SetInParent() return self._backend.export_anki_package(request) - def get_csv_metadata(self, path: str, delimiter: int | None) -> CsvMetadata: + def get_csv_metadata(self, path: str, delimiter: Delimiter.V | None) -> CsvMetadata: request = import_export_pb2.CsvMetadataRequest(path=path, delimiter=delimiter) return self._backend.get_csv_metadata(request) @@ -415,7 +416,7 @@ class Collection(DeprecatedNamesMixin): deck_id: DeckId, notetype_id: NotetypeId, columns: list[CsvColumn], - delimiter: int, + delimiter: Delimiter.V, is_html: bool, ) -> ImportLogWithChanges: return self._backend.import_csv( diff --git a/qt/aqt/import_export/import_dialog.py b/qt/aqt/import_export/import_dialog.py index fa59fa00b..ff75a5535 100644 --- a/qt/aqt/import_export/import_dialog.py +++ b/qt/aqt/import_export/import_dialog.py @@ -9,7 +9,7 @@ from typing import Optional, Sequence import aqt.forms import aqt.main -from anki.collection import CsvColumn, CsvMetadata +from anki.collection import CsvColumn, CsvMetadata, Delimiter from anki.decks import DeckId from anki.models import NotetypeDict, NotetypeId from aqt.import_export.importing import import_progress_update, show_import_log @@ -17,6 +17,15 @@ from aqt.operations import CollectionOp, QueryOp from aqt.qt import * from aqt.utils import HelpPage, disable_help_button, getText, openHelp, showWarning, tr +DELIMITERS = ( + ("\\t", tr.importing_tab(), Delimiter.TAB), + (",", tr.importing_comma(), Delimiter.COMMA), + (" ", tr.studying_space(), Delimiter.SPACE), + (";", tr.importing_semicolon(), Delimiter.SEMICOLON), + (":", tr.importing_colon(), Delimiter.COLON), + ("|", tr.importing_colon(), Delimiter.PIPE), +) + class ChangeMap(QDialog): def __init__(self, mw: aqt.main.AnkiQt, model: dict, current: str) -> None: @@ -99,7 +108,7 @@ class ImportDialog(QDialog): self._setup_choosers() self.column_map = ColumnMap(self.columns, self.model) self._render_mapping() - self._set_delimiter_button_text() + self._set_delimiter() self.frm.allowHTML.setChecked(self.is_html) self.frm.importMode.setCurrentIndex(self.mw.pm.profile.get("importMode", 1)) self.frm.tagModified.setText(self.tags) @@ -142,6 +151,13 @@ class ImportDialog(QDialog): ) self.deck = aqt.deckchooser.DeckChooser(self.mw, self.frm.deckArea, label=False) + def _set_delimiter(self) -> None: + for delimiter in DELIMITERS: + if delimiter[2] == self.delimiter: + txt = tr.importing_fields_separated_by(val=delimiter[1]) + self.frm.autoDetect.setText(txt) + return + def onDelimiter(self) -> None: # Open a modal dialog to enter an delimiter # Todo/Idea Constrain the maximum width, so it doesnt take up that much screen space @@ -154,20 +170,20 @@ class ImportDialog(QDialog): if not ok: return # Check if the entered value is valid and if not fallback to default - # at the moment every single character entry as well as '\t' is valid - delim = delim if len(delim) > 0 else "\t" - delim = delim.replace("\\t", "\t") # un-escape it - delimiter = ord(delim) - if delimiter > 255: + + txt = "" + for delimiter in DELIMITERS: + if delimiter[0] == delim: + txt = tr.importing_fields_separated_by(val=delimiter[1]) + self.delimiter = delimiter[2] + break + if not txt: showWarning( tr.importing_multicharacter_separators_are_not_supported_please() ) return - # self.hideMapping() - # self.showMapping(hook=_update) - self.delimiter = delimiter - self._set_delimiter_button_text() + self.frm.autoDetect.setText(txt) def _update_columns(options: CsvMetadata) -> None: self.columns = options.columns @@ -176,27 +192,10 @@ class ImportDialog(QDialog): QueryOp( parent=self, - op=lambda col: col.get_csv_metadata(self.path, delimiter), + op=lambda col: col.get_csv_metadata(self.path, self.delimiter), success=_update_columns, ).run_in_background() - def _set_delimiter_button_text(self) -> None: - d = chr(self.delimiter) - if d == "\t": - d = tr.importing_tab() - elif d == ",": - d = tr.importing_comma() - elif d == " ": - d = tr.studying_space() - elif d == ";": - d = tr.importing_semicolon() - elif d == ":": - d = tr.importing_colon() - else: - d = repr(d) - txt = tr.importing_fields_separated_by(val=d) - self.frm.autoDetect.setText(txt) - def accept(self) -> None: # self.mw.pm.profile["importMode"] = self.importer.importMode self.mw.pm.profile["allowHTML"] = self.frm.allowHTML.isChecked() diff --git a/rslib/build/protobuf.rs b/rslib/build/protobuf.rs index 1472b0a09..ab50afc60 100644 --- a/rslib/build/protobuf.rs +++ b/rslib/build/protobuf.rs @@ -106,6 +106,7 @@ pub fn write_backend_proto_rs() { "#[derive(strum::EnumIter)]", ) .type_attribute("HelpPageLinkRequest.HelpPage", "#[derive(strum::EnumIter)]") + .type_attribute("CsvMetadata.Delimiter", "#[derive(strum::EnumIter)]") .type_attribute( "Preferences.BackupLimits", "#[derive(Copy, serde_derive::Deserialize, serde_derive::Serialize)]", diff --git a/rslib/src/backend/import_export.rs b/rslib/src/backend/import_export.rs index fbef8b473..4bfc29f1c 100644 --- a/rslib/src/backend/import_export.rs +++ b/rslib/src/backend/import_export.rs @@ -82,18 +82,19 @@ impl ImportExportService for Backend { } fn get_csv_metadata(&self, input: pb::CsvMetadataRequest) -> Result { - let delimiter = input.delimiter.map(try_into_byte).transpose()?; + let delimiter = input.delimiter.is_some().then(|| input.delimiter()); self.with_col(|col| col.get_csv_metadata(&input.path, delimiter)) } fn import_csv(&self, input: pb::ImportCsvRequest) -> Result { self.with_col(|col| { + let delimiter = input.delimiter(); col.import_csv( &input.path, input.deck_id.into(), input.notetype_id.into(), input.columns.into_iter().map(Into::into).collect(), - try_into_byte(input.delimiter)?, + delimiter, input.is_html, ) }) @@ -150,8 +151,3 @@ impl From for Column { } } } - -fn try_into_byte(u: impl TryInto) -> Result { - u.try_into() - .map_err(|_| AnkiError::invalid_input("expected single byte")) -} diff --git a/rslib/src/import_export/text/csv/import.rs b/rslib/src/import_export/text/csv/import.rs index 40fcc559d..3c1a3767b 100644 --- a/rslib/src/import_export/text/csv/import.rs +++ b/rslib/src/import_export/text/csv/import.rs @@ -8,7 +8,10 @@ use std::{ use crate::{ import_export::{ - text::{csv::Column, ForeignData, ForeignNote}, + text::{ + csv::{metadata::Delimiter, Column}, + ForeignData, ForeignNote, + }, NoteLog, }, prelude::*, @@ -21,7 +24,7 @@ impl Collection { deck_id: DeckId, notetype_id: NotetypeId, columns: Vec, - delimiter: u8, + delimiter: Delimiter, is_html: bool, ) -> Result> { let notetype = self.get_notetype(notetype_id)?.ok_or(AnkiError::NotFound)?; @@ -44,7 +47,7 @@ fn deserialize_csv( mut reader: impl Read + Seek, columns: &[Column], fields_len: usize, - delimiter: u8, + delimiter: Delimiter, is_html: bool, ) -> Result> { remove_tags_line_from_reader(&mut reader)?; @@ -52,7 +55,7 @@ fn deserialize_csv( .has_headers(false) .flexible(true) .comment(Some(b'#')) - .delimiter(delimiter) + .delimiter(delimiter.byte()) .trim(csv::Trim::All) .from_reader(reader); deserialize_csv_reader(&mut csv_reader, columns, fields_len, is_html) @@ -150,7 +153,7 @@ mod test { struct CsvOptions { columns: Vec, fields_len: usize, - delimiter: u8, + delimiter: Delimiter, is_html: bool, } @@ -159,7 +162,7 @@ mod test { Self { columns: vec![Column::Field(0), Column::Field(1)], fields_len: 2, - delimiter: b',', + delimiter: Delimiter::Comma, is_html: false, } } @@ -179,7 +182,7 @@ mod test { self } - fn delimiter(mut self, delimiter: u8) -> Self { + fn delimiter(mut self, delimiter: Delimiter) -> Self { self.delimiter = delimiter; self } @@ -203,7 +206,7 @@ mod test { #[test] fn should_respect_custom_delimiter() { - let options = CsvOptions::new().delimiter(b'|'); + let options = CsvOptions::new().delimiter(Delimiter::Pipe); assert_imported_fields!(options, "fr,ont|ba,ck\n", &[&["fr,ont", "ba,ck"]]); } diff --git a/rslib/src/import_export/text/csv/metadata.rs b/rslib/src/import_export/text/csv/metadata.rs index 30f04386b..b516d12fa 100644 --- a/rslib/src/import_export/text/csv/metadata.rs +++ b/rslib/src/import_export/text/csv/metadata.rs @@ -6,10 +6,17 @@ use std::{ io::{BufRead, BufReader}, }; +use strum::IntoEnumIterator; + +pub use crate::backend_proto::csv_metadata::Delimiter; use crate::{backend_proto::CsvMetadata, error::ImportError, prelude::*}; impl Collection { - pub fn get_csv_metadata(&mut self, path: &str, delimiter: Option) -> Result { + pub fn get_csv_metadata( + &mut self, + path: &str, + delimiter: Option, + ) -> Result { let reader = BufReader::new(File::open(path)?); self.get_reader_metadata(reader, delimiter) } @@ -17,9 +24,13 @@ impl Collection { fn get_reader_metadata( &mut self, reader: impl BufRead, - delimiter: Option, + delimiter: Option, ) -> Result { - let mut metadata = CsvMetadata::default(); + let mut metadata = CsvMetadata { + // mark as unset + delimiter: -1, + ..Default::default() + }; let line = self.parse_meta_lines(reader, &mut metadata)?; set_delimiter(delimiter, &mut metadata, &line); set_columns(&mut metadata, &line, &self.tr)?; @@ -71,7 +82,7 @@ impl Collection { match key.trim().to_ascii_lowercase().as_str() { "delimiter" => { if let Some(delimiter) = delimter_from_value(value) { - metadata.delimiter = delimiter as u32; + metadata.delimiter = delimiter as i32; } } "tags" => metadata.tags = value.trim().to_owned(), @@ -97,8 +108,8 @@ impl Collection { } fn parse_columns(&mut self, line: &str, metadata: &mut CsvMetadata) -> Result> { - let delimiter = if metadata.delimiter != 0 { - metadata.delimiter as u8 + let delimiter = if metadata.delimiter != -1 { + metadata.delimiter() } else { delimiter_from_line(line) }; @@ -122,7 +133,7 @@ impl Collection { fn set_columns(metadata: &mut CsvMetadata, line: &str, tr: &I18n) -> Result<()> { if metadata.columns.is_empty() { - let columns = map_single_record(line, metadata.delimiter as u8, |r| r.len())?; + let columns = map_single_record(line, metadata.delimiter(), |r| r.len())?; metadata.columns = (0..columns) .map(|i| tr.importing_column(i + 1).to_string()) .collect(); @@ -130,47 +141,41 @@ fn set_columns(metadata: &mut CsvMetadata, line: &str, tr: &I18n) -> Result<()> Ok(()) } -fn set_delimiter(delimiter: Option, metadata: &mut CsvMetadata, line: &str) { +fn set_delimiter(delimiter: Option, metadata: &mut CsvMetadata, line: &str) { if let Some(delim) = delimiter { - metadata.delimiter = delim as u32; - } else if metadata.delimiter == 0 { - // XXX: should '#delimiter:[NUL]' be supported? - metadata.delimiter = delimiter_from_line(line) as u32; + metadata.set_delimiter(delim); + } else if metadata.delimiter == -1 { + metadata.set_delimiter(delimiter_from_line(line)); } } -fn delimter_from_value(value: &str) -> Option { - // FIXME: bytes like '\n', '#' and '"' will likely cause issues - Some(if value.as_bytes().len() == 1 { - value.as_bytes()[0] - } else { - match value.trim().to_ascii_lowercase().as_str() { - "tab" | "\\t" => b'\t', - "semicolon" => b';', - "comma" => b',', - "space" => b' ', - _ => return None, +fn delimter_from_value(value: &str) -> Option { + let normed = value.to_ascii_lowercase(); + for delimiter in Delimiter::iter() { + if normed.trim() == delimiter.name() || normed.as_bytes() == [delimiter.byte()] { + return Some(delimiter); } - }) + } + None } -fn delimiter_from_line(line: &str) -> u8 { +fn delimiter_from_line(line: &str) -> Delimiter { // TODO: use smarter heuristic - for byte in [b'\t', b';', b','] { - if line.contains(byte as char) { - return byte; + for delimiter in Delimiter::iter() { + if line.contains(delimiter.byte() as char) { + return delimiter; } } - b' ' + Delimiter::Space } fn map_single_record( line: &str, - delimiter: u8, + delimiter: Delimiter, op: impl FnOnce(&csv::StringRecord) -> T, ) -> Result { csv::ReaderBuilder::new() - .delimiter(delimiter) + .delimiter(delimiter.byte()) .from_reader(line.as_bytes()) .headers() .map_err(|_| AnkiError::ImportError(ImportError::Corrupt)) @@ -182,6 +187,30 @@ fn strip_line_ending(line: &str) -> &str { .unwrap_or_else(|| line.strip_suffix('\n').unwrap_or(line)) } +impl Delimiter { + pub fn byte(self) -> u8 { + match self { + Delimiter::Comma => b',', + Delimiter::Semicolon => b';', + Delimiter::Tab => b'\t', + Delimiter::Space => b' ', + Delimiter::Pipe => b'|', + Delimiter::Colon => b':', + } + } + + fn name(self) -> &'static str { + match self { + Delimiter::Comma => "comma", + Delimiter::Semicolon => "semicolon", + Delimiter::Tab => "tab", + Delimiter::Space => "space", + Delimiter::Pipe => "pipe", + Delimiter::Colon => "colon", + } + } +} + #[cfg(test)] mod test { use super::*; @@ -218,27 +247,29 @@ mod test { #[test] fn should_detect_valid_delimiters() { let mut col = open_test_collection(); - assert_eq!(metadata!(col, "#delimiter: \n").delimiter, ' ' as u32); - assert_eq!(metadata!(col, "#delimiter:space\n").delimiter, ' ' as u32); - assert_eq!(metadata!(col, "#delimiter:\t\n").delimiter, '\t' as u32); - assert_eq!(metadata!(col, "#delimiter:Tab\n").delimiter, '\t' as u32); - assert_eq!(metadata!(col, "#delimiter:;\n").delimiter, ';' as u32); assert_eq!( - metadata!(col, "#delimiter:SEMICOLON\n").delimiter, - ';' as u32 + metadata!(col, "#delimiter:comma\n").delimiter(), + Delimiter::Comma + ); + assert_eq!( + metadata!(col, "#delimiter:\t\n").delimiter(), + Delimiter::Tab ); - assert_eq!(metadata!(col, "#delimiter:,\n").delimiter, ',' as u32); - assert_eq!(metadata!(col, "#delimiter:comma\n").delimiter, ',' as u32); - assert_eq!(metadata!(col, "#delimiter:|\n").delimiter, '|' as u32); // fallback - assert_eq!(metadata!(col, "#delimiter:foo\n").delimiter, ' ' as u32); - assert_eq!(metadata!(col, "#delimiter:♥\n").delimiter, ' ' as u32); + assert_eq!( + metadata!(col, "#delimiter:foo\n").delimiter(), + Delimiter::Space + ); + assert_eq!( + metadata!(col, "#delimiter:♥\n").delimiter(), + Delimiter::Space + ); // pick up from first line - assert_eq!(metadata!(col, "foo\tbar\n").delimiter, '\t' as u32); + assert_eq!(metadata!(col, "foo\tbar\n").delimiter(), Delimiter::Tab); // override with provided assert_eq!( - metadata!(col, "#delimiter: \nfoo\tbar\n", Some(b'|')).delimiter, - '|' as u32 + metadata!(col, "#delimiter: \nfoo\tbar\n", Some(Delimiter::Pipe)).delimiter(), + Delimiter::Pipe ); } @@ -281,7 +312,7 @@ mod test { ); // override assert_eq!( - metadata!(col, "#delimiter:;\nfoo;bar\n", Some(b'|')).columns, + metadata!(col, "#delimiter:;\nfoo;bar\n", Some(Delimiter::Pipe)).columns, ["Column 1"] ); diff --git a/rslib/src/import_export/text/mod.rs b/rslib/src/import_export/text/mod.rs index 9f2f2ce6e..b0442504a 100644 --- a/rslib/src/import_export/text/mod.rs +++ b/rslib/src/import_export/text/mod.rs @@ -2,7 +2,7 @@ // License: GNU AGPL, version 3 or later; http://www.gnu.org/licenses/agpl.html pub mod csv; -pub mod import; +mod import; mod json; use serde_derive::{Deserialize, Serialize};