Only allow fixed set of CSV delimiters

This commit is contained in:
RumovZ 2022-05-11 09:49:19 +02:00
parent c565f77c23
commit 74ce551155
8 changed files with 139 additions and 97 deletions

View file

@ -110,17 +110,28 @@ message ImportCsvRequest {
int64 deck_id = 2; int64 deck_id = 2;
int64 notetype_id = 3; int64 notetype_id = 3;
repeated CsvColumn columns = 4; repeated CsvColumn columns = 4;
uint32 delimiter = 5; CsvMetadata.Delimiter delimiter = 5;
bool is_html = 6; bool is_html = 6;
} }
message CsvMetadataRequest { message CsvMetadataRequest {
string path = 1; string path = 1;
optional uint32 delimiter = 2; optional CsvMetadata.Delimiter delimiter = 2;
} }
message CsvMetadata { message CsvMetadata {
uint32 delimiter = 1; // Order roughly in ascending expected frequency in note text, because the
// delimiter detection algorithm is stupidly picking the first one it
// encounters.
enum Delimiter {
TAB = 0;
PIPE = 1;
SEMICOLON = 2;
COLON = 3;
COMMA = 4;
SPACE = 5;
}
Delimiter delimiter = 1;
string tags = 2; string tags = 2;
repeated string columns = 3; repeated string columns = 3;
int64 deck_id = 4; int64 deck_id = 4;

View file

@ -36,6 +36,7 @@ StripHtmlMode = card_rendering_pb2.StripHtmlRequest
ImportLogWithChanges = import_export_pb2.ImportResponse ImportLogWithChanges = import_export_pb2.ImportResponse
CsvColumn = import_export_pb2.ImportCsvRequest.CsvColumn CsvColumn = import_export_pb2.ImportCsvRequest.CsvColumn
CsvMetadata = import_export_pb2.CsvMetadata CsvMetadata = import_export_pb2.CsvMetadata
Delimiter = import_export_pb2.CsvMetadata.Delimiter
import copy import copy
import os import os
@ -405,7 +406,7 @@ class Collection(DeprecatedNamesMixin):
request.whole_collection.SetInParent() request.whole_collection.SetInParent()
return self._backend.export_anki_package(request) return self._backend.export_anki_package(request)
def get_csv_metadata(self, path: str, delimiter: int | None) -> CsvMetadata: def get_csv_metadata(self, path: str, delimiter: Delimiter.V | None) -> CsvMetadata:
request = import_export_pb2.CsvMetadataRequest(path=path, delimiter=delimiter) request = import_export_pb2.CsvMetadataRequest(path=path, delimiter=delimiter)
return self._backend.get_csv_metadata(request) return self._backend.get_csv_metadata(request)
@ -415,7 +416,7 @@ class Collection(DeprecatedNamesMixin):
deck_id: DeckId, deck_id: DeckId,
notetype_id: NotetypeId, notetype_id: NotetypeId,
columns: list[CsvColumn], columns: list[CsvColumn],
delimiter: int, delimiter: Delimiter.V,
is_html: bool, is_html: bool,
) -> ImportLogWithChanges: ) -> ImportLogWithChanges:
return self._backend.import_csv( return self._backend.import_csv(

View file

@ -9,7 +9,7 @@ from typing import Optional, Sequence
import aqt.forms import aqt.forms
import aqt.main import aqt.main
from anki.collection import CsvColumn, CsvMetadata from anki.collection import CsvColumn, CsvMetadata, Delimiter
from anki.decks import DeckId from anki.decks import DeckId
from anki.models import NotetypeDict, NotetypeId from anki.models import NotetypeDict, NotetypeId
from aqt.import_export.importing import import_progress_update, show_import_log from aqt.import_export.importing import import_progress_update, show_import_log
@ -17,6 +17,15 @@ from aqt.operations import CollectionOp, QueryOp
from aqt.qt import * from aqt.qt import *
from aqt.utils import HelpPage, disable_help_button, getText, openHelp, showWarning, tr from aqt.utils import HelpPage, disable_help_button, getText, openHelp, showWarning, tr
DELIMITERS = (
("\\t", tr.importing_tab(), Delimiter.TAB),
(",", tr.importing_comma(), Delimiter.COMMA),
(" ", tr.studying_space(), Delimiter.SPACE),
(";", tr.importing_semicolon(), Delimiter.SEMICOLON),
(":", tr.importing_colon(), Delimiter.COLON),
("|", tr.importing_colon(), Delimiter.PIPE),
)
class ChangeMap(QDialog): class ChangeMap(QDialog):
def __init__(self, mw: aqt.main.AnkiQt, model: dict, current: str) -> None: def __init__(self, mw: aqt.main.AnkiQt, model: dict, current: str) -> None:
@ -99,7 +108,7 @@ class ImportDialog(QDialog):
self._setup_choosers() self._setup_choosers()
self.column_map = ColumnMap(self.columns, self.model) self.column_map = ColumnMap(self.columns, self.model)
self._render_mapping() self._render_mapping()
self._set_delimiter_button_text() self._set_delimiter()
self.frm.allowHTML.setChecked(self.is_html) self.frm.allowHTML.setChecked(self.is_html)
self.frm.importMode.setCurrentIndex(self.mw.pm.profile.get("importMode", 1)) self.frm.importMode.setCurrentIndex(self.mw.pm.profile.get("importMode", 1))
self.frm.tagModified.setText(self.tags) self.frm.tagModified.setText(self.tags)
@ -142,6 +151,13 @@ class ImportDialog(QDialog):
) )
self.deck = aqt.deckchooser.DeckChooser(self.mw, self.frm.deckArea, label=False) self.deck = aqt.deckchooser.DeckChooser(self.mw, self.frm.deckArea, label=False)
def _set_delimiter(self) -> None:
for delimiter in DELIMITERS:
if delimiter[2] == self.delimiter:
txt = tr.importing_fields_separated_by(val=delimiter[1])
self.frm.autoDetect.setText(txt)
return
def onDelimiter(self) -> None: def onDelimiter(self) -> None:
# Open a modal dialog to enter an delimiter # Open a modal dialog to enter an delimiter
# Todo/Idea Constrain the maximum width, so it doesnt take up that much screen space # Todo/Idea Constrain the maximum width, so it doesnt take up that much screen space
@ -154,20 +170,20 @@ class ImportDialog(QDialog):
if not ok: if not ok:
return return
# Check if the entered value is valid and if not fallback to default # Check if the entered value is valid and if not fallback to default
# at the moment every single character entry as well as '\t' is valid
delim = delim if len(delim) > 0 else "\t" txt = ""
delim = delim.replace("\\t", "\t") # un-escape it for delimiter in DELIMITERS:
delimiter = ord(delim) if delimiter[0] == delim:
if delimiter > 255: txt = tr.importing_fields_separated_by(val=delimiter[1])
self.delimiter = delimiter[2]
break
if not txt:
showWarning( showWarning(
tr.importing_multicharacter_separators_are_not_supported_please() tr.importing_multicharacter_separators_are_not_supported_please()
) )
return return
# self.hideMapping() self.frm.autoDetect.setText(txt)
# self.showMapping(hook=_update)
self.delimiter = delimiter
self._set_delimiter_button_text()
def _update_columns(options: CsvMetadata) -> None: def _update_columns(options: CsvMetadata) -> None:
self.columns = options.columns self.columns = options.columns
@ -176,27 +192,10 @@ class ImportDialog(QDialog):
QueryOp( QueryOp(
parent=self, parent=self,
op=lambda col: col.get_csv_metadata(self.path, delimiter), op=lambda col: col.get_csv_metadata(self.path, self.delimiter),
success=_update_columns, success=_update_columns,
).run_in_background() ).run_in_background()
def _set_delimiter_button_text(self) -> None:
d = chr(self.delimiter)
if d == "\t":
d = tr.importing_tab()
elif d == ",":
d = tr.importing_comma()
elif d == " ":
d = tr.studying_space()
elif d == ";":
d = tr.importing_semicolon()
elif d == ":":
d = tr.importing_colon()
else:
d = repr(d)
txt = tr.importing_fields_separated_by(val=d)
self.frm.autoDetect.setText(txt)
def accept(self) -> None: def accept(self) -> None:
# self.mw.pm.profile["importMode"] = self.importer.importMode # self.mw.pm.profile["importMode"] = self.importer.importMode
self.mw.pm.profile["allowHTML"] = self.frm.allowHTML.isChecked() self.mw.pm.profile["allowHTML"] = self.frm.allowHTML.isChecked()

View file

@ -106,6 +106,7 @@ pub fn write_backend_proto_rs() {
"#[derive(strum::EnumIter)]", "#[derive(strum::EnumIter)]",
) )
.type_attribute("HelpPageLinkRequest.HelpPage", "#[derive(strum::EnumIter)]") .type_attribute("HelpPageLinkRequest.HelpPage", "#[derive(strum::EnumIter)]")
.type_attribute("CsvMetadata.Delimiter", "#[derive(strum::EnumIter)]")
.type_attribute( .type_attribute(
"Preferences.BackupLimits", "Preferences.BackupLimits",
"#[derive(Copy, serde_derive::Deserialize, serde_derive::Serialize)]", "#[derive(Copy, serde_derive::Deserialize, serde_derive::Serialize)]",

View file

@ -82,18 +82,19 @@ impl ImportExportService for Backend {
} }
fn get_csv_metadata(&self, input: pb::CsvMetadataRequest) -> Result<pb::CsvMetadata> { fn get_csv_metadata(&self, input: pb::CsvMetadataRequest) -> Result<pb::CsvMetadata> {
let delimiter = input.delimiter.map(try_into_byte).transpose()?; let delimiter = input.delimiter.is_some().then(|| input.delimiter());
self.with_col(|col| col.get_csv_metadata(&input.path, delimiter)) self.with_col(|col| col.get_csv_metadata(&input.path, delimiter))
} }
fn import_csv(&self, input: pb::ImportCsvRequest) -> Result<pb::ImportResponse> { fn import_csv(&self, input: pb::ImportCsvRequest) -> Result<pb::ImportResponse> {
self.with_col(|col| { self.with_col(|col| {
let delimiter = input.delimiter();
col.import_csv( col.import_csv(
&input.path, &input.path,
input.deck_id.into(), input.deck_id.into(),
input.notetype_id.into(), input.notetype_id.into(),
input.columns.into_iter().map(Into::into).collect(), input.columns.into_iter().map(Into::into).collect(),
try_into_byte(input.delimiter)?, delimiter,
input.is_html, input.is_html,
) )
}) })
@ -150,8 +151,3 @@ impl From<CsvColumn> for Column {
} }
} }
} }
fn try_into_byte(u: impl TryInto<u8>) -> Result<u8> {
u.try_into()
.map_err(|_| AnkiError::invalid_input("expected single byte"))
}

View file

@ -8,7 +8,10 @@ use std::{
use crate::{ use crate::{
import_export::{ import_export::{
text::{csv::Column, ForeignData, ForeignNote}, text::{
csv::{metadata::Delimiter, Column},
ForeignData, ForeignNote,
},
NoteLog, NoteLog,
}, },
prelude::*, prelude::*,
@ -21,7 +24,7 @@ impl Collection {
deck_id: DeckId, deck_id: DeckId,
notetype_id: NotetypeId, notetype_id: NotetypeId,
columns: Vec<Column>, columns: Vec<Column>,
delimiter: u8, delimiter: Delimiter,
is_html: bool, is_html: bool,
) -> Result<OpOutput<NoteLog>> { ) -> Result<OpOutput<NoteLog>> {
let notetype = self.get_notetype(notetype_id)?.ok_or(AnkiError::NotFound)?; let notetype = self.get_notetype(notetype_id)?.ok_or(AnkiError::NotFound)?;
@ -44,7 +47,7 @@ fn deserialize_csv(
mut reader: impl Read + Seek, mut reader: impl Read + Seek,
columns: &[Column], columns: &[Column],
fields_len: usize, fields_len: usize,
delimiter: u8, delimiter: Delimiter,
is_html: bool, is_html: bool,
) -> Result<Vec<ForeignNote>> { ) -> Result<Vec<ForeignNote>> {
remove_tags_line_from_reader(&mut reader)?; remove_tags_line_from_reader(&mut reader)?;
@ -52,7 +55,7 @@ fn deserialize_csv(
.has_headers(false) .has_headers(false)
.flexible(true) .flexible(true)
.comment(Some(b'#')) .comment(Some(b'#'))
.delimiter(delimiter) .delimiter(delimiter.byte())
.trim(csv::Trim::All) .trim(csv::Trim::All)
.from_reader(reader); .from_reader(reader);
deserialize_csv_reader(&mut csv_reader, columns, fields_len, is_html) deserialize_csv_reader(&mut csv_reader, columns, fields_len, is_html)
@ -150,7 +153,7 @@ mod test {
struct CsvOptions { struct CsvOptions {
columns: Vec<Column>, columns: Vec<Column>,
fields_len: usize, fields_len: usize,
delimiter: u8, delimiter: Delimiter,
is_html: bool, is_html: bool,
} }
@ -159,7 +162,7 @@ mod test {
Self { Self {
columns: vec![Column::Field(0), Column::Field(1)], columns: vec![Column::Field(0), Column::Field(1)],
fields_len: 2, fields_len: 2,
delimiter: b',', delimiter: Delimiter::Comma,
is_html: false, is_html: false,
} }
} }
@ -179,7 +182,7 @@ mod test {
self self
} }
fn delimiter(mut self, delimiter: u8) -> Self { fn delimiter(mut self, delimiter: Delimiter) -> Self {
self.delimiter = delimiter; self.delimiter = delimiter;
self self
} }
@ -203,7 +206,7 @@ mod test {
#[test] #[test]
fn should_respect_custom_delimiter() { fn should_respect_custom_delimiter() {
let options = CsvOptions::new().delimiter(b'|'); let options = CsvOptions::new().delimiter(Delimiter::Pipe);
assert_imported_fields!(options, "fr,ont|ba,ck\n", &[&["fr,ont", "ba,ck"]]); assert_imported_fields!(options, "fr,ont|ba,ck\n", &[&["fr,ont", "ba,ck"]]);
} }

View file

@ -6,10 +6,17 @@ use std::{
io::{BufRead, BufReader}, io::{BufRead, BufReader},
}; };
use strum::IntoEnumIterator;
pub use crate::backend_proto::csv_metadata::Delimiter;
use crate::{backend_proto::CsvMetadata, error::ImportError, prelude::*}; use crate::{backend_proto::CsvMetadata, error::ImportError, prelude::*};
impl Collection { impl Collection {
pub fn get_csv_metadata(&mut self, path: &str, delimiter: Option<u8>) -> Result<CsvMetadata> { pub fn get_csv_metadata(
&mut self,
path: &str,
delimiter: Option<Delimiter>,
) -> Result<CsvMetadata> {
let reader = BufReader::new(File::open(path)?); let reader = BufReader::new(File::open(path)?);
self.get_reader_metadata(reader, delimiter) self.get_reader_metadata(reader, delimiter)
} }
@ -17,9 +24,13 @@ impl Collection {
fn get_reader_metadata( fn get_reader_metadata(
&mut self, &mut self,
reader: impl BufRead, reader: impl BufRead,
delimiter: Option<u8>, delimiter: Option<Delimiter>,
) -> Result<CsvMetadata> { ) -> Result<CsvMetadata> {
let mut metadata = CsvMetadata::default(); let mut metadata = CsvMetadata {
// mark as unset
delimiter: -1,
..Default::default()
};
let line = self.parse_meta_lines(reader, &mut metadata)?; let line = self.parse_meta_lines(reader, &mut metadata)?;
set_delimiter(delimiter, &mut metadata, &line); set_delimiter(delimiter, &mut metadata, &line);
set_columns(&mut metadata, &line, &self.tr)?; set_columns(&mut metadata, &line, &self.tr)?;
@ -71,7 +82,7 @@ impl Collection {
match key.trim().to_ascii_lowercase().as_str() { match key.trim().to_ascii_lowercase().as_str() {
"delimiter" => { "delimiter" => {
if let Some(delimiter) = delimter_from_value(value) { if let Some(delimiter) = delimter_from_value(value) {
metadata.delimiter = delimiter as u32; metadata.delimiter = delimiter as i32;
} }
} }
"tags" => metadata.tags = value.trim().to_owned(), "tags" => metadata.tags = value.trim().to_owned(),
@ -97,8 +108,8 @@ impl Collection {
} }
fn parse_columns(&mut self, line: &str, metadata: &mut CsvMetadata) -> Result<Vec<String>> { fn parse_columns(&mut self, line: &str, metadata: &mut CsvMetadata) -> Result<Vec<String>> {
let delimiter = if metadata.delimiter != 0 { let delimiter = if metadata.delimiter != -1 {
metadata.delimiter as u8 metadata.delimiter()
} else { } else {
delimiter_from_line(line) delimiter_from_line(line)
}; };
@ -122,7 +133,7 @@ impl Collection {
fn set_columns(metadata: &mut CsvMetadata, line: &str, tr: &I18n) -> Result<()> { fn set_columns(metadata: &mut CsvMetadata, line: &str, tr: &I18n) -> Result<()> {
if metadata.columns.is_empty() { if metadata.columns.is_empty() {
let columns = map_single_record(line, metadata.delimiter as u8, |r| r.len())?; let columns = map_single_record(line, metadata.delimiter(), |r| r.len())?;
metadata.columns = (0..columns) metadata.columns = (0..columns)
.map(|i| tr.importing_column(i + 1).to_string()) .map(|i| tr.importing_column(i + 1).to_string())
.collect(); .collect();
@ -130,47 +141,41 @@ fn set_columns(metadata: &mut CsvMetadata, line: &str, tr: &I18n) -> Result<()>
Ok(()) Ok(())
} }
fn set_delimiter(delimiter: Option<u8>, metadata: &mut CsvMetadata, line: &str) { fn set_delimiter(delimiter: Option<Delimiter>, metadata: &mut CsvMetadata, line: &str) {
if let Some(delim) = delimiter { if let Some(delim) = delimiter {
metadata.delimiter = delim as u32; metadata.set_delimiter(delim);
} else if metadata.delimiter == 0 { } else if metadata.delimiter == -1 {
// XXX: should '#delimiter:[NUL]' be supported? metadata.set_delimiter(delimiter_from_line(line));
metadata.delimiter = delimiter_from_line(line) as u32;
} }
} }
fn delimter_from_value(value: &str) -> Option<u8> { fn delimter_from_value(value: &str) -> Option<Delimiter> {
// FIXME: bytes like '\n', '#' and '"' will likely cause issues let normed = value.to_ascii_lowercase();
Some(if value.as_bytes().len() == 1 { for delimiter in Delimiter::iter() {
value.as_bytes()[0] if normed.trim() == delimiter.name() || normed.as_bytes() == [delimiter.byte()] {
} else { return Some(delimiter);
match value.trim().to_ascii_lowercase().as_str() {
"tab" | "\\t" => b'\t',
"semicolon" => b';',
"comma" => b',',
"space" => b' ',
_ => return None,
} }
}) }
None
} }
fn delimiter_from_line(line: &str) -> u8 { fn delimiter_from_line(line: &str) -> Delimiter {
// TODO: use smarter heuristic // TODO: use smarter heuristic
for byte in [b'\t', b';', b','] { for delimiter in Delimiter::iter() {
if line.contains(byte as char) { if line.contains(delimiter.byte() as char) {
return byte; return delimiter;
} }
} }
b' ' Delimiter::Space
} }
fn map_single_record<T>( fn map_single_record<T>(
line: &str, line: &str,
delimiter: u8, delimiter: Delimiter,
op: impl FnOnce(&csv::StringRecord) -> T, op: impl FnOnce(&csv::StringRecord) -> T,
) -> Result<T> { ) -> Result<T> {
csv::ReaderBuilder::new() csv::ReaderBuilder::new()
.delimiter(delimiter) .delimiter(delimiter.byte())
.from_reader(line.as_bytes()) .from_reader(line.as_bytes())
.headers() .headers()
.map_err(|_| AnkiError::ImportError(ImportError::Corrupt)) .map_err(|_| AnkiError::ImportError(ImportError::Corrupt))
@ -182,6 +187,30 @@ fn strip_line_ending(line: &str) -> &str {
.unwrap_or_else(|| line.strip_suffix('\n').unwrap_or(line)) .unwrap_or_else(|| line.strip_suffix('\n').unwrap_or(line))
} }
impl Delimiter {
pub fn byte(self) -> u8 {
match self {
Delimiter::Comma => b',',
Delimiter::Semicolon => b';',
Delimiter::Tab => b'\t',
Delimiter::Space => b' ',
Delimiter::Pipe => b'|',
Delimiter::Colon => b':',
}
}
fn name(self) -> &'static str {
match self {
Delimiter::Comma => "comma",
Delimiter::Semicolon => "semicolon",
Delimiter::Tab => "tab",
Delimiter::Space => "space",
Delimiter::Pipe => "pipe",
Delimiter::Colon => "colon",
}
}
}
#[cfg(test)] #[cfg(test)]
mod test { mod test {
use super::*; use super::*;
@ -218,27 +247,29 @@ mod test {
#[test] #[test]
fn should_detect_valid_delimiters() { fn should_detect_valid_delimiters() {
let mut col = open_test_collection(); let mut col = open_test_collection();
assert_eq!(metadata!(col, "#delimiter: \n").delimiter, ' ' as u32);
assert_eq!(metadata!(col, "#delimiter:space\n").delimiter, ' ' as u32);
assert_eq!(metadata!(col, "#delimiter:\t\n").delimiter, '\t' as u32);
assert_eq!(metadata!(col, "#delimiter:Tab\n").delimiter, '\t' as u32);
assert_eq!(metadata!(col, "#delimiter:;\n").delimiter, ';' as u32);
assert_eq!( assert_eq!(
metadata!(col, "#delimiter:SEMICOLON\n").delimiter, metadata!(col, "#delimiter:comma\n").delimiter(),
';' as u32 Delimiter::Comma
);
assert_eq!(
metadata!(col, "#delimiter:\t\n").delimiter(),
Delimiter::Tab
); );
assert_eq!(metadata!(col, "#delimiter:,\n").delimiter, ',' as u32);
assert_eq!(metadata!(col, "#delimiter:comma\n").delimiter, ',' as u32);
assert_eq!(metadata!(col, "#delimiter:|\n").delimiter, '|' as u32);
// fallback // fallback
assert_eq!(metadata!(col, "#delimiter:foo\n").delimiter, ' ' as u32); assert_eq!(
assert_eq!(metadata!(col, "#delimiter:♥\n").delimiter, ' ' as u32); metadata!(col, "#delimiter:foo\n").delimiter(),
Delimiter::Space
);
assert_eq!(
metadata!(col, "#delimiter:♥\n").delimiter(),
Delimiter::Space
);
// pick up from first line // pick up from first line
assert_eq!(metadata!(col, "foo\tbar\n").delimiter, '\t' as u32); assert_eq!(metadata!(col, "foo\tbar\n").delimiter(), Delimiter::Tab);
// override with provided // override with provided
assert_eq!( assert_eq!(
metadata!(col, "#delimiter: \nfoo\tbar\n", Some(b'|')).delimiter, metadata!(col, "#delimiter: \nfoo\tbar\n", Some(Delimiter::Pipe)).delimiter(),
'|' as u32 Delimiter::Pipe
); );
} }
@ -281,7 +312,7 @@ mod test {
); );
// override // override
assert_eq!( assert_eq!(
metadata!(col, "#delimiter:;\nfoo;bar\n", Some(b'|')).columns, metadata!(col, "#delimiter:;\nfoo;bar\n", Some(Delimiter::Pipe)).columns,
["Column 1"] ["Column 1"]
); );

View file

@ -2,7 +2,7 @@
// License: GNU AGPL, version 3 or later; http://www.gnu.org/licenses/agpl.html // License: GNU AGPL, version 3 or later; http://www.gnu.org/licenses/agpl.html
pub mod csv; pub mod csv;
pub mod import; mod import;
mod json; mod json;
use serde_derive::{Deserialize, Serialize}; use serde_derive::{Deserialize, Serialize};