Only allow fixed set of CSV delimiters

This commit is contained in:
RumovZ 2022-05-11 09:49:19 +02:00
parent c565f77c23
commit 74ce551155
8 changed files with 139 additions and 97 deletions

View file

@ -110,17 +110,28 @@ message ImportCsvRequest {
int64 deck_id = 2;
int64 notetype_id = 3;
repeated CsvColumn columns = 4;
uint32 delimiter = 5;
CsvMetadata.Delimiter delimiter = 5;
bool is_html = 6;
}
message CsvMetadataRequest {
string path = 1;
optional uint32 delimiter = 2;
optional CsvMetadata.Delimiter delimiter = 2;
}
message CsvMetadata {
uint32 delimiter = 1;
// Order roughly in ascending expected frequency in note text, because the
// delimiter detection algorithm is stupidly picking the first one it
// encounters.
enum Delimiter {
TAB = 0;
PIPE = 1;
SEMICOLON = 2;
COLON = 3;
COMMA = 4;
SPACE = 5;
}
Delimiter delimiter = 1;
string tags = 2;
repeated string columns = 3;
int64 deck_id = 4;

View file

@ -36,6 +36,7 @@ StripHtmlMode = card_rendering_pb2.StripHtmlRequest
ImportLogWithChanges = import_export_pb2.ImportResponse
CsvColumn = import_export_pb2.ImportCsvRequest.CsvColumn
CsvMetadata = import_export_pb2.CsvMetadata
Delimiter = import_export_pb2.CsvMetadata.Delimiter
import copy
import os
@ -405,7 +406,7 @@ class Collection(DeprecatedNamesMixin):
request.whole_collection.SetInParent()
return self._backend.export_anki_package(request)
def get_csv_metadata(self, path: str, delimiter: int | None) -> CsvMetadata:
def get_csv_metadata(self, path: str, delimiter: Delimiter.V | None) -> CsvMetadata:
request = import_export_pb2.CsvMetadataRequest(path=path, delimiter=delimiter)
return self._backend.get_csv_metadata(request)
@ -415,7 +416,7 @@ class Collection(DeprecatedNamesMixin):
deck_id: DeckId,
notetype_id: NotetypeId,
columns: list[CsvColumn],
delimiter: int,
delimiter: Delimiter.V,
is_html: bool,
) -> ImportLogWithChanges:
return self._backend.import_csv(

View file

@ -9,7 +9,7 @@ from typing import Optional, Sequence
import aqt.forms
import aqt.main
from anki.collection import CsvColumn, CsvMetadata
from anki.collection import CsvColumn, CsvMetadata, Delimiter
from anki.decks import DeckId
from anki.models import NotetypeDict, NotetypeId
from aqt.import_export.importing import import_progress_update, show_import_log
@ -17,6 +17,15 @@ from aqt.operations import CollectionOp, QueryOp
from aqt.qt import *
from aqt.utils import HelpPage, disable_help_button, getText, openHelp, showWarning, tr
DELIMITERS = (
("\\t", tr.importing_tab(), Delimiter.TAB),
(",", tr.importing_comma(), Delimiter.COMMA),
(" ", tr.studying_space(), Delimiter.SPACE),
(";", tr.importing_semicolon(), Delimiter.SEMICOLON),
(":", tr.importing_colon(), Delimiter.COLON),
("|", tr.importing_colon(), Delimiter.PIPE),
)
class ChangeMap(QDialog):
def __init__(self, mw: aqt.main.AnkiQt, model: dict, current: str) -> None:
@ -99,7 +108,7 @@ class ImportDialog(QDialog):
self._setup_choosers()
self.column_map = ColumnMap(self.columns, self.model)
self._render_mapping()
self._set_delimiter_button_text()
self._set_delimiter()
self.frm.allowHTML.setChecked(self.is_html)
self.frm.importMode.setCurrentIndex(self.mw.pm.profile.get("importMode", 1))
self.frm.tagModified.setText(self.tags)
@ -142,6 +151,13 @@ class ImportDialog(QDialog):
)
self.deck = aqt.deckchooser.DeckChooser(self.mw, self.frm.deckArea, label=False)
def _set_delimiter(self) -> None:
for delimiter in DELIMITERS:
if delimiter[2] == self.delimiter:
txt = tr.importing_fields_separated_by(val=delimiter[1])
self.frm.autoDetect.setText(txt)
return
def onDelimiter(self) -> None:
# Open a modal dialog to enter an delimiter
# Todo/Idea Constrain the maximum width, so it doesnt take up that much screen space
@ -154,20 +170,20 @@ class ImportDialog(QDialog):
if not ok:
return
# Check if the entered value is valid and if not fallback to default
# at the moment every single character entry as well as '\t' is valid
delim = delim if len(delim) > 0 else "\t"
delim = delim.replace("\\t", "\t") # un-escape it
delimiter = ord(delim)
if delimiter > 255:
txt = ""
for delimiter in DELIMITERS:
if delimiter[0] == delim:
txt = tr.importing_fields_separated_by(val=delimiter[1])
self.delimiter = delimiter[2]
break
if not txt:
showWarning(
tr.importing_multicharacter_separators_are_not_supported_please()
)
return
# self.hideMapping()
# self.showMapping(hook=_update)
self.delimiter = delimiter
self._set_delimiter_button_text()
self.frm.autoDetect.setText(txt)
def _update_columns(options: CsvMetadata) -> None:
self.columns = options.columns
@ -176,27 +192,10 @@ class ImportDialog(QDialog):
QueryOp(
parent=self,
op=lambda col: col.get_csv_metadata(self.path, delimiter),
op=lambda col: col.get_csv_metadata(self.path, self.delimiter),
success=_update_columns,
).run_in_background()
def _set_delimiter_button_text(self) -> None:
d = chr(self.delimiter)
if d == "\t":
d = tr.importing_tab()
elif d == ",":
d = tr.importing_comma()
elif d == " ":
d = tr.studying_space()
elif d == ";":
d = tr.importing_semicolon()
elif d == ":":
d = tr.importing_colon()
else:
d = repr(d)
txt = tr.importing_fields_separated_by(val=d)
self.frm.autoDetect.setText(txt)
def accept(self) -> None:
# self.mw.pm.profile["importMode"] = self.importer.importMode
self.mw.pm.profile["allowHTML"] = self.frm.allowHTML.isChecked()

View file

@ -106,6 +106,7 @@ pub fn write_backend_proto_rs() {
"#[derive(strum::EnumIter)]",
)
.type_attribute("HelpPageLinkRequest.HelpPage", "#[derive(strum::EnumIter)]")
.type_attribute("CsvMetadata.Delimiter", "#[derive(strum::EnumIter)]")
.type_attribute(
"Preferences.BackupLimits",
"#[derive(Copy, serde_derive::Deserialize, serde_derive::Serialize)]",

View file

@ -82,18 +82,19 @@ impl ImportExportService for Backend {
}
fn get_csv_metadata(&self, input: pb::CsvMetadataRequest) -> Result<pb::CsvMetadata> {
let delimiter = input.delimiter.map(try_into_byte).transpose()?;
let delimiter = input.delimiter.is_some().then(|| input.delimiter());
self.with_col(|col| col.get_csv_metadata(&input.path, delimiter))
}
fn import_csv(&self, input: pb::ImportCsvRequest) -> Result<pb::ImportResponse> {
self.with_col(|col| {
let delimiter = input.delimiter();
col.import_csv(
&input.path,
input.deck_id.into(),
input.notetype_id.into(),
input.columns.into_iter().map(Into::into).collect(),
try_into_byte(input.delimiter)?,
delimiter,
input.is_html,
)
})
@ -150,8 +151,3 @@ impl From<CsvColumn> for Column {
}
}
}
fn try_into_byte(u: impl TryInto<u8>) -> Result<u8> {
u.try_into()
.map_err(|_| AnkiError::invalid_input("expected single byte"))
}

View file

@ -8,7 +8,10 @@ use std::{
use crate::{
import_export::{
text::{csv::Column, ForeignData, ForeignNote},
text::{
csv::{metadata::Delimiter, Column},
ForeignData, ForeignNote,
},
NoteLog,
},
prelude::*,
@ -21,7 +24,7 @@ impl Collection {
deck_id: DeckId,
notetype_id: NotetypeId,
columns: Vec<Column>,
delimiter: u8,
delimiter: Delimiter,
is_html: bool,
) -> Result<OpOutput<NoteLog>> {
let notetype = self.get_notetype(notetype_id)?.ok_or(AnkiError::NotFound)?;
@ -44,7 +47,7 @@ fn deserialize_csv(
mut reader: impl Read + Seek,
columns: &[Column],
fields_len: usize,
delimiter: u8,
delimiter: Delimiter,
is_html: bool,
) -> Result<Vec<ForeignNote>> {
remove_tags_line_from_reader(&mut reader)?;
@ -52,7 +55,7 @@ fn deserialize_csv(
.has_headers(false)
.flexible(true)
.comment(Some(b'#'))
.delimiter(delimiter)
.delimiter(delimiter.byte())
.trim(csv::Trim::All)
.from_reader(reader);
deserialize_csv_reader(&mut csv_reader, columns, fields_len, is_html)
@ -150,7 +153,7 @@ mod test {
struct CsvOptions {
columns: Vec<Column>,
fields_len: usize,
delimiter: u8,
delimiter: Delimiter,
is_html: bool,
}
@ -159,7 +162,7 @@ mod test {
Self {
columns: vec![Column::Field(0), Column::Field(1)],
fields_len: 2,
delimiter: b',',
delimiter: Delimiter::Comma,
is_html: false,
}
}
@ -179,7 +182,7 @@ mod test {
self
}
fn delimiter(mut self, delimiter: u8) -> Self {
fn delimiter(mut self, delimiter: Delimiter) -> Self {
self.delimiter = delimiter;
self
}
@ -203,7 +206,7 @@ mod test {
#[test]
fn should_respect_custom_delimiter() {
let options = CsvOptions::new().delimiter(b'|');
let options = CsvOptions::new().delimiter(Delimiter::Pipe);
assert_imported_fields!(options, "fr,ont|ba,ck\n", &[&["fr,ont", "ba,ck"]]);
}

View file

@ -6,10 +6,17 @@ use std::{
io::{BufRead, BufReader},
};
use strum::IntoEnumIterator;
pub use crate::backend_proto::csv_metadata::Delimiter;
use crate::{backend_proto::CsvMetadata, error::ImportError, prelude::*};
impl Collection {
pub fn get_csv_metadata(&mut self, path: &str, delimiter: Option<u8>) -> Result<CsvMetadata> {
pub fn get_csv_metadata(
&mut self,
path: &str,
delimiter: Option<Delimiter>,
) -> Result<CsvMetadata> {
let reader = BufReader::new(File::open(path)?);
self.get_reader_metadata(reader, delimiter)
}
@ -17,9 +24,13 @@ impl Collection {
fn get_reader_metadata(
&mut self,
reader: impl BufRead,
delimiter: Option<u8>,
delimiter: Option<Delimiter>,
) -> Result<CsvMetadata> {
let mut metadata = CsvMetadata::default();
let mut metadata = CsvMetadata {
// mark as unset
delimiter: -1,
..Default::default()
};
let line = self.parse_meta_lines(reader, &mut metadata)?;
set_delimiter(delimiter, &mut metadata, &line);
set_columns(&mut metadata, &line, &self.tr)?;
@ -71,7 +82,7 @@ impl Collection {
match key.trim().to_ascii_lowercase().as_str() {
"delimiter" => {
if let Some(delimiter) = delimter_from_value(value) {
metadata.delimiter = delimiter as u32;
metadata.delimiter = delimiter as i32;
}
}
"tags" => metadata.tags = value.trim().to_owned(),
@ -97,8 +108,8 @@ impl Collection {
}
fn parse_columns(&mut self, line: &str, metadata: &mut CsvMetadata) -> Result<Vec<String>> {
let delimiter = if metadata.delimiter != 0 {
metadata.delimiter as u8
let delimiter = if metadata.delimiter != -1 {
metadata.delimiter()
} else {
delimiter_from_line(line)
};
@ -122,7 +133,7 @@ impl Collection {
fn set_columns(metadata: &mut CsvMetadata, line: &str, tr: &I18n) -> Result<()> {
if metadata.columns.is_empty() {
let columns = map_single_record(line, metadata.delimiter as u8, |r| r.len())?;
let columns = map_single_record(line, metadata.delimiter(), |r| r.len())?;
metadata.columns = (0..columns)
.map(|i| tr.importing_column(i + 1).to_string())
.collect();
@ -130,47 +141,41 @@ fn set_columns(metadata: &mut CsvMetadata, line: &str, tr: &I18n) -> Result<()>
Ok(())
}
fn set_delimiter(delimiter: Option<u8>, metadata: &mut CsvMetadata, line: &str) {
fn set_delimiter(delimiter: Option<Delimiter>, metadata: &mut CsvMetadata, line: &str) {
if let Some(delim) = delimiter {
metadata.delimiter = delim as u32;
} else if metadata.delimiter == 0 {
// XXX: should '#delimiter:[NUL]' be supported?
metadata.delimiter = delimiter_from_line(line) as u32;
metadata.set_delimiter(delim);
} else if metadata.delimiter == -1 {
metadata.set_delimiter(delimiter_from_line(line));
}
}
fn delimter_from_value(value: &str) -> Option<u8> {
// FIXME: bytes like '\n', '#' and '"' will likely cause issues
Some(if value.as_bytes().len() == 1 {
value.as_bytes()[0]
} else {
match value.trim().to_ascii_lowercase().as_str() {
"tab" | "\\t" => b'\t',
"semicolon" => b';',
"comma" => b',',
"space" => b' ',
_ => return None,
fn delimter_from_value(value: &str) -> Option<Delimiter> {
let normed = value.to_ascii_lowercase();
for delimiter in Delimiter::iter() {
if normed.trim() == delimiter.name() || normed.as_bytes() == [delimiter.byte()] {
return Some(delimiter);
}
})
}
None
}
fn delimiter_from_line(line: &str) -> u8 {
fn delimiter_from_line(line: &str) -> Delimiter {
// TODO: use smarter heuristic
for byte in [b'\t', b';', b','] {
if line.contains(byte as char) {
return byte;
for delimiter in Delimiter::iter() {
if line.contains(delimiter.byte() as char) {
return delimiter;
}
}
b' '
Delimiter::Space
}
fn map_single_record<T>(
line: &str,
delimiter: u8,
delimiter: Delimiter,
op: impl FnOnce(&csv::StringRecord) -> T,
) -> Result<T> {
csv::ReaderBuilder::new()
.delimiter(delimiter)
.delimiter(delimiter.byte())
.from_reader(line.as_bytes())
.headers()
.map_err(|_| AnkiError::ImportError(ImportError::Corrupt))
@ -182,6 +187,30 @@ fn strip_line_ending(line: &str) -> &str {
.unwrap_or_else(|| line.strip_suffix('\n').unwrap_or(line))
}
impl Delimiter {
pub fn byte(self) -> u8 {
match self {
Delimiter::Comma => b',',
Delimiter::Semicolon => b';',
Delimiter::Tab => b'\t',
Delimiter::Space => b' ',
Delimiter::Pipe => b'|',
Delimiter::Colon => b':',
}
}
fn name(self) -> &'static str {
match self {
Delimiter::Comma => "comma",
Delimiter::Semicolon => "semicolon",
Delimiter::Tab => "tab",
Delimiter::Space => "space",
Delimiter::Pipe => "pipe",
Delimiter::Colon => "colon",
}
}
}
#[cfg(test)]
mod test {
use super::*;
@ -218,27 +247,29 @@ mod test {
#[test]
fn should_detect_valid_delimiters() {
let mut col = open_test_collection();
assert_eq!(metadata!(col, "#delimiter: \n").delimiter, ' ' as u32);
assert_eq!(metadata!(col, "#delimiter:space\n").delimiter, ' ' as u32);
assert_eq!(metadata!(col, "#delimiter:\t\n").delimiter, '\t' as u32);
assert_eq!(metadata!(col, "#delimiter:Tab\n").delimiter, '\t' as u32);
assert_eq!(metadata!(col, "#delimiter:;\n").delimiter, ';' as u32);
assert_eq!(
metadata!(col, "#delimiter:SEMICOLON\n").delimiter,
';' as u32
metadata!(col, "#delimiter:comma\n").delimiter(),
Delimiter::Comma
);
assert_eq!(
metadata!(col, "#delimiter:\t\n").delimiter(),
Delimiter::Tab
);
assert_eq!(metadata!(col, "#delimiter:,\n").delimiter, ',' as u32);
assert_eq!(metadata!(col, "#delimiter:comma\n").delimiter, ',' as u32);
assert_eq!(metadata!(col, "#delimiter:|\n").delimiter, '|' as u32);
// fallback
assert_eq!(metadata!(col, "#delimiter:foo\n").delimiter, ' ' as u32);
assert_eq!(metadata!(col, "#delimiter:♥\n").delimiter, ' ' as u32);
assert_eq!(
metadata!(col, "#delimiter:foo\n").delimiter(),
Delimiter::Space
);
assert_eq!(
metadata!(col, "#delimiter:♥\n").delimiter(),
Delimiter::Space
);
// pick up from first line
assert_eq!(metadata!(col, "foo\tbar\n").delimiter, '\t' as u32);
assert_eq!(metadata!(col, "foo\tbar\n").delimiter(), Delimiter::Tab);
// override with provided
assert_eq!(
metadata!(col, "#delimiter: \nfoo\tbar\n", Some(b'|')).delimiter,
'|' as u32
metadata!(col, "#delimiter: \nfoo\tbar\n", Some(Delimiter::Pipe)).delimiter(),
Delimiter::Pipe
);
}
@ -281,7 +312,7 @@ mod test {
);
// override
assert_eq!(
metadata!(col, "#delimiter:;\nfoo;bar\n", Some(b'|')).columns,
metadata!(col, "#delimiter:;\nfoo;bar\n", Some(Delimiter::Pipe)).columns,
["Column 1"]
);

View file

@ -2,7 +2,7 @@
// License: GNU AGPL, version 3 or later; http://www.gnu.org/licenses/agpl.html
pub mod csv;
pub mod import;
mod import;
mod json;
use serde_derive::{Deserialize, Serialize};