Add csv metadata extraction on backend

This commit is contained in:
RumovZ 2022-05-08 19:03:11 +02:00
parent db5f167de5
commit 2c3a6a43de
9 changed files with 349 additions and 20 deletions

View file

@ -8,6 +8,7 @@ importing-appeared-twice-in-file = Appeared twice in file: { $val }
importing-by-default-anki-will-detect-the = By default, Anki will detect the character between fields, such as a tab, comma, and so on. If Anki is detecting the character incorrectly, you can enter it here. Use \t to represent tab.
importing-change = Change
importing-colon = Colon
importing-column = Column { $val }
importing-comma = Comma
importing-empty-first-field = Empty first field: { $val }
importing-field-mapping = Field mapping

View file

@ -16,6 +16,7 @@ service ImportExportService {
returns (generic.Empty);
rpc ImportAnkiPackage(ImportAnkiPackageRequest) returns (ImportResponse);
rpc ExportAnkiPackage(ExportAnkiPackageRequest) returns (generic.UInt32);
rpc GetCsvMetadata(CsvMetadataRequest) returns (CsvMetadata);
rpc ImportCsv(ImportCsvRequest) returns (ImportResponse);
rpc ImportJson(generic.String) returns (ImportResponse);
}
@ -109,6 +110,20 @@ message ImportCsvRequest {
int64 deck_id = 2;
int64 notetype_id = 3;
repeated CsvColumn columns = 4;
string delimiter = 5;
uint32 delimiter = 5;
bool allow_html = 6;
}
message CsvMetadataRequest {
string path = 1;
optional uint32 delimiter = 2;
}
message CsvMetadata {
uint32 delimiter = 1;
string tags = 2;
repeated string columns = 3;
int64 deck_id = 4;
int64 notetype_id = 5;
optional bool html = 6;
}

View file

@ -410,7 +410,7 @@ class Collection(DeprecatedNamesMixin):
deck_id: DeckId,
notetype_id: NotetypeId,
columns: list[CsvColumn],
delimiter: str,
delimiter: int,
allow_html: bool,
) -> ImportLogWithChanges:
return self._backend.import_csv(

View file

@ -154,7 +154,7 @@ class ImportDialog(QDialog):
d = repr(d)
txt = tr.importing_fields_separated_by(val=d)
self.frm.autoDetect.setText(txt)
self.delim = d
self.delim = ord(d)
def accept(self) -> None:
# self.mw.pm.profile["importMode"] = self.importer.importMode

View file

@ -81,6 +81,11 @@ impl ImportExportService for Backend {
.map(Into::into)
}
    /// Service entry point: validates the optionally provided delimiter and
    /// delegates CSV metadata extraction to the open collection.
    fn get_csv_metadata(&self, input: pb::CsvMetadataRequest) -> Result<pb::CsvMetadata> {
        // The proto carries the delimiter as an optional u32; values that
        // don't fit into a single byte are rejected by try_into_byte.
        let delimiter = input.delimiter.map(try_into_byte).transpose()?;
        self.with_col(|col| col.get_csv_metadata(&input.path, delimiter))
    }
fn import_csv(&self, input: pb::ImportCsvRequest) -> Result<pb::ImportResponse> {
self.with_col(|col| {
col.import_csv(
@ -88,7 +93,7 @@ impl ImportExportService for Backend {
input.deck_id.into(),
input.notetype_id.into(),
input.columns.into_iter().map(Into::into).collect(),
byte_from_string(&input.delimiter)?,
try_into_byte(input.delimiter)?,
//input.allow_html,
)
})
@ -146,8 +151,7 @@ impl From<CsvColumn> for Column {
}
}
fn byte_from_string(s: &str) -> Result<u8> {
s.bytes()
.next()
.ok_or_else(|| AnkiError::invalid_input("empty string"))
/// Converts an integer-like value into a single byte, failing with an
/// invalid-input error when the value does not fit into a u8.
fn try_into_byte(u: impl TryInto<u8>) -> Result<u8> {
    match u.try_into() {
        Ok(byte) => Ok(byte),
        Err(_) => Err(AnkiError::invalid_input("expected single byte")),
    }
}

View file

@ -8,19 +8,12 @@ use std::{
use crate::{
import_export::{
text::{ForeignData, ForeignNote},
text::{csv::Column, ForeignData, ForeignNote},
NoteLog,
},
prelude::*,
};
#[derive(Debug, Clone, Copy)]
pub enum Column {
Field(usize),
Ignore,
Tags,
}
impl Collection {
pub fn import_csv(
&mut self,
@ -66,6 +59,8 @@ fn deserialize_csv(
/// Returns a reader with the first line stripped if it starts with "tags:",
/// which is allowed for historic reasons.
fn reader_without_tags_line(reader: impl Read + Seek) -> Result<impl Read> {
// FIXME: shouldn't pass a buffered reader to csv
// https://docs.rs/csv/latest/csv/struct.ReaderBuilder.html#method.from_reader
let mut buf_reader = BufReader::new(reader);
let mut first_line = String::new();
buf_reader.read_line(&mut first_line)?;

View file

@ -0,0 +1,302 @@
// Copyright: Ankitects Pty Ltd and contributors
// License: GNU AGPL, version 3 or later; http://www.gnu.org/licenses/agpl.html
use std::{
fs::File,
io::{BufRead, BufReader},
};
use crate::{backend_proto::CsvMetadata, error::ImportError, prelude::*};
impl Collection {
    /// Reads the meta head of the CSV file at `path` and returns the
    /// detected import settings. A `delimiter` provided by the caller
    /// overrides any delimiter declared inside the file.
    pub fn get_csv_metadata(&mut self, path: &str, delimiter: Option<u8>) -> Result<CsvMetadata> {
        let reader = BufReader::new(File::open(path)?);
        self.get_reader_metadata(reader, delimiter)
    }

    fn get_reader_metadata(
        &mut self,
        reader: impl BufRead,
        delimiter: Option<u8>,
    ) -> Result<CsvMetadata> {
        let mut metadata = CsvMetadata::default();
        // Consume all meta lines; `line` is the first content (data) line.
        let line = self.parse_meta_lines(reader, &mut metadata)?;
        set_delimiter(delimiter, &mut metadata, &line);
        set_columns(&mut metadata, &line, &self.tr)?;
        Ok(metadata)
    }

    /// Parses the meta head of the file, and returns the first content line.
    fn parse_meta_lines(
        &mut self,
        mut reader: impl BufRead,
        metadata: &mut CsvMetadata,
    ) -> Result<String> {
        let mut line = String::new();
        reader.read_line(&mut line)?;
        if self.parse_first_line(&line, metadata) {
            line.clear();
            reader.read_line(&mut line)?;
            // Keep consuming lines for as long as they are '#' comments.
            while self.parse_line(&line, metadata) {
                line.clear();
                reader.read_line(&mut line)?;
            }
        }
        Ok(line)
    }

    /// True if the line is a meta line, i.e. a comment, or starting with 'tags:'.
    fn parse_first_line(&mut self, line: &str, metadata: &mut CsvMetadata) -> bool {
        if let Some(tags) = line.strip_prefix("tags:") {
            // Legacy syntax: a bare 'tags:' prefix is only honoured on the
            // very first line (see the tags-related tests below).
            metadata.tags = tags.trim().to_owned();
            true
        } else {
            self.parse_line(line, metadata)
        }
    }

    /// True if the line is a comment.
    fn parse_line(&mut self, line: &str, metadata: &mut CsvMetadata) -> bool {
        if let Some(l) = line.strip_prefix('#') {
            // Comments of the form '#key:value' carry metadata; other
            // comments are skipped but still count as meta lines.
            if let Some((key, value)) = l.split_once(':') {
                self.parse_meta_value(key, strip_line_ending(value), metadata);
            }
            true
        } else {
            false
        }
    }

    /// Applies a single '#key:value' entry to `metadata`. Unknown keys and
    /// unparsable values are silently ignored.
    fn parse_meta_value(&mut self, key: &str, value: &str, metadata: &mut CsvMetadata) {
        match key.trim().to_ascii_lowercase().as_str() {
            "delimiter" => {
                // NOTE(review): helper name is missing an 'i' ("delimter");
                // renaming would touch its definition elsewhere in this file.
                if let Some(delimiter) = delimter_from_value(value) {
                    metadata.delimiter = delimiter as u32;
                }
            }
            "tags" => metadata.tags = value.trim().to_owned(),
            "columns" => {
                if let Ok(columns) = self.parse_columns(value, metadata) {
                    metadata.columns = columns;
                }
            }
            "deck" => {
                // Accepts either a deck name or a numeric deck id.
                if let Ok(Some(did)) = self.deck_id_for_string(value) {
                    metadata.deck_id = did.0;
                }
            }
            "notetype" => {
                // Accepts either a notetype name or a numeric notetype id.
                if let Ok(Some(nt)) = self.notetype_for_string(value) {
                    metadata.notetype_id = nt.id.0;
                }
            }
            "html" => metadata.html = value.to_lowercase().parse::<bool>().ok(),
            _ => (),
        }
    }

    /// Splits a '#columns:' line into labels, using the delimiter already
    /// declared in the file head if any, and otherwise guessing one from
    /// the columns line itself.
    fn parse_columns(&mut self, line: &str, metadata: &mut CsvMetadata) -> Result<Vec<String>> {
        let delimiter = if metadata.delimiter != 0 {
            metadata.delimiter as u8
        } else {
            delimiter_from_line(line)
        };
        map_single_record(line, delimiter, |record| {
            record
                .iter()
                .enumerate()
                .map(|(idx, s)| self.column_label(idx, s))
                .collect()
        })
    }

    /// Returns a user-facing label for a column: a generic numbered label
    /// ("Column N") for empty names, the localised 'Tags' for the special
    /// name "tags", and the given name unchanged otherwise.
    fn column_label(&self, idx: usize, column: &str) -> String {
        match column.trim() {
            "" => self.tr.importing_column(idx + 1).to_string(),
            "tags" => self.tr.editing_tags().to_string(),
            s => s.to_string(),
        }
    }
}
/// If no column labels were declared in the file head, derives generic,
/// numbered labels ("Column 1", ...) from the field count of the first
/// content line.
fn set_columns(metadata: &mut CsvMetadata, line: &str, tr: &I18n) -> Result<()> {
    if !metadata.columns.is_empty() {
        // Labels were already parsed from a '#columns:' line.
        return Ok(());
    }
    let column_count = map_single_record(line, metadata.delimiter as u8, |record| record.len())?;
    metadata.columns = (1..=column_count)
        .map(|i| tr.importing_column(i).to_string())
        .collect();
    Ok(())
}
/// Determines the effective delimiter: one explicitly provided by the
/// caller wins, then one already parsed from the file's meta lines, and
/// finally one guessed from the first content line.
fn set_delimiter(delimiter: Option<u8>, metadata: &mut CsvMetadata, line: &str) {
    // XXX: should '#delimiter:[NUL]' be supported?
    match delimiter {
        Some(provided) => metadata.delimiter = provided as u32,
        None if metadata.delimiter == 0 => {
            metadata.delimiter = delimiter_from_line(line) as u32;
        }
        None => (),
    }
}
/// Resolves the value of a '#delimiter:' meta line to a single byte,
/// accepting either a literal single-byte character or one of a few
/// human-readable names. Returns None for unsupported values.
fn delimter_from_value(value: &str) -> Option<u8> {
    // FIXME: bytes like '\n', '#' and '"' will likely cause issues
    if let [byte] = value.as_bytes() {
        // Any single-byte value is taken verbatim, without trimming, so
        // that a literal space or tab character works.
        return Some(*byte);
    }
    match value.trim().to_ascii_lowercase().as_str() {
        "tab" | "\\t" => Some(b'\t'),
        "semicolon" => Some(b';'),
        "comma" => Some(b','),
        "space" => Some(b' '),
        _ => None,
    }
}
/// Guesses the delimiter used on the given line, preferring tab over
/// semicolon over comma, and falling back to space if none occurs.
fn delimiter_from_line(line: &str) -> u8 {
    // TODO: use smarter heuristic
    [b'\t', b';', b',']
        .iter()
        .copied()
        .find(|&byte| line.contains(byte as char))
        .unwrap_or(b' ')
}
/// Parses `line` as a single CSV record with the given delimiter and
/// applies `op` to it, failing with a corrupt-import error if the line
/// cannot be parsed.
fn map_single_record<T>(
    line: &str,
    delimiter: u8,
    op: impl FnOnce(&csv::StringRecord) -> T,
) -> Result<T> {
    let mut reader = csv::ReaderBuilder::new()
        .delimiter(delimiter)
        .from_reader(line.as_bytes());
    match reader.headers() {
        Ok(record) => Ok(op(record)),
        Err(_) => Err(AnkiError::ImportError(ImportError::Corrupt)),
    }
}
/// Removes a trailing "\r\n" or "\n" from the line, if present.
/// A bare trailing '\r' (no newline) is deliberately left untouched.
fn strip_line_ending(line: &str) -> &str {
    if let Some(stripped) = line.strip_suffix("\r\n") {
        stripped
    } else if let Some(stripped) = line.strip_suffix('\n') {
        stripped
    } else {
        line
    }
}
#[cfg(test)]
mod test {
    use super::*;
    use crate::collection::open_test_collection;

    /// Runs metadata extraction on an in-memory CSV string, optionally
    /// with a caller-provided delimiter override.
    macro_rules! metadata {
        ($col:expr,$csv:expr) => {
            metadata!($col, $csv, None)
        };
        ($col:expr,$csv:expr, $delim:expr) => {
            $col.get_reader_metadata(BufReader::new($csv.as_bytes()), $delim)
                .unwrap()
        };
    }

    #[test]
    fn should_detect_deck_by_name_or_id() {
        let mut col = open_test_collection();
        assert_eq!(metadata!(col, "#deck:Default\n").deck_id, 1);
        assert_eq!(metadata!(col, "#deck:1\n").deck_id, 1);
    }

    #[test]
    fn should_detect_notetype_by_name_or_id() {
        let mut col = open_test_collection();
        let basic_id = col.get_notetype_by_name("Basic").unwrap().unwrap().id.0;
        assert_eq!(metadata!(col, "#notetype:Basic\n").notetype_id, basic_id);
        assert_eq!(
            metadata!(col, &format!("#notetype:{basic_id}\n")).notetype_id,
            basic_id
        );
    }

    #[test]
    fn should_detect_valid_delimiters() {
        let mut col = open_test_collection();
        // literal characters and case-insensitive names
        assert_eq!(metadata!(col, "#delimiter: \n").delimiter, ' ' as u32);
        assert_eq!(metadata!(col, "#delimiter:space\n").delimiter, ' ' as u32);
        assert_eq!(metadata!(col, "#delimiter:\t\n").delimiter, '\t' as u32);
        assert_eq!(metadata!(col, "#delimiter:Tab\n").delimiter, '\t' as u32);
        assert_eq!(metadata!(col, "#delimiter:;\n").delimiter, ';' as u32);
        assert_eq!(
            metadata!(col, "#delimiter:SEMICOLON\n").delimiter,
            ';' as u32
        );
        assert_eq!(metadata!(col, "#delimiter:,\n").delimiter, ',' as u32);
        assert_eq!(metadata!(col, "#delimiter:comma\n").delimiter, ',' as u32);
        assert_eq!(metadata!(col, "#delimiter:|\n").delimiter, '|' as u32);
        // fallback
        assert_eq!(metadata!(col, "#delimiter:foo\n").delimiter, ' ' as u32);
        assert_eq!(metadata!(col, "#delimiter:♥\n").delimiter, ' ' as u32);
        // pick up from first line
        assert_eq!(metadata!(col, "foo\tbar\n").delimiter, '\t' as u32);
        // override with provided
        assert_eq!(
            metadata!(col, "#delimiter: \nfoo\tbar\n", Some(b'|')).delimiter,
            '|' as u32
        );
    }

    #[test]
    fn should_detect_valid_html_toggle() {
        let mut col = open_test_collection();
        assert_eq!(metadata!(col, "#html:true\n").html, Some(true));
        assert_eq!(metadata!(col, "#html:FALSE\n").html, Some(false));
        // unparsable values leave the toggle unset
        assert_eq!(metadata!(col, "#html:maybe\n").html, None);
    }

    #[test]
    fn should_detect_old_and_new_style_tags() {
        let mut col = open_test_collection();
        assert_eq!(&metadata!(col, "tags:foo bar\n").tags, "foo bar");
        assert_eq!(&metadata!(col, "#tags:foo bar\n").tags, "foo bar");
        // only in head
        assert_eq!(&metadata!(col, "#\n#tags:foo bar\n").tags, "foo bar");
        assert_eq!(&metadata!(col, "\n#tags:foo bar\n").tags, "");
        // only on very first line
        assert_eq!(&metadata!(col, "#\ntags:foo bar\n").tags, "");
    }

    #[test]
    fn should_detect_column_number_and_names() {
        let mut col = open_test_collection();
        // detect from line
        assert_eq!(
            metadata!(col, "foo;bar\n").columns,
            ["Column 1", "Column 2"]
        );
        // detect encoded
        assert_eq!(
            metadata!(col, "#delimiter:,\nfoo;bar\n").columns,
            ["Column 1"]
        );
        assert_eq!(
            metadata!(col, "#delimiter:|\nfoo|bar\n").columns,
            ["Column 1", "Column 2"]
        );
        // override
        assert_eq!(
            metadata!(col, "#delimiter:;\nfoo;bar\n", Some(b'|')).columns,
            ["Column 1"]
        );
        // custom names
        assert_eq!(metadata!(col, "#columns:one,two\n").columns, ["one", "two"]);
        assert_eq!(
            metadata!(col, "#delimiter:|\n#columns:one|two\n").columns,
            ["one", "two"]
        );
        // fill in missing
        assert_eq!(
            metadata!(col, "#columns:one, ,two,\n").columns,
            ["one", "Column 2", "two", "Column 4"]
        );
        // fill in special names
        assert_eq!(metadata!(col, "#columns:tags\n").columns, ["Tags"]);
    }
}

View file

@ -0,0 +1,12 @@
// Copyright: Ankitects Pty Ltd and contributors
// License: GNU AGPL, version 3 or later; http://www.gnu.org/licenses/agpl.html
mod import;
mod metadata;
/// Maps a CSV column to the role it plays during import.
#[derive(Debug, Clone, Copy)]
pub enum Column {
    /// The column maps to the note field with the given index.
    Field(usize),
    /// The column is skipped during import.
    Ignore,
    /// The column contains the note's tags.
    Tags,
}

View file

@ -38,7 +38,7 @@ impl<'a> Context<'a> {
let mut notetypes = HashMap::new();
notetypes.insert(
String::new(),
col.get_notetype_for_string(&data.default_notetype)?,
col.notetype_for_string(&data.default_notetype)?,
);
let mut deck_ids = HashMap::new();
deck_ids.insert(String::new(), col.deck_id_for_string(&data.default_deck)?);
@ -64,7 +64,7 @@ impl<'a> Context<'a> {
Ok(if let Some(nt) = self.notetypes.get(&note.notetype) {
nt.clone()
} else {
let nt = self.col.get_notetype_for_string(&note.notetype)?;
let nt = self.col.notetype_for_string(&note.notetype)?;
self.notetypes.insert(note.notetype.clone(), nt.clone());
nt
})
@ -134,7 +134,7 @@ impl<'a> Context<'a> {
}
impl Collection {
fn deck_id_for_string(&mut self, deck: &str) -> Result<Option<DeckId>> {
pub(super) fn deck_id_for_string(&mut self, deck: &str) -> Result<Option<DeckId>> {
if let Ok(did) = deck.parse::<DeckId>() {
if self.get_deck(did)?.is_some() {
return Ok(Some(did));
@ -143,7 +143,7 @@ impl Collection {
self.get_deck_id(deck)
}
fn get_notetype_for_string(&mut self, notetype: &str) -> Result<Option<Arc<Notetype>>> {
pub(super) fn notetype_for_string(&mut self, notetype: &str) -> Result<Option<Arc<Notetype>>> {
if let Some(nt) = self.get_notetype_for_id_string(notetype)? {
Ok(Some(nt))
} else {