mirror of
https://github.com/ankitects/anki.git
synced 2025-09-25 09:16:38 -04:00
Fix column detection
Column detection was broken because escaped line breaks were not considered. This commit also removes delimiter detection on the `#columns:` line; the user must use tabs or set the delimiter beforehand.
This commit is contained in:
parent
ab91517bb4
commit
d04926e30f
1 changed files with 68 additions and 40 deletions
|
@ -4,7 +4,7 @@
|
||||||
use std::{
|
use std::{
|
||||||
collections::{HashMap, HashSet},
|
collections::{HashMap, HashSet},
|
||||||
fs::File,
|
fs::File,
|
||||||
io::{BufRead, BufReader},
|
io::{BufRead, BufReader, Read, Seek, SeekFrom},
|
||||||
};
|
};
|
||||||
|
|
||||||
use strum::IntoEnumIterator;
|
use strum::IntoEnumIterator;
|
||||||
|
@ -31,38 +31,51 @@ impl Collection {
|
||||||
|
|
||||||
fn get_reader_metadata(
|
fn get_reader_metadata(
|
||||||
&mut self,
|
&mut self,
|
||||||
reader: impl BufRead,
|
mut reader: impl BufRead + Seek,
|
||||||
delimiter: Option<Delimiter>,
|
delimiter: Option<Delimiter>,
|
||||||
notetype_id: Option<NotetypeId>,
|
notetype_id: Option<NotetypeId>,
|
||||||
) -> Result<CsvMetadata> {
|
) -> Result<CsvMetadata> {
|
||||||
let mut metadata = CsvMetadata::default();
|
let mut metadata = CsvMetadata::default();
|
||||||
let line = self.parse_meta_lines(reader, &mut metadata)?;
|
let meta_len = self.parse_meta_lines(&mut reader, &mut metadata)? as u64;
|
||||||
maybe_set_fallback_delimiter(delimiter, &mut metadata, &line);
|
|
||||||
maybe_set_fallback_columns(&mut metadata, &line)?;
|
reader.seek(SeekFrom::Start(meta_len))?;
|
||||||
maybe_set_fallback_is_html(&mut metadata, &line)?;
|
maybe_set_fallback_delimiter(delimiter, &mut metadata, &mut reader)?;
|
||||||
|
|
||||||
|
reader.seek(SeekFrom::Start(meta_len))?;
|
||||||
|
let mut csv_reader = csv::ReaderBuilder::new()
|
||||||
|
.delimiter(metadata.delimiter().byte())
|
||||||
|
.from_reader(reader);
|
||||||
|
let record = csv_reader.headers()?;
|
||||||
|
|
||||||
|
maybe_set_fallback_columns(&mut metadata, record)?;
|
||||||
|
maybe_set_fallback_is_html(&mut metadata, record)?;
|
||||||
self.maybe_set_fallback_notetype(&mut metadata, notetype_id)?;
|
self.maybe_set_fallback_notetype(&mut metadata, notetype_id)?;
|
||||||
self.maybe_init_notetype_map(&mut metadata)?;
|
self.maybe_init_notetype_map(&mut metadata)?;
|
||||||
self.maybe_set_fallback_deck(&mut metadata)?;
|
self.maybe_set_fallback_deck(&mut metadata)?;
|
||||||
|
|
||||||
Ok(metadata)
|
Ok(metadata)
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Parses the meta head of the file, and returns the first content line.
|
/// Parses the meta head of the file and returns the total length of the meta lines in bytes.
|
||||||
fn parse_meta_lines(
|
fn parse_meta_lines(
|
||||||
&mut self,
|
&mut self,
|
||||||
mut reader: impl BufRead,
|
mut reader: impl BufRead,
|
||||||
metadata: &mut CsvMetadata,
|
metadata: &mut CsvMetadata,
|
||||||
) -> Result<String> {
|
) -> Result<usize> {
|
||||||
|
let mut meta_len = 0;
|
||||||
let mut line = String::new();
|
let mut line = String::new();
|
||||||
reader.read_line(&mut line)?;
|
let mut line_len = reader.read_line(&mut line)?;
|
||||||
if self.parse_first_line(&line, metadata) {
|
if self.parse_first_line(&line, metadata) {
|
||||||
|
meta_len += line_len;
|
||||||
line.clear();
|
line.clear();
|
||||||
reader.read_line(&mut line)?;
|
line_len = reader.read_line(&mut line)?;
|
||||||
while self.parse_line(&line, metadata) {
|
while self.parse_line(&line, metadata) {
|
||||||
|
meta_len += line_len;
|
||||||
line.clear();
|
line.clear();
|
||||||
reader.read_line(&mut line)?;
|
line_len = reader.read_line(&mut line)?;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
Ok(line)
|
Ok(meta_len)
|
||||||
}
|
}
|
||||||
|
|
||||||
/// True if the line is a meta line, i.e. a comment, or starting with 'tags:'.
|
/// True if the line is a meta line, i.e. a comment, or starting with 'tags:'.
|
||||||
|
@ -103,7 +116,7 @@ impl Collection {
|
||||||
}
|
}
|
||||||
"tags" => metadata.global_tags = collect_tags(value),
|
"tags" => metadata.global_tags = collect_tags(value),
|
||||||
"columns" => {
|
"columns" => {
|
||||||
if let Ok(columns) = self.parse_columns(value, metadata) {
|
if let Ok(columns) = parse_columns(value, metadata.delimiter()) {
|
||||||
metadata.column_labels = columns;
|
metadata.column_labels = columns;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -131,17 +144,6 @@ impl Collection {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn parse_columns(&mut self, line: &str, metadata: &mut CsvMetadata) -> Result<Vec<String>> {
|
|
||||||
let delimiter = if metadata.force_delimiter {
|
|
||||||
metadata.delimiter()
|
|
||||||
} else {
|
|
||||||
delimiter_from_line(line)
|
|
||||||
};
|
|
||||||
map_single_record(line, delimiter, |record| {
|
|
||||||
record.iter().map(ToString::to_string).collect()
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
fn maybe_set_fallback_notetype(
|
fn maybe_set_fallback_notetype(
|
||||||
&mut self,
|
&mut self,
|
||||||
metadata: &mut CsvMetadata,
|
metadata: &mut CsvMetadata,
|
||||||
|
@ -205,6 +207,12 @@ impl Collection {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn parse_columns(line: &str, delimiter: Delimiter) -> Result<Vec<String>> {
|
||||||
|
map_single_record(line, delimiter, |record| {
|
||||||
|
record.iter().map(ToString::to_string).collect()
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
pub(super) fn collect_tags(txt: &str) -> Vec<String> {
|
pub(super) fn collect_tags(txt: &str) -> Vec<String> {
|
||||||
txt.split_whitespace()
|
txt.split_whitespace()
|
||||||
.filter(|s| !s.is_empty())
|
.filter(|s| !s.is_empty())
|
||||||
|
@ -263,20 +271,24 @@ fn ensure_first_field_is_mapped(
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
fn maybe_set_fallback_columns(metadata: &mut CsvMetadata, line: &str) -> Result<()> {
|
fn maybe_set_fallback_columns(
|
||||||
|
metadata: &mut CsvMetadata,
|
||||||
|
record: &csv::StringRecord,
|
||||||
|
) -> Result<()> {
|
||||||
if metadata.column_labels.is_empty() {
|
if metadata.column_labels.is_empty() {
|
||||||
let columns = map_single_record(line, metadata.delimiter(), |r| r.len())?;
|
metadata.column_labels = vec![String::new(); record.len()];
|
||||||
metadata.column_labels = vec![String::new(); columns];
|
|
||||||
}
|
}
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
fn maybe_set_fallback_is_html(metadata: &mut CsvMetadata, line: &str) -> Result<()> {
|
fn maybe_set_fallback_is_html(
|
||||||
|
metadata: &mut CsvMetadata,
|
||||||
|
record: &csv::StringRecord,
|
||||||
|
) -> Result<()> {
|
||||||
// TODO: should probably check more than one line; can reuse preview lines
|
// TODO: should probably check more than one line; can reuse preview lines
|
||||||
// when it's implemented
|
// when it's implemented
|
||||||
if !metadata.force_is_html {
|
if !metadata.force_is_html {
|
||||||
metadata.is_html =
|
metadata.is_html = record.iter().any(is_html);
|
||||||
map_single_record(line, metadata.delimiter(), |r| r.iter().any(is_html))?;
|
|
||||||
}
|
}
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
@ -284,13 +296,14 @@ fn maybe_set_fallback_is_html(metadata: &mut CsvMetadata, line: &str) -> Result<
|
||||||
fn maybe_set_fallback_delimiter(
|
fn maybe_set_fallback_delimiter(
|
||||||
delimiter: Option<Delimiter>,
|
delimiter: Option<Delimiter>,
|
||||||
metadata: &mut CsvMetadata,
|
metadata: &mut CsvMetadata,
|
||||||
line: &str,
|
reader: impl Read,
|
||||||
) {
|
) -> Result<()> {
|
||||||
if let Some(delim) = delimiter {
|
if let Some(delim) = delimiter {
|
||||||
metadata.set_delimiter(delim);
|
metadata.set_delimiter(delim);
|
||||||
} else if !metadata.force_delimiter {
|
} else if !metadata.force_delimiter {
|
||||||
metadata.set_delimiter(delimiter_from_line(line));
|
metadata.set_delimiter(delimiter_from_reader(reader)?);
|
||||||
}
|
}
|
||||||
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
fn delimiter_from_value(value: &str) -> Option<Delimiter> {
|
fn delimiter_from_value(value: &str) -> Option<Delimiter> {
|
||||||
|
@ -303,14 +316,16 @@ fn delimiter_from_value(value: &str) -> Option<Delimiter> {
|
||||||
None
|
None
|
||||||
}
|
}
|
||||||
|
|
||||||
fn delimiter_from_line(line: &str) -> Delimiter {
|
fn delimiter_from_reader(mut reader: impl Read) -> Result<Delimiter> {
|
||||||
|
let mut buf = [0; 8 * 1024];
|
||||||
|
let _ = reader.read(&mut buf)?;
|
||||||
// TODO: use smarter heuristic
|
// TODO: use smarter heuristic
|
||||||
for delimiter in Delimiter::iter() {
|
for delimiter in Delimiter::iter() {
|
||||||
if line.contains(delimiter.byte() as char) {
|
if buf.contains(&delimiter.byte()) {
|
||||||
return delimiter;
|
return Ok(delimiter);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
Delimiter::Space
|
Ok(Delimiter::Space)
|
||||||
}
|
}
|
||||||
|
|
||||||
fn map_single_record<T>(
|
fn map_single_record<T>(
|
||||||
|
@ -400,6 +415,8 @@ impl NameOrId {
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod test {
|
mod test {
|
||||||
|
use std::io::Cursor;
|
||||||
|
|
||||||
use super::*;
|
use super::*;
|
||||||
use crate::collection::open_test_collection;
|
use crate::collection::open_test_collection;
|
||||||
|
|
||||||
|
@ -408,7 +425,7 @@ mod test {
|
||||||
metadata!($col, $csv, None)
|
metadata!($col, $csv, None)
|
||||||
};
|
};
|
||||||
($col:expr,$csv:expr, $delim:expr) => {
|
($col:expr,$csv:expr, $delim:expr) => {
|
||||||
$col.get_reader_metadata(BufReader::new($csv.as_bytes()), $delim, None)
|
$col.get_reader_metadata(BufReader::new(Cursor::new($csv.as_bytes())), $delim, None)
|
||||||
.unwrap()
|
.unwrap()
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
@ -561,7 +578,7 @@ mod test {
|
||||||
|
|
||||||
// custom names
|
// custom names
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
metadata!(col, "#columns:one,two\n").column_labels,
|
metadata!(col, "#columns:one\ttwo\n").column_labels,
|
||||||
["one", "two"]
|
["one", "two"]
|
||||||
);
|
);
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
|
@ -570,6 +587,17 @@ mod test {
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn should_detect_column_number_despite_escaped_line_breaks() {
|
||||||
|
let mut col = open_test_collection();
|
||||||
|
assert_eq!(
|
||||||
|
metadata!(col, "\"foo|\nbar\"\tfoo\tbar\n")
|
||||||
|
.column_labels
|
||||||
|
.len(),
|
||||||
|
3
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
impl CsvMetadata {
|
impl CsvMetadata {
|
||||||
fn unwrap_notetype_map(&self) -> &[u32] {
|
fn unwrap_notetype_map(&self) -> &[u32] {
|
||||||
match &self.notetype {
|
match &self.notetype {
|
||||||
|
@ -589,7 +617,7 @@ mod test {
|
||||||
#[test]
|
#[test]
|
||||||
fn should_map_default_notetype_fields_by_given_column_names() {
|
fn should_map_default_notetype_fields_by_given_column_names() {
|
||||||
let mut col = open_test_collection();
|
let mut col = open_test_collection();
|
||||||
let meta = metadata!(col, "#columns:Back,Front\nfoo,bar,baz\n");
|
let meta = metadata!(col, "#columns:Back\tFront\nfoo,bar,baz\n");
|
||||||
assert_eq!(meta.unwrap_notetype_map(), &[2, 1]);
|
assert_eq!(meta.unwrap_notetype_map(), &[2, 1]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in a new issue