Fix column detection

Column detection was broken because escaped line breaks were not considered.
Also removes delimiter detection on the `#columns:` line. The user must use
tabs or set the delimiter beforehand.
This commit is contained in:
RumovZ 2022-06-03 09:55:08 +02:00
parent ab91517bb4
commit d04926e30f

View file

@ -4,7 +4,7 @@
use std::{ use std::{
collections::{HashMap, HashSet}, collections::{HashMap, HashSet},
fs::File, fs::File,
io::{BufRead, BufReader}, io::{BufRead, BufReader, Read, Seek, SeekFrom},
}; };
use strum::IntoEnumIterator; use strum::IntoEnumIterator;
@ -31,38 +31,51 @@ impl Collection {
fn get_reader_metadata( fn get_reader_metadata(
&mut self, &mut self,
reader: impl BufRead, mut reader: impl BufRead + Seek,
delimiter: Option<Delimiter>, delimiter: Option<Delimiter>,
notetype_id: Option<NotetypeId>, notetype_id: Option<NotetypeId>,
) -> Result<CsvMetadata> { ) -> Result<CsvMetadata> {
let mut metadata = CsvMetadata::default(); let mut metadata = CsvMetadata::default();
let line = self.parse_meta_lines(reader, &mut metadata)?; let meta_len = self.parse_meta_lines(&mut reader, &mut metadata)? as u64;
maybe_set_fallback_delimiter(delimiter, &mut metadata, &line);
maybe_set_fallback_columns(&mut metadata, &line)?; reader.seek(SeekFrom::Start(meta_len))?;
maybe_set_fallback_is_html(&mut metadata, &line)?; maybe_set_fallback_delimiter(delimiter, &mut metadata, &mut reader)?;
reader.seek(SeekFrom::Start(meta_len))?;
let mut csv_reader = csv::ReaderBuilder::new()
.delimiter(metadata.delimiter().byte())
.from_reader(reader);
let record = csv_reader.headers()?;
maybe_set_fallback_columns(&mut metadata, record)?;
maybe_set_fallback_is_html(&mut metadata, record)?;
self.maybe_set_fallback_notetype(&mut metadata, notetype_id)?; self.maybe_set_fallback_notetype(&mut metadata, notetype_id)?;
self.maybe_init_notetype_map(&mut metadata)?; self.maybe_init_notetype_map(&mut metadata)?;
self.maybe_set_fallback_deck(&mut metadata)?; self.maybe_set_fallback_deck(&mut metadata)?;
Ok(metadata) Ok(metadata)
} }
/// Parses the meta head of the file, and returns the first content line. /// Parses the meta head of the file and returns the total of meta bytes.
fn parse_meta_lines( fn parse_meta_lines(
&mut self, &mut self,
mut reader: impl BufRead, mut reader: impl BufRead,
metadata: &mut CsvMetadata, metadata: &mut CsvMetadata,
) -> Result<String> { ) -> Result<usize> {
let mut meta_len = 0;
let mut line = String::new(); let mut line = String::new();
reader.read_line(&mut line)?; let mut line_len = reader.read_line(&mut line)?;
if self.parse_first_line(&line, metadata) { if self.parse_first_line(&line, metadata) {
meta_len += line_len;
line.clear(); line.clear();
reader.read_line(&mut line)?; line_len = reader.read_line(&mut line)?;
while self.parse_line(&line, metadata) { while self.parse_line(&line, metadata) {
meta_len += line_len;
line.clear(); line.clear();
reader.read_line(&mut line)?; line_len = reader.read_line(&mut line)?;
} }
} }
Ok(line) Ok(meta_len)
} }
/// True if the line is a meta line, i.e. a comment, or starting with 'tags:'. /// True if the line is a meta line, i.e. a comment, or starting with 'tags:'.
@ -103,7 +116,7 @@ impl Collection {
} }
"tags" => metadata.global_tags = collect_tags(value), "tags" => metadata.global_tags = collect_tags(value),
"columns" => { "columns" => {
if let Ok(columns) = self.parse_columns(value, metadata) { if let Ok(columns) = parse_columns(value, metadata.delimiter()) {
metadata.column_labels = columns; metadata.column_labels = columns;
} }
} }
@ -131,17 +144,6 @@ impl Collection {
} }
} }
fn parse_columns(&mut self, line: &str, metadata: &mut CsvMetadata) -> Result<Vec<String>> {
let delimiter = if metadata.force_delimiter {
metadata.delimiter()
} else {
delimiter_from_line(line)
};
map_single_record(line, delimiter, |record| {
record.iter().map(ToString::to_string).collect()
})
}
fn maybe_set_fallback_notetype( fn maybe_set_fallback_notetype(
&mut self, &mut self,
metadata: &mut CsvMetadata, metadata: &mut CsvMetadata,
@ -205,6 +207,12 @@ impl Collection {
} }
} }
fn parse_columns(line: &str, delimiter: Delimiter) -> Result<Vec<String>> {
map_single_record(line, delimiter, |record| {
record.iter().map(ToString::to_string).collect()
})
}
pub(super) fn collect_tags(txt: &str) -> Vec<String> { pub(super) fn collect_tags(txt: &str) -> Vec<String> {
txt.split_whitespace() txt.split_whitespace()
.filter(|s| !s.is_empty()) .filter(|s| !s.is_empty())
@ -263,20 +271,24 @@ fn ensure_first_field_is_mapped(
Ok(()) Ok(())
} }
fn maybe_set_fallback_columns(metadata: &mut CsvMetadata, line: &str) -> Result<()> { fn maybe_set_fallback_columns(
metadata: &mut CsvMetadata,
record: &csv::StringRecord,
) -> Result<()> {
if metadata.column_labels.is_empty() { if metadata.column_labels.is_empty() {
let columns = map_single_record(line, metadata.delimiter(), |r| r.len())?; metadata.column_labels = vec![String::new(); record.len()];
metadata.column_labels = vec![String::new(); columns];
} }
Ok(()) Ok(())
} }
fn maybe_set_fallback_is_html(metadata: &mut CsvMetadata, line: &str) -> Result<()> { fn maybe_set_fallback_is_html(
metadata: &mut CsvMetadata,
record: &csv::StringRecord,
) -> Result<()> {
// TODO: should probably check more than one line; can reuse preview lines // TODO: should probably check more than one line; can reuse preview lines
// when it's implemented // when it's implemented
if !metadata.force_is_html { if !metadata.force_is_html {
metadata.is_html = metadata.is_html = record.iter().any(is_html);
map_single_record(line, metadata.delimiter(), |r| r.iter().any(is_html))?;
} }
Ok(()) Ok(())
} }
@ -284,13 +296,14 @@ fn maybe_set_fallback_is_html(metadata: &mut CsvMetadata, line: &str) -> Result<
fn maybe_set_fallback_delimiter( fn maybe_set_fallback_delimiter(
delimiter: Option<Delimiter>, delimiter: Option<Delimiter>,
metadata: &mut CsvMetadata, metadata: &mut CsvMetadata,
line: &str, reader: impl Read,
) { ) -> Result<()> {
if let Some(delim) = delimiter { if let Some(delim) = delimiter {
metadata.set_delimiter(delim); metadata.set_delimiter(delim);
} else if !metadata.force_delimiter { } else if !metadata.force_delimiter {
metadata.set_delimiter(delimiter_from_line(line)); metadata.set_delimiter(delimiter_from_reader(reader)?);
} }
Ok(())
} }
fn delimiter_from_value(value: &str) -> Option<Delimiter> { fn delimiter_from_value(value: &str) -> Option<Delimiter> {
@ -303,14 +316,16 @@ fn delimiter_from_value(value: &str) -> Option<Delimiter> {
None None
} }
fn delimiter_from_line(line: &str) -> Delimiter { fn delimiter_from_reader(mut reader: impl Read) -> Result<Delimiter> {
let mut buf = [0; 8 * 1024];
let _ = reader.read(&mut buf)?;
// TODO: use smarter heuristic // TODO: use smarter heuristic
for delimiter in Delimiter::iter() { for delimiter in Delimiter::iter() {
if line.contains(delimiter.byte() as char) { if buf.contains(&delimiter.byte()) {
return delimiter; return Ok(delimiter);
} }
} }
Delimiter::Space Ok(Delimiter::Space)
} }
fn map_single_record<T>( fn map_single_record<T>(
@ -400,6 +415,8 @@ impl NameOrId {
#[cfg(test)] #[cfg(test)]
mod test { mod test {
use std::io::Cursor;
use super::*; use super::*;
use crate::collection::open_test_collection; use crate::collection::open_test_collection;
@ -408,7 +425,7 @@ mod test {
metadata!($col, $csv, None) metadata!($col, $csv, None)
}; };
($col:expr,$csv:expr, $delim:expr) => { ($col:expr,$csv:expr, $delim:expr) => {
$col.get_reader_metadata(BufReader::new($csv.as_bytes()), $delim, None) $col.get_reader_metadata(BufReader::new(Cursor::new($csv.as_bytes())), $delim, None)
.unwrap() .unwrap()
}; };
} }
@ -561,7 +578,7 @@ mod test {
// custom names // custom names
assert_eq!( assert_eq!(
metadata!(col, "#columns:one,two\n").column_labels, metadata!(col, "#columns:one\ttwo\n").column_labels,
["one", "two"] ["one", "two"]
); );
assert_eq!( assert_eq!(
@ -570,6 +587,17 @@ mod test {
); );
} }
#[test]
fn should_detect_column_number_despite_escaped_line_breaks() {
let mut col = open_test_collection();
assert_eq!(
metadata!(col, "\"foo|\nbar\"\tfoo\tbar\n")
.column_labels
.len(),
3
);
}
impl CsvMetadata { impl CsvMetadata {
fn unwrap_notetype_map(&self) -> &[u32] { fn unwrap_notetype_map(&self) -> &[u32] {
match &self.notetype { match &self.notetype {
@ -589,7 +617,7 @@ mod test {
#[test] #[test]
fn should_map_default_notetype_fields_by_given_column_names() { fn should_map_default_notetype_fields_by_given_column_names() {
let mut col = open_test_collection(); let mut col = open_test_collection();
let meta = metadata!(col, "#columns:Back,Front\nfoo,bar,baz\n"); let meta = metadata!(col, "#columns:Back\tFront\nfoo,bar,baz\n");
assert_eq!(meta.unwrap_notetype_map(), &[2, 1]); assert_eq!(meta.unwrap_notetype_map(), &[2, 1]);
} }
} }