From 51a379de23d8134b90e81935724bf6cdf2da00da Mon Sep 17 00:00:00 2001 From: Damien Elmes Date: Sat, 21 Mar 2020 15:15:59 +1000 Subject: [PATCH] add search that ignores combining chars On a test of a ~40k card collection, the 'ignore accents' add-on takes about 1150ms, and this code takes about 70ms. --- rslib/src/search/parser.rs | 2 ++ rslib/src/search/sqlwriter.rs | 16 ++++++++++++++++ rslib/src/storage/sqlite.rs | 13 +++++++++++++ rslib/src/text.rs | 30 +++++++++++++++++++++++++++++- 4 files changed, 60 insertions(+), 1 deletion(-) diff --git a/rslib/src/search/parser.rs b/rslib/src/search/parser.rs index 46575350c..92a7a3744 100644 --- a/rslib/src/search/parser.rs +++ b/rslib/src/search/parser.rs @@ -79,6 +79,7 @@ pub(super) enum SearchNode<'a> { }, WholeCollection, Regex(Cow<'a, str>), + NoCombining(Cow<'a, str>), } #[derive(Debug, PartialEq)] @@ -272,6 +273,7 @@ fn search_node_for_text_with_argument<'a>( "dupes" => parse_dupes(val.as_ref())?, "prop" => parse_prop(val.as_ref())?, "re" => SearchNode::Regex(val), + "nc" => SearchNode::NoCombining(val), // anything else is a field search _ => parse_single_field(key.as_ref(), val.as_ref()), }) diff --git a/rslib/src/search/sqlwriter.rs b/rslib/src/search/sqlwriter.rs index dee776f45..9b4107245 100644 --- a/rslib/src/search/sqlwriter.rs +++ b/rslib/src/search/sqlwriter.rs @@ -8,6 +8,7 @@ use crate::decks::get_deck; use crate::err::{AnkiError, Result}; use crate::notes::field_checksum; use crate::text::matches_wildcard; +use crate::text::without_combining; use crate::{ collection::RequestContext, text::strip_html_preserving_image_filenames, types::ObjID, }; @@ -81,6 +82,7 @@ impl SqlWriter<'_, '_> { SearchNode::Property { operator, kind } => self.write_prop(operator, kind)?, SearchNode::WholeCollection => write!(self.sql, "true").unwrap(), SearchNode::Regex(re) => self.write_regex(re.as_ref()), + SearchNode::NoCombining(text) => self.write_no_combining(text.as_ref()), }; Ok(()) } @@ -97,6 +99,20 @@ impl 
SqlWriter<'_, '_> { .unwrap(); } + fn write_no_combining(&mut self, text: &str) { + let text = format!("%{}%", without_combining(text)); + self.args.push(text); + write!( + self.sql, + concat!( + "(coalesce(without_combining(cast(n.sfld as text)), n.sfld) like ?{n} escape '\\' ", + "or coalesce(without_combining(n.flds), n.flds) like ?{n} escape '\\')" + ), + n = self.args.len(), + ) + .unwrap(); + } + fn write_tag(&mut self, text: &str) { match text { "none" => { diff --git a/rslib/src/storage/sqlite.rs b/rslib/src/storage/sqlite.rs index 0d8a3936d..23d6eec1b 100644 --- a/rslib/src/storage/sqlite.rs +++ b/rslib/src/storage/sqlite.rs @@ -10,12 +10,14 @@ use crate::{ decks::Deck, notetypes::NoteType, sched::cutoff::{sched_timing_today, SchedTimingToday}, + text::without_combining, types::{ObjID, Usn}, }; use regex::Regex; use rusqlite::{params, Connection, NO_PARAMS}; use std::cmp::Ordering; use std::{ + borrow::Cow, collections::HashMap, path::{Path, PathBuf}, }; @@ -58,6 +60,7 @@ fn open_or_create_collection_db(path: &Path) -> Result<Connection> { add_field_index_function(&db)?; add_regexp_function(&db)?; + add_without_combining_function(&db)?; db.create_collation("unicase", unicase_compare)?; @@ -75,6 +78,16 @@ fn add_field_index_function(db: &Connection) -> rusqlite::Result<()> { }) } +fn add_without_combining_function(db: &Connection) -> rusqlite::Result<()> { + db.create_scalar_function("without_combining", 1, true, |ctx| { + let text = ctx.get_raw(0).as_str()?; + Ok(match without_combining(text) { + Cow::Borrowed(_) => None, + Cow::Owned(o) => Some(o), + }) + }) +} + /// Adds sql function regexp(regex, string) -> is_match /// Taken from the rusqlite docs fn add_regexp_function(db: &Connection) -> rusqlite::Result<()> { diff --git a/rslib/src/text.rs b/rslib/src/text.rs index 0a799c5df..a88bd4a4f 100644 --- a/rslib/src/text.rs +++ b/rslib/src/text.rs @@ -6,7 +6,9 @@ use regex::{Captures, Regex}; use std::borrow::Cow; use std::ptr; use unicase::eq as uni_eq; -use
unicode_normalization::{is_nfc, UnicodeNormalization}; +use unicode_normalization::{ + char::is_combining_mark, is_nfc, is_nfkd_quick, IsNormalized, UnicodeNormalization, +}; #[derive(Debug, PartialEq)] pub enum AVTag { @@ -231,12 +233,32 @@ pub(crate) fn matches_wildcard(text: &str, search: &str) -> bool { } } +/// Convert provided string to NFKD form and strip combining characters. +pub(crate) fn without_combining(s: &str) -> Cow<str> { + // if the string is already normalized + if matches!(is_nfkd_quick(s.chars()), IsNormalized::Yes) { + // and no combining characters found, return unchanged + if !s.chars().any(is_combining_mark) { + return s.into(); + } + } + + // we need to create a new string without the combining marks + s.chars() + .nfkd() + .filter(|c| !is_combining_mark(*c)) + .collect::<String>() + .into() +} + #[cfg(test)] mod test { use super::matches_wildcard; + use crate::text::without_combining; use crate::text::{ extract_av_tags, strip_av_tags, strip_html, strip_html_preserving_image_filenames, AVTag, }; + use std::borrow::Cow; #[test] fn stripping() { @@ -287,4 +309,10 @@ mod test { assert_eq!(matches_wildcard("foo", "F*oo"), true); assert_eq!(matches_wildcard("foo", "b*"), false); } + + #[test] + fn combining() { + assert!(matches!(without_combining("test"), Cow::Borrowed(_))); + assert!(matches!(without_combining("Über"), Cow::Owned(_))); + } }