add search that ignores combining chars

On a test of a ~40k card collection, the 'ignore accents' add-on
takes about 1150ms, and this code takes about 70ms.
This commit is contained in:
Damien Elmes 2020-03-21 15:15:59 +10:00
parent f0ed34d79b
commit 51a379de23
4 changed files with 60 additions and 1 deletions

View file

@ -79,6 +79,7 @@ pub(super) enum SearchNode<'a> {
},
WholeCollection,
Regex(Cow<'a, str>),
NoCombining(Cow<'a, str>),
}
#[derive(Debug, PartialEq)]
@ -272,6 +273,7 @@ fn search_node_for_text_with_argument<'a>(
"dupes" => parse_dupes(val.as_ref())?,
"prop" => parse_prop(val.as_ref())?,
"re" => SearchNode::Regex(val),
"nc" => SearchNode::NoCombining(val),
// anything else is a field search
_ => parse_single_field(key.as_ref(), val.as_ref()),
})

View file

@ -8,6 +8,7 @@ use crate::decks::get_deck;
use crate::err::{AnkiError, Result};
use crate::notes::field_checksum;
use crate::text::matches_wildcard;
use crate::text::without_combining;
use crate::{
collection::RequestContext, text::strip_html_preserving_image_filenames, types::ObjID,
};
@ -81,6 +82,7 @@ impl SqlWriter<'_, '_> {
SearchNode::Property { operator, kind } => self.write_prop(operator, kind)?,
SearchNode::WholeCollection => write!(self.sql, "true").unwrap(),
SearchNode::Regex(re) => self.write_regex(re.as_ref()),
SearchNode::NoCombining(text) => self.write_no_combining(text.as_ref()),
};
Ok(())
}
@ -97,6 +99,20 @@ impl SqlWriter<'_, '_> {
.unwrap();
}
/// Append a clause that matches `text` against the `n.sfld` and `n.flds`
/// columns with combining characters ignored on both sides.
///
/// The search text is stripped with `without_combining()`, wrapped in `%`
/// so it matches anywhere in the column, and bound as a single argument
/// that both halves of the clause refer to by its 1-based position.
/// `coalesce` falls back to the raw column whenever the SQL-side
/// `without_combining()` yields NULL.
fn write_no_combining(&mut self, text: &str) {
    let pattern = format!("%{}%", without_combining(text));
    self.args.push(pattern);
    // the argument just pushed is the last one, so its index is the new length
    let arg_idx = self.args.len();
    write!(
        self.sql,
        concat!(
            "(coalesce(without_combining(cast(n.sfld as text)), n.sfld) like ?{n} escape '\\' ",
            "or coalesce(without_combining(n.flds), n.flds) like ?{n} escape '\\')"
        ),
        n = arg_idx,
    )
    .unwrap();
}
fn write_tag(&mut self, text: &str) {
match text {
"none" => {

View file

@ -10,12 +10,14 @@ use crate::{
decks::Deck,
notetypes::NoteType,
sched::cutoff::{sched_timing_today, SchedTimingToday},
text::without_combining,
types::{ObjID, Usn},
};
use regex::Regex;
use rusqlite::{params, Connection, NO_PARAMS};
use std::cmp::Ordering;
use std::{
borrow::Cow,
collections::HashMap,
path::{Path, PathBuf},
};
@ -58,6 +60,7 @@ fn open_or_create_collection_db(path: &Path) -> Result<Connection> {
add_field_index_function(&db)?;
add_regexp_function(&db)?;
add_without_combining_function(&db)?;
db.create_collation("unicase", unicase_compare)?;
@ -75,6 +78,16 @@ fn add_field_index_function(db: &Connection) -> rusqlite::Result<()> {
})
}
/// Registers the one-argument SQL scalar function `without_combining` on `db`.
///
/// The function reads its argument as text and passes it through
/// [`without_combining`]. When the text came back unchanged (borrowed) it
/// returns `None` rather than a copy of the input; only a newly built
/// string is returned as a value.
fn add_without_combining_function(db: &Connection) -> rusqlite::Result<()> {
    db.create_scalar_function("without_combining", 1, true, |ctx| {
        let input = ctx.get_raw(0).as_str()?;
        let stripped = if let Cow::Owned(changed) = without_combining(input) {
            Some(changed)
        } else {
            // nothing was removed, so there is no new string to hand back
            None
        };
        Ok(stripped)
    })
}
/// Adds sql function regexp(regex, string) -> is_match
/// Taken from the rusqlite docs
fn add_regexp_function(db: &Connection) -> rusqlite::Result<()> {

View file

@ -6,7 +6,9 @@ use regex::{Captures, Regex};
use std::borrow::Cow;
use std::ptr;
use unicase::eq as uni_eq;
use unicode_normalization::{is_nfc, UnicodeNormalization};
use unicode_normalization::{
char::is_combining_mark, is_nfc, is_nfkd_quick, IsNormalized, UnicodeNormalization,
};
#[derive(Debug, PartialEq)]
pub enum AVTag {
@ -231,12 +233,32 @@ pub(crate) fn matches_wildcard(text: &str, search: &str) -> bool {
}
}
/// Return `s` in NFKD form with every combining mark removed.
///
/// The input is handed back borrowed when the quick check reports it is
/// already NFKD and it contains no combining marks; otherwise a new string
/// is allocated.
pub(crate) fn without_combining(s: &str) -> Cow<str> {
    // fast path: already normalized and nothing to strip
    let untouched = matches!(is_nfkd_quick(s.chars()), IsNormalized::Yes)
        && s.chars().all(|c| !is_combining_mark(c));
    if untouched {
        return Cow::Borrowed(s);
    }

    // slow path: decompose, then drop the marks while rebuilding the string
    let stripped: String = s
        .chars()
        .nfkd()
        .filter(|&c| !is_combining_mark(c))
        .collect();
    Cow::Owned(stripped)
}
#[cfg(test)]
mod test {
use super::matches_wildcard;
use crate::text::without_combining;
use crate::text::{
extract_av_tags, strip_av_tags, strip_html, strip_html_preserving_image_filenames, AVTag,
};
use std::borrow::Cow;
#[test]
fn stripping() {
@ -287,4 +309,10 @@ mod test {
assert_eq!(matches_wildcard("foo", "F*oo"), true);
assert_eq!(matches_wildcard("foo", "b*"), false);
}
/// Checks both which `Cow` variant `without_combining` returns and the text
/// it produces.
#[test]
fn combining() {
    // plain ASCII needs no changes, so the input comes back borrowed
    assert!(matches!(without_combining("test"), Cow::Borrowed(_)));
    assert_eq!(without_combining("test"), "test");

    // a precomposed character is decomposed and its diacritic dropped
    assert!(matches!(without_combining("Über"), Cow::Owned(_)));
    assert_eq!(without_combining("Über"), "Uber");

    // input that is already decomposed must still lose its combining mark
    assert!(matches!(without_combining("U\u{308}ber"), Cow::Owned(_)));
    assert_eq!(without_combining("U\u{308}ber"), "Uber");
}
}