From 51a379de23d8134b90e81935724bf6cdf2da00da Mon Sep 17 00:00:00 2001
From: Damien Elmes <gpg@ankiweb.net>
Date: Sat, 21 Mar 2020 15:15:59 +1000
Subject: [PATCH] add search that ignores combining chars

On a test of a ~40k card collection, the 'ignore accents' add-on
takes about 1150ms, and this code takes about 70ms.
---
 rslib/src/search/parser.rs    |  2 ++
 rslib/src/search/sqlwriter.rs | 16 ++++++++++++++++
 rslib/src/storage/sqlite.rs   | 13 +++++++++++++
 rslib/src/text.rs             | 30 +++++++++++++++++++++++++++++-
 4 files changed, 60 insertions(+), 1 deletion(-)
diff --git a/rslib/src/search/parser.rs b/rslib/src/search/parser.rs
index 46575350c..92a7a3744 100644
--- a/rslib/src/search/parser.rs
+++ b/rslib/src/search/parser.rs
@@ -79,6 +79,7 @@ pub(super) enum SearchNode<'a> {
     },
     WholeCollection,
     Regex(Cow<'a, str>),
+    NoCombining(Cow<'a, str>),
 }
 
 #[derive(Debug, PartialEq)]
@@ -272,6 +273,7 @@ fn search_node_for_text_with_argument<'a>(
         "dupes" => parse_dupes(val.as_ref())?,
         "prop" => parse_prop(val.as_ref())?,
         "re" => SearchNode::Regex(val),
+        "nc" => SearchNode::NoCombining(val),
         // anything else is a field search
         _ => parse_single_field(key.as_ref(), val.as_ref()),
     })
diff --git a/rslib/src/search/sqlwriter.rs b/rslib/src/search/sqlwriter.rs
index dee776f45..9b4107245 100644
--- a/rslib/src/search/sqlwriter.rs
+++ b/rslib/src/search/sqlwriter.rs
@@ -8,6 +8,7 @@ use crate::decks::get_deck;
 use crate::err::{AnkiError, Result};
 use crate::notes::field_checksum;
 use crate::text::matches_wildcard;
+use crate::text::without_combining;
 use crate::{
     collection::RequestContext, text::strip_html_preserving_image_filenames, types::ObjID,
 };
@@ -81,6 +82,7 @@ impl SqlWriter<'_, '_> {
             SearchNode::Property { operator, kind } => self.write_prop(operator, kind)?,
             SearchNode::WholeCollection => write!(self.sql, "true").unwrap(),
             SearchNode::Regex(re) => self.write_regex(re.as_ref()),
+            SearchNode::NoCombining(text) => self.write_no_combining(text.as_ref()),
         };
         Ok(())
     }
@@ -97,6 +99,20 @@ impl SqlWriter<'_, '_> {
         .unwrap();
     }
 
+    fn write_no_combining(&mut self, text: &str) { // emits a LIKE test of `text` against n.sfld and n.flds, both sides passed through without_combining
+        let text = format!("%{}%", without_combining(text)); // strip marks from the search term too; the surrounding % let LIKE match anywhere
+        self.args.push(text); // handed over as a bound argument rather than spliced into the SQL text
+        write!(
+            self.sql,
+            concat!( // coalesce(): the Rust-side function returns None for text it left unchanged, so fall back to the raw column
+                "(coalesce(without_combining(cast(n.sfld as text)), n.sfld) like ?{n} escape '\\' ",
+                "or coalesce(without_combining(n.flds), n.flds) like ?{n} escape '\\')"
+            ),
+            n = self.args.len(), // both ?{n} refer to the argument pushed above — assumes args are bound 1-based, TODO confirm
+        )
+        .unwrap(); // NOTE(review): presumably self.sql is a String, so this write cannot fail — confirm
+    }
+
     fn write_tag(&mut self, text: &str) {
         match text {
             "none" => {
diff --git a/rslib/src/storage/sqlite.rs b/rslib/src/storage/sqlite.rs
index 0d8a3936d..23d6eec1b 100644
--- a/rslib/src/storage/sqlite.rs
+++ b/rslib/src/storage/sqlite.rs
@@ -10,12 +10,14 @@ use crate::{
     decks::Deck,
     notetypes::NoteType,
     sched::cutoff::{sched_timing_today, SchedTimingToday},
+    text::without_combining,
     types::{ObjID, Usn},
 };
 use regex::Regex;
 use rusqlite::{params, Connection, NO_PARAMS};
 use std::cmp::Ordering;
 use std::{
+    borrow::Cow,
     collections::HashMap,
     path::{Path, PathBuf},
 };
@@ -58,6 +60,7 @@ fn open_or_create_collection_db(path: &Path) -> Result<Connection> {
 
     add_field_index_function(&db)?;
     add_regexp_function(&db)?;
+    add_without_combining_function(&db)?;
 
     db.create_collation("unicase", unicase_compare)?;
 
@@ -75,6 +78,16 @@ fn add_field_index_function(db: &Connection) -> rusqlite::Result<()> {
     })
 }
 
+fn add_without_combining_function(db: &Connection) -> rusqlite::Result<()> { // registers the SQL function without_combining(text) on `db`
+    db.create_scalar_function("without_combining", 1, true, |ctx| { // NOTE(review): 1 looks like the arg count and true the deterministic flag — confirm against the rusqlite version in use
+        let text = ctx.get_raw(0).as_str()?; // `?` propagates an error if argument 0 cannot be read as a string
+        Ok(match without_combining(text) {
+            Cow::Borrowed(_) => None, // input came back untouched: return None rather than copying it
+            Cow::Owned(o) => Some(o), // a new string was built: return it
+        })
+    })
+}
+
 /// Adds sql function regexp(regex, string) -> is_match
 /// Taken from the rusqlite docs
 fn add_regexp_function(db: &Connection) -> rusqlite::Result<()> {
diff --git a/rslib/src/text.rs b/rslib/src/text.rs
index 0a799c5df..a88bd4a4f 100644
--- a/rslib/src/text.rs
+++ b/rslib/src/text.rs
@@ -6,7 +6,9 @@ use regex::{Captures, Regex};
 use std::borrow::Cow;
 use std::ptr;
 use unicase::eq as uni_eq;
-use unicode_normalization::{is_nfc, UnicodeNormalization};
+use unicode_normalization::{
+    char::is_combining_mark, is_nfc, is_nfkd_quick, IsNormalized, UnicodeNormalization,
+};
 
 #[derive(Debug, PartialEq)]
 pub enum AVTag {
@@ -231,12 +233,32 @@ pub(crate) fn matches_wildcard(text: &str, search: &str) -> bool {
     }
 }
 
+/// Strip combining marks from the NFKD form of `s`, returning `s` itself (borrowed) when the quick check finds nothing to do.
+pub(crate) fn without_combining(s: &str) -> Cow<str> {
+    // fast path: only taken when the quick check answers `Yes`, i.e. the string is already NFKD
+    if matches!(is_nfkd_quick(s.chars()), IsNormalized::Yes) {
+        // ...and it contains no combining marks, so hand back the input without allocating
+        if !s.chars().any(is_combining_mark) {
+            return s.into();
+        }
+    }
+
+    // slow path: decompose to NFKD and collect every non-combining char into a new String
+    s.chars()
+        .nfkd()
+        .filter(|c| !is_combining_mark(*c))
+        .collect::<String>()
+        .into()
+}
+
 #[cfg(test)]
 mod test {
     use super::matches_wildcard;
+    use crate::text::without_combining;
     use crate::text::{
         extract_av_tags, strip_av_tags, strip_html, strip_html_preserving_image_filenames, AVTag,
     };
+    use std::borrow::Cow;
 
     #[test]
     fn stripping() {
@@ -287,4 +309,10 @@ mod test {
         assert_eq!(matches_wildcard("foo", "F*oo"), true);
         assert_eq!(matches_wildcard("foo", "b*"), false);
     }
+
+    #[test]
+    fn combining() { // pins which inputs are borrowed and which force an owned copy
+        assert!(matches!(without_combining("test"), Cow::Borrowed(_))); // plain ASCII: returned as-is, no new String
+        assert!(matches!(without_combining("Über"), Cow::Owned(_))); // the diaeresis on "Ü" must be stripped, so a new String is built
+    }
 }