add search that ignores combining chars

On a test of a ~40k card collection, the 'ignore accents' add-on takes about 1150ms, and this code takes about 70ms.
2025-12-25 21:03:17 -05:00 · 2020-03-21 15:15:59 +10:00 · 2020-03-21 15:15:59 +10:00 · 51a379de23
commit 51a379de23
parent f0ed34d79b
4 changed files with 60 additions and 1 deletions
--- a/rslib/src/search/parser.rs
+++ b/rslib/src/search/parser.rs
@ -79,6 +79,7 @@ pub(super) enum SearchNode<'a> {
    },
    WholeCollection,
    Regex(Cow<'a, str>),
    NoCombining(Cow<'a, str>),
 }
 #[derive(Debug, PartialEq)]
@ -272,6 +273,7 @@ fn search_node_for_text_with_argument<'a>(
        "dupes" => parse_dupes(val.as_ref())?,
        "prop" => parse_prop(val.as_ref())?,
        "re" => SearchNode::Regex(val),
        "nc" => SearchNode::NoCombining(val),
        // anything else is a field search
        _ => parse_single_field(key.as_ref(), val.as_ref()),
    })
--- a/rslib/src/search/sqlwriter.rs
+++ b/rslib/src/search/sqlwriter.rs
@ -8,6 +8,7 @@ use crate::decks::get_deck;
 use crate::err::{AnkiError, Result};
 use crate::notes::field_checksum;
 use crate::text::matches_wildcard;
 use crate::text::without_combining;
 use crate::{
    collection::RequestContext, text::strip_html_preserving_image_filenames, types::ObjID,
 };
@ -81,6 +82,7 @@ impl SqlWriter<'_, '_> {
            SearchNode::Property { operator, kind } => self.write_prop(operator, kind)?,
            SearchNode::WholeCollection => write!(self.sql, "true").unwrap(),
            SearchNode::Regex(re) => self.write_regex(re.as_ref()),
            SearchNode::NoCombining(text) => self.write_no_combining(text.as_ref()),
        };
        Ok(())
    }
@ -97,6 +99,20 @@ impl SqlWriter<'_, '_> {
        .unwrap();
    }
    /// Append a clause that matches `text` while ignoring combining marks.
    ///
    /// The search text is stripped with `without_combining`, wrapped in `%`
    /// so it matches as a substring, and bound as a single SQL argument that
    /// both comparisons refer to by position.
    fn write_no_combining(&mut self, text: &str) {
        let pattern = format!("%{}%", without_combining(text));
        self.args.push(pattern);
        // the argument just pushed is the last one, so its 1-based index
        // equals the current argument count
        let placeholder = self.args.len();
        // the SQL-side without_combining() yields NULL when it had nothing
        // to strip, so coalesce() falls back to the untouched column value
        write!(
            self.sql,
            "(coalesce(without_combining(cast(n.sfld as text)), n.sfld) like ?{n} escape '\\' \
             or coalesce(without_combining(n.flds), n.flds) like ?{n} escape '\\')",
            n = placeholder,
        )
        .unwrap();
    }
    fn write_tag(&mut self, text: &str) {
        match text {
            "none" => {
--- a/rslib/src/storage/sqlite.rs
+++ b/rslib/src/storage/sqlite.rs
@ -10,12 +10,14 @@ use crate::{
    decks::Deck,
    notetypes::NoteType,
    sched::cutoff::{sched_timing_today, SchedTimingToday},
    text::without_combining,
    types::{ObjID, Usn},
 };
 use regex::Regex;
 use rusqlite::{params, Connection, NO_PARAMS};
 use std::cmp::Ordering;
 use std::{
    borrow::Cow,
    collections::HashMap,
    path::{Path, PathBuf},
 };
@ -58,6 +60,7 @@ fn open_or_create_collection_db(path: &Path) -> Result<Connection> {
    add_field_index_function(&db)?;
    add_regexp_function(&db)?;
    add_without_combining_function(&db)?;
    db.create_collation("unicase", unicase_compare)?;
@ -75,6 +78,16 @@ fn add_field_index_function(db: &Connection) -> rusqlite::Result<()> {
    })
 }
 /// Adds sql function without_combining(string) -> string or NULL
 ///
 /// The result is NULL when the input contained nothing to strip, so the
 /// unchanged text does not need to be copied back; callers are expected to
 /// fall back to the original value themselves (e.g. with `coalesce`).
 fn add_without_combining_function(db: &Connection) -> rusqlite::Result<()> {
    db.create_scalar_function("without_combining", 1, true, |ctx| {
        let input = ctx.get_raw(0).as_str()?;
        // only hand back a value when a new string actually had to be built
        if let Cow::Owned(stripped) = without_combining(input) {
            Ok(Some(stripped))
        } else {
            Ok(None)
        }
    })
 }
 /// Adds sql function regexp(regex, string) -> is_match
 /// Taken from the rusqlite docs
 fn add_regexp_function(db: &Connection) -> rusqlite::Result<()> {
--- a/rslib/src/text.rs
+++ b/rslib/src/text.rs
@ -6,7 +6,9 @@ use regex::{Captures, Regex};
 use std::borrow::Cow;
 use std::ptr;
 use unicase::eq as uni_eq;
-use unicode_normalization::{is_nfc, UnicodeNormalization};
+use unicode_normalization::{
    char::is_combining_mark, is_nfc, is_nfkd_quick, IsNormalized, UnicodeNormalization,
 };
 #[derive(Debug, PartialEq)]
 pub enum AVTag {
@ -231,12 +233,32 @@ pub(crate) fn matches_wildcard(text: &str, search: &str) -> bool {
    }
 }
 /// Return `s` in NFKD form with all combining marks removed.
 ///
 /// When the quick check reports the text as already NFKD-normalized and it
 /// holds no combining marks, the input is returned borrowed and nothing is
 /// allocated. In every other case a new owned string is built.
 pub(crate) fn without_combining(s: &str) -> Cow<str> {
    // fast path: already normalized, and nothing to strip
    let normalized = matches!(is_nfkd_quick(s.chars()), IsNormalized::Yes);
    if normalized && !s.chars().any(is_combining_mark) {
        return Cow::Borrowed(s);
    }

    // slow path: decompose, then drop the combining marks
    let stripped: String = s
        .chars()
        .nfkd()
        .filter(|&c| !is_combining_mark(c))
        .collect();
    Cow::Owned(stripped)
 }
 #[cfg(test)]
 mod test {
    use super::matches_wildcard;
    use crate::text::without_combining;
    use crate::text::{
        extract_av_tags, strip_av_tags, strip_html, strip_html_preserving_image_filenames, AVTag,
    };
    use std::borrow::Cow;
    #[test]
    fn stripping() {
@ -287,4 +309,10 @@ mod test {
        assert_eq!(matches_wildcard("foo", "F*oo"), true);
        assert_eq!(matches_wildcard("foo", "b*"), false);
    }
    /// Checks both the allocation behaviour and the resulting text of
    /// `without_combining`.
    #[test]
    fn combining() {
        // plain text is returned as-is, without allocating
        assert!(matches!(without_combining("test"), Cow::Borrowed(_)));
        assert_eq!(without_combining("test"), "test");

        // accented text is rebuilt with the accent removed
        assert!(matches!(without_combining("Über"), Cow::Owned(_)));
        assert_eq!(without_combining("Über"), "Uber");

        // input that is already decomposed still has its marks stripped
        assert!(matches!(without_combining("U\u{308}ber"), Cow::Owned(_)));
        assert_eq!(without_combining("U\u{308}ber"), "Uber");
    }
 }