mirror of
https://github.com/ankitects/anki.git
synced 2025-09-23 08:22:24 -04:00
add search that ignores combining chars
On a test of a ~40k card collection, the 'ignore accents' add-on takes about 1150ms, and this code takes about 70ms.
This commit is contained in:
parent
f0ed34d79b
commit
51a379de23
4 changed files with 60 additions and 1 deletions
|
@ -79,6 +79,7 @@ pub(super) enum SearchNode<'a> {
|
||||||
},
|
},
|
||||||
WholeCollection,
|
WholeCollection,
|
||||||
Regex(Cow<'a, str>),
|
Regex(Cow<'a, str>),
|
||||||
|
NoCombining(Cow<'a, str>),
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, PartialEq)]
|
#[derive(Debug, PartialEq)]
|
||||||
|
@ -272,6 +273,7 @@ fn search_node_for_text_with_argument<'a>(
|
||||||
"dupes" => parse_dupes(val.as_ref())?,
|
"dupes" => parse_dupes(val.as_ref())?,
|
||||||
"prop" => parse_prop(val.as_ref())?,
|
"prop" => parse_prop(val.as_ref())?,
|
||||||
"re" => SearchNode::Regex(val),
|
"re" => SearchNode::Regex(val),
|
||||||
|
"nc" => SearchNode::NoCombining(val),
|
||||||
// anything else is a field search
|
// anything else is a field search
|
||||||
_ => parse_single_field(key.as_ref(), val.as_ref()),
|
_ => parse_single_field(key.as_ref(), val.as_ref()),
|
||||||
})
|
})
|
||||||
|
|
|
@ -8,6 +8,7 @@ use crate::decks::get_deck;
|
||||||
use crate::err::{AnkiError, Result};
|
use crate::err::{AnkiError, Result};
|
||||||
use crate::notes::field_checksum;
|
use crate::notes::field_checksum;
|
||||||
use crate::text::matches_wildcard;
|
use crate::text::matches_wildcard;
|
||||||
|
use crate::text::without_combining;
|
||||||
use crate::{
|
use crate::{
|
||||||
collection::RequestContext, text::strip_html_preserving_image_filenames, types::ObjID,
|
collection::RequestContext, text::strip_html_preserving_image_filenames, types::ObjID,
|
||||||
};
|
};
|
||||||
|
@ -81,6 +82,7 @@ impl SqlWriter<'_, '_> {
|
||||||
SearchNode::Property { operator, kind } => self.write_prop(operator, kind)?,
|
SearchNode::Property { operator, kind } => self.write_prop(operator, kind)?,
|
||||||
SearchNode::WholeCollection => write!(self.sql, "true").unwrap(),
|
SearchNode::WholeCollection => write!(self.sql, "true").unwrap(),
|
||||||
SearchNode::Regex(re) => self.write_regex(re.as_ref()),
|
SearchNode::Regex(re) => self.write_regex(re.as_ref()),
|
||||||
|
SearchNode::NoCombining(text) => self.write_no_combining(text.as_ref()),
|
||||||
};
|
};
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
@ -97,6 +99,20 @@ impl SqlWriter<'_, '_> {
|
||||||
.unwrap();
|
.unwrap();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn write_no_combining(&mut self, text: &str) {
|
||||||
|
let text = format!("%{}%", without_combining(text));
|
||||||
|
self.args.push(text);
|
||||||
|
write!(
|
||||||
|
self.sql,
|
||||||
|
concat!(
|
||||||
|
"(coalesce(without_combining(cast(n.sfld as text)), n.sfld) like ?{n} escape '\\' ",
|
||||||
|
"or coalesce(without_combining(n.flds), n.flds) like ?{n} escape '\\')"
|
||||||
|
),
|
||||||
|
n = self.args.len(),
|
||||||
|
)
|
||||||
|
.unwrap();
|
||||||
|
}
|
||||||
|
|
||||||
fn write_tag(&mut self, text: &str) {
|
fn write_tag(&mut self, text: &str) {
|
||||||
match text {
|
match text {
|
||||||
"none" => {
|
"none" => {
|
||||||
|
|
|
@ -10,12 +10,14 @@ use crate::{
|
||||||
decks::Deck,
|
decks::Deck,
|
||||||
notetypes::NoteType,
|
notetypes::NoteType,
|
||||||
sched::cutoff::{sched_timing_today, SchedTimingToday},
|
sched::cutoff::{sched_timing_today, SchedTimingToday},
|
||||||
|
text::without_combining,
|
||||||
types::{ObjID, Usn},
|
types::{ObjID, Usn},
|
||||||
};
|
};
|
||||||
use regex::Regex;
|
use regex::Regex;
|
||||||
use rusqlite::{params, Connection, NO_PARAMS};
|
use rusqlite::{params, Connection, NO_PARAMS};
|
||||||
use std::cmp::Ordering;
|
use std::cmp::Ordering;
|
||||||
use std::{
|
use std::{
|
||||||
|
borrow::Cow,
|
||||||
collections::HashMap,
|
collections::HashMap,
|
||||||
path::{Path, PathBuf},
|
path::{Path, PathBuf},
|
||||||
};
|
};
|
||||||
|
@ -58,6 +60,7 @@ fn open_or_create_collection_db(path: &Path) -> Result<Connection> {
|
||||||
|
|
||||||
add_field_index_function(&db)?;
|
add_field_index_function(&db)?;
|
||||||
add_regexp_function(&db)?;
|
add_regexp_function(&db)?;
|
||||||
|
add_without_combining_function(&db)?;
|
||||||
|
|
||||||
db.create_collation("unicase", unicase_compare)?;
|
db.create_collation("unicase", unicase_compare)?;
|
||||||
|
|
||||||
|
@ -75,6 +78,16 @@ fn add_field_index_function(db: &Connection) -> rusqlite::Result<()> {
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn add_without_combining_function(db: &Connection) -> rusqlite::Result<()> {
|
||||||
|
db.create_scalar_function("without_combining", 1, true, |ctx| {
|
||||||
|
let text = ctx.get_raw(0).as_str()?;
|
||||||
|
Ok(match without_combining(text) {
|
||||||
|
Cow::Borrowed(_) => None,
|
||||||
|
Cow::Owned(o) => Some(o),
|
||||||
|
})
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
/// Adds sql function regexp(regex, string) -> is_match
|
/// Adds sql function regexp(regex, string) -> is_match
|
||||||
/// Taken from the rusqlite docs
|
/// Taken from the rusqlite docs
|
||||||
fn add_regexp_function(db: &Connection) -> rusqlite::Result<()> {
|
fn add_regexp_function(db: &Connection) -> rusqlite::Result<()> {
|
||||||
|
|
|
@ -6,7 +6,9 @@ use regex::{Captures, Regex};
|
||||||
use std::borrow::Cow;
|
use std::borrow::Cow;
|
||||||
use std::ptr;
|
use std::ptr;
|
||||||
use unicase::eq as uni_eq;
|
use unicase::eq as uni_eq;
|
||||||
use unicode_normalization::{is_nfc, UnicodeNormalization};
|
use unicode_normalization::{
|
||||||
|
char::is_combining_mark, is_nfc, is_nfkd_quick, IsNormalized, UnicodeNormalization,
|
||||||
|
};
|
||||||
|
|
||||||
#[derive(Debug, PartialEq)]
|
#[derive(Debug, PartialEq)]
|
||||||
pub enum AVTag {
|
pub enum AVTag {
|
||||||
|
@ -231,12 +233,32 @@ pub(crate) fn matches_wildcard(text: &str, search: &str) -> bool {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Convert provided string to NFKD form and strip combining characters.
|
||||||
|
pub(crate) fn without_combining(s: &str) -> Cow<str> {
|
||||||
|
// if the string is already normalized
|
||||||
|
if matches!(is_nfkd_quick(s.chars()), IsNormalized::Yes) {
|
||||||
|
// and no combining characters found, return unchanged
|
||||||
|
if !s.chars().any(is_combining_mark) {
|
||||||
|
return s.into();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// we need to create a new string without the combining marks
|
||||||
|
s.chars()
|
||||||
|
.nfkd()
|
||||||
|
.filter(|c| !is_combining_mark(*c))
|
||||||
|
.collect::<String>()
|
||||||
|
.into()
|
||||||
|
}
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod test {
|
mod test {
|
||||||
use super::matches_wildcard;
|
use super::matches_wildcard;
|
||||||
|
use crate::text::without_combining;
|
||||||
use crate::text::{
|
use crate::text::{
|
||||||
extract_av_tags, strip_av_tags, strip_html, strip_html_preserving_image_filenames, AVTag,
|
extract_av_tags, strip_av_tags, strip_html, strip_html_preserving_image_filenames, AVTag,
|
||||||
};
|
};
|
||||||
|
use std::borrow::Cow;
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn stripping() {
|
fn stripping() {
|
||||||
|
@ -287,4 +309,10 @@ mod test {
|
||||||
assert_eq!(matches_wildcard("foo", "F*oo"), true);
|
assert_eq!(matches_wildcard("foo", "F*oo"), true);
|
||||||
assert_eq!(matches_wildcard("foo", "b*"), false);
|
assert_eq!(matches_wildcard("foo", "b*"), false);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
fn combining() {
    // Text with nothing to strip must come back borrowed (no allocation)
    // and unchanged.
    let plain = without_combining("test");
    assert!(matches!(plain, Cow::Borrowed(_)));
    assert_eq!(plain, "test");

    // Precomposed characters are decomposed and their combining marks
    // dropped, which requires a freshly allocated string.
    let accented = without_combining("Über");
    assert!(matches!(accented, Cow::Owned(_)));
    assert_eq!(accented, "Uber");
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in a new issue