Add search keyword to strip clozes beforehand (#4145)

* add strip_clozes fn

* add test

* replace without_combining with process_text

* update write_unqualified

* update write_regex

* add `sc:...` search option

* add test

* strip clozes before stripping combining characters

find_notes_sc           time:   [1.0398 s 1.0405 s 1.0412 s]
                        change: [-6.1276% -6.0323% -5.9401%] (p = 0.00 < 0.05)
                        Performance has improved.

* add bitflags crate

* add and use ProcessTextFlags

* update sqlwriter.rs to use bitflags
This commit is contained in:
llama 2025-07-01 17:35:21 +08:00 committed by GitHub
parent 944e453419
commit b6c70f7b75
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
9 changed files with 100 additions and 21 deletions

1
Cargo.lock generated
View file

@ -94,6 +94,7 @@ dependencies = [
"axum",
"axum-client-ip",
"axum-extra",
"bitflags 2.9.1",
"blake3",
"bytes",
"chrono",

View file

@ -60,6 +60,7 @@ async-trait = "0.1.88"
axum = { version = "0.8.4", features = ["multipart", "macros"] }
axum-client-ip = "1.1.3"
axum-extra = { version = "0.10.1", features = ["typed-header"] }
bitflags = "2.9.1"
blake3 = "1.8.2"
bytes = "1.10.1"
camino = "1.1.10"

View file

@ -48,6 +48,7 @@ async-trait.workspace = true
axum.workspace = true
axum-client-ip.workspace = true
axum-extra.workspace = true
bitflags.workspace = true
blake3.workspace = true
bytes.workspace = true
chrono.workspace = true

View file

@ -25,6 +25,9 @@ use crate::latex::contains_latex;
use crate::template::RenderContext;
use crate::text::strip_html_preserving_entities;
/// Matches one cloze deletion of the form `{{cN::text}}` or
/// `{{cN::text::hint}}`, where `N` is one or more digits.
///
/// `(?s)` lets `.` match newlines, so a cloze may span several lines.
/// Capture group 1 is the cloze text; it is lazy, so it ends at the first
/// `::` or `}}` that allows the rest of the pattern to match. Optional
/// capture group 2 is the `::hint` part, which may itself contain `::`.
///
/// NOTE(review): since the match ends at the first `}}`, a cloze nested
/// inside another is presumably only partially matched -- confirm whether
/// callers need nested clozes handled.
static CLOZE: LazyLock<Regex> =
LazyLock::new(|| Regex::new(r"(?s)\{\{c\d+::(.*?)(::.*?)?\}\}").unwrap());
static MATHJAX: LazyLock<Regex> = LazyLock::new(|| {
Regex::new(
r"(?xsi)
@ -453,6 +456,10 @@ pub fn cloze_number_in_fields(fields: impl IntoIterator<Item: AsRef<str>>) -> Ha
set
}
/// Replaces every cloze deletion matched by `CLOZE` in `text` with just its
/// text, dropping the `{{cN::` opener, any `::hint` part and the closing `}}`.
///
/// The result is a `Cow`, so no new string needs to be built when the
/// replacement leaves the input as it was.
pub(crate) fn strip_clozes(text: &str) -> Cow<'_, str> {
    // Expands to capture group 1 of `CLOZE`, i.e. the cloze's own text.
    let cloze_text_only = "$1";
    CLOZE.replace_all(text, cloze_text_only)
}
fn strip_html_inside_mathjax(text: &str) -> Cow<str> {
MATHJAX.replace_all(text, |caps: &Captures| -> String {
format!(
@ -610,6 +617,16 @@ mod test {
);
}
#[test]
fn strip_clozes_regex() {
    // Covers three shapes: a cloze with a hint, a cloze whose hint itself
    // contains "::", and a cloze with no hint at all.
    let input =
        r#"The {{c1::moon::🌛}} {{c2::orbits::this hint has "::" in it}} the {{c3::🌏}}."#;
    let expected = "The moon orbits the 🌏.";
    assert_eq!(strip_clozes(input), expected);
}
#[test]
fn mathjax_html() {
// escaped angle brackets should be preserved

View file

@ -94,6 +94,7 @@ pub enum SearchNode {
WholeCollection,
Regex(String),
NoCombining(String),
StripClozes(String),
WordBoundary(String),
CustomData(String),
Preset(String),
@ -358,6 +359,7 @@ fn search_node_for_text_with_argument<'a>(
"cid" => SearchNode::CardIds(check_id_list(val, key)?.into()),
"re" => SearchNode::Regex(unescape_quotes(val)),
"nc" => SearchNode::NoCombining(unescape(val)?),
"sc" => SearchNode::StripClozes(unescape(val)?),
"w" => SearchNode::WordBoundary(unescape(val)?),
"dupe" => parse_dupe(val)?,
"has-cd" => SearchNode::CustomData(unescape(val)?),

View file

@ -22,6 +22,7 @@ use crate::notes::field_checksum;
use crate::notetype::NotetypeId;
use crate::prelude::*;
use crate::storage::ids_to_string;
use crate::storage::ProcessTextFlags;
use crate::text::glob_matcher;
use crate::text::is_glob;
use crate::text::normalize_to_nfc;
@ -134,6 +135,7 @@ impl SqlWriter<'_> {
self.write_unqualified(
text,
self.col.get_config_bool(BoolKey::IgnoreAccentsInSearch),
false,
)?
}
SearchNode::SingleField { field, text, is_re } => {
@ -143,7 +145,14 @@ impl SqlWriter<'_> {
self.write_dupe(*notetype_id, &self.norm_note(text))?
}
SearchNode::Regex(re) => self.write_regex(&self.norm_note(re), false)?,
SearchNode::NoCombining(text) => self.write_unqualified(&self.norm_note(text), true)?,
SearchNode::NoCombining(text) => {
self.write_unqualified(&self.norm_note(text), true, false)?
}
SearchNode::StripClozes(text) => self.write_unqualified(
&self.norm_note(text),
self.col.get_config_bool(BoolKey::IgnoreAccentsInSearch),
true,
)?,
SearchNode::WordBoundary(text) => self.write_word_boundary(&self.norm_note(text))?,
// other
@ -190,7 +199,12 @@ impl SqlWriter<'_> {
Ok(())
}
fn write_unqualified(&mut self, text: &str, no_combining: bool) -> Result<()> {
fn write_unqualified(
&mut self,
text: &str,
no_combining: bool,
strip_clozes: bool,
) -> Result<()> {
let text = to_sql(text);
let text = if no_combining {
without_combining(&text)
@ -202,17 +216,37 @@ impl SqlWriter<'_> {
self.args.push(text);
let arg_idx = self.args.len();
let sfld_expr = if no_combining {
"coalesce(without_combining(cast(n.sfld as text)), n.sfld)"
let mut process_text_flags = ProcessTextFlags::empty();
if no_combining {
process_text_flags.insert(ProcessTextFlags::NoCombining);
}
if strip_clozes {
process_text_flags.insert(ProcessTextFlags::StripClozes);
}
let (sfld_expr, flds_expr) = if !process_text_flags.is_empty() {
let bits = process_text_flags.bits();
(
Cow::from(format!(
"coalesce(process_text(cast(n.sfld as text), {bits}), n.sfld)"
)),
Cow::from(format!("coalesce(process_text(n.flds, {bits}), n.flds)")),
)
} else {
"n.sfld"
};
let flds_expr = if no_combining {
"coalesce(without_combining(n.flds), n.flds)"
} else {
"n.flds"
(Cow::from("n.sfld"), Cow::from("n.flds"))
};
if strip_clozes {
    // Restrict the search to notes of cloze notetypes: the clause below is
    // and-ed in front of the text comparison that follows.
    let cloze_notetypes_only_clause = self
        .col
        .get_all_notetypes()?
        .iter()
        .filter(|nt| nt.is_cloze())
        .map(|nt| format!("n.mid = {}", nt.id))
        .join(" or ");
    if cloze_notetypes_only_clause.is_empty() {
        // With no cloze notetype in the collection the joined clause is
        // empty, and "() and ..." would be a SQL syntax error. No note can
        // match in that case, so emit a constant-false guard instead.
        write!(self.sql, "false and ").unwrap();
    } else {
        write!(self.sql, "({cloze_notetypes_only_clause}) and ").unwrap();
    }
}
if let Some(field_indicies_by_notetype) = self.included_fields_by_notetype()? {
let field_idx_str = format!("' || ?{arg_idx} || '");
let other_idx_str = "%".to_string();
@ -803,9 +837,12 @@ impl SqlWriter<'_> {
fn write_regex(&mut self, word: &str, no_combining: bool) -> Result<()> {
let flds_expr = if no_combining {
"coalesce(without_combining(n.flds), n.flds)"
Cow::from(format!(
"coalesce(process_text(n.flds, {}), n.flds)",
ProcessTextFlags::NoCombining.bits()
))
} else {
"n.flds"
Cow::from("n.flds")
};
let word = if no_combining {
without_combining(word)
@ -995,6 +1032,7 @@ impl SearchNode {
SearchNode::Duplicates { .. } => RequiredTable::Notes,
SearchNode::Regex(_) => RequiredTable::Notes,
SearchNode::NoCombining(_) => RequiredTable::Notes,
SearchNode::StripClozes(_) => RequiredTable::Notes,
SearchNode::WordBoundary(_) => RequiredTable::Notes,
SearchNode::NotetypeId(_) => RequiredTable::Notes,
SearchNode::Notetype(_) => RequiredTable::Notes,
@ -1299,6 +1337,9 @@ c.odue != 0 then c.odue else c.due end) != {days}) or (c.queue in (1,4) and
"((c.did in (1) or c.odid in (1)))"
);
assert_eq!(&s(ctx, "preset:typo").0, "(false)");
// strip clozes
assert_eq!(&s(ctx, "sc:abcdef").0, "((n.mid = 1581236385343) and (coalesce(process_text(cast(n.sfld as text), 2), n.sfld) like ?1 escape '\\' or coalesce(process_text(n.flds, 2), n.flds) like ?1 escape '\\'))");
}
#[test]

View file

@ -91,6 +91,7 @@ fn write_search_node(node: &SearchNode) -> String {
WholeCollection => "deck:*".to_string(),
Regex(s) => maybe_quote(&format!("re:{s}")),
NoCombining(s) => maybe_quote(&format!("nc:{s}")),
StripClozes(s) => maybe_quote(&format!("sc:{s}")),
WordBoundary(s) => maybe_quote(&format!("w:{s}")),
CustomData(k) => maybe_quote(&format!("has-cd:{k}")),
Preset(s) => maybe_quote(&format!("preset:{s}")),

View file

@ -19,6 +19,7 @@ mod upgrades;
use std::fmt::Write;
pub(crate) use sqlite::ProcessTextFlags;
pub(crate) use sqlite::SqliteStorage;
#[derive(Debug, Clone, Copy, PartialEq, Eq)]

View file

@ -9,6 +9,7 @@ use std::hash::Hasher;
use std::path::Path;
use std::sync::Arc;
use bitflags::bitflags;
use fnv::FnvHasher;
use fsrs::FSRS;
use fsrs::FSRS5_DEFAULT_DECAY;
@ -24,6 +25,7 @@ use super::upgrades::SCHEMA_MAX_VERSION;
use super::upgrades::SCHEMA_MIN_VERSION;
use super::upgrades::SCHEMA_STARTING_VERSION;
use super::SchemaVersion;
use crate::cloze::strip_clozes;
use crate::config::schema11::schema11_config_as_string;
use crate::error::DbErrorKind;
use crate::prelude::*;
@ -31,6 +33,7 @@ use crate::scheduler::timing::local_minutes_west_for_stamp;
use crate::scheduler::timing::v1_creation_date;
use crate::storage::card::data::CardData;
use crate::text::without_combining;
use crate::text::CowMapping;
fn unicase_compare(s1: &str, s2: &str) -> Ordering {
UniCase::new(s1).cmp(&UniCase::new(s2))
@ -74,7 +77,7 @@ fn open_or_create_collection_db(path: &Path) -> Result<Connection> {
add_regexp_function(&db)?;
add_regexp_fields_function(&db)?;
add_regexp_tags_function(&db)?;
add_without_combining_function(&db)?;
add_process_text_function(&db)?;
add_fnvhash_function(&db)?;
add_extract_original_position_function(&db)?;
add_extract_custom_data_function(&db)?;
@ -111,17 +114,28 @@ fn add_field_index_function(db: &Connection) -> rusqlite::Result<()> {
)
}
fn add_without_combining_function(db: &Connection) -> rusqlite::Result<()> {
bitflags! {
    /// Selects which transformations the `process_text` SQL function applies
    /// to its text argument. The combined bits are passed to that function
    /// as its second argument.
    pub(crate) struct ProcessTextFlags: u8 {
        /// Apply `without_combining` to the text.
        const NoCombining = 1 << 0;
        /// Apply `strip_clozes` to the text.
        const StripClozes = 1 << 1;
    }
}
fn add_process_text_function(db: &Connection) -> rusqlite::Result<()> {
db.create_scalar_function(
"without_combining",
1,
"process_text",
2,
FunctionFlags::SQLITE_DETERMINISTIC,
|ctx| {
let text = ctx.get_raw(0).as_str()?;
Ok(match without_combining(text) {
Cow::Borrowed(_) => None,
Cow::Owned(o) => Some(o),
})
let mut text = Cow::from(ctx.get_raw(0).as_str()?);
let opt = ProcessTextFlags::from_bits_truncate(ctx.get_raw(1).as_i64()? as u8);
if opt.contains(ProcessTextFlags::StripClozes) {
text = text.map_cow(strip_clozes);
}
if opt.contains(ProcessTextFlags::NoCombining) {
text = text.map_cow(without_combining);
}
Ok(text.get_owned())
},
)
}