Add search keyword to strip clozes beforehand (#4145)

* add strip_clozes fn

* add test

* replace without_combining with process_text

* update write_unqualified

* update write_regex

* add `sc:...` search option

* add test

* strip clozes before stripping combining characters

find_notes_sc           time:   [1.0398 s 1.0405 s 1.0412 s]
                        change: [-6.1276% -6.0323% -5.9401%] (p = 0.00 < 0.05)
                        Performance has improved.

* add bitflags crate

* add and use ProcessTextFlags

* update sqlwriter.rs to use bitflags
This commit is contained in:
llama 2025-07-01 17:35:21 +08:00 committed by GitHub
parent 944e453419
commit b6c70f7b75
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
9 changed files with 100 additions and 21 deletions

1
Cargo.lock generated
View file

@ -94,6 +94,7 @@ dependencies = [
"axum", "axum",
"axum-client-ip", "axum-client-ip",
"axum-extra", "axum-extra",
"bitflags 2.9.1",
"blake3", "blake3",
"bytes", "bytes",
"chrono", "chrono",

View file

@ -60,6 +60,7 @@ async-trait = "0.1.88"
axum = { version = "0.8.4", features = ["multipart", "macros"] } axum = { version = "0.8.4", features = ["multipart", "macros"] }
axum-client-ip = "1.1.3" axum-client-ip = "1.1.3"
axum-extra = { version = "0.10.1", features = ["typed-header"] } axum-extra = { version = "0.10.1", features = ["typed-header"] }
bitflags = "2.9.1"
blake3 = "1.8.2" blake3 = "1.8.2"
bytes = "1.10.1" bytes = "1.10.1"
camino = "1.1.10" camino = "1.1.10"

View file

@ -48,6 +48,7 @@ async-trait.workspace = true
axum.workspace = true axum.workspace = true
axum-client-ip.workspace = true axum-client-ip.workspace = true
axum-extra.workspace = true axum-extra.workspace = true
bitflags.workspace = true
blake3.workspace = true blake3.workspace = true
bytes.workspace = true bytes.workspace = true
chrono.workspace = true chrono.workspace = true

View file

@ -25,6 +25,9 @@ use crate::latex::contains_latex;
use crate::template::RenderContext; use crate::template::RenderContext;
use crate::text::strip_html_preserving_entities; use crate::text::strip_html_preserving_entities;
// Matches a single cloze deletion such as `{{c1::answer}}` or
// `{{c1::answer::hint}}`. `(?s)` lets `.` match newlines, so a deletion may
// span several lines. Capture group 1 is the lazily matched answer text;
// the optional group 2 is the trailing `::hint` part.
static CLOZE: LazyLock<Regex> =
LazyLock::new(|| Regex::new(r"(?s)\{\{c\d+::(.*?)(::.*?)?\}\}").unwrap());
static MATHJAX: LazyLock<Regex> = LazyLock::new(|| { static MATHJAX: LazyLock<Regex> = LazyLock::new(|| {
Regex::new( Regex::new(
r"(?xsi) r"(?xsi)
@ -453,6 +456,10 @@ pub fn cloze_number_in_fields(fields: impl IntoIterator<Item: AsRef<str>>) -> Ha
set set
} }
pub(crate) fn strip_clozes(text: &str) -> Cow<'_, str> {
CLOZE.replace_all(text, "$1")
}
fn strip_html_inside_mathjax(text: &str) -> Cow<str> { fn strip_html_inside_mathjax(text: &str) -> Cow<str> {
MATHJAX.replace_all(text, |caps: &Captures| -> String { MATHJAX.replace_all(text, |caps: &Captures| -> String {
format!( format!(
@ -610,6 +617,16 @@ mod test {
); );
} }
#[test]
fn strip_clozes_regex() {
    // Covers a cloze with a hint, a cloze whose hint itself contains "::",
    // and a cloze without any hint.
    let input =
        r#"The {{c1::moon::🌛}} {{c2::orbits::this hint has "::" in it}} the {{c3::🌏}}."#;
    let expected = "The moon orbits the 🌏.";

    assert_eq!(strip_clozes(input), expected);
}
#[test] #[test]
fn mathjax_html() { fn mathjax_html() {
// escaped angle brackets should be preserved // escaped angle brackets should be preserved

View file

@ -94,6 +94,7 @@ pub enum SearchNode {
WholeCollection, WholeCollection,
Regex(String), Regex(String),
NoCombining(String), NoCombining(String),
StripClozes(String),
WordBoundary(String), WordBoundary(String),
CustomData(String), CustomData(String),
Preset(String), Preset(String),
@ -358,6 +359,7 @@ fn search_node_for_text_with_argument<'a>(
"cid" => SearchNode::CardIds(check_id_list(val, key)?.into()), "cid" => SearchNode::CardIds(check_id_list(val, key)?.into()),
"re" => SearchNode::Regex(unescape_quotes(val)), "re" => SearchNode::Regex(unescape_quotes(val)),
"nc" => SearchNode::NoCombining(unescape(val)?), "nc" => SearchNode::NoCombining(unescape(val)?),
"sc" => SearchNode::StripClozes(unescape(val)?),
"w" => SearchNode::WordBoundary(unescape(val)?), "w" => SearchNode::WordBoundary(unescape(val)?),
"dupe" => parse_dupe(val)?, "dupe" => parse_dupe(val)?,
"has-cd" => SearchNode::CustomData(unescape(val)?), "has-cd" => SearchNode::CustomData(unescape(val)?),

View file

@ -22,6 +22,7 @@ use crate::notes::field_checksum;
use crate::notetype::NotetypeId; use crate::notetype::NotetypeId;
use crate::prelude::*; use crate::prelude::*;
use crate::storage::ids_to_string; use crate::storage::ids_to_string;
use crate::storage::ProcessTextFlags;
use crate::text::glob_matcher; use crate::text::glob_matcher;
use crate::text::is_glob; use crate::text::is_glob;
use crate::text::normalize_to_nfc; use crate::text::normalize_to_nfc;
@ -134,6 +135,7 @@ impl SqlWriter<'_> {
self.write_unqualified( self.write_unqualified(
text, text,
self.col.get_config_bool(BoolKey::IgnoreAccentsInSearch), self.col.get_config_bool(BoolKey::IgnoreAccentsInSearch),
false,
)? )?
} }
SearchNode::SingleField { field, text, is_re } => { SearchNode::SingleField { field, text, is_re } => {
@ -143,7 +145,14 @@ impl SqlWriter<'_> {
self.write_dupe(*notetype_id, &self.norm_note(text))? self.write_dupe(*notetype_id, &self.norm_note(text))?
} }
SearchNode::Regex(re) => self.write_regex(&self.norm_note(re), false)?, SearchNode::Regex(re) => self.write_regex(&self.norm_note(re), false)?,
SearchNode::NoCombining(text) => self.write_unqualified(&self.norm_note(text), true)?, SearchNode::NoCombining(text) => {
self.write_unqualified(&self.norm_note(text), true, false)?
}
SearchNode::StripClozes(text) => self.write_unqualified(
&self.norm_note(text),
self.col.get_config_bool(BoolKey::IgnoreAccentsInSearch),
true,
)?,
SearchNode::WordBoundary(text) => self.write_word_boundary(&self.norm_note(text))?, SearchNode::WordBoundary(text) => self.write_word_boundary(&self.norm_note(text))?,
// other // other
@ -190,7 +199,12 @@ impl SqlWriter<'_> {
Ok(()) Ok(())
} }
fn write_unqualified(&mut self, text: &str, no_combining: bool) -> Result<()> { fn write_unqualified(
&mut self,
text: &str,
no_combining: bool,
strip_clozes: bool,
) -> Result<()> {
let text = to_sql(text); let text = to_sql(text);
let text = if no_combining { let text = if no_combining {
without_combining(&text) without_combining(&text)
@ -202,17 +216,37 @@ impl SqlWriter<'_> {
self.args.push(text); self.args.push(text);
let arg_idx = self.args.len(); let arg_idx = self.args.len();
let sfld_expr = if no_combining { let mut process_text_flags = ProcessTextFlags::empty();
"coalesce(without_combining(cast(n.sfld as text)), n.sfld)" if no_combining {
process_text_flags.insert(ProcessTextFlags::NoCombining);
}
if strip_clozes {
process_text_flags.insert(ProcessTextFlags::StripClozes);
}
let (sfld_expr, flds_expr) = if !process_text_flags.is_empty() {
let bits = process_text_flags.bits();
(
Cow::from(format!(
"coalesce(process_text(cast(n.sfld as text), {bits}), n.sfld)"
)),
Cow::from(format!("coalesce(process_text(n.flds, {bits}), n.flds)")),
)
} else { } else {
"n.sfld" (Cow::from("n.sfld"), Cow::from("n.flds"))
};
let flds_expr = if no_combining {
"coalesce(without_combining(n.flds), n.flds)"
} else {
"n.flds"
}; };
if strip_clozes {
    // Stripping clozes only makes sense for notes of a cloze notetype, so
    // restrict the search to those notetype ids up front.
    let cloze_notetypes_only_clause = self
        .col
        .get_all_notetypes()?
        .iter()
        .filter(|nt| nt.is_cloze())
        .map(|nt| format!("n.mid = {}", nt.id))
        .join(" or ");
    if cloze_notetypes_only_clause.is_empty() {
        // With no cloze notetypes the joined clause is empty, and writing
        // `() and ...` would produce invalid SQL. Nothing can match in that
        // case, so emit a constant-false condition instead.
        write!(self.sql, "(false) and ").unwrap();
    } else {
        write!(self.sql, "({cloze_notetypes_only_clause}) and ").unwrap();
    }
}
if let Some(field_indicies_by_notetype) = self.included_fields_by_notetype()? { if let Some(field_indicies_by_notetype) = self.included_fields_by_notetype()? {
let field_idx_str = format!("' || ?{arg_idx} || '"); let field_idx_str = format!("' || ?{arg_idx} || '");
let other_idx_str = "%".to_string(); let other_idx_str = "%".to_string();
@ -803,9 +837,12 @@ impl SqlWriter<'_> {
fn write_regex(&mut self, word: &str, no_combining: bool) -> Result<()> { fn write_regex(&mut self, word: &str, no_combining: bool) -> Result<()> {
let flds_expr = if no_combining { let flds_expr = if no_combining {
"coalesce(without_combining(n.flds), n.flds)" Cow::from(format!(
"coalesce(process_text(n.flds, {}), n.flds)",
ProcessTextFlags::NoCombining.bits()
))
} else { } else {
"n.flds" Cow::from("n.flds")
}; };
let word = if no_combining { let word = if no_combining {
without_combining(word) without_combining(word)
@ -995,6 +1032,7 @@ impl SearchNode {
SearchNode::Duplicates { .. } => RequiredTable::Notes, SearchNode::Duplicates { .. } => RequiredTable::Notes,
SearchNode::Regex(_) => RequiredTable::Notes, SearchNode::Regex(_) => RequiredTable::Notes,
SearchNode::NoCombining(_) => RequiredTable::Notes, SearchNode::NoCombining(_) => RequiredTable::Notes,
SearchNode::StripClozes(_) => RequiredTable::Notes,
SearchNode::WordBoundary(_) => RequiredTable::Notes, SearchNode::WordBoundary(_) => RequiredTable::Notes,
SearchNode::NotetypeId(_) => RequiredTable::Notes, SearchNode::NotetypeId(_) => RequiredTable::Notes,
SearchNode::Notetype(_) => RequiredTable::Notes, SearchNode::Notetype(_) => RequiredTable::Notes,
@ -1299,6 +1337,9 @@ c.odue != 0 then c.odue else c.due end) != {days}) or (c.queue in (1,4) and
"((c.did in (1) or c.odid in (1)))" "((c.did in (1) or c.odid in (1)))"
); );
assert_eq!(&s(ctx, "preset:typo").0, "(false)"); assert_eq!(&s(ctx, "preset:typo").0, "(false)");
// strip clozes
assert_eq!(&s(ctx, "sc:abcdef").0, "((n.mid = 1581236385343) and (coalesce(process_text(cast(n.sfld as text), 2), n.sfld) like ?1 escape '\\' or coalesce(process_text(n.flds, 2), n.flds) like ?1 escape '\\'))");
} }
#[test] #[test]

View file

@ -91,6 +91,7 @@ fn write_search_node(node: &SearchNode) -> String {
WholeCollection => "deck:*".to_string(), WholeCollection => "deck:*".to_string(),
Regex(s) => maybe_quote(&format!("re:{s}")), Regex(s) => maybe_quote(&format!("re:{s}")),
NoCombining(s) => maybe_quote(&format!("nc:{s}")), NoCombining(s) => maybe_quote(&format!("nc:{s}")),
StripClozes(s) => maybe_quote(&format!("sc:{s}")),
WordBoundary(s) => maybe_quote(&format!("w:{s}")), WordBoundary(s) => maybe_quote(&format!("w:{s}")),
CustomData(k) => maybe_quote(&format!("has-cd:{k}")), CustomData(k) => maybe_quote(&format!("has-cd:{k}")),
Preset(s) => maybe_quote(&format!("preset:{s}")), Preset(s) => maybe_quote(&format!("preset:{s}")),

View file

@ -19,6 +19,7 @@ mod upgrades;
use std::fmt::Write; use std::fmt::Write;
pub(crate) use sqlite::ProcessTextFlags;
pub(crate) use sqlite::SqliteStorage; pub(crate) use sqlite::SqliteStorage;
#[derive(Debug, Clone, Copy, PartialEq, Eq)] #[derive(Debug, Clone, Copy, PartialEq, Eq)]

View file

@ -9,6 +9,7 @@ use std::hash::Hasher;
use std::path::Path; use std::path::Path;
use std::sync::Arc; use std::sync::Arc;
use bitflags::bitflags;
use fnv::FnvHasher; use fnv::FnvHasher;
use fsrs::FSRS; use fsrs::FSRS;
use fsrs::FSRS5_DEFAULT_DECAY; use fsrs::FSRS5_DEFAULT_DECAY;
@ -24,6 +25,7 @@ use super::upgrades::SCHEMA_MAX_VERSION;
use super::upgrades::SCHEMA_MIN_VERSION; use super::upgrades::SCHEMA_MIN_VERSION;
use super::upgrades::SCHEMA_STARTING_VERSION; use super::upgrades::SCHEMA_STARTING_VERSION;
use super::SchemaVersion; use super::SchemaVersion;
use crate::cloze::strip_clozes;
use crate::config::schema11::schema11_config_as_string; use crate::config::schema11::schema11_config_as_string;
use crate::error::DbErrorKind; use crate::error::DbErrorKind;
use crate::prelude::*; use crate::prelude::*;
@ -31,6 +33,7 @@ use crate::scheduler::timing::local_minutes_west_for_stamp;
use crate::scheduler::timing::v1_creation_date; use crate::scheduler::timing::v1_creation_date;
use crate::storage::card::data::CardData; use crate::storage::card::data::CardData;
use crate::text::without_combining; use crate::text::without_combining;
use crate::text::CowMapping;
fn unicase_compare(s1: &str, s2: &str) -> Ordering { fn unicase_compare(s1: &str, s2: &str) -> Ordering {
UniCase::new(s1).cmp(&UniCase::new(s2)) UniCase::new(s1).cmp(&UniCase::new(s2))
@ -74,7 +77,7 @@ fn open_or_create_collection_db(path: &Path) -> Result<Connection> {
add_regexp_function(&db)?; add_regexp_function(&db)?;
add_regexp_fields_function(&db)?; add_regexp_fields_function(&db)?;
add_regexp_tags_function(&db)?; add_regexp_tags_function(&db)?;
add_without_combining_function(&db)?; add_process_text_function(&db)?;
add_fnvhash_function(&db)?; add_fnvhash_function(&db)?;
add_extract_original_position_function(&db)?; add_extract_original_position_function(&db)?;
add_extract_custom_data_function(&db)?; add_extract_custom_data_function(&db)?;
@ -111,17 +114,28 @@ fn add_field_index_function(db: &Connection) -> rusqlite::Result<()> {
) )
} }
fn add_without_combining_function(db: &Connection) -> rusqlite::Result<()> { bitflags! {
pub(crate) struct ProcessTextFlags: u8 {
const NoCombining = 1;
const StripClozes = 1 << 1;
}
}
fn add_process_text_function(db: &Connection) -> rusqlite::Result<()> {
db.create_scalar_function( db.create_scalar_function(
"without_combining", "process_text",
1, 2,
FunctionFlags::SQLITE_DETERMINISTIC, FunctionFlags::SQLITE_DETERMINISTIC,
|ctx| { |ctx| {
let text = ctx.get_raw(0).as_str()?; let mut text = Cow::from(ctx.get_raw(0).as_str()?);
Ok(match without_combining(text) { let opt = ProcessTextFlags::from_bits_truncate(ctx.get_raw(1).as_i64()? as u8);
Cow::Borrowed(_) => None, if opt.contains(ProcessTextFlags::StripClozes) {
Cow::Owned(o) => Some(o), text = text.map_cow(strip_clozes);
}) }
if opt.contains(ProcessTextFlags::NoCombining) {
text = text.map_cow(without_combining);
}
Ok(text.get_owned())
}, },
) )
} }