mirror of
https://github.com/ankitects/anki.git
synced 2025-09-18 14:02:21 -04:00
Add search keyword to strip clozes beforehand (#4145)
* add strip_clozes fn
* add test
* replace without_combining with process_text
* update write_unqualified
* update write_regex
* add `sc:...` search option
* add test
* strip clozes before stripping combining characters

  find_notes_sc time: [1.0398 s 1.0405 s 1.0412 s]
                change: [-6.1276% -6.0323% -5.9401%] (p = 0.00 < 0.05)
                Performance has improved.
* add bitflags crate
* add and use ProcessTextFlags
* update sqlwriter.rs to use bitflags
This commit is contained in:
parent
944e453419
commit
b6c70f7b75
9 changed files with 100 additions and 21 deletions
1
Cargo.lock
generated
1
Cargo.lock
generated
|
@ -94,6 +94,7 @@ dependencies = [
|
|||
"axum",
|
||||
"axum-client-ip",
|
||||
"axum-extra",
|
||||
"bitflags 2.9.1",
|
||||
"blake3",
|
||||
"bytes",
|
||||
"chrono",
|
||||
|
|
|
@ -60,6 +60,7 @@ async-trait = "0.1.88"
|
|||
axum = { version = "0.8.4", features = ["multipart", "macros"] }
|
||||
axum-client-ip = "1.1.3"
|
||||
axum-extra = { version = "0.10.1", features = ["typed-header"] }
|
||||
bitflags = "2.9.1"
|
||||
blake3 = "1.8.2"
|
||||
bytes = "1.10.1"
|
||||
camino = "1.1.10"
|
||||
|
|
|
@ -48,6 +48,7 @@ async-trait.workspace = true
|
|||
axum.workspace = true
|
||||
axum-client-ip.workspace = true
|
||||
axum-extra.workspace = true
|
||||
bitflags.workspace = true
|
||||
blake3.workspace = true
|
||||
bytes.workspace = true
|
||||
chrono.workspace = true
|
||||
|
|
|
@ -25,6 +25,9 @@ use crate::latex::contains_latex;
|
|||
use crate::template::RenderContext;
|
||||
use crate::text::strip_html_preserving_entities;
|
||||
|
||||
static CLOZE: LazyLock<Regex> =
|
||||
LazyLock::new(|| Regex::new(r"(?s)\{\{c\d+::(.*?)(::.*?)?\}\}").unwrap());
|
||||
|
||||
static MATHJAX: LazyLock<Regex> = LazyLock::new(|| {
|
||||
Regex::new(
|
||||
r"(?xsi)
|
||||
|
@ -453,6 +456,10 @@ pub fn cloze_number_in_fields(fields: impl IntoIterator<Item: AsRef<str>>) -> Ha
|
|||
set
|
||||
}
|
||||
|
||||
pub(crate) fn strip_clozes(text: &str) -> Cow<'_, str> {
|
||||
CLOZE.replace_all(text, "$1")
|
||||
}
|
||||
|
||||
fn strip_html_inside_mathjax(text: &str) -> Cow<str> {
|
||||
MATHJAX.replace_all(text, |caps: &Captures| -> String {
|
||||
format!(
|
||||
|
@ -610,6 +617,16 @@ mod test {
|
|||
);
|
||||
}
|
||||
|
||||
#[test]
fn strip_clozes_regex() {
    // Exercises three shapes in one sentence: a cloze with a hint, a cloze
    // whose hint itself contains "::", and a cloze without any hint.
    let input =
        r#"The {{c1::moon::🌛}} {{c2::orbits::this hint has "::" in it}} the {{c3::🌏}}."#;
    assert_eq!(strip_clozes(input), "The moon orbits the 🌏.");
}
|
||||
|
||||
#[test]
|
||||
fn mathjax_html() {
|
||||
// escaped angle brackets should be preserved
|
||||
|
|
|
@ -94,6 +94,7 @@ pub enum SearchNode {
|
|||
WholeCollection,
|
||||
Regex(String),
|
||||
NoCombining(String),
|
||||
StripClozes(String),
|
||||
WordBoundary(String),
|
||||
CustomData(String),
|
||||
Preset(String),
|
||||
|
@ -358,6 +359,7 @@ fn search_node_for_text_with_argument<'a>(
|
|||
"cid" => SearchNode::CardIds(check_id_list(val, key)?.into()),
|
||||
"re" => SearchNode::Regex(unescape_quotes(val)),
|
||||
"nc" => SearchNode::NoCombining(unescape(val)?),
|
||||
"sc" => SearchNode::StripClozes(unescape(val)?),
|
||||
"w" => SearchNode::WordBoundary(unescape(val)?),
|
||||
"dupe" => parse_dupe(val)?,
|
||||
"has-cd" => SearchNode::CustomData(unescape(val)?),
|
||||
|
|
|
@ -22,6 +22,7 @@ use crate::notes::field_checksum;
|
|||
use crate::notetype::NotetypeId;
|
||||
use crate::prelude::*;
|
||||
use crate::storage::ids_to_string;
|
||||
use crate::storage::ProcessTextFlags;
|
||||
use crate::text::glob_matcher;
|
||||
use crate::text::is_glob;
|
||||
use crate::text::normalize_to_nfc;
|
||||
|
@ -134,6 +135,7 @@ impl SqlWriter<'_> {
|
|||
self.write_unqualified(
|
||||
text,
|
||||
self.col.get_config_bool(BoolKey::IgnoreAccentsInSearch),
|
||||
false,
|
||||
)?
|
||||
}
|
||||
SearchNode::SingleField { field, text, is_re } => {
|
||||
|
@ -143,7 +145,14 @@ impl SqlWriter<'_> {
|
|||
self.write_dupe(*notetype_id, &self.norm_note(text))?
|
||||
}
|
||||
SearchNode::Regex(re) => self.write_regex(&self.norm_note(re), false)?,
|
||||
SearchNode::NoCombining(text) => self.write_unqualified(&self.norm_note(text), true)?,
|
||||
SearchNode::NoCombining(text) => {
|
||||
self.write_unqualified(&self.norm_note(text), true, false)?
|
||||
}
|
||||
SearchNode::StripClozes(text) => self.write_unqualified(
|
||||
&self.norm_note(text),
|
||||
self.col.get_config_bool(BoolKey::IgnoreAccentsInSearch),
|
||||
true,
|
||||
)?,
|
||||
SearchNode::WordBoundary(text) => self.write_word_boundary(&self.norm_note(text))?,
|
||||
|
||||
// other
|
||||
|
@ -190,7 +199,12 @@ impl SqlWriter<'_> {
|
|||
Ok(())
|
||||
}
|
||||
|
||||
fn write_unqualified(&mut self, text: &str, no_combining: bool) -> Result<()> {
|
||||
fn write_unqualified(
|
||||
&mut self,
|
||||
text: &str,
|
||||
no_combining: bool,
|
||||
strip_clozes: bool,
|
||||
) -> Result<()> {
|
||||
let text = to_sql(text);
|
||||
let text = if no_combining {
|
||||
without_combining(&text)
|
||||
|
@ -202,17 +216,37 @@ impl SqlWriter<'_> {
|
|||
self.args.push(text);
|
||||
let arg_idx = self.args.len();
|
||||
|
||||
let sfld_expr = if no_combining {
|
||||
"coalesce(without_combining(cast(n.sfld as text)), n.sfld)"
|
||||
let mut process_text_flags = ProcessTextFlags::empty();
|
||||
if no_combining {
|
||||
process_text_flags.insert(ProcessTextFlags::NoCombining);
|
||||
}
|
||||
if strip_clozes {
|
||||
process_text_flags.insert(ProcessTextFlags::StripClozes);
|
||||
}
|
||||
|
||||
let (sfld_expr, flds_expr) = if !process_text_flags.is_empty() {
|
||||
let bits = process_text_flags.bits();
|
||||
(
|
||||
Cow::from(format!(
|
||||
"coalesce(process_text(cast(n.sfld as text), {bits}), n.sfld)"
|
||||
)),
|
||||
Cow::from(format!("coalesce(process_text(n.flds, {bits}), n.flds)")),
|
||||
)
|
||||
} else {
|
||||
"n.sfld"
|
||||
};
|
||||
let flds_expr = if no_combining {
|
||||
"coalesce(without_combining(n.flds), n.flds)"
|
||||
} else {
|
||||
"n.flds"
|
||||
(Cow::from("n.sfld"), Cow::from("n.flds"))
|
||||
};
|
||||
|
||||
if strip_clozes {
|
||||
let cloze_notetypes_only_clause = self
|
||||
.col
|
||||
.get_all_notetypes()?
|
||||
.iter()
|
||||
.filter(|nt| nt.is_cloze())
|
||||
.map(|nt| format!("n.mid = {}", nt.id))
|
||||
.join(" or ");
|
||||
write!(self.sql, "({cloze_notetypes_only_clause}) and ").unwrap();
|
||||
}
|
||||
|
||||
if let Some(field_indicies_by_notetype) = self.included_fields_by_notetype()? {
|
||||
let field_idx_str = format!("' || ?{arg_idx} || '");
|
||||
let other_idx_str = "%".to_string();
|
||||
|
@ -803,9 +837,12 @@ impl SqlWriter<'_> {
|
|||
|
||||
fn write_regex(&mut self, word: &str, no_combining: bool) -> Result<()> {
|
||||
let flds_expr = if no_combining {
|
||||
"coalesce(without_combining(n.flds), n.flds)"
|
||||
Cow::from(format!(
|
||||
"coalesce(process_text(n.flds, {}), n.flds)",
|
||||
ProcessTextFlags::NoCombining.bits()
|
||||
))
|
||||
} else {
|
||||
"n.flds"
|
||||
Cow::from("n.flds")
|
||||
};
|
||||
let word = if no_combining {
|
||||
without_combining(word)
|
||||
|
@ -995,6 +1032,7 @@ impl SearchNode {
|
|||
SearchNode::Duplicates { .. } => RequiredTable::Notes,
|
||||
SearchNode::Regex(_) => RequiredTable::Notes,
|
||||
SearchNode::NoCombining(_) => RequiredTable::Notes,
|
||||
SearchNode::StripClozes(_) => RequiredTable::Notes,
|
||||
SearchNode::WordBoundary(_) => RequiredTable::Notes,
|
||||
SearchNode::NotetypeId(_) => RequiredTable::Notes,
|
||||
SearchNode::Notetype(_) => RequiredTable::Notes,
|
||||
|
@ -1299,6 +1337,9 @@ c.odue != 0 then c.odue else c.due end) != {days}) or (c.queue in (1,4) and
|
|||
"((c.did in (1) or c.odid in (1)))"
|
||||
);
|
||||
assert_eq!(&s(ctx, "preset:typo").0, "(false)");
|
||||
|
||||
// strip clozes
|
||||
assert_eq!(&s(ctx, "sc:abcdef").0, "((n.mid = 1581236385343) and (coalesce(process_text(cast(n.sfld as text), 2), n.sfld) like ?1 escape '\\' or coalesce(process_text(n.flds, 2), n.flds) like ?1 escape '\\'))");
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
|
|
@ -91,6 +91,7 @@ fn write_search_node(node: &SearchNode) -> String {
|
|||
WholeCollection => "deck:*".to_string(),
|
||||
Regex(s) => maybe_quote(&format!("re:{s}")),
|
||||
NoCombining(s) => maybe_quote(&format!("nc:{s}")),
|
||||
StripClozes(s) => maybe_quote(&format!("sc:{s}")),
|
||||
WordBoundary(s) => maybe_quote(&format!("w:{s}")),
|
||||
CustomData(k) => maybe_quote(&format!("has-cd:{k}")),
|
||||
Preset(s) => maybe_quote(&format!("preset:{s}")),
|
||||
|
|
|
@ -19,6 +19,7 @@ mod upgrades;
|
|||
|
||||
use std::fmt::Write;
|
||||
|
||||
pub(crate) use sqlite::ProcessTextFlags;
|
||||
pub(crate) use sqlite::SqliteStorage;
|
||||
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
|
||||
|
|
|
@ -9,6 +9,7 @@ use std::hash::Hasher;
|
|||
use std::path::Path;
|
||||
use std::sync::Arc;
|
||||
|
||||
use bitflags::bitflags;
|
||||
use fnv::FnvHasher;
|
||||
use fsrs::FSRS;
|
||||
use fsrs::FSRS5_DEFAULT_DECAY;
|
||||
|
@ -24,6 +25,7 @@ use super::upgrades::SCHEMA_MAX_VERSION;
|
|||
use super::upgrades::SCHEMA_MIN_VERSION;
|
||||
use super::upgrades::SCHEMA_STARTING_VERSION;
|
||||
use super::SchemaVersion;
|
||||
use crate::cloze::strip_clozes;
|
||||
use crate::config::schema11::schema11_config_as_string;
|
||||
use crate::error::DbErrorKind;
|
||||
use crate::prelude::*;
|
||||
|
@ -31,6 +33,7 @@ use crate::scheduler::timing::local_minutes_west_for_stamp;
|
|||
use crate::scheduler::timing::v1_creation_date;
|
||||
use crate::storage::card::data::CardData;
|
||||
use crate::text::without_combining;
|
||||
use crate::text::CowMapping;
|
||||
|
||||
fn unicase_compare(s1: &str, s2: &str) -> Ordering {
|
||||
UniCase::new(s1).cmp(&UniCase::new(s2))
|
||||
|
@ -74,7 +77,7 @@ fn open_or_create_collection_db(path: &Path) -> Result<Connection> {
|
|||
add_regexp_function(&db)?;
|
||||
add_regexp_fields_function(&db)?;
|
||||
add_regexp_tags_function(&db)?;
|
||||
add_without_combining_function(&db)?;
|
||||
add_process_text_function(&db)?;
|
||||
add_fnvhash_function(&db)?;
|
||||
add_extract_original_position_function(&db)?;
|
||||
add_extract_custom_data_function(&db)?;
|
||||
|
@ -111,17 +114,28 @@ fn add_field_index_function(db: &Connection) -> rusqlite::Result<()> {
|
|||
)
|
||||
}
|
||||
|
||||
fn add_without_combining_function(db: &Connection) -> rusqlite::Result<()> {
|
||||
bitflags! {
    /// Bit set selecting which transformations the `process_text` SQL
    /// function applies to its text argument. The raw bits are embedded in
    /// the generated SQL as the function's second argument.
    pub(crate) struct ProcessTextFlags: u8 {
        /// Run the text through `without_combining`.
        const NoCombining = 1 << 0;
        /// Run the text through `strip_clozes`.
        const StripClozes = 1 << 1;
    }
}
|
||||
|
||||
fn add_process_text_function(db: &Connection) -> rusqlite::Result<()> {
|
||||
db.create_scalar_function(
|
||||
"without_combining",
|
||||
1,
|
||||
"process_text",
|
||||
2,
|
||||
FunctionFlags::SQLITE_DETERMINISTIC,
|
||||
|ctx| {
|
||||
let text = ctx.get_raw(0).as_str()?;
|
||||
Ok(match without_combining(text) {
|
||||
Cow::Borrowed(_) => None,
|
||||
Cow::Owned(o) => Some(o),
|
||||
})
|
||||
let mut text = Cow::from(ctx.get_raw(0).as_str()?);
|
||||
let opt = ProcessTextFlags::from_bits_truncate(ctx.get_raw(1).as_i64()? as u8);
|
||||
if opt.contains(ProcessTextFlags::StripClozes) {
|
||||
text = text.map_cow(strip_clozes);
|
||||
}
|
||||
if opt.contains(ProcessTextFlags::NoCombining) {
|
||||
text = text.map_cow(without_combining);
|
||||
}
|
||||
Ok(text.get_owned())
|
||||
},
|
||||
)
|
||||
}
|
||||
|
|
Loading…
Reference in a new issue