support disabling unicode normalization in notes

Damien Elmes 2020-05-06 20:06:42 +10:00
parent a7a485d550
commit 7bab99d873
16 changed files with 200 additions and 77 deletions
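In short, this commit moves Unicode NFC normalization of note text out of the Python layer and into the Rust backend, and adds a collection config flag (normalize_note_text, on by default) that lets it be turned off. As a rough standalone illustration of what the normalization step does — using the unicode_normalization crate directly rather than the rslib helpers added in this diff (ensure_string_in_nfc / normalize_to_nfc), which are assumed to behave equivalently:

// Standalone sketch, not part of the commit: rewrite a field in place only
// when it is not already in NFC form, roughly what ensure_string_in_nfc() does.
use unicode_normalization::{is_nfc, UnicodeNormalization};

fn ensure_string_in_nfc(s: &mut String) {
    if !is_nfc(s) {
        let fixed: String = s.nfc().collect();
        *s = fixed;
    }
}

fn main() {
    // U+FA47 is a CJK compatibility ideograph; NFC maps it to U+6F22 (漢),
    // the same pair exercised by the new `normalization` test further down.
    let mut field = String::from("\u{fa47}");
    ensure_string_in_nfc(&mut field);
    assert_eq!(field, "\u{6f22}");
}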

View file

@ -454,7 +454,9 @@ select id from notes where id in %s and id not in (select nid from cards)"""
# Card generation & field checksums/sort fields
##########################################################################
def after_note_updates(self, nids: List[int], mark_modified: bool, generate_cards: bool = True) -> None:
def after_note_updates(
self, nids: List[int], mark_modified: bool, generate_cards: bool = True
) -> None:
self.backend.after_note_updates(
nids=nids, generate_cards=generate_cards, mark_notes_modified=mark_modified
)
@ -818,7 +820,9 @@ select id from cards where odid > 0 and did in %s"""
self.tags.registerNotes()
# field cache
for m in self.models.all():
self.after_note_updates(self.models.nids(m), mark_modified=False, generate_cards=False)
self.after_note_updates(
self.models.nids(m), mark_modified=False, generate_cards=False
)
# new cards can't have a due position > 32 bits, so wrap items over
# 2 million back to 1 million
self.db.execute(

View file

@ -2,7 +2,6 @@
# License: GNU AGPL, version 3 or later; http://www.gnu.org/licenses/agpl.html
import html
import unicodedata
from typing import Any, Dict, List, Optional, Tuple, Union
from anki.collection import _Collection
@ -147,8 +146,6 @@ class NoteImporter(Importer):
n.fields[c] = n.fields[c].strip()
if not self.allowHTML:
n.fields[c] = n.fields[c].replace("\n", "<br>")
n.fields[c] = unicodedata.normalize("NFC", n.fields[c])
n.tags = [unicodedata.normalize("NFC", t) for t in n.tags]
fld0 = n.fields[fld0idx]
csum = fieldChecksum(fld0)
# first field must exist

View file

@ -789,7 +789,7 @@ class RustBackend:
mark_notes_modified=mark_notes_modified,
)
),
release_gil=True
release_gil=True,
)

View file

@ -5,9 +5,7 @@
from __future__ import annotations
import html
import sre_constants
import time
import unicodedata
from dataclasses import dataclass
from enum import Enum
from operator import itemgetter
@ -775,7 +773,6 @@ class Browser(QMainWindow):
# grab search text and normalize
txt = self.form.searchEdit.lineEdit().text()
txt = unicodedata.normalize("NFC", txt)
# update history
sh = self.mw.pm.profile["searchHistory"]

View file

@ -377,7 +377,6 @@ class Editor:
if nid != self.note.id:
print("ignored late blur")
return
txt = unicodedata.normalize("NFC", txt)
txt = self.mungeHTML(txt)
# misbehaving apps may include a null byte in the text
txt = txt.replace("\x00", "")

View file

@ -13,7 +13,6 @@ use crate::{
deckconf::{DeckConf, DeckConfID},
decks::{Deck, DeckID, DeckSchema11},
err::{AnkiError, NetworkErrorKind, Result, SyncErrorKind},
findreplace::FindReplaceContext,
i18n::{tr_args, I18n, TR},
latex::{extract_latex, extract_latex_expanding_clozes, ExtractedLatex},
log,
@ -1093,9 +1092,7 @@ impl Backend {
Some(input.field_name)
};
let repl = input.replacement;
self.with_col(|col| {
col.find_and_replace(FindReplaceContext::new(nids, &search, &repl, field_name)?)
})
self.with_col(|col| col.find_and_replace(nids, &search, &repl, field_name))
}
fn after_note_updates(&self, input: pb::AfterNoteUpdatesIn) -> Result<pb::Empty> {

View file

@ -43,6 +43,7 @@ pub(crate) enum ConfigKey {
NextNewCardPosition,
SchedulerVersion,
LearnAheadSecs,
NormalizeNoteText,
}
#[derive(PartialEq, Serialize_repr, Deserialize_repr, Clone, Copy)]
#[repr(u8)]
@ -64,6 +65,7 @@ impl From<ConfigKey> for &'static str {
ConfigKey::NextNewCardPosition => "nextPos",
ConfigKey::SchedulerVersion => "schedVer",
ConfigKey::LearnAheadSecs => "collapseTime",
ConfigKey::NormalizeNoteText => "normalize_note_text",
}
}
}
@ -163,6 +165,12 @@ impl Collection {
self.get_config_optional(ConfigKey::LearnAheadSecs)
.unwrap_or(1200)
}
/// This is a stop-gap solution until we can decouple searching from canonical storage.
pub(crate) fn normalize_note_text(&self) -> bool {
self.get_config_optional(ConfigKey::NormalizeNoteText)
.unwrap_or(true)
}
}
#[derive(Deserialize, PartialEq, Debug, Clone, Copy)]
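The new key defaults to on: normalize_note_text() returns true until something writes the config. A minimal test-style sketch of flipping it, mirroring the `normalization` test added in notes.rs below (it assumes the same crate-internal imports those test modules use, e.g. open_test_collection and ConfigKey):

#[test]
fn normalize_flag_defaults_on() {
    let mut col = open_test_collection();
    // the key has never been written, so the getter falls back to true
    assert!(col.normalize_note_text());
    // the notes.rs test disables it the same way before adding a note
    col.set_config(ConfigKey::NormalizeNoteText, &false).unwrap();
    assert!(!col.normalize_note_text());
}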

View file

@ -6,6 +6,7 @@ use crate::{
err::{AnkiError, Result},
notes::NoteID,
notetype::CardGenContext,
text::normalize_to_nfc,
types::Usn,
};
use itertools::Itertools;
@ -40,11 +41,31 @@ impl FindReplaceContext {
}
impl Collection {
pub fn find_and_replace(&mut self, ctx: FindReplaceContext) -> Result<u32> {
self.transact(None, |col| col.find_and_replace_inner(ctx, col.usn()?))
pub fn find_and_replace(
&mut self,
nids: Vec<NoteID>,
search_re: &str,
repl: &str,
field_name: Option<String>,
) -> Result<u32> {
self.transact(None, |col| {
let norm = col.normalize_note_text();
let search = if norm {
normalize_to_nfc(search_re)
} else {
search_re.into()
};
let ctx = FindReplaceContext::new(nids, &search, repl, field_name)?;
col.find_and_replace_inner(ctx, col.usn()?, norm)
})
}
fn find_and_replace_inner(&mut self, ctx: FindReplaceContext, usn: Usn) -> Result<u32> {
fn find_and_replace_inner(
&mut self,
ctx: FindReplaceContext,
usn: Usn,
normalize_text: bool,
) -> Result<u32> {
let mut total_changed = 0;
let nids_by_notetype = self.storage.note_ids_by_notetype(&ctx.nids)?;
for (ntid, group) in &nids_by_notetype.into_iter().group_by(|tup| tup.0) {
@ -77,7 +98,12 @@ impl Collection {
}
}
if changed {
self.update_note_inner_generating_cards(&genctx, &mut note, true)?;
self.update_note_inner_generating_cards(
&genctx,
&mut note,
true,
normalize_text,
)?;
total_changed += 1;
}
}
@ -108,12 +134,7 @@ mod test {
col.add_note(&mut note2, DeckID(1))?;
let nids = col.search_notes_only("")?;
let cnt = col.find_and_replace(FindReplaceContext::new(
nids.clone(),
"(?i)AAA",
"BBB",
None,
)?)?;
let cnt = col.find_and_replace(nids.clone(), "(?i)AAA", "BBB", None)?;
assert_eq!(cnt, 2);
let note = col.storage.get_note(note.id)?.unwrap();
@ -127,12 +148,7 @@ mod test {
col.storage.field_names_for_notes(&nids)?,
vec!["Back".to_string(), "Front".into(), "Text".into()]
);
let cnt = col.find_and_replace(FindReplaceContext::new(
nids.clone(),
"BBB",
"ccc",
Some("Front".into()),
)?)?;
let cnt = col.find_and_replace(nids.clone(), "BBB", "ccc", Some("Front".into()))?;
// still 2, as the caller is expected to provide only note ids that have
// that field, and if we can't find the field we fall back on all fields
assert_eq!(cnt, 2);

View file

@ -404,7 +404,7 @@ where
&self.mgr.media_folder,
)? {
// note was modified, needs saving
note.prepare_for_update(nt)?;
note.prepare_for_update(nt, false)?;
note.set_modified(usn);
self.ctx.storage.update_note(&note)?;
collection_modified = true;

View file

@ -8,7 +8,7 @@ use crate::{
define_newtype,
err::{AnkiError, Result},
notetype::{CardGenContext, NoteField, NoteType, NoteTypeID},
text::strip_html_preserving_image_filenames,
text::{ensure_string_in_nfc, strip_html_preserving_image_filenames},
timestamp::TimestampSecs,
types::Usn,
};
@ -65,7 +65,7 @@ impl Note {
}
/// Prepare note for saving to the database. Does not mark it as modified.
pub fn prepare_for_update(&mut self, nt: &NoteType) -> Result<()> {
pub fn prepare_for_update(&mut self, nt: &NoteType, normalize_text: bool) -> Result<()> {
assert!(nt.id == self.ntid);
if nt.fields.len() != self.fields.len() {
return Err(AnkiError::invalid_input(format!(
@ -75,6 +75,12 @@ impl Note {
)));
}
if normalize_text {
for field in &mut self.fields {
ensure_string_in_nfc(field);
}
}
let field1_nohtml = strip_html_preserving_image_filenames(&self.fields()[0]);
let checksum = field_checksum(field1_nohtml.as_ref());
let sort_field = if nt.config.sort_field_idx == 0 {
@ -184,7 +190,8 @@ impl Collection {
.get_notetype(note.ntid)?
.ok_or_else(|| AnkiError::invalid_input("missing note type"))?;
let ctx = CardGenContext::new(&nt, col.usn()?);
col.add_note_inner(&ctx, note, did)
let norm = col.normalize_note_text();
col.add_note_inner(&ctx, note, did, norm)
})
}
@ -193,9 +200,10 @@ impl Collection {
ctx: &CardGenContext,
note: &mut Note,
did: DeckID,
normalize_text: bool,
) -> Result<()> {
self.canonify_note_tags(note, ctx.usn)?;
note.prepare_for_update(&ctx.notetype)?;
note.prepare_for_update(&ctx.notetype, normalize_text)?;
note.set_modified(ctx.usn);
self.storage.add_note(note)?;
self.generate_cards_for_new_note(ctx, note, did)
@ -207,7 +215,8 @@ impl Collection {
.get_notetype(note.ntid)?
.ok_or_else(|| AnkiError::invalid_input("missing note type"))?;
let ctx = CardGenContext::new(&nt, col.usn()?);
col.update_note_inner_generating_cards(&ctx, note, true)
let norm = col.normalize_note_text();
col.update_note_inner_generating_cards(&ctx, note, true, norm)
})
}
@ -216,8 +225,15 @@ impl Collection {
ctx: &CardGenContext,
note: &mut Note,
mark_note_modified: bool,
normalize_text: bool,
) -> Result<()> {
self.update_note_inner_without_cards(note, ctx.notetype, ctx.usn, mark_note_modified)?;
self.update_note_inner_without_cards(
note,
ctx.notetype,
ctx.usn,
mark_note_modified,
normalize_text,
)?;
self.generate_cards_for_existing_note(ctx, note)
}
@ -227,9 +243,10 @@ impl Collection {
nt: &NoteType,
usn: Usn,
mark_note_modified: bool,
normalize_text: bool,
) -> Result<()> {
self.canonify_note_tags(note, usn)?;
note.prepare_for_update(nt)?;
note.prepare_for_update(nt, normalize_text)?;
if mark_note_modified {
note.set_modified(usn);
}
@ -256,6 +273,7 @@ impl Collection {
mark_notes_modified: bool,
) -> Result<()> {
let nids_by_notetype = self.storage.note_ids_by_notetype(nids)?;
let norm = self.normalize_note_text();
for (ntid, group) in &nids_by_notetype.into_iter().group_by(|tup| tup.0) {
let nt = self
.get_notetype(ntid)?
@ -268,6 +286,7 @@ impl Collection {
&genctx,
&mut note,
mark_notes_modified,
norm,
)?;
} else {
self.update_note_inner_without_cards(
@ -275,6 +294,7 @@ impl Collection {
&genctx.notetype,
usn,
mark_notes_modified,
norm,
)?;
}
}
@ -286,7 +306,7 @@ impl Collection {
#[cfg(test)]
mod test {
use super::{anki_base91, field_checksum};
use crate::{collection::open_test_collection, decks::DeckID, err::Result};
use crate::{collection::open_test_collection, config::ConfigKey, decks::DeckID, err::Result};
#[test]
fn test_base91() {
@ -350,4 +370,50 @@ mod test {
Ok(())
}
#[test]
fn normalization() -> Result<()> {
let mut col = open_test_collection();
let nt = col.get_notetype_by_name("Basic")?.unwrap();
let mut note = nt.new_note();
note.fields[0] = "\u{fa47}".into();
col.add_note(&mut note, DeckID(1))?;
assert_eq!(note.fields[0], "\u{6f22}");
// non-normalized searches should be converted
assert_eq!(
col.search_cards("\u{fa47}", crate::search::SortMode::NoOrder)?
.len(),
1
);
assert_eq!(
col.search_cards("front:\u{fa47}", crate::search::SortMode::NoOrder)?
.len(),
1
);
col.remove_note_only(note.id, col.usn()?)?;
// if normalization turned off, note text is entered as-is
let mut note = nt.new_note();
note.fields[0] = "\u{fa47}".into();
col.set_config(ConfigKey::NormalizeNoteText, &false)
.unwrap();
col.add_note(&mut note, DeckID(1))?;
assert_eq!(note.fields[0], "\u{fa47}");
// normalized searches won't match
assert_eq!(
col.search_cards("\u{6f22}", crate::search::SortMode::NoOrder)?
.len(),
0
);
// but original characters will
assert_eq!(
col.search_cards("\u{fa47}", crate::search::SortMode::NoOrder)?
.len(),
1
);
Ok(())
}
}

View file

@ -337,6 +337,7 @@ impl Collection {
/// or fields have been added/removed/reordered.
pub fn update_notetype(&mut self, nt: &mut NoteType, preserve_usn: bool) -> Result<()> {
let existing = self.get_notetype(nt.id)?;
let norm = self.normalize_note_text();
nt.prepare_for_update(existing.as_ref().map(AsRef::as_ref))?;
self.transact(None, |col| {
if let Some(existing_notetype) = existing {
@ -347,6 +348,7 @@ impl Collection {
nt,
existing_notetype.fields.len(),
existing_notetype.config.sort_field_idx,
norm,
)?;
col.update_cards_for_changed_templates(nt, existing_notetype.templates.len())?;
}

View file

@ -55,6 +55,7 @@ impl Collection {
nt: &NoteType,
previous_field_count: usize,
previous_sort_idx: u32,
normalize_text: bool,
) -> Result<()> {
let ords: Vec<_> = nt.fields.iter().map(|f| f.ord).collect();
if !ords_changed(&ords, previous_field_count) {
@ -63,7 +64,7 @@ impl Collection {
let nids = self.search_notes_only(&format!("mid:{}", nt.id))?;
for nid in nids {
let mut note = self.storage.get_note(nid)?.unwrap();
note.prepare_for_update(nt)?;
note.prepare_for_update(nt, normalize_text)?;
self.storage.update_note(&note)?;
}
} else {
@ -92,7 +93,7 @@ impl Collection {
})
.map(Into::into)
.collect();
note.prepare_for_update(nt)?;
note.prepare_for_update(nt, normalize_text)?;
note.set_modified(usn);
self.storage.update_note(&note)?;
}

View file

@ -19,7 +19,7 @@ pub enum SortMode {
impl Collection {
pub fn search_cards(&mut self, search: &str, order: SortMode) -> Result<Vec<CardID>> {
let top_node = Node::Group(parse(search)?);
let (sql, args) = node_to_sql(self, &top_node)?;
let (sql, args) = node_to_sql(self, &top_node, self.normalize_note_text())?;
let mut sql = format!(
"select c.id from cards c, notes n where c.nid=n.id and {}",

View file

@ -30,7 +30,7 @@ impl Collection {
F: FnOnce(String) -> String,
{
let top_node = Node::Group(parse(search)?);
let (sql, args) = node_to_sql(self, &top_node)?;
let (sql, args) = node_to_sql(self, &top_node, self.normalize_note_text())?;
let sql = build_sql(sql);

View file

@ -1,15 +1,19 @@
// Copyright: Ankitects Pty Ltd and contributors
// License: GNU AGPL, version 3 or later; http://www.gnu.org/licenses/agpl.html
use crate::err::{AnkiError, Result};
use crate::notetype::NoteTypeID;
use nom::branch::alt;
use nom::bytes::complete::{escaped, is_not, tag, take_while1};
use nom::character::complete::{anychar, char, one_of};
use nom::character::is_digit;
use nom::combinator::{all_consuming, map, map_res};
use nom::sequence::{delimited, preceded, tuple};
use nom::{multi::many0, IResult};
use crate::{
err::{AnkiError, Result},
notetype::NoteTypeID,
};
use nom::{
branch::alt,
bytes::complete::{escaped, is_not, tag, take_while1},
character::complete::{anychar, char, one_of},
character::is_digit,
combinator::{all_consuming, map, map_res},
sequence::{delimited, preceded, tuple},
{multi::many0, IResult},
};
use std::{borrow::Cow, num};
// fixme: need to preserve \ when used twice in string
@ -109,7 +113,6 @@ pub(super) enum TemplateKind {
}
/// Parse the input string into a list of nodes.
#[allow(dead_code)]
pub(super) fn parse(input: &str) -> Result<Vec<Node>> {
let input = input.trim();
if input.is_empty() {
@ -118,6 +121,7 @@ pub(super) fn parse(input: &str) -> Result<Vec<Node>> {
let (_, nodes) = all_consuming(group_inner)(input)
.map_err(|_e| AnkiError::invalid_input("unable to parse search"))?;
Ok(nodes)
}

View file

@ -2,37 +2,47 @@
// License: GNU AGPL, version 3 or later; http://www.gnu.org/licenses/agpl.html
use super::parser::{Node, PropertyKind, SearchNode, StateKind, TemplateKind};
use crate::card::CardQueue;
use crate::err::Result;
use crate::notes::field_checksum;
use crate::notetype::NoteTypeID;
use crate::text::matches_wildcard;
use crate::text::without_combining;
use crate::{
collection::Collection, decks::human_deck_name_to_native,
text::strip_html_preserving_image_filenames,
card::CardQueue,
collection::Collection,
decks::human_deck_name_to_native,
err::Result,
notes::field_checksum,
notetype::NoteTypeID,
text::matches_wildcard,
text::{normalize_to_nfc, strip_html_preserving_image_filenames, without_combining},
};
use lazy_static::lazy_static;
use regex::{Captures, Regex};
use std::fmt::Write;
use std::{borrow::Cow, fmt::Write};
struct SqlWriter<'a> {
col: &'a mut Collection,
sql: String,
args: Vec<String>,
normalize_note_text: bool,
}
pub(super) fn node_to_sql(req: &mut Collection, node: &Node) -> Result<(String, Vec<String>)> {
let mut sctx = SqlWriter::new(req);
pub(super) fn node_to_sql(
req: &mut Collection,
node: &Node,
normalize_note_text: bool,
) -> Result<(String, Vec<String>)> {
let mut sctx = SqlWriter::new(req, normalize_note_text);
sctx.write_node_to_sql(&node)?;
Ok((sctx.sql, sctx.args))
}
impl SqlWriter<'_> {
fn new(col: &mut Collection) -> SqlWriter<'_> {
fn new(col: &mut Collection, normalize_note_text: bool) -> SqlWriter<'_> {
let sql = String::new();
let args = vec![];
SqlWriter { col, sql, args }
SqlWriter {
col,
sql,
args,
normalize_note_text,
}
}
fn write_node_to_sql(&mut self, node: &Node) -> Result<()> {
@ -55,22 +65,47 @@ impl SqlWriter<'_> {
Ok(())
}
/// Convert search text to NFC if note normalization is enabled.
fn norm_note<'a>(&self, text: &'a str) -> Cow<'a, str> {
if self.normalize_note_text {
normalize_to_nfc(text)
} else {
text.into()
}
}
fn write_search_node_to_sql(&mut self, node: &SearchNode) -> Result<()> {
use normalize_to_nfc as norm;
match node {
SearchNode::UnqualifiedText(text) => self.write_unqualified(text),
// note fields related
SearchNode::UnqualifiedText(text) => self.write_unqualified(&self.norm_note(text)),
SearchNode::SingleField { field, text, is_re } => {
self.write_single_field(field.as_ref(), text.as_ref(), *is_re)?
self.write_single_field(field.as_ref(), &self.norm_note(text), *is_re)?
}
SearchNode::Duplicates { note_type_id, text } => {
self.write_dupes(*note_type_id, &self.norm_note(text))
}
SearchNode::Regex(re) => self.write_regex(&self.norm_note(re)),
SearchNode::NoCombining(text) => self.write_no_combining(&self.norm_note(text)),
SearchNode::WordBoundary(text) => self.write_word_boundary(&self.norm_note(text)),
// other
SearchNode::AddedInDays(days) => self.write_added(*days)?,
SearchNode::CardTemplate(template) => self.write_template(template)?,
SearchNode::Deck(deck) => self.write_deck(deck.as_ref())?,
SearchNode::CardTemplate(template) => match template {
TemplateKind::Ordinal(_) => {
self.write_template(template)?;
}
TemplateKind::Name(name) => {
self.write_template(&TemplateKind::Name(norm(name).into()))?;
}
},
SearchNode::Deck(deck) => self.write_deck(&norm(deck))?,
SearchNode::NoteTypeID(ntid) => {
write!(self.sql, "n.mid = {}", ntid).unwrap();
}
SearchNode::NoteType(notetype) => self.write_note_type(notetype.as_ref())?,
SearchNode::NoteType(notetype) => self.write_note_type(&norm(notetype))?,
SearchNode::Rated { days, ease } => self.write_rated(*days, *ease)?,
SearchNode::Tag(tag) => self.write_tag(tag)?,
SearchNode::Duplicates { note_type_id, text } => self.write_dupes(*note_type_id, text),
SearchNode::Tag(tag) => self.write_tag(&norm(tag))?,
SearchNode::State(state) => self.write_state(state)?,
SearchNode::Flag(flag) => {
write!(self.sql, "(c.flags & 7) == {}", flag).unwrap();
@ -83,9 +118,6 @@ impl SqlWriter<'_> {
}
SearchNode::Property { operator, kind } => self.write_prop(operator, kind)?,
SearchNode::WholeCollection => write!(self.sql, "true").unwrap(),
SearchNode::Regex(re) => self.write_regex(re.as_ref()),
SearchNode::NoCombining(text) => self.write_no_combining(text.as_ref()),
SearchNode::WordBoundary(text) => self.write_word_boundary(text.as_ref()),
};
Ok(())
}
@ -424,7 +456,7 @@ mod test {
// shortcut
fn s(req: &mut Collection, search: &str) -> (String, Vec<String>) {
let node = Node::Group(parse(search).unwrap());
node_to_sql(req, &node).unwrap()
node_to_sql(req, &node, true).unwrap()
}
#[test]
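For reference, the search-side effect of the flag can be sketched in isolation. This is only an approximation of SqlWriter::norm_note above, using the unicode_normalization crate in place of rslib's normalize_to_nfc: with the flag on, search text is converted to NFC so its SQL argument lines up with note text that was normalized on save; with it off, the text is passed through verbatim, which is why the disabled-normalization test above only matches the original characters.

use std::borrow::Cow;
use unicode_normalization::UnicodeNormalization;

// Approximation of SqlWriter::norm_note: convert search text to NFC only when
// the collection-level flag is enabled, otherwise borrow it unchanged.
fn norm_note(text: &str, normalize_note_text: bool) -> Cow<'_, str> {
    if normalize_note_text {
        Cow::Owned(text.nfc().collect())
    } else {
        Cow::Borrowed(text)
    }
}

fn main() {
    // Flag on: a compatibility-form search argument becomes NFC, matching
    // notes that were normalized when they were saved.
    assert_eq!(norm_note("\u{fa47}", true), "\u{6f22}");
    // Flag off: the argument is used verbatim, so only the original bytes match.
    assert_eq!(norm_note("\u{fa47}", false), "\u{fa47}");
}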