mirror of
https://github.com/ankitects/anki.git
synced 2025-09-18 22:12:21 -04:00
Replace dissimilar crate with difflib (#2322)
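This also shows the expected text as missing in the provided answer when it is missing at the very beginning of the provided text (previously it was only shown when it followed correctly typed text).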
This commit is contained in:
parent
604e0f46e1
commit
1be30573e1
3 changed files with 118 additions and 76 deletions
14
Cargo.lock
generated
14
Cargo.lock
generated
|
@ -92,7 +92,7 @@ dependencies = [
|
||||||
"convert_case 0.6.0",
|
"convert_case 0.6.0",
|
||||||
"criterion",
|
"criterion",
|
||||||
"csv",
|
"csv",
|
||||||
"dissimilar",
|
"difflib",
|
||||||
"env_logger",
|
"env_logger",
|
||||||
"flate2",
|
"flate2",
|
||||||
"fluent",
|
"fluent",
|
||||||
|
@ -822,6 +822,12 @@ dependencies = [
|
||||||
"cipher 0.4.3",
|
"cipher 0.4.3",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "difflib"
|
||||||
|
version = "0.4.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "6184e33543162437515c2e2b48714794e37845ec9851711914eec9d308f6ebe8"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "digest"
|
name = "digest"
|
||||||
version = "0.10.6"
|
version = "0.10.6"
|
||||||
|
@ -844,12 +850,6 @@ dependencies = [
|
||||||
"syn",
|
"syn",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "dissimilar"
|
|
||||||
version = "1.0.4"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "8c97b9233581d84b8e1e689cdd3a47b6f69770084fc246e86a7f78b0d9c1d4a5"
|
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "doc-comment"
|
name = "doc-comment"
|
||||||
version = "0.3.3"
|
version = "0.3.3"
|
||||||
|
|
|
@ -57,7 +57,7 @@ bytes = "1.3.0"
|
||||||
chrono = { version = "0.4.19", default-features = false, features = ["std", "clock"] }
|
chrono = { version = "0.4.19", default-features = false, features = ["std", "clock"] }
|
||||||
coarsetime = "0.1.22"
|
coarsetime = "0.1.22"
|
||||||
convert_case = "0.6.0"
|
convert_case = "0.6.0"
|
||||||
dissimilar = "1.0.4"
|
difflib = "0.4.0"
|
||||||
flate2 = "1.0.25"
|
flate2 = "1.0.25"
|
||||||
fluent = "0.16.0"
|
fluent = "0.16.0"
|
||||||
fluent-bundle = "0.15.2"
|
fluent-bundle = "0.15.2"
|
||||||
|
|
|
@ -3,7 +3,8 @@
|
||||||
|
|
||||||
use std::borrow::Cow;
|
use std::borrow::Cow;
|
||||||
|
|
||||||
use dissimilar::Chunk;
|
use difflib::sequencematcher::{Opcode, SequenceMatcher};
|
||||||
|
use itertools::Itertools;
|
||||||
use lazy_static::lazy_static;
|
use lazy_static::lazy_static;
|
||||||
use regex::Regex;
|
use regex::Regex;
|
||||||
use unic_ucd_category::GeneralCategory;
|
use unic_ucd_category::GeneralCategory;
|
||||||
|
@ -29,57 +30,55 @@ lazy_static! {
|
||||||
}
|
}
|
||||||
|
|
||||||
struct DiffContext {
|
struct DiffContext {
|
||||||
expected: String,
|
expected: Vec<char>,
|
||||||
provided: String,
|
provided: Vec<char>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl DiffContext {
|
impl DiffContext {
|
||||||
fn new(expected: &str, provided: &str) -> Self {
|
fn new(expected: &str, provided: &str) -> Self {
|
||||||
DiffContext {
|
DiffContext {
|
||||||
expected: prepare_expected(expected),
|
provided: prepare_provided(provided).chars().collect_vec(),
|
||||||
provided: prepare_provided(provided),
|
expected: prepare_expected(expected).chars().collect_vec(),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn to_tokens(&self) -> DiffOutput<'_> {
|
fn slice_expected(&self, opcode: &Opcode) -> String {
|
||||||
let chunks = dissimilar::diff(&self.provided, &self.expected);
|
self.expected[opcode.second_start..opcode.second_end]
|
||||||
|
.iter()
|
||||||
|
.cloned()
|
||||||
|
.collect()
|
||||||
|
}
|
||||||
|
|
||||||
|
fn slice_provided(&self, opcode: &Opcode) -> String {
|
||||||
|
self.provided[opcode.first_start..opcode.first_end]
|
||||||
|
.iter()
|
||||||
|
.cloned()
|
||||||
|
.collect()
|
||||||
|
}
|
||||||
|
|
||||||
|
fn to_tokens(&self) -> DiffOutput {
|
||||||
|
let mut matcher = SequenceMatcher::new(&self.provided, &self.expected);
|
||||||
|
let opcodes = matcher.get_opcodes();
|
||||||
let mut provided = vec![];
|
let mut provided = vec![];
|
||||||
let mut expected = vec![];
|
let mut expected = vec![];
|
||||||
for chunk in chunks {
|
for opcode in opcodes {
|
||||||
match chunk {
|
match opcode.tag.as_str() {
|
||||||
Chunk::Equal(text) => {
|
"equal" => {
|
||||||
provided.push(DiffToken {
|
provided.push(DiffToken::good(self.slice_provided(&opcode)));
|
||||||
kind: DiffTokenKind::Good,
|
expected.push(DiffToken::good(self.slice_expected(&opcode)));
|
||||||
text: text.into(),
|
|
||||||
});
|
|
||||||
expected.push(DiffToken {
|
|
||||||
kind: DiffTokenKind::Good,
|
|
||||||
text: text.into(),
|
|
||||||
});
|
|
||||||
}
|
}
|
||||||
Chunk::Delete(text) => {
|
"delete" => {
|
||||||
provided.push(DiffToken {
|
provided.push(DiffToken::bad(self.slice_provided(&opcode)));
|
||||||
kind: DiffTokenKind::Bad,
|
|
||||||
text: text.into(),
|
|
||||||
});
|
|
||||||
}
|
}
|
||||||
Chunk::Insert(text) => {
|
"insert" => {
|
||||||
// If the preceding text was correct, indicate text was missing
|
provided.push(DiffToken::missing(self.slice_expected(&opcode)));
|
||||||
if provided
|
expected.push(DiffToken::missing(self.slice_expected(&opcode)));
|
||||||
.last()
|
|
||||||
.map(|v| v.kind == DiffTokenKind::Good)
|
|
||||||
.unwrap_or_default()
|
|
||||||
{
|
|
||||||
provided.push(DiffToken {
|
|
||||||
kind: DiffTokenKind::Missing,
|
|
||||||
text: text.into(),
|
|
||||||
});
|
|
||||||
}
|
|
||||||
expected.push(DiffToken {
|
|
||||||
kind: DiffTokenKind::Missing,
|
|
||||||
text: text.into(),
|
|
||||||
});
|
|
||||||
}
|
}
|
||||||
|
"replace" => {
|
||||||
|
provided.push(DiffToken::bad(self.slice_provided(&opcode)));
|
||||||
|
expected.push(DiffToken::missing(self.slice_expected(&opcode)));
|
||||||
|
}
|
||||||
|
_ => unreachable!(),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
DiffOutput { provided, expected }
|
DiffOutput { provided, expected }
|
||||||
|
@ -123,15 +122,38 @@ enum DiffTokenKind {
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, PartialEq, Eq)]
|
#[derive(Debug, PartialEq, Eq)]
|
||||||
struct DiffToken<'a> {
|
struct DiffToken {
|
||||||
kind: DiffTokenKind,
|
kind: DiffTokenKind,
|
||||||
text: Cow<'a, str>,
|
text: String,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl DiffToken {
|
||||||
|
fn bad(text: String) -> Self {
|
||||||
|
Self {
|
||||||
|
kind: DiffTokenKind::Bad,
|
||||||
|
text,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn good(text: String) -> Self {
|
||||||
|
Self {
|
||||||
|
kind: DiffTokenKind::Good,
|
||||||
|
text,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn missing(text: String) -> Self {
|
||||||
|
Self {
|
||||||
|
kind: DiffTokenKind::Missing,
|
||||||
|
text,
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, PartialEq, Eq)]
|
#[derive(Debug, PartialEq, Eq)]
|
||||||
struct DiffOutput<'a> {
|
struct DiffOutput {
|
||||||
provided: Vec<DiffToken<'a>>,
|
provided: Vec<DiffToken>,
|
||||||
expected: Vec<DiffToken<'a>>,
|
expected: Vec<DiffToken>,
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn compare_answer(expected: &str, provided: &str) -> String {
|
pub fn compare_answer(expected: &str, provided: &str) -> String {
|
||||||
|
@ -168,18 +190,18 @@ fn with_isolated_leading_mark(text: &str) -> Cow<str> {
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod test {
|
mod test {
|
||||||
use DiffTokenKind::*;
|
|
||||||
|
|
||||||
use super::*;
|
use super::*;
|
||||||
|
|
||||||
macro_rules! token {
|
macro_rules! token_factory {
|
||||||
($kind:ident, $text:expr) => {
|
($name:ident) => {
|
||||||
DiffToken {
|
fn $name(text: &str) -> DiffToken {
|
||||||
kind: $kind,
|
DiffToken::$name(String::from(text))
|
||||||
text: $text.into(),
|
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
token_factory!(bad);
|
||||||
|
token_factory!(good);
|
||||||
|
token_factory!(missing);
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn tokens() {
|
fn tokens() {
|
||||||
|
@ -188,25 +210,25 @@ mod test {
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
output.provided,
|
output.provided,
|
||||||
vec![
|
vec![
|
||||||
token!(Bad, "y"),
|
bad("y"),
|
||||||
token!(Good, " ahora q"),
|
good(" ahora q"),
|
||||||
token!(Bad, "e"),
|
bad("e"),
|
||||||
token!(Good, " vamos"),
|
good(" vamos"),
|
||||||
token!(Missing, " "),
|
missing(" "),
|
||||||
token!(Good, "a hacer"),
|
good("a hacer"),
|
||||||
token!(Missing, "?"),
|
missing("?"),
|
||||||
]
|
]
|
||||||
);
|
);
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
output.expected,
|
output.expected,
|
||||||
vec![
|
vec![
|
||||||
token!(Missing, "¿Y"),
|
missing("¿Y"),
|
||||||
token!(Good, " ahora q"),
|
good(" ahora q"),
|
||||||
token!(Missing, "ué"),
|
missing("ué"),
|
||||||
token!(Good, " vamos"),
|
good(" vamos"),
|
||||||
token!(Missing, " "),
|
missing(" "),
|
||||||
token!(Good, "a hacer"),
|
good("a hacer"),
|
||||||
token!(Missing, "?"),
|
missing("?"),
|
||||||
]
|
]
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
@ -215,17 +237,37 @@ mod test {
|
||||||
fn html_and_media() {
|
fn html_and_media() {
|
||||||
let ctx = DiffContext::new("[sound:foo.mp3]<b>1</b> 2", "1 2");
|
let ctx = DiffContext::new("[sound:foo.mp3]<b>1</b> 2", "1 2");
|
||||||
// the spacing is handled by wrapping html output in white-space: pre-wrap
|
// the spacing is handled by wrapping html output in white-space: pre-wrap
|
||||||
assert_eq!(ctx.to_tokens().expected, &[token!(Good, "1 2")]);
|
assert_eq!(ctx.to_tokens().expected, &[good("1 2")]);
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn missed_chars_only_shown_in_provided_when_after_good() {
|
fn missed_chars_only_shown_in_provided_when_after_good() {
|
||||||
let ctx = DiffContext::new("1", "23");
|
let ctx = DiffContext::new("1", "23");
|
||||||
assert_eq!(ctx.to_tokens().provided, &[token!(Bad, "23")]);
|
assert_eq!(ctx.to_tokens().provided, &[bad("23")]);
|
||||||
let ctx = DiffContext::new("12", "1");
|
let ctx = DiffContext::new("12", "1");
|
||||||
|
assert_eq!(ctx.to_tokens().provided, &[good("1"), missing("2"),]);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn handles_certain_unicode_as_expected() {
|
||||||
|
// this was not parsed as expected with dissimilar 1.0.4
|
||||||
|
let ctx = DiffContext::new("쓰다듬다", "스다뜸다");
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
ctx.to_tokens().provided,
|
ctx.to_tokens().provided,
|
||||||
&[token!(Good, "1"), token!(Missing, "2"),]
|
&[bad("스"), good("다"), bad("뜸"), good("다"),]
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn does_not_panic_with_certain_unicode() {
|
||||||
|
// this was causing a panic with dissimilar 1.0.4
|
||||||
|
let ctx = DiffContext::new(
|
||||||
|
"Сущность должна быть ответственна только за одно дело",
|
||||||
|
concat!(
|
||||||
|
"Single responsibility Сущность выполняет только одну задачу.",
|
||||||
|
"Повод для изменения сущности только один."
|
||||||
|
),
|
||||||
|
);
|
||||||
|
ctx.to_tokens();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in a new issue