diff --git a/Cargo.lock b/Cargo.lock index 1754bd37d..1537749eb 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -92,7 +92,7 @@ dependencies = [ "convert_case 0.6.0", "criterion", "csv", - "dissimilar", + "difflib", "env_logger", "flate2", "fluent", @@ -822,6 +822,12 @@ dependencies = [ "cipher 0.4.3", ] +[[package]] +name = "difflib" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6184e33543162437515c2e2b48714794e37845ec9851711914eec9d308f6ebe8" + [[package]] name = "digest" version = "0.10.6" @@ -844,12 +850,6 @@ dependencies = [ "syn", ] -[[package]] -name = "dissimilar" -version = "1.0.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8c97b9233581d84b8e1e689cdd3a47b6f69770084fc246e86a7f78b0d9c1d4a5" - [[package]] name = "doc-comment" version = "0.3.3" diff --git a/rslib/Cargo.toml b/rslib/Cargo.toml index 74b4999d6..0666248d7 100644 --- a/rslib/Cargo.toml +++ b/rslib/Cargo.toml @@ -57,7 +57,7 @@ bytes = "1.3.0" chrono = { version = "0.4.19", default-features = false, features = ["std", "clock"] } coarsetime = "0.1.22" convert_case = "0.6.0" -dissimilar = "1.0.4" +difflib = "0.4.0" flate2 = "1.0.25" fluent = "0.16.0" fluent-bundle = "0.15.2" diff --git a/rslib/src/typeanswer.rs b/rslib/src/typeanswer.rs index d1657ad6f..bba4f5b0b 100644 --- a/rslib/src/typeanswer.rs +++ b/rslib/src/typeanswer.rs @@ -3,7 +3,8 @@ use std::borrow::Cow; -use dissimilar::Chunk; +use difflib::sequencematcher::{Opcode, SequenceMatcher}; +use itertools::Itertools; use lazy_static::lazy_static; use regex::Regex; use unic_ucd_category::GeneralCategory; @@ -29,57 +30,55 @@ lazy_static! { } struct DiffContext { - expected: String, - provided: String, + expected: Vec, + provided: Vec, } impl DiffContext { fn new(expected: &str, provided: &str) -> Self { DiffContext { - expected: prepare_expected(expected), - provided: prepare_provided(provided), + provided: prepare_provided(provided).chars().collect_vec(), + expected: prepare_expected(expected).chars().collect_vec(), } } - fn to_tokens(&self) -> DiffOutput<'_> { - let chunks = dissimilar::diff(&self.provided, &self.expected); + fn slice_expected(&self, opcode: &Opcode) -> String { + self.expected[opcode.second_start..opcode.second_end] + .iter() + .cloned() + .collect() + } + + fn slice_provided(&self, opcode: &Opcode) -> String { + self.provided[opcode.first_start..opcode.first_end] + .iter() + .cloned() + .collect() + } + + fn to_tokens(&self) -> DiffOutput { + let mut matcher = SequenceMatcher::new(&self.provided, &self.expected); + let opcodes = matcher.get_opcodes(); let mut provided = vec![]; let mut expected = vec![]; - for chunk in chunks { - match chunk { - Chunk::Equal(text) => { - provided.push(DiffToken { - kind: DiffTokenKind::Good, - text: text.into(), - }); - expected.push(DiffToken { - kind: DiffTokenKind::Good, - text: text.into(), - }); + for opcode in opcodes { + match opcode.tag.as_str() { + "equal" => { + provided.push(DiffToken::good(self.slice_provided(&opcode))); + expected.push(DiffToken::good(self.slice_expected(&opcode))); } - Chunk::Delete(text) => { - provided.push(DiffToken { - kind: DiffTokenKind::Bad, - text: text.into(), - }); + "delete" => { + provided.push(DiffToken::bad(self.slice_provided(&opcode))); } - Chunk::Insert(text) => { - // If the preceding text was correct, indicate text was missing - if provided - .last() - .map(|v| v.kind == DiffTokenKind::Good) - .unwrap_or_default() - { - provided.push(DiffToken { - kind: DiffTokenKind::Missing, - text: text.into(), - }); - } - expected.push(DiffToken { - kind: DiffTokenKind::Missing, - text: text.into(), - }); + "insert" => { + provided.push(DiffToken::missing(self.slice_expected(&opcode))); + expected.push(DiffToken::missing(self.slice_expected(&opcode))); } + "replace" => { + provided.push(DiffToken::bad(self.slice_provided(&opcode))); + expected.push(DiffToken::missing(self.slice_expected(&opcode))); + } + _ => unreachable!(), } } DiffOutput { provided, expected } @@ -123,15 +122,38 @@ enum DiffTokenKind { } #[derive(Debug, PartialEq, Eq)] -struct DiffToken<'a> { +struct DiffToken { kind: DiffTokenKind, - text: Cow<'a, str>, + text: String, +} + +impl DiffToken { + fn bad(text: String) -> Self { + Self { + kind: DiffTokenKind::Bad, + text, + } + } + + fn good(text: String) -> Self { + Self { + kind: DiffTokenKind::Good, + text, + } + } + + fn missing(text: String) -> Self { + Self { + kind: DiffTokenKind::Missing, + text, + } + } } #[derive(Debug, PartialEq, Eq)] -struct DiffOutput<'a> { - provided: Vec>, - expected: Vec>, +struct DiffOutput { + provided: Vec, + expected: Vec, } pub fn compare_answer(expected: &str, provided: &str) -> String { @@ -168,18 +190,18 @@ fn with_isolated_leading_mark(text: &str) -> Cow { #[cfg(test)] mod test { - use DiffTokenKind::*; - use super::*; - macro_rules! token { - ($kind:ident, $text:expr) => { - DiffToken { - kind: $kind, - text: $text.into(), + macro_rules! token_factory { + ($name:ident) => { + fn $name(text: &str) -> DiffToken { + DiffToken::$name(String::from(text)) } }; } + token_factory!(bad); + token_factory!(good); + token_factory!(missing); #[test] fn tokens() { @@ -188,25 +210,25 @@ mod test { assert_eq!( output.provided, vec![ - token!(Bad, "y"), - token!(Good, " ahora q"), - token!(Bad, "e"), - token!(Good, " vamos"), - token!(Missing, " "), - token!(Good, "a hacer"), - token!(Missing, "?"), + bad("y"), + good(" ahora q"), + bad("e"), + good(" vamos"), + missing(" "), + good("a hacer"), + missing("?"), ] ); assert_eq!( output.expected, vec![ - token!(Missing, "¿Y"), - token!(Good, " ahora q"), - token!(Missing, "ué"), - token!(Good, " vamos"), - token!(Missing, " "), - token!(Good, "a hacer"), - token!(Missing, "?"), + missing("¿Y"), + good(" ahora q"), + missing("ué"), + good(" vamos"), + missing(" "), + good("a hacer"), + missing("?"), ] ); } @@ -215,17 +237,37 @@ mod test { fn html_and_media() { let ctx = DiffContext::new("[sound:foo.mp3]1  2", "1 2"); // the spacing is handled by wrapping html output in white-space: pre-wrap - assert_eq!(ctx.to_tokens().expected, &[token!(Good, "1 2")]); + assert_eq!(ctx.to_tokens().expected, &[good("1 2")]); } #[test] fn missed_chars_only_shown_in_provided_when_after_good() { let ctx = DiffContext::new("1", "23"); - assert_eq!(ctx.to_tokens().provided, &[token!(Bad, "23")]); + assert_eq!(ctx.to_tokens().provided, &[bad("23")]); let ctx = DiffContext::new("12", "1"); + assert_eq!(ctx.to_tokens().provided, &[good("1"), missing("2"),]); + } + + #[test] + fn handles_certain_unicode_as_expected() { + // this was not parsed as expected with dissimilar 1.0.4 + let ctx = DiffContext::new("쓰다듬다", "스다뜸다"); assert_eq!( ctx.to_tokens().provided, - &[token!(Good, "1"), token!(Missing, "2"),] + &[bad("스"), good("다"), bad("뜸"), good("다"),] ); } + + #[test] + fn does_not_panic_with_certain_unicode() { + // this was causing a panic with dissimilar 1.0.4 + let ctx = DiffContext::new( + "Сущность должна быть ответственна только за одно дело", + concat!( + "Single responsibility Сущность выполняет только одну задачу.", + "Повод для изменения сущности только один." + ), + ); + ctx.to_tokens(); + } }