move from Python's URI escaping to IRI escaping in Rust

Should make non-Latin text readable in the HTML editor, without the
breakages reverted in the previous change.
This commit is contained in:
Damien Elmes 2021-07-16 10:35:09 +10:00
parent e97c381a6f
commit bf507cca98
14 changed files with 252 additions and 27 deletions

15
Cargo.lock generated
View file

@ -76,6 +76,7 @@ dependencies = [
"num-integer",
"num_enum",
"once_cell",
"pct-str",
"pin-project",
"proc-macro-nested",
"prost",
@ -1466,6 +1467,14 @@ dependencies = [
"proc-macro-hack",
]
[[package]]
name = "pct-str"
version = "1.1.0"
source = "git+https://github.com/timothee-haudebourg/pct-str.git?rev=4adccd8d4a222ab2672350a102f06ae832a0572d#4adccd8d4a222ab2672350a102f06ae832a0572d"
dependencies = [
"utf8-decode",
]
[[package]]
name = "percent-encoding"
version = "2.1.0"
@ -2719,6 +2728,12 @@ version = "0.7.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9"
[[package]]
name = "utf8-decode"
version = "1.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ca61eb27fa339aa08826a29f03e87b99b4d8f0fc2255306fd266bb1b6a9de498"
[[package]]
name = "utime"
version = "0.3.1"

View file

@ -1471,6 +1471,16 @@ def raze_fetch_remote_crates():
build_file = Label("//cargo/remote:BUILD.paste-impl-0.1.18.bazel"),
)
maybe(
new_git_repository,
name = "raze__pct_str__1_1_0",
remote = "https://github.com/timothee-haudebourg/pct-str.git",
shallow_since = "1605376517 +0100",
commit = "4adccd8d4a222ab2672350a102f06ae832a0572d",
build_file = Label("//cargo/remote:BUILD.pct-str-1.1.0.bazel"),
init_submodules = True,
)
maybe(
http_archive,
name = "raze__percent_encoding__2_1_0",
@ -2741,6 +2751,16 @@ def raze_fetch_remote_crates():
build_file = Label("//cargo/remote:BUILD.utf-8-0.7.6.bazel"),
)
maybe(
http_archive,
name = "raze__utf8_decode__1_0_1",
url = "https://crates.io/api/v1/crates/utf8-decode/1.0.1/download",
type = "tar.gz",
sha256 = "ca61eb27fa339aa08826a29f03e87b99b4d8f0fc2255306fd266bb1b6a9de498",
strip_prefix = "utf8-decode-1.0.1",
build_file = Label("//cargo/remote:BUILD.utf8-decode-1.0.1.bazel"),
)
maybe(
http_archive,
name = "raze__utime__0_3_1",

View file

@ -1340,6 +1340,15 @@
"license_file": null,
"description": "Implementation detail of the `paste` crate"
},
{
"name": "pct-str",
"version": "1.1.0",
"authors": "Timothée Haudebourg <author@haudebourg.net>",
"repository": "https://github.com/timothee-haudebourg/pct-str",
"license": "Apache-2.0 OR MIT",
"license_file": null,
"description": "Percent-encoded strings for URL, URI, IRI, etc."
},
{
"name": "percent-encoding",
"version": "2.1.0",
@ -2492,6 +2501,15 @@
"license_file": null,
"description": "Incremental, zero-copy UTF-8 decoding with error handling"
},
{
"name": "utf8-decode",
"version": "1.0.1",
"authors": "Timothée Haudebourg <author@haudebourg.net>",
"repository": "https://github.com/timothee-haudebourg/utf8-decode",
"license": "Apache-2.0 OR MIT",
"license_file": null,
"description": "UTF-8 incremental decoding iterators."
},
{
"name": "utime",
"version": "0.3.1",

60
cargo/remote/BUILD.pct-str-1.1.0.bazel vendored Normal file
View file

@ -0,0 +1,60 @@
"""
@generated
cargo-raze crate build file.
DO NOT EDIT! Replaced on runs of cargo-raze
"""
# buildifier: disable=load
load("@bazel_skylib//lib:selects.bzl", "selects")
# buildifier: disable=load
load(
"@rules_rust//rust:rust.bzl",
"rust_binary",
"rust_library",
"rust_test",
)
package(default_visibility = [
# Public for visibility by "@raze__crate__version//" targets.
#
# Prefer access through "//cargo", which limits external
# visibility to explicit Cargo.toml dependencies.
"//visibility:public",
])
licenses([
"notice", # MIT from expression "MIT OR Apache-2.0"
])
# Generated Targets
# Unsupported target "encode" with type "example" omitted
# Unsupported target "str" with type "example" omitted
# Unsupported target "string" with type "example" omitted
rust_library(
name = "pct_str",
srcs = glob(["**/*.rs"]),
crate_features = [
],
crate_root = "src/lib.rs",
crate_type = "lib",
data = [],
edition = "2018",
rustc_flags = [
"--cap-lints=allow",
],
tags = [
"cargo-raze",
"manual",
],
version = "1.1.0",
# buildifier: leave-alone
deps = [
"@raze__utf8_decode__1_0_1//:utf8_decode",
],
)

View file

@ -0,0 +1,57 @@
"""
@generated
cargo-raze crate build file.
DO NOT EDIT! Replaced on runs of cargo-raze
"""
# buildifier: disable=load
load("@bazel_skylib//lib:selects.bzl", "selects")
# buildifier: disable=load
load(
"@rules_rust//rust:rust.bzl",
"rust_binary",
"rust_library",
"rust_test",
)
package(default_visibility = [
# Public for visibility by "@raze__crate__version//" targets.
#
# Prefer access through "//cargo", which limits external
# visibility to explicit Cargo.toml dependencies.
"//visibility:public",
])
licenses([
"notice", # MIT from expression "MIT OR Apache-2.0"
])
# Generated Targets
# Unsupported target "safe" with type "example" omitted
# Unsupported target "unsafe" with type "example" omitted
rust_library(
name = "utf8_decode",
srcs = glob(["**/*.rs"]),
crate_features = [
],
crate_root = "src/lib.rs",
crate_type = "lib",
data = [],
edition = "2018",
rustc_flags = [
"--cap-lints=allow",
],
tags = [
"cargo-raze",
"manual",
],
version = "1.0.1",
# buildifier: leave-alone
deps = [
],
)

View file

@ -19,6 +19,8 @@ COMMITS_SHALLOW_SINCE = {
"0cb6f7d14c62819e37cd221736f8b0555e823712": "1619519657 +1000",
# tokio-io-timeout
"1ee0892217e9a76bba4bb369ec5fab8854935a3c": "1619517354 +1000",
# pct-str
"4adccd8d4a222ab2672350a102f06ae832a0572d": "1605376517 +0100",
}
import glob

View file

@ -21,6 +21,8 @@ service CardRenderingService {
returns (RenderCardResponse);
rpc StripAVTags(generic.String) returns (generic.String);
rpc RenderMarkdown(RenderMarkdownRequest) returns (generic.String);
rpc EncodeIriPaths(generic.String) returns (generic.String);
rpc DecodeIriPaths(generic.String) returns (generic.String);
}
message ExtractAVTagsRequest {

View file

@ -8,10 +8,7 @@ import pprint
import re
import sys
import time
import urllib.error
import urllib.parse
import urllib.request
from typing import Any, Callable, List, Match, Optional, Tuple
from typing import Any, Callable, List, Optional, Tuple
from anki import media_pb2
from anki.consts import *
@ -191,22 +188,10 @@ class MediaManager:
def escape_media_filenames(self, string: str, unescape: bool = False) -> str:
"Apply or remove percent encoding to filenames in html tags (audio, image, object)."
fn: Callable
if unescape:
fn = urllib.parse.unquote
return self.col._backend.decode_iri_paths(string)
else:
fn = urllib.parse.quote
def repl(match: Match) -> str:
tag = match.group(0)
fname = match.group("fname")
if re.match("(https?|ftp)://", fname):
return tag
return tag.replace(fname, fn(fname))
for reg in self.html_media_regexps:
string = re.sub(reg, repl, string)
return string
return self.col._backend.encode_iri_paths(string)
# Checking media
##########################################################################

View file

@ -116,6 +116,7 @@ rust_library(
"//rslib/cargo:unicode_normalization",
"//rslib/cargo:utime",
"//rslib/cargo:zip",
"//rslib/cargo:pct_str",
"//rslib/i18n:anki_i18n",
] + select({
# rustls on Linux

View file

@ -92,3 +92,4 @@ pulldown-cmark = "0.8.0"
fnv = "1.0.7"
strum = { version = "0.21.0", features = ["derive"] }
tokio-util = { version = "0.6.7", features = ["io"] }
pct-str = { git="https://github.com/timothee-haudebourg/pct-str.git", rev="4adccd8d4a222ab2672350a102f06ae832a0572d" }

View file

@ -210,6 +210,15 @@ alias(
],
)
alias(
name = "pct_str",
actual = "@raze__pct_str__1_1_0//:pct_str",
tags = [
"cargo-raze",
"manual",
],
)
alias(
name = "pin_project",
actual = "@raze__pin_project__1_0_7//:pin_project",

View file

@ -10,7 +10,10 @@ use crate::{
notetype::{CardTemplateSchema11, RenderCardOutput},
prelude::*,
template::RenderedNode,
text::{extract_av_tags, sanitize_html_no_images, strip_av_tags, AvTag},
text::{
decode_iri_paths, encode_iri_paths, extract_av_tags, sanitize_html_no_images,
strip_av_tags, AvTag,
},
};
impl CardRenderingService for Backend {
@ -150,6 +153,14 @@ impl CardRenderingService for Backend {
}
Ok(text.into())
}
fn encode_iri_paths(&self, input: pb::String) -> Result<pb::String> {
    // Hand the request text to the text-module helper of the same name,
    // then wrap the resulting string back into a `pb::String`.
    let encoded = encode_iri_paths(&input.val).into_owned();
    Ok(encoded.into())
}
fn decode_iri_paths(&self, input: pb::String) -> Result<pb::String> {
    // Hand the request text to the text-module helper of the same name,
    // then wrap the resulting string back into a `pb::String`.
    let decoded = decode_iri_paths(&input.val).into_owned();
    Ok(decoded.into())
}
}
fn rendered_nodes_to_proto(nodes: Vec<RenderedNode>) -> Vec<pb::RenderedTemplateNode> {

View file

@ -8,9 +8,6 @@ use std::{
path::Path,
};
use lazy_static::lazy_static;
use regex::Regex;
use crate::{
collection::Collection,
error::{AnkiError, DbErrorKind, Result},
@ -25,13 +22,9 @@ use crate::{
MediaManager,
},
notes::Note,
text::{extract_media_refs, normalize_to_nfc, MediaRef},
text::{extract_media_refs, normalize_to_nfc, MediaRef, REMOTE_FILENAME},
};
lazy_static! {
static ref REMOTE_FILENAME: Regex = Regex::new("(?i)^https?://").unwrap();
}
#[derive(Debug, PartialEq, Clone)]
pub struct MediaCheckOutput {
pub unused: Vec<String>,

View file

@ -4,6 +4,7 @@
use std::{borrow::Cow, ptr};
use lazy_static::lazy_static;
use pct_str::{IriReserved, PctStr, PctString};
use regex::{Captures, Regex};
use unicase::eq as uni_eq;
use unicode_normalization::{
@ -424,6 +425,56 @@ pub(crate) fn matches_glob(text: &str, search: &str) -> bool {
}
}
// Case-insensitive pattern matching text that begins with `http://` or
// `https://`. Filenames matching it are treated as remote references rather
// than local media files.
// NOTE(review): other schemes such as `ftp://` are not matched here — confirm
// that is intended.
lazy_static! {
pub(crate) static ref REMOTE_FILENAME: Regex = Regex::new("(?i)^https?://").unwrap();
}
/// Percent-encode the local media filenames referenced in an HTML fragment.
///
/// Each filename is encoded with `pct_str`'s `IriReserved::Segment` encoder.
/// Locating the filenames inside the HTML, and leaving remote references
/// alone, is delegated to `transform_html_paths`.
pub(crate) fn encode_iri_paths(unescaped_html: &str) -> Cow<str> {
    /// Encode a single filename; the result is always a freshly built string.
    fn encode_segment(fname: &str) -> Cow<str> {
        let encoded = PctString::encode(fname.chars(), IriReserved::Segment);
        Cow::Owned(encoded.into_string())
    }

    transform_html_paths(unescaped_html, encode_segment)
}
/// Percent-decode the local media filenames referenced in an HTML fragment.
///
/// A filename that is not validly percent-encoded is left exactly as it was.
/// Locating the filenames inside the HTML, and leaving remote references
/// alone, is delegated to `transform_html_paths`.
///
/// NOTE(review): the parameter is named `unescaped_html`, but since this
/// function decodes, the input presumably carries percent-encoded paths.
pub(crate) fn decode_iri_paths(unescaped_html: &str) -> Cow<str> {
    /// Decode a single filename, falling back to the input on invalid encoding.
    fn decode_segment(fname: &str) -> Cow<str> {
        if let Ok(pct) = PctStr::new(fname) {
            Cow::Owned(pct.decode())
        } else {
            // invalid percent encoding; return unchanged
            Cow::Borrowed(fname)
        }
    }

    transform_html_paths(unescaped_html, decode_segment)
}
/// Apply a transform to local filename references in tags like IMG.
/// Required at display time, as Anki unfortunately stores the references
/// in unencoded form in the database.
///
/// For every match of `HTML_MEDIA_TAGS`, the filename is taken from the first
/// of capture groups 1, 2 or 3 that participated in the match, with
/// surrounding whitespace trimmed. Filenames matching `REMOTE_FILENAME` are
/// left untouched; any other filename is replaced by `transform(fname)`.
///
/// Only the captured filename span is rewritten. Other text inside the same
/// tag is preserved even if it happens to contain the same characters as the
/// filename (for example a `title` attribute repeating the filename).
///
/// # Panics
///
/// Panics if a match has none of capture groups 1-3 set.
fn transform_html_paths<F>(html: &str, transform: F) -> Cow<str>
where
    F: Fn(&str) -> Cow<str>,
{
    HTML_MEDIA_TAGS.replace_all(html, |caps: &Captures| {
        let whole = caps
            .get(0)
            .expect("capture group 0 is always present for a match");
        let tag = whole.as_str();

        let group = caps
            .get(1)
            .or_else(|| caps.get(2))
            .or_else(|| caps.get(3))
            .expect("HTML_MEDIA_TAGS should capture the filename in group 1, 2 or 3");
        let raw = group.as_str();
        let fname = raw.trim();

        if REMOTE_FILENAME.is_match(fname) {
            return tag.to_string();
        }

        // Byte range of the trimmed filename, relative to the start of the tag.
        // Offsets from the regex are relative to `html`, so rebase them first,
        // then skip any leading whitespace that `trim()` removed.
        let leading_ws = raw.len() - raw.trim_start().len();
        let start = group.start() - whole.start() + leading_ws;
        let end = start + fname.len();

        let replacement = transform(fname);
        let mut out = String::with_capacity(tag.len() - fname.len() + replacement.len());
        out.push_str(&tag[..start]);
        out.push_str(&replacement);
        out.push_str(&tag[end..]);
        out
    })
}
#[cfg(test)]
mod test {
use std::borrow::Cow;