diff --git a/Cargo.lock b/Cargo.lock index e275039a6..ac0cae130 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -76,6 +76,7 @@ dependencies = [ "num-integer", "num_enum", "once_cell", + "pct-str", "pin-project", "proc-macro-nested", "prost", @@ -1466,6 +1467,14 @@ dependencies = [ "proc-macro-hack", ] +[[package]] +name = "pct-str" +version = "1.1.0" +source = "git+https://github.com/timothee-haudebourg/pct-str.git?rev=4adccd8d4a222ab2672350a102f06ae832a0572d#4adccd8d4a222ab2672350a102f06ae832a0572d" +dependencies = [ + "utf8-decode", +] + [[package]] name = "percent-encoding" version = "2.1.0" @@ -2719,6 +2728,12 @@ version = "0.7.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9" +[[package]] +name = "utf8-decode" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ca61eb27fa339aa08826a29f03e87b99b4d8f0fc2255306fd266bb1b6a9de498" + [[package]] name = "utime" version = "0.3.1" diff --git a/cargo/crates.bzl b/cargo/crates.bzl index 3d8e16295..72f7896d9 100644 --- a/cargo/crates.bzl +++ b/cargo/crates.bzl @@ -1471,6 +1471,16 @@ def raze_fetch_remote_crates(): build_file = Label("//cargo/remote:BUILD.paste-impl-0.1.18.bazel"), ) + maybe( + new_git_repository, + name = "raze__pct_str__1_1_0", + remote = "https://github.com/timothee-haudebourg/pct-str.git", + shallow_since = "1605376517 +0100", + commit = "4adccd8d4a222ab2672350a102f06ae832a0572d", + build_file = Label("//cargo/remote:BUILD.pct-str-1.1.0.bazel"), + init_submodules = True, + ) + maybe( http_archive, name = "raze__percent_encoding__2_1_0", @@ -2741,6 +2751,16 @@ def raze_fetch_remote_crates(): build_file = Label("//cargo/remote:BUILD.utf-8-0.7.6.bazel"), ) + maybe( + http_archive, + name = "raze__utf8_decode__1_0_1", + url = "https://crates.io/api/v1/crates/utf8-decode/1.0.1/download", + type = "tar.gz", + sha256 = 
"ca61eb27fa339aa08826a29f03e87b99b4d8f0fc2255306fd266bb1b6a9de498", + strip_prefix = "utf8-decode-1.0.1", + build_file = Label("//cargo/remote:BUILD.utf8-decode-1.0.1.bazel"), + ) + maybe( http_archive, name = "raze__utime__0_3_1", diff --git a/cargo/licenses.json b/cargo/licenses.json index 0415fc2c3..87bfdb506 100644 --- a/cargo/licenses.json +++ b/cargo/licenses.json @@ -1340,6 +1340,15 @@ "license_file": null, "description": "Implementation detail of the `paste` crate" }, + { + "name": "pct-str", + "version": "1.1.0", + "authors": "Timothée Haudebourg ", + "repository": "https://github.com/timothee-haudebourg/pct-str", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Percent-encoded strings for URL, URI, IRI, etc." + }, { "name": "percent-encoding", "version": "2.1.0", @@ -2492,6 +2501,15 @@ "license_file": null, "description": "Incremental, zero-copy UTF-8 decoding with error handling" }, + { + "name": "utf8-decode", + "version": "1.0.1", + "authors": "Timothée Haudebourg ", + "repository": "https://github.com/timothee-haudebourg/utf8-decode", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "UTF-8 incremental decoding iterators." + }, { "name": "utime", "version": "0.3.1", diff --git a/cargo/remote/BUILD.pct-str-1.1.0.bazel b/cargo/remote/BUILD.pct-str-1.1.0.bazel new file mode 100644 index 000000000..df38d6654 --- /dev/null +++ b/cargo/remote/BUILD.pct-str-1.1.0.bazel @@ -0,0 +1,60 @@ +""" +@generated +cargo-raze crate build file. + +DO NOT EDIT! Replaced on runs of cargo-raze +""" + +# buildifier: disable=load +load("@bazel_skylib//lib:selects.bzl", "selects") + +# buildifier: disable=load +load( + "@rules_rust//rust:rust.bzl", + "rust_binary", + "rust_library", + "rust_test", +) + +package(default_visibility = [ + # Public for visibility by "@raze__crate__version//" targets. + # + # Prefer access through "//cargo", which limits external + # visibility to explicit Cargo.toml dependencies. 
+ "//visibility:public", +]) + +licenses([ + "notice", # MIT from expression "MIT OR Apache-2.0" +]) + +# Generated Targets + +# Unsupported target "encode" with type "example" omitted + +# Unsupported target "str" with type "example" omitted + +# Unsupported target "string" with type "example" omitted + +rust_library( + name = "pct_str", + srcs = glob(["**/*.rs"]), + crate_features = [ + ], + crate_root = "src/lib.rs", + crate_type = "lib", + data = [], + edition = "2018", + rustc_flags = [ + "--cap-lints=allow", + ], + tags = [ + "cargo-raze", + "manual", + ], + version = "1.1.0", + # buildifier: leave-alone + deps = [ + "@raze__utf8_decode__1_0_1//:utf8_decode", + ], +) diff --git a/cargo/remote/BUILD.utf8-decode-1.0.1.bazel b/cargo/remote/BUILD.utf8-decode-1.0.1.bazel new file mode 100644 index 000000000..48530e9cf --- /dev/null +++ b/cargo/remote/BUILD.utf8-decode-1.0.1.bazel @@ -0,0 +1,57 @@ +""" +@generated +cargo-raze crate build file. + +DO NOT EDIT! Replaced on runs of cargo-raze +""" + +# buildifier: disable=load +load("@bazel_skylib//lib:selects.bzl", "selects") + +# buildifier: disable=load +load( + "@rules_rust//rust:rust.bzl", + "rust_binary", + "rust_library", + "rust_test", +) + +package(default_visibility = [ + # Public for visibility by "@raze__crate__version//" targets. + # + # Prefer access through "//cargo", which limits external + # visibility to explicit Cargo.toml dependencies. 
+ "//visibility:public", +]) + +licenses([ + "notice", # MIT from expression "MIT OR Apache-2.0" +]) + +# Generated Targets + +# Unsupported target "safe" with type "example" omitted + +# Unsupported target "unsafe" with type "example" omitted + +rust_library( + name = "utf8_decode", + srcs = glob(["**/*.rs"]), + crate_features = [ + ], + crate_root = "src/lib.rs", + crate_type = "lib", + data = [], + edition = "2018", + rustc_flags = [ + "--cap-lints=allow", + ], + tags = [ + "cargo-raze", + "manual", + ], + version = "1.0.1", + # buildifier: leave-alone + deps = [ + ], +) diff --git a/cargo/update.py b/cargo/update.py index 1fcf7688d..a16731ec0 100755 --- a/cargo/update.py +++ b/cargo/update.py @@ -19,6 +19,8 @@ COMMITS_SHALLOW_SINCE = { "0cb6f7d14c62819e37cd221736f8b0555e823712": "1619519657 +1000", # tokio-io-timeout "1ee0892217e9a76bba4bb369ec5fab8854935a3c": "1619517354 +1000", + # pct-str + "4adccd8d4a222ab2672350a102f06ae832a0572d": "1605376517 +0100", } import glob diff --git a/proto/anki/card_rendering.proto b/proto/anki/card_rendering.proto index 589293086..744332e19 100644 --- a/proto/anki/card_rendering.proto +++ b/proto/anki/card_rendering.proto @@ -21,6 +21,8 @@ service CardRenderingService { returns (RenderCardResponse); rpc StripAVTags(generic.String) returns (generic.String); rpc RenderMarkdown(RenderMarkdownRequest) returns (generic.String); + rpc EncodeIriPaths(generic.String) returns (generic.String); + rpc DecodeIriPaths(generic.String) returns (generic.String); } message ExtractAVTagsRequest { diff --git a/pylib/anki/media.py b/pylib/anki/media.py index 29f67f047..daae446d0 100644 --- a/pylib/anki/media.py +++ b/pylib/anki/media.py @@ -8,10 +8,7 @@ import pprint import re import sys import time -import urllib.error -import urllib.parse -import urllib.request -from typing import Any, Callable, List, Match, Optional, Tuple +from typing import Any, Callable, List, Optional, Tuple from anki import media_pb2 from anki.consts import * @@ -191,22 
+188,10 @@ class MediaManager: def escape_media_filenames(self, string: str, unescape: bool = False) -> str: "Apply or remove percent encoding to filenames in html tags (audio, image, object)." - fn: Callable if unescape: - fn = urllib.parse.unquote + return self.col._backend.decode_iri_paths(string) else: - fn = urllib.parse.quote - - def repl(match: Match) -> str: - tag = match.group(0) - fname = match.group("fname") - if re.match("(https?|ftp)://", fname): - return tag - return tag.replace(fname, fn(fname)) - - for reg in self.html_media_regexps: - string = re.sub(reg, repl, string) - return string + return self.col._backend.encode_iri_paths(string) # Checking media ########################################################################## diff --git a/rslib/BUILD.bazel b/rslib/BUILD.bazel index 27db1eaf8..857f64aa3 100644 --- a/rslib/BUILD.bazel +++ b/rslib/BUILD.bazel @@ -116,6 +116,7 @@ rust_library( "//rslib/cargo:unicode_normalization", "//rslib/cargo:utime", "//rslib/cargo:zip", + "//rslib/cargo:pct_str", "//rslib/i18n:anki_i18n", ] + select({ # rustls on Linux diff --git a/rslib/Cargo.toml b/rslib/Cargo.toml index 11ac0f24c..07fa4c894 100644 --- a/rslib/Cargo.toml +++ b/rslib/Cargo.toml @@ -92,3 +92,4 @@ pulldown-cmark = "0.8.0" fnv = "1.0.7" strum = { version = "0.21.0", features = ["derive"] } tokio-util = { version = "0.6.7", features = ["io"] } +pct-str = { git="https://github.com/timothee-haudebourg/pct-str.git", rev="4adccd8d4a222ab2672350a102f06ae832a0572d" } diff --git a/rslib/cargo/BUILD.bazel b/rslib/cargo/BUILD.bazel index 5d3e72813..6377a3531 100644 --- a/rslib/cargo/BUILD.bazel +++ b/rslib/cargo/BUILD.bazel @@ -210,6 +210,15 @@ alias( ], ) +alias( + name = "pct_str", + actual = "@raze__pct_str__1_1_0//:pct_str", + tags = [ + "cargo-raze", + "manual", + ], +) + alias( name = "pin_project", actual = "@raze__pin_project__1_0_7//:pin_project", diff --git a/rslib/src/backend/cardrendering.rs b/rslib/src/backend/cardrendering.rs index 
9528b7895..0d501e224 100644 --- a/rslib/src/backend/cardrendering.rs +++ b/rslib/src/backend/cardrendering.rs @@ -10,7 +10,10 @@ use crate::{ notetype::{CardTemplateSchema11, RenderCardOutput}, prelude::*, template::RenderedNode, - text::{extract_av_tags, sanitize_html_no_images, strip_av_tags, AvTag}, + text::{ + decode_iri_paths, encode_iri_paths, extract_av_tags, sanitize_html_no_images, + strip_av_tags, AvTag, + }, }; impl CardRenderingService for Backend { @@ -150,6 +153,14 @@ impl CardRenderingService for Backend { } Ok(text.into()) } + + fn encode_iri_paths(&self, input: pb::String) -> Result { + Ok(encode_iri_paths(&input.val).to_string().into()) + } + + fn decode_iri_paths(&self, input: pb::String) -> Result { + Ok(decode_iri_paths(&input.val).to_string().into()) + } } fn rendered_nodes_to_proto(nodes: Vec) -> Vec { diff --git a/rslib/src/media/check.rs b/rslib/src/media/check.rs index 2d1f4492f..c901583da 100644 --- a/rslib/src/media/check.rs +++ b/rslib/src/media/check.rs @@ -8,9 +8,6 @@ use std::{ path::Path, }; -use lazy_static::lazy_static; -use regex::Regex; - use crate::{ collection::Collection, error::{AnkiError, DbErrorKind, Result}, @@ -25,13 +22,9 @@ use crate::{ MediaManager, }, notes::Note, - text::{extract_media_refs, normalize_to_nfc, MediaRef}, + text::{extract_media_refs, normalize_to_nfc, MediaRef, REMOTE_FILENAME}, }; -lazy_static! { - static ref REMOTE_FILENAME: Regex = Regex::new("(?i)^https?://").unwrap(); -} - #[derive(Debug, PartialEq, Clone)] pub struct MediaCheckOutput { pub unused: Vec, diff --git a/rslib/src/text.rs b/rslib/src/text.rs index 04f5c65b9..f4e375347 100644 --- a/rslib/src/text.rs +++ b/rslib/src/text.rs @@ -4,6 +4,7 @@ use std::{borrow::Cow, ptr}; use lazy_static::lazy_static; +use pct_str::{IriReserved, PctStr, PctString}; use regex::{Captures, Regex}; use unicase::eq as uni_eq; use unicode_normalization::{ @@ -424,6 +425,56 @@ pub(crate) fn matches_glob(text: &str, search: &str) -> bool { } } +lazy_static! 
{ + pub(crate) static ref REMOTE_FILENAME: Regex = Regex::new("(?i)^https?://").unwrap(); +} + +/// IRI-encode unescaped local paths in HTML fragment. +pub(crate) fn encode_iri_paths(unescaped_html: &str) -> Cow { + transform_html_paths(unescaped_html, |fname| { + PctString::encode(fname.chars(), IriReserved::Segment) + .into_string() + .into() + }) +} + +/// URI-decode escaped local paths in HTML fragment. +pub(crate) fn decode_iri_paths(unescaped_html: &str) -> Cow { + transform_html_paths(unescaped_html, |fname| { + match PctStr::new(fname) { + Ok(s) => s.decode().into(), + Err(_e) => { + // invalid percent encoding; return unchanged + fname.into() + } + } + }) +} + +/// Apply a transform to local filename references in tags like IMG. +/// Required at display time, as Anki unfortunately stores the references +/// in unencoded form in the database. +fn transform_html_paths(html: &str, transform: F) -> Cow +where + F: Fn(&str) -> Cow, +{ + HTML_MEDIA_TAGS.replace_all(html, |caps: &Captures| { + let fname = caps + .get(1) + .or_else(|| caps.get(2)) + .or_else(|| caps.get(3)) + .unwrap() + .as_str() + .trim(); + let full = caps.get(0).unwrap().as_str(); + if REMOTE_FILENAME.is_match(fname) { + full.into() + } else { + full.replace(fname, &transform(fname)) + } + }) +} + #[cfg(test)] mod test { use std::borrow::Cow;