strip invalid Unicode chars in media check

This commit is contained in:
Damien Elmes 2021-07-17 15:36:43 +10:00
parent cba0735c59
commit c944dd048e
13 changed files with 453 additions and 0 deletions

43
Cargo.lock generated
View file

@ -103,6 +103,7 @@ dependencies = [
"tokio", "tokio",
"tokio-util", "tokio-util",
"unic-langid", "unic-langid",
"unic-ucd-category",
"unicase", "unicase",
"unicode-normalization", "unicode-normalization",
"utime", "utime",
@ -2610,6 +2611,27 @@ version = "1.13.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "879f6906492a7cd215bfa4cf595b600146ccfac0c79bcbd1f3000162af5e8b06" checksum = "879f6906492a7cd215bfa4cf595b600146ccfac0c79bcbd1f3000162af5e8b06"
[[package]]
name = "unic-char-property"
version = "0.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a8c57a407d9b6fa02b4795eb81c5b6652060a15a7903ea981f3d723e6c0be221"
dependencies = [
"unic-char-range",
]
[[package]]
name = "unic-char-range"
version = "0.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0398022d5f700414f6b899e10b8348231abf9173fa93144cbc1a43b9793c1fbc"
[[package]]
name = "unic-common"
version = "0.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "80d7ff825a6a654ee85a63e80f92f054f904f21e7d12da4e22f9834a4aaa35bc"
[[package]] [[package]]
name = "unic-langid" name = "unic-langid"
version = "0.9.0" version = "0.9.0"
@ -2653,6 +2675,27 @@ dependencies = [
"unic-langid-impl", "unic-langid-impl",
] ]
[[package]]
name = "unic-ucd-category"
version = "0.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1b8d4591f5fcfe1bd4453baaf803c40e1b1e69ff8455c47620440b46efef91c0"
dependencies = [
"matches",
"unic-char-property",
"unic-char-range",
"unic-ucd-version",
]
[[package]]
name = "unic-ucd-version"
version = "0.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "96bd2f2237fe450fcd0a1d2f5f4e91711124f7857ba2e964247776ebeeb7b0c4"
dependencies = [
"unic-common",
]
[[package]] [[package]]
name = "unicase" name = "unicase"
version = "2.6.0" version = "2.6.0"

View file

@ -39,3 +39,9 @@ compile_data_attr = "glob([\"src/**/*.der\"])"
[package.metadata.raze.crates.webpki.'*'] [package.metadata.raze.crates.webpki.'*']
compile_data_attr = "glob([\"src/**/*.der\"])" compile_data_attr = "glob([\"src/**/*.der\"])"
[package.metadata.raze.crates.unic-ucd-version.'*']
compile_data_attr = "glob([\"**/*.rsv\"])"
[package.metadata.raze.crates.unic-ucd-category.'*']
compile_data_attr = "glob([\"**/*.rsv\"])"

View file

@ -2611,6 +2611,36 @@ def raze_fetch_remote_crates():
build_file = Label("//cargo/remote:BUILD.typenum-1.13.0.bazel"), build_file = Label("//cargo/remote:BUILD.typenum-1.13.0.bazel"),
) )
maybe(
http_archive,
name = "raze__unic_char_property__0_9_0",
url = "https://crates.io/api/v1/crates/unic-char-property/0.9.0/download",
type = "tar.gz",
sha256 = "a8c57a407d9b6fa02b4795eb81c5b6652060a15a7903ea981f3d723e6c0be221",
strip_prefix = "unic-char-property-0.9.0",
build_file = Label("//cargo/remote:BUILD.unic-char-property-0.9.0.bazel"),
)
maybe(
http_archive,
name = "raze__unic_char_range__0_9_0",
url = "https://crates.io/api/v1/crates/unic-char-range/0.9.0/download",
type = "tar.gz",
sha256 = "0398022d5f700414f6b899e10b8348231abf9173fa93144cbc1a43b9793c1fbc",
strip_prefix = "unic-char-range-0.9.0",
build_file = Label("//cargo/remote:BUILD.unic-char-range-0.9.0.bazel"),
)
maybe(
http_archive,
name = "raze__unic_common__0_9_0",
url = "https://crates.io/api/v1/crates/unic-common/0.9.0/download",
type = "tar.gz",
sha256 = "80d7ff825a6a654ee85a63e80f92f054f904f21e7d12da4e22f9834a4aaa35bc",
strip_prefix = "unic-common-0.9.0",
build_file = Label("//cargo/remote:BUILD.unic-common-0.9.0.bazel"),
)
maybe( maybe(
http_archive, http_archive,
name = "raze__unic_langid__0_9_0", name = "raze__unic_langid__0_9_0",
@ -2651,6 +2681,26 @@ def raze_fetch_remote_crates():
build_file = Label("//cargo/remote:BUILD.unic-langid-macros-impl-0.9.0.bazel"), build_file = Label("//cargo/remote:BUILD.unic-langid-macros-impl-0.9.0.bazel"),
) )
maybe(
http_archive,
name = "raze__unic_ucd_category__0_9_0",
url = "https://crates.io/api/v1/crates/unic-ucd-category/0.9.0/download",
type = "tar.gz",
sha256 = "1b8d4591f5fcfe1bd4453baaf803c40e1b1e69ff8455c47620440b46efef91c0",
strip_prefix = "unic-ucd-category-0.9.0",
build_file = Label("//cargo/remote:BUILD.unic-ucd-category-0.9.0.bazel"),
)
maybe(
http_archive,
name = "raze__unic_ucd_version__0_9_0",
url = "https://crates.io/api/v1/crates/unic-ucd-version/0.9.0/download",
type = "tar.gz",
sha256 = "96bd2f2237fe450fcd0a1d2f5f4e91711124f7857ba2e964247776ebeeb7b0c4",
strip_prefix = "unic-ucd-version-0.9.0",
build_file = Label("//cargo/remote:BUILD.unic-ucd-version-0.9.0.bazel"),
)
maybe( maybe(
http_archive, http_archive,
name = "raze__unicase__2_6_0", name = "raze__unicase__2_6_0",

View file

@ -2375,6 +2375,33 @@
"license_file": null, "license_file": null,
"description": "Typenum is a Rust library for type-level numbers evaluated at compile time. It currently supports bits, unsigned integers, and signed integers. It also provides a type-level array of type-level numbers, but its implementation is incomplete." "description": "Typenum is a Rust library for type-level numbers evaluated at compile time. It currently supports bits, unsigned integers, and signed integers. It also provides a type-level array of type-level numbers, but its implementation is incomplete."
}, },
{
"name": "unic-char-property",
"version": "0.9.0",
"authors": "The UNIC Project Developers",
"repository": "https://github.com/open-i18n/rust-unic/",
"license": "Apache-2.0 OR MIT",
"license_file": null,
"description": "UNIC — Unicode Character Tools — Character Property taxonomy, contracts and build macros"
},
{
"name": "unic-char-range",
"version": "0.9.0",
"authors": "The UNIC Project Developers",
"repository": "https://github.com/open-i18n/rust-unic/",
"license": "Apache-2.0 OR MIT",
"license_file": null,
"description": "UNIC — Unicode Character Tools — Character Range and Iteration"
},
{
"name": "unic-common",
"version": "0.9.0",
"authors": "The UNIC Project Developers",
"repository": "https://github.com/open-i18n/rust-unic/",
"license": "Apache-2.0 OR MIT",
"license_file": null,
"description": "UNIC — Common Utilities"
},
{ {
"name": "unic-langid", "name": "unic-langid",
"version": "0.9.0", "version": "0.9.0",
@ -2411,6 +2438,24 @@
"license_file": null, "license_file": null,
"description": "API for managing Unicode Language Identifiers" "description": "API for managing Unicode Language Identifiers"
}, },
{
"name": "unic-ucd-category",
"version": "0.9.0",
"authors": "The UNIC Project Developers",
"repository": "https://github.com/open-i18n/rust-unic/",
"license": "Apache-2.0 OR MIT",
"license_file": null,
"description": "UNIC — Unicode Character Database — General Category"
},
{
"name": "unic-ucd-version",
"version": "0.9.0",
"authors": "The UNIC Project Developers",
"repository": "https://github.com/open-i18n/rust-unic/",
"license": "Apache-2.0 OR MIT",
"license_file": null,
"description": "UNIC — Unicode Character Database — Version"
},
{ {
"name": "unicase", "name": "unicase",
"version": "2.6.0", "version": "2.6.0",

View file

@ -0,0 +1,60 @@
"""
@generated
cargo-raze crate build file.
DO NOT EDIT! Replaced on runs of cargo-raze
"""
# buildifier: disable=load
load("@bazel_skylib//lib:selects.bzl", "selects")
# buildifier: disable=load
load(
"@rules_rust//rust:rust.bzl",
"rust_binary",
"rust_library",
"rust_test",
)
package(default_visibility = [
# Public for visibility by "@raze__crate__version//" targets.
#
# Prefer access through "//cargo", which limits external
# visibility to explicit Cargo.toml dependencies.
"//visibility:public",
])
licenses([
"notice", # MIT from expression "MIT OR Apache-2.0"
])
# Generated Targets
rust_library(
name = "unic_char_property",
srcs = glob(["**/*.rs"]),
crate_features = [
],
crate_root = "src/lib.rs",
crate_type = "lib",
data = [],
edition = "2018",
rustc_flags = [
"--cap-lints=allow",
],
tags = [
"cargo-raze",
"manual",
],
version = "0.9.0",
# buildifier: leave-alone
deps = [
"@raze__unic_char_range__0_9_0//:unic_char_range",
],
)
# Unsupported target "bool_property_macro" with type "test" omitted
# Unsupported target "enum_property_macro" with type "test" omitted
# Unsupported target "tables_tests" with type "test" omitted

View file

@ -0,0 +1,60 @@
"""
@generated
cargo-raze crate build file.
DO NOT EDIT! Replaced on runs of cargo-raze
"""
# buildifier: disable=load
load("@bazel_skylib//lib:selects.bzl", "selects")
# buildifier: disable=load
load(
"@rules_rust//rust:rust.bzl",
"rust_binary",
"rust_library",
"rust_test",
)
package(default_visibility = [
# Public for visibility by "@raze__crate__version//" targets.
#
# Prefer access through "//cargo", which limits external
# visibility to explicit Cargo.toml dependencies.
"//visibility:public",
])
licenses([
"notice", # MIT from expression "MIT OR Apache-2.0"
])
# Generated Targets
# Unsupported target "benchmarks" with type "bench" omitted
# Unsupported target "macro_use_std_tests" with type "example" omitted
rust_library(
name = "unic_char_range",
srcs = glob(["**/*.rs"]),
crate_features = [
"default",
],
crate_root = "src/lib.rs",
crate_type = "lib",
data = [],
edition = "2018",
rustc_flags = [
"--cap-lints=allow",
],
tags = [
"cargo-raze",
"manual",
],
version = "0.9.0",
# buildifier: leave-alone
deps = [
],
)
# Unsupported target "iter_tests" with type "test" omitted

View file

@ -0,0 +1,54 @@
"""
@generated
cargo-raze crate build file.
DO NOT EDIT! Replaced on runs of cargo-raze
"""
# buildifier: disable=load
load("@bazel_skylib//lib:selects.bzl", "selects")
# buildifier: disable=load
load(
"@rules_rust//rust:rust.bzl",
"rust_binary",
"rust_library",
"rust_test",
)
package(default_visibility = [
# Public for visibility by "@raze__crate__version//" targets.
#
# Prefer access through "//cargo", which limits external
# visibility to explicit Cargo.toml dependencies.
"//visibility:public",
])
licenses([
"notice", # MIT from expression "MIT OR Apache-2.0"
])
# Generated Targets
rust_library(
name = "unic_common",
srcs = glob(["**/*.rs"]),
crate_features = [
"default",
],
crate_root = "src/lib.rs",
crate_type = "lib",
data = [],
edition = "2018",
rustc_flags = [
"--cap-lints=allow",
],
tags = [
"cargo-raze",
"manual",
],
version = "0.9.0",
# buildifier: leave-alone
deps = [
],
)

View file

@ -0,0 +1,64 @@
"""
@generated
cargo-raze crate build file.
DO NOT EDIT! Replaced on runs of cargo-raze
"""
# buildifier: disable=load
load("@bazel_skylib//lib:selects.bzl", "selects")
# buildifier: disable=load
load(
"@rules_rust//rust:rust.bzl",
"rust_binary",
"rust_library",
"rust_test",
)
package(default_visibility = [
# Public for visibility by "@raze__crate__version//" targets.
#
# Prefer access through "//cargo", which limits external
# visibility to explicit Cargo.toml dependencies.
"//visibility:public",
])
licenses([
"notice", # MIT from expression "MIT OR Apache-2.0"
])
# Generated Targets
rust_library(
name = "unic_ucd_category",
srcs = glob(["**/*.rs"]),
crate_features = [
],
crate_root = "src/lib.rs",
crate_type = "lib",
data = [],
compile_data = glob(["**/*.rsv"]),
edition = "2018",
rustc_flags = [
"--cap-lints=allow",
],
tags = [
"cargo-raze",
"manual",
],
version = "0.9.0",
# buildifier: leave-alone
deps = [
"@raze__matches__0_1_8//:matches",
"@raze__unic_char_property__0_9_0//:unic_char_property",
"@raze__unic_char_range__0_9_0//:unic_char_range",
"@raze__unic_ucd_version__0_9_0//:unic_ucd_version",
],
)
# Unsupported target "basic_tests" with type "test" omitted
# Unsupported target "major_category_tests" with type "test" omitted
# Unsupported target "unicode_version_tests" with type "test" omitted

View file

@ -0,0 +1,57 @@
"""
@generated
cargo-raze crate build file.
DO NOT EDIT! Replaced on runs of cargo-raze
"""
# buildifier: disable=load
load("@bazel_skylib//lib:selects.bzl", "selects")
# buildifier: disable=load
load(
"@rules_rust//rust:rust.bzl",
"rust_binary",
"rust_library",
"rust_test",
)
package(default_visibility = [
# Public for visibility by "@raze__crate__version//" targets.
#
# Prefer access through "//cargo", which limits external
# visibility to explicit Cargo.toml dependencies.
"//visibility:public",
])
licenses([
"notice", # MIT from expression "MIT OR Apache-2.0"
])
# Generated Targets
rust_library(
name = "unic_ucd_version",
srcs = glob(["**/*.rs"]),
crate_features = [
],
crate_root = "src/lib.rs",
crate_type = "lib",
data = [],
compile_data = glob(["**/*.rsv"]),
edition = "2018",
rustc_flags = [
"--cap-lints=allow",
],
tags = [
"cargo-raze",
"manual",
],
version = "0.9.0",
# buildifier: leave-alone
deps = [
"@raze__unic_common__0_9_0//:unic_common",
],
)
# Unsupported target "basic_tests" with type "test" omitted

View file

@ -114,6 +114,7 @@ rust_library(
"//rslib/cargo:unic_langid", "//rslib/cargo:unic_langid",
"//rslib/cargo:unicase", "//rslib/cargo:unicase",
"//rslib/cargo:unicode_normalization", "//rslib/cargo:unicode_normalization",
"//rslib/cargo:unic_ucd_category",
"//rslib/cargo:utime", "//rslib/cargo:utime",
"//rslib/cargo:zip", "//rslib/cargo:zip",
"//rslib/cargo:pct_str", "//rslib/cargo:pct_str",

View file

@ -93,3 +93,4 @@ fnv = "1.0.7"
strum = { version = "0.21.0", features = ["derive"] } strum = { version = "0.21.0", features = ["derive"] }
tokio-util = { version = "0.6.7", features = ["io"] } tokio-util = { version = "0.6.7", features = ["io"] }
pct-str = { git="https://github.com/timothee-haudebourg/pct-str.git", rev="4adccd8d4a222ab2672350a102f06ae832a0572d" } pct-str = { git="https://github.com/timothee-haudebourg/pct-str.git", rev="4adccd8d4a222ab2672350a102f06ae832a0572d" }
unic-ucd-category = "0.9.0"

View file

@ -453,6 +453,15 @@ alias(
], ],
) )
alias(
name = "unic_ucd_category",
actual = "@raze__unic_ucd_category__0_9_0//:unic_ucd_category",
tags = [
"cargo-raze",
"manual",
],
)
alias( alias(
name = "unicase", name = "unicase",
actual = "@raze__unicase__2_6_0//:unicase", actual = "@raze__unicase__2_6_0//:unicase",

View file

@ -12,6 +12,7 @@ use std::{
use lazy_static::lazy_static; use lazy_static::lazy_static;
use regex::Regex; use regex::Regex;
use sha1::Sha1; use sha1::Sha1;
use unic_ucd_category::GeneralCategory;
use unicode_normalization::{is_nfc, UnicodeNormalization}; use unicode_normalization::{is_nfc, UnicodeNormalization};
use crate::{ use crate::{
@ -70,6 +71,8 @@ fn disallowed_char(char: char) -> bool {
match char { match char {
'[' | ']' | '<' | '>' | ':' | '"' | '/' | '?' | '*' | '^' | '\\' | '|' => true, '[' | ']' | '<' | '>' | ':' | '"' | '/' | '?' | '*' | '^' | '\\' | '|' => true,
c if c.is_ascii_control() => true, c if c.is_ascii_control() => true,
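// macOS rejects filenames containing unassigned Unicode code points (e.g. U+05F8), so treat them as disallowed.
// NOTE(review): "unassigned" is relative to the Unicode tables bundled with unic-ucd-category 0.9.0; code points assigned in later Unicode versions may also match — confirm.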
c if GeneralCategory::of(c) == GeneralCategory::Unassigned => true,
_ => false, _ => false,
} }
} }