Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Store data path hash instead of string in binary #5886

Draft
wants to merge 3 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Binary file modified components/icu/tests/data/tutorial_buffer.wasm
Binary file not shown.
7 changes: 4 additions & 3 deletions provider/baked/src/export.rs
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,7 @@ use databake::*;
use heck::ToShoutySnakeCase;
use heck::ToSnakeCase;
use icu_provider::export::*;
use icu_provider::marker::data_marker_path;
use icu_provider::prelude::*;
use std::collections::{BTreeMap, BTreeSet, HashMap};
use std::fmt::Write as _;
Expand Down Expand Up @@ -781,12 +782,12 @@ impl DataExporter for BakedExporter {
macro_rules! cb {
($($marker:path = $path:literal,)+ #[experimental] $($emarker:path = $epath:literal,)+) => {
fn bake_marker(marker: DataMarkerInfo) -> databake::TokenStream {
if marker.path.as_str() == icu_provider::hello_world::HelloWorldV1Marker::INFO.path.as_str() {
if marker.path == icu_provider::hello_world::HelloWorldV1Marker::INFO.path {
return databake::quote!(icu_provider::hello_world::HelloWorldV1Marker);
}

$(
if marker.path.as_str() == $path {
if marker.path == data_marker_path!($path) {
return stringify!($marker)
.replace("icu :: ", "icu_")
.parse()
Expand All @@ -795,7 +796,7 @@ macro_rules! cb {
)+

$(
if marker.path.as_str() == $epath {
if marker.path == data_marker_path!($epath) {
return stringify!($emarker)
.replace("icu :: ", "icu_")
.parse()
Expand Down
2 changes: 1 addition & 1 deletion provider/core/src/error.rs
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@ impl fmt::Display for DataError {
write!(f, ": {}", self.kind)?;
}
if let Some(marker) = self.marker_path {
write!(f, " (marker: {})", marker.as_str())?;
write!(f, " (marker: {marker:?})")?;
}
if let Some(str_context) = self.str_context {
write!(f, ": {str_context}")?;
Expand Down
94 changes: 26 additions & 68 deletions provider/core/src/marker.rs
Original file line number Diff line number Diff line change
Expand Up @@ -341,6 +341,24 @@ impl AsULE for DataMarkerPathHash {
// Safe since the ULE type is `self`.
unsafe impl EqULE for DataMarkerPathHash {}

/// A [`DataMarkerPathHash`] stored between a fixed byte prefix and a fixed byte suffix.
///
/// `repr(C, packed)` keeps the fields in declaration order with no padding, so the
/// in-memory bytes are the 10-byte prefix, then the hash, then the 1-byte suffix.
/// The constant values are filled in by `TaggedDataMarkerHash::new`.
// NOTE(review): presumably the fixed prefix/suffix exist so that the hash can be located by
// scanning a compiled binary for the tag bytes — confirm against the tooling that reads it.
#[repr(C, packed)]
#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord)]
struct TaggedDataMarkerHash {
    // Leading tag; always the ASCII bytes "icu4x_tdmh" when built through `new`.
    prefix: [u8; 10], // "icu4x_tdmh"
    // The wrapped hash value.
    hash: DataMarkerPathHash,
    // Trailing tag; always a single newline byte when built through `new`.
    suffix: u8, // "\n"
}

impl TaggedDataMarkerHash {
    /// Wraps `hash` between the fixed leading tag `icu4x_tdmh` and the trailing `\n` byte.
    ///
    /// This is a `const fn`, so it can be evaluated while building constants.
    const fn new(hash: DataMarkerPathHash) -> Self {
        // Fixed marker bytes surrounding every hash value.
        const LEADING: [u8; 10] = *b"icu4x_tdmh";
        const TRAILING: u8 = b'\n';

        Self {
            prefix: LEADING,
            hash,
            suffix: TRAILING,
        }
    }
}

/// The string path of a data marker. For example, "foo@1"
///
/// ```
Expand All @@ -358,39 +376,15 @@ unsafe impl EqULE for DataMarkerPathHash {}
/// # use icu_provider::marker::DataMarkerPath;
/// const K: DataMarkerPath = icu_provider::marker::data_marker_path!("foo/../bar@1");
/// ```
#[derive(Debug, Copy, Clone, Eq)]
#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord)]
pub struct DataMarkerPath {
// This string literal is wrapped in leading_tag!() and trailing_tag!() to make it detectable
// in a compiled binary.
tagged: &'static str,
hash: DataMarkerPathHash,
}

impl PartialEq for DataMarkerPath {
#[inline]
fn eq(&self, other: &Self) -> bool {
self.hash == other.hash && self.tagged == other.tagged
}
}

impl Ord for DataMarkerPath {
#[inline]
fn cmp(&self, other: &Self) -> core::cmp::Ordering {
self.tagged.cmp(other.tagged)
}
}

impl PartialOrd for DataMarkerPath {
#[inline]
fn partial_cmp(&self, other: &Self) -> Option<core::cmp::Ordering> {
Some(self.tagged.cmp(other.tagged))
}
hash: TaggedDataMarkerHash,
}

impl core::hash::Hash for DataMarkerPath {
#[inline]
fn hash<H: core::hash::Hasher>(&self, state: &mut H) {
self.hash.hash(state)
self.hash.hash.hash(state)
}
}

Expand Down Expand Up @@ -437,8 +431,9 @@ impl DataMarkerPath {
)
.to_le_bytes(),
);
let hash = TaggedDataMarkerHash::new(hash);

Ok(Self { tagged, hash })
Ok(Self { hash })
}

const fn validate_path_manual_slice(
Expand Down Expand Up @@ -484,18 +479,6 @@ impl DataMarkerPath {
}
}

/// Gets the path as a static string slice.
#[inline]
pub const fn as_str(self) -> &'static str {
unsafe {
// Safe due to invariant that self.path is tagged correctly
core::str::from_utf8_unchecked(core::slice::from_raw_parts(
self.tagged.as_ptr().add(leading_tag!().len()),
self.tagged.len() - trailing_tag!().len() - leading_tag!().len(),
))
}
}

/// Gets a platform-independent hash of a [`DataMarkerPath`].
///
/// The hash is 4 bytes and allows for fast comparison.
Expand All @@ -514,7 +497,7 @@ impl DataMarkerPath {
/// ```
#[inline]
pub const fn hashed(self) -> DataMarkerPathHash {
self.hash
self.hash.hash
}
}

Expand Down Expand Up @@ -642,7 +625,7 @@ pub use __data_marker_path as data_marker_path;

impl fmt::Debug for DataMarkerInfo {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
f.write_str(self.path.as_str())
self.path.fmt(f)
}
}

Expand Down Expand Up @@ -742,31 +725,6 @@ fn test_path_syntax() {
);
}

#[test]
fn test_path_to_string() {
struct TestCase {
pub path: DataMarkerPath,
pub expected: &'static str,
}

for cas in [
TestCase {
path: data_marker_path!("core/cardinal@1"),
expected: "core/cardinal@1",
},
TestCase {
path: data_marker_path!("core/maxlengthsubcatg@1"),
expected: "core/maxlengthsubcatg@1",
},
TestCase {
path: data_marker_path!("core/cardinal@65535"),
expected: "core/cardinal@65535",
},
] {
assert_eq!(cas.expected, cas.path.as_str());
}
}

#[test]
fn test_hash_word_32() {
assert_eq!(0, fxhash_32(b"", 0, 0));
Expand Down Expand Up @@ -807,6 +765,6 @@ fn test_path_hash() {
hash: DataMarkerPathHash([176, 131, 182, 223]),
},
] {
assert_eq!(cas.hash, cas.path.hashed(), "{}", cas.path.as_str());
assert_eq!(cas.hash, cas.path.hashed(), "{:?}", cas.path);
}
}
1 change: 1 addition & 0 deletions provider/fs/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ all-features = true
[dependencies]
displaydoc = { workspace = true }
icu_provider = { workspace = true, features = ["serde", "std"] }
icu_provider_registry = { workspace = true }
serde = { workspace = true, features = ["derive", "alloc"] }
serde-json-core = { workspace = true, features = ["std"] }

Expand Down
56 changes: 56 additions & 0 deletions provider/fs/src/datapath.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).

use icu_provider::marker::*;
use std::collections::HashMap;
use std::sync::OnceLock;

// Callback handed to `icu_provider_registry::registry!`. It receives the list of stable
// `marker = "path"` pairs followed by the `#[experimental]` ones, and expands to a lookup
// function from a marker path's hash back to its string form. Only the path literals are
// used; the marker type paths are accepted but not referenced.
macro_rules! cb {
    ($($marker:path = $path:literal,)+ #[experimental] $($emarker:path = $epath:literal,)+) => {
        /// Returns the string form of `marker`, looked up by its hash.
        ///
        /// The table holds `core/helloworld@1` plus every path supplied to the macro.
        /// It is built on the first call and reused afterwards. `None` is returned when
        /// the hash of `marker` is not in the table.
        pub(crate) fn get_data_marker_path(marker: DataMarkerPath) -> Option<&'static str> {
            static LOOKUP: OnceLock<HashMap<DataMarkerPathHash, &'static str>> = OnceLock::new();
            let table = LOOKUP.get_or_init(|| {
                HashMap::from([
                    (data_marker_path!("core/helloworld@1").hashed(), "core/helloworld@1"),
                    $((data_marker_path!($path).hashed(), $path),)+
                    $((data_marker_path!($epath).hashed(), $epath),)+
                ])
            });
            table.get(&marker.hashed()).copied()
        }
    };
}
icu_provider_registry::registry!(cb);

/// Checks that known marker paths map back to their original string form.
#[test]
fn test_path_to_string() {
    // (marker path, expected string) pairs.
    let cases: [(DataMarkerPath, &'static str); 3] = [
        (data_marker_path!("core/cardinal@1"), "core/cardinal@1"),
        (
            data_marker_path!("core/maxlengthsubcatg@1"),
            "core/maxlengthsubcatg@1",
        ),
        (data_marker_path!("core/cardinal@65535"), "core/cardinal@65535"),
    ];

    for (marker_path, expected) in cases {
        let path = get_data_marker_path(marker_path);
        assert_eq!(Some(expected), path);
    }
}
11 changes: 9 additions & 2 deletions provider/fs/src/export/fs_exporter.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).

use super::serializers::AbstractSerializer;
use crate::datapath::get_data_marker_path;
use crate::manifest::Manifest;
use icu_provider::export::*;
use icu_provider::prelude::*;
Expand Down Expand Up @@ -105,7 +106,10 @@ impl DataExporter for FilesystemExporter {
id: DataIdentifierBorrowed,
obj: &DataPayload<ExportMarker>,
) -> Result<(), DataError> {
let mut path_buf = self.root.join(marker.path.as_str());
let Some(path) = get_data_marker_path(marker.path) else {
return Err(DataErrorKind::MarkerNotFound.with_marker(marker));
};
let mut path_buf = self.root.join(path);
if !id.marker_attributes.is_empty() {
path_buf.push(id.marker_attributes.as_str());
}
Expand Down Expand Up @@ -145,7 +149,10 @@ impl DataExporter for FilesystemExporter {
}

fn flush(&self, marker: DataMarkerInfo, _metadata: FlushMetadata) -> Result<(), DataError> {
let mut path_buf = self.root.join(marker.path.as_str());
let Some(path) = get_data_marker_path(marker.path) else {
return Err(DataErrorKind::MarkerNotFound.with_marker(marker));
};
let mut path_buf = self.root.join(path);

if !marker.is_singleton && !path_buf.exists() {
fs::create_dir_all(&path_buf)
Expand Down
6 changes: 5 additions & 1 deletion provider/fs/src/fs_data_provider.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).

use crate::datapath::get_data_marker_path;
use crate::manifest::Manifest;
use icu_provider::prelude::*;
use icu_provider::DynamicDryDataProvider;
Expand Down Expand Up @@ -68,7 +69,10 @@ impl FsDataProvider {
if marker.is_singleton && !req.id.locale.is_default() {
return Err(DataErrorKind::InvalidRequest.with_req(marker, req));
}
let mut path = self.root.join(marker.path.as_str());
let Some(path) = get_data_marker_path(marker.path) else {
return Err(DataErrorKind::MarkerNotFound.with_req(marker, req));
};
let mut path = self.root.join(path);
if !path.exists() {
return Err(DataErrorKind::MarkerNotFound.with_req(marker, req));
}
Expand Down
1 change: 1 addition & 0 deletions provider/fs/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,7 @@
)]
#![warn(missing_docs)]

mod datapath;
mod fs_data_provider;
mod manifest;

Expand Down
5 changes: 3 additions & 2 deletions provider/registry/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -326,12 +326,13 @@ macro_rules! cb {
#[test]
fn test_paths_correct() {
use icu_provider::prelude::*;
use icu_provider::marker::data_marker_path;

$(
assert_eq!(<$marker>::INFO.path.as_str(), $path);
assert_eq!(<$marker>::INFO.path, data_marker_path!($path));
)+
$(
assert_eq!(<$emarker>::INFO.path.as_str(), $epath);
assert_eq!(<$emarker>::INFO.path, data_marker_path!($epath));
)+
}
}
Expand Down
6 changes: 5 additions & 1 deletion tools/make/depcheck/src/allowlist.rs
Original file line number Diff line number Diff line change
Expand Up @@ -144,7 +144,11 @@ pub const EXTRA_BLOB_DEPS: &[&str] = &["cobs", "icu_provider_blob", "postcard"];
/// This should rarely change
///
/// Keep in sync with Cargo.toml crates.io dependencies.
pub const EXTRA_FS_DEPS: &[&str] = &["icu_provider_fs", "serde-json-core"];
pub const EXTRA_FS_DEPS: &[&str] = &[
"icu_provider_fs",
"icu_provider_registry",
"serde-json-core",
];

/// Dependencies needed by datagen provider (not counting `log` and `zip` deps)
/// This might change semi frequently but we should try and keep this small.
Expand Down
Loading