Skip to content

Commit

Permalink
Checkpoint: switch to ByteStr throughout builder API
Browse files Browse the repository at this point in the history
  • Loading branch information
sffc committed Sep 29, 2024
1 parent 0c85baf commit efcbf77
Show file tree
Hide file tree
Showing 14 changed files with 240 additions and 209 deletions.
14 changes: 8 additions & 6 deletions utils/zerotrie/benches/overview.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,10 @@ use zerotrie::ZeroTrieSimpleAscii;
use zerovec::ZeroHashMap;
#[cfg(feature = "bench")]
use zerovec::ZeroMap;
use zerotrie::ByteStr;

// Shared fixtures, textually included from the test data file.
mod testdata {
// `include!` expands the file inside this module, so every name the file uses
// must be in scope here. Presumably data.rs refers to `ByteStr` — confirm.
use zerotrie::ByteStr;
include!("../tests/data/data.rs");
}

Expand Down Expand Up @@ -137,7 +139,7 @@ fn get_subtags_bench_large(c: &mut Criterion) {
fn get_subtags_bench_helper<M: criterion::measurement::Measurement>(
mut g: criterion::BenchmarkGroup<M>,
strings: &[&str],
litemap: LiteMap<&[u8], usize>,
litemap: LiteMap<&ByteStr, usize>,
) {
g.bench_function("SimpleAscii", |b| {
let trie = ZeroTrieSimpleAscii::try_from(&litemap).unwrap();
Expand Down Expand Up @@ -171,7 +173,7 @@ fn get_subtags_bench_helper<M: criterion::measurement::Measurement>(

#[cfg(feature = "bench")]
g.bench_function("ZeroMap/usize", |b| {
let zm: ZeroMap<[u8], usize> = litemap.iter().map(|(a, b)| (*a, b)).collect();
let zm: ZeroMap<[u8], usize> = litemap.iter().map(|(a, b)| (a.as_bytes(), b)).collect();
b.iter(|| {
for (i, key) in black_box(strings).iter().enumerate() {
let actual = black_box(&zm).get_copied(key.as_bytes());
Expand All @@ -182,7 +184,7 @@ fn get_subtags_bench_helper<M: criterion::measurement::Measurement>(

#[cfg(feature = "bench")]
g.bench_function("ZeroMap/u8", |b| {
let zm: ZeroMap<[u8], u8> = litemap.iter().map(|(k, v)| (*k, *v as u8)).collect();
let zm: ZeroMap<[u8], u8> = litemap.iter().map(|(k, v)| (k.as_bytes(), *v as u8)).collect();
b.iter(|| {
for (i, key) in black_box(strings).iter().enumerate() {
let actual = black_box(&zm).get_copied(key.as_bytes());
Expand All @@ -193,7 +195,7 @@ fn get_subtags_bench_helper<M: criterion::measurement::Measurement>(

#[cfg(feature = "bench")]
g.bench_function("HashMap", |b| {
let hm: HashMap<&[u8], usize> = litemap.iter().map(|(a, b)| (*a, *b)).collect();
let hm: HashMap<&[u8], usize> = litemap.iter().map(|(a, b)| (a.as_bytes(), *b)).collect();
b.iter(|| {
for (i, key) in black_box(strings).iter().enumerate() {
let actual = black_box(&hm).get(key.as_bytes());
Expand All @@ -206,7 +208,7 @@ fn get_subtags_bench_helper<M: criterion::measurement::Measurement>(
g.bench_function("ZeroHashMap/usize", |b| {
let zhm: ZeroHashMap<[u8], usize> = litemap
.iter()
.map(|(a, b)| (*a, b))
.map(|(a, b)| (a.as_bytes(), b))
.collect();
b.iter(|| {
for (i, key) in black_box(strings).iter().enumerate() {
Expand All @@ -220,7 +222,7 @@ fn get_subtags_bench_helper<M: criterion::measurement::Measurement>(

#[cfg(feature = "bench")]
g.bench_function("ZeroHashMap/u8", |b| {
let zhm: ZeroHashMap<[u8], u8> = litemap.iter().map(|(k, v)| (*k, *v as u8)).collect();
let zhm: ZeroHashMap<[u8], u8> = litemap.iter().map(|(k, v)| (k.as_bytes(), *v as u8)).collect();
b.iter(|| {
for (i, key) in black_box(strings).iter().enumerate() {
let actual = black_box(&zhm).get(key.as_bytes()).copied();
Expand Down
72 changes: 51 additions & 21 deletions utils/zerotrie/src/builder/bytestr.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,63 +3,77 @@
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).

use crate::comparison;
use core::borrow::Borrow;
use core::cmp::Ordering;
use core::fmt;

#[cfg(feature = "serde")]
use alloc::boxed::Box;

/// A struct transparent over `[u8]` with convenient helper functions.
/// A string key in a ZeroTrie.
///
/// This type has a custom Ord impl, making it suitable for use in a sorted
/// map for ZeroTrie construction.
#[repr(transparent)]
#[derive(PartialEq, Eq, PartialOrd, Ord)]
pub(crate) struct ByteStr([u8]);
#[derive(PartialEq, Eq)]
pub struct ByteStr([u8]);

impl ByteStr {
pub const fn from_byte_slice_with_value<'a, 'l>(
/// Reinterprets a slice of `(&[u8], usize)` tuples as a slice of
/// `(&ByteStr, usize)` tuples, in place and without copying.
#[inline]
pub(crate) const fn from_byte_slice_with_value<'a, 'l>(
input: &'l [(&'a [u8], usize)],
) -> &'l [(&'a ByteStr, usize)] {
// Safety: [u8] and ByteStr have the same layout and invariants
// NOTE(review): this also relies on `(&[u8], usize)` and `(&ByteStr, usize)`
// sharing a layout; Rust does not formally specify tuple layout — confirm
// this assumption is acceptable.
unsafe { core::mem::transmute(input) }
}

pub const fn from_str_slice_with_value<'a, 'l>(
/// Reinterprets a slice of `(&str, usize)` tuples as a slice of
/// `(&ByteStr, usize)` tuples, in place and without copying.
#[inline]
pub(crate) const fn from_str_slice_with_value<'a, 'l>(
input: &'l [(&'a str, usize)],
) -> &'l [(&'a ByteStr, usize)] {
// Safety: str and ByteStr have the same layout, and ByteStr is less restrictive
// NOTE(review): this also relies on `(&str, usize)` and `(&ByteStr, usize)`
// sharing a layout; Rust does not formally specify tuple layout — confirm
// this assumption is acceptable.
unsafe { core::mem::transmute(input) }
}

pub fn from_bytes(input: &[u8]) -> &Self {
/// Casts a `&[u8]` to a `&ByteStr`
///
/// The returned reference borrows the same bytes as `input`; nothing is copied.
#[inline]
pub const fn from_bytes(input: &[u8]) -> &Self {
// Safety: [u8] and ByteStr have the same layout and invariants
// (ByteStr is declared `#[repr(transparent)]` over `[u8]`).
unsafe { core::mem::transmute(input) }
}

#[cfg(feature = "serde")]
pub fn from_boxed_bytes(input: Box<[u8]>) -> Box<Self> {
/// Casts a `Box<[u8]>` to a `Box<ByteStr>`
///
/// The allocation is reused as-is; no bytes are copied.
///
/// The signature names `alloc::boxed::Box` by its full path rather than
/// relying on the file-level `use alloc::boxed::Box`, because that import is
/// gated on the `serde` feature while this function is gated on `alloc`.
#[cfg(feature = "alloc")]
#[inline]
pub const fn from_boxed_bytes(input: alloc::boxed::Box<[u8]>) -> alloc::boxed::Box<Self> {
    // Safety: [u8] and ByteStr have the same layout and invariants
    unsafe { core::mem::transmute(input) }
}

#[allow(dead_code)] // may want this in the future
pub fn from_str(input: &str) -> &Self {
/// Views a `&str` as a `&ByteStr` over the string's bytes
pub const fn from_str(input: &str) -> &Self {
    let bytes = input.as_bytes();
    Self::from_bytes(bytes)
}

#[allow(dead_code)] // may want this in the future
pub fn empty() -> &'static Self {
/// Returns a `&'static ByteStr` containing zero bytes
pub const fn empty() -> &'static Self {
    Self::from_bytes(b"")
}

#[allow(dead_code)] // not used in all features
/// Returns this ByteStr as a byte slice
///
/// This is a borrow of the wrapped `[u8]`; nothing is copied.
pub const fn as_bytes(&self) -> &[u8] {
&self.0
}

/// Returns `true` when this ByteStr contains no bytes
pub const fn is_empty(&self) -> bool {
    self.0.is_empty()
}

/// Returns the length of this ByteStr, counted in bytes
pub const fn len(&self) -> usize {
    self.as_bytes().len()
}

#[allow(dead_code)] // not used in all features
/// Whether the ByteStr is all ASCII-range
pub fn is_all_ascii(&self) -> bool {
for byte in self.0.iter() {
if !byte.is_ascii() {
Expand Down Expand Up @@ -111,16 +125,18 @@ impl ByteStr {
}
}

impl Borrow<[u8]> for ByteStr {
fn borrow(&self) -> &[u8] {
// Note: Does NOT impl Borrow<[u8]> because the Ord impls differ.
// AsRef is okay to implement.

/// Cheap view of the underlying bytes. `AsRef`, unlike `Borrow`, carries no
/// requirement that `Ord` agrees between `ByteStr` and `[u8]`.
impl AsRef<[u8]> for ByteStr {
    fn as_ref(&self) -> &[u8] {
        &self.0
    }
}

#[cfg(feature = "alloc")]
impl Borrow<[u8]> for alloc::boxed::Box<ByteStr> {
fn borrow(&self) -> &[u8] {
self.as_bytes()
// Identity conversion, so that generic code bounded on `AsRef<ByteStr>` also
// accepts a `ByteStr` itself.
impl AsRef<ByteStr> for ByteStr {
fn as_ref(&self) -> &ByteStr {
self
}
}

Expand All @@ -133,3 +149,17 @@ impl fmt::Debug for ByteStr {
}
}
}

/// Total order over ByteStrs, delegated to `comparison::cmp_slices` on the
/// underlying bytes instead of being derived from `[u8]`.
impl Ord for ByteStr {
    #[inline]
    fn cmp(&self, other: &Self) -> Ordering {
        crate::comparison::cmp_slices(self.as_bytes(), other.as_bytes())
    }
}

/// Always `Some`: defers to the `Ord` impl so the two orderings cannot disagree.
impl PartialOrd for ByteStr {
    #[inline]
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        Some(Ord::cmp(self, other))
    }
}
18 changes: 8 additions & 10 deletions utils/zerotrie/src/builder/litemap.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,33 +15,31 @@ use litemap::LiteMap;

impl ZeroTrieSimpleAscii<Vec<u8>> {
#[doc(hidden)]
pub fn try_from_litemap_with_const_builder<'a, S>(
items: &LiteMap<&'a [u8], usize, S>,
pub fn try_from_litemap_with_const_builder<'a, 'b, S>(
items: &'a LiteMap<&'b ByteStr, usize, S>,
) -> Result<Self, ZeroTrieBuildError>
where
S: litemap::store::StoreSlice<&'a [u8], usize, Slice = [(&'a [u8], usize)]>,
S: litemap::store::StoreSlice<&'b ByteStr, usize, Slice = [(&'b ByteStr, usize)]>,
{
let tuples = items.as_slice();
let byte_str_slice = ByteStr::from_byte_slice_with_value(tuples);
let byte_str_slice = items.as_slice();
ZeroTrieBuilderConst::<10000>::from_sorted_const_tuple_slice::<100>(byte_str_slice.into())
.map(|s| Self {
store: s.as_bytes().to_vec(),
})
}
}

impl<'a, K, S> TryFrom<&'a LiteMap<K, usize, S>> for ZeroTrie<Vec<u8>>
impl<'a, 'b, K, S> TryFrom<&'a LiteMap<K, usize, S>> for ZeroTrie<Vec<u8>>
where
// Borrow, not AsRef, because we rely on Ord being the same. Unfortunately
// this means `LiteMap<&str, usize>` does not work.
K: Borrow<[u8]>,
K: Borrow<ByteStr>,
S: litemap::store::StoreSlice<K, usize, Slice = [(K, usize)]>,
{
type Error = ZeroTrieBuildError;
fn try_from(items: &LiteMap<K, usize, S>) -> Result<Self, ZeroTrieBuildError> {
let byte_litemap = items.to_borrowed_keys::<[u8], Vec<_>>();
let byte_slice = byte_litemap.as_slice();
let byte_str_slice = ByteStr::from_byte_slice_with_value(byte_slice);
let byte_litemap = items.to_borrowed_keys::<ByteStr, Vec<_>>();
let byte_str_slice = byte_litemap.as_slice();
Self::try_from_tuple_slice(byte_str_slice)
}
}
Expand Down
2 changes: 1 addition & 1 deletion utils/zerotrie/src/builder/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -152,7 +152,7 @@ mod litemap;
#[cfg(feature = "alloc")]
pub(crate) mod nonconst;

use bytestr::ByteStr;
pub use bytestr::ByteStr;

use super::ZeroTrieSimpleAscii;

Expand Down
34 changes: 15 additions & 19 deletions utils/zerotrie/src/builder/nonconst/builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,9 @@ use super::store::NonConstLengthsStack;
use super::store::TrieBuilderStore;
use crate::builder::bytestr::ByteStr;
use crate::byte_phf::PerfectByteHashMapCacheOwned;
use crate::comparison;
use crate::error::ZeroTrieBuildError;
use crate::options::*;
use crate::varint;
use alloc::borrow::Cow;
use alloc::vec::Vec;

/// A low-level builder for ZeroTrie. Supports all options.
Expand Down Expand Up @@ -102,11 +100,10 @@ impl<S: TrieBuilderStore> ZeroTrieBuilder<S> {
let items = Vec::<(K, usize)>::from_iter(iter);
let mut items = items
.iter()
.map(|(k, v)| (k.as_ref(), *v))
.collect::<Vec<(&[u8], usize)>>();
items.sort_by(|a, b| cmp_keys_values(&options, *a, *b));
let ascii_str_slice = items.as_slice();
let byte_str_slice = ByteStr::from_byte_slice_with_value(ascii_str_slice);
.map(|(k, v)| (ByteStr::from_bytes(k.as_ref()), *v))
.collect::<Vec<(&ByteStr, usize)>>();
items.sort_by(|a, b| cmp_keys_values(*a, *b));
let byte_str_slice = items.as_slice();
Self::from_sorted_tuple_slice(byte_str_slice, options)
}

Expand All @@ -121,12 +118,14 @@ impl<S: TrieBuilderStore> ZeroTrieBuilder<S> {
options: ZeroTrieBuilderOptions,
) -> Result<Self, ZeroTrieBuildError> {
for ab in items.windows(2) {
debug_assert!(cmp_keys_values(
&options,
(ab[0].0.as_bytes(), ab[0].1),
(ab[1].0.as_bytes(), ab[1].1)
)
.is_lt(), "{ab:?}");
debug_assert!(
cmp_keys_values(
(&ab[0].0, ab[0].1),
(&ab[1].0, ab[1].1)
)
.is_lt(),
"{ab:?}"
);
}
let mut result = Self {
data: S::atbs_new_empty(),
Expand Down Expand Up @@ -389,11 +388,8 @@ impl<S: TrieBuilderStore> ZeroTrieBuilder<S> {
}

fn cmp_keys_values(
options: &ZeroTrieBuilderOptions,
a: (&[u8], usize),
b: (&[u8], usize),
a: (&ByteStr, usize),
b: (&ByteStr, usize),
) -> Ordering {
let a_iter = a.0.iter().copied().map(comparison::shift);
let b_iter = b.0.iter().copied().map(comparison::shift);
Iterator::cmp(a_iter, b_iter).then_with(|| a.1.cmp(&b.1))
a.0.cmp(b.0).then_with(|| a.1.cmp(&b.1))
}
7 changes: 7 additions & 0 deletions utils/zerotrie/src/comparison.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,13 @@ pub(crate) fn cmpi(a: u8, b: u8) -> Ordering {
shift(a.to_ascii_lowercase()).cmp(&shift(b.to_ascii_lowercase()))
}

/// Lexicographically compares two byte slices after passing every byte
/// through `shift`, so the resulting order is the one `shift` defines rather
/// than raw byte order.
#[inline]
pub(crate) fn cmp_slices(a: &[u8], b: &[u8]) -> Ordering {
    let shifted_rhs = b.iter().copied().map(shift);
    a.iter().copied().map(shift).cmp(shifted_rhs)
}

#[test]
fn test_basic_cmp() {
let mut all_bytes = (0u8..=255u8).collect::<Vec<_>>();
Expand Down
1 change: 1 addition & 0 deletions utils/zerotrie/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,7 @@ pub use crate::zerotrie::ZeroTrieExtendedCapacity;
pub use crate::zerotrie::ZeroTriePerfectHash;
pub use crate::zerotrie::ZeroTrieSimpleAscii;
pub use error::ZeroTrieBuildError;
pub use builder::ByteStr;

#[cfg(feature = "alloc")]
pub use crate::zerotrie::ZeroTrieStringIterator;
Expand Down
1 change: 1 addition & 0 deletions utils/zerotrie/src/serde.rs
Original file line number Diff line number Diff line change
Expand Up @@ -359,6 +359,7 @@ where

// Shared fixtures, textually included from the test data file (test builds only).
#[cfg(test)]
mod testdata {
// `include!` expands the file inside this module, so every name the file uses
// must be in scope here. Presumably data.rs refers to `ByteStr` — confirm.
use crate::ByteStr;
include!("../tests/data/data.rs");
}

Expand Down
12 changes: 5 additions & 7 deletions utils/zerotrie/src/zerotrie.rs
Original file line number Diff line number Diff line change
Expand Up @@ -483,17 +483,16 @@ macro_rules! impl_zerotrie_subtype {
#[cfg(feature = "litemap")]
impl<'a, K, S> TryFrom<&'a LiteMap<K, usize, S>> for $name<Vec<u8>>
where
K: Borrow<[u8]>,
K: Borrow<ByteStr>,
S: litemap::store::StoreIterable<'a, K, usize>,
{
type Error = crate::error::ZeroTrieBuildError;
fn try_from(map: &'a LiteMap<K, usize, S>) -> Result<Self, Self::Error> {
let tuples: Vec<(&[u8], usize)> = map
let byte_str_slice: Vec<(&ByteStr, usize)> = map
.iter()
.map(|(k, v)| (k.borrow(), *v))
.collect();
let byte_str_slice = ByteStr::from_byte_slice_with_value(&tuples);
Self::try_from_tuple_slice(byte_str_slice)
Self::try_from_tuple_slice(&byte_str_slice)
}
}
#[cfg(feature = "litemap")]
Expand Down Expand Up @@ -799,11 +798,10 @@ where
fn from_iter<T: IntoIterator<Item = (K, usize)>>(iter: T) -> Self {
// We need two Vecs because the first one anchors the `K`s that the second one borrows.
let items = Vec::from_iter(iter);
let mut items: Vec<(&[u8], usize)> = items.iter().map(|(k, v)| (k.as_ref(), *v)).collect();
let mut items: Vec<(&ByteStr, usize)> = items.iter().map(|(k, v)| (ByteStr::from_bytes(k.as_ref()), *v)).collect();
items.sort();
let byte_str_slice = ByteStr::from_byte_slice_with_value(&items);
#[allow(clippy::unwrap_used)] // FromIterator is panicky
Self::try_from_tuple_slice(byte_str_slice).unwrap()
Self::try_from_tuple_slice(&items).unwrap()
}
}

Expand Down
1 change: 1 addition & 0 deletions utils/zerotrie/tests/asciitrie_test.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ use zerotrie::ZeroTrieSimpleAscii;
use zerovec::ZeroMap;

// Shared fixtures, textually included from the test data file.
mod testdata {
// `include!` expands the file inside this module, so every name the file uses
// must be in scope here. Presumably data.rs refers to `ByteStr` — confirm.
use zerotrie::ByteStr;
include!("data/data.rs");
}

Expand Down
Loading

0 comments on commit efcbf77

Please sign in to comment.