Skip to content

Commit

Permalink
ZeroTrieSimpleAscii internals and non-ASCII behavior docs (unicode-or…
Browse files Browse the repository at this point in the history
  • Loading branch information
sffc authored Jan 24, 2024
1 parent 27322e0 commit 993fa4f
Show file tree
Hide file tree
Showing 3 changed files with 44 additions and 21 deletions.
21 changes: 20 additions & 1 deletion experimental/zerotrie/src/cursor.rs
Original file line number Diff line number Diff line change
Expand Up @@ -118,9 +118,28 @@ impl<'a> ZeroTrieSimpleAsciiCursor<'a> {
/// cursor.step(b'y');
/// assert_eq!(cursor.take_value(), None); // "abcdxy"
/// ```
///
/// If the byte is not ASCII, the cursor will become empty:
///
/// ```
/// use zerotrie::ZeroTrieSimpleAscii;
///
/// // A trie with two values: "abc" and "abcdef"
/// let trie = ZeroTrieSimpleAscii::from_bytes(b"abc\x80def\x81");
///
/// let mut cursor = trie.cursor();
/// assert_eq!(cursor.take_value(), None); // ""
/// cursor.step(b'a');
/// assert_eq!(cursor.take_value(), None); // "a"
/// cursor.step(b'b');
/// assert_eq!(cursor.take_value(), None); // "ab"
/// cursor.step(b'\xFF');
/// assert!(cursor.is_empty());
/// assert_eq!(cursor.take_value(), None);
/// ```
#[inline]
pub fn step(&mut self, byte: u8) {
step_bsearch_only(&mut self.trie.store, byte)
step_ascii_bsearch_only(&mut self.trie.store, byte)
}

/// Takes the value at the current position.
Expand Down
31 changes: 12 additions & 19 deletions experimental/zerotrie/src/reader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@
//! Here is an example ZeroTrie without branch nodes:
//!
//! ```
//! use zerotrie::ZeroTrieSimpleAscii;
//! use zerotrie::ZeroTriePerfectHash;
//!
//! let bytes = [
//! b'a', // ASCII literal
Expand All @@ -44,7 +44,7 @@
//! 0b10000100, // value 4
//! ];
//!
//! let trie = ZeroTrieSimpleAscii::from_bytes(&bytes);
//! let trie = ZeroTriePerfectHash::from_bytes(&bytes);
//!
//! // First value: "a" → 10
//! assert_eq!(trie.get(b"a"), Some(10));
Expand Down Expand Up @@ -303,15 +303,20 @@ fn byte_type(b: u8) -> NodeType {
// | subtags_10pct | ~9.5557 µs | ~4.8696 µs | ~9.5779 µs | ~4.5649 µs |
// | subtags_full | ~137.75 µs | ~76.016 µs | ~142.02 µs | ~70.254 µs |

/// Query the trie assuming all branch nodes are binary search.
pub fn get_bsearch_only(mut trie: &[u8], mut ascii: &[u8]) -> Option<usize> {
/// Query the trie assuming all branch nodes are binary search
/// and there are no span nodes.
pub fn get_ascii_bsearch_only(mut trie: &[u8], mut ascii: &[u8]) -> Option<usize> {
loop {
let (b, x, i, search);
(b, trie) = trie.split_first()?;
let byte_type = byte_type(*b);
(x, trie) = match byte_type {
NodeType::Ascii => (0, trie),
NodeType::Span | NodeType::Value => read_varint_meta3(*b, trie),
NodeType::Span => {
debug_assert!(false, "Span node found in ASCII trie!");
return None;
}
NodeType::Value => read_varint_meta3(*b, trie),
NodeType::Branch => read_varint_meta2(*b, trie),
};
if let Some((c, temp)) = ascii.split_first() {
Expand All @@ -329,18 +334,6 @@ pub fn get_bsearch_only(mut trie: &[u8], mut ascii: &[u8]) -> Option<usize> {
// Value node, but not at end of string
continue;
}
if matches!(byte_type, NodeType::Span) {
let (trie_span, ascii_span);
(trie_span, trie) = trie.debug_split_at(x);
(ascii_span, ascii) = ascii.maybe_split_at(x)?;
if trie_span == ascii_span {
// Matched a byte span
continue;
} else {
// Byte span that doesn't match
return None;
}
}
// Branch node
let (x, w) = if x >= 256 { (x & 0xff, x >> 8) } else { (x, 0) };
// See comment above regarding this assertion
Expand Down Expand Up @@ -509,7 +502,7 @@ pub fn get_phf_extended(mut trie: &[u8], mut ascii: &[u8]) -> Option<usize> {
///
/// The input-output argument `trie` starts at the original trie and ends pointing to
/// the sub-trie reachable by `c`.
pub(crate) fn step_bsearch_only(trie: &mut &[u8], c: u8) {
pub(crate) fn step_ascii_bsearch_only(trie: &mut &[u8], c: u8) {
let (mut b, x, search);
loop {
(b, *trie) = match trie.split_first() {
Expand Down Expand Up @@ -537,7 +530,7 @@ pub(crate) fn step_bsearch_only(trie: &mut &[u8], c: u8) {
NodeType::Span => {
// Question: Should we put the trie back into a valid state?
// Currently this code is unreachable so let's not worry about it.
debug_assert!(false, "span nodes not supported in stepping");
debug_assert!(false, "Span node found in ASCII trie!");
return;
}
NodeType::Value => {
Expand Down
13 changes: 12 additions & 1 deletion experimental/zerotrie/src/zerotrie.rs
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,17 @@ pub(crate) enum ZeroTrieFlavor<Store> {
///
/// # Ok::<_, zerotrie::ZeroTrieError>(())
/// ```
///
/// The trie can only store ASCII bytes; a lookup key containing any non-ASCII byte always returns `None`:
///
/// ```
/// use zerotrie::ZeroTrieSimpleAscii;
///
/// // A trie with two values: "abc" and "abcdef"
/// let trie = ZeroTrieSimpleAscii::from_bytes(b"abc\x80def\x81");
///
/// assert!(matches!(trie.get(b"ab\xFF"), None));
/// ```
#[repr(transparent)]
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq)]
#[cfg_attr(feature = "databake", derive(databake::Bake), databake(path = zerotrie))]
Expand Down Expand Up @@ -543,7 +554,7 @@ fn string_to_box_u8(input: String) -> Box<[u8]> {
impl_zerotrie_subtype!(
ZeroTrieSimpleAscii,
SimpleAscii,
get_bsearch_only,
get_ascii_bsearch_only,
String,
get_iter_ascii_or_panic,
string_to_box_u8
Expand Down

0 comments on commit 993fa4f

Please sign in to comment.