diff --git a/experimental/zerotrie/src/cursor.rs b/experimental/zerotrie/src/cursor.rs index a7a88645415..40327d7e85d 100644 --- a/experimental/zerotrie/src/cursor.rs +++ b/experimental/zerotrie/src/cursor.rs @@ -118,9 +118,28 @@ impl<'a> ZeroTrieSimpleAsciiCursor<'a> { /// cursor.step(b'y'); /// assert_eq!(cursor.take_value(), None); // "abcdxy" /// ``` + /// + /// If the byte is not ASCII, the cursor will become empty: + /// + /// ``` + /// use zerotrie::ZeroTrieSimpleAscii; + /// + /// // A trie with two values: "abc" and "abcdef" + /// let trie = ZeroTrieSimpleAscii::from_bytes(b"abc\x80def\x81"); + /// + /// let mut cursor = trie.cursor(); + /// assert_eq!(cursor.take_value(), None); // "" + /// cursor.step(b'a'); + /// assert_eq!(cursor.take_value(), None); // "a" + /// cursor.step(b'b'); + /// assert_eq!(cursor.take_value(), None); // "ab" + /// cursor.step(b'\xFF'); + /// assert!(cursor.is_empty()); + /// assert_eq!(cursor.take_value(), None); + /// ``` #[inline] pub fn step(&mut self, byte: u8) { - step_bsearch_only(&mut self.trie.store, byte) + step_ascii_bsearch_only(&mut self.trie.store, byte) } /// Takes the value at the current position. diff --git a/experimental/zerotrie/src/reader.rs b/experimental/zerotrie/src/reader.rs index bc5df09808e..7d2c65f811a 100644 --- a/experimental/zerotrie/src/reader.rs +++ b/experimental/zerotrie/src/reader.rs @@ -31,7 +31,7 @@ //! Here is an example ZeroTrie without branch nodes: //! //! ``` -//! use zerotrie::ZeroTrieSimpleAscii; +//! use zerotrie::ZeroTriePerfectHash; //! //! let bytes = [ //! b'a', // ASCII literal @@ -44,7 +44,7 @@ //! 0b10000100, // value 4 //! ]; //! -//! let trie = ZeroTrieSimpleAscii::from_bytes(&bytes); +//! let trie = ZeroTriePerfectHash::from_bytes(&bytes); //! //! // First value: "a" → 10 //! assert_eq!(trie.get(b"a"), Some(10)); @@ -303,15 +303,20 @@ fn byte_type(b: u8) -> NodeType { // | subtags_10pct | ~9.5557 µs | ~4.8696 µs | ~9.5779 µs | ~4.5649 µs | // | subtags_full | ~137.75 µs | ~76.016 µs | ~142.02 µs | ~70.254 µs | -/// Query the trie assuming all branch nodes are binary search. -pub fn get_bsearch_only(mut trie: &[u8], mut ascii: &[u8]) -> Option { +/// Query the trie assuming all branch nodes are binary search +/// and there are no span nodes. +pub fn get_ascii_bsearch_only(mut trie: &[u8], mut ascii: &[u8]) -> Option { loop { let (b, x, i, search); (b, trie) = trie.split_first()?; let byte_type = byte_type(*b); (x, trie) = match byte_type { NodeType::Ascii => (0, trie), - NodeType::Span | NodeType::Value => read_varint_meta3(*b, trie), + NodeType::Span => { + debug_assert!(false, "Span node found in ASCII trie!"); + return None; + } + NodeType::Value => read_varint_meta3(*b, trie), NodeType::Branch => read_varint_meta2(*b, trie), }; if let Some((c, temp)) = ascii.split_first() { @@ -329,18 +334,6 @@ pub fn get_bsearch_only(mut trie: &[u8], mut ascii: &[u8]) -> Option { // Value node, but not at end of string continue; } - if matches!(byte_type, NodeType::Span) { - let (trie_span, ascii_span); - (trie_span, trie) = trie.debug_split_at(x); - (ascii_span, ascii) = ascii.maybe_split_at(x)?; - if trie_span == ascii_span { - // Matched a byte span - continue; - } else { - // Byte span that doesn't match - return None; - } - } // Branch node let (x, w) = if x >= 256 { (x & 0xff, x >> 8) } else { (x, 0) }; // See comment above regarding this assertion @@ -509,7 +502,7 @@ pub fn get_phf_extended(mut trie: &[u8], mut ascii: &[u8]) -> Option { /// /// The input-output argument `trie` starts at the original trie and ends pointing to /// the sub-trie reachable by `c`. -pub(crate) fn step_bsearch_only(trie: &mut &[u8], c: u8) { +pub(crate) fn step_ascii_bsearch_only(trie: &mut &[u8], c: u8) { let (mut b, x, search); loop { (b, *trie) = match trie.split_first() { @@ -537,7 +530,7 @@ pub(crate) fn step_bsearch_only(trie: &mut &[u8], c: u8) { NodeType::Span => { // Question: Should we put the trie back into a valid state? // Currently this code is unreachable so let's not worry about it. - debug_assert!(false, "span nodes not supported in stepping"); + debug_assert!(false, "Span node found in ASCII trie!"); return; } NodeType::Value => { diff --git a/experimental/zerotrie/src/zerotrie.rs b/experimental/zerotrie/src/zerotrie.rs index deedcb81443..b542f2483db 100644 --- a/experimental/zerotrie/src/zerotrie.rs +++ b/experimental/zerotrie/src/zerotrie.rs @@ -96,6 +96,17 @@ pub(crate) enum ZeroTrieFlavor { /// /// # Ok::<_, zerotrie::ZeroTrieError>(()) /// ``` +/// +/// The trie can only store ASCII bytes; a string with non-ASCII always returns None: +/// +/// ``` +/// use zerotrie::ZeroTrieSimpleAscii; +/// +/// // A trie with two values: "abc" and "abcdef" +/// let trie = ZeroTrieSimpleAscii::from_bytes(b"abc\x80def\x81"); +/// +/// assert!(matches!(trie.get(b"ab\xFF"), None)); +/// ``` #[repr(transparent)] #[derive(Debug, Default, Clone, Copy, PartialEq, Eq)] #[cfg_attr(feature = "databake", derive(databake::Bake), databake(path = zerotrie))] @@ -543,7 +554,7 @@ fn string_to_box_u8(input: String) -> Box<[u8]> { impl_zerotrie_subtype!( ZeroTrieSimpleAscii, SimpleAscii, - get_bsearch_only, + get_ascii_bsearch_only, String, get_iter_ascii_or_panic, string_to_box_u8