From 4e8a97c949c139049fcfd176017854a7117e2a9d Mon Sep 17 00:00:00 2001 From: Joseph Burton Date: Fri, 6 Oct 2023 02:49:39 +0100 Subject: [PATCH] Implement JavaString (#540) # Objective - Solve the problem that NBT cannot be loaded when strings contain invalid UTF-16, such as in ban books. # Solution - This is the first part of the solution to this problem, a new string implementation which is tolerant of invalid UTF-16. See the added readme for details. - This allows for round-trip, useful manipulation of strings which may not be fully valid. - This solution is widely applicable outside of Valence when you have to deal with arbitrary Java strings, such as when manipulating class files. --- .github/workflows/ci.yml | 19 + Cargo.toml | 1 + assets/depgraph.svg | 366 ++-- crates/java_string/Cargo.toml | 16 + crates/java_string/README.md | 17 + crates/java_string/src/cesu8.rs | 279 +++ crates/java_string/src/char.rs | 1012 +++++++++++ crates/java_string/src/error.rs | 126 ++ crates/java_string/src/iter.rs | 977 +++++++++++ crates/java_string/src/lib.rs | 27 + crates/java_string/src/owned.rs | 1401 ++++++++++++++++ crates/java_string/src/pattern.rs | 402 +++++ crates/java_string/src/serde.rs | 263 +++ crates/java_string/src/slice.rs | 2239 +++++++++++++++++++++++++ crates/java_string/src/validations.rs | 369 ++++ typos.toml | 2 +- 16 files changed, 7335 insertions(+), 181 deletions(-) create mode 100644 crates/java_string/Cargo.toml create mode 100644 crates/java_string/README.md create mode 100644 crates/java_string/src/cesu8.rs create mode 100644 crates/java_string/src/char.rs create mode 100644 crates/java_string/src/error.rs create mode 100644 crates/java_string/src/iter.rs create mode 100644 crates/java_string/src/lib.rs create mode 100644 crates/java_string/src/owned.rs create mode 100644 crates/java_string/src/pattern.rs create mode 100644 crates/java_string/src/serde.rs create mode 100644 crates/java_string/src/slice.rs create mode 100644 
crates/java_string/src/validations.rs diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 78e12f7f5..ea46962fd 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -117,6 +117,25 @@ jobs: - name: Run valence_nbt tests without preserve_order feature run: cargo test -p valence_nbt --all-targets + valence-miri: + name: Miri Tests + runs-on: ubuntu-latest + steps: + - name: Checkout Actions Repository + uses: actions/checkout@v3 + + - name: Setup Rust toolchain and cache + uses: actions-rust-lang/setup-rust-toolchain@v1.5.0 + with: + toolchain: "nightly" + components: "miri" + + - name: Run tests + run: cargo miri test --workspace --all-features --doc + + - name: Run doctests + run: cargo miri test --workspace --all-features --doc + extractor-build: name: Build Extractor runs-on: ubuntu-latest diff --git a/Cargo.toml b/Cargo.toml index cccade60f..f221a2dd4 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -135,6 +135,7 @@ hmac = "0.12.1" image = "0.24.6" indexmap = "2.0.0" itertools = "0.11.0" +java_string = { path = "crates/java_string", version = "0.1.0" } lru = "0.11.0" noise = "0.8.2" num = "0.4.0" diff --git a/assets/depgraph.svg b/assets/depgraph.svg index b9523f69e..8bb16f718 100644 --- a/assets/depgraph.svg +++ b/assets/depgraph.svg @@ -12,368 +12,374 @@ 0 - -valence_advancement + +java_string 1 + +valence_advancement + + + +2 valence_server - + -0->1 +1->2 - - -2 + + +3 valence_entity - + -1->2 +2->3 - - -11 + + +12 valence_registry - + -1->11 +2->12 - - -10 - -valence_server_common + + +11 + +valence_server_common - + -2->10 - - +3->11 + + - + -11->10 - - +12->11 + + - - -6 - -valence_protocol + + +7 + +valence_protocol - + -10->6 - - - - - -3 - -valence_math +11->7 + + 4 - -valence_nbt + +valence_math 5 - -valence_ident + +valence_nbt - - -7 - -valence_generated + + +6 + +valence_ident - + + +8 + +valence_generated + + -6->7 - - +7->8 + + - - -9 - -valence_text + + +10 + +valence_text - + -6->9 - - +7->10 + + - + -7->3 - - 
+8->4 + + - + -7->5 - - +8->6 + + - + -9->4 - - +10->5 + + - + -9->5 - - +10->6 + + - - -8 + + +9 valence_build_utils - - -12 + + +13 valence_anvil - + -12->1 +13->2 - - -13 + + +14 valence_boss_bar - + -13->1 +14->2 - - -14 + + +15 valence_inventory - + -14->1 +15->2 - - -15 - -valence_lang - 16 + +valence_lang + + + +17 valence_network - + -16->1 +17->2 - + -16->15 +17->16 - - -17 + + +18 valence_player_list - + -17->1 +18->2 - - -18 + + +19 valence_scoreboard - + -18->1 +19->2 - - -19 - -valence_spatial - 20 + +valence_spatial + + + +21 valence_weather - + -20->1 +21->2 - - -21 + + +22 valence_world_border - + -21->1 +22->2 - - -22 - -dump_schedule - 23 + +dump_schedule + + + +24 valence - + -22->23 +23->24 - + -23->0 +24->1 - + -23->12 +24->13 - + -23->13 +24->14 - + -23->14 +24->15 - + -23->16 +24->17 - + -23->17 +24->18 - + -23->18 +24->19 - + -23->20 +24->21 - + -23->21 +24->22 - - -24 - -packet_inspector - - - -24->6 - - - 25 + +packet_inspector + + + +25->7 + + + + + +26 playground - + -25->23 +26->24 - - -26 - -stresser + + +27 + +stresser - + -26->6 - - +27->7 + + diff --git a/crates/java_string/Cargo.toml b/crates/java_string/Cargo.toml new file mode 100644 index 000000000..414e22a7d --- /dev/null +++ b/crates/java_string/Cargo.toml @@ -0,0 +1,16 @@ +[package] +name = "java_string" +description = "An implementation of Java strings, tolerant of invalid UTF-16 encoding" +readme = "README.md" +version = "0.1.0" +keywords = ["java", "string", "utf16"] +edition.workspace = true +repository.workspace = true +documentation.workspace = true +license.workspace = true + +[features] +serde = ["dep:serde"] + +[dependencies] +serde = { workspace = true, optional = true } diff --git a/crates/java_string/README.md b/crates/java_string/README.md new file mode 100644 index 000000000..7135d6454 --- /dev/null +++ b/crates/java_string/README.md @@ -0,0 +1,17 @@ +# java_string + +An implementation of Java strings, tolerant of invalid UTF-16 encoding. 
+This allows for round-trip serialization of all Java strings, including those which contain invalid UTF-16, while still +being able to perform useful operations on those strings. + +These Java strings use the UTF-8 encoding, with the modification that surrogate code points (code points between U+D800 +and U+DFFF inclusive) are allowed. This allows for zero-cost conversion from Rust strings to Java strings. This modified +encoding is known as "semi-UTF-8" throughout the codebase. Similarly, this crate introduces a `JavaCodePoint` type which +is analogous to `char`, except that surrogate code points are allowed. + +This crate is mostly undocumented, because most methods are entirely analogous to those of the same name in Rust's +strings. Please refer to the `std` documentation. + +# Features + +- `serde` Adds support for [`serde`](https://docs.rs/serde/latest/serde/) \ No newline at end of file diff --git a/crates/java_string/src/cesu8.rs b/crates/java_string/src/cesu8.rs new file mode 100644 index 000000000..eb94ee6c1 --- /dev/null +++ b/crates/java_string/src/cesu8.rs @@ -0,0 +1,279 @@ +use std::borrow::Cow; + +use crate::validations::{utf8_char_width, CONT_MASK, TAG_CONT}; +use crate::{JavaStr, JavaString, Utf8Error}; + +impl JavaStr { + /// Converts from Java's [modified UTF-8](https://docs.oracle.com/javase/8/docs/api/java/io/DataInput.html#modified-utf-8) format to a `Cow`. 
+ /// + /// ``` + /// # use std::borrow::Cow; + /// # use java_string::{JavaCodePoint, JavaStr, JavaString}; + /// + /// let result = JavaStr::from_modified_utf8("Hello World!".as_bytes()).unwrap(); + /// assert!(matches!(result, Cow::Borrowed(_))); + /// assert_eq!(JavaStr::from_str("Hello World!"), result); + /// + /// let result = JavaStr::from_modified_utf8(&[ + /// 0x61, 0x62, 0x63, 0xc0, 0x80, 0xe2, 0x84, 0x9d, 0xed, 0xa0, 0xbd, 0xed, 0xb2, 0xa3, 0xed, + /// 0xa0, 0x80, + /// ]) + /// .unwrap(); + /// assert!(matches!(result, Cow::Owned(_))); + /// let mut expected = JavaString::from("abc\0ℝ💣"); + /// expected.push_java(JavaCodePoint::from_u32(0xd800).unwrap()); + /// assert_eq!(expected, result); + /// + /// let result = JavaStr::from_modified_utf8(&[0xed]); + /// assert!(result.is_err()); + /// ``` + #[inline] + pub fn from_modified_utf8(bytes: &[u8]) -> Result, Utf8Error> { + match JavaStr::from_full_utf8(bytes) { + Ok(str) => Ok(Cow::Borrowed(str)), + Err(_) => JavaString::from_modified_utf8_internal(bytes).map(Cow::Owned), + } + } + + /// Converts to Java's [modified UTF-8](https://docs.oracle.com/javase/8/docs/api/java/io/DataInput.html#modified-utf-8) format. 
///
/// The string's own bytes are borrowed when they contain no NUL byte and no
/// four-byte sequence; otherwise a re-encoded copy is allocated.
///
/// ```
/// # use std::borrow::Cow;
/// # use java_string::{JavaCodePoint, JavaStr, JavaString};
///
/// let result = JavaStr::from_str("Hello World!").to_modified_utf8();
/// assert!(matches!(result, Cow::Borrowed(_)));
/// assert_eq!(result, &b"Hello World!"[..]);
///
/// let mut str = JavaString::from("abc\0ℝ💣");
/// str.push_java(JavaCodePoint::from_u32(0xd800).unwrap());
/// let result = str.to_modified_utf8();
/// let expected = [
///     0x61, 0x62, 0x63, 0xc0, 0x80, 0xe2, 0x84, 0x9d, 0xed, 0xa0, 0xbd, 0xed, 0xb2, 0xa3, 0xed,
///     0xa0, 0x80,
/// ];
/// assert!(matches!(result, Cow::Owned(_)));
/// assert_eq!(result, &expected[..]);
/// ```
#[inline]
#[must_use]
pub fn to_modified_utf8(&self) -> Cow<[u8]> {
    if is_valid_cesu8(self) {
        Cow::Borrowed(self.as_bytes())
    } else {
        Cow::Owned(self.to_modified_utf8_internal())
    }
}

/// Re-encodes the string as modified UTF-8.
///
/// A NUL byte becomes `0xc0 0x80`, each four-byte sequence becomes two
/// three-byte encoded surrogates (six bytes), and every other sequence is
/// copied through unchanged.
#[inline]
fn to_modified_utf8_internal(&self) -> Vec<u8> {
    let bytes = self.as_bytes();
    // The output is never shorter than the input, so reserve the input length plus 25%
    // headroom for the sequences that grow (NUL: 1 -> 2 bytes, supplementary: 4 -> 6 bytes).
    let mut encoded = Vec::with_capacity(bytes.len() + (bytes.len() >> 2));
    let mut i = 0;
    while i < bytes.len() {
        let b = bytes[i];
        if b == 0 {
            encoded.extend([0xc0, 0x80]);
            i += 1;
        } else if b < 128 {
            // Pass ASCII through quickly.
            encoded.push(b);
            i += 1;
        } else {
            // Figure out how many bytes we need for this character.
            let w = utf8_char_width(b);
            let char_bytes = unsafe {
                // SAFETY: input must be valid semi UTF-8, so there must be at least w more
                // bytes from i
                bytes.get_unchecked(i..i + w)
            };
            if w != 4 {
                // Pass through short UTF-8 sequences unmodified.
                encoded.extend_from_slice(char_bytes);
            } else {
                // Encode 4-byte sequences as 6 bytes
                let s = unsafe {
                    // SAFETY: input is valid semi UTF-8
                    JavaStr::from_semi_utf8_unchecked(char_bytes)
                };
                let c = unsafe {
                    // SAFETY: s contains a single char of width 4
                    s.chars().next().unwrap_unchecked().as_u32() - 0x10000
                };
                // Split the 20-bit offset into a high and a low surrogate.
                let high = ((c >> 10) as u16) | 0xd800;
                let low = ((c & 0x3ff) as u16) | 0xdc00;
                encoded.extend(enc_surrogate(high));
                encoded.extend(enc_surrogate(low));
            }
            i += w;
        }
    }
    encoded
}
}

impl JavaString {
    /// Converts from Java's [modified UTF-8](https://docs.oracle.com/javase/8/docs/api/java/io/DataInput.html#modified-utf-8) format to a `JavaString`.
    ///
    /// See [JavaStr::from_modified_utf8].
    #[inline]
    pub fn from_modified_utf8(bytes: Vec<u8>) -> Result<JavaString, Utf8Error> {
        match JavaString::from_full_utf8(bytes) {
            // The buffer could be adopted as-is.
            Ok(str) => Ok(str),
            // Otherwise decode the bytes handed back inside the error.
            Err(err) => JavaString::from_modified_utf8_internal(&err.bytes),
        }
    }

    /// Decodes modified UTF-8 into a semi UTF-8 `JavaString`.
    ///
    /// `0xc0 0x80` becomes a NUL byte, an encoded high surrogate followed
    /// directly by an encoded low surrogate becomes one four-byte sequence,
    /// and unpaired surrogates are copied through.
    ///
    /// # Errors
    ///
    /// Returns a `Utf8Error` whose `valid_up_to` is the offset of the first
    /// byte of the offending sequence. `error_len` is `None` when the input
    /// ends in the middle of a sequence.
    fn from_modified_utf8_internal(slice: &[u8]) -> Result<JavaString, Utf8Error> {
        let mut offset = 0;
        let mut decoded = Vec::with_capacity(slice.len() + 1);

        while let Some(&first) = slice.get(offset) {
            // Remember where this sequence started so errors can point at it.
            let old_offset = offset;
            offset += 1;

            // Bails out of the whole function with an error at `old_offset`.
            macro_rules! err {
                ($error_len:expr) => {
                    return Err(Utf8Error {
                        valid_up_to: old_offset,
                        error_len: $error_len,
                    })
                };
            }

            // Consumes the next byte, or fails with "input ended early".
            macro_rules! next {
                () => {{
                    if let Some(&b) = slice.get(offset) {
                        offset += 1;
                        b
                    } else {
                        err!(None)
                    }
                }};
            }

            // Consumes the next byte and requires it to be a continuation byte.
            macro_rules! next_cont {
                ($error_len:expr) => {{
                    let byte = next!();
                    if (byte) & !CONT_MASK == TAG_CONT {
                        byte
                    } else {
                        err!($error_len)
                    }
                }};
            }

            if first == 0 {
                // modified UTF-8 should never contain \0 directly.
                err!(Some(1));
            } else if first < 128 {
                // Pass ASCII through directly.
                decoded.push(first);
            } else if first == 0xc0 {
                // modified UTF-8 encoding of null character
                match next!() {
                    0x80 => decoded.push(0),
                    _ => err!(Some(1)),
                }
            } else {
                let w = utf8_char_width(first);
                let second = next_cont!(Some(1));
                match w {
                    // Two-byte sequences can be used directly.
                    2 => {
                        decoded.extend([first, second]);
                    }
                    3 => {
                        let third = next_cont!(Some(2));
                        match (first, second) {
                            // These are valid UTF-8, so pass them through.
                            (0xe0, 0xa0..=0xbf)
                            | (0xe1..=0xec, 0x80..=0xbf)
                            | (0xed, 0x80..=0x9f)
                            | (0xee..=0xef, 0x80..=0xbf)
                            // Second half of a surrogate pair without a preceding first half, also pass this through.
                            | (0xed, 0xb0..=0xbf)
                            => decoded.extend([first, second, third]),
                            // First half of a surrogate pair
                            (0xed, 0xa0..=0xaf) => {
                                // Peek ahead and try to pair the first half of surrogate pair with
                                // second.
                                match &slice[offset..] {
                                    [0xed, fifth @ 0xb0..=0xbf, sixth, ..]
                                        if *sixth & !CONT_MASK == TAG_CONT =>
                                    {
                                        let s = dec_surrogates(second, third, *fifth, *sixth);
                                        decoded.extend(s);
                                        // Skip the three bytes of the low surrogate just consumed.
                                        offset += 3;
                                    }
                                    _ => {
                                        // No second half, append the first half directly.
                                        decoded.extend([first, second, third]);
                                    }
                                }
                            }
                            _ => err!(Some(1)),
                        }
                    }
                    _ => err!(Some(1)), // modified UTF-8 doesn't allow width 4
                }
            }
        }

        unsafe {
            // SAFETY: we built a semi UTF-8 encoded string
            Ok(JavaString::from_semi_utf8_unchecked(decoded))
        }
    }

    /// Converts to Java's [modified UTF-8](https://docs.oracle.com/javase/8/docs/api/java/io/DataInput.html#modified-utf-8) format.
    ///
    /// See [JavaStr::to_modified_utf8].
#[inline]
#[must_use]
pub fn into_modified_utf8(self) -> Vec<u8> {
    // Only re-encode when the existing bytes are not already acceptable.
    if !is_valid_cesu8(&self) {
        return self.to_modified_utf8_internal();
    }
    self.into_bytes()
}
}

/// Rebuilds a surrogate code point from the second and third bytes of its
/// three-byte encoding (the first byte is always `0xed`).
#[inline]
fn dec_surrogate(second: u8, third: u8) -> u32 {
    let high_bits = ((second & CONT_MASK) as u32) << 6;
    let low_bits = (third & CONT_MASK) as u32;
    0xd000 | high_bits | low_bits
}

/// Combines an encoded high surrogate (`second`, `third`) and an encoded low
/// surrogate (`fifth`, `sixth`) into the four-byte encoding of the
/// supplementary code point they represent.
#[inline]
fn dec_surrogates(second: u8, third: u8, fifth: u8, sixth: u8) -> [u8; 4] {
    // Convert to a 32-bit code point.
    let high = dec_surrogate(second, third);
    let low = dec_surrogate(fifth, sixth);
    let code = 0x10000 + (((high - 0xd800) << 10) | (low - 0xdc00));
    assert!((0x010000..=0x10ffff).contains(&code));

    // Convert to UTF-8.
    // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    let lead = 0b1111_0000u8 | ((code >> 18) & 0b111) as u8;
    let cont1 = TAG_CONT | ((code >> 12) & 0b11_1111) as u8;
    let cont2 = TAG_CONT | ((code >> 6) & 0b11_1111) as u8;
    let cont3 = TAG_CONT | (code & 0b11_1111) as u8;
    [lead, cont1, cont2, cont3]
}

/// Returns `true` when `text` has no NUL byte and no lead byte of a sequence
/// wider than three bytes.
#[inline]
fn is_valid_cesu8(text: &JavaStr) -> bool {
    !text
        .bytes()
        .any(|b| b == 0 || ((b & !CONT_MASK) != TAG_CONT && utf8_char_width(b) > 3))
}

/// Encodes one UTF-16 code unit as a three-byte sequence.
#[inline]
fn enc_surrogate(surrogate: u16) -> [u8; 3] {
    // 1110xxxx 10xxxxxx 10xxxxxx
    let lead = 0b11100000 | (surrogate >> 12) as u8;
    let cont1 = TAG_CONT | ((surrogate >> 6) & 0b0011_1111) as u8;
    let cont2 = TAG_CONT | (surrogate & 0b0011_1111) as u8;
    [lead, cont1, cont2]
}
diff --git a/crates/java_string/src/char.rs b/crates/java_string/src/char.rs new file mode 100644 index 000000000..5bc26a4f7 --- /dev/null +++ b/crates/java_string/src/char.rs @@ -0,0 +1,1012 @@ +use std::char::ParseCharError; +use std::cmp::Ordering; +use std::fmt; +use std::fmt::{Debug, Display, Formatter, Write}; +use std::hash::{Hash, Hasher}; +use std::iter::{once, FusedIterator, Once}; +use std::ops::Range; +use std::str::FromStr; + +use crate::validations::{TAG_CONT, TAG_FOUR_B, TAG_THREE_B, TAG_TWO_B}; + +// JavaCodePoint is guaranteed to have the same repr as a u32, with valid values +// 
of between 0 and 0x10FFFF, the same as a unicode code point. Surrogate code +// points are valid values of this type. +#[derive(Copy, Clone, PartialEq, Eq)] +#[repr(C)] +pub struct JavaCodePoint { + #[cfg(target_endian = "little")] + lower: u16, + upper: SeventeenValues, + #[cfg(target_endian = "big")] + lower: u16, +} + +#[repr(u16)] +#[derive(Copy, Clone, PartialEq, Eq)] +#[allow(unused)] +enum SeventeenValues { + V0, + V1, + V2, + V3, + V4, + V5, + V6, + V7, + V8, + V9, + V10, + V11, + V12, + V13, + V14, + V15, + V16, +} + +impl JavaCodePoint { + pub const MAX: JavaCodePoint = JavaCodePoint::from_char(char::MAX); + pub const REPLACEMENT_CHARACTER: JavaCodePoint = + JavaCodePoint::from_char(char::REPLACEMENT_CHARACTER); + + /// See [char::from_u32] + /// + /// ``` + /// # use java_string::JavaCodePoint; + /// let c = JavaCodePoint::from_u32(0x2764); + /// assert_eq!(Some(JavaCodePoint::from_char('❤')), c); + /// + /// assert_eq!(None, JavaCodePoint::from_u32(0x110000)); + /// ``` + #[inline] + #[must_use] + pub const fn from_u32(i: u32) -> Option { + if i <= 0x10ffff { + unsafe { Some(Self::from_u32_unchecked(i)) } + } else { + None + } + } + + /// # Safety + /// The argument must be within the valid Unicode code point range of 0 to + /// 0x10FFFF inclusive. Surrogate code points are allowed. + #[inline] + #[must_use] + pub const unsafe fn from_u32_unchecked(i: u32) -> JavaCodePoint { + // SAFETY: the caller checks that the argument can be represented by this type + std::mem::transmute(i) + } + + /// Converts a `char` to a code point. + #[inline] + #[must_use] + pub const fn from_char(char: char) -> JavaCodePoint { + unsafe { + // SAFETY: all chars are valid code points + JavaCodePoint::from_u32_unchecked(char as u32) + } + } + + /// Converts this code point to a `u32`. 
+ /// + /// ``` + /// # use java_string::JavaCodePoint; + /// assert_eq!(65, JavaCodePoint::from_char('A').as_u32()); + /// assert_eq!(0xd800, JavaCodePoint::from_u32(0xd800).unwrap().as_u32()); + /// ``` + #[inline] + #[must_use] + pub const fn as_u32(self) -> u32 { + unsafe { + // SAFETY: JavaCodePoint has the same repr as a u32 + let result = std::mem::transmute(self); + + if result > 0x10ffff { + // SAFETY: JavaCodePoint can never have a value > 0x10FFFF. + // This statement may allow the optimizer to remove branches in the calling code + // associated with out of bounds chars. + std::hint::unreachable_unchecked(); + } + + result + } + } + + /// Converts this code point to a `char`. + /// + /// ``` + /// # use java_string::JavaCodePoint; + /// assert_eq!(Some('a'), JavaCodePoint::from_char('a').as_char()); + /// assert_eq!(None, JavaCodePoint::from_u32(0xd800).unwrap().as_char()); + /// ``` + #[inline] + #[must_use] + pub const fn as_char(self) -> Option { + char::from_u32(self.as_u32()) + } + + /// # Safety + /// The caller must ensure that this code point is not a surrogate code + /// point. 
+ #[inline] + #[must_use] + pub unsafe fn as_char_unchecked(self) -> char { + char::from_u32_unchecked(self.as_u32()) + } + + /// See [char::encode_utf16] + /// + /// ``` + /// # use java_string::JavaCodePoint; + /// assert_eq!( + /// 2, + /// JavaCodePoint::from_char('𝕊') + /// .encode_utf16(&mut [0; 2]) + /// .len() + /// ); + /// assert_eq!( + /// 1, + /// JavaCodePoint::from_u32(0xd800) + /// .unwrap() + /// .encode_utf16(&mut [0; 2]) + /// .len() + /// ); + /// ``` + /// ```should_panic + /// # use java_string::JavaCodePoint; + /// // Should panic + /// JavaCodePoint::from_char('𝕊').encode_utf16(&mut [0; 1]); + /// ``` + #[inline] + pub fn encode_utf16(self, dst: &mut [u16]) -> &mut [u16] { + if let Some(char) = self.as_char() { + char.encode_utf16(dst) + } else { + dst[0] = self.as_u32() as u16; + &mut dst[..1] + } + } + + /// Encodes this `JavaCodePoint` into semi UTF-8, that is, UTF-8 with + /// surrogate code points. See also [char::encode_utf8]. + /// + /// ``` + /// # use java_string::JavaCodePoint; + /// assert_eq!( + /// 2, + /// JavaCodePoint::from_char('ß') + /// .encode_semi_utf8(&mut [0; 4]) + /// .len() + /// ); + /// assert_eq!( + /// 3, + /// JavaCodePoint::from_u32(0xd800) + /// .unwrap() + /// .encode_semi_utf8(&mut [0; 4]) + /// .len() + /// ); + /// ``` + /// ```should_panic + /// # use java_string::JavaCodePoint; + /// // Should panic + /// JavaCodePoint::from_char('ß').encode_semi_utf8(&mut [0; 1]); + /// ``` + #[inline] + pub fn encode_semi_utf8(self, dst: &mut [u8]) -> &mut [u8] { + let len = self.len_utf8(); + let code = self.as_u32(); + match (len, &mut dst[..]) { + (1, [a, ..]) => { + *a = code as u8; + } + (2, [a, b, ..]) => { + *a = (code >> 6 & 0x1f) as u8 | TAG_TWO_B; + *b = (code & 0x3f) as u8 | TAG_CONT; + } + (3, [a, b, c, ..]) => { + *a = (code >> 12 & 0x0f) as u8 | TAG_THREE_B; + *b = (code >> 6 & 0x3f) as u8 | TAG_CONT; + *c = (code & 0x3f) as u8 | TAG_CONT; + } + (4, [a, b, c, d, ..]) => { + *a = (code >> 18 & 0x07) as u8 | 
TAG_FOUR_B; + *b = (code >> 12 & 0x3f) as u8 | TAG_CONT; + *c = (code >> 6 & 0x3f) as u8 | TAG_CONT; + *d = (code & 0x3f) as u8 | TAG_CONT; + } + _ => panic!( + "encode_utf8: need {} bytes to encode U+{:X}, but the buffer has {}", + len, + code, + dst.len() + ), + } + &mut dst[..len] + } + + /// See [char::eq_ignore_ascii_case]. + #[inline] + pub fn eq_ignore_ascii_case(&self, other: &JavaCodePoint) -> bool { + match (self.as_char(), other.as_char()) { + (Some(char1), Some(char2)) => char1.eq_ignore_ascii_case(&char2), + (None, None) => self == other, + _ => false, + } + } + + /// See [char::escape_debug]. + /// + /// ``` + /// # use java_string::JavaCodePoint; + /// assert_eq!( + /// "a", + /// JavaCodePoint::from_char('a').escape_debug().to_string() + /// ); + /// assert_eq!( + /// "\\n", + /// JavaCodePoint::from_char('\n').escape_debug().to_string() + /// ); + /// assert_eq!( + /// "\\u{d800}", + /// JavaCodePoint::from_u32(0xd800) + /// .unwrap() + /// .escape_debug() + /// .to_string() + /// ); + /// ``` + #[inline] + #[must_use] + pub fn escape_debug(self) -> CharEscapeIter { + self.escape_debug_ext(EscapeDebugExtArgs::ESCAPE_ALL) + } + + #[inline] + #[must_use] + pub(crate) fn escape_debug_ext(self, args: EscapeDebugExtArgs) -> CharEscapeIter { + const NULL: u32 = '\0' as u32; + const TAB: u32 = '\t' as u32; + const CARRIAGE_RETURN: u32 = '\r' as u32; + const LINE_FEED: u32 = '\n' as u32; + const SINGLE_QUOTE: u32 = '\'' as u32; + const DOUBLE_QUOTE: u32 = '"' as u32; + const BACKSLASH: u32 = '\\' as u32; + + unsafe { + // SAFETY: all characters specified are in ascii range + match self.as_u32() { + NULL => CharEscapeIter::new([b'\\', b'0']), + TAB => CharEscapeIter::new([b'\\', b't']), + CARRIAGE_RETURN => CharEscapeIter::new([b'\\', b'r']), + LINE_FEED => CharEscapeIter::new([b'\\', b'n']), + SINGLE_QUOTE if args.escape_single_quote => CharEscapeIter::new([b'\\', b'\'']), + DOUBLE_QUOTE if args.escape_double_quote => CharEscapeIter::new([b'\\', b'"']), + 
BACKSLASH => CharEscapeIter::new([b'\\', b'\\']), + _ if self.is_printable() => { + // SAFETY: surrogate code points are not printable + CharEscapeIter::printable(self.as_char_unchecked()) + } + _ => self.escape_unicode(), + } + } + } + + #[inline] + fn is_printable(self) -> bool { + let Some(char) = self.as_char() else { + return false; + }; + if matches!(char, '\\' | '\'' | '"') { + return true; + } + char.escape_debug().next() != Some('\\') + } + + /// See [char::escape_default]. + /// + /// ``` + /// # use java_string::JavaCodePoint; + /// assert_eq!( + /// "a", + /// JavaCodePoint::from_char('a').escape_default().to_string() + /// ); + /// assert_eq!( + /// "\\n", + /// JavaCodePoint::from_char('\n').escape_default().to_string() + /// ); + /// assert_eq!( + /// "\\u{d800}", + /// JavaCodePoint::from_u32(0xd800) + /// .unwrap() + /// .escape_default() + /// .to_string() + /// ); + /// ``` + #[inline] + #[must_use] + pub fn escape_default(self) -> CharEscapeIter { + const TAB: u32 = '\t' as u32; + const CARRIAGE_RETURN: u32 = '\r' as u32; + const LINE_FEED: u32 = '\n' as u32; + const SINGLE_QUOTE: u32 = '\'' as u32; + const DOUBLE_QUOTE: u32 = '"' as u32; + const BACKSLASH: u32 = '\\' as u32; + + unsafe { + // SAFETY: all characters specified are in ascii range + match self.as_u32() { + TAB => CharEscapeIter::new([b'\\', b't']), + CARRIAGE_RETURN => CharEscapeIter::new([b'\\', b'r']), + LINE_FEED => CharEscapeIter::new([b'\\', b'n']), + SINGLE_QUOTE => CharEscapeIter::new([b'\\', b'\'']), + DOUBLE_QUOTE => CharEscapeIter::new([b'\\', b'"']), + BACKSLASH => CharEscapeIter::new([b'\\', b'\\']), + 0x20..=0x7e => CharEscapeIter::new([self.as_u32() as u8]), + _ => self.escape_unicode(), + } + } + } + + /// See [char::escape_unicode]. 
+ /// + /// ``` + /// # use java_string::JavaCodePoint; + /// assert_eq!( + /// "\\u{2764}", + /// JavaCodePoint::from_char('❤').escape_unicode().to_string() + /// ); + /// assert_eq!( + /// "\\u{d800}", + /// JavaCodePoint::from_u32(0xd800) + /// .unwrap() + /// .escape_unicode() + /// .to_string() + /// ); + /// ``` + #[inline] + #[must_use] + pub fn escape_unicode(self) -> CharEscapeIter { + let x = self.as_u32(); + + let mut arr = [0; 10]; + arr[0] = b'\\'; + arr[1] = b'u'; + arr[2] = b'{'; + + let number_len = if x == 0 { + 1 + } else { + ((x.ilog2() >> 2) + 1) as usize + }; + arr[3 + number_len] = b'}'; + for hexit in 0..number_len { + arr[2 + number_len - hexit] = b"0123456789abcdef"[((x >> (hexit << 2)) & 15) as usize]; + } + + CharEscapeIter { + inner: EscapeIterInner::Escaped(EscapeIterEscaped { + bytes: arr, + range: 0..number_len + 4, + }), + } + } + + /// See [char::is_alphabetic]. + #[inline] + #[must_use] + pub fn is_alphabetic(self) -> bool { + self.as_char().is_some_and(|char| char.is_alphabetic()) + } + + /// See [char::is_alphanumeric]. + #[inline] + #[must_use] + pub fn is_alphanumeric(self) -> bool { + self.as_char().is_some_and(|char| char.is_alphanumeric()) + } + + /// See [char::is_ascii]. + #[inline] + #[must_use] + pub fn is_ascii(self) -> bool { + self.as_u32() <= 0x7f + } + + /// See [char::is_ascii_alphabetic]. + #[inline] + #[must_use] + pub const fn is_ascii_alphabetic(self) -> bool { + self.is_ascii_lowercase() || self.is_ascii_uppercase() + } + + /// See [char::is_ascii_alphanumeric]. + #[inline] + #[must_use] + pub const fn is_ascii_alphanumeric(self) -> bool { + self.is_ascii_alphabetic() || self.is_ascii_digit() + } + + /// See [char::is_ascii_control]. + #[inline] + #[must_use] + pub const fn is_ascii_control(self) -> bool { + matches!(self.as_u32(), 0..=0x1f | 0x7f) + } + + /// See [char::is_ascii_digit]. 
+ #[inline] + #[must_use] + pub const fn is_ascii_digit(self) -> bool { + const ZERO: u32 = '0' as u32; + const NINE: u32 = '9' as u32; + matches!(self.as_u32(), ZERO..=NINE) + } + + /// See [char::is_ascii_graphic]. + #[inline] + #[must_use] + pub const fn is_ascii_graphic(self) -> bool { + matches!(self.as_u32(), 0x21..=0x7e) + } + + /// See [char::is_ascii_hexdigit]. + #[inline] + #[must_use] + pub const fn is_ascii_hexdigit(self) -> bool { + const LOWER_A: u32 = 'a' as u32; + const LOWER_F: u32 = 'f' as u32; + const UPPER_A: u32 = 'A' as u32; + const UPPER_F: u32 = 'F' as u32; + self.is_ascii_digit() || matches!(self.as_u32(), (LOWER_A..=LOWER_F) | (UPPER_A..=UPPER_F)) + } + + /// See [char::is_ascii_lowercase]. + #[inline] + #[must_use] + pub const fn is_ascii_lowercase(self) -> bool { + const A: u32 = 'a' as u32; + const Z: u32 = 'z' as u32; + matches!(self.as_u32(), A..=Z) + } + + /// See [char::is_ascii_octdigit]. + #[inline] + #[must_use] + pub const fn is_ascii_octdigit(self) -> bool { + const ZERO: u32 = '0' as u32; + const SEVEN: u32 = '7' as u32; + matches!(self.as_u32(), ZERO..=SEVEN) + } + + /// See [char::is_ascii_punctuation]. + #[inline] + #[must_use] + pub const fn is_ascii_punctuation(self) -> bool { + matches!( + self.as_u32(), + (0x21..=0x2f) | (0x3a..=0x40) | (0x5b..=0x60) | (0x7b..=0x7e) + ) + } + + /// See [char::is_ascii_uppercase]. + #[inline] + #[must_use] + pub const fn is_ascii_uppercase(self) -> bool { + const A: u32 = 'A' as u32; + const Z: u32 = 'Z' as u32; + matches!(self.as_u32(), A..=Z) + } + + /// See [char::is_ascii_whitespace]. + #[inline] + #[must_use] + pub const fn is_ascii_whitespace(self) -> bool { + const SPACE: u32 = ' ' as u32; + const HORIZONTAL_TAB: u32 = '\t' as u32; + const LINE_FEED: u32 = '\n' as u32; + const FORM_FEED: u32 = 0xc; + const CARRIAGE_RETURN: u32 = '\r' as u32; + matches!( + self.as_u32(), + SPACE | HORIZONTAL_TAB | LINE_FEED | FORM_FEED | CARRIAGE_RETURN + ) + } + + /// See [char::is_control]. 
+ #[inline] + #[must_use] + pub fn is_control(self) -> bool { + self.as_char().is_some_and(|char| char.is_control()) + } + + /// See [char::is_digit]. + #[inline] + #[must_use] + pub fn is_digit(self, radix: u32) -> bool { + self.to_digit(radix).is_some() + } + + /// See [char::is_lowercase]. + #[inline] + #[must_use] + pub fn is_lowercase(self) -> bool { + self.as_char().is_some_and(|char| char.is_lowercase()) + } + + /// See [char::is_numeric]. + #[inline] + #[must_use] + pub fn is_numeric(self) -> bool { + self.as_char().is_some_and(|char| char.is_numeric()) + } + + /// See [char::is_uppercase]. + #[inline] + #[must_use] + pub fn is_uppercase(self) -> bool { + self.as_char().is_some_and(|char| char.is_uppercase()) + } + + /// See [char::is_whitespace]. + #[inline] + #[must_use] + pub fn is_whitespace(self) -> bool { + self.as_char().is_some_and(|char| char.is_whitespace()) + } + + /// See [char::len_utf16]. Surrogate code points return 1. + /// + /// ``` + /// # use java_string::JavaCodePoint; + /// + /// let n = JavaCodePoint::from_char('ß').len_utf16(); + /// assert_eq!(n, 1); + /// + /// let len = JavaCodePoint::from_char('💣').len_utf16(); + /// assert_eq!(len, 2); + /// + /// assert_eq!(1, JavaCodePoint::from_u32(0xd800).unwrap().len_utf16()); + /// ``` + #[inline] + #[must_use] + pub const fn len_utf16(self) -> usize { + if let Some(char) = self.as_char() { + char.len_utf16() + } else { + 1 // invalid code points are encoded as 1 utf16 code point anyway + } + } + + /// See [char::len_utf8]. Surrogate code points return 3. 
+ /// + /// ``` + /// # use java_string::JavaCodePoint; + /// + /// let len = JavaCodePoint::from_char('A').len_utf8(); + /// assert_eq!(len, 1); + /// + /// let len = JavaCodePoint::from_char('ß').len_utf8(); + /// assert_eq!(len, 2); + /// + /// let len = JavaCodePoint::from_char('ℝ').len_utf8(); + /// assert_eq!(len, 3); + /// + /// let len = JavaCodePoint::from_char('💣').len_utf8(); + /// assert_eq!(len, 4); + /// + /// let len = JavaCodePoint::from_u32(0xd800).unwrap().len_utf8(); + /// assert_eq!(len, 3); + /// ``` + #[inline] + #[must_use] + pub const fn len_utf8(self) -> usize { + if let Some(char) = self.as_char() { + char.len_utf8() + } else { + 3 // invalid code points are all length 3 in semi-valid utf8 + } + } + + /// See [char::make_ascii_lowercase]. + #[inline] + pub fn make_ascii_lowercase(&mut self) { + *self = self.to_ascii_lowercase(); + } + + /// See [char::make_ascii_uppercase]. + #[inline] + pub fn make_ascii_uppercase(&mut self) { + *self = self.to_ascii_uppercase(); + } + + /// See [char::to_ascii_lowercase]. + /// + /// ``` + /// # use java_string::JavaCodePoint; + /// + /// let ascii = JavaCodePoint::from_char('A'); + /// let non_ascii = JavaCodePoint::from_char('❤'); + /// + /// assert_eq!('a', ascii.to_ascii_lowercase()); + /// assert_eq!('❤', non_ascii.to_ascii_lowercase()); + /// ``` + #[inline] + #[must_use] + pub const fn to_ascii_lowercase(self) -> JavaCodePoint { + if self.is_ascii_uppercase() { + unsafe { + // SAFETY: all lowercase chars are valid chars + Self::from_u32_unchecked(self.as_u32() + 32) + } + } else { + self + } + } + + /// See [char::to_ascii_uppercase]. 
+ /// + /// ``` + /// # use java_string::JavaCodePoint; + /// + /// let ascii = JavaCodePoint::from_char('a'); + /// let non_ascii = JavaCodePoint::from_char('❤'); + /// + /// assert_eq!('A', ascii.to_ascii_uppercase()); + /// assert_eq!('❤', non_ascii.to_ascii_uppercase()); + /// ``` + #[inline] + #[must_use] + pub const fn to_ascii_uppercase(self) -> JavaCodePoint { + if self.is_ascii_lowercase() { + unsafe { + // SAFETY: all uppercase chars are valid chars + Self::from_u32_unchecked(self.as_u32() - 32) + } + } else { + self + } + } + + /// See [char::to_digit]. + #[inline] + #[must_use] + pub const fn to_digit(self, radix: u32) -> Option { + if let Some(char) = self.as_char() { + char.to_digit(radix) + } else { + None + } + } + + /// See [char::to_lowercase]. + #[inline] + #[must_use] + pub fn to_lowercase(self) -> ToLowercase { + match self.as_char() { + Some(char) => ToLowercase::char(char.to_lowercase()), + None => ToLowercase::invalid(self), + } + } + + /// See [char::to_uppercase]. 
+ #[inline] + #[must_use] + pub fn to_uppercase(self) -> ToUppercase { + match self.as_char() { + Some(char) => ToUppercase::char(char.to_uppercase()), + None => ToUppercase::invalid(self), + } + } +} + +impl Debug for JavaCodePoint { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + f.write_char('\'')?; + for c in self.escape_debug_ext(EscapeDebugExtArgs { + escape_single_quote: true, + escape_double_quote: false, + }) { + f.write_char(c)?; + } + f.write_char('\'') + } +} + +impl Default for JavaCodePoint { + #[inline] + fn default() -> Self { + JavaCodePoint::from_char('\0') + } +} + +impl Display for JavaCodePoint { + #[inline] + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + Display::fmt(&self.as_char().unwrap_or(char::REPLACEMENT_CHARACTER), f) + } +} + +impl From for u32 { + #[inline] + fn from(value: JavaCodePoint) -> Self { + value.as_u32() + } +} + +impl From for JavaCodePoint { + #[inline] + fn from(value: u8) -> Self { + JavaCodePoint::from_char(char::from(value)) + } +} + +impl FromStr for JavaCodePoint { + type Err = ParseCharError; + + #[inline] + fn from_str(s: &str) -> Result { + char::from_str(s).map(JavaCodePoint::from_char) + } +} + +impl Hash for JavaCodePoint { + #[inline] + fn hash(&self, state: &mut H) { + self.as_u32().hash(state) + } +} + +impl Ord for JavaCodePoint { + #[inline] + fn cmp(&self, other: &Self) -> Ordering { + self.as_u32().cmp(&other.as_u32()) + } +} + +impl PartialOrd for JavaCodePoint { + #[inline] + fn partial_cmp(&self, other: &Self) -> Option { + self.as_u32().partial_cmp(&other.as_u32()) + } +} + +impl PartialOrd for JavaCodePoint { + #[inline] + fn partial_cmp(&self, other: &char) -> Option { + self.partial_cmp(&JavaCodePoint::from_char(*other)) + } +} + +impl PartialOrd for char { + #[inline] + fn partial_cmp(&self, other: &JavaCodePoint) -> Option { + JavaCodePoint::from_char(*self).partial_cmp(other) + } +} + +impl PartialEq for JavaCodePoint { + #[inline] + fn eq(&self, other: &char) -> bool { + 
self == &JavaCodePoint::from_char(*other) + } +} + +impl PartialEq for char { + #[inline] + fn eq(&self, other: &JavaCodePoint) -> bool { + &JavaCodePoint::from_char(*self) == other + } +} + +pub(crate) struct EscapeDebugExtArgs { + pub(crate) escape_single_quote: bool, + pub(crate) escape_double_quote: bool, +} + +impl EscapeDebugExtArgs { + pub(crate) const ESCAPE_ALL: Self = Self { + escape_single_quote: true, + escape_double_quote: true, + }; +} + +#[derive(Clone, Debug)] +pub struct CharEscapeIter { + inner: EscapeIterInner, +} + +#[derive(Clone, Debug)] +enum EscapeIterInner { + Printable(Once), + Escaped(EscapeIterEscaped), +} + +impl Display for EscapeIterInner { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + match self { + EscapeIterInner::Printable(char) => char.clone().try_for_each(|ch| f.write_char(ch)), + EscapeIterInner::Escaped(escaped) => Display::fmt(escaped, f), + } + } +} + +impl CharEscapeIter { + #[inline] + fn printable(char: char) -> Self { + CharEscapeIter { + inner: EscapeIterInner::Printable(once(char)), + } + } + + /// # Safety + /// Assumes that the input byte array is ASCII + #[inline] + unsafe fn new(bytes: [u8; N]) -> Self { + assert!(N <= 10, "Too many bytes in escape iter"); + let mut ten_bytes = [0; 10]; + ten_bytes[..N].copy_from_slice(&bytes); + CharEscapeIter { + inner: EscapeIterInner::Escaped(EscapeIterEscaped { + bytes: ten_bytes, + range: 0..N, + }), + } + } +} + +impl Iterator for CharEscapeIter { + type Item = char; + + #[inline] + fn next(&mut self) -> Option { + match &mut self.inner { + EscapeIterInner::Printable(printable) => printable.next(), + EscapeIterInner::Escaped(escaped) => escaped.next(), + } + } + + #[inline] + fn size_hint(&self) -> (usize, Option) { + match &self.inner { + EscapeIterInner::Printable(printable) => printable.size_hint(), + EscapeIterInner::Escaped(escaped) => escaped.size_hint(), + } + } +} + +impl ExactSizeIterator for CharEscapeIter { + #[inline] + fn len(&self) -> usize { + 
match &self.inner { + EscapeIterInner::Printable(printable) => printable.len(), + EscapeIterInner::Escaped(escaped) => escaped.len(), + } + } +} + +impl FusedIterator for CharEscapeIter {} + +impl Display for CharEscapeIter { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + Display::fmt(&self.inner, f) + } +} + +#[derive(Clone, Debug)] +struct EscapeIterEscaped { + // SAFETY: all values must be in the ASCII range + bytes: [u8; 10], + // SAFETY: range must not be out of bounds for length 10 + range: Range, +} + +impl Iterator for EscapeIterEscaped { + type Item = char; + + #[inline] + fn next(&mut self) -> Option { + self.range.next().map(|index| unsafe { + // SAFETY: the range is never out of bounds for length 10 + char::from(*self.bytes.get_unchecked(index)) + }) + } + + #[inline] + fn size_hint(&self) -> (usize, Option) { + self.range.size_hint() + } + + #[inline] + fn count(self) -> usize { + self.range.len() + } +} + +impl ExactSizeIterator for EscapeIterEscaped { + #[inline] + fn len(&self) -> usize { + self.range.len() + } +} + +impl FusedIterator for EscapeIterEscaped {} + +impl Display for EscapeIterEscaped { + #[inline] + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + let str = unsafe { + // SAFETY: all bytes are in ASCII range, and range is in bounds for length 10 + std::str::from_utf8_unchecked(self.bytes.get_unchecked(self.range.clone())) + }; + f.write_str(str) + } +} + +pub type ToLowercase = CharIterDelegate; +pub type ToUppercase = CharIterDelegate; + +#[derive(Debug, Clone)] +pub struct CharIterDelegate(CharIterDelegateInner); + +impl CharIterDelegate { + #[inline] + fn char(iter: I) -> CharIterDelegate { + CharIterDelegate(CharIterDelegateInner::Char(iter)) + } + + #[inline] + fn invalid(code_point: JavaCodePoint) -> CharIterDelegate { + CharIterDelegate(CharIterDelegateInner::Invalid(Some(code_point).into_iter())) + } +} + +#[derive(Debug, Clone)] +enum CharIterDelegateInner { + Char(I), + Invalid(std::option::IntoIter), +} + 
+impl Iterator for CharIterDelegate +where + I: Iterator, +{ + type Item = JavaCodePoint; + + #[inline] + fn next(&mut self) -> Option { + match &mut self.0 { + CharIterDelegateInner::Char(char_iter) => { + char_iter.next().map(JavaCodePoint::from_char) + } + CharIterDelegateInner::Invalid(code_point) => code_point.next(), + } + } + + #[inline] + fn size_hint(&self) -> (usize, Option) { + match &self.0 { + CharIterDelegateInner::Char(char_iter) => char_iter.size_hint(), + CharIterDelegateInner::Invalid(code_point) => code_point.size_hint(), + } + } +} + +impl DoubleEndedIterator for CharIterDelegate +where + I: Iterator + DoubleEndedIterator, +{ + #[inline] + fn next_back(&mut self) -> Option { + match &mut self.0 { + CharIterDelegateInner::Char(char_iter) => { + char_iter.next_back().map(JavaCodePoint::from_char) + } + CharIterDelegateInner::Invalid(code_point) => code_point.next_back(), + } + } +} + +impl ExactSizeIterator for CharIterDelegate where I: Iterator + ExactSizeIterator {} + +impl FusedIterator for CharIterDelegate where I: Iterator + FusedIterator {} diff --git a/crates/java_string/src/error.rs b/crates/java_string/src/error.rs new file mode 100644 index 000000000..09742d014 --- /dev/null +++ b/crates/java_string/src/error.rs @@ -0,0 +1,126 @@ +use std::error::Error; +use std::fmt; +use std::fmt::{Display, Formatter}; + +#[derive(Copy, Eq, PartialEq, Clone, Debug)] +pub struct Utf8Error { + pub(crate) valid_up_to: usize, + pub(crate) error_len: Option, +} + +impl Utf8Error { + #[must_use] + #[inline] + pub const fn valid_up_to(&self) -> usize { + self.valid_up_to + } + + #[must_use] + #[inline] + pub const fn error_len(&self) -> Option { + // Manual implementation of Option::map since it's not const + match self.error_len { + Some(len) => Some(len as usize), + None => None, + } + } + + #[must_use] + #[inline] + pub(crate) const fn from_std(value: std::str::Utf8Error) -> Self { + Self { + valid_up_to: value.valid_up_to(), + // Manual implementation of 
Option::map since it's not const + error_len: match value.error_len() { + Some(error_len) => Some(error_len as u8), + None => None, + }, + } + } +} + +impl Display for Utf8Error { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + if let Some(error_len) = self.error_len { + write!( + f, + "invalid utf-8 sequence of {} bytes from index {}", + error_len, self.valid_up_to + ) + } else { + write!( + f, + "incomplete utf-8 byte sequence from index {}", + self.valid_up_to + ) + } + } +} + +impl From for Utf8Error { + #[inline] + fn from(value: std::str::Utf8Error) -> Self { + Self::from_std(value) + } +} + +impl Error for Utf8Error {} + +#[derive(Clone, Debug, PartialEq, Eq)] +pub struct FromUtf8Error { + pub(crate) bytes: Vec, + pub(crate) error: Utf8Error, +} + +impl FromUtf8Error { + pub fn as_bytes(&self) -> &[u8] { + &self.bytes[..] + } + + #[must_use] + pub fn into_bytes(self) -> Vec { + self.bytes + } + + pub fn utf8_error(&self) -> Utf8Error { + self.error + } +} + +impl Display for FromUtf8Error { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + Display::fmt(&self.error, f) + } +} + +impl Error for FromUtf8Error {} + +#[derive(Copy, Eq, PartialEq, Clone, Debug)] +pub enum ParseError { + InvalidUtf8(Utf8Error), + Err(E), +} + +impl Display for ParseError +where + E: Display, +{ + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + match self { + ParseError::InvalidUtf8(err) => Display::fmt(err, f), + ParseError::Err(err) => Display::fmt(err, f), + } + } +} + +impl Error for ParseError +where + E: Error + 'static, +{ + fn source(&self) -> Option<&(dyn Error + 'static)> { + match self { + ParseError::InvalidUtf8(err) => Some(err), + ParseError::Err(err) => Some(err), + } + } +} diff --git a/crates/java_string/src/iter.rs b/crates/java_string/src/iter.rs new file mode 100644 index 000000000..3762f6d72 --- /dev/null +++ b/crates/java_string/src/iter.rs @@ -0,0 +1,977 @@ +use std::fmt::{Debug, Display, Formatter, Write}; +use std::iter::{Chain, 
Copied, Filter, FlatMap, Flatten, FusedIterator, Map};
use std::{option, slice};

use crate::validations::{next_code_point, next_code_point_reverse};
use crate::{CharEscapeIter, JavaCodePoint, JavaStr, JavaStrPattern};

// Implements iterator traits for a wrapper struct by forwarding every method to its `inner`
// field, so the wrapper keeps any specialised implementations the inner iterator provides.
//
// Arms:
// * `Iterator for Ty<'a> => Item` -- forwards the core `Iterator` methods. Adding
//   `, DoubleEnded = ()` also forwards `rposition`, which requires the inner iterator to be
//   double-ended and exact-sized.
// * `DoubleEndedIterator for Ty<'a>`, `ExactSizeIterator for Ty<'a>`, `FusedIterator for Ty<'a>`.
// * `Iterator, DoubleEndedIterator, ExactSizeIterator, FusedIterator for Ty<'a> => Item` --
//   all of the above at once.
macro_rules! delegate {
    (Iterator for $ty:ident $(<$($lt:lifetime),+>)? => $item:ty $(, DoubleEnded = $double_ended:ty)?) => {
        impl$(<$($lt),+>)? Iterator for $ty$(<$($lt),+>)? {
            type Item = $item;

            #[inline]
            fn next(&mut self) -> Option<Self::Item> {
                self.inner.next()
            }

            #[inline]
            fn size_hint(&self) -> (usize, Option<usize>) {
                self.inner.size_hint()
            }

            #[inline]
            fn count(self) -> usize {
                self.inner.count()
            }

            #[inline]
            fn last(self) -> Option<Self::Item> {
                self.inner.last()
            }

            #[inline]
            fn nth(&mut self, n: usize) -> Option<Self::Item> {
                self.inner.nth(n)
            }

            #[inline]
            fn all<F>(&mut self, f: F) -> bool
            where
                F: FnMut(Self::Item) -> bool,
            {
                self.inner.all(f)
            }

            #[inline]
            fn any<F>(&mut self, f: F) -> bool
            where
                F: FnMut(Self::Item) -> bool,
            {
                self.inner.any(f)
            }

            #[inline]
            fn find<P>(&mut self, predicate: P) -> Option<Self::Item>
            where
                P: FnMut(&Self::Item) -> bool,
            {
                self.inner.find(predicate)
            }

            #[inline]
            fn position<P>(&mut self, predicate: P) -> Option<usize>
            where
                P: FnMut(Self::Item) -> bool,
            {
                self.inner.position(predicate)
            }

            $(
            #[inline]
            fn rposition<P>(&mut self, predicate: P) -> Option<usize>
            where
                P: FnMut(Self::Item) -> bool,
            {
                // Only compiles when the caller wrote `DoubleEnded = ()`; the binding exists
                // purely so the optional macro fragment is referenced inside this repetition.
                let _test: $double_ended = ();
                self.inner.rposition(predicate)
            }
            )?
        }
    };

    (DoubleEndedIterator for $ty:ident $(<$($lt:lifetime),+>)?) => {
        impl$(<$($lt),+>)? DoubleEndedIterator for $ty$(<$($lt),+>)? {
            #[inline]
            fn next_back(&mut self) -> Option<Self::Item> {
                self.inner.next_back()
            }

            #[inline]
            fn nth_back(&mut self, n: usize) -> Option<Self::Item> {
                self.inner.nth_back(n)
            }

            #[inline]
            fn rfind<P>(&mut self, predicate: P) -> Option<Self::Item>
            where
                P: FnMut(&Self::Item) -> bool,
            {
                self.inner.rfind(predicate)
            }
        }
    };

    (ExactSizeIterator for $ty:ident $(<$($lt:lifetime),+>)?) => {
        impl$(<$($lt),+>)? ExactSizeIterator for $ty$(<$($lt),+>)? {
            #[inline]
            fn len(&self) -> usize {
                self.inner.len()
            }
        }
    };

    (FusedIterator for $ty:ident $(<$($lt:lifetime),+>)?) => {
        impl$(<$($lt),+>)? FusedIterator for $ty$(<$($lt),+>)? {}
    };

    (Iterator, DoubleEndedIterator, ExactSizeIterator, FusedIterator for $ty:ident $(<$($lt:lifetime),+>)? => $item:ty) => {
        delegate!(Iterator for $ty$(<$($lt),+>)? => $item, DoubleEnded = ());
        delegate!(DoubleEndedIterator for $ty$(<$($lt),+>)?);
        delegate!(ExactSizeIterator for $ty$(<$($lt),+>)?);
        delegate!(FusedIterator for $ty$(<$($lt),+>)?);
    };
}

/// Iterator over bytes, yielding each `u8` by value from a borrowed slice iterator.
#[must_use]
#[derive(Clone, Debug)]
pub struct Bytes<'a> {
    pub(crate) inner: Copied<slice::Iter<'a, u8>>,
}
delegate!(Iterator, DoubleEndedIterator, ExactSizeIterator, FusedIterator for Bytes<'a> => u8);

/// Iterator over the `char`s of a debug-escaped string.
///
/// Chains an optional leading escape sequence with the escapes of the remaining code points.
/// Formatting it with `Display` writes every remaining character.
#[derive(Clone, Debug)]
#[must_use]
pub struct EscapeDebug<'a> {
    #[allow(clippy::type_complexity)]
    pub(crate) inner: Chain<
        Flatten<option::IntoIter<CharEscapeIter>>,
        FlatMap<Chars<'a>, CharEscapeIter, fn(JavaCodePoint) -> CharEscapeIter>,
    >,
}
delegate!(Iterator for EscapeDebug<'a> => char);
delegate!(FusedIterator for EscapeDebug<'a>);
impl<'a> Display for EscapeDebug<'a> {
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
        // Iterate a clone so that formatting does not consume the iterator itself.
        self.clone().try_for_each(|c| f.write_char(c))
    }
}

/// Iterator over the `char`s produced by applying an escape function to every code point.
/// Formatting it with `Display` writes every remaining character.
#[derive(Clone, Debug)]
#[must_use]
pub struct EscapeDefault<'a> {
    pub(crate) inner: FlatMap<Chars<'a>, CharEscapeIter, fn(JavaCodePoint) -> CharEscapeIter>,
}
delegate!(Iterator for EscapeDefault<'a> => char);
delegate!(FusedIterator for EscapeDefault<'a>);
impl<'a> Display for EscapeDefault<'a> {
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
        // Iterate a clone so that formatting does not consume the iterator itself.
        self.clone().try_for_each(|c| f.write_char(c))
    }
}

#[derive(Clone, Debug)]
#[must_use]
pub struct EscapeUnicode<'a> {
    pub(crate) inner:
FlatMap, CharEscapeIter, fn(JavaCodePoint) -> CharEscapeIter>, +} +delegate!(Iterator for EscapeUnicode<'a> => char); +delegate!(FusedIterator for EscapeUnicode<'a>); +impl<'a> Display for EscapeUnicode<'a> { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + self.clone().try_for_each(|c| f.write_char(c)) + } +} + +#[derive(Clone, Debug)] +#[must_use] +pub struct Lines<'a> { + pub(crate) inner: Map, fn(&JavaStr) -> &JavaStr>, +} +delegate!(Iterator for Lines<'a> => &'a JavaStr); +delegate!(DoubleEndedIterator for Lines<'a>); +delegate!(FusedIterator for Lines<'a>); + +#[derive(Clone)] +#[must_use] +pub struct Chars<'a> { + pub(crate) inner: slice::Iter<'a, u8>, +} + +impl<'a> Iterator for Chars<'a> { + type Item = JavaCodePoint; + + #[inline] + fn next(&mut self) -> Option { + // SAFETY: `JavaStr` invariant says `self.inner` is a semi-valid UTF-8 string + // and the resulting `ch` is a valid Unicode Scalar Value or surrogate + // code point. + unsafe { next_code_point(&mut self.inner).map(|ch| JavaCodePoint::from_u32_unchecked(ch)) } + } + + // TODO: std has an optimized count impl + + #[inline] + fn size_hint(&self) -> (usize, Option) { + let len = self.inner.len(); + // `(len + 3)` can't overflow, because we know that the `slice::Iter` + // belongs to a slice in memory which has a maximum length of + // `isize::MAX` (that's well below `usize::MAX`). + ((len + 3) / 4, Some(len)) + } + + #[inline] + fn last(mut self) -> Option { + // No need to go through the entire string. 
+ self.next_back() + } +} + +impl Debug for Chars<'_> { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!(f, "Chars(")?; + f.debug_list().entries(self.clone()).finish()?; + write!(f, ")")?; + Ok(()) + } +} + +impl<'a> DoubleEndedIterator for Chars<'a> { + #[inline] + fn next_back(&mut self) -> Option { + // SAFETY: `JavaStr` invariant says `self.inner` is a semi-valid UTF-8 string + // and the resulting `ch` is a valid Unicode Scalar Value or surrogate + // code point. + unsafe { + next_code_point_reverse(&mut self.inner).map(|ch| JavaCodePoint::from_u32_unchecked(ch)) + } + } +} + +impl FusedIterator for Chars<'_> {} + +impl<'a> Chars<'a> { + #[inline] + #[must_use] + pub fn as_str(&self) -> &'a JavaStr { + // SAFETY: `Chars` is only made from a JavaStr, which guarantees the iter is + // semi-valid UTF-8. + unsafe { JavaStr::from_semi_utf8_unchecked(self.inner.as_slice()) } + } +} + +#[derive(Clone, Debug)] +#[must_use] +pub struct CharIndices<'a> { + pub(crate) front_offset: usize, + pub(crate) inner: Chars<'a>, +} + +impl<'a> Iterator for CharIndices<'a> { + type Item = (usize, JavaCodePoint); + + #[inline] + fn next(&mut self) -> Option<(usize, JavaCodePoint)> { + let pre_len = self.inner.inner.len(); + match self.inner.next() { + None => None, + Some(ch) => { + let index = self.front_offset; + let len = self.inner.inner.len(); + self.front_offset += pre_len - len; + Some((index, ch)) + } + } + } + + #[inline] + fn count(self) -> usize { + self.inner.count() + } + + #[inline] + fn size_hint(&self) -> (usize, Option) { + self.inner.size_hint() + } + + #[inline] + fn last(mut self) -> Option<(usize, JavaCodePoint)> { + // No need to go through the entire string. 
+ self.next_back() + } +} + +impl<'a> DoubleEndedIterator for CharIndices<'a> { + #[inline] + fn next_back(&mut self) -> Option<(usize, JavaCodePoint)> { + self.inner.next_back().map(|ch| { + let index = self.front_offset + self.inner.inner.len(); + (index, ch) + }) + } +} + +impl FusedIterator for CharIndices<'_> {} + +impl<'a> CharIndices<'a> { + #[inline] + #[must_use] + pub fn as_str(&self) -> &'a JavaStr { + self.inner.as_str() + } +} + +#[must_use] +#[derive(Debug, Clone)] +pub struct Matches<'a, P> { + pub(crate) str: &'a JavaStr, + pub(crate) pat: P, +} + +impl<'a, P> Iterator for Matches<'a, P> +where + P: JavaStrPattern, +{ + type Item = &'a JavaStr; + + #[inline] + fn next(&mut self) -> Option { + if let Some((index, len)) = self.pat.find_in(self.str) { + // SAFETY: pattern returns valid indices + let ret = unsafe { self.str.get_unchecked(index..index + len) }; + self.str = unsafe { self.str.get_unchecked(index + len..) }; + Some(ret) + } else { + self.str = Default::default(); + None + } + } +} + +impl<'a, P> DoubleEndedIterator for Matches<'a, P> +where + P: JavaStrPattern, +{ + #[inline] + fn next_back(&mut self) -> Option { + if let Some((index, len)) = self.pat.rfind_in(self.str) { + // SAFETY: pattern returns valid indices + let ret = unsafe { self.str.get_unchecked(index..index + len) }; + self.str = unsafe { self.str.get_unchecked(..index) }; + Some(ret) + } else { + self.str = Default::default(); + None + } + } +} + +#[must_use] +#[derive(Clone, Debug)] +pub struct RMatches<'a, P> { + pub(crate) inner: Matches<'a, P>, +} + +impl<'a, P> Iterator for RMatches<'a, P> +where + P: JavaStrPattern, +{ + type Item = &'a JavaStr; + + #[inline] + fn next(&mut self) -> Option { + self.inner.next_back() + } +} + +impl<'a, P> DoubleEndedIterator for RMatches<'a, P> +where + P: JavaStrPattern, +{ + #[inline] + fn next_back(&mut self) -> Option { + self.inner.next() + } +} + +#[must_use] +#[derive(Clone, Debug)] +pub struct MatchIndices<'a, P> { + pub(crate) 
str: &'a JavaStr, + pub(crate) start: usize, + pub(crate) pat: P, +} + +impl<'a, P> Iterator for MatchIndices<'a, P> +where + P: JavaStrPattern, +{ + type Item = (usize, &'a JavaStr); + + #[inline] + fn next(&mut self) -> Option { + if let Some((index, len)) = self.pat.find_in(self.str) { + let full_index = self.start + index; + self.start = full_index + len; + // SAFETY: pattern returns valid indices + let ret = unsafe { self.str.get_unchecked(index..index + len) }; + self.str = unsafe { self.str.get_unchecked(index + len..) }; + Some((full_index, ret)) + } else { + self.start += self.str.len(); + self.str = Default::default(); + None + } + } +} + +impl<'a, P> DoubleEndedIterator for MatchIndices<'a, P> +where + P: JavaStrPattern, +{ + #[inline] + fn next_back(&mut self) -> Option { + if let Some((index, len)) = self.pat.rfind_in(self.str) { + // SAFETY: pattern returns valid indices + let ret = unsafe { self.str.get_unchecked(index..index + len) }; + self.str = unsafe { self.str.get_unchecked(..index) }; + Some((self.start + index, ret)) + } else { + self.str = Default::default(); + None + } + } +} + +#[derive(Clone, Debug)] +pub struct RMatchIndices<'a, P> { + pub(crate) inner: MatchIndices<'a, P>, +} + +impl<'a, P> Iterator for RMatchIndices<'a, P> +where + P: JavaStrPattern, +{ + type Item = (usize, &'a JavaStr); + + #[inline] + fn next(&mut self) -> Option { + self.inner.next_back() + } +} + +impl<'a, P> DoubleEndedIterator for RMatchIndices<'a, P> +where + P: JavaStrPattern, +{ + #[inline] + fn next_back(&mut self) -> Option { + self.inner.next() + } +} + +#[derive(Clone, Debug)] +struct SplitHelper<'a, P> { + start: usize, + end: usize, + haystack: &'a JavaStr, + pat: P, + allow_trailing_empty: bool, + finished: bool, + had_empty_match: bool, +} + +impl<'a, P> SplitHelper<'a, P> +where + P: JavaStrPattern, +{ + #[inline] + fn new(haystack: &'a JavaStr, pat: P, allow_trailing_empty: bool) -> Self { + Self { + start: 0, + end: haystack.len(), + haystack, + 
pat,
            allow_trailing_empty,
            finished: false,
            had_empty_match: false,
        }
    }

    /// Yields the remaining unconsumed piece `start..end` and marks the helper as finished.
    ///
    /// Returns `None` if the helper had already finished, or if the remainder is empty and
    /// trailing empty pieces are not allowed.
    #[inline]
    fn get_end(&mut self) -> Option<&'a JavaStr> {
        if !self.finished {
            self.finished = true;

            // `start < end` rather than `end - start > 0`: same result while `start <= end`,
            // and it cannot underflow.
            if self.allow_trailing_empty || self.start < self.end {
                // SAFETY: `self.start` and `self.end` always lie on unicode boundaries.
                let string = unsafe { self.haystack.get_unchecked(self.start..self.end) };
                return Some(string);
            }
        }

        None
    }

    /// Finds the next match from the front, returning `(index, len)` with `index` relative to
    /// the whole haystack.
    ///
    /// Only the unconsumed window `start..end` is searched. Searching past `end` would find
    /// separators already consumed from the back, pushing `start` beyond `end` and making the
    /// later `get_unchecked(start..end)` calls undefined behaviour.
    #[inline]
    fn next_match(&mut self) -> Option<(usize, usize)> {
        // SAFETY: `self.start` and `self.end` always lie on unicode boundaries, and
        // `self.start <= self.end`.
        let substr = unsafe { self.haystack.get_unchecked(self.start..self.end) };

        let result = if self.had_empty_match {
            // if we had an empty match before, we are going to find the empty match again.
            // don't do that, search from the next index along.

            if substr.is_empty() {
                None
            } else {
                // SAFETY: we can pop the string because we already checked if the string is
                // empty above
                let first_char_len = unsafe { substr.chars().next().unwrap_unchecked().len_utf8() };
                let popped_str = unsafe { substr.get_unchecked(first_char_len..) };

                self.pat
                    .find_in(popped_str)
                    .map(|(index, len)| (index + first_char_len + self.start, len))
            }
        } else {
            self.pat
                .find_in(substr)
                .map(|(index, len)| (index + self.start, len))
        };

        self.had_empty_match = result.is_some_and(|(_, len)| len == 0);

        result
    }

    /// Returns the next piece from the front, excluding the separator that ends it.
    #[inline]
    fn next(&mut self) -> Option<&'a JavaStr> {
        if self.finished {
            return None;
        }

        match self.next_match() {
            Some((index, len)) => unsafe {
                // SAFETY: pattern guarantees valid indices
                let elt = self.haystack.get_unchecked(self.start..index);
                self.start = index + len;
                Some(elt)
            },
            None => self.get_end(),
        }
    }

    /// Returns the next piece from the front, including the separator that ends it.
    #[inline]
    fn next_inclusive(&mut self) -> Option<&'a JavaStr> {
        if self.finished {
            return None;
        }

        match self.next_match() {
            Some((index, len)) => unsafe {
                // SAFETY: pattern guarantees valid indices
                let elt = self.haystack.get_unchecked(self.start..index + len);
                self.start = index + len;
                Some(elt)
            },
            None => self.get_end(),
        }
    }

    /// Finds the next match from the back, returning `(index, len)` with `index` relative to
    /// the whole haystack.
    ///
    /// Only the unconsumed window `start..end` is searched, so separators already consumed
    /// from the front are never matched again.
    #[inline]
    fn next_match_back(&mut self) -> Option<(usize, usize)> {
        // SAFETY: `self.start` and `self.end` always lie on unicode boundaries, and
        // `self.start <= self.end`.
        let substr = unsafe { self.haystack.get_unchecked(self.start..self.end) };

        let found = if self.had_empty_match {
            // if we had an empty match before, we are going to find the empty match again.
            // don't do that, search from the previous index along.

            if substr.is_empty() {
                None
            } else {
                // SAFETY: we can pop the string because we already checked if the string is
                // empty above
                let last_char_len =
                    unsafe { substr.chars().next_back().unwrap_unchecked().len_utf8() };
                let popped_str = unsafe { substr.get_unchecked(..substr.len() - last_char_len) };

                self.pat.rfind_in(popped_str)
            }
        } else {
            self.pat.rfind_in(substr)
        };

        // The pattern reports indices relative to `substr`; rebase them onto the haystack.
        let result = found.map(|(index, len)| (index + self.start, len));

        self.had_empty_match = result.is_some_and(|(_, len)| len == 0);

        result
    }

    /// Returns the next piece from the back, excluding the separator that precedes it.
    #[inline]
    fn next_back(&mut self) -> Option<&'a JavaStr> {
        if self.finished {
            return None;
        }

        if !self.allow_trailing_empty {
            // First call from the back: pull one piece with trailing empties enabled and
            // discard it if it is empty, so a trailing separator does not yield "".
            self.allow_trailing_empty = true;
            match self.next_back() {
                Some(elt) if !elt.is_empty() => return Some(elt),
                _ => {
                    if self.finished {
                        return None;
                    }
                }
            }
        }

        match self.next_match_back() {
            Some((index, len)) => unsafe {
                // SAFETY: pattern guarantees valid indices
                let elt = self.haystack.get_unchecked(index + len..self.end);
                self.end = index;
                Some(elt)
            },
            None => unsafe {
                // SAFETY: `self.start` and `self.end` always lie on unicode boundaries.
                self.finished = true;
                Some(self.haystack.get_unchecked(self.start..self.end))
            },
        }
    }

    /// Returns the next piece from the back; the separator stays attached to the end of the
    /// piece that precedes it.
    #[inline]
    fn next_back_inclusive(&mut self) -> Option<&'a JavaStr> {
        if self.finished {
            return None;
        }

        if !self.allow_trailing_empty {
            self.allow_trailing_empty = true;
            match self.next_back_inclusive() {
                Some(elt) if !elt.is_empty() => return Some(elt),
                _ => {
                    if self.finished {
                        return None;
                    }
                }
            }
        }

        match self.next_match_back() {
            Some((index, len)) => unsafe {
                // SAFETY: pattern guarantees valid indices
                let elt = self.haystack.get_unchecked(index + len..self.end);
                self.end = index + len;
                Some(elt)
            },
            None => unsafe {
                // SAFETY: `self.start` and `self.end` always lie on unicode boundaries.
+ self.finished = true; + Some(self.haystack.get_unchecked(self.start..self.end)) + }, + } + } +} + +#[derive(Clone, Debug)] +pub struct Split<'a, P> { + inner: SplitHelper<'a, P>, +} + +impl<'a, P> Split<'a, P> +where + P: JavaStrPattern, +{ + #[inline] + pub(crate) fn new(haystack: &'a JavaStr, pat: P) -> Self { + Split { + inner: SplitHelper::new(haystack, pat, true), + } + } +} + +impl<'a, P> Iterator for Split<'a, P> +where + P: JavaStrPattern, +{ + type Item = &'a JavaStr; + + #[inline] + fn next(&mut self) -> Option { + self.inner.next() + } +} + +impl<'a, P> DoubleEndedIterator for Split<'a, P> +where + P: JavaStrPattern, +{ + #[inline] + fn next_back(&mut self) -> Option { + self.inner.next_back() + } +} + +impl<'a, P> FusedIterator for Split<'a, P> where P: JavaStrPattern {} + +#[derive(Clone, Debug)] +pub struct RSplit<'a, P> { + inner: SplitHelper<'a, P>, +} + +impl<'a, P> RSplit<'a, P> +where + P: JavaStrPattern, +{ + #[inline] + pub(crate) fn new(haystack: &'a JavaStr, pat: P) -> Self { + RSplit { + inner: SplitHelper::new(haystack, pat, true), + } + } +} + +impl<'a, P> Iterator for RSplit<'a, P> +where + P: JavaStrPattern, +{ + type Item = &'a JavaStr; + + #[inline] + fn next(&mut self) -> Option { + self.inner.next_back() + } +} + +impl<'a, P> DoubleEndedIterator for RSplit<'a, P> +where + P: JavaStrPattern, +{ + #[inline] + fn next_back(&mut self) -> Option { + self.inner.next() + } +} + +impl<'a, P> FusedIterator for RSplit<'a, P> where P: JavaStrPattern {} + +#[derive(Clone, Debug)] +pub struct SplitTerminator<'a, P> { + inner: SplitHelper<'a, P>, +} + +impl<'a, P> SplitTerminator<'a, P> +where + P: JavaStrPattern, +{ + #[inline] + pub(crate) fn new(haystack: &'a JavaStr, pat: P) -> Self { + SplitTerminator { + inner: SplitHelper::new(haystack, pat, false), + } + } +} + +impl<'a, P> Iterator for SplitTerminator<'a, P> +where + P: JavaStrPattern, +{ + type Item = &'a JavaStr; + + #[inline] + fn next(&mut self) -> Option { + self.inner.next() + } 
+} + +impl<'a, P> DoubleEndedIterator for SplitTerminator<'a, P> +where + P: JavaStrPattern, +{ + #[inline] + fn next_back(&mut self) -> Option { + self.inner.next_back() + } +} + +impl<'a, P> FusedIterator for SplitTerminator<'a, P> where P: JavaStrPattern {} + +#[derive(Clone, Debug)] +pub struct RSplitTerminator<'a, P> { + inner: SplitHelper<'a, P>, +} + +impl<'a, P> RSplitTerminator<'a, P> +where + P: JavaStrPattern, +{ + #[inline] + pub(crate) fn new(haystack: &'a JavaStr, pat: P) -> Self { + RSplitTerminator { + inner: SplitHelper::new(haystack, pat, false), + } + } +} + +impl<'a, P> Iterator for RSplitTerminator<'a, P> +where + P: JavaStrPattern, +{ + type Item = &'a JavaStr; + + #[inline] + fn next(&mut self) -> Option { + self.inner.next_back() + } +} + +impl<'a, P> DoubleEndedIterator for RSplitTerminator<'a, P> +where + P: JavaStrPattern, +{ + #[inline] + fn next_back(&mut self) -> Option { + self.inner.next() + } +} + +impl<'a, P> FusedIterator for RSplitTerminator<'a, P> where P: JavaStrPattern {} + +#[derive(Clone, Debug)] +pub struct SplitInclusive<'a, P> { + inner: SplitHelper<'a, P>, +} + +impl<'a, P> SplitInclusive<'a, P> +where + P: JavaStrPattern, +{ + #[inline] + pub(crate) fn new(haystack: &'a JavaStr, pat: P) -> Self { + SplitInclusive { + inner: SplitHelper::new(haystack, pat, false), + } + } +} + +impl<'a, P> Iterator for SplitInclusive<'a, P> +where + P: JavaStrPattern, +{ + type Item = &'a JavaStr; + + #[inline] + fn next(&mut self) -> Option { + self.inner.next_inclusive() + } +} + +impl<'a, P> DoubleEndedIterator for SplitInclusive<'a, P> +where + P: JavaStrPattern, +{ + #[inline] + fn next_back(&mut self) -> Option { + self.inner.next_back_inclusive() + } +} + +impl<'a, P> FusedIterator for SplitInclusive<'a, P> where P: JavaStrPattern {} + +#[derive(Clone, Debug)] +pub struct SplitN<'a, P> { + inner: SplitHelper<'a, P>, + count: usize, +} + +impl<'a, P> SplitN<'a, P> +where + P: JavaStrPattern, +{ + #[inline] + pub(crate) fn 
new(haystack: &'a JavaStr, pat: P, count: usize) -> Self { + SplitN { + inner: SplitHelper::new(haystack, pat, true), + count, + } + } +} + +impl<'a, P> Iterator for SplitN<'a, P> +where + P: JavaStrPattern, +{ + type Item = &'a JavaStr; + + #[inline] + fn next(&mut self) -> Option { + match self.count { + 0 => None, + 1 => { + self.count = 0; + self.inner.get_end() + } + _ => { + self.count -= 1; + self.inner.next() + } + } + } +} + +impl<'a, P> FusedIterator for SplitN<'a, P> where P: JavaStrPattern {} + +#[derive(Clone, Debug)] +pub struct RSplitN<'a, P> { + inner: SplitHelper<'a, P>, + count: usize, +} + +impl<'a, P> RSplitN<'a, P> +where + P: JavaStrPattern, +{ + #[inline] + pub(crate) fn new(haystack: &'a JavaStr, pat: P, count: usize) -> Self { + RSplitN { + inner: SplitHelper::new(haystack, pat, true), + count, + } + } +} + +impl<'a, P> Iterator for RSplitN<'a, P> +where + P: JavaStrPattern, +{ + type Item = &'a JavaStr; + + #[inline] + fn next(&mut self) -> Option { + match self.count { + 0 => None, + 1 => { + self.count = 0; + self.inner.get_end() + } + _ => { + self.count -= 1; + self.inner.next_back() + } + } + } +} + +impl<'a, P> FusedIterator for RSplitN<'a, P> where P: JavaStrPattern {} + +#[derive(Clone, Debug)] +pub struct SplitAsciiWhitespace<'a> { + #[allow(clippy::type_complexity)] + pub(crate) inner: Map< + Filter bool>, fn(&&[u8]) -> bool>, + fn(&[u8]) -> &JavaStr, + >, +} +delegate!(Iterator for SplitAsciiWhitespace<'a> => &'a JavaStr); +delegate!(DoubleEndedIterator for SplitAsciiWhitespace<'a>); +delegate!(FusedIterator for SplitAsciiWhitespace<'a>); + +#[derive(Clone, Debug)] +pub struct SplitWhitespace<'a> { + #[allow(clippy::type_complexity)] + pub(crate) inner: Filter bool>, fn(&&JavaStr) -> bool>, +} +delegate!(Iterator for SplitWhitespace<'a> => &'a JavaStr); +delegate!(DoubleEndedIterator for SplitWhitespace<'a>); +delegate!(FusedIterator for SplitWhitespace<'a>); diff --git a/crates/java_string/src/lib.rs 
b/crates/java_string/src/lib.rs new file mode 100644 index 000000000..57f035944 --- /dev/null +++ b/crates/java_string/src/lib.rs @@ -0,0 +1,27 @@ +#![doc = include_str!("../README.md")] + +mod cesu8; +mod char; +mod error; +mod iter; +mod owned; +mod pattern; +#[cfg(feature = "serde")] +mod serde; +mod slice; +pub(crate) mod validations; + +pub use cesu8::*; +pub use char::*; +pub use error::*; +pub use iter::*; +pub use owned::*; +pub use pattern::*; +pub use slice::*; + +#[macro_export] +macro_rules! format_java { + ($($arg:tt)*) => { + $crate::JavaString::from(::std::format!($($arg)*)) + } +} diff --git a/crates/java_string/src/owned.rs b/crates/java_string/src/owned.rs new file mode 100644 index 000000000..e03f82a7d --- /dev/null +++ b/crates/java_string/src/owned.rs @@ -0,0 +1,1401 @@ +use std::borrow::{Borrow, BorrowMut, Cow}; +use std::collections::{Bound, TryReserveError}; +use std::convert::Infallible; +use std::fmt::{Debug, Display, Formatter, Write}; +use std::hash::{Hash, Hasher}; +use std::iter::FusedIterator; +use std::ops::{ + Add, AddAssign, Deref, DerefMut, Index, IndexMut, Range, RangeBounds, RangeFrom, RangeFull, + RangeInclusive, RangeTo, RangeToInclusive, +}; +use std::rc::Rc; +use std::str::FromStr; +use std::sync::Arc; +use std::{ptr, slice}; + +use crate::validations::{ + run_utf8_full_validation_from_semi, run_utf8_semi_validation, to_range_checked, +}; +use crate::{Chars, FromUtf8Error, JavaCodePoint, JavaStr, Utf8Error}; + +#[derive(Default, PartialEq, PartialOrd, Eq, Ord)] +pub struct JavaString { + vec: Vec, +} + +impl JavaString { + #[inline] + #[must_use] + pub const fn new() -> JavaString { + JavaString { vec: Vec::new() } + } + + #[inline] + #[must_use] + pub fn with_capacity(capacity: usize) -> JavaString { + JavaString { + vec: Vec::with_capacity(capacity), + } + } + + /// Converts `vec` to a `JavaString` if it is fully-valid UTF-8, i.e. UTF-8 + /// without surrogate code points. See [String::from_utf8]. 
+    #[inline]
+    pub fn from_full_utf8(vec: Vec<u8>) -> Result<JavaString, FromUtf8Error> {
+        match std::str::from_utf8(&vec) {
+            Ok(..) => Ok(JavaString { vec }),
+            Err(e) => Err(FromUtf8Error {
+                bytes: vec,
+                error: e.into(),
+            }),
+        }
+    }
+
+    /// Converts `vec` to a `JavaString` if it is semi-valid UTF-8, i.e. UTF-8
+    /// that may additionally contain encoded surrogate code points.
+    ///
+    /// ```
+    /// # use java_string::{JavaCodePoint, JavaString};
+    ///
+    /// assert_eq!(
+    ///     JavaString::from_semi_utf8(b"Hello World!".to_vec()).unwrap(),
+    ///     "Hello World!"
+    /// );
+    /// assert_eq!(
+    ///     JavaString::from_semi_utf8(vec![0xf0, 0x9f, 0x92, 0x96]).unwrap(),
+    ///     "💖"
+    /// );
+    /// assert_eq!(
+    ///     JavaString::from_semi_utf8(vec![0xed, 0xa0, 0x80]).unwrap(),
+    ///     JavaString::from(JavaCodePoint::from_u32(0xd800).unwrap())
+    /// );
+    /// assert!(JavaString::from_semi_utf8(vec![0xed]).is_err());
+    /// ```
+    pub fn from_semi_utf8(vec: Vec<u8>) -> Result<JavaString, FromUtf8Error> {
+        match run_utf8_semi_validation(&vec) {
+            Ok(..) => Ok(JavaString { vec }),
+            Err(err) => Err(FromUtf8Error {
+                bytes: vec,
+                error: err,
+            }),
+        }
+    }
+
+    /// Converts `v` to a `Cow<JavaStr>`, replacing invalid semi-UTF-8 with the
+    /// replacement character �.
+    ///
+    /// ```
+    /// # use std::borrow::Cow;
+    /// # use java_string::{JavaStr, JavaString};
+    ///
+    /// let sparkle_heart = [0xf0, 0x9f, 0x92, 0x96];
+    /// let result = JavaString::from_semi_utf8_lossy(&sparkle_heart);
+    /// assert!(matches!(result, Cow::Borrowed(_)));
+    /// assert_eq!(result, JavaStr::from_str("💖"));
+    ///
+    /// let foobar_with_error = [b'f', b'o', b'o', 0xed, b'b', b'a', b'r'];
+    /// let result = JavaString::from_semi_utf8_lossy(&foobar_with_error);
+    /// assert!(matches!(result, Cow::Owned(_)));
+    /// assert_eq!(result, JavaStr::from_str("foo�bar"));
+    /// ```
+    #[must_use]
+    pub fn from_semi_utf8_lossy(v: &[u8]) -> Cow<'_, JavaStr> {
+        const REPLACEMENT: &str = "\u{FFFD}";
+
+        match run_utf8_semi_validation(v) {
+            Ok(()) => unsafe {
+                // SAFETY: validation succeeded for all of `v`
+                Cow::Borrowed(JavaStr::from_semi_utf8_unchecked(v))
+            },
+            Err(error) => {
+                let mut result = unsafe {
+                    // SAFETY: validation succeeded up to this index
+                    JavaString::from_semi_utf8_unchecked(
+                        v.get_unchecked(..error.valid_up_to).to_vec(),
+                    )
+                };
+                result.push_str(REPLACEMENT);
+                let mut index = error.valid_up_to + error.error_len.unwrap_or(1) as usize;
+                loop {
+                    match run_utf8_semi_validation(&v[index..]) {
+                        Ok(()) => {
+                            unsafe {
+                                // SAFETY: validation succeeded for the rest of `v`
+                                result
+                                    .push_java_str(JavaStr::from_semi_utf8_unchecked(&v[index..]));
+                            }
+                            return Cow::Owned(result);
+                        }
+                        Err(error) => {
+                            unsafe {
+                                // SAFETY: validation succeeded up to this index
+                                result.push_java_str(JavaStr::from_semi_utf8_unchecked(
+                                    v.get_unchecked(index..index + error.valid_up_to),
+                                ));
+                            }
+                            result.push_str(REPLACEMENT);
+                            index += error.valid_up_to + error.error_len.unwrap_or(1) as usize;
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    /// # Safety
+    ///
+    /// The parameter must be in semi-valid UTF-8 format, that is, UTF-8 plus
+    /// surrogate code points.
+    #[inline]
+    #[must_use]
+    pub unsafe fn from_semi_utf8_unchecked(bytes: Vec<u8>) -> JavaString {
+        JavaString { vec: bytes }
+    }
+
+    /// See [String::into_bytes].
+    #[inline]
+    #[must_use]
+    pub fn into_bytes(self) -> Vec<u8> {
+        self.vec
+    }
+
+    /// See [String::as_str].
+    #[inline]
+    #[must_use]
+    pub fn as_java_str(&self) -> &JavaStr {
+        unsafe {
+            // SAFETY: this str has semi-valid UTF-8
+            JavaStr::from_semi_utf8_unchecked(&self.vec)
+        }
+    }
+
+    /// See [String::as_mut_str].
+    #[inline]
+    #[must_use]
+    pub fn as_mut_java_str(&mut self) -> &mut JavaStr {
+        unsafe {
+            // SAFETY: this str has semi-valid UTF-8
+            JavaStr::from_semi_utf8_unchecked_mut(&mut self.vec)
+        }
+    }
+
+    /// Tries to convert this `JavaString` to a `String`, returning an error if
+    /// it is not fully valid UTF-8, i.e. if it contains surrogate code points.
+    ///
+    /// ```
+    /// # use java_string::{JavaCodePoint, JavaString};
+    ///
+    /// assert_eq!(
+    ///     JavaString::from("Hello World!").into_string().unwrap(),
+    ///     "Hello World!"
+    /// );
+    /// assert_eq!(
+    ///     JavaString::from("abc\0ℝ💣").into_string().unwrap(),
+    ///     "abc\0ℝ💣"
+    /// );
+    ///
+    /// let string_with_error = JavaString::from("abc")
+    ///     + JavaString::from(JavaCodePoint::from_u32(0xd800).unwrap()).as_java_str();
+    /// assert!(string_with_error.into_string().is_err());
+    /// ```
+    pub fn into_string(self) -> Result<String, Utf8Error> {
+        run_utf8_full_validation_from_semi(self.as_bytes()).map(|_| unsafe {
+            // SAFETY: validation succeeded
+            self.into_string_unchecked()
+        })
+    }
+
+    /// # Safety
+    ///
+    /// This string must be fully valid UTF-8, i.e. have no surrogate code
+    /// points.
+    #[inline]
+    #[must_use]
+    pub unsafe fn into_string_unchecked(self) -> String {
+        // SAFETY: preconditions checked by caller
+        String::from_utf8_unchecked(self.vec)
+    }
+
+    /// Like [String::push_str], but appends a (possibly semi-valid) [JavaStr].
+    #[inline]
+    pub fn push_java_str(&mut self, string: &JavaStr) {
+        self.vec.extend_from_slice(string.as_bytes())
+    }
+
+    /// See [String::push_str].
+    #[inline]
+    pub fn push_str(&mut self, string: &str) {
+        self.vec.extend_from_slice(string.as_bytes())
+    }
+
+    /// See [String::capacity].
+    #[inline]
+    #[must_use]
+    pub fn capacity(&self) -> usize {
+        self.vec.capacity()
+    }
+
+    /// See [String::reserve].
+    #[inline]
+    pub fn reserve(&mut self, additional: usize) {
+        self.vec.reserve(additional)
+    }
+
+    /// See [String::reserve_exact].
+    #[inline]
+    pub fn reserve_exact(&mut self, additional: usize) {
+        self.vec.reserve_exact(additional)
+    }
+
+    /// See [String::try_reserve].
+    #[inline]
+    pub fn try_reserve(&mut self, additional: usize) -> Result<(), TryReserveError> {
+        self.vec.try_reserve(additional)
+    }
+
+    /// See [String::try_reserve_exact].
+    #[inline]
+    pub fn try_reserve_exact(&mut self, additional: usize) -> Result<(), TryReserveError> {
+        self.vec.try_reserve_exact(additional)
+    }
+
+    /// See [String::shrink_to_fit].
+    #[inline]
+    pub fn shrink_to_fit(&mut self) {
+        self.vec.shrink_to_fit()
+    }
+
+    /// See [String::shrink_to].
+    #[inline]
+    pub fn shrink_to(&mut self, min_capacity: usize) {
+        self.vec.shrink_to(min_capacity)
+    }
+
+    /// See [String::push].
+    #[inline]
+    pub fn push(&mut self, ch: char) {
+        match ch.len_utf8() {
+            1 => self.vec.push(ch as u8),
+            _ => self
+                .vec
+                .extend_from_slice(ch.encode_utf8(&mut [0; 4]).as_bytes()),
+        }
+    }
+
+    /// Like [String::push], but accepts any [JavaCodePoint], including surrogates.
+    #[inline]
+    pub fn push_java(&mut self, ch: JavaCodePoint) {
+        match ch.len_utf8() {
+            1 => self.vec.push(ch.as_u32() as u8),
+            _ => self.vec.extend_from_slice(ch.encode_semi_utf8(&mut [0; 4])),
+        }
+    }
+
+    /// See [String::as_bytes].
+    #[inline]
+    #[must_use]
+    pub fn as_bytes(&self) -> &[u8] {
+        &self.vec
+    }
+
+    /// See [String::truncate].
+    #[inline]
+    pub fn truncate(&mut self, new_len: usize) {
+        if new_len <= self.len() {
+            assert!(self.is_char_boundary(new_len));
+            self.vec.truncate(new_len)
+        }
+    }
+
+    /// See [String::pop].
+    ///
+    /// ```
+    /// # use java_string::JavaString;
+    ///
+    /// let mut str = JavaString::from("Hello World!");
+    /// assert_eq!(str.pop().unwrap(), '!');
+    /// assert_eq!(str, "Hello World");
+    ///
+    /// let mut str = JavaString::from("東京");
+    /// assert_eq!(str.pop().unwrap(), '京');
+    /// assert_eq!(str, "東");
+    ///
+    /// assert!(JavaString::new().pop().is_none());
+    /// ```
+    #[inline]
+    pub fn pop(&mut self) -> Option<JavaCodePoint> {
+        let ch = self.chars().next_back()?;
+        let newlen = self.len() - ch.len_utf8();
+        unsafe {
+            self.vec.set_len(newlen);
+        }
+        Some(ch)
+    }
+
+    /// See [String::remove].
+    ///
+    /// ```
+    /// # use java_string::JavaString;
+    ///
+    /// let mut str = JavaString::from("Hello World!");
+    /// assert_eq!(str.remove(5), ' ');
+    /// assert_eq!(str, "HelloWorld!");
+    ///
+    /// let mut str = JavaString::from("Hello 🦀 World!");
+    /// assert_eq!(str.remove(6), '🦀');
+    /// assert_eq!(str, "Hello  World!");
+    /// ```
+    /// ```should_panic
+    /// # use java_string::JavaString;
+    /// // Should panic
+    /// JavaString::new().remove(0);
+    /// ```
+    /// ```should_panic
+    /// # use java_string::JavaString;
+    /// // Should panic
+    /// JavaString::from("🦀").remove(1);
+    /// ```
+    #[inline]
+    pub fn remove(&mut self, idx: usize) -> JavaCodePoint {
+        let ch = match self[idx..].chars().next() {
+            Some(ch) => ch,
+            None => panic!("cannot remove a char from the end of a string"),
+        };
+
+        let next = idx + ch.len_utf8();
+        let len = self.len();
+        unsafe {
+            ptr::copy(
+                self.vec.as_ptr().add(next),
+                self.vec.as_mut_ptr().add(idx),
+                len - next,
+            );
+            self.vec.set_len(len - (next - idx));
+        }
+        ch
+    }
+
+    /// See [String::retain].
+    ///
+    /// ```
+    /// # use java_string::{JavaCodePoint, JavaString};
+    ///
+    /// let mut str = JavaString::from("Hello 🦀 World!");
+    /// str.retain(|ch| !ch.is_ascii_uppercase());
+    /// assert_eq!(str, "ello 🦀 orld!");
+    /// str.retain(JavaCodePoint::is_ascii);
+    /// assert_eq!(str, "ello  orld!");
+    /// ```
+    #[inline]
+    pub fn retain<F>(&mut self, mut f: F)
+    where
+        F: FnMut(JavaCodePoint) -> bool,
+    {
+        struct SetLenOnDrop<'a> {
+            s: &'a mut JavaString,
+            idx: usize,
+            del_bytes: usize,
+        }
+
+        impl<'a> Drop for SetLenOnDrop<'a> {
+            #[inline]
+            fn drop(&mut self) {
+                let new_len = self.idx - self.del_bytes;
+                debug_assert!(new_len <= self.s.len());
+                unsafe { self.s.vec.set_len(new_len) };
+            }
+        }
+
+        let len = self.len();
+        let mut guard = SetLenOnDrop {
+            s: self,
+            idx: 0,
+            del_bytes: 0,
+        };
+
+        while guard.idx < len {
+            // SAFETY: `guard.idx` is positive-or-zero and less than len so the
+            // `get_unchecked` is in bounds. `self` is a semi-valid UTF-8 string
+            // and the returned slice starts at a unicode code point so the
+            // `Chars` always return one character.
+            let ch = unsafe {
+                guard
+                    .s
+                    .get_unchecked(guard.idx..len)
+                    .chars()
+                    .next()
+                    .unwrap_unchecked()
+            };
+            let ch_len = ch.len_utf8();
+
+            if !f(ch) {
+                guard.del_bytes += ch_len;
+            } else if guard.del_bytes > 0 {
+                // SAFETY: `guard.idx` is in bound and `guard.del_bytes` represent the number of
+                // bytes that are erased from the string so the resulting `guard.idx -
+                // guard.del_bytes` always represent a valid unicode code point.
+                //
+                // `guard.del_bytes` >= `ch.len_utf8()`, so taking a slice with `ch.len_utf8()`
+                // len is safe.
+                ch.encode_semi_utf8(unsafe {
+                    slice::from_raw_parts_mut(
+                        guard.s.as_mut_ptr().add(guard.idx - guard.del_bytes),
+                        ch.len_utf8(),
+                    )
+                });
+            }
+
+            // Point idx to the next char
+            guard.idx += ch_len;
+        }
+
+        drop(guard);
+    }
+
+    /// See [String::insert].
+    ///
+    /// ```
+    /// # use java_string::JavaString;
+    /// let mut s = JavaString::from("foo");
+    /// s.insert(3, 'a');
+    /// s.insert(4, 'r');
+    /// s.insert(3, 'b');
+    /// assert_eq!(s, "foobar");
+    /// ```
+    #[inline]
+    pub fn insert(&mut self, idx: usize, ch: char) {
+        assert!(self.is_char_boundary(idx));
+        let mut bits = [0; 4];
+        let bits = ch.encode_utf8(&mut bits).as_bytes();
+
+        unsafe {
+            self.insert_bytes(idx, bits);
+        }
+    }
+
+    /// Like [String::insert], but accepts any [JavaCodePoint], including surrogates.
+    #[inline]
+    pub fn insert_java(&mut self, idx: usize, ch: JavaCodePoint) {
+        assert!(self.is_char_boundary(idx));
+        let mut bits = [0; 4];
+        let bits = ch.encode_semi_utf8(&mut bits);
+
+        unsafe {
+            self.insert_bytes(idx, bits);
+        }
+    }
+
+    #[inline]
+    unsafe fn insert_bytes(&mut self, idx: usize, bytes: &[u8]) {
+        let len = self.len();
+        let amt = bytes.len();
+        self.vec.reserve(amt);
+
+        unsafe {
+            ptr::copy(
+                self.vec.as_ptr().add(idx),
+                self.vec.as_mut_ptr().add(idx + amt),
+                len - idx,
+            );
+            ptr::copy_nonoverlapping(bytes.as_ptr(), self.vec.as_mut_ptr().add(idx), amt);
+            self.vec.set_len(len + amt);
+        }
+    }
+
+    /// See [String::insert_str].
+    ///
+    /// ```
+    /// # use java_string::JavaString;
+    /// let mut s = JavaString::from("bar");
+    /// s.insert_str(0, "foo");
+    /// assert_eq!(s, "foobar");
+    /// ```
+    #[inline]
+    pub fn insert_str(&mut self, idx: usize, string: &str) {
+        assert!(self.is_char_boundary(idx));
+
+        unsafe {
+            self.insert_bytes(idx, string.as_bytes());
+        }
+    }
+
+    /// Like [String::insert_str], but inserts a (possibly semi-valid) [JavaStr].
+    pub fn insert_java_str(&mut self, idx: usize, string: &JavaStr) {
+        assert!(self.is_char_boundary(idx));
+
+        unsafe {
+            self.insert_bytes(idx, string.as_bytes());
+        }
+    }
+
+    /// See [String::as_mut_vec].
+    ///
+    /// # Safety
+    ///
+    /// The returned `Vec` must not have invalid UTF-8 written to it, besides
+    /// surrogate code points.
+    #[inline]
+    pub unsafe fn as_mut_vec(&mut self) -> &mut Vec<u8> {
+        &mut self.vec
+    }
+
+    /// See [String::len].
+    #[inline]
+    #[must_use]
+    pub fn len(&self) -> usize {
+        self.vec.len()
+    }
+
+    /// See [String::is_empty].
+    #[inline]
+    #[must_use]
+    pub fn is_empty(&self) -> bool {
+        self.len() == 0
+    }
+
+    /// See [String::split_off].
+    ///
+    /// ```
+    /// # use java_string::JavaString;
+    /// let mut hello = JavaString::from("Hello World!");
+    /// let world = hello.split_off(6);
+    /// assert_eq!(hello, "Hello ");
+    /// assert_eq!(world, "World!");
+    /// ```
+    /// ```should_panic
+    /// # use java_string::JavaString;
+    /// let mut s = JavaString::from("🦀");
+    /// // Should panic
+    /// let _ = s.split_off(1);
+    /// ```
+    #[inline]
+    #[must_use]
+    pub fn split_off(&mut self, at: usize) -> JavaString {
+        assert!(self.is_char_boundary(at));
+        let other = self.vec.split_off(at);
+        unsafe { JavaString::from_semi_utf8_unchecked(other) }
+    }
+
+    /// See [String::clear].
+    #[inline]
+    pub fn clear(&mut self) {
+        self.vec.clear();
+    }
+
+    /// See [String::drain].
+    ///
+    /// ```
+    /// # use java_string::JavaString;
+    ///
+    /// let mut s = JavaString::from("α is alpha, β is beta");
+    /// let beta_offset = s.find('β').unwrap_or(s.len());
+    ///
+    /// // Remove the range up until the β from the string
+    /// let t: JavaString = s.drain(..beta_offset).collect();
+    /// assert_eq!(t, "α is alpha, ");
+    /// assert_eq!(s, "β is beta");
+    ///
+    /// // A full range clears the string, like `clear()` does
+    /// s.drain(..);
+    /// assert_eq!(s, "");
+    /// ```
+    #[inline]
+    pub fn drain<R>(&mut self, range: R) -> Drain<'_>
+    where
+        R: RangeBounds<usize>,
+    {
+        // Memory safety: see String::drain
+        let Range { start, end } = to_range_checked(range, ..self.len());
+        assert!(self.is_char_boundary(start));
+        assert!(self.is_char_boundary(end));
+
+        // Take out two simultaneous borrows. The &mut JavaString won't be accessed
+        // until iteration is over, in Drop.
+        let self_ptr = self as *mut _;
+        // SAFETY: `to_range_checked` and `is_char_boundary` do the appropriate bounds
+        // checks.
+        let chars_iter = unsafe { self.get_unchecked(start..end) }.chars();
+
+        Drain {
+            start,
+            end,
+            iter: chars_iter,
+            string: self_ptr,
+        }
+    }
+
+    /// See [String::replace_range].
+    ///
+    /// ```
+    /// # use java_string::JavaString;
+    ///
+    /// let mut s = JavaString::from("α is alpha, β is beta");
+    /// let beta_offset = s.find('β').unwrap_or(s.len());
+    ///
+    /// // Replace the range up until the β from the string
+    /// s.replace_range(..beta_offset, "Α is capital alpha; ");
+    /// assert_eq!(s, "Α is capital alpha; β is beta");
+    /// ```
+    /// ```should_panic
+    /// # use java_string::JavaString;
+    /// let mut s = JavaString::from("α is alpha, β is beta");
+    /// // Should panic
+    /// s.replace_range(..1, "Α is capital alpha; ");
+    /// ```
+    pub fn replace_range<R>(&mut self, range: R, replace_with: &str)
+    where
+        R: RangeBounds<usize>,
+    {
+        self.replace_range_java(range, JavaStr::from_str(replace_with))
+    }
+
+    /// Like [String::replace_range], but the replacement is a [JavaStr].
+    pub fn replace_range_java<R>(&mut self, range: R, replace_with: &JavaStr)
+    where
+        R: RangeBounds<usize>,
+    {
+        let start = range.start_bound();
+        match start {
+            Bound::Included(&n) => assert!(self.is_char_boundary(n)),
+            Bound::Excluded(&n) => assert!(self.is_char_boundary(n + 1)),
+            Bound::Unbounded => {}
+        };
+        let end = range.end_bound();
+        match end {
+            Bound::Included(&n) => assert!(self.is_char_boundary(n + 1)),
+            Bound::Excluded(&n) => assert!(self.is_char_boundary(n)),
+            Bound::Unbounded => {}
+        };
+
+        unsafe { self.as_mut_vec() }.splice((start, end), replace_with.bytes());
+    }
+
+    /// See [String::into_boxed_str].
+    #[inline]
+    #[must_use]
+    pub fn into_boxed_str(self) -> Box<JavaStr> {
+        let slice = self.vec.into_boxed_slice();
+        unsafe { JavaStr::from_boxed_semi_utf8_unchecked(slice) }
+    }
+
+    /// See [String::leak].
+    #[inline]
+    pub fn leak<'a>(self) -> &'a mut JavaStr {
+        let slice = self.vec.leak();
+        unsafe { JavaStr::from_semi_utf8_unchecked_mut(slice) }
+    }
+}
+
+impl Add<&str> for JavaString {
+    type Output = JavaString;
+
+    #[inline]
+    fn add(mut self, rhs: &str) -> Self::Output {
+        self.push_str(rhs);
+        self
+    }
+}
+
+impl Add<&JavaStr> for JavaString {
+    type Output = JavaString;
+
+    #[inline]
+    fn add(mut self, rhs: &JavaStr) -> Self::Output {
+        self.push_java_str(rhs);
+        self
+    }
+}
+
+impl AddAssign<&str> for JavaString {
+    #[inline]
+    fn add_assign(&mut self, rhs: &str) {
+        self.push_str(rhs);
+    }
+}
+
+impl AddAssign<&JavaStr> for JavaString {
+    #[inline]
+    fn add_assign(&mut self, rhs: &JavaStr) {
+        self.push_java_str(rhs);
+    }
+}
+
+impl AsMut<JavaStr> for JavaString {
+    #[inline]
+    fn as_mut(&mut self) -> &mut JavaStr {
+        self.as_mut_java_str()
+    }
+}
+
+impl AsRef<[u8]> for JavaString {
+    #[inline]
+    fn as_ref(&self) -> &[u8] {
+        self.as_bytes()
+    }
+}
+
+impl AsRef<JavaStr> for JavaString {
+    #[inline]
+    fn as_ref(&self) -> &JavaStr {
+        self.as_java_str()
+    }
+}
+
+impl Borrow<JavaStr> for JavaString {
+    #[inline]
+    fn borrow(&self) -> &JavaStr {
+        self.as_java_str()
+    }
+}
+
+impl BorrowMut<JavaStr> for JavaString {
+    #[inline]
+    fn borrow_mut(&mut self) -> &mut JavaStr {
+        self.as_mut_java_str()
+    }
+}
+
+impl Clone for JavaString {
+    #[inline]
+    fn clone(&self) -> Self {
+        JavaString {
+            vec: self.vec.clone(),
+        }
+    }
+
+    #[inline]
+    fn clone_from(&mut self, source: &Self) {
+        self.vec.clone_from(&source.vec)
+    }
+}
+
+impl Debug for JavaString {
+    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
+        Debug::fmt(&**self, f)
+    }
+}
+
+impl Deref for JavaString {
+    type Target = JavaStr;
+
+    #[inline]
+    fn deref(&self) -> &Self::Target {
+        self.as_java_str()
+    }
+}
+
+impl DerefMut for JavaString {
+    #[inline]
+    fn deref_mut(&mut self) -> &mut Self::Target {
+        self.as_mut_java_str()
+    }
+}
+
+impl Display for JavaString {
+    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
+        Display::fmt(&**self, f)
+    }
+}
+
+impl Extend<char> for JavaString {
+    fn extend<T: IntoIterator<Item = char>>(&mut self, iter: T) {
+        let iterator = iter.into_iter();
+        let (lower_bound, _) = iterator.size_hint();
+        self.reserve(lower_bound);
+        iterator.for_each(move |c| self.push(c));
+    }
+}
+
+impl Extend<JavaCodePoint> for JavaString {
+    fn extend<T: IntoIterator<Item = JavaCodePoint>>(&mut self, iter: T) {
+        let iterator = iter.into_iter();
+        let (lower_bound, _) = iterator.size_hint();
+        self.reserve(lower_bound);
+        iterator.for_each(move |c| self.push_java(c));
+    }
+}
+
+impl Extend<String> for JavaString {
+    fn extend<T: IntoIterator<Item = String>>(&mut self, iter: T) {
+        iter.into_iter().for_each(move |s| self.push_str(&s));
+    }
+}
+
+impl Extend<JavaString> for JavaString {
+    fn extend<T: IntoIterator<Item = JavaString>>(&mut self, iter: T) {
+        iter.into_iter().for_each(move |s| self.push_java_str(&s));
+    }
+}
+
+impl<'a> Extend<&'a char> for JavaString {
+    fn extend<T: IntoIterator<Item = &'a char>>(&mut self, iter: T) {
+        self.extend(iter.into_iter().cloned())
+    }
+}
+
+impl<'a> Extend<&'a JavaCodePoint> for JavaString {
+    fn extend<T: IntoIterator<Item = &'a JavaCodePoint>>(&mut self, iter: T) {
+        self.extend(iter.into_iter().cloned())
+    }
+}
+
+impl<'a> Extend<&'a str> for JavaString {
+    fn extend<T: IntoIterator<Item = &'a str>>(&mut self, iter: T) {
+        iter.into_iter().for_each(move |s| self.push_str(s));
+    }
+}
+
+impl<'a> Extend<&'a JavaStr> for JavaString {
+    fn extend<T: IntoIterator<Item = &'a JavaStr>>(&mut self, iter: T) {
+        iter.into_iter().for_each(move |s| self.push_java_str(s));
+    }
+}
+
+impl Extend<Box<str>> for JavaString {
+    fn extend<T: IntoIterator<Item = Box<str>>>(&mut self, iter: T) {
+        iter.into_iter().for_each(move |s| self.push_str(&s));
+    }
+}
+
+impl Extend<Box<JavaStr>> for JavaString {
+    fn extend<T: IntoIterator<Item = Box<JavaStr>>>(&mut self, iter: T) {
+        iter.into_iter().for_each(move |s| self.push_java_str(&s));
+    }
+}
+
+impl<'a> Extend<Cow<'a, str>> for JavaString {
+    fn extend<T: IntoIterator<Item = Cow<'a, str>>>(&mut self, iter: T) {
+        iter.into_iter().for_each(move |s| self.push_str(&s));
+    }
+}
+
+impl<'a> Extend<Cow<'a, JavaStr>> for JavaString {
+    fn extend<T: IntoIterator<Item = Cow<'a, JavaStr>>>(&mut self, iter: T) {
+        iter.into_iter().for_each(move |s| self.push_java_str(&s));
+    }
+}
+
+impl From<String> for JavaString {
+    #[inline]
+    fn from(value: String) -> Self {
+        unsafe {
+            // SAFETY: value is valid UTF-8
+            JavaString::from_semi_utf8_unchecked(value.into_bytes())
+        }
+    }
+}
+
+impl From<&String> for JavaString {
+    #[inline]
+    fn from(value: &String) -> Self {
+        Self::from(value.clone())
+    }
+}
+
+impl From<&JavaString> for JavaString {
+    #[inline]
+    fn from(value: &JavaString) -> Self {
+        value.clone()
+    }
+}
+
+impl From<&mut str> for JavaString {
+    #[inline]
+    fn from(value: &mut str) -> Self {
+        Self::from(&*value)
+    }
+}
+
+impl From<&str> for JavaString {
+    #[inline]
+    fn from(value: &str) -> Self {
+        Self::from(value.to_owned())
+    }
+}
+
+impl From<&mut JavaStr> for JavaString {
+    #[inline]
+    fn from(value: &mut JavaStr) -> Self {
+        Self::from(&*value)
+    }
+}
+
+impl From<&JavaStr> for JavaString {
+    #[inline]
+    fn from(value: &JavaStr) -> Self {
+        value.to_owned()
+    }
+}
+
+impl From<Box<str>> for JavaString {
+    #[inline]
+    fn from(value: Box<str>) -> Self {
+        Self::from(value.into_string())
+    }
+}
+
+impl From<Box<JavaStr>> for JavaString {
+    #[inline]
+    fn from(value: Box<JavaStr>) -> Self {
+        value.into_string()
+    }
+}
+
+impl<'a> From<Cow<'a, str>> for JavaString {
+    #[inline]
+    fn from(value: Cow<'a, str>) -> Self {
+        Self::from(value.into_owned())
+    }
+}
+
+impl<'a> From<Cow<'a, JavaStr>> for JavaString {
+    #[inline]
+    fn from(value: Cow<'a, JavaStr>) -> Self {
+        value.into_owned()
+    }
+}
+
+impl From<JavaString> for Arc<JavaStr> {
+    #[inline]
+    fn from(value: JavaString) -> Self {
+        Arc::from(&value[..])
+    }
+}
+
+impl<'a> From<JavaString> for Cow<'a, JavaStr> {
+    #[inline]
+    fn from(value: JavaString) -> Self {
+        Cow::Owned(value)
+    }
+}
+
+impl From<JavaString> for Rc<JavaStr> {
+    #[inline]
+    fn from(value: JavaString) -> Self {
+        Rc::from(&value[..])
+    }
+}
+
+impl From<JavaString> for Vec<u8> {
+    #[inline]
+    fn from(value: JavaString) -> Self {
+        value.into_bytes()
+    }
+}
+
+impl From<char> for JavaString {
+    #[inline]
+    fn from(value: char) -> Self {
+        Self::from(value.encode_utf8(&mut [0; 4]))
+    }
+}
+
+impl From<JavaCodePoint> for JavaString {
+    #[inline]
+    fn from(value: JavaCodePoint) -> Self {
+        unsafe {
+            // SAFETY: we're encoding into semi-valid UTF-8
+            JavaString::from_semi_utf8_unchecked(value.encode_semi_utf8(&mut [0; 4]).to_vec())
+        }
+    }
+}
+
+impl FromIterator<char> for JavaString {
+    #[inline]
+    fn from_iter<T: IntoIterator<Item = char>>(iter: T) -> Self {
+        let mut buf = JavaString::new();
+        buf.extend(iter);
+        buf
+    }
+}
+
+impl<'a> FromIterator<&'a char> for JavaString {
+    #[inline]
+    fn from_iter<T: IntoIterator<Item = &'a char>>(iter: T) -> Self {
+        let mut buf = JavaString::new();
+        buf.extend(iter);
+        buf
+    }
+}
+
+impl FromIterator<JavaCodePoint> for JavaString {
+    #[inline]
+    fn from_iter<T: IntoIterator<Item = JavaCodePoint>>(iter: T) -> Self {
+        let mut buf = JavaString::new();
+        buf.extend(iter);
+        buf
+    }
+}
+
+impl<'a> FromIterator<&'a JavaCodePoint> for JavaString {
+    #[inline]
+    fn from_iter<T: IntoIterator<Item = &'a JavaCodePoint>>(iter: T) -> Self {
+        let mut buf = JavaString::new();
+        buf.extend(iter);
+        buf
+    }
+}
+
+impl<'a> FromIterator<&'a str> for JavaString {
+    #[inline]
+    fn from_iter<T: IntoIterator<Item = &'a str>>(iter: T) -> Self {
+        let mut buf = JavaString::new();
+        buf.extend(iter);
+        buf
+    }
+}
+
+impl FromIterator<String> for JavaString {
+    fn from_iter<T: IntoIterator<Item = String>>(iter: T) -> Self {
+        let mut iterator = iter.into_iter();
+
+        match iterator.next() {
+            None => JavaString::new(),
+            Some(buf) => {
+                let mut buf = JavaString::from(buf);
+                buf.extend(iterator);
+                buf
+            }
+        }
+    }
+}
+
+impl FromIterator<JavaString> for JavaString {
+    fn from_iter<T: IntoIterator<Item = JavaString>>(iter: T) -> Self {
+        let mut iterator = iter.into_iter();
+
+        match iterator.next() {
+            None => JavaString::new(),
+            Some(mut buf) => {
+                buf.extend(iterator);
+                buf
+            }
+        }
+    }
+}
+
+impl FromIterator<Box<str>> for JavaString {
+    #[inline]
+    fn from_iter<T: IntoIterator<Item = Box<str>>>(iter: T) -> Self {
+        let mut buf = JavaString::new();
+        buf.extend(iter);
+        buf
+    }
+}
+
+impl FromIterator<Box<JavaStr>> for JavaString {
+    #[inline]
+    fn from_iter<T: IntoIterator<Item = Box<JavaStr>>>(iter: T) -> Self {
+        let mut buf = JavaString::new();
+        buf.extend(iter);
+        buf
+    }
+}
+
+impl<'a> FromIterator<Cow<'a, str>> for JavaString {
+    #[inline]
+    fn from_iter<T: IntoIterator<Item = Cow<'a, str>>>(iter: T) -> Self {
+        let mut buf = JavaString::new();
+        buf.extend(iter);
+        buf
+    }
+}
+
+impl<'a> FromIterator<Cow<'a, JavaStr>> for JavaString {
+    #[inline]
+    fn from_iter<T: IntoIterator<Item = Cow<'a, JavaStr>>>(iter: T) -> Self {
+        let mut buf = JavaString::new();
+        buf.extend(iter);
+        buf
+    }
+}
+
+impl FromStr for JavaString {
+    type Err = Infallible;
+
+    #[inline]
+    fn from_str(s: &str) -> Result<Self, Self::Err> {
+        Ok(Self::from(s))
+    }
+}
+
+impl Hash for JavaString {
+    #[inline]
+    fn hash<H: Hasher>(&self, state: &mut H) {
+        (**self).hash(state)
+    }
+}
+
+impl Index<Range<usize>> for JavaString {
+    type Output = JavaStr;
+
+    #[inline]
+    fn index(&self, index: Range<usize>) -> &Self::Output {
+        &self[..][index]
+    }
+}
+
+impl Index<RangeFrom<usize>> for JavaString {
+    type Output = JavaStr;
+
+    #[inline]
+    fn index(&self, index: RangeFrom<usize>) -> &Self::Output {
+        &self[..][index]
+    }
+}
+
+impl Index<RangeFull> for JavaString {
+    type Output = JavaStr;
+
+    #[inline]
+    fn index(&self, _index: RangeFull) -> &Self::Output {
+        self.as_java_str()
+    }
+}
+
+impl Index<RangeInclusive<usize>> for JavaString {
+    type Output = JavaStr;
+
+    #[inline]
+    fn index(&self, index: RangeInclusive<usize>) -> &Self::Output {
+        &self[..][index]
+    }
+}
+
+impl Index<RangeTo<usize>> for JavaString {
+    type Output = JavaStr;
+
+    #[inline]
+    fn index(&self, index: RangeTo<usize>) -> &Self::Output {
+        &self[..][index]
+    }
+}
+
+impl Index<RangeToInclusive<usize>> for JavaString {
+    type Output = JavaStr;
+
+    #[inline]
+    fn index(&self, index: RangeToInclusive<usize>) -> &Self::Output {
+        &self[..][index]
+    }
+}
+
+impl IndexMut<Range<usize>> for JavaString {
+    #[inline]
+    fn index_mut(&mut self, index: Range<usize>) -> &mut Self::Output {
+        &mut self[..][index]
+    }
+}
+
+impl IndexMut<RangeFrom<usize>> for JavaString {
+    #[inline]
+    fn index_mut(&mut self, index: RangeFrom<usize>) -> &mut Self::Output {
+        &mut self[..][index]
+    }
+}
+
+impl IndexMut<RangeFull> for JavaString {
+    #[inline]
+    fn index_mut(&mut self, _index: RangeFull) -> &mut Self::Output {
+        self.as_mut_java_str()
+    }
+}
+
+impl IndexMut<RangeInclusive<usize>> for JavaString {
+    #[inline]
+    fn index_mut(&mut self, index: RangeInclusive<usize>) -> &mut Self::Output {
+        &mut self[..][index]
+    }
+}
+
+impl IndexMut<RangeTo<usize>> for JavaString {
+    #[inline]
+    fn index_mut(&mut self, index: RangeTo<usize>) -> &mut Self::Output {
+        &mut self[..][index]
+    }
+}
+
+impl IndexMut<RangeToInclusive<usize>> for JavaString {
+    #[inline]
+    fn index_mut(&mut self, index: RangeToInclusive<usize>) -> &mut Self::Output {
+        &mut self[..][index]
+    }
+}
+
+impl PartialEq<str> for JavaString {
+    #[inline]
+    fn eq(&self, other: &str) -> bool {
+        self[..] == other
+    }
+}
+
+impl PartialEq<JavaString> for str {
+    #[inline]
+    fn eq(&self, other: &JavaString) -> bool {
+        self == other[..]
+    }
+}
+
+impl<'a> PartialEq<&'a str> for JavaString {
+    #[inline]
+    fn eq(&self, other: &&'a str) -> bool {
+        self == *other
+    }
+}
+
+impl<'a> PartialEq<JavaString> for &'a str {
+    #[inline]
+    fn eq(&self, other: &JavaString) -> bool {
+        *self == other
+    }
+}
+
+impl PartialEq<String> for JavaString {
+    #[inline]
+    fn eq(&self, other: &String) -> bool {
+        &self[..] == other
+    }
+}
+
+impl PartialEq<JavaString> for String {
+    #[inline]
+    fn eq(&self, other: &JavaString) -> bool {
+        self == &other[..]
+    }
+}
+
+impl PartialEq<JavaStr> for JavaString {
+    #[inline]
+    fn eq(&self, other: &JavaStr) -> bool {
+        self[..] == other
+    }
+}
+
+impl<'a> PartialEq<&'a JavaStr> for JavaString {
+    #[inline]
+    fn eq(&self, other: &&'a JavaStr) -> bool {
+        self == *other
+    }
+}
+
+impl<'a> PartialEq<Cow<'a, str>> for JavaString {
+    #[inline]
+    fn eq(&self, other: &Cow<'a, str>) -> bool {
+        &self[..] == other
+    }
+}
+
+impl<'a> PartialEq<JavaString> for Cow<'a, str> {
+    #[inline]
+    fn eq(&self, other: &JavaString) -> bool {
+        self == &other[..]
+    }
+}
+
+impl<'a> PartialEq<Cow<'a, JavaStr>> for JavaString {
+    #[inline]
+    fn eq(&self, other: &Cow<'a, JavaStr>) -> bool {
+        &self[..] == other
+    }
+}
+
+impl<'a> PartialEq<JavaString> for Cow<'a, JavaStr> {
+    #[inline]
+    fn eq(&self, other: &JavaString) -> bool {
+        self == &other[..]
+    }
+}
+
+impl Write for JavaString {
+    #[inline]
+    fn write_str(&mut self, s: &str) -> std::fmt::Result {
+        self.push_str(s);
+        Ok(())
+    }
+
+    #[inline]
+    fn write_char(&mut self, c: char) -> std::fmt::Result {
+        self.push(c);
+        Ok(())
+    }
+}
+
+pub struct Drain<'a> {
+    string: *mut JavaString,
+    start: usize,
+    end: usize,
+    iter: Chars<'a>,
+}
+
+impl Debug for Drain<'_> {
+    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
+        f.debug_tuple("Drain").field(&self.as_str()).finish()
+    }
+}
+
+unsafe impl Sync for Drain<'_> {}
+unsafe impl Send for Drain<'_> {}
+
+impl Drop for Drain<'_> {
+    #[inline]
+    fn drop(&mut self) {
+        unsafe {
+            // Use Vec::drain. "Reaffirm" the bounds checks to avoid
+            // panic code being inserted again.
+            let self_vec = (*self.string).as_mut_vec();
+            if self.start <= self.end && self.end <= self_vec.len() {
+                self_vec.drain(self.start..self.end);
+            }
+        }
+    }
+}
+
+impl AsRef<JavaStr> for Drain<'_> {
+    #[inline]
+    fn as_ref(&self) -> &JavaStr {
+        self.as_str()
+    }
+}
+
+impl AsRef<[u8]> for Drain<'_> {
+    #[inline]
+    fn as_ref(&self) -> &[u8] {
+        self.as_str().as_bytes()
+    }
+}
+
+impl Drain<'_> {
+    #[inline]
+    #[must_use]
+    pub fn as_str(&self) -> &JavaStr {
+        self.iter.as_str()
+    }
+}
+
+impl Iterator for Drain<'_> {
+    type Item = JavaCodePoint;
+
+    #[inline]
+    fn next(&mut self) -> Option<JavaCodePoint> {
+        self.iter.next()
+    }
+
+    #[inline]
+    fn size_hint(&self) -> (usize, Option<usize>) {
+        self.iter.size_hint()
+    }
+
+    #[inline]
+    fn last(mut self) -> Option<JavaCodePoint> {
+        self.next_back()
+    }
+}
+
+impl DoubleEndedIterator for Drain<'_> {
+    #[inline]
+    fn next_back(&mut self) -> Option<JavaCodePoint> {
+        self.iter.next_back()
+    }
+}
+
+impl FusedIterator for Drain<'_> {}
diff --git a/crates/java_string/src/pattern.rs b/crates/java_string/src/pattern.rs
new file mode 100644
index 000000000..06cc78041
--- /dev/null
+++ b/crates/java_string/src/pattern.rs
@@ -0,0 +1,402 @@
+use crate::{JavaCodePoint, JavaStr};
+
+mod private_pattern {
+    use crate::{JavaCodePoint, JavaStr};
+
+    pub trait Sealed {}
+
+    impl Sealed for char {}
+    impl Sealed for JavaCodePoint {}
+    impl Sealed for &str {}
+    impl Sealed for &JavaStr {}
+    impl<F> Sealed for F where F: FnMut(JavaCodePoint) -> bool {}
+    impl Sealed for &[char] {}
+    impl Sealed for &[JavaCodePoint] {}
+    impl Sealed for &char {}
+    impl Sealed for &JavaCodePoint {}
+    impl Sealed for &&str {}
+    impl Sealed for &&JavaStr {}
+}
+
+/// # Safety
+///
+/// Methods in this trait must only return indexes that are on char boundaries
+pub unsafe trait JavaStrPattern: private_pattern::Sealed {
+    fn prefix_len_in(&mut self, haystack: &JavaStr) -> Option<usize>;
+    fn suffix_len_in(&mut self, haystack: &JavaStr) -> Option<usize>;
+    fn find_in(&mut self, haystack: &JavaStr) -> Option<(usize, usize)>;
+    fn rfind_in(&mut self, haystack: &JavaStr) -> Option<(usize, usize)>;
+}
+
+unsafe impl JavaStrPattern for char {
+    #[inline]
+    fn prefix_len_in(&mut self, haystack: &JavaStr) -> Option<usize> {
+        let ch = haystack.chars().next()?;
+        if ch == *self {
+            Some(ch.len_utf8())
+        } else {
+            None
+        }
+    }
+
+    #[inline]
+    fn suffix_len_in(&mut self, haystack: &JavaStr) -> Option<usize> {
+        let ch = haystack.chars().next_back()?;
+        if ch == *self {
+            Some(ch.len_utf8())
+        } else {
+            None
+        }
+    }
+
+    #[inline]
+    fn find_in(&mut self, haystack: &JavaStr) -> Option<(usize, usize)> {
+        let mut encoded = [0; 4];
+        let encoded = self.encode_utf8(&mut encoded).as_bytes();
+        find(haystack.as_bytes(), encoded).map(|index| (index, encoded.len()))
+    }
+
+    #[inline]
+    fn rfind_in(&mut self, haystack: &JavaStr) -> Option<(usize, usize)> {
+        let mut encoded = [0; 4];
+        let encoded = self.encode_utf8(&mut encoded).as_bytes();
+        rfind(haystack.as_bytes(), encoded).map(|index| (index, encoded.len()))
+    }
+}
+
+unsafe impl JavaStrPattern for JavaCodePoint {
+    #[inline]
+    fn prefix_len_in(&mut self, haystack: &JavaStr) -> Option<usize> {
+        let ch = haystack.chars().next()?;
+        if ch == *self {
+            Some(ch.len_utf8())
+        } else {
+            None
+        }
+    }
+
+    #[inline]
+    fn suffix_len_in(&mut self, haystack: &JavaStr) -> Option<usize> {
+        let ch = haystack.chars().next_back()?;
+        if ch == *self {
+            Some(ch.len_utf8())
+        } else {
+            None
+        }
+    }
+
+    #[inline]
+    fn find_in(&mut self, haystack: &JavaStr) -> Option<(usize, usize)> {
+        let mut encoded = [0; 4];
+        let encoded = self.encode_semi_utf8(&mut encoded);
+        find(haystack.as_bytes(), encoded).map(|index| (index, encoded.len()))
+    }
+
+    #[inline]
+    fn rfind_in(&mut self, haystack: &JavaStr) -> Option<(usize, usize)> {
+        let mut encoded = [0; 4];
+        let encoded = self.encode_semi_utf8(&mut encoded);
+        rfind(haystack.as_bytes(), encoded).map(|index| (index, encoded.len()))
+    }
+}
+
+unsafe impl JavaStrPattern for &str {
+    #[inline]
+    fn prefix_len_in(&mut self, haystack: &JavaStr) -> Option<usize> {
+        if haystack.as_bytes().starts_with(self.as_bytes()) {
+            Some(self.len())
+        } else {
+            None
+        }
+    }
+
+    #[inline]
+    fn suffix_len_in(&mut self, haystack: &JavaStr) -> Option<usize> {
+        if haystack.as_bytes().ends_with(self.as_bytes()) {
+            Some(self.len())
+        } else {
+            None
+        }
+    }
+
+    #[inline]
+    fn find_in(&mut self, haystack: &JavaStr) -> Option<(usize, usize)> {
+        find(haystack.as_bytes(), self.as_bytes()).map(|index| (index, self.len()))
+    }
+
+    #[inline]
+    fn rfind_in(&mut self, haystack: &JavaStr) -> Option<(usize, usize)> {
+        rfind(haystack.as_bytes(), self.as_bytes()).map(|index| (index, self.len()))
+    }
+}
+
+unsafe impl JavaStrPattern for &JavaStr {
+    #[inline]
+    fn prefix_len_in(&mut self, haystack: &JavaStr) -> Option<usize> {
+        if haystack.as_bytes().starts_with(self.as_bytes()) {
+            Some(self.len())
+        } else {
+            None
+        }
+    }
+
+    #[inline]
+    fn suffix_len_in(&mut self, haystack: &JavaStr) -> Option<usize> {
+        if haystack.as_bytes().ends_with(self.as_bytes()) {
+            Some(self.len())
+        } else {
+            None
+        }
+    }
+
+    #[inline]
+    fn find_in(&mut self, haystack: &JavaStr) -> Option<(usize, usize)> {
+        find(haystack.as_bytes(), self.as_bytes()).map(|index| (index, self.len()))
+    }
+
+    #[inline]
+    fn rfind_in(&mut self, haystack: &JavaStr) ->
Option<(usize, usize)> { + rfind(haystack.as_bytes(), self.as_bytes()).map(|index| (index, self.len())) + } +} + +unsafe impl JavaStrPattern for F +where + F: FnMut(JavaCodePoint) -> bool, +{ + #[inline] + fn prefix_len_in(&mut self, haystack: &JavaStr) -> Option { + let ch = haystack.chars().next()?; + if self(ch) { + Some(ch.len_utf8()) + } else { + None + } + } + + #[inline] + fn suffix_len_in(&mut self, haystack: &JavaStr) -> Option { + let ch = haystack.chars().next_back()?; + if self(ch) { + Some(ch.len_utf8()) + } else { + None + } + } + + #[inline] + fn find_in(&mut self, haystack: &JavaStr) -> Option<(usize, usize)> { + haystack + .char_indices() + .find(|(_, ch)| self(*ch)) + .map(|(index, ch)| (index, ch.len_utf8())) + } + + #[inline] + fn rfind_in(&mut self, haystack: &JavaStr) -> Option<(usize, usize)> { + haystack + .char_indices() + .rfind(|(_, ch)| self(*ch)) + .map(|(index, ch)| (index, ch.len_utf8())) + } +} + +unsafe impl JavaStrPattern for &[char] { + #[inline] + fn prefix_len_in(&mut self, haystack: &JavaStr) -> Option { + let ch = haystack.chars().next()?; + if self.iter().any(|c| ch == *c) { + Some(ch.len_utf8()) + } else { + None + } + } + + #[inline] + fn suffix_len_in(&mut self, haystack: &JavaStr) -> Option { + let ch = haystack.chars().next_back()?; + if self.iter().any(|c| ch == *c) { + Some(ch.len_utf8()) + } else { + None + } + } + + #[inline] + fn find_in(&mut self, haystack: &JavaStr) -> Option<(usize, usize)> { + haystack + .char_indices() + .find(|(_, ch)| self.iter().any(|c| *ch == *c)) + .map(|(index, ch)| (index, ch.len_utf8())) + } + + #[inline] + fn rfind_in(&mut self, haystack: &JavaStr) -> Option<(usize, usize)> { + haystack + .char_indices() + .rfind(|(_, ch)| self.iter().any(|c| *ch == *c)) + .map(|(index, ch)| (index, ch.len_utf8())) + } +} + +unsafe impl JavaStrPattern for &[JavaCodePoint] { + #[inline] + fn prefix_len_in(&mut self, haystack: &JavaStr) -> Option { + let ch = haystack.chars().next()?; + if 
self.contains(&ch) { + Some(ch.len_utf8()) + } else { + None + } + } + + #[inline] + fn suffix_len_in(&mut self, haystack: &JavaStr) -> Option { + let ch = haystack.chars().next_back()?; + if self.contains(&ch) { + Some(ch.len_utf8()) + } else { + None + } + } + + #[inline] + fn find_in(&mut self, haystack: &JavaStr) -> Option<(usize, usize)> { + haystack + .char_indices() + .find(|(_, ch)| self.contains(ch)) + .map(|(index, ch)| (index, ch.len_utf8())) + } + + #[inline] + fn rfind_in(&mut self, haystack: &JavaStr) -> Option<(usize, usize)> { + haystack + .char_indices() + .rfind(|(_, ch)| self.contains(ch)) + .map(|(index, ch)| (index, ch.len_utf8())) + } +} + +unsafe impl JavaStrPattern for &char { + #[inline] + fn prefix_len_in(&mut self, haystack: &JavaStr) -> Option { + let mut ch = **self; + ch.prefix_len_in(haystack) + } + + #[inline] + fn suffix_len_in(&mut self, haystack: &JavaStr) -> Option { + let mut ch = **self; + ch.suffix_len_in(haystack) + } + + #[inline] + fn find_in(&mut self, haystack: &JavaStr) -> Option<(usize, usize)> { + let mut ch = **self; + ch.find_in(haystack) + } + + #[inline] + fn rfind_in(&mut self, haystack: &JavaStr) -> Option<(usize, usize)> { + let mut ch = **self; + ch.rfind_in(haystack) + } +} + +unsafe impl JavaStrPattern for &JavaCodePoint { + #[inline] + fn prefix_len_in(&mut self, haystack: &JavaStr) -> Option { + let mut ch = **self; + ch.prefix_len_in(haystack) + } + + #[inline] + fn suffix_len_in(&mut self, haystack: &JavaStr) -> Option { + let mut ch = **self; + ch.suffix_len_in(haystack) + } + + #[inline] + fn find_in(&mut self, haystack: &JavaStr) -> Option<(usize, usize)> { + let mut ch = **self; + ch.find_in(haystack) + } + + #[inline] + fn rfind_in(&mut self, haystack: &JavaStr) -> Option<(usize, usize)> { + let mut ch = **self; + ch.rfind_in(haystack) + } +} + +unsafe impl JavaStrPattern for &&str { + #[inline] + fn prefix_len_in(&mut self, haystack: &JavaStr) -> Option { + let mut str = **self; + 
str.prefix_len_in(haystack) + } + + #[inline] + fn suffix_len_in(&mut self, haystack: &JavaStr) -> Option { + let mut str = **self; + str.suffix_len_in(haystack) + } + + #[inline] + fn find_in(&mut self, haystack: &JavaStr) -> Option<(usize, usize)> { + let mut str = **self; + str.find_in(haystack) + } + + #[inline] + fn rfind_in(&mut self, haystack: &JavaStr) -> Option<(usize, usize)> { + let mut str = **self; + str.rfind_in(haystack) + } +} + +unsafe impl JavaStrPattern for &&JavaStr { + #[inline] + fn prefix_len_in(&mut self, haystack: &JavaStr) -> Option { + let mut str = **self; + str.prefix_len_in(haystack) + } + + #[inline] + fn suffix_len_in(&mut self, haystack: &JavaStr) -> Option { + let mut str = **self; + str.suffix_len_in(haystack) + } + + #[inline] + fn find_in(&mut self, haystack: &JavaStr) -> Option<(usize, usize)> { + let mut str = **self; + str.find_in(haystack) + } + + #[inline] + fn rfind_in(&mut self, haystack: &JavaStr) -> Option<(usize, usize)> { + let mut str = **self; + str.rfind_in(haystack) + } +} + +#[inline] +fn find(haystack: &[u8], needle: &[u8]) -> Option { + if needle.is_empty() { + return Some(0); + } + haystack + .windows(needle.len()) + .position(|window| window == needle) +} + +#[inline] +fn rfind(haystack: &[u8], needle: &[u8]) -> Option { + if needle.is_empty() { + return Some(haystack.len()); + } + haystack + .windows(needle.len()) + .rposition(|window| window == needle) +} diff --git a/crates/java_string/src/serde.rs b/crates/java_string/src/serde.rs new file mode 100644 index 000000000..e1c152d11 --- /dev/null +++ b/crates/java_string/src/serde.rs @@ -0,0 +1,263 @@ +use std::fmt::Formatter; + +use serde::de::value::SeqAccessDeserializer; +use serde::de::{Error, SeqAccess, Unexpected, Visitor}; +use serde::ser::SerializeSeq; +use serde::{Deserialize, Deserializer, Serialize, Serializer}; + +use crate::{JavaCodePoint, JavaStr, JavaString}; + +impl Serialize for JavaString { + #[inline] + fn serialize(&self, serializer: S) -> 
Result + where + S: Serializer, + { + match self.as_str() { + Ok(str) => str.serialize(serializer), + Err(_) => { + let mut seq = serializer.serialize_seq(None)?; + for ch in self.chars() { + seq.serialize_element(&ch.as_u32())?; + } + seq.end() + } + } + } +} + +impl<'de> Deserialize<'de> for JavaString { + #[inline] + fn deserialize(deserializer: D) -> Result + where + D: Deserializer<'de>, + { + deserializer.deserialize_any(JavaStringVisitor) + } +} + +struct JavaStringVisitor; + +impl<'de> Visitor<'de> for JavaStringVisitor { + type Value = JavaString; + + fn expecting(&self, formatter: &mut Formatter) -> std::fmt::Result { + formatter.write_str("a JavaString") + } + + fn visit_str(self, v: &str) -> Result + where + E: Error, + { + Ok(JavaString::from(v)) + } + + fn visit_string(self, v: String) -> Result + where + E: Error, + { + Ok(JavaString::from(v)) + } + + fn visit_bytes(self, v: &[u8]) -> Result + where + E: Error, + { + match JavaStr::from_semi_utf8(v) { + Ok(str) => Ok(str.to_owned()), + Err(_) => Err(Error::invalid_value(Unexpected::Bytes(v), &self)), + } + } + + fn visit_byte_buf(self, v: Vec) -> Result + where + E: Error, + { + JavaString::from_semi_utf8(v) + .map_err(|err| Error::invalid_value(Unexpected::Bytes(&err.into_bytes()), &self)) + } + + fn visit_seq(self, seq: A) -> Result + where + A: SeqAccess<'de>, + { + let vec = Vec::::deserialize(SeqAccessDeserializer::new(seq))?; + JavaString::from_semi_utf8(vec).map_err(|_| Error::invalid_value(Unexpected::Seq, &self)) + } +} + +impl Serialize for JavaStr { + #[inline] + fn serialize(&self, serializer: S) -> Result + where + S: Serializer, + { + match self.as_str() { + Ok(str) => str.serialize(serializer), + Err(_) => { + let mut seq = serializer.serialize_seq(None)?; + for ch in self.chars() { + seq.serialize_element(&ch.as_u32())?; + } + seq.end() + } + } + } +} + +impl<'de: 'a, 'a> Deserialize<'de> for &'a JavaStr { + #[inline] + fn deserialize(deserializer: D) -> Result + where + D: 
Deserializer<'de>, + { + deserializer.deserialize_any(JavaStrVisitor) + } +} + +struct JavaStrVisitor; + +impl<'de> Visitor<'de> for JavaStrVisitor { + type Value = &'de JavaStr; + + fn expecting(&self, formatter: &mut Formatter) -> std::fmt::Result { + formatter.write_str("a borrowed JavaStr") + } + + fn visit_borrowed_str(self, v: &'de str) -> Result + where + E: Error, + { + Ok(JavaStr::from_str(v)) + } + + fn visit_borrowed_bytes(self, v: &'de [u8]) -> Result + where + E: Error, + { + JavaStr::from_semi_utf8(v).map_err(|_| Error::invalid_value(Unexpected::Bytes(v), &self)) + } +} + +impl Serialize for JavaCodePoint { + #[inline] + fn serialize(&self, serializer: S) -> Result + where + S: Serializer, + { + match self.as_char() { + Some(ch) => ch.serialize(serializer), + None => self.as_u32().serialize(serializer), + } + } +} + +impl<'de> Deserialize<'de> for JavaCodePoint { + fn deserialize(deserializer: D) -> Result + where + D: Deserializer<'de>, + { + deserializer.deserialize_any(JavaCodePointVisitor) + } +} + +struct JavaCodePointVisitor; + +impl<'de> Visitor<'de> for JavaCodePointVisitor { + type Value = JavaCodePoint; + + fn expecting(&self, formatter: &mut Formatter) -> std::fmt::Result { + formatter.write_str("a character") + } + + #[inline] + fn visit_i8(self, v: i8) -> Result + where + E: Error, + { + self.visit_i32(v as i32) + } + + #[inline] + fn visit_i16(self, v: i16) -> Result + where + E: Error, + { + self.visit_i32(v as i32) + } + + fn visit_i32(self, v: i32) -> Result + where + E: Error, + { + if v < 0 { + Err(Error::invalid_value(Unexpected::Signed(v as i64), &self)) + } else { + self.visit_u32(v as u32) + } + } + + fn visit_i64(self, v: i64) -> Result + where + E: Error, + { + if v < 0 { + Err(Error::invalid_value(Unexpected::Signed(v), &self)) + } else { + self.visit_u64(v as u64) + } + } + + #[inline] + fn visit_u8(self, v: u8) -> Result + where + E: Error, + { + self.visit_u32(v as u32) + } + + #[inline] + fn visit_u16(self, v: u16) -> 
Result + where + E: Error, + { + self.visit_u32(v as u32) + } + + fn visit_u32(self, v: u32) -> Result + where + E: Error, + { + JavaCodePoint::from_u32(v) + .ok_or_else(|| Error::invalid_value(Unexpected::Unsigned(v as u64), &self)) + } + + fn visit_u64(self, v: u64) -> Result + where + E: Error, + { + if v > u32::MAX as u64 { + Err(Error::invalid_value(Unexpected::Unsigned(v), &self)) + } else { + self.visit_u32(v as u32) + } + } + + fn visit_char(self, v: char) -> Result + where + E: Error, + { + Ok(JavaCodePoint::from_char(v)) + } + + fn visit_str(self, v: &str) -> Result + where + E: Error, + { + let mut iter = v.chars(); + match (iter.next(), iter.next()) { + (Some(c), None) => Ok(JavaCodePoint::from_char(c)), + _ => Err(Error::invalid_value(Unexpected::Str(v), &self)), + } + } +} diff --git a/crates/java_string/src/slice.rs b/crates/java_string/src/slice.rs new file mode 100644 index 000000000..104df4228 --- /dev/null +++ b/crates/java_string/src/slice.rs @@ -0,0 +1,2239 @@ +use std::borrow::Cow; +use std::collections::Bound; +use std::fmt::{Debug, Display, Formatter, Write}; +use std::hash::{Hash, Hasher}; +use std::ops::{ + Add, AddAssign, Index, IndexMut, Range, RangeBounds, RangeFrom, RangeFull, RangeInclusive, + RangeTo, RangeToInclusive, +}; +use std::rc::Rc; +use std::str::FromStr; +use std::sync::Arc; +use std::{ptr, slice}; + +use crate::char::EscapeDebugExtArgs; +use crate::validations::{ + run_utf8_full_validation_from_semi, run_utf8_semi_validation, slice_error_fail, + str_end_index_overflow_fail, +}; +use crate::{ + Bytes, CharEscapeIter, CharIndices, Chars, EscapeDebug, EscapeDefault, EscapeUnicode, + JavaCodePoint, JavaStrPattern, JavaString, Lines, MatchIndices, Matches, ParseError, + RMatchIndices, RMatches, RSplit, RSplitN, RSplitTerminator, Split, SplitAsciiWhitespace, + SplitInclusive, SplitN, SplitTerminator, SplitWhitespace, Utf8Error, +}; + +#[repr(transparent)] +#[derive(PartialEq, Eq, PartialOrd, Ord)] +pub struct JavaStr { + inner: 
[u8], +} + +impl JavaStr { + /// Converts `v` to a `&JavaStr` if it is fully-valid UTF-8, i.e. UTF-8 + /// without surrogate code points. See [std::str::from_utf8]. + #[inline] + pub const fn from_full_utf8(v: &[u8]) -> Result<&JavaStr, Utf8Error> { + match std::str::from_utf8(v) { + Ok(str) => Ok(JavaStr::from_str(str)), + Err(err) => Err(Utf8Error::from_std(err)), + } + } + + /// Converts `v` to a `&mut JavaStr` if it is fully-valid UTF-8, i.e. UTF-8 + /// without surrogate code points. See [std::str::from_utf8_mut]. + #[inline] + pub fn from_full_utf8_mut(v: &mut [u8]) -> Result<&mut JavaStr, Utf8Error> { + match std::str::from_utf8_mut(v) { + Ok(str) => Ok(JavaStr::from_mut_str(str)), + Err(err) => Err(Utf8Error::from_std(err)), + } + } + + /// Converts `v` to a `&JavaStr` if it is semi-valid UTF-8, i.e. UTF-8 + /// with surrogate code points. + pub fn from_semi_utf8(v: &[u8]) -> Result<&JavaStr, Utf8Error> { + match run_utf8_semi_validation(v) { + Ok(()) => Ok(unsafe { JavaStr::from_semi_utf8_unchecked(v) }), + Err(err) => Err(err), + } + } + + /// Converts `v` to a `&mut JavaStr` if it is semi-valid UTF-8, i.e. UTF-8 + /// with surrogate code points. + pub fn from_semi_utf8_mut(v: &mut [u8]) -> Result<&mut JavaStr, Utf8Error> { + match run_utf8_semi_validation(v) { + Ok(()) => Ok(unsafe { JavaStr::from_semi_utf8_unchecked_mut(v) }), + Err(err) => Err(err), + } + } + + /// # Safety + /// + /// The parameter must be in semi-valid UTF-8 format, that is, UTF-8 plus + /// surrogate code points. + #[inline] + #[must_use] + pub const unsafe fn from_semi_utf8_unchecked(v: &[u8]) -> &JavaStr { + // SAFETY: the caller must guarantee that the bytes `v` are valid UTF-8, minus + // the absence of surrogate chars. Also relies on `&JavaStr` and `&[u8]` + // having the same layout. + std::mem::transmute(v) + } + + /// # Safety + /// + /// The parameter must be in semi-valid UTF-8 format, that is, UTF-8 plus + /// surrogate code points. 
+ #[inline] + #[must_use] + pub unsafe fn from_semi_utf8_unchecked_mut(v: &mut [u8]) -> &mut JavaStr { + // SAFETY: see from_semi_utf8_unchecked + std::mem::transmute(v) + } + + #[inline] + #[must_use] + pub const fn from_str(str: &str) -> &JavaStr { + unsafe { + // SAFETY: the input str is guaranteed to have valid UTF-8. + JavaStr::from_semi_utf8_unchecked(str.as_bytes()) + } + } + + #[inline] + #[must_use] + pub fn from_mut_str(str: &mut str) -> &mut JavaStr { + unsafe { + // SAFETY: the input str is guaranteed to have valid UTF-8. + JavaStr::from_semi_utf8_unchecked_mut(str.as_bytes_mut()) + } + } + + #[inline] + #[must_use] + pub fn from_boxed_str(v: Box) -> Box { + unsafe { JavaStr::from_boxed_semi_utf8_unchecked(v.into_boxed_bytes()) } + } + + /// # Safety + /// + /// The parameter must be in semi-valid UTF-8 format, that is, UTF-8 plus + /// surrogate code points. + #[inline] + #[must_use] + pub unsafe fn from_boxed_semi_utf8_unchecked(v: Box<[u8]>) -> Box { + unsafe { Box::from_raw(Box::into_raw(v) as *mut JavaStr) } + } + + /// See [str::as_bytes]. + #[inline] + #[must_use] + pub const fn as_bytes(&self) -> &[u8] { + &self.inner + } + + /// See [str::as_bytes_mut]. + /// + /// # Safety + /// + /// The returned slice must not have invalid UTF-8 written to it, besides + /// surrogate pairs. + #[inline] + #[must_use] + pub unsafe fn as_bytes_mut(&mut self) -> &mut [u8] { + &mut self.inner + } + + /// See [str::as_mut_ptr]. + #[inline] + #[must_use] + pub fn as_mut_ptr(&mut self) -> *mut u8 { + self.inner.as_mut_ptr() + } + + /// See [str::as_ptr]. + #[inline] + #[must_use] + pub const fn as_ptr(&self) -> *const u8 { + self.inner.as_ptr() + } + + /// Tries to convert this `&JavaStr` to a `&str`, returning an error if + /// it is not fully valid UTF-8, i.e. has no surrogate code points. 
+ pub const fn as_str(&self) -> Result<&str, Utf8Error> { + // Manual implementation of Option::map since it's not const + match run_utf8_full_validation_from_semi(self.as_bytes()) { + Ok(..) => unsafe { + // SAFETY: we were already semi-valid, and full validation just succeeded. + Ok(self.as_str_unchecked()) + }, + Err(err) => Err(err), + } + } + + /// # Safety + /// + /// This string must be fully valid UTF-8, i.e. have no surrogate code + /// points. + #[inline] + #[must_use] + pub const unsafe fn as_str_unchecked(&self) -> &str { + std::str::from_utf8_unchecked(self.as_bytes()) + } + + /// Converts this `&JavaStr` to a `Cow`, replacing surrogate code + /// points with the replacement character �. + /// + /// ``` + /// # use std::borrow::Cow; + /// # use java_string::{JavaCodePoint, JavaStr, JavaString}; + /// let s = JavaStr::from_str("Hello 🦀 World!"); + /// let result = s.as_str_lossy(); + /// assert!(matches!(result, Cow::Borrowed(_))); + /// assert_eq!(result, "Hello 🦀 World!"); + /// + /// let s = JavaString::from("Hello ") + /// + JavaString::from(JavaCodePoint::from_u32(0xd800).unwrap()).as_java_str() + /// + JavaStr::from_str(" World!"); + /// let result = s.as_str_lossy(); + /// assert!(matches!(result, Cow::Owned(_))); + /// assert_eq!(result, "Hello � World!"); + /// ``` + #[must_use] + pub fn as_str_lossy(&self) -> Cow<'_, str> { + match run_utf8_full_validation_from_semi(self.as_bytes()) { + Ok(()) => unsafe { + // SAFETY: validation succeeded + Cow::Borrowed(self.as_str_unchecked()) + }, + Err(error) => unsafe { + // SAFETY: invalid parts of string are converted to replacement char + Cow::Owned( + self.transform_invalid_string(error, str::to_owned, |_| { + JavaStr::from_str("\u{FFFD}") + }) + .into_string_unchecked(), + ) + }, + } + } + + /// See [str::bytes]. + #[inline] + pub fn bytes(&self) -> Bytes<'_> { + Bytes { + inner: self.inner.iter().copied(), + } + } + + /// See [str::char_indices]. 
+ #[inline] + pub fn char_indices(&self) -> CharIndices<'_> { + CharIndices { + front_offset: 0, + inner: self.chars(), + } + } + + /// See [str::chars]. + #[inline] + pub fn chars(&self) -> Chars<'_> { + Chars { + inner: self.inner.iter(), + } + } + + /// See [str::contains]. + /// + /// ``` + /// # use java_string::JavaStr; + /// let bananas = JavaStr::from_str("bananas"); + /// + /// assert!(bananas.contains("nana")); + /// assert!(!bananas.contains("apples")); + /// ``` + #[inline] + #[must_use] + pub fn contains

(&self, mut pat: P) -> bool + where + P: JavaStrPattern, + { + pat.find_in(self).is_some() + } + + /// See [str::ends_with]. + /// + /// ``` + /// # use java_string::JavaStr; + /// let bananas = JavaStr::from_str("bananas"); + /// + /// assert!(bananas.ends_with("anas")); + /// assert!(!bananas.ends_with("nana")); + /// ``` + #[inline] + #[must_use] + pub fn ends_with

(&self, mut pat: P) -> bool + where + P: JavaStrPattern, + { + pat.suffix_len_in(self).is_some() + } + + /// See [str::eq_ignore_ascii_case]. + #[inline] + #[must_use] + pub fn eq_ignore_ascii_case(&self, other: &str) -> bool { + self.as_bytes().eq_ignore_ascii_case(other.as_bytes()) + } + + /// See [str::eq_ignore_ascii_case]. + #[inline] + #[must_use] + pub fn eq_java_ignore_ascii_case(&self, other: &JavaStr) -> bool { + self.as_bytes().eq_ignore_ascii_case(other.as_bytes()) + } + + /// See [str::escape_debug]. + /// + /// ``` + /// # use java_string::JavaStr; + /// assert_eq!( + /// JavaStr::from_str("❤\n!").escape_debug().to_string(), + /// "❤\\n!" + /// ); + /// ``` + #[inline] + pub fn escape_debug(&self) -> EscapeDebug<'_> { + #[inline] + fn escape_first(first: JavaCodePoint) -> CharEscapeIter { + first.escape_debug_ext(EscapeDebugExtArgs::ESCAPE_ALL) + } + #[inline] + fn escape_rest(char: JavaCodePoint) -> CharEscapeIter { + char.escape_debug_ext(EscapeDebugExtArgs { + escape_single_quote: true, + escape_double_quote: true, + }) + } + + let mut chars = self.chars(); + EscapeDebug { + inner: chars + .next() + .map(escape_first as fn(JavaCodePoint) -> CharEscapeIter) + .into_iter() + .flatten() + .chain(chars.flat_map(escape_rest as fn(JavaCodePoint) -> CharEscapeIter)), + } + } + + /// See [str::escape_default]. + /// + /// ``` + /// # use java_string::JavaStr; + /// assert_eq!( + /// JavaStr::from_str("❤\n!").escape_default().to_string(), + /// "\\u{2764}\\n!" + /// ); + /// ``` + #[inline] + pub fn escape_default(&self) -> EscapeDefault<'_> { + EscapeDefault { + inner: self.chars().flat_map(JavaCodePoint::escape_default), + } + } + + /// See [str::escape_unicode]. 
+ /// + /// ``` + /// # use java_string::JavaStr; + /// assert_eq!( + /// JavaStr::from_str("❤\n!").escape_unicode().to_string(), + /// "\\u{2764}\\u{a}\\u{21}" + /// ); + /// ``` + #[inline] + pub fn escape_unicode(&self) -> EscapeUnicode<'_> { + EscapeUnicode { + inner: self.chars().flat_map(JavaCodePoint::escape_unicode), + } + } + + /// See [str::find]. + /// + /// ``` + /// # use java_string::JavaStr; + /// let s = JavaStr::from_str("Löwe 老虎 Léopard Gepardi"); + /// + /// assert_eq!(s.find('L'), Some(0)); + /// assert_eq!(s.find('é'), Some(14)); + /// assert_eq!(s.find("pard"), Some(17)); + /// + /// let x: &[_] = &['1', '2']; + /// assert_eq!(s.find(x), None); + /// ``` + #[inline] + #[must_use] + pub fn find

(&self, mut pat: P) -> Option + where + P: JavaStrPattern, + { + pat.find_in(self).map(|(index, _)| index) + } + + /// See [str::get]. + /// + /// ``` + /// # use java_string::{JavaStr, JavaString}; + /// let v = JavaString::from("🗻∈🌏"); + /// + /// assert_eq!(Some(JavaStr::from_str("🗻")), v.get(0..4)); + /// + /// // indices not on UTF-8 sequence boundaries + /// assert!(v.get(1..).is_none()); + /// assert!(v.get(..8).is_none()); + /// + /// // out of bounds + /// assert!(v.get(..42).is_none()); + /// ``` + #[inline] + #[must_use] + pub fn get(&self, i: I) -> Option<&JavaStr> + where + I: JavaStrSliceIndex, + { + i.get(self) + } + + /// See [str::get_mut]. + #[inline] + #[must_use] + pub fn get_mut(&mut self, i: I) -> Option<&mut JavaStr> + where + I: JavaStrSliceIndex, + { + i.get_mut(self) + } + + /// See [str::get_unchecked]. + /// + /// # Safety + /// + /// - The starting index must not exceed the ending index + /// - Indexes must be within bounds of the original slice + /// - Indexes must lie on UTF-8 sequence boundaries + #[inline] + #[must_use] + pub unsafe fn get_unchecked(&self, i: I) -> &JavaStr + where + I: JavaStrSliceIndex, + { + unsafe { &*i.get_unchecked(self) } + } + + /// See [str::get_unchecked_mut]. + /// + /// # Safety + /// + /// - The starting index must not exceed the ending index + /// - Indexes must be within bounds of the original slice + /// - Indexes must lie on UTF-8 sequence boundaries + #[inline] + #[must_use] + pub unsafe fn get_unchecked_mut(&mut self, i: I) -> &mut JavaStr + where + I: JavaStrSliceIndex, + { + unsafe { &mut *i.get_unchecked_mut(self) } + } + + /// See [str::into_boxed_bytes]. + #[inline] + #[must_use] + pub fn into_boxed_bytes(self: Box) -> Box<[u8]> { + unsafe { Box::from_raw(Box::into_raw(self) as *mut [u8]) } + } + + /// See [str::into_string]. 
+ #[inline] + #[must_use] + pub fn into_string(self: Box) -> JavaString { + let slice = self.into_boxed_bytes(); + unsafe { JavaString::from_semi_utf8_unchecked(slice.into_vec()) } + } + + /// See [str::is_ascii]. + #[inline] + #[must_use] + pub fn is_ascii(&self) -> bool { + self.as_bytes().is_ascii() + } + + /// See [str::is_char_boundary]. + #[inline] + #[must_use] + pub fn is_char_boundary(&self, index: usize) -> bool { + // 0 is always ok. + // Test for 0 explicitly so that it can optimize out the check + // easily and skip reading string data for that case. + // Note that optimizing `self.get(..index)` relies on this. + if index == 0 { + return true; + } + + match self.as_bytes().get(index) { + // For `None` we have two options: + // + // - index == self.len() Empty strings are valid, so return true + // - index > self.len() In this case return false + // + // The check is placed exactly here, because it improves generated + // code on higher opt-levels. See https://github.com/rust-lang/rust/pull/84751 for more details. + None => index == self.len(), + + Some(&b) => { + // This is bit magic equivalent to: b < 128 || b >= 192 + (b as i8) >= -0x40 + } + } + } + + pub(crate) fn floor_char_boundary(&self, index: usize) -> usize { + if index >= self.len() { + self.len() + } else { + let lower_bound = index.saturating_sub(3); + let new_index = self.as_bytes()[lower_bound..=index].iter().rposition(|b| { + // This is bit magic equivalent to: b < 128 || b >= 192 + (*b as i8) >= -0x40 + }); + + // SAFETY: we know that the character boundary will be within four bytes + unsafe { lower_bound + new_index.unwrap_unchecked() } + } + } + + /// See [str::is_empty]. + #[inline] + #[must_use] + pub fn is_empty(&self) -> bool { + self.len() == 0 + } + + /// See [str::len]. + #[inline] + #[must_use] + pub fn len(&self) -> usize { + self.inner.len() + } + + /// See [str::lines]. 
+ #[inline] + pub fn lines(&self) -> Lines<'_> { + Lines { + inner: self.split_inclusive('\n').map(|line| { + let Some(line) = line.strip_suffix('\n') else { + return line; + }; + let Some(line) = line.strip_suffix('\r') else { + return line; + }; + line + }), + } + } + + /// See [str::make_ascii_lowercase]. + #[inline] + pub fn make_ascii_lowercase(&mut self) { + // SAFETY: changing ASCII letters only does not invalidate UTF-8. + let me = unsafe { self.as_bytes_mut() }; + me.make_ascii_lowercase() + } + + /// See [str::make_ascii_uppercase]. + #[inline] + pub fn make_ascii_uppercase(&mut self) { + // SAFETY: changing ASCII letters only does not invalidate UTF-8. + let me = unsafe { self.as_bytes_mut() }; + me.make_ascii_uppercase() + } + + /// See [str::match_indices]. + /// + /// ``` + /// # use java_string::JavaStr; + /// let v: Vec<_> = JavaStr::from_str("abcXXXabcYYYabc") + /// .match_indices("abc") + /// .collect(); + /// assert_eq!( + /// v, + /// [ + /// (0, JavaStr::from_str("abc")), + /// (6, JavaStr::from_str("abc")), + /// (12, JavaStr::from_str("abc")) + /// ] + /// ); + /// + /// let v: Vec<_> = JavaStr::from_str("1abcabc2").match_indices("abc").collect(); + /// assert_eq!( + /// v, + /// [(1, JavaStr::from_str("abc")), (4, JavaStr::from_str("abc"))] + /// ); + /// + /// let v: Vec<_> = JavaStr::from_str("ababa").match_indices("aba").collect(); + /// assert_eq!(v, [(0, JavaStr::from_str("aba"))]); // only the first `aba` + /// ``` + #[inline] + pub fn match_indices

(&self, pat: P) -> MatchIndices

+ where + P: JavaStrPattern, + { + MatchIndices { + str: self, + start: 0, + pat, + } + } + + /// See [str::matches]. + /// + /// ``` + /// # use java_string::{JavaCodePoint, JavaStr}; + /// let v: Vec<&JavaStr> = JavaStr::from_str("abcXXXabcYYYabc") + /// .matches("abc") + /// .collect(); + /// assert_eq!( + /// v, + /// [ + /// JavaStr::from_str("abc"), + /// JavaStr::from_str("abc"), + /// JavaStr::from_str("abc") + /// ] + /// ); + /// + /// let v: Vec<&JavaStr> = JavaStr::from_str("1abc2abc3") + /// .matches(JavaCodePoint::is_numeric) + /// .collect(); + /// assert_eq!( + /// v, + /// [ + /// JavaStr::from_str("1"), + /// JavaStr::from_str("2"), + /// JavaStr::from_str("3") + /// ] + /// ); + /// ``` + #[inline] + pub fn matches

(&self, pat: P) -> Matches

+ where + P: JavaStrPattern, + { + Matches { str: self, pat } + } + + /// See [str::parse]. + #[inline] + pub fn parse(&self) -> Result::Err>> + where + F: FromStr, + { + match self.as_str() { + Ok(str) => str.parse().map_err(ParseError::Err), + Err(err) => Err(ParseError::InvalidUtf8(err)), + } + } + + /// See [str::repeat]. + #[inline] + #[must_use] + pub fn repeat(&self, n: usize) -> JavaString { + unsafe { JavaString::from_semi_utf8_unchecked(self.as_bytes().repeat(n)) } + } + + /// See [str::replace]. + /// + /// ``` + /// # use java_string::JavaStr; + /// let s = JavaStr::from_str("this is old"); + /// + /// assert_eq!("this is new", s.replace("old", "new")); + /// assert_eq!("than an old", s.replace("is", "an")); + /// ``` + #[inline] + #[must_use] + pub fn replace

(&self, from: P, to: &str) -> JavaString + where + P: JavaStrPattern, + { + self.replace_java(from, JavaStr::from_str(to)) + } + + /// See [str::replace]. + #[inline] + #[must_use] + pub fn replace_java

(&self, from: P, to: &JavaStr) -> JavaString + where + P: JavaStrPattern, + { + let mut result = JavaString::new(); + let mut last_end = 0; + for (start, part) in self.match_indices(from) { + result.push_java_str(unsafe { self.get_unchecked(last_end..start) }); + result.push_java_str(to); + last_end = start + part.len(); + } + result.push_java_str(unsafe { self.get_unchecked(last_end..self.len()) }); + result + } + + /// See [str::replacen]. + /// + /// ``` + /// # use java_string::{JavaCodePoint, JavaStr}; + /// let s = JavaStr::from_str("foo foo 123 foo"); + /// assert_eq!("new new 123 foo", s.replacen("foo", "new", 2)); + /// assert_eq!("faa fao 123 foo", s.replacen('o', "a", 3)); + /// assert_eq!( + /// "foo foo new23 foo", + /// s.replacen(JavaCodePoint::is_numeric, "new", 1) + /// ); + /// ``` + #[inline] + #[must_use] + pub fn replacen

(&self, from: P, to: &str, count: usize) -> JavaString + where + P: JavaStrPattern, + { + self.replacen_java(from, JavaStr::from_str(to), count) + } + + /// See [str::replacen]. + #[inline] + #[must_use] + pub fn replacen_java

(&self, from: P, to: &JavaStr, count: usize) -> JavaString + where + P: JavaStrPattern, + { + // Hope to reduce the times of re-allocation + let mut result = JavaString::with_capacity(32); + let mut last_end = 0; + for (start, part) in self.match_indices(from).take(count) { + result.push_java_str(unsafe { self.get_unchecked(last_end..start) }); + result.push_java_str(to); + last_end = start + part.len(); + } + result.push_java_str(unsafe { self.get_unchecked(last_end..self.len()) }); + result + } + + /// See [str::rfind]. + /// + /// ``` + /// # use java_string::JavaStr; + /// let s = JavaStr::from_str("Löwe 老虎 Léopard Gepardi"); + /// + /// assert_eq!(s.rfind('L'), Some(13)); + /// assert_eq!(s.rfind('é'), Some(14)); + /// assert_eq!(s.rfind("pard"), Some(24)); + /// + /// let x: &[_] = &['1', '2']; + /// assert_eq!(s.rfind(x), None); + /// ``` + #[inline] + #[must_use] + pub fn rfind

(&self, mut pat: P) -> Option + where + P: JavaStrPattern, + { + pat.rfind_in(self).map(|(index, _)| index) + } + + /// See [str::rmatch_indices]. + /// + /// ``` + /// # use java_string::JavaStr; + /// let v: Vec<_> = JavaStr::from_str("abcXXXabcYYYabc") + /// .rmatch_indices("abc") + /// .collect(); + /// assert_eq!( + /// v, + /// [ + /// (12, JavaStr::from_str("abc")), + /// (6, JavaStr::from_str("abc")), + /// (0, JavaStr::from_str("abc")) + /// ] + /// ); + /// + /// let v: Vec<_> = JavaStr::from_str("1abcabc2") + /// .rmatch_indices("abc") + /// .collect(); + /// assert_eq!( + /// v, + /// [(4, JavaStr::from_str("abc")), (1, JavaStr::from_str("abc"))] + /// ); + /// + /// let v: Vec<_> = JavaStr::from_str("ababa").rmatch_indices("aba").collect(); + /// assert_eq!(v, [(2, JavaStr::from_str("aba"))]); // only the last `aba` + /// ``` + #[inline] + pub fn rmatch_indices

(&self, pat: P) -> RMatchIndices

+ where + P: JavaStrPattern, + { + RMatchIndices { + inner: self.match_indices(pat), + } + } + + /// See [str::rmatches]. + /// + /// ``` + /// # use java_string::{JavaCodePoint, JavaStr}; + /// let v: Vec<&JavaStr> = JavaStr::from_str("abcXXXabcYYYabc") + /// .rmatches("abc") + /// .collect(); + /// assert_eq!( + /// v, + /// [ + /// JavaStr::from_str("abc"), + /// JavaStr::from_str("abc"), + /// JavaStr::from_str("abc") + /// ] + /// ); + /// + /// let v: Vec<&JavaStr> = JavaStr::from_str("1abc2abc3") + /// .rmatches(JavaCodePoint::is_numeric) + /// .collect(); + /// assert_eq!( + /// v, + /// [ + /// JavaStr::from_str("3"), + /// JavaStr::from_str("2"), + /// JavaStr::from_str("1") + /// ] + /// ); + /// ``` + #[inline] + pub fn rmatches

(&self, pat: P) -> RMatches

+ where + P: JavaStrPattern, + { + RMatches { + inner: self.matches(pat), + } + } + + /// See [str::rsplit]. + /// + /// ``` + /// # use java_string::JavaStr; + /// let v: Vec<&JavaStr> = JavaStr::from_str("Mary had a little lamb") + /// .rsplit(' ') + /// .collect(); + /// assert_eq!( + /// v, + /// [ + /// JavaStr::from_str("lamb"), + /// JavaStr::from_str("little"), + /// JavaStr::from_str("a"), + /// JavaStr::from_str("had"), + /// JavaStr::from_str("Mary") + /// ] + /// ); + /// + /// let v: Vec<&JavaStr> = JavaStr::from_str("").rsplit('X').collect(); + /// assert_eq!(v, [JavaStr::from_str("")]); + /// + /// let v: Vec<&JavaStr> = JavaStr::from_str("lionXXtigerXleopard") + /// .rsplit('X') + /// .collect(); + /// assert_eq!( + /// v, + /// [ + /// JavaStr::from_str("leopard"), + /// JavaStr::from_str("tiger"), + /// JavaStr::from_str(""), + /// JavaStr::from_str("lion") + /// ] + /// ); + /// + /// let v: Vec<&JavaStr> = JavaStr::from_str("lion::tiger::leopard") + /// .rsplit("::") + /// .collect(); + /// assert_eq!( + /// v, + /// [ + /// JavaStr::from_str("leopard"), + /// JavaStr::from_str("tiger"), + /// JavaStr::from_str("lion") + /// ] + /// ); + /// ``` + #[inline] + pub fn rsplit

(&self, pat: P) -> RSplit

+ where + P: JavaStrPattern, + { + RSplit::new(self, pat) + } + + /// See [str::rsplit_once]. + /// + /// ``` + /// # use java_string::JavaStr; + /// assert_eq!(JavaStr::from_str("cfg").rsplit_once('='), None); + /// assert_eq!( + /// JavaStr::from_str("cfg=foo").rsplit_once('='), + /// Some((JavaStr::from_str("cfg"), JavaStr::from_str("foo"))) + /// ); + /// assert_eq!( + /// JavaStr::from_str("cfg=foo=bar").rsplit_once('='), + /// Some((JavaStr::from_str("cfg=foo"), JavaStr::from_str("bar"))) + /// ); + /// ``` + #[inline] + #[must_use] + pub fn rsplit_once

(&self, mut delimiter: P) -> Option<(&JavaStr, &JavaStr)> + where + P: JavaStrPattern, + { + let (index, len) = delimiter.rfind_in(self)?; + // SAFETY: pattern is known to return valid indices. + unsafe { + Some(( + self.get_unchecked(..index), + self.get_unchecked(index + len..), + )) + } + } + + /// See [str::rsplit_terminator]. + /// + /// ``` + /// # use java_string::JavaStr; + /// let v: Vec<&JavaStr> = JavaStr::from_str("A.B.").rsplit_terminator('.').collect(); + /// assert_eq!(v, [JavaStr::from_str("B"), JavaStr::from_str("A")]); + /// + /// let v: Vec<&JavaStr> = JavaStr::from_str("A..B..").rsplit_terminator(".").collect(); + /// assert_eq!( + /// v, + /// [ + /// JavaStr::from_str(""), + /// JavaStr::from_str("B"), + /// JavaStr::from_str(""), + /// JavaStr::from_str("A") + /// ] + /// ); + /// + /// let v: Vec<&JavaStr> = JavaStr::from_str("A.B:C.D") + /// .rsplit_terminator(&['.', ':'][..]) + /// .collect(); + /// assert_eq!( + /// v, + /// [ + /// JavaStr::from_str("D"), + /// JavaStr::from_str("C"), + /// JavaStr::from_str("B"), + /// JavaStr::from_str("A") + /// ] + /// ); + /// ``` + #[inline] + pub fn rsplit_terminator

(&self, pat: P) -> RSplitTerminator

+ where + P: JavaStrPattern, + { + RSplitTerminator::new(self, pat) + } + + /// See [str::rsplitn]. + /// + /// ``` + /// # use java_string::JavaStr; + /// let v: Vec<&JavaStr> = JavaStr::from_str("Mary had a little lamb") + /// .rsplitn(3, ' ') + /// .collect(); + /// assert_eq!( + /// v, + /// [ + /// JavaStr::from_str("lamb"), + /// JavaStr::from_str("little"), + /// JavaStr::from_str("Mary had a") + /// ] + /// ); + /// + /// let v: Vec<&JavaStr> = JavaStr::from_str("lionXXtigerXleopard") + /// .rsplitn(3, 'X') + /// .collect(); + /// assert_eq!( + /// v, + /// [ + /// JavaStr::from_str("leopard"), + /// JavaStr::from_str("tiger"), + /// JavaStr::from_str("lionX") + /// ] + /// ); + /// + /// let v: Vec<&JavaStr> = JavaStr::from_str("lion::tiger::leopard") + /// .rsplitn(2, "::") + /// .collect(); + /// assert_eq!( + /// v, + /// [ + /// JavaStr::from_str("leopard"), + /// JavaStr::from_str("lion::tiger") + /// ] + /// ); + /// ``` + #[inline] + pub fn rsplitn

(&self, n: usize, pat: P) -> RSplitN

+ where + P: JavaStrPattern, + { + RSplitN::new(self, pat, n) + } + + /// See [str::split]. + /// + /// ``` + /// # use java_string::{JavaCodePoint, JavaStr}; + /// let v: Vec<&JavaStr> = JavaStr::from_str("Mary had a little lamb") + /// .split(' ') + /// .collect(); + /// assert_eq!( + /// v, + /// [ + /// JavaStr::from_str("Mary"), + /// JavaStr::from_str("had"), + /// JavaStr::from_str("a"), + /// JavaStr::from_str("little"), + /// JavaStr::from_str("lamb") + /// ] + /// ); + /// + /// let v: Vec<&JavaStr> = JavaStr::from_str("").split('X').collect(); + /// assert_eq!(v, [JavaStr::from_str("")]); + /// + /// let v: Vec<&JavaStr> = JavaStr::from_str("lionXXtigerXleopard") + /// .split('X') + /// .collect(); + /// assert_eq!( + /// v, + /// [ + /// JavaStr::from_str("lion"), + /// JavaStr::from_str(""), + /// JavaStr::from_str("tiger"), + /// JavaStr::from_str("leopard") + /// ] + /// ); + /// + /// let v: Vec<&JavaStr> = JavaStr::from_str("lion::tiger::leopard") + /// .split("::") + /// .collect(); + /// assert_eq!( + /// v, + /// [ + /// JavaStr::from_str("lion"), + /// JavaStr::from_str("tiger"), + /// JavaStr::from_str("leopard") + /// ] + /// ); + /// + /// let v: Vec<&JavaStr> = JavaStr::from_str("abc1def2ghi") + /// .split(JavaCodePoint::is_numeric) + /// .collect(); + /// assert_eq!( + /// v, + /// [ + /// JavaStr::from_str("abc"), + /// JavaStr::from_str("def"), + /// JavaStr::from_str("ghi") + /// ] + /// ); + /// + /// let v: Vec<&JavaStr> = JavaStr::from_str("lionXtigerXleopard") + /// .split(JavaCodePoint::is_uppercase) + /// .collect(); + /// assert_eq!( + /// v, + /// [ + /// JavaStr::from_str("lion"), + /// JavaStr::from_str("tiger"), + /// JavaStr::from_str("leopard") + /// ] + /// ); + /// ``` + #[inline] + pub fn split

(&self, pat: P) -> Split

+ where + P: JavaStrPattern, + { + Split::new(self, pat) + } + + /// See [str::split_ascii_whitespace]. + /// + /// ``` + /// # use java_string::JavaStr; + /// let mut iter = JavaStr::from_str(" Mary had\ta little \n\t lamb").split_ascii_whitespace(); + /// assert_eq!(Some(JavaStr::from_str("Mary")), iter.next()); + /// assert_eq!(Some(JavaStr::from_str("had")), iter.next()); + /// assert_eq!(Some(JavaStr::from_str("a")), iter.next()); + /// assert_eq!(Some(JavaStr::from_str("little")), iter.next()); + /// assert_eq!(Some(JavaStr::from_str("lamb")), iter.next()); + /// + /// assert_eq!(None, iter.next()); + /// ``` + #[inline] + pub fn split_ascii_whitespace(&self) -> SplitAsciiWhitespace<'_> { + #[inline] + fn is_non_empty(bytes: &&[u8]) -> bool { + !bytes.is_empty() + } + + SplitAsciiWhitespace { + inner: self + .as_bytes() + .split(u8::is_ascii_whitespace as fn(&u8) -> bool) + .filter(is_non_empty as fn(&&[u8]) -> bool) + .map(|bytes| unsafe { JavaStr::from_semi_utf8_unchecked(bytes) }), + } + } + + /// See [str::split_at]. + /// + /// ``` + /// # use java_string::JavaStr; + /// let s = JavaStr::from_str("Per Martin-Löf"); + /// + /// let (first, last) = s.split_at(3); + /// + /// assert_eq!("Per", first); + /// assert_eq!(" Martin-Löf", last); + /// ``` + /// ```should_panic + /// # use java_string::JavaStr; + /// let s = JavaStr::from_str("Per Martin-Löf"); + /// // Should panic + /// let _ = s.split_at(13); + /// ``` + #[inline] + #[must_use] + pub fn split_at(&self, mid: usize) -> (&JavaStr, &JavaStr) { + // is_char_boundary checks that the index is in [0, .len()] + if self.is_char_boundary(mid) { + // SAFETY: just checked that `mid` is on a char boundary. + unsafe { + ( + self.get_unchecked(0..mid), + self.get_unchecked(mid..self.len()), + ) + } + } else { + slice_error_fail(self, 0, mid) + } + } + + /// See [str::split_at_mut]. 
+ /// + /// ``` + /// # use java_string::{JavaStr, JavaString}; + /// let mut s = JavaString::from("Per Martin-Löf"); + /// let s = s.as_mut_java_str(); + /// + /// let (first, last) = s.split_at_mut(3); + /// + /// assert_eq!("Per", first); + /// assert_eq!(" Martin-Löf", last); + /// ``` + /// ```should_panic + /// # use java_string::{JavaStr, JavaString}; + /// let mut s = JavaString::from("Per Martin-Löf"); + /// let s = s.as_mut_java_str(); + /// // Should panic + /// let _ = s.split_at_mut(13); + /// ``` + #[inline] + #[must_use] + pub fn split_at_mut(&mut self, mid: usize) -> (&mut JavaStr, &mut JavaStr) { + // is_char_boundary checks that the index is in [0, .len()] + if self.is_char_boundary(mid) { + let len = self.len(); + let ptr = self.as_mut_ptr(); + // SAFETY: just checked that `mid` is on a char boundary. + unsafe { + ( + JavaStr::from_semi_utf8_unchecked_mut(slice::from_raw_parts_mut(ptr, mid)), + JavaStr::from_semi_utf8_unchecked_mut(slice::from_raw_parts_mut( + ptr.add(mid), + len - mid, + )), + ) + } + } else { + slice_error_fail(self, 0, mid) + } + } + + /// See [str::split_inclusive]. + /// + /// ``` + /// # use java_string::JavaStr; + /// let v: Vec<&JavaStr> = JavaStr::from_str("Mary had a little lamb\nlittle lamb\nlittle lamb.\n") + /// .split_inclusive('\n') + /// .collect(); + /// assert_eq!( + /// v, + /// [ + /// JavaStr::from_str("Mary had a little lamb\n"), + /// JavaStr::from_str("little lamb\n"), + /// JavaStr::from_str("little lamb.\n") + /// ] + /// ); + /// ``` + #[inline] + pub fn split_inclusive

(&self, pat: P) -> SplitInclusive

+ where + P: JavaStrPattern, + { + SplitInclusive::new(self, pat) + } + + /// See [str::split_once]. + /// + /// ``` + /// # use java_string::JavaStr; + /// assert_eq!(JavaStr::from_str("cfg").split_once('='), None); + /// assert_eq!( + /// JavaStr::from_str("cfg=").split_once('='), + /// Some((JavaStr::from_str("cfg"), JavaStr::from_str(""))) + /// ); + /// assert_eq!( + /// JavaStr::from_str("cfg=foo").split_once('='), + /// Some((JavaStr::from_str("cfg"), JavaStr::from_str("foo"))) + /// ); + /// assert_eq!( + /// JavaStr::from_str("cfg=foo=bar").split_once('='), + /// Some((JavaStr::from_str("cfg"), JavaStr::from_str("foo=bar"))) + /// ); + /// ``` + #[inline] + #[must_use] + pub fn split_once

(&self, mut delimiter: P) -> Option<(&JavaStr, &JavaStr)> + where + P: JavaStrPattern, + { + let (index, len) = delimiter.find_in(self)?; + // SAFETY: pattern is known to return valid indices. + unsafe { + Some(( + self.get_unchecked(..index), + self.get_unchecked(index + len..), + )) + } + } + + /// See [str::split_terminator]. + /// + /// ``` + /// # use java_string::JavaStr; + /// let v: Vec<&JavaStr> = JavaStr::from_str("A.B.").split_terminator('.').collect(); + /// assert_eq!(v, [JavaStr::from_str("A"), JavaStr::from_str("B")]); + /// + /// let v: Vec<&JavaStr> = JavaStr::from_str("A..B..").split_terminator(".").collect(); + /// assert_eq!( + /// v, + /// [ + /// JavaStr::from_str("A"), + /// JavaStr::from_str(""), + /// JavaStr::from_str("B"), + /// JavaStr::from_str("") + /// ] + /// ); + /// + /// let v: Vec<&JavaStr> = JavaStr::from_str("A.B:C.D") + /// .split_terminator(&['.', ':'][..]) + /// .collect(); + /// assert_eq!( + /// v, + /// [ + /// JavaStr::from_str("A"), + /// JavaStr::from_str("B"), + /// JavaStr::from_str("C"), + /// JavaStr::from_str("D") + /// ] + /// ); + /// ``` + #[inline] + pub fn split_terminator

(&self, pat: P) -> SplitTerminator

+ where + P: JavaStrPattern, + { + SplitTerminator::new(self, pat) + } + + /// See [str::split_whitespace]. + #[inline] + pub fn split_whitespace(&self) -> SplitWhitespace<'_> { + SplitWhitespace { + inner: self + .split(JavaCodePoint::is_whitespace as fn(JavaCodePoint) -> bool) + .filter(|str| !str.is_empty()), + } + } + + /// See [str::splitn]. + /// + /// ``` + /// # use java_string::JavaStr; + /// let v: Vec<&JavaStr> = JavaStr::from_str("Mary had a little lambda") + /// .splitn(3, ' ') + /// .collect(); + /// assert_eq!( + /// v, + /// [ + /// JavaStr::from_str("Mary"), + /// JavaStr::from_str("had"), + /// JavaStr::from_str("a little lambda") + /// ] + /// ); + /// + /// let v: Vec<&JavaStr> = JavaStr::from_str("lionXXtigerXleopard") + /// .splitn(3, "X") + /// .collect(); + /// assert_eq!( + /// v, + /// [ + /// JavaStr::from_str("lion"), + /// JavaStr::from_str(""), + /// JavaStr::from_str("tigerXleopard") + /// ] + /// ); + /// + /// let v: Vec<&JavaStr> = JavaStr::from_str("abcXdef").splitn(1, 'X').collect(); + /// assert_eq!(v, [JavaStr::from_str("abcXdef")]); + /// + /// let v: Vec<&JavaStr> = JavaStr::from_str("").splitn(1, 'X').collect(); + /// assert_eq!(v, [JavaStr::from_str("")]); + /// ``` + #[inline] + pub fn splitn

(&self, n: usize, pat: P) -> SplitN

+ where + P: JavaStrPattern, + { + SplitN::new(self, pat, n) + } + + /// See [str::starts_with]. + /// + /// ``` + /// # use java_string::JavaStr; + /// let bananas = JavaStr::from_str("bananas"); + /// + /// assert!(bananas.starts_with("bana")); + /// assert!(!bananas.starts_with("nana")); + /// ``` + #[inline] + #[must_use] + pub fn starts_with

(&self, mut pat: P) -> bool + where + P: JavaStrPattern, + { + pat.prefix_len_in(self).is_some() + } + + /// See [str::strip_prefix]. + /// + /// ``` + /// # use java_string::JavaStr; + /// assert_eq!( + /// JavaStr::from_str("foo:bar").strip_prefix("foo:"), + /// Some(JavaStr::from_str("bar")) + /// ); + /// assert_eq!(JavaStr::from_str("foo:bar").strip_prefix("bar"), None); + /// assert_eq!( + /// JavaStr::from_str("foofoo").strip_prefix("foo"), + /// Some(JavaStr::from_str("foo")) + /// ); + /// ``` + #[inline] + #[must_use] + pub fn strip_prefix

(&self, mut prefix: P) -> Option<&JavaStr> + where + P: JavaStrPattern, + { + let len = prefix.prefix_len_in(self)?; + // SAFETY: pattern is known to return valid indices. + unsafe { Some(self.get_unchecked(len..)) } + } + + /// See [str::strip_suffix]. + /// + /// ``` + /// # use java_string::JavaStr; + /// assert_eq!( + /// JavaStr::from_str("bar:foo").strip_suffix(":foo"), + /// Some(JavaStr::from_str("bar")) + /// ); + /// assert_eq!(JavaStr::from_str("bar:foo").strip_suffix("bar"), None); + /// assert_eq!( + /// JavaStr::from_str("foofoo").strip_suffix("foo"), + /// Some(JavaStr::from_str("foo")) + /// ); + /// ``` + #[inline] + #[must_use] + pub fn strip_suffix

(&self, mut suffix: P) -> Option<&JavaStr> + where + P: JavaStrPattern, + { + let len = suffix.suffix_len_in(self)?; + // SAFETY: pattern is known to return valid indices. + unsafe { Some(self.get_unchecked(..self.len() - len)) } + } + + /// See [str::to_ascii_lowercase]. + #[inline] + #[must_use] + pub fn to_ascii_lowercase(&self) -> JavaString { + let mut s = self.to_owned(); + s.make_ascii_lowercase(); + s + } + + /// See [str::to_ascii_uppercase]. + #[inline] + #[must_use] + pub fn to_ascii_uppercase(&self) -> JavaString { + let mut s = self.to_owned(); + s.make_ascii_uppercase(); + s + } + + /// See [str::to_lowercase]. + /// + /// ``` + /// # use java_string::{JavaCodePoint, JavaStr, JavaString}; + /// let s = JavaStr::from_str("HELLO"); + /// assert_eq!("hello", s.to_lowercase()); + /// + /// let odysseus = JavaStr::from_str("ὈΔΥΣΣΕΎΣ"); + /// assert_eq!("ὀδυσσεύς", odysseus.to_lowercase()); + /// + /// let s = JavaString::from("Hello ") + /// + JavaString::from(JavaCodePoint::from_u32(0xd800).unwrap()).as_java_str() + /// + JavaStr::from_str(" World!"); + /// let expected = JavaString::from("hello ") + /// + JavaString::from(JavaCodePoint::from_u32(0xd800).unwrap()).as_java_str() + /// + JavaStr::from_str(" world!"); + /// assert_eq!(expected, s.to_lowercase()); + /// ``` + #[inline] + #[must_use] + pub fn to_lowercase(&self) -> JavaString { + self.transform_string(str::to_lowercase, |ch| ch) + } + + /// See [str::to_uppercase]. 
+ /// + /// ``` + /// # use java_string::{JavaCodePoint, JavaStr, JavaString}; + /// let s = JavaStr::from_str("hello"); + /// assert_eq!("HELLO", s.to_uppercase()); + /// + /// let s = JavaStr::from_str("tschüß"); + /// assert_eq!("TSCHÜSS", s.to_uppercase()); + /// + /// let s = JavaString::from("Hello ") + /// + JavaString::from(JavaCodePoint::from_u32(0xd800).unwrap()).as_java_str() + /// + JavaStr::from_str(" World!"); + /// let expected = JavaString::from("HELLO ") + /// + JavaString::from(JavaCodePoint::from_u32(0xd800).unwrap()).as_java_str() + /// + JavaStr::from_str(" WORLD!"); + /// assert_eq!(expected, s.to_uppercase()); + /// ``` + #[inline] + #[must_use] + pub fn to_uppercase(&self) -> JavaString { + self.transform_string(str::to_uppercase, |ch| ch) + } + + /// See [str::trim]. + #[inline] + #[must_use] + pub fn trim(&self) -> &JavaStr { + self.trim_matches(|c: JavaCodePoint| c.is_whitespace()) + } + + /// See [str::trim_end]. + #[inline] + #[must_use] + pub fn trim_end(&self) -> &JavaStr { + self.trim_end_matches(|c: JavaCodePoint| c.is_whitespace()) + } + + /// See [str::trim_end_matches]. + /// + /// ``` + /// # use java_string::{JavaCodePoint, JavaStr}; + /// assert_eq!( + /// JavaStr::from_str("11foo1bar11").trim_end_matches('1'), + /// "11foo1bar" + /// ); + /// assert_eq!( + /// JavaStr::from_str("123foo1bar123").trim_end_matches(JavaCodePoint::is_numeric), + /// "123foo1bar" + /// ); + /// + /// let x: &[_] = &['1', '2']; + /// assert_eq!( + /// JavaStr::from_str("12foo1bar12").trim_end_matches(x), + /// "12foo1bar" + /// ); + /// ``` + #[inline] + #[must_use] + pub fn trim_end_matches

(&self, mut pat: P) -> &JavaStr + where + P: JavaStrPattern, + { + let mut str = self; + while let Some(suffix_len) = pat.suffix_len_in(str) { + if suffix_len == 0 { + break; + } + // SAFETY: pattern is known to return valid indices. + str = unsafe { str.get_unchecked(..str.len() - suffix_len) }; + } + str + } + + /// See [str::trim_matches]. + /// + /// ``` + /// # use java_string::{JavaCodePoint, JavaStr}; + /// assert_eq!( + /// JavaStr::from_str("11foo1bar11").trim_matches('1'), + /// "foo1bar" + /// ); + /// assert_eq!( + /// JavaStr::from_str("123foo1bar123").trim_matches(JavaCodePoint::is_numeric), + /// "foo1bar" + /// ); + /// + /// let x: &[_] = &['1', '2']; + /// assert_eq!(JavaStr::from_str("12foo1bar12").trim_matches(x), "foo1bar"); + /// ``` + #[inline] + #[must_use] + pub fn trim_matches

(&self, mut pat: P) -> &JavaStr + where + P: JavaStrPattern, + { + let mut str = self; + while let Some(prefix_len) = pat.prefix_len_in(str) { + if prefix_len == 0 { + break; + } + // SAFETY: pattern is known to return valid indices. + str = unsafe { str.get_unchecked(prefix_len..) }; + } + while let Some(suffix_len) = pat.suffix_len_in(str) { + if suffix_len == 0 { + break; + } + // SAFETY: pattern is known to return valid indices. + str = unsafe { str.get_unchecked(..str.len() - suffix_len) }; + } + str + } + + /// See [str::trim_start]. + #[inline] + #[must_use] + pub fn trim_start(&self) -> &JavaStr { + self.trim_start_matches(|c: JavaCodePoint| c.is_whitespace()) + } + + /// See [str::trim_start_matches]. + /// + /// ``` + /// # use java_string::{JavaCodePoint, JavaStr}; + /// assert_eq!( + /// JavaStr::from_str("11foo1bar11").trim_start_matches('1'), + /// "foo1bar11" + /// ); + /// assert_eq!( + /// JavaStr::from_str("123foo1bar123").trim_start_matches(JavaCodePoint::is_numeric), + /// "foo1bar123" + /// ); + /// + /// let x: &[_] = &['1', '2']; + /// assert_eq!( + /// JavaStr::from_str("12foo1bar12").trim_start_matches(x), + /// "foo1bar12" + /// ); + /// ``` + #[inline] + #[must_use] + pub fn trim_start_matches

(&self, mut pat: P) -> &JavaStr + where + P: JavaStrPattern, + { + let mut str = self; + while let Some(prefix_len) = pat.prefix_len_in(str) { + if prefix_len == 0 { + break; + } + // SAFETY: pattern is known to return valid indices. + str = unsafe { str.get_unchecked(prefix_len..) }; + } + str + } + + #[inline] + fn transform_string( + &self, + mut string_transformer: SF, + invalid_char_transformer: ICF, + ) -> JavaString + where + SF: FnMut(&str) -> String, + ICF: FnMut(&JavaStr) -> &JavaStr, + { + let bytes = self.as_bytes(); + match run_utf8_full_validation_from_semi(bytes) { + Ok(()) => JavaString::from(string_transformer(unsafe { + // SAFETY: validation succeeded + std::str::from_utf8_unchecked(bytes) + })), + Err(error) => { + self.transform_invalid_string(error, string_transformer, invalid_char_transformer) + } + } + } + + #[inline] + fn transform_invalid_string( + &self, + error: Utf8Error, + mut string_transformer: SF, + mut invalid_char_transformer: ICF, + ) -> JavaString + where + SF: FnMut(&str) -> String, + ICF: FnMut(&JavaStr) -> &JavaStr, + { + let bytes = self.as_bytes(); + let mut result = JavaString::from(string_transformer(unsafe { + // SAFETY: validation succeeded up to this index + std::str::from_utf8_unchecked(bytes.get_unchecked(..error.valid_up_to)) + })); + result.push_java_str(invalid_char_transformer(unsafe { + // SAFETY: any UTF-8 error in semi-valid UTF-8 is a 3 byte long sequence + // representing a surrogate code point. We're pushing that sequence now + JavaStr::from_semi_utf8_unchecked( + bytes.get_unchecked(error.valid_up_to..error.valid_up_to + 3), + ) + })); + let mut index = error.valid_up_to + 3; + loop { + let remainder = unsafe { bytes.get_unchecked(index..) 
}; + match run_utf8_full_validation_from_semi(remainder) { + Ok(()) => { + result.push_str(&string_transformer(unsafe { + // SAFETY: validation succeeded + std::str::from_utf8_unchecked(remainder) + })); + return result; + } + Err(error) => { + result.push_str(&string_transformer(unsafe { + // SAFETY: validation succeeded up to this index + std::str::from_utf8_unchecked( + bytes.get_unchecked(index..index + error.valid_up_to), + ) + })); + result.push_java_str(invalid_char_transformer(unsafe { + // SAFETY: see comment above + JavaStr::from_semi_utf8_unchecked(bytes.get_unchecked( + index + error.valid_up_to..index + error.valid_up_to + 3, + )) + })); + index += error.valid_up_to + 3; + } + } + } + } +} + +impl<'a> Add<&JavaStr> for Cow<'a, JavaStr> { + type Output = Cow<'a, JavaStr>; + + #[inline] + fn add(mut self, rhs: &JavaStr) -> Self::Output { + self += rhs; + self + } +} + +impl<'a> AddAssign<&JavaStr> for Cow<'a, JavaStr> { + #[inline] + fn add_assign(&mut self, rhs: &JavaStr) { + if !rhs.is_empty() { + match self { + Cow::Borrowed(lhs) => { + let mut result = lhs.to_owned(); + result.push_java_str(rhs); + *self = Cow::Owned(result); + } + Cow::Owned(lhs) => { + lhs.push_java_str(rhs); + } + } + } + } +} + +impl AsRef<[u8]> for JavaStr { + #[inline] + fn as_ref(&self) -> &[u8] { + self.as_bytes() + } +} + +impl Clone for Box { + #[inline] + fn clone(&self) -> Self { + let buf: Box<[u8]> = self.as_bytes().into(); + unsafe { JavaStr::from_boxed_semi_utf8_unchecked(buf) } + } +} + +impl Debug for JavaStr { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + f.write_char('"')?; + let mut from = 0; + for (i, c) in self.char_indices() { + let esc = c.escape_debug_ext(EscapeDebugExtArgs { + escape_single_quote: false, + escape_double_quote: true, + }); + // If char needs escaping, flush backlog so far and write, else skip. 
+ // Also handle invalid UTF-8 here + if esc.len() != 1 || c.as_char().is_none() { + unsafe { + // SAFETY: any invalid UTF-8 should have been caught by a previous iteration + f.write_str(self[from..i].as_str_unchecked())?; + } + for c in esc { + f.write_char(c)?; + } + from = i + c.len_utf8(); + } + } + unsafe { + // SAFETY: any invalid UTF-8 should have been caught by the loop above + f.write_str(self[from..].as_str_unchecked())?; + } + f.write_char('"') + } +} + +impl Default for &JavaStr { + #[inline] + fn default() -> Self { + JavaStr::from_str("") + } +} + +impl Default for Box { + #[inline] + fn default() -> Self { + JavaStr::from_boxed_str(Box::::default()) + } +} + +impl Display for JavaStr { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + Display::fmt(&self.as_str_lossy(), f) + } +} + +impl<'a> From<&'a JavaStr> for Cow<'a, JavaStr> { + #[inline] + fn from(value: &'a JavaStr) -> Self { + Cow::Borrowed(value) + } +} + +impl From<&JavaStr> for Arc { + #[inline] + fn from(value: &JavaStr) -> Self { + let arc = Arc::<[u8]>::from(value.as_bytes()); + unsafe { Arc::from_raw(Arc::into_raw(arc) as *const JavaStr) } + } +} + +impl From<&JavaStr> for Box { + #[inline] + fn from(value: &JavaStr) -> Self { + unsafe { JavaStr::from_boxed_semi_utf8_unchecked(Box::from(value.as_bytes())) } + } +} + +impl From<&JavaStr> for Rc { + #[inline] + fn from(value: &JavaStr) -> Self { + let rc = Rc::<[u8]>::from(value.as_bytes()); + unsafe { Rc::from_raw(Rc::into_raw(rc) as *const JavaStr) } + } +} + +impl From<&JavaStr> for Vec { + #[inline] + fn from(value: &JavaStr) -> Self { + From::from(value.as_bytes()) + } +} + +impl From> for Box { + #[inline] + fn from(value: Cow<'_, JavaStr>) -> Self { + match value { + Cow::Borrowed(s) => Box::from(s), + Cow::Owned(s) => Box::from(s), + } + } +} + +impl From for Box { + #[inline] + fn from(value: JavaString) -> Self { + value.into_boxed_str() + } +} + +impl<'a> From<&'a str> for &'a JavaStr { + #[inline] + fn from(value: 
&'a str) -> Self { + JavaStr::from_str(value) + } +} + +impl Hash for JavaStr { + #[inline] + fn hash(&self, state: &mut H) { + state.write(self.as_bytes()); + state.write_u8(0xff); + } +} + +impl Index for JavaStr +where + I: JavaStrSliceIndex, +{ + type Output = JavaStr; + + #[inline] + fn index(&self, index: I) -> &Self::Output { + index.index(self) + } +} + +impl IndexMut for JavaStr +where + I: JavaStrSliceIndex, +{ + #[inline] + fn index_mut(&mut self, index: I) -> &mut Self::Output { + index.index_mut(self) + } +} + +impl<'a, 'b> PartialEq<&'b JavaStr> for Cow<'a, str> { + #[inline] + fn eq(&self, other: &&'b JavaStr) -> bool { + self == *other + } +} + +impl<'a, 'b> PartialEq<&'b JavaStr> for Cow<'a, JavaStr> { + #[inline] + fn eq(&self, other: &&'b JavaStr) -> bool { + self == *other + } +} + +impl<'a, 'b> PartialEq> for &'b JavaStr { + #[inline] + fn eq(&self, other: &Cow<'a, str>) -> bool { + *self == other + } +} + +impl<'a> PartialEq> for JavaStr { + #[inline] + fn eq(&self, other: &Cow<'a, str>) -> bool { + other == self + } +} + +impl<'a, 'b> PartialEq> for &'b JavaStr { + #[inline] + fn eq(&self, other: &Cow<'a, JavaStr>) -> bool { + *self == other + } +} + +impl<'a> PartialEq> for JavaStr { + #[inline] + fn eq(&self, other: &Cow<'a, JavaStr>) -> bool { + other == self + } +} + +impl<'a> PartialEq for &'a JavaStr { + #[inline] + fn eq(&self, other: &String) -> bool { + *self == other + } +} + +impl PartialEq for JavaStr { + #[inline] + fn eq(&self, other: &String) -> bool { + self == &other[..] + } +} + +impl PartialEq for String { + #[inline] + fn eq(&self, other: &JavaStr) -> bool { + &self[..] == other + } +} + +impl<'a> PartialEq for &'a JavaStr { + #[inline] + fn eq(&self, other: &JavaString) -> bool { + *self == other + } +} + +impl PartialEq for JavaStr { + #[inline] + fn eq(&self, other: &JavaString) -> bool { + self == other[..] 
+ } +} + +impl<'a> PartialEq for Cow<'a, str> { + #[inline] + fn eq(&self, other: &JavaStr) -> bool { + match self { + Cow::Borrowed(this) => this == other, + Cow::Owned(this) => this == other, + } + } +} + +impl<'a> PartialEq for Cow<'a, JavaStr> { + #[inline] + fn eq(&self, other: &JavaStr) -> bool { + match self { + Cow::Borrowed(this) => this == other, + Cow::Owned(this) => this == other, + } + } +} + +impl PartialEq for str { + #[inline] + fn eq(&self, other: &JavaStr) -> bool { + JavaStr::from_str(self) == other + } +} + +impl<'a> PartialEq for &'a str { + #[inline] + fn eq(&self, other: &JavaStr) -> bool { + *self == other + } +} + +impl PartialEq for JavaStr { + #[inline] + fn eq(&self, other: &str) -> bool { + self == JavaStr::from_str(other) + } +} + +impl<'a> PartialEq<&'a str> for JavaStr { + #[inline] + fn eq(&self, other: &&'a str) -> bool { + self == *other + } +} + +impl<'a> PartialEq for &'a JavaStr { + #[inline] + fn eq(&self, other: &JavaStr) -> bool { + *self == other + } +} + +impl<'a> PartialEq<&'a JavaStr> for JavaStr { + #[inline] + fn eq(&self, other: &&'a JavaStr) -> bool { + self == *other + } +} + +impl ToOwned for JavaStr { + type Owned = JavaString; + + #[inline] + fn to_owned(&self) -> Self::Owned { + unsafe { JavaString::from_semi_utf8_unchecked(self.as_bytes().to_vec()) } + } +} + +mod private_slice_index { + use std::ops; + + pub trait Sealed {} + + impl Sealed for ops::Range {} + impl Sealed for ops::RangeTo {} + impl Sealed for ops::RangeFrom {} + impl Sealed for ops::RangeFull {} + impl Sealed for ops::RangeInclusive {} + impl Sealed for ops::RangeToInclusive {} +} + +/// # Safety +/// +/// Implementations' `check_bounds` method must properly check the bounds of the +/// slice, such that calling `get_unchecked` is not UB. 
+pub unsafe trait JavaStrSliceIndex: private_slice_index::Sealed + Sized { + fn check_bounds(&self, slice: &JavaStr) -> bool; + fn check_bounds_fail(self, slice: &JavaStr) -> !; + + /// # Safety + /// + /// - The input slice must be a valid pointer + /// - This index must not be out of bounds of the input slice + /// - The indices of this slice must point to char boundaries in the input + /// slice + unsafe fn get_unchecked(self, slice: *const JavaStr) -> *const JavaStr; + + /// # Safety + /// + /// - The input slice must be a valid pointer + /// - This index must not be out of bounds of the input slice + /// - The indices of this slice must point to char boundaries in the input + /// slice + unsafe fn get_unchecked_mut(self, slice: *mut JavaStr) -> *mut JavaStr; + + #[inline] + fn get(self, slice: &JavaStr) -> Option<&JavaStr> { + if self.check_bounds(slice) { + Some(unsafe { &*self.get_unchecked(slice) }) + } else { + None + } + } + + #[inline] + fn get_mut(self, slice: &mut JavaStr) -> Option<&mut JavaStr> { + if self.check_bounds(slice) { + Some(unsafe { &mut *self.get_unchecked_mut(slice) }) + } else { + None + } + } + + #[inline] + fn index(self, slice: &JavaStr) -> &JavaStr { + if self.check_bounds(slice) { + unsafe { &*self.get_unchecked(slice) } + } else { + self.check_bounds_fail(slice) + } + } + + #[inline] + fn index_mut(self, slice: &mut JavaStr) -> &mut JavaStr { + if self.check_bounds(slice) { + unsafe { &mut *self.get_unchecked_mut(slice) } + } else { + self.check_bounds_fail(slice) + } + } +} + +unsafe impl JavaStrSliceIndex for RangeFull { + #[inline] + fn check_bounds(&self, _slice: &JavaStr) -> bool { + true + } + + #[inline] + fn check_bounds_fail(self, _slice: &JavaStr) -> ! 
{ + unreachable!() + } + + #[inline] + unsafe fn get_unchecked(self, slice: *const JavaStr) -> *const JavaStr { + slice + } + + #[inline] + unsafe fn get_unchecked_mut(self, slice: *mut JavaStr) -> *mut JavaStr { + slice + } +} + +unsafe impl JavaStrSliceIndex for Range { + #[inline] + fn check_bounds(&self, slice: &JavaStr) -> bool { + self.start <= self.end + && slice.is_char_boundary(self.start) + && slice.is_char_boundary(self.end) + } + + #[inline] + #[track_caller] + fn check_bounds_fail(self, slice: &JavaStr) -> ! { + slice_error_fail(slice, self.start, self.end) + } + + #[inline] + unsafe fn get_unchecked(self, slice: *const JavaStr) -> *const JavaStr { + let slice = slice as *const [u8]; + // SAFETY: the caller guarantees that `self` is in bounds of `slice` + // which satisfies all the conditions for `add`. + let ptr = unsafe { (slice as *const u8).add(self.start) }; + let len = self.end - self.start; + ptr::slice_from_raw_parts(ptr, len) as *const JavaStr + } + + #[inline] + unsafe fn get_unchecked_mut(self, slice: *mut JavaStr) -> *mut JavaStr { + let slice = slice as *mut [u8]; + // SAFETY: see comments for `get_unchecked`. + let ptr = unsafe { (slice as *mut u8).add(self.start) }; + let len = self.end - self.start; + ptr::slice_from_raw_parts_mut(ptr, len) as *mut JavaStr + } +} + +unsafe impl JavaStrSliceIndex for RangeTo { + #[inline] + fn check_bounds(&self, slice: &JavaStr) -> bool { + slice.is_char_boundary(self.end) + } + + #[inline] + #[track_caller] + fn check_bounds_fail(self, slice: &JavaStr) -> ! 
{ + slice_error_fail(slice, 0, self.end) + } + + #[inline] + unsafe fn get_unchecked(self, slice: *const JavaStr) -> *const JavaStr { + unsafe { (0..self.end).get_unchecked(slice) } + } + + #[inline] + unsafe fn get_unchecked_mut(self, slice: *mut JavaStr) -> *mut JavaStr { + unsafe { (0..self.end).get_unchecked_mut(slice) } + } +} + +unsafe impl JavaStrSliceIndex for RangeFrom { + #[inline] + fn check_bounds(&self, slice: &JavaStr) -> bool { + slice.is_char_boundary(self.start) + } + + #[inline] + #[track_caller] + fn check_bounds_fail(self, slice: &JavaStr) -> ! { + slice_error_fail(slice, self.start, slice.len()) + } + + #[inline] + unsafe fn get_unchecked(self, slice: *const JavaStr) -> *const JavaStr { + let len = unsafe { (*(slice as *const [u8])).len() }; + unsafe { (self.start..len).get_unchecked(slice) } + } + + #[inline] + unsafe fn get_unchecked_mut(self, slice: *mut JavaStr) -> *mut JavaStr { + let len = unsafe { (*(slice as *mut [u8])).len() }; + unsafe { (self.start..len).get_unchecked_mut(slice) } + } +} + +#[inline] +fn into_slice_range(range: RangeInclusive) -> Range { + let exclusive_end = *range.end() + 1; + let start = match range.end_bound() { + Bound::Excluded(..) => exclusive_end, // excluded + Bound::Included(..) => *range.start(), + Bound::Unbounded => unreachable!(), + }; + start..exclusive_end +} + +unsafe impl JavaStrSliceIndex for RangeInclusive { + #[inline] + fn check_bounds(&self, slice: &JavaStr) -> bool { + *self.end() != usize::MAX && into_slice_range(self.clone()).check_bounds(slice) + } + + #[inline] + #[track_caller] + fn check_bounds_fail(self, slice: &JavaStr) -> ! 
{ + if *self.end() == usize::MAX { + str_end_index_overflow_fail() + } else { + into_slice_range(self).check_bounds_fail(slice) + } + } + + #[inline] + unsafe fn get_unchecked(self, slice: *const JavaStr) -> *const JavaStr { + into_slice_range(self).get_unchecked(slice) + } + + #[inline] + unsafe fn get_unchecked_mut(self, slice: *mut JavaStr) -> *mut JavaStr { + into_slice_range(self).get_unchecked_mut(slice) + } +} + +unsafe impl JavaStrSliceIndex for RangeToInclusive { + #[inline] + fn check_bounds(&self, slice: &JavaStr) -> bool { + (0..=self.end).check_bounds(slice) + } + + #[inline] + fn check_bounds_fail(self, slice: &JavaStr) -> ! { + (0..=self.end).check_bounds_fail(slice) + } + + #[inline] + unsafe fn get_unchecked(self, slice: *const JavaStr) -> *const JavaStr { + (0..=self.end).get_unchecked(slice) + } + + #[inline] + unsafe fn get_unchecked_mut(self, slice: *mut JavaStr) -> *mut JavaStr { + (0..=self.end).get_unchecked_mut(slice) + } +} diff --git a/crates/java_string/src/validations.rs b/crates/java_string/src/validations.rs new file mode 100644 index 000000000..102783f55 --- /dev/null +++ b/crates/java_string/src/validations.rs @@ -0,0 +1,369 @@ +use std::ops::{Bound, Range, RangeBounds, RangeTo}; + +use crate::{JavaStr, Utf8Error}; + +pub(crate) const TAG_CONT: u8 = 0b1000_0000; +pub(crate) const TAG_TWO_B: u8 = 0b1100_0000; +pub(crate) const TAG_THREE_B: u8 = 0b1110_0000; +pub(crate) const TAG_FOUR_B: u8 = 0b1111_0000; +pub(crate) const CONT_MASK: u8 = 0b0011_1111; + +#[inline] +const fn utf8_first_byte(byte: u8, width: u32) -> u32 { + (byte & (0x7f >> width)) as u32 +} + +#[inline] +const fn utf8_acc_cont_byte(ch: u32, byte: u8) -> u32 { + (ch << 6) | (byte & CONT_MASK) as u32 +} + +#[inline] +const fn utf8_is_cont_byte(byte: u8) -> bool { + (byte as i8) < -64 +} + +/// # Safety +/// +/// `bytes` must produce a semi-valid UTF-8 string +#[inline] +pub(crate) unsafe fn next_code_point<'a, I: Iterator>(bytes: &mut I) -> Option { + // Decode UTF-8 + 
let x = *bytes.next()?; + if x < 128 { + return Some(x as u32); + } + + // Multibyte case follows + // Decode from a byte combination out of: [[[x y] z] w] + // NOTE: Performance is sensitive to the exact formulation here + let init = utf8_first_byte(x, 2); + // SAFETY: `bytes` produces an UTF-8-like string, + // so the iterator must produce a value here. + let y = unsafe { *bytes.next().unwrap_unchecked() }; + let mut ch = utf8_acc_cont_byte(init, y); + if x >= 0xe0 { + // [[x y z] w] case + // 5th bit in 0xE0 .. 0xEF is always clear, so `init` is still valid + // SAFETY: `bytes` produces an UTF-8-like string, + // so the iterator must produce a value here. + let z = unsafe { *bytes.next().unwrap_unchecked() }; + let y_z = utf8_acc_cont_byte((y & CONT_MASK) as u32, z); + ch = init << 12 | y_z; + if x >= 0xf0 { + // [x y z w] case + // use only the lower 3 bits of `init` + // SAFETY: `bytes` produces an UTF-8-like string, + // so the iterator must produce a value here. + let w = unsafe { *bytes.next().unwrap_unchecked() }; + ch = (init & 7) << 18 | utf8_acc_cont_byte(y_z, w); + } + } + + Some(ch) +} + +/// # Safety +/// +/// `bytes` must produce a semi-valid UTF-8 string +#[inline] +pub(crate) unsafe fn next_code_point_reverse<'a, I: DoubleEndedIterator>( + bytes: &mut I, +) -> Option { + // Decode UTF-8 + let w = match *bytes.next_back()? { + next_byte if next_byte < 128 => return Some(next_byte as u32), + back_byte => back_byte, + }; + + // Multibyte case follows + // Decode from a byte combination out of: [x [y [z w]]] + let mut ch; + // SAFETY: `bytes` produces an UTF-8-like string, + // so the iterator must produce a value here. + let z = unsafe { *bytes.next_back().unwrap_unchecked() }; + ch = utf8_first_byte(z, 2); + if utf8_is_cont_byte(z) { + // SAFETY: `bytes` produces an UTF-8-like string, + // so the iterator must produce a value here. 
+ let y = unsafe { *bytes.next_back().unwrap_unchecked() }; + ch = utf8_first_byte(y, 3); + if utf8_is_cont_byte(y) { + // SAFETY: `bytes` produces an UTF-8-like string, + // so the iterator must produce a value here. + let x = unsafe { *bytes.next_back().unwrap_unchecked() }; + ch = utf8_first_byte(x, 4); + ch = utf8_acc_cont_byte(ch, y); + } + ch = utf8_acc_cont_byte(ch, z); + } + ch = utf8_acc_cont_byte(ch, w); + + Some(ch) +} + +#[inline(always)] +pub(crate) fn run_utf8_semi_validation(v: &[u8]) -> Result<(), Utf8Error> { + let mut index = 0; + let len = v.len(); + + let usize_bytes = std::mem::size_of::(); + let ascii_block_size = 2 * usize_bytes; + let blocks_end = if len >= ascii_block_size { + len - ascii_block_size + 1 + } else { + 0 + }; + let align = v.as_ptr().align_offset(usize_bytes); + + while index < len { + let old_offset = index; + macro_rules! err { + ($error_len:expr) => { + return Err(Utf8Error { + valid_up_to: old_offset, + error_len: $error_len, + }) + }; + } + + macro_rules! next { + () => {{ + index += 1; + // we needed data, but there was none: error! 
+ if index >= len { + err!(None) + } + v[index] + }}; + } + + let first = v[index]; + if first >= 128 { + let w = utf8_char_width(first); + // 2-byte encoding is for codepoints \u{0080} to \u{07ff} + // first C2 80 last DF BF + // 3-byte encoding is for codepoints \u{0800} to \u{ffff} + // first E0 A0 80 last EF BF BF + // INCLUDING surrogates codepoints \u{d800} to \u{dfff} + // ED A0 80 to ED BF BF + // 4-byte encoding is for codepoints \u{1000}0 to \u{10ff}ff + // first F0 90 80 80 last F4 8F BF BF + // + // Use the UTF-8 syntax from the RFC + // + // https://tools.ietf.org/html/rfc3629 + // UTF8-1 = %x00-7F + // UTF8-2 = %xC2-DF UTF8-tail + // UTF8-3 = %xE0 %xA0-BF UTF8-tail / %xE1-EC 2( UTF8-tail ) / + // %xED %x80-9F UTF8-tail / %xEE-EF 2( UTF8-tail ) + // UTF8-4 = %xF0 %x90-BF 2( UTF8-tail ) / %xF1-F3 3( UTF8-tail ) / + // %xF4 %x80-8F 2( UTF8-tail ) + match w { + 2 => { + if next!() as i8 >= -64 { + err!(Some(1)) + } + } + 3 => { + match (first, next!()) { + (0xe0, 0xa0..=0xbf) | (0xe1..=0xef, 0x80..=0xbf) => {} /* INCLUDING surrogate codepoints here */ + _ => err!(Some(1)), + } + if next!() as i8 >= -64 { + err!(Some(2)) + } + } + 4 => { + match (first, next!()) { + (0xf0, 0x90..=0xbf) | (0xf1..=0xf3, 0x80..=0xbf) | (0xf4, 0x80..=0x8f) => {} + _ => err!(Some(1)), + } + if next!() as i8 >= -64 { + err!(Some(2)) + } + if next!() as i8 >= -64 { + err!(Some(3)) + } + } + _ => err!(Some(1)), + } + index += 1; + } else { + // Ascii case, try to skip forward quickly. + // When the pointer is aligned, read 2 words of data per iteration + // until we find a word containing a non-ascii byte. + if align != usize::MAX && align.wrapping_sub(index) % usize_bytes == 0 { + let ptr = v.as_ptr(); + while index < blocks_end { + // SAFETY: since `align - index` and `ascii_block_size` are + // multiples of `usize_bytes`, `block = ptr.add(index)` is + // always aligned with a `usize` so it's safe to dereference + // both `block` and `block.add(1)`. 
+ unsafe { + let block = ptr.add(index) as *const usize; + // break if there is a nonascii byte + let zu = contains_nonascii(*block); + let zv = contains_nonascii(*block.add(1)); + if zu || zv { + break; + } + } + index += ascii_block_size; + } + // step from the point where the wordwise loop stopped + while index < len && v[index] < 128 { + index += 1; + } + } else { + index += 1; + } + } + } + + Ok(()) +} + +#[inline(always)] +pub(crate) const fn run_utf8_full_validation_from_semi(v: &[u8]) -> Result<(), Utf8Error> { + // this function checks for surrogate codepoints, between \u{d800} to \u{dfff}, + // or ED A0 80 to ED BF BF of width 3 unicode chars. The valid range of width 3 + // characters is ED 80 80 to ED BF BF, so we need to check for an ED byte + // followed by a >=A0 byte. + let mut index = 0; + while index + 3 <= v.len() { + if v[index] == 0xed && v[index + 1] >= 0xa0 { + return Err(Utf8Error { + valid_up_to: index, + error_len: Some(1), + }); + } + index += 1; + } + + Ok(()) +} + +#[inline] +pub(crate) const fn utf8_char_width(first_byte: u8) -> usize { + const UTF8_CHAR_WIDTH: [u8; 256] = [ + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + ]; + + UTF8_CHAR_WIDTH[first_byte as usize] as _ +} + +#[inline] +const fn 
contains_nonascii(x: usize) -> bool { + const NONASCII_MASK: usize = usize::from_ne_bytes([0x80; std::mem::size_of::()]); + (x & NONASCII_MASK) != 0 +} + +#[cold] +#[track_caller] +pub(crate) fn slice_error_fail(s: &JavaStr, begin: usize, end: usize) -> ! { + const MAX_DISPLAY_LENGTH: usize = 256; + let trunc_len = s.floor_char_boundary(MAX_DISPLAY_LENGTH); + let s_trunc = &s[..trunc_len]; + let ellipsis = if trunc_len < s.len() { "[...]" } else { "" }; + + // 1. out of bounds + if begin > s.len() || end > s.len() { + let oob_index = if begin > s.len() { begin } else { end }; + panic!("byte index {oob_index} is out of bounds of `{s_trunc}`{ellipsis}"); + } + + // 2. begin <= end + assert!( + begin <= end, + "begin <= end ({} <= {}) when slicing `{}`{}", + begin, + end, + s_trunc, + ellipsis + ); + + // 3. character boundary + let index = if !s.is_char_boundary(begin) { + begin + } else { + end + }; + // find the character + let char_start = s.floor_char_boundary(index); + // `char_start` must be less than len and a char boundary + let ch = s[char_start..].chars().next().unwrap(); + let char_range = char_start..char_start + ch.len_utf8(); + panic!( + "byte index {} is not a char boundary; it is inside {:?} (bytes {:?}) of `{}`{}", + index, ch, char_range, s_trunc, ellipsis + ); +} + +#[cold] +#[track_caller] +pub(crate) fn str_end_index_len_fail(index: usize, len: usize) -> ! { + panic!("range end index {index} out of range for JavaStr of length {len}"); +} + +#[cold] +#[track_caller] +pub(crate) fn str_index_order_fail(index: usize, end: usize) -> ! { + panic!("JavaStr index starts at {index} but ends at {end}"); +} + +#[cold] +#[track_caller] +pub(crate) fn str_start_index_overflow_fail() -> ! { + panic!("attempted to index JavaStr from after maximum usize"); +} + +#[cold] +#[track_caller] +pub(crate) fn str_end_index_overflow_fail() -> ! 
{ + panic!("attempted to index JavaStr up to maximum usize") +} + +#[inline] +#[track_caller] +pub(crate) fn to_range_checked(range: R, bounds: RangeTo) -> Range +where + R: RangeBounds, +{ + let len = bounds.end; + + let start = range.start_bound(); + let start = match start { + Bound::Included(&start) => start, + Bound::Excluded(start) => start + .checked_add(1) + .unwrap_or_else(|| str_start_index_overflow_fail()), + Bound::Unbounded => 0, + }; + + let end: Bound<&usize> = range.end_bound(); + let end = match end { + Bound::Included(end) => end + .checked_add(1) + .unwrap_or_else(|| str_end_index_overflow_fail()), + Bound::Excluded(&end) => end, + Bound::Unbounded => len, + }; + + if start > end { + str_index_order_fail(start, end); + } + if end > len { + str_end_index_len_fail(end, len); + } + + Range { start, end } +} diff --git a/typos.toml b/typos.toml index 8e54543f5..c59189146 100644 --- a/typos.toml +++ b/typos.toml @@ -1,5 +1,5 @@ [files] -extend-exclude = ["*.svg", "*.json"] +extend-exclude = ["*.svg", "*.json", "crates/java_string/src/slice.rs"] [default] extend-ignore-re = ['\d+ths', 'CC BY-NC-ND']