From e9011d9ce7ef6e3e411f376cc4fdaebafccc5740 Mon Sep 17 00:00:00 2001 From: Joe Date: Thu, 7 Sep 2023 01:12:26 +0100 Subject: [PATCH 01/11] Add a JavaString implementation --- Cargo.toml | 1 + crates/valence_java_string/Cargo.toml | 16 + crates/valence_java_string/README.md | 17 + crates/valence_java_string/src/char.rs | 789 +++++++++ crates/valence_java_string/src/error.rs | 126 ++ crates/valence_java_string/src/iter.rs | 975 +++++++++++ crates/valence_java_string/src/lib.rs | 25 + crates/valence_java_string/src/owned.rs | 1200 +++++++++++++ crates/valence_java_string/src/pattern.rs | 402 +++++ crates/valence_java_string/src/serde.rs | 43 + crates/valence_java_string/src/slice.rs | 1488 +++++++++++++++++ crates/valence_java_string/src/validations.rs | 368 ++++ 12 files changed, 5450 insertions(+) create mode 100644 crates/valence_java_string/Cargo.toml create mode 100644 crates/valence_java_string/README.md create mode 100644 crates/valence_java_string/src/char.rs create mode 100644 crates/valence_java_string/src/error.rs create mode 100644 crates/valence_java_string/src/iter.rs create mode 100644 crates/valence_java_string/src/lib.rs create mode 100644 crates/valence_java_string/src/owned.rs create mode 100644 crates/valence_java_string/src/pattern.rs create mode 100644 crates/valence_java_string/src/serde.rs create mode 100644 crates/valence_java_string/src/slice.rs create mode 100644 crates/valence_java_string/src/validations.rs diff --git a/Cargo.toml b/Cargo.toml index cccade60f..a83d4a774 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -179,6 +179,7 @@ valence_generated = { path = "crates/valence_generated", version = "0.2.0-alpha. 
valence_ident = { path = "crates/valence_ident", version = "0.2.0-alpha.1" } valence_ident_macros = { path = "crates/valence_ident_macros", version = "0.2.0-alpha.1" } valence_inventory = { path = "crates/valence_inventory", version = "0.2.0-alpha.1" } +valence_java_string = { path = "crates/valence_java_string", version = "0.1.0" } valence_lang = { path = "crates/valence_lang", version = "0.2.0-alpha.1" } valence_math = { path = "crates/valence_math", version = "0.2.0-alpha.1" } valence_nbt = { path = "crates/valence_nbt", features = [ diff --git a/crates/valence_java_string/Cargo.toml b/crates/valence_java_string/Cargo.toml new file mode 100644 index 000000000..81c33551b --- /dev/null +++ b/crates/valence_java_string/Cargo.toml @@ -0,0 +1,16 @@ +[package] +name = "valence_java_string" +description = "An implementation of Java strings, tolerant of invalid UTF-16 encoding" +readme = "README.md" +version = "0.1.0" +keywords = ["java", "string", "utf16"] +edition.workspace = true +repository.workspace = true +documentation.workspace = true +license.workspace = true + +[features] +serde = ["dep:serde"] + +[dependencies] +serde = { workspace = true, optional = true } diff --git a/crates/valence_java_string/README.md b/crates/valence_java_string/README.md new file mode 100644 index 000000000..cff48a3b8 --- /dev/null +++ b/crates/valence_java_string/README.md @@ -0,0 +1,17 @@ +# valence_java_string + +An implementation of Java strings, tolerant of invalid UTF-16 encoding. +This allows for round-trip serialization of all Java strings, including those which contain invalid UTF-16, while still +being able to perform useful operations on those strings. + +These Java strings use the UTF-8 encoding, with the modification that surrogate code points (code points between U+D800 +and U+DFFF inclusive) are allowed. This allows for zero-cost conversion from Rust strings to Java strings. 
Similarly, +this crate introduces a `JavaCodePoint` type which is analogous to `char`, except that surrogate code points are +allowed. + +This crate is mostly undocumented, because most methods are entirely analogous to those of the same name in Rust's +strings. Please refer to the `std` documentation. + +# Features + +- `serde` Adds support for [`serde`](https://docs.rs/serde/latest/serde/) \ No newline at end of file diff --git a/crates/valence_java_string/src/char.rs b/crates/valence_java_string/src/char.rs new file mode 100644 index 000000000..13130aaf5 --- /dev/null +++ b/crates/valence_java_string/src/char.rs @@ -0,0 +1,789 @@ +use std::char::ParseCharError; +use std::cmp::Ordering; +use std::fmt; +use std::fmt::{Debug, Display, Formatter, Write}; +use std::hash::{Hash, Hasher}; +use std::iter::{once, FusedIterator, Once}; +use std::ops::Range; +use std::str::FromStr; + +use crate::validations::{TAG_CONT, TAG_FOUR_B, TAG_THREE_B, TAG_TWO_B}; + +// JavaCodePoint is guaranteed to have the same repr as a u32, with valid values +// of between 0 and 0x10FFFF, the same as a unicode code point. Surrogate code +// points are valid values of this type. 
+#[derive(Copy, Clone, PartialEq, Eq)] +#[repr(C)] +pub struct JavaCodePoint { + #[cfg(target_endian = "little")] + lower: u16, + upper: SeventeenValues, + #[cfg(target_endian = "big")] + lower: u16, +} + +#[repr(u16)] +#[derive(Copy, Clone, PartialEq, Eq)] +#[allow(unused)] +enum SeventeenValues { + V0, + V1, + V2, + V3, + V4, + V5, + V6, + V7, + V8, + V9, + V10, + V11, + V12, + V13, + V14, + V15, + V16, +} + +impl JavaCodePoint { + pub const MAX: JavaCodePoint = JavaCodePoint::from_char(char::MAX); + pub const REPLACEMENT_CHARACTER: JavaCodePoint = + JavaCodePoint::from_char(char::REPLACEMENT_CHARACTER); + + #[inline] + #[must_use] + pub const fn from_u32(i: u32) -> Option { + if i <= 0x10ffff { + unsafe { Some(Self::from_u32_unchecked(i)) } + } else { + None + } + } + + /// # Safety + /// The argument must be within the valid Unicode code point range of 0 to + /// 0x10FFFF inclusive. Surrogate code points are allowed. + #[inline] + #[must_use] + pub const unsafe fn from_u32_unchecked(i: u32) -> JavaCodePoint { + // SAFETY: the caller checks that the argument can be represented by this type + std::mem::transmute(i) + } + + #[inline] + #[must_use] + pub const fn from_char(char: char) -> JavaCodePoint { + unsafe { + // SAFETY: all chars are valid code points + JavaCodePoint::from_u32_unchecked(char as u32) + } + } + + #[inline] + #[must_use] + pub const fn as_u32(self) -> u32 { + unsafe { + // SAFETY: JavaCodePoint has the same repr as a u32 + let result = std::mem::transmute(self); + + if result > 0x10ffff { + // SAFETY: JavaCodePoint can never have a value > 0x10FFFF. + // This statement may allow the optimizer to remove branches in the calling code + // associated with out of bounds chars. + std::hint::unreachable_unchecked(); + } + + result + } + } + + #[inline] + #[must_use] + pub const fn as_char(self) -> Option { + char::from_u32(self.as_u32()) + } + + /// # Safety + /// The caller must ensure that this code point is not a surrogate code + /// point. 
+ #[inline] + #[must_use] + pub unsafe fn as_char_unchecked(self) -> char { + char::from_u32_unchecked(self.as_u32()) + } + + #[inline] + pub fn encode_utf16(self, dst: &mut [u16]) -> &mut [u16] { + if let Some(char) = self.as_char() { + char.encode_utf16(dst) + } else { + dst[0] = self.as_u32() as u16; + &mut dst[..1] + } + } + + #[inline] + pub fn encode_semi_utf8(self, dst: &mut [u8]) -> &mut [u8] { + let len = self.len_utf8(); + let code = self.as_u32(); + match (len, &mut dst[..]) { + (1, [a, ..]) => { + *a = code as u8; + } + (2, [a, b, ..]) => { + *a = (code >> 6 & 0x1f) as u8 | TAG_TWO_B; + *b = (code & 0x3f) as u8 | TAG_CONT; + } + (3, [a, b, c, ..]) => { + *a = (code >> 12 & 0x0f) as u8 | TAG_THREE_B; + *b = (code >> 6 & 0x3f) as u8 | TAG_CONT; + *c = (code & 0x3f) as u8 | TAG_CONT; + } + (4, [a, b, c, d, ..]) => { + *a = (code >> 18 & 0x07) as u8 | TAG_FOUR_B; + *b = (code >> 12 & 0x3f) as u8 | TAG_CONT; + *c = (code >> 6 & 0x3f) as u8 | TAG_CONT; + *d = (code & 0x3f) as u8 | TAG_CONT; + } + _ => panic!( + "encode_utf8: need {} bytes to encode U+{:X}, but the buffer has {}", + len, + code, + dst.len() + ), + } + &mut dst[..len] + } + + #[inline] + pub fn eq_ignore_ascii_case(&self, other: &JavaCodePoint) -> bool { + match (self.as_char(), other.as_char()) { + (Some(char1), Some(char2)) => char1.eq_ignore_ascii_case(&char2), + (None, None) => self == other, + _ => false, + } + } + + #[inline] + #[must_use] + pub fn escape_debug(self) -> CharEscapeIter { + self.escape_debug_ext(EscapeDebugExtArgs::ESCAPE_ALL) + } + + #[inline] + #[must_use] + pub(crate) fn escape_debug_ext(self, args: EscapeDebugExtArgs) -> CharEscapeIter { + const NULL: u32 = '\0' as u32; + const TAB: u32 = '\t' as u32; + const CARRIAGE_RETURN: u32 = '\r' as u32; + const LINE_FEED: u32 = '\n' as u32; + const SINGLE_QUOTE: u32 = '\'' as u32; + const DOUBLE_QUOTE: u32 = '"' as u32; + const BACKSLASH: u32 = '\\' as u32; + + unsafe { + // SAFETY: all characters specified are in ascii range + 
match self.as_u32() { + NULL => CharEscapeIter::new([b'\\', b'0']), + TAB => CharEscapeIter::new([b'\\', b't']), + CARRIAGE_RETURN => CharEscapeIter::new([b'\\', b'r']), + LINE_FEED => CharEscapeIter::new([b'\\', b'n']), + SINGLE_QUOTE if args.escape_single_quote => CharEscapeIter::new([b'\\', b'\'']), + DOUBLE_QUOTE if args.escape_double_quote => CharEscapeIter::new([b'\\', b'"']), + BACKSLASH => CharEscapeIter::new([b'\\', b'\\']), + _ if self.is_printable() => { + // SAFETY: surrogate code points are not printable + CharEscapeIter::printable(self.as_char_unchecked()) + } + _ => self.escape_unicode(), + } + } + } + + #[inline] + fn is_printable(self) -> bool { + let Some(char) = self.as_char() else { + return false; + }; + if matches!(char, '\\' | '\'' | '"') { + return true; + } + char.escape_debug().next() != Some('\\') + } + + #[inline] + #[must_use] + pub fn escape_default(self) -> CharEscapeIter { + const TAB: u32 = '\t' as u32; + const CARRIAGE_RETURN: u32 = '\r' as u32; + const LINE_FEED: u32 = '\n' as u32; + const SINGLE_QUOTE: u32 = '\'' as u32; + const DOUBLE_QUOTE: u32 = '"' as u32; + const BACKSLASH: u32 = '\\' as u32; + + unsafe { + // SAFETY: all characters specified are in ascii range + match self.as_u32() { + TAB => CharEscapeIter::new([b'\\', b't']), + CARRIAGE_RETURN => CharEscapeIter::new([b'\\', b'r']), + LINE_FEED => CharEscapeIter::new([b'\\', b'n']), + SINGLE_QUOTE => CharEscapeIter::new([b'\\', b'\'']), + DOUBLE_QUOTE => CharEscapeIter::new([b'\\', b'"']), + BACKSLASH => CharEscapeIter::new([b'\\', b'\\']), + 0x20..=0x7e => CharEscapeIter::new([self.as_u32() as u8]), + _ => self.escape_unicode(), + } + } + } + + #[inline] + #[must_use] + pub fn escape_unicode(self) -> CharEscapeIter { + let x = self.as_u32(); + + let mut arr = [0; 10]; + arr[0] = b'\\'; + arr[1] = b'u'; + arr[2] = b'{'; + + let number_len = if x == 0 { + 1 + } else { + ((x.ilog2() >> 2) + 1) as usize + }; + arr[3 + number_len] = b'}'; + for hexit in 0..number_len { + arr[2 
+ number_len - hexit] = b"0123456789abcdef"[((x >> (hexit << 2)) & 15) as usize]; + } + + CharEscapeIter { + inner: EscapeIterInner::Escaped(EscapeIterEscaped { + bytes: arr, + range: 0..number_len + 4, + }), + } + } + + #[inline] + #[must_use] + pub fn is_alphabetic(self) -> bool { + self.as_char().is_some_and(|char| char.is_alphabetic()) + } + + #[inline] + #[must_use] + pub fn is_alphanumeric(self) -> bool { + self.as_char().is_some_and(|char| char.is_alphanumeric()) + } + + #[inline] + #[must_use] + pub fn is_ascii(self) -> bool { + self.as_u32() <= 0x7f + } + + #[inline] + #[must_use] + pub const fn is_ascii_alphabetic(self) -> bool { + self.is_ascii_lowercase() || self.is_ascii_uppercase() + } + + #[inline] + #[must_use] + pub const fn is_ascii_alphanumeric(self) -> bool { + self.is_ascii_alphabetic() || self.is_ascii_digit() + } + + #[inline] + #[must_use] + pub const fn is_ascii_control(self) -> bool { + matches!(self.as_u32(), 0..=0x1f | 0x7f) + } + + #[inline] + #[must_use] + pub const fn is_ascii_digit(self) -> bool { + const ZERO: u32 = '0' as u32; + const NINE: u32 = '9' as u32; + matches!(self.as_u32(), ZERO..=NINE) + } + + #[inline] + #[must_use] + pub const fn is_ascii_graphic(self) -> bool { + matches!(self.as_u32(), 0x21..=0x7e) + } + + #[inline] + #[must_use] + pub const fn is_ascii_hexdigit(self) -> bool { + const LOWER_A: u32 = 'a' as u32; + const LOWER_F: u32 = 'f' as u32; + const UPPER_A: u32 = 'A' as u32; + const UPPER_F: u32 = 'F' as u32; + self.is_ascii_digit() || matches!(self.as_u32(), (LOWER_A..=LOWER_F) | (UPPER_A..=UPPER_F)) + } + + #[inline] + #[must_use] + pub const fn is_ascii_lowercase(self) -> bool { + const A: u32 = 'a' as u32; + const Z: u32 = 'z' as u32; + matches!(self.as_u32(), A..=Z) + } + + #[inline] + #[must_use] + pub const fn is_ascii_octdigit(self) -> bool { + const ZERO: u32 = '0' as u32; + const SEVEN: u32 = '7' as u32; + matches!(self.as_u32(), ZERO..=SEVEN) + } + + #[inline] + #[must_use] + pub const fn 
is_ascii_punctuation(self) -> bool { + matches!( + self.as_u32(), + (0x21..=0x2f) | (0x3a..=0x40) | (0x5b..=0x60) | (0x7b..=0x7e) + ) + } + + #[inline] + #[must_use] + pub const fn is_ascii_uppercase(self) -> bool { + const A: u32 = 'A' as u32; + const Z: u32 = 'Z' as u32; + matches!(self.as_u32(), A..=Z) + } + + #[inline] + #[must_use] + pub const fn is_ascii_whitespace(self) -> bool { + const SPACE: u32 = ' ' as u32; + const HORIZONTAL_TAB: u32 = '\t' as u32; + const LINE_FEED: u32 = '\n' as u32; + const FORM_FEED: u32 = 0xc; + const CARRIAGE_RETURN: u32 = '\r' as u32; + matches!( + self.as_u32(), + SPACE | HORIZONTAL_TAB | LINE_FEED | FORM_FEED | CARRIAGE_RETURN + ) + } + + #[inline] + #[must_use] + pub fn is_control(self) -> bool { + self.as_char().is_some_and(|char| char.is_control()) + } + + #[inline] + #[must_use] + pub fn is_digit(self, radix: u32) -> bool { + self.to_digit(radix).is_some() + } + + #[inline] + #[must_use] + pub fn is_lowercase(self) -> bool { + self.as_char().is_some_and(|char| char.is_lowercase()) + } + + #[inline] + #[must_use] + pub fn is_numeric(self) -> bool { + self.as_char().is_some_and(|char| char.is_numeric()) + } + + #[inline] + #[must_use] + pub fn is_uppercase(self) -> bool { + self.as_char().is_some_and(|char| char.is_uppercase()) + } + + #[inline] + #[must_use] + pub fn is_whitespace(self) -> bool { + self.as_char().is_some_and(|char| char.is_whitespace()) + } + + #[inline] + #[must_use] + pub const fn len_utf16(self) -> usize { + if let Some(char) = self.as_char() { + char.len_utf16() + } else { + 1 // invalid code points are encoded as 1 utf16 code point anyway + } + } + + #[inline] + #[must_use] + pub const fn len_utf8(self) -> usize { + if let Some(char) = self.as_char() { + char.len_utf8() + } else { + 3 // invalid code points are all length 3 in semi-valid utf8 + } + } + + #[inline] + pub fn make_ascii_lowercase(&mut self) { + *self = self.to_ascii_lowercase(); + } + + #[inline] + pub fn make_ascii_uppercase(&mut self) { 
+ *self = self.to_ascii_uppercase(); + } + + #[inline] + #[must_use] + pub const fn to_ascii_lowercase(self) -> JavaCodePoint { + if self.is_ascii_uppercase() { + unsafe { + // SAFETY: all lowercase chars are valid chars + Self::from_u32_unchecked(self.as_u32() + 32) + } + } else { + self + } + } + + #[inline] + #[must_use] + pub const fn to_ascii_uppercase(self) -> JavaCodePoint { + if self.is_ascii_lowercase() { + unsafe { + // SAFETY: all uppercase chars are valid chars + Self::from_u32_unchecked(self.as_u32() - 32) + } + } else { + self + } + } + + #[inline] + #[must_use] + pub const fn to_digit(self, radix: u32) -> Option { + if let Some(char) = self.as_char() { + char.to_digit(radix) + } else { + None + } + } + + #[inline] + #[must_use] + pub fn to_lowercase(self) -> ToLowercase { + match self.as_char() { + Some(char) => ToLowercase::char(char.to_lowercase()), + None => ToLowercase::invalid(self), + } + } + + #[inline] + #[must_use] + pub fn to_uppercase(self) -> ToUppercase { + match self.as_char() { + Some(char) => ToUppercase::char(char.to_uppercase()), + None => ToUppercase::invalid(self), + } + } +} + +impl Debug for JavaCodePoint { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + f.write_char('\'')?; + for c in self.escape_debug_ext(EscapeDebugExtArgs { + escape_single_quote: true, + escape_double_quote: false, + }) { + f.write_char(c)?; + } + f.write_char('\'') + } +} + +impl Default for JavaCodePoint { + #[inline] + fn default() -> Self { + JavaCodePoint::from_char('\0') + } +} + +impl Display for JavaCodePoint { + #[inline] + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + Display::fmt(&self.as_char().unwrap_or(char::REPLACEMENT_CHARACTER), f) + } +} + +impl From for u32 { + #[inline] + fn from(value: JavaCodePoint) -> Self { + value.as_u32() + } +} + +impl From for JavaCodePoint { + #[inline] + fn from(value: u8) -> Self { + JavaCodePoint::from_char(char::from(value)) + } +} + +impl FromStr for JavaCodePoint { + type Err = 
ParseCharError; + + #[inline] + fn from_str(s: &str) -> Result { + char::from_str(s).map(JavaCodePoint::from_char) + } +} + +impl Hash for JavaCodePoint { + #[inline] + fn hash(&self, state: &mut H) { + self.as_u32().hash(state) + } +} + +impl Ord for JavaCodePoint { + #[inline] + fn cmp(&self, other: &Self) -> Ordering { + self.as_u32().cmp(&other.as_u32()) + } +} + +impl PartialOrd for JavaCodePoint { + #[inline] + fn partial_cmp(&self, other: &Self) -> Option { + self.as_u32().partial_cmp(&other.as_u32()) + } +} + +impl PartialOrd for JavaCodePoint { + #[inline] + fn partial_cmp(&self, other: &char) -> Option { + self.partial_cmp(&JavaCodePoint::from_char(*other)) + } +} + +impl PartialOrd for char { + #[inline] + fn partial_cmp(&self, other: &JavaCodePoint) -> Option { + JavaCodePoint::from_char(*self).partial_cmp(other) + } +} + +impl PartialEq for JavaCodePoint { + #[inline] + fn eq(&self, other: &char) -> bool { + self == &JavaCodePoint::from_char(*other) + } +} + +impl PartialEq for char { + #[inline] + fn eq(&self, other: &JavaCodePoint) -> bool { + &JavaCodePoint::from_char(*self) == other + } +} + +pub(crate) struct EscapeDebugExtArgs { + pub(crate) escape_single_quote: bool, + pub(crate) escape_double_quote: bool, +} + +impl EscapeDebugExtArgs { + pub(crate) const ESCAPE_ALL: Self = Self { + escape_single_quote: true, + escape_double_quote: true, + }; +} + +#[derive(Clone, Debug)] +pub struct CharEscapeIter { + inner: EscapeIterInner, +} + +#[derive(Clone, Debug)] +enum EscapeIterInner { + Printable(Once), + Escaped(EscapeIterEscaped), +} + +impl CharEscapeIter { + #[inline] + fn printable(char: char) -> Self { + CharEscapeIter { + inner: EscapeIterInner::Printable(once(char)), + } + } + + /// # Safety + /// Assumes that the input byte array is ASCII + #[inline] + unsafe fn new(bytes: [u8; N]) -> Self { + assert!(N <= 10, "Too many bytes in escape iter"); + let mut ten_bytes = [0; 10]; + ten_bytes[..N].copy_from_slice(&bytes); + CharEscapeIter { + 
inner: EscapeIterInner::Escaped(EscapeIterEscaped { + bytes: ten_bytes, + range: 0..N, + }), + } + } +} + +impl Iterator for CharEscapeIter { + type Item = char; + + #[inline] + fn next(&mut self) -> Option { + match &mut self.inner { + EscapeIterInner::Printable(printable) => printable.next(), + EscapeIterInner::Escaped(escaped) => escaped.next(), + } + } + + #[inline] + fn size_hint(&self) -> (usize, Option) { + match &self.inner { + EscapeIterInner::Printable(printable) => printable.size_hint(), + EscapeIterInner::Escaped(escaped) => escaped.size_hint(), + } + } +} + +impl ExactSizeIterator for CharEscapeIter { + #[inline] + fn len(&self) -> usize { + match &self.inner { + EscapeIterInner::Printable(printable) => printable.len(), + EscapeIterInner::Escaped(escaped) => escaped.len(), + } + } +} + +impl FusedIterator for CharEscapeIter {} + +#[derive(Clone, Debug)] +struct EscapeIterEscaped { + // SAFETY: all values must be in the ASCII range + bytes: [u8; 10], + // SAFETY: range must not be out of bounds for length 10 + range: Range, +} + +impl Iterator for EscapeIterEscaped { + type Item = char; + + #[inline] + fn next(&mut self) -> Option { + self.range.next().map(|index| unsafe { + // SAFETY: the range is never out of bounds for length 10 + char::from(*self.bytes.get_unchecked(index)) + }) + } + + #[inline] + fn size_hint(&self) -> (usize, Option) { + self.range.size_hint() + } + + #[inline] + fn count(self) -> usize { + self.range.len() + } +} + +impl ExactSizeIterator for EscapeIterEscaped { + #[inline] + fn len(&self) -> usize { + self.range.len() + } +} + +impl FusedIterator for EscapeIterEscaped {} + +impl fmt::Display for EscapeIterEscaped { + #[inline] + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let str = unsafe { + // SAFETY: all bytes are in ASCII range, and range is in bounds for length 10 + std::str::from_utf8_unchecked(self.bytes.get_unchecked(self.range.clone())) + }; + f.write_str(str) + } +} + +pub type ToLowercase = 
CharIterDelegate; +pub type ToUppercase = CharIterDelegate; + +#[derive(Debug, Clone)] +pub struct CharIterDelegate(CharIterDelegateInner); + +impl CharIterDelegate { + #[inline] + fn char(iter: I) -> CharIterDelegate { + CharIterDelegate(CharIterDelegateInner::Char(iter)) + } + + #[inline] + fn invalid(code_point: JavaCodePoint) -> CharIterDelegate { + CharIterDelegate(CharIterDelegateInner::Invalid(Some(code_point).into_iter())) + } +} + +#[derive(Debug, Clone)] +enum CharIterDelegateInner { + Char(I), + Invalid(std::option::IntoIter), +} + +impl Iterator for CharIterDelegate +where + I: Iterator, +{ + type Item = JavaCodePoint; + + #[inline] + fn next(&mut self) -> Option { + match &mut self.0 { + CharIterDelegateInner::Char(char_iter) => { + char_iter.next().map(JavaCodePoint::from_char) + } + CharIterDelegateInner::Invalid(code_point) => code_point.next(), + } + } + + #[inline] + fn size_hint(&self) -> (usize, Option) { + match &self.0 { + CharIterDelegateInner::Char(char_iter) => char_iter.size_hint(), + CharIterDelegateInner::Invalid(code_point) => code_point.size_hint(), + } + } +} + +impl DoubleEndedIterator for CharIterDelegate +where + I: Iterator + DoubleEndedIterator, +{ + #[inline] + fn next_back(&mut self) -> Option { + match &mut self.0 { + CharIterDelegateInner::Char(char_iter) => { + char_iter.next_back().map(JavaCodePoint::from_char) + } + CharIterDelegateInner::Invalid(code_point) => code_point.next_back(), + } + } +} + +impl ExactSizeIterator for CharIterDelegate where I: Iterator + ExactSizeIterator {} + +impl FusedIterator for CharIterDelegate where I: Iterator + FusedIterator {} diff --git a/crates/valence_java_string/src/error.rs b/crates/valence_java_string/src/error.rs new file mode 100644 index 000000000..09742d014 --- /dev/null +++ b/crates/valence_java_string/src/error.rs @@ -0,0 +1,126 @@ +use std::error::Error; +use std::fmt; +use std::fmt::{Display, Formatter}; + +#[derive(Copy, Eq, PartialEq, Clone, Debug)] +pub struct Utf8Error { 
+ pub(crate) valid_up_to: usize, + pub(crate) error_len: Option, +} + +impl Utf8Error { + #[must_use] + #[inline] + pub const fn valid_up_to(&self) -> usize { + self.valid_up_to + } + + #[must_use] + #[inline] + pub const fn error_len(&self) -> Option { + // Manual implementation of Option::map since it's not const + match self.error_len { + Some(len) => Some(len as usize), + None => None, + } + } + + #[must_use] + #[inline] + pub(crate) const fn from_std(value: std::str::Utf8Error) -> Self { + Self { + valid_up_to: value.valid_up_to(), + // Manual implementation of Option::map since it's not const + error_len: match value.error_len() { + Some(error_len) => Some(error_len as u8), + None => None, + }, + } + } +} + +impl Display for Utf8Error { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + if let Some(error_len) = self.error_len { + write!( + f, + "invalid utf-8 sequence of {} bytes from index {}", + error_len, self.valid_up_to + ) + } else { + write!( + f, + "incomplete utf-8 byte sequence from index {}", + self.valid_up_to + ) + } + } +} + +impl From for Utf8Error { + #[inline] + fn from(value: std::str::Utf8Error) -> Self { + Self::from_std(value) + } +} + +impl Error for Utf8Error {} + +#[derive(Clone, Debug, PartialEq, Eq)] +pub struct FromUtf8Error { + pub(crate) bytes: Vec, + pub(crate) error: Utf8Error, +} + +impl FromUtf8Error { + pub fn as_bytes(&self) -> &[u8] { + &self.bytes[..] 
+ } + + #[must_use] + pub fn into_bytes(self) -> Vec { + self.bytes + } + + pub fn utf8_error(&self) -> Utf8Error { + self.error + } +} + +impl Display for FromUtf8Error { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + Display::fmt(&self.error, f) + } +} + +impl Error for FromUtf8Error {} + +#[derive(Copy, Eq, PartialEq, Clone, Debug)] +pub enum ParseError { + InvalidUtf8(Utf8Error), + Err(E), +} + +impl Display for ParseError +where + E: Display, +{ + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + match self { + ParseError::InvalidUtf8(err) => Display::fmt(err, f), + ParseError::Err(err) => Display::fmt(err, f), + } + } +} + +impl Error for ParseError +where + E: Error + 'static, +{ + fn source(&self) -> Option<&(dyn Error + 'static)> { + match self { + ParseError::InvalidUtf8(err) => Some(err), + ParseError::Err(err) => Some(err), + } + } +} diff --git a/crates/valence_java_string/src/iter.rs b/crates/valence_java_string/src/iter.rs new file mode 100644 index 000000000..cebd1993e --- /dev/null +++ b/crates/valence_java_string/src/iter.rs @@ -0,0 +1,975 @@ +use std::fmt::{Debug, Display, Formatter, Write}; +use std::iter::{Chain, Copied, Filter, FlatMap, Flatten, FusedIterator, Map}; +use std::{option, slice}; + +use crate::validations::{next_code_point, next_code_point_reverse}; +use crate::{CharEscapeIter, JavaCodePoint, JavaStr, JavaStrPattern}; +macro_rules! delegate { + (Iterator for $ty:ident $(<$($lt:lifetime),+>)? => $item:ty $(, DoubleEnded = $double_ended:ty)?) => { + impl$(<$($lt),+>)? Iterator for $ty$(<$($lt),+>)? 
{ + type Item = $item; + + #[inline] + fn next(&mut self) -> Option { + self.inner.next() + } + + #[inline] + fn size_hint(&self) -> (usize, Option) { + self.inner.size_hint() + } + + #[inline] + fn count(self) -> usize { + self.inner.count() + } + + #[inline] + fn last(self) -> Option { + self.inner.last() + } + + #[inline] + fn nth(&mut self, n: usize) -> Option { + self.inner.nth(n) + } + + #[inline] + fn all(&mut self, f: F) -> bool + where + F: FnMut(Self::Item) -> bool, + { + self.inner.all(f) + } + + #[inline] + fn any(&mut self, f: F) -> bool + where + F: FnMut(Self::Item) -> bool, + { + self.inner.any(f) + } + + #[inline] + fn find

(&mut self, predicate: P) -> Option + where + P: FnMut(&Self::Item) -> bool, + { + self.inner.find(predicate) + } + + #[inline] + fn position

(&mut self, predicate: P) -> Option + where + P: FnMut(Self::Item) -> bool, + { + self.inner.position(predicate) + } + + $( + #[inline] + fn rposition

(&mut self, predicate: P) -> Option + where + P: FnMut(Self::Item) -> bool, + { + let _test: $double_ended = (); + self.inner.rposition(predicate) + } + )? + } + }; + + (DoubleEndedIterator for $ty:ident $(<$($lt:lifetime),+>)?) => { + impl$(<$($lt),+>)? DoubleEndedIterator for $ty$(<$($lt),+>)? { + #[inline] + fn next_back(&mut self) -> Option { + self.inner.next_back() + } + + #[inline] + fn nth_back(&mut self, n: usize) -> Option { + self.inner.nth_back(n) + } + + #[inline] + fn rfind

(&mut self, predicate: P) -> Option + where + P: FnMut(&Self::Item) -> bool, + { + self.inner.rfind(predicate) + } + } + }; + + (ExactSizeIterator for $ty:ident $(<$($lt:lifetime),+>)?) => { + impl$(<$($lt),+>)? ExactSizeIterator for $ty$(<$($lt),+>)? { + #[inline] + fn len(&self) -> usize { + self.inner.len() + } + } + }; + + (FusedIterator for $ty:ident $(<$($lt:lifetime),+>)?) => { + impl$(<$($lt),+>)? FusedIterator for $ty$(<$($lt),+>)? {} + }; + + (Iterator, DoubleEndedIterator, ExactSizeIterator, FusedIterator for $ty:ident $(<$($lt:lifetime),+>)? => $item:ty) => { + delegate!(Iterator for $ty$(<$($lt),+>)? => $item, DoubleEnded = ()); + delegate!(DoubleEndedIterator for $ty$(<$($lt),+>)?); + delegate!(ExactSizeIterator for $ty$(<$($lt),+>)?); + delegate!(FusedIterator for $ty$(<$($lt),+>)?); + }; +} + +#[must_use] +#[derive(Clone, Debug)] +pub struct Bytes<'a> { + pub(crate) inner: Copied>, +} +delegate!(Iterator, DoubleEndedIterator, ExactSizeIterator, FusedIterator for Bytes<'a> => u8); + +#[derive(Clone, Debug)] +#[must_use] +pub struct EscapeDebug<'a> { + #[allow(clippy::type_complexity)] + pub(crate) inner: Chain< + Flatten>, + FlatMap, CharEscapeIter, fn(JavaCodePoint) -> CharEscapeIter>, + >, +} +delegate!(Iterator for EscapeDebug<'a> => char); +delegate!(FusedIterator for EscapeDebug<'a>); +impl<'a> Display for EscapeDebug<'a> { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + self.clone().try_for_each(|c| f.write_char(c)) + } +} + +#[derive(Clone, Debug)] +#[must_use] +pub struct EscapeDefault<'a> { + pub(crate) inner: FlatMap, CharEscapeIter, fn(JavaCodePoint) -> CharEscapeIter>, +} +delegate!(Iterator for EscapeDefault<'a> => char); +delegate!(FusedIterator for EscapeDefault<'a>); +impl<'a> Display for EscapeDefault<'a> { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + self.clone().try_for_each(|c| f.write_char(c)) + } +} + +#[derive(Clone, Debug)] +#[must_use] +pub struct EscapeUnicode<'a> { + pub(crate) inner: 
FlatMap, CharEscapeIter, fn(JavaCodePoint) -> CharEscapeIter>, +} +delegate!(Iterator for EscapeUnicode<'a> => char); +delegate!(FusedIterator for EscapeUnicode<'a>); +impl<'a> Display for EscapeUnicode<'a> { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + self.clone().try_for_each(|c| f.write_char(c)) + } +} + +#[derive(Clone, Debug)] +#[must_use] +pub struct Lines<'a> { + pub(crate) inner: Map, fn(&JavaStr) -> &JavaStr>, +} +delegate!(Iterator for Lines<'a> => &'a JavaStr); +delegate!(DoubleEndedIterator for Lines<'a>); +delegate!(FusedIterator for Lines<'a>); + +#[derive(Clone)] +#[must_use] +pub struct Chars<'a> { + pub(crate) inner: slice::Iter<'a, u8>, +} + +impl<'a> Iterator for Chars<'a> { + type Item = JavaCodePoint; + + #[inline] + fn next(&mut self) -> Option { + // SAFETY: `JavaStr` invariant says `self.iter` is a semi-valid UTF-8 string and + // the resulting `ch` is a valid Unicode Scalar Value or surrogate code point. + unsafe { next_code_point(&mut self.inner).map(|ch| JavaCodePoint::from_u32_unchecked(ch)) } + } + + // TODO: std has an optimized count impl + + #[inline] + fn size_hint(&self) -> (usize, Option) { + let len = self.inner.len(); + // `(len + 3)` can't overflow, because we know that the `slice::Iter` + // belongs to a slice in memory which has a maximum length of + // `isize::MAX` (that's well below `usize::MAX`). + ((len + 3) / 4, Some(len)) + } + + #[inline] + fn last(mut self) -> Option { + // No need to go through the entire string. + self.next_back() + } +} + +impl Debug for Chars<'_> { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!(f, "Chars(")?; + f.debug_list().entries(self.clone()).finish()?; + write!(f, ")")?; + Ok(()) + } +} + +impl<'a> DoubleEndedIterator for Chars<'a> { + #[inline] + fn next_back(&mut self) -> Option { + // SAFETY: `JavaStr` invariant says `self.iter` is a semi-valid UTF-8 string and + // the resulting `ch` is a valid Unicode Scalar Value or surrogate code point. 
+ unsafe { + next_code_point_reverse(&mut self.inner).map(|ch| JavaCodePoint::from_u32_unchecked(ch)) + } + } +} + +impl FusedIterator for Chars<'_> {} + +impl<'a> Chars<'a> { + #[inline] + #[must_use] + pub fn as_str(&self) -> &'a JavaStr { + // SAFETY: `Chars` is only made from a JavaStr, which guarantees the iter is + // semi-valid UTF-8. + unsafe { JavaStr::from_semi_utf8_unchecked(self.inner.as_slice()) } + } +} + +#[derive(Clone, Debug)] +#[must_use] +pub struct CharIndices<'a> { + pub(crate) front_offset: usize, + pub(crate) inner: Chars<'a>, +} + +impl<'a> Iterator for CharIndices<'a> { + type Item = (usize, JavaCodePoint); + + #[inline] + fn next(&mut self) -> Option<(usize, JavaCodePoint)> { + let pre_len = self.inner.inner.len(); + match self.inner.next() { + None => None, + Some(ch) => { + let index = self.front_offset; + let len = self.inner.inner.len(); + self.front_offset += pre_len - len; + Some((index, ch)) + } + } + } + + #[inline] + fn count(self) -> usize { + self.inner.count() + } + + #[inline] + fn size_hint(&self) -> (usize, Option) { + self.inner.size_hint() + } + + #[inline] + fn last(mut self) -> Option<(usize, JavaCodePoint)> { + // No need to go through the entire string. 
+ self.next_back() + } +} + +impl<'a> DoubleEndedIterator for CharIndices<'a> { + #[inline] + fn next_back(&mut self) -> Option<(usize, JavaCodePoint)> { + self.inner.next_back().map(|ch| { + let index = self.front_offset + self.inner.inner.len(); + (index, ch) + }) + } +} + +impl FusedIterator for CharIndices<'_> {} + +impl<'a> CharIndices<'a> { + #[inline] + #[must_use] + pub fn as_str(&self) -> &'a JavaStr { + self.inner.as_str() + } +} + +#[must_use] +#[derive(Debug, Clone)] +pub struct Matches<'a, P> { + pub(crate) str: &'a JavaStr, + pub(crate) pat: P, +} + +impl<'a, P> Iterator for Matches<'a, P> +where + P: JavaStrPattern, +{ + type Item = &'a JavaStr; + + #[inline] + fn next(&mut self) -> Option { + if let Some((index, len)) = self.pat.find_in(self.str) { + // SAFETY: pattern returns valid indices + let ret = unsafe { self.str.get_unchecked(index..index + len) }; + self.str = unsafe { self.str.get_unchecked(index + len..) }; + Some(ret) + } else { + self.str = Default::default(); + None + } + } +} + +impl<'a, P> DoubleEndedIterator for Matches<'a, P> +where + P: JavaStrPattern, +{ + #[inline] + fn next_back(&mut self) -> Option { + if let Some((index, len)) = self.pat.rfind_in(self.str) { + // SAFETY: pattern returns valid indices + let ret = unsafe { self.str.get_unchecked(index..index + len) }; + self.str = unsafe { self.str.get_unchecked(..index) }; + Some(ret) + } else { + self.str = Default::default(); + None + } + } +} + +#[must_use] +#[derive(Clone, Debug)] +pub struct RMatches<'a, P> { + pub(crate) inner: Matches<'a, P>, +} + +impl<'a, P> Iterator for RMatches<'a, P> +where + P: JavaStrPattern, +{ + type Item = &'a JavaStr; + + #[inline] + fn next(&mut self) -> Option { + self.inner.next_back() + } +} + +impl<'a, P> DoubleEndedIterator for RMatches<'a, P> +where + P: JavaStrPattern, +{ + #[inline] + fn next_back(&mut self) -> Option { + self.inner.next() + } +} + +#[must_use] +#[derive(Clone, Debug)] +pub struct MatchIndices<'a, P> { + pub(crate) 
str: &'a JavaStr, + pub(crate) start: usize, + pub(crate) pat: P, +} + +impl<'a, P> Iterator for MatchIndices<'a, P> +where + P: JavaStrPattern, +{ + type Item = (usize, &'a JavaStr); + + #[inline] + fn next(&mut self) -> Option { + if let Some((index, len)) = self.pat.find_in(self.str) { + let full_index = self.start + index; + self.start = full_index + len; + // SAFETY: pattern returns valid indices + let ret = unsafe { self.str.get_unchecked(index..index + len) }; + self.str = unsafe { self.str.get_unchecked(index + len..) }; + Some((full_index, ret)) + } else { + self.start += self.str.len(); + self.str = Default::default(); + None + } + } +} + +impl<'a, P> DoubleEndedIterator for MatchIndices<'a, P> +where + P: JavaStrPattern, +{ + #[inline] + fn next_back(&mut self) -> Option { + if let Some((index, len)) = self.pat.rfind_in(self.str) { + // SAFETY: pattern returns valid indices + let ret = unsafe { self.str.get_unchecked(index..index + len) }; + self.str = unsafe { self.str.get_unchecked(..index) }; + Some((self.start + index, ret)) + } else { + self.str = Default::default(); + None + } + } +} + +#[derive(Clone, Debug)] +pub struct RMatchIndices<'a, P> { + pub(crate) inner: MatchIndices<'a, P>, +} + +impl<'a, P> Iterator for RMatchIndices<'a, P> +where + P: JavaStrPattern, +{ + type Item = (usize, &'a JavaStr); + + #[inline] + fn next(&mut self) -> Option { + self.inner.next_back() + } +} + +impl<'a, P> DoubleEndedIterator for RMatchIndices<'a, P> +where + P: JavaStrPattern, +{ + #[inline] + fn next_back(&mut self) -> Option { + self.inner.next() + } +} + +#[derive(Clone, Debug)] +struct SplitHelper<'a, P> { + start: usize, + end: usize, + haystack: &'a JavaStr, + pat: P, + allow_trailing_empty: bool, + finished: bool, + had_empty_match: bool, +} + +impl<'a, P> SplitHelper<'a, P> +where + P: JavaStrPattern, +{ + #[inline] + fn new(haystack: &'a JavaStr, pat: P, allow_trailing_empty: bool) -> Self { + Self { + start: 0, + end: haystack.len(), + haystack, + 
pat, + allow_trailing_empty, + finished: false, + had_empty_match: false, + } + } + + #[inline] + fn get_end(&mut self) -> Option<&'a JavaStr> { + if !self.finished { + self.finished = true; + + if self.allow_trailing_empty || self.end - self.start > 0 { + // SAFETY: `self.start` and `self.end` always lie on unicode boundaries. + let string = unsafe { self.haystack.get_unchecked(self.start..self.end) }; + return Some(string); + } + } + + None + } + + #[inline] + fn next_match(&mut self) -> Option<(usize, usize)> { + // SAFETY: `self.start` always lies on a unicode boundary. + let substr = unsafe { self.haystack.get_unchecked(self.start..) }; + + let result = if self.had_empty_match { + // if we had an empty match before, we are going to find the empty match again. + // don't do that, search from the next index along. + + if substr.is_empty() { + None + } else { + // SAFETY: we can pop the string because we already checked if the string is + // empty above + let first_char_len = unsafe { substr.chars().next().unwrap_unchecked().len_utf8() }; + let popped_str = unsafe { substr.get_unchecked(first_char_len..) 
}; + + self.pat + .find_in(popped_str) + .map(|(index, len)| (index + first_char_len + self.start, len)) + } + } else { + self.pat + .find_in(substr) + .map(|(index, len)| (index + self.start, len)) + }; + + self.had_empty_match = result.is_some_and(|(_, len)| len == 0); + + result + } + + #[inline] + fn next(&mut self) -> Option<&'a JavaStr> { + if self.finished { + return None; + } + + match self.next_match() { + Some((index, len)) => unsafe { + // SAFETY: pattern guarantees valid indices + let elt = self.haystack.get_unchecked(self.start..index); + self.start = index + len; + Some(elt) + }, + None => self.get_end(), + } + } + + #[inline] + fn next_inclusive(&mut self) -> Option<&'a JavaStr> { + if self.finished { + return None; + } + + match self.next_match() { + Some((index, len)) => unsafe { + // SAFETY: pattern guarantees valid indices + let elt = self.haystack.get_unchecked(self.start..index + len); + self.start = index + len; + Some(elt) + }, + None => self.get_end(), + } + } + + #[inline] + fn next_match_back(&mut self) -> Option<(usize, usize)> { + // SAFETY: `self.end` always lies on a unicode boundary. + let substr = unsafe { self.haystack.get_unchecked(..self.end) }; + + let result = if self.had_empty_match { + // if we had an empty match before, we are going to find the empty match again. + // don't do that, search from the next index along. 
+ + if substr.is_empty() { + None + } else { + // SAFETY: we can pop the string because we already checked if the string is + // empty above + let last_char_len = + unsafe { substr.chars().next_back().unwrap_unchecked().len_utf8() }; + let popped_str = unsafe { substr.get_unchecked(..substr.len() - last_char_len) }; + + self.pat.find_in(popped_str) + } + } else { + self.pat.find_in(substr) + }; + + self.had_empty_match = result.is_some_and(|(_, len)| len == 0); + + result + } + + #[inline] + fn next_back(&mut self) -> Option<&'a JavaStr> { + if self.finished { + return None; + } + + if !self.allow_trailing_empty { + self.allow_trailing_empty = true; + match self.next_back() { + Some(elt) if !elt.is_empty() => return Some(elt), + _ => { + if self.finished { + return None; + } + } + } + } + + match self.next_match_back() { + Some((index, len)) => unsafe { + // SAFETY: pattern guarantees valid indices + let elt = self.haystack.get_unchecked(index + len..self.end); + self.end = index; + Some(elt) + }, + None => unsafe { + // SAFETY: `self.start` and `self.end` always lie on unicode boundaries. + self.finished = true; + Some(self.haystack.get_unchecked(self.start..self.end)) + }, + } + } + + #[inline] + fn next_back_inclusive(&mut self) -> Option<&'a JavaStr> { + if self.finished { + return None; + } + + if !self.allow_trailing_empty { + self.allow_trailing_empty = true; + match self.next_back_inclusive() { + Some(elt) if !elt.is_empty() => return Some(elt), + _ => { + if self.finished { + return None; + } + } + } + } + + match self.next_match_back() { + Some((index, len)) => unsafe { + // SAFETY: pattern guarantees valid indices + let elt = self.haystack.get_unchecked(index + len..self.end); + self.end = index + len; + Some(elt) + }, + None => unsafe { + // SAFETY: `self.start` and `self.end` always lie on unicode boundaries. 
+ self.finished = true; + Some(self.haystack.get_unchecked(self.start..self.end)) + }, + } + } +} + +#[derive(Clone, Debug)] +pub struct Split<'a, P> { + inner: SplitHelper<'a, P>, +} + +impl<'a, P> Split<'a, P> +where + P: JavaStrPattern, +{ + #[inline] + pub(crate) fn new(haystack: &'a JavaStr, pat: P) -> Self { + Split { + inner: SplitHelper::new(haystack, pat, true), + } + } +} + +impl<'a, P> Iterator for Split<'a, P> +where + P: JavaStrPattern, +{ + type Item = &'a JavaStr; + + #[inline] + fn next(&mut self) -> Option { + self.inner.next() + } +} + +impl<'a, P> DoubleEndedIterator for Split<'a, P> +where + P: JavaStrPattern, +{ + #[inline] + fn next_back(&mut self) -> Option { + self.inner.next_back() + } +} + +impl<'a, P> FusedIterator for Split<'a, P> where P: JavaStrPattern {} + +#[derive(Clone, Debug)] +pub struct RSplit<'a, P> { + inner: SplitHelper<'a, P>, +} + +impl<'a, P> RSplit<'a, P> +where + P: JavaStrPattern, +{ + #[inline] + pub(crate) fn new(haystack: &'a JavaStr, pat: P) -> Self { + RSplit { + inner: SplitHelper::new(haystack, pat, true), + } + } +} + +impl<'a, P> Iterator for RSplit<'a, P> +where + P: JavaStrPattern, +{ + type Item = &'a JavaStr; + + #[inline] + fn next(&mut self) -> Option { + self.inner.next_back() + } +} + +impl<'a, P> DoubleEndedIterator for RSplit<'a, P> +where + P: JavaStrPattern, +{ + #[inline] + fn next_back(&mut self) -> Option { + self.inner.next() + } +} + +impl<'a, P> FusedIterator for RSplit<'a, P> where P: JavaStrPattern {} + +#[derive(Clone, Debug)] +pub struct SplitTerminator<'a, P> { + inner: SplitHelper<'a, P>, +} + +impl<'a, P> SplitTerminator<'a, P> +where + P: JavaStrPattern, +{ + #[inline] + pub(crate) fn new(haystack: &'a JavaStr, pat: P) -> Self { + SplitTerminator { + inner: SplitHelper::new(haystack, pat, false), + } + } +} + +impl<'a, P> Iterator for SplitTerminator<'a, P> +where + P: JavaStrPattern, +{ + type Item = &'a JavaStr; + + #[inline] + fn next(&mut self) -> Option { + self.inner.next() + } 
+} + +impl<'a, P> DoubleEndedIterator for SplitTerminator<'a, P> +where + P: JavaStrPattern, +{ + #[inline] + fn next_back(&mut self) -> Option { + self.inner.next_back() + } +} + +impl<'a, P> FusedIterator for SplitTerminator<'a, P> where P: JavaStrPattern {} + +#[derive(Clone, Debug)] +pub struct RSplitTerminator<'a, P> { + inner: SplitHelper<'a, P>, +} + +impl<'a, P> RSplitTerminator<'a, P> +where + P: JavaStrPattern, +{ + #[inline] + pub(crate) fn new(haystack: &'a JavaStr, pat: P) -> Self { + RSplitTerminator { + inner: SplitHelper::new(haystack, pat, false), + } + } +} + +impl<'a, P> Iterator for RSplitTerminator<'a, P> +where + P: JavaStrPattern, +{ + type Item = &'a JavaStr; + + #[inline] + fn next(&mut self) -> Option { + self.inner.next_back() + } +} + +impl<'a, P> DoubleEndedIterator for RSplitTerminator<'a, P> +where + P: JavaStrPattern, +{ + #[inline] + fn next_back(&mut self) -> Option { + self.inner.next() + } +} + +impl<'a, P> FusedIterator for RSplitTerminator<'a, P> where P: JavaStrPattern {} + +#[derive(Clone, Debug)] +pub struct SplitInclusive<'a, P> { + inner: SplitHelper<'a, P>, +} + +impl<'a, P> SplitInclusive<'a, P> +where + P: JavaStrPattern, +{ + #[inline] + pub(crate) fn new(haystack: &'a JavaStr, pat: P) -> Self { + SplitInclusive { + inner: SplitHelper::new(haystack, pat, false), + } + } +} + +impl<'a, P> Iterator for SplitInclusive<'a, P> +where + P: JavaStrPattern, +{ + type Item = &'a JavaStr; + + #[inline] + fn next(&mut self) -> Option { + self.inner.next_inclusive() + } +} + +impl<'a, P> DoubleEndedIterator for SplitInclusive<'a, P> +where + P: JavaStrPattern, +{ + #[inline] + fn next_back(&mut self) -> Option { + self.inner.next_back_inclusive() + } +} + +impl<'a, P> FusedIterator for SplitInclusive<'a, P> where P: JavaStrPattern {} + +#[derive(Clone, Debug)] +pub struct SplitN<'a, P> { + inner: SplitHelper<'a, P>, + count: usize, +} + +impl<'a, P> SplitN<'a, P> +where + P: JavaStrPattern, +{ + #[inline] + pub(crate) fn 
new(haystack: &'a JavaStr, pat: P, count: usize) -> Self { + SplitN { + inner: SplitHelper::new(haystack, pat, true), + count, + } + } +} + +impl<'a, P> Iterator for SplitN<'a, P> +where + P: JavaStrPattern, +{ + type Item = &'a JavaStr; + + #[inline] + fn next(&mut self) -> Option { + match self.count { + 0 => None, + 1 => { + self.count = 0; + self.inner.get_end() + } + _ => { + self.count -= 1; + self.inner.next() + } + } + } +} + +impl<'a, P> FusedIterator for SplitN<'a, P> where P: JavaStrPattern {} + +#[derive(Clone, Debug)] +pub struct RSplitN<'a, P> { + inner: SplitHelper<'a, P>, + count: usize, +} + +impl<'a, P> RSplitN<'a, P> +where + P: JavaStrPattern, +{ + #[inline] + pub(crate) fn new(haystack: &'a JavaStr, pat: P, count: usize) -> Self { + RSplitN { + inner: SplitHelper::new(haystack, pat, true), + count, + } + } +} + +impl<'a, P> Iterator for RSplitN<'a, P> +where + P: JavaStrPattern, +{ + type Item = &'a JavaStr; + + #[inline] + fn next(&mut self) -> Option { + match self.count { + 0 => None, + 1 => { + self.count = 0; + self.inner.get_end() + } + _ => { + self.count -= 1; + self.inner.next_back() + } + } + } +} + +impl<'a, P> FusedIterator for RSplitN<'a, P> where P: JavaStrPattern {} + +#[derive(Clone, Debug)] +pub struct SplitAsciiWhitespace<'a> { + #[allow(clippy::type_complexity)] + pub(crate) inner: Map< + Filter bool>, fn(&&[u8]) -> bool>, + fn(&[u8]) -> &JavaStr, + >, +} +delegate!(Iterator for SplitAsciiWhitespace<'a> => &'a JavaStr); +delegate!(DoubleEndedIterator for SplitAsciiWhitespace<'a>); +delegate!(FusedIterator for SplitAsciiWhitespace<'a>); + +#[derive(Clone, Debug)] +pub struct SplitWhitespace<'a> { + #[allow(clippy::type_complexity)] + pub(crate) inner: Filter bool>, fn(&&JavaStr) -> bool>, +} +delegate!(Iterator for SplitWhitespace<'a> => &'a JavaStr); +delegate!(DoubleEndedIterator for SplitWhitespace<'a>); +delegate!(FusedIterator for SplitWhitespace<'a>); diff --git a/crates/valence_java_string/src/lib.rs 
b/crates/valence_java_string/src/lib.rs new file mode 100644 index 000000000..82e80d7ab --- /dev/null +++ b/crates/valence_java_string/src/lib.rs @@ -0,0 +1,25 @@ +#![doc = include_str!("../README.md")] + +mod char; +mod error; +mod iter; +mod owned; +mod pattern; +#[cfg(feature = "serde")] +mod serde; +mod slice; +pub(crate) mod validations; + +pub use char::*; +pub use error::*; +pub use iter::*; +pub use owned::*; +pub use pattern::*; +pub use slice::*; + +#[macro_export] +macro_rules! format_java { + ($($arg:tt)*) => { + $crate::JavaString::from(::std::format!($($arg)*)) + } +} diff --git a/crates/valence_java_string/src/owned.rs b/crates/valence_java_string/src/owned.rs new file mode 100644 index 000000000..66a055d64 --- /dev/null +++ b/crates/valence_java_string/src/owned.rs @@ -0,0 +1,1200 @@ +use std::borrow::{Borrow, BorrowMut, Cow}; +use std::collections::{Bound, TryReserveError}; +use std::convert::Infallible; +use std::fmt::{Debug, Display, Formatter, Write}; +use std::hash::{Hash, Hasher}; +use std::iter::FusedIterator; +use std::ops::{ + Add, AddAssign, Deref, DerefMut, Index, IndexMut, Range, RangeBounds, RangeFrom, RangeFull, + RangeInclusive, RangeTo, RangeToInclusive, +}; +use std::rc::Rc; +use std::str::FromStr; +use std::sync::Arc; +use std::{ptr, slice}; + +use crate::validations::{ + run_utf8_full_validation_from_semi, run_utf8_semi_validation, to_range_checked, +}; +use crate::{Chars, FromUtf8Error, JavaCodePoint, JavaStr, Utf8Error}; + +#[derive(Default, PartialEq, PartialOrd, Eq, Ord)] +pub struct JavaString { + vec: Vec, +} + +impl JavaString { + #[inline] + #[must_use] + pub const fn new() -> JavaString { + JavaString { vec: Vec::new() } + } + + #[inline] + #[must_use] + pub fn with_capacity(capacity: usize) -> JavaString { + JavaString { + vec: Vec::with_capacity(capacity), + } + } + + #[inline] + pub fn from_utf8(vec: Vec) -> Result { + match std::str::from_utf8(&vec) { + Ok(..) 
=> Ok(JavaString { vec }), + Err(e) => Err(FromUtf8Error { + bytes: vec, + error: e.into(), + }), + } + } + + pub fn from_semi_utf8(vec: Vec) -> Result { + match run_utf8_semi_validation(&vec) { + Ok(..) => Ok(JavaString { vec }), + Err(err) => Err(FromUtf8Error { + bytes: vec, + error: err, + }), + } + } + + #[must_use] + pub fn from_semi_utf8_lossy(v: &[u8]) -> Cow<'_, JavaStr> { + const REPLACEMENT: &str = "\u{FFFD}"; + + match run_utf8_semi_validation(v) { + Ok(()) => unsafe { + // SAFETY: validation succeeded + Cow::Borrowed(JavaStr::from_semi_utf8_unchecked(v)) + }, + Err(error) => { + let mut result = unsafe { + // SAFETY: validation succeeded up to this index + JavaString::from_semi_utf8_unchecked( + v.get_unchecked(..error.valid_up_to).to_vec(), + ) + }; + result.push_str(REPLACEMENT); + let mut index = error.valid_up_to + error.error_len.unwrap_or(1) as usize; + loop { + match run_utf8_semi_validation(&v[index..]) { + Ok(()) => { + unsafe { + // SAFETY: validation succeeded + result + .push_java_str(JavaStr::from_semi_utf8_unchecked(&v[index..])); + } + return Cow::Owned(result); + } + Err(error) => { + unsafe { + // SAFETY: validation succeeded up to this index + result.push_java_str(JavaStr::from_semi_utf8_unchecked( + v.get_unchecked(index..index + error.valid_up_to), + )); + } + result.push_str(REPLACEMENT); + index += error.valid_up_to + error.error_len.unwrap_or(1) as usize; + } + } + } + } + } + } + + /// # Safety + /// + /// The parameter must be in semi-valid UTF-8 format, that is, UTF-8 plus + /// surrogate code points. 
+ #[inline] + #[must_use] + pub unsafe fn from_semi_utf8_unchecked(bytes: Vec) -> JavaString { + JavaString { vec: bytes } + } + + #[inline] + #[must_use] + pub fn into_bytes(self) -> Vec { + self.vec + } + + #[inline] + #[must_use] + pub fn as_java_str(&self) -> &JavaStr { + unsafe { + // SAFETY: this str has semi-valid UTF-8 + JavaStr::from_semi_utf8_unchecked(&self.vec) + } + } + + #[inline] + #[must_use] + pub fn as_mut_java_str(&mut self) -> &mut JavaStr { + unsafe { + // SAFETY: this str has semi-valid UTF-8 + JavaStr::from_semi_utf8_unchecked_mut(&mut self.vec) + } + } + + pub fn into_string(self) -> Result { + run_utf8_full_validation_from_semi(self.as_bytes()).map(|_| unsafe { + // SAFETY: validation succeeded + self.into_string_unchecked() + }) + } + + /// # Safety + /// + /// This string must be fully valid UTF-8, i.e. have no surrogate code + /// points. + #[inline] + #[must_use] + pub unsafe fn into_string_unchecked(self) -> String { + // SAFETY: preconditions checked by caller + String::from_utf8_unchecked(self.vec) + } + + #[inline] + pub fn push_java_str(&mut self, string: &JavaStr) { + self.vec.extend_from_slice(string.as_bytes()) + } + + #[inline] + pub fn push_str(&mut self, string: &str) { + self.vec.extend_from_slice(string.as_bytes()) + } + + #[inline] + #[must_use] + pub fn capacity(&self) -> usize { + self.vec.capacity() + } + + #[inline] + pub fn reserve(&mut self, additional: usize) { + self.vec.reserve(additional) + } + + #[inline] + pub fn reserve_exact(&mut self, additional: usize) { + self.vec.reserve_exact(additional) + } + + #[inline] + pub fn try_reserve(&mut self, additional: usize) -> Result<(), TryReserveError> { + self.vec.try_reserve(additional) + } + + #[inline] + pub fn try_reserve_exact(&mut self, additional: usize) -> Result<(), TryReserveError> { + self.vec.try_reserve_exact(additional) + } + + #[inline] + pub fn shrink_to_fit(&mut self) { + self.vec.shrink_to_fit() + } + + #[inline] + pub fn shrink_to(&mut self, 
min_capacity: usize) { + self.vec.shrink_to(min_capacity) + } + + #[inline] + pub fn push(&mut self, ch: char) { + match ch.len_utf8() { + 1 => self.vec.push(ch as u8), + _ => self + .vec + .extend_from_slice(ch.encode_utf8(&mut [0; 4]).as_bytes()), + } + } + + #[inline] + pub fn push_java(&mut self, ch: JavaCodePoint) { + match ch.len_utf8() { + 1 => self.vec.push(ch.as_u32() as u8), + _ => self.vec.extend_from_slice(ch.encode_semi_utf8(&mut [0; 4])), + } + } + + #[inline] + #[must_use] + pub fn as_bytes(&self) -> &[u8] { + &self.vec + } + + #[inline] + pub fn truncate(&mut self, new_len: usize) { + if new_len <= self.len() { + assert!(self.is_char_boundary(new_len)); + self.vec.truncate(new_len) + } + } + + #[inline] + pub fn pop(&mut self) -> Option { + let ch = self.chars().next_back()?; + let newlen = self.len() - ch.len_utf8(); + unsafe { + self.vec.set_len(newlen); + } + Some(ch) + } + + #[inline] + pub fn remove(&mut self, idx: usize) -> JavaCodePoint { + let ch = match self[idx..].chars().next() { + Some(ch) => ch, + None => panic!("cannot remove a char from the end of a string"), + }; + + let next = idx + ch.len_utf8(); + let len = self.len(); + unsafe { + ptr::copy( + self.vec.as_ptr().add(next), + self.vec.as_mut_ptr().add(idx), + len - next, + ); + self.vec.set_len(len - (next - idx)); + } + ch + } + + #[inline] + pub fn retain(&mut self, mut f: F) + where + F: FnMut(JavaCodePoint) -> bool, + { + struct SetLenOnDrop<'a> { + s: &'a mut JavaString, + idx: usize, + del_bytes: usize, + } + + impl<'a> Drop for SetLenOnDrop<'a> { + #[inline] + fn drop(&mut self) { + let new_len = self.idx - self.del_bytes; + debug_assert!(new_len <= self.s.len()); + unsafe { self.s.vec.set_len(new_len) }; + } + } + + let len = self.len(); + let mut guard = SetLenOnDrop { + s: self, + idx: 0, + del_bytes: 0, + }; + + while guard.idx < len { + // SAFETY: `guard.idx` is positive-or-zero and less that len so the + // `get_unchecked` is in bound. 
`self` is valid UTF-8 like string + // and the returned slice starts at a unicode code point so the + // `Chars` always return one character. + let ch = unsafe { + guard + .s + .get_unchecked(guard.idx..len) + .chars() + .next() + .unwrap_unchecked() + }; + let ch_len = ch.len_utf8(); + + if !f(ch) { + guard.del_bytes += ch_len; + } else if guard.del_bytes > 0 { + // SAFETY: `guard.idx` is in bound and `guard.del_bytes` represent the number of + // bytes that are erased from the string so the resulting `guard.idx - + // guard.del_bytes` always represent a valid unicode code point. + // + // `guard.del_bytes` >= `ch.len_utf8()`, so taking a slice with `ch.len_utf8()` + // len is safe. + ch.encode_semi_utf8(unsafe { + slice::from_raw_parts_mut( + guard.s.as_mut_ptr().add(guard.idx - guard.del_bytes), + ch.len_utf8(), + ) + }); + } + + // Point idx to the next char + guard.idx += ch_len; + } + + drop(guard); + } + + #[inline] + pub fn insert(&mut self, idx: usize, ch: char) { + assert!(self.is_char_boundary(idx)); + let mut bits = [0; 4]; + let bits = ch.encode_utf8(&mut bits).as_bytes(); + + unsafe { + self.insert_bytes(idx, bits); + } + } + + #[inline] + pub fn insert_java(&mut self, idx: usize, ch: JavaCodePoint) { + assert!(self.is_char_boundary(idx)); + let mut bits = [0; 4]; + let bits = ch.encode_semi_utf8(&mut bits); + + unsafe { + self.insert_bytes(idx, bits); + } + } + + #[inline] + unsafe fn insert_bytes(&mut self, idx: usize, bytes: &[u8]) { + let len = self.len(); + let amt = bytes.len(); + self.vec.reserve(amt); + + unsafe { + ptr::copy( + self.vec.as_ptr().add(idx), + self.vec.as_mut_ptr().add(idx + amt), + len - idx, + ); + ptr::copy_nonoverlapping(bytes.as_ptr(), self.vec.as_mut_ptr().add(idx), amt); + self.vec.set_len(len + amt); + } + } + + #[inline] + pub fn insert_str(&mut self, idx: usize, string: &str) { + assert!(self.is_char_boundary(idx)); + + unsafe { + self.insert_bytes(idx, string.as_bytes()); + } + } + + pub fn insert_java_str(&mut self, 
idx: usize, string: &JavaStr) { + assert!(self.is_char_boundary(idx)); + + unsafe { + self.insert_bytes(idx, string.as_bytes()); + } + } + + /// # Safety + /// + /// The returned `Vec` must not have invalid UTF-8 written to it, besides + /// surrogate pairs. + #[inline] + pub unsafe fn as_mut_vec(&mut self) -> &mut Vec { + &mut self.vec + } + + #[inline] + #[must_use] + pub fn len(&self) -> usize { + self.vec.len() + } + + #[inline] + #[must_use] + pub fn is_empty(&self) -> bool { + self.len() == 0 + } + + #[inline] + #[must_use] + pub fn split_off(&mut self, at: usize) -> JavaString { + assert!(self.is_char_boundary(at)); + let other = self.vec.split_off(at); + unsafe { JavaString::from_semi_utf8_unchecked(other) } + } + + #[inline] + pub fn clear(&mut self) { + self.vec.clear(); + } + + #[inline] + pub fn drain(&mut self, range: R) -> Drain<'_> + where + R: RangeBounds, + { + // Memory safety: see String::drain + let Range { start, end } = to_range_checked(range, ..self.len()); + assert!(self.is_char_boundary(start)); + assert!(self.is_char_boundary(end)); + + // Take out two simultaneous borrows. The &mut String won't be accessed + // until iteration is over, in Drop. + let self_ptr = self as *mut _; + // SAFETY: `to_range_checked` and `is_char_boundary` do the appropriate bounds + // checks. 
+ let chars_iter = unsafe { self.get_unchecked(start..end) }.chars(); + + Drain { + start, + end, + iter: chars_iter, + string: self_ptr, + } + } + + pub fn replace_range(&mut self, range: R, replace_with: &str) + where + R: RangeBounds, + { + self.replace_range_java(range, JavaStr::from_str(replace_with)) + } + + pub fn replace_range_java(&mut self, range: R, replace_with: &JavaStr) + where + R: RangeBounds, + { + let start = range.start_bound(); + match start { + Bound::Included(&n) => assert!(self.is_char_boundary(n)), + Bound::Excluded(&n) => assert!(self.is_char_boundary(n + 1)), + Bound::Unbounded => {} + }; + let end = range.end_bound(); + match end { + Bound::Included(&n) => assert!(self.is_char_boundary(n + 1)), + Bound::Excluded(&n) => assert!(self.is_char_boundary(n)), + Bound::Unbounded => {} + }; + + unsafe { self.as_mut_vec() }.splice((start, end), replace_with.bytes()); + } + + #[inline] + #[must_use] + pub fn into_boxed_str(self) -> Box { + let slice = self.vec.into_boxed_slice(); + unsafe { JavaStr::from_boxed_semi_utf8_unchecked(slice) } + } + + #[inline] + pub fn leak<'a>(self) -> &'a mut JavaStr { + let slice = self.vec.leak(); + unsafe { JavaStr::from_semi_utf8_unchecked_mut(slice) } + } +} + +impl Add<&str> for JavaString { + type Output = JavaString; + + #[inline] + fn add(mut self, rhs: &str) -> Self::Output { + self.push_str(rhs); + self + } +} + +impl Add<&JavaStr> for JavaString { + type Output = JavaString; + + #[inline] + fn add(mut self, rhs: &JavaStr) -> Self::Output { + self.push_java_str(rhs); + self + } +} + +impl AddAssign<&str> for JavaString { + #[inline] + fn add_assign(&mut self, rhs: &str) { + self.push_str(rhs); + } +} + +impl AddAssign<&JavaStr> for JavaString { + #[inline] + fn add_assign(&mut self, rhs: &JavaStr) { + self.push_java_str(rhs); + } +} + +impl AsMut for JavaString { + #[inline] + fn as_mut(&mut self) -> &mut JavaStr { + self.as_mut_java_str() + } +} + +impl AsRef<[u8]> for JavaString { + #[inline] + fn 
as_ref(&self) -> &[u8] { + self.as_bytes() + } +} + +impl AsRef for JavaString { + #[inline] + fn as_ref(&self) -> &JavaStr { + self.as_java_str() + } +} + +impl Borrow for JavaString { + #[inline] + fn borrow(&self) -> &JavaStr { + self.as_java_str() + } +} + +impl BorrowMut for JavaString { + #[inline] + fn borrow_mut(&mut self) -> &mut JavaStr { + self.as_mut_java_str() + } +} + +impl Clone for JavaString { + #[inline] + fn clone(&self) -> Self { + JavaString { + vec: self.vec.clone(), + } + } + + #[inline] + fn clone_from(&mut self, source: &Self) { + self.vec.clone_from(&source.vec) + } +} + +impl Debug for JavaString { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + Debug::fmt(&**self, f) + } +} + +impl Deref for JavaString { + type Target = JavaStr; + + #[inline] + fn deref(&self) -> &Self::Target { + self.as_java_str() + } +} + +impl DerefMut for JavaString { + #[inline] + fn deref_mut(&mut self) -> &mut Self::Target { + self.as_mut_java_str() + } +} + +impl Display for JavaString { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + Display::fmt(&**self, f) + } +} + +impl Extend for JavaString { + fn extend>(&mut self, iter: T) { + let iterator = iter.into_iter(); + let (lower_bound, _) = iterator.size_hint(); + self.reserve(lower_bound); + iterator.for_each(move |c| self.push(c)); + } +} + +impl Extend for JavaString { + fn extend>(&mut self, iter: T) { + let iterator = iter.into_iter(); + let (lower_bound, _) = iterator.size_hint(); + self.reserve(lower_bound); + iterator.for_each(move |c| self.push_java(c)); + } +} + +impl Extend for JavaString { + fn extend>(&mut self, iter: T) { + iter.into_iter().for_each(move |s| self.push_str(&s)); + } +} + +impl Extend for JavaString { + fn extend>(&mut self, iter: T) { + iter.into_iter().for_each(move |s| self.push_java_str(&s)); + } +} + +impl<'a> Extend<&'a char> for JavaString { + fn extend>(&mut self, iter: T) { + self.extend(iter.into_iter().cloned()) + } +} + +impl<'a> Extend<&'a 
JavaCodePoint> for JavaString { + fn extend>(&mut self, iter: T) { + self.extend(iter.into_iter().cloned()) + } +} + +impl<'a> Extend<&'a str> for JavaString { + fn extend>(&mut self, iter: T) { + iter.into_iter().for_each(move |s| self.push_str(s)); + } +} + +impl<'a> Extend<&'a JavaStr> for JavaString { + fn extend>(&mut self, iter: T) { + iter.into_iter().for_each(move |s| self.push_java_str(s)); + } +} + +impl Extend> for JavaString { + fn extend>>(&mut self, iter: T) { + iter.into_iter().for_each(move |s| self.push_str(&s)); + } +} + +impl Extend> for JavaString { + fn extend>>(&mut self, iter: T) { + iter.into_iter().for_each(move |s| self.push_java_str(&s)); + } +} + +impl<'a> Extend> for JavaString { + fn extend>>(&mut self, iter: T) { + iter.into_iter().for_each(move |s| self.push_str(&s)); + } +} + +impl<'a> Extend> for JavaString { + fn extend>>(&mut self, iter: T) { + iter.into_iter().for_each(move |s| self.push_java_str(&s)); + } +} + +impl From for JavaString { + #[inline] + fn from(value: String) -> Self { + unsafe { + // SAFETY: value is valid UTF-8 + JavaString::from_semi_utf8_unchecked(value.into_bytes()) + } + } +} + +impl From<&String> for JavaString { + #[inline] + fn from(value: &String) -> Self { + Self::from(value.clone()) + } +} + +impl From<&JavaString> for JavaString { + #[inline] + fn from(value: &JavaString) -> Self { + value.clone() + } +} + +impl From<&mut str> for JavaString { + #[inline] + fn from(value: &mut str) -> Self { + Self::from(&*value) + } +} + +impl From<&str> for JavaString { + #[inline] + fn from(value: &str) -> Self { + Self::from(value.to_owned()) + } +} + +impl From<&mut JavaStr> for JavaString { + #[inline] + fn from(value: &mut JavaStr) -> Self { + Self::from(&*value) + } +} + +impl From<&JavaStr> for JavaString { + #[inline] + fn from(value: &JavaStr) -> Self { + value.to_owned() + } +} + +impl From> for JavaString { + #[inline] + fn from(value: Box) -> Self { + Self::from(value.into_string()) + } +} + +impl From> 
for JavaString { + #[inline] + fn from(value: Box) -> Self { + value.into_string() + } +} + +impl<'a> From> for JavaString { + #[inline] + fn from(value: Cow<'a, str>) -> Self { + Self::from(value.into_owned()) + } +} + +impl<'a> From> for JavaString { + #[inline] + fn from(value: Cow<'a, JavaStr>) -> Self { + value.into_owned() + } +} + +impl From for Arc { + #[inline] + fn from(value: JavaString) -> Self { + Arc::from(&value[..]) + } +} + +impl<'a> From for Cow<'a, JavaStr> { + #[inline] + fn from(value: JavaString) -> Self { + Cow::Owned(value) + } +} + +impl From for Rc { + #[inline] + fn from(value: JavaString) -> Self { + Rc::from(&value[..]) + } +} + +impl From for Vec { + #[inline] + fn from(value: JavaString) -> Self { + value.into_bytes() + } +} + +impl From for JavaString { + #[inline] + fn from(value: char) -> Self { + Self::from(value.encode_utf8(&mut [0; 4])) + } +} + +impl From for JavaString { + #[inline] + fn from(value: JavaCodePoint) -> Self { + unsafe { + // SAFETY: we're encoding into semi-valid UTF-8 + JavaString::from_semi_utf8_unchecked(value.encode_semi_utf8(&mut [0; 4]).to_vec()) + } + } +} + +impl FromIterator for JavaString { + #[inline] + fn from_iter>(iter: T) -> Self { + let mut buf = JavaString::new(); + buf.extend(iter); + buf + } +} + +impl<'a> FromIterator<&'a char> for JavaString { + #[inline] + fn from_iter>(iter: T) -> Self { + let mut buf = JavaString::new(); + buf.extend(iter); + buf + } +} + +impl FromIterator for JavaString { + #[inline] + fn from_iter>(iter: T) -> Self { + let mut buf = JavaString::new(); + buf.extend(iter); + buf + } +} + +impl<'a> FromIterator<&'a JavaCodePoint> for JavaString { + #[inline] + fn from_iter>(iter: T) -> Self { + let mut buf = JavaString::new(); + buf.extend(iter); + buf + } +} + +impl<'a> FromIterator<&'a str> for JavaString { + #[inline] + fn from_iter>(iter: T) -> Self { + let mut buf = JavaString::new(); + buf.extend(iter); + buf + } +} + +impl FromIterator for JavaString { + fn 
from_iter>(iter: T) -> Self { + let mut iterator = iter.into_iter(); + + match iterator.next() { + None => JavaString::new(), + Some(buf) => { + let mut buf = JavaString::from(buf); + buf.extend(iterator); + buf + } + } + } +} + +impl FromIterator for JavaString { + fn from_iter>(iter: T) -> Self { + let mut iterator = iter.into_iter(); + + match iterator.next() { + None => JavaString::new(), + Some(mut buf) => { + buf.extend(iterator); + buf + } + } + } +} + +impl FromIterator> for JavaString { + #[inline] + fn from_iter>>(iter: T) -> Self { + let mut buf = JavaString::new(); + buf.extend(iter); + buf + } +} + +impl FromIterator> for JavaString { + #[inline] + fn from_iter>>(iter: T) -> Self { + let mut buf = JavaString::new(); + buf.extend(iter); + buf + } +} + +impl<'a> FromIterator> for JavaString { + #[inline] + fn from_iter>>(iter: T) -> Self { + let mut buf = JavaString::new(); + buf.extend(iter); + buf + } +} + +impl<'a> FromIterator> for JavaString { + #[inline] + fn from_iter>>(iter: T) -> Self { + let mut buf = JavaString::new(); + buf.extend(iter); + buf + } +} + +impl FromStr for JavaString { + type Err = Infallible; + + #[inline] + fn from_str(s: &str) -> Result { + Ok(Self::from(s)) + } +} + +impl Hash for JavaString { + #[inline] + fn hash(&self, state: &mut H) { + (**self).hash(state) + } +} + +impl Index> for JavaString { + type Output = JavaStr; + + #[inline] + fn index(&self, index: Range) -> &Self::Output { + &self[..][index] + } +} + +impl Index> for JavaString { + type Output = JavaStr; + + #[inline] + fn index(&self, index: RangeFrom) -> &Self::Output { + &self[..][index] + } +} + +impl Index for JavaString { + type Output = JavaStr; + + #[inline] + fn index(&self, _index: RangeFull) -> &Self::Output { + self.as_java_str() + } +} + +impl Index> for JavaString { + type Output = JavaStr; + + #[inline] + fn index(&self, index: RangeInclusive) -> &Self::Output { + &self[..][index] + } +} + +impl Index> for JavaString { + type Output = JavaStr; + 
+ #[inline] + fn index(&self, index: RangeTo) -> &Self::Output { + &self[..][index] + } +} + +impl Index> for JavaString { + type Output = JavaStr; + + #[inline] + fn index(&self, index: RangeToInclusive) -> &Self::Output { + &self[..][index] + } +} + +impl IndexMut> for JavaString { + #[inline] + fn index_mut(&mut self, index: Range) -> &mut Self::Output { + &mut self[..][index] + } +} + +impl IndexMut> for JavaString { + #[inline] + fn index_mut(&mut self, index: RangeFrom) -> &mut Self::Output { + &mut self[..][index] + } +} + +impl IndexMut for JavaString { + #[inline] + fn index_mut(&mut self, _index: RangeFull) -> &mut Self::Output { + self.as_mut_java_str() + } +} + +impl IndexMut> for JavaString { + #[inline] + fn index_mut(&mut self, index: RangeInclusive) -> &mut Self::Output { + &mut self[..][index] + } +} + +impl IndexMut> for JavaString { + #[inline] + fn index_mut(&mut self, index: RangeTo) -> &mut Self::Output { + &mut self[..][index] + } +} + +impl IndexMut> for JavaString { + #[inline] + fn index_mut(&mut self, index: RangeToInclusive) -> &mut Self::Output { + &mut self[..][index] + } +} + +impl PartialEq for JavaString { + #[inline] + fn eq(&self, other: &str) -> bool { + self[..] == other + } +} + +impl PartialEq for str { + #[inline] + fn eq(&self, other: &JavaString) -> bool { + self == other[..] + } +} + +impl<'a> PartialEq<&'a str> for JavaString { + #[inline] + fn eq(&self, other: &&'a str) -> bool { + self == *other + } +} + +impl<'a> PartialEq for &'a str { + #[inline] + fn eq(&self, other: &JavaString) -> bool { + *self == other + } +} + +impl PartialEq for JavaString { + #[inline] + fn eq(&self, other: &String) -> bool { + &self[..] == other + } +} + +impl PartialEq for String { + #[inline] + fn eq(&self, other: &JavaString) -> bool { + self == &other[..] + } +} + +impl PartialEq for JavaString { + #[inline] + fn eq(&self, other: &JavaStr) -> bool { + self[..] 
== other + } +} + +impl<'a> PartialEq<&'a JavaStr> for JavaString { + #[inline] + fn eq(&self, other: &&'a JavaStr) -> bool { + self == *other + } +} + +impl<'a> PartialEq> for JavaString { + #[inline] + fn eq(&self, other: &Cow<'a, str>) -> bool { + &self[..] == other + } +} + +impl<'a> PartialEq for Cow<'a, str> { + #[inline] + fn eq(&self, other: &JavaString) -> bool { + self == &other[..] + } +} + +impl<'a> PartialEq> for JavaString { + #[inline] + fn eq(&self, other: &Cow<'a, JavaStr>) -> bool { + &self[..] == other + } +} + +impl<'a> PartialEq for Cow<'a, JavaStr> { + #[inline] + fn eq(&self, other: &JavaString) -> bool { + self == &other[..] + } +} + +impl Write for JavaString { + #[inline] + fn write_str(&mut self, s: &str) -> std::fmt::Result { + self.push_str(s); + Ok(()) + } + + #[inline] + fn write_char(&mut self, c: char) -> std::fmt::Result { + self.push(c); + Ok(()) + } +} + +pub struct Drain<'a> { + string: *mut JavaString, + start: usize, + end: usize, + iter: Chars<'a>, +} + +impl Debug for Drain<'_> { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + f.debug_tuple("Drain").field(&self.as_str()).finish() + } +} + +unsafe impl Sync for Drain<'_> {} +unsafe impl Send for Drain<'_> {} + +impl Drop for Drain<'_> { + #[inline] + fn drop(&mut self) { + unsafe { + // Use Vec::drain. "Reaffirm" the bounds checks to avoid + // panic code being inserted again. 
+ let self_vec = (*self.string).as_mut_vec(); + if self.start <= self.end && self.end <= self_vec.len() { + self_vec.drain(self.start..self.end); + } + } + } +} + +impl AsRef for Drain<'_> { + #[inline] + fn as_ref(&self) -> &JavaStr { + self.as_str() + } +} + +impl AsRef<[u8]> for Drain<'_> { + #[inline] + fn as_ref(&self) -> &[u8] { + self.as_str().as_bytes() + } +} + +impl Drain<'_> { + #[inline] + #[must_use] + pub fn as_str(&self) -> &JavaStr { + self.iter.as_str() + } +} + +impl Iterator for Drain<'_> { + type Item = JavaCodePoint; + + #[inline] + fn next(&mut self) -> Option { + self.iter.next() + } + + #[inline] + fn size_hint(&self) -> (usize, Option) { + self.iter.size_hint() + } + + #[inline] + fn last(mut self) -> Option { + self.next_back() + } +} + +impl DoubleEndedIterator for Drain<'_> { + #[inline] + fn next_back(&mut self) -> Option { + self.iter.next_back() + } +} + +impl FusedIterator for Drain<'_> {} diff --git a/crates/valence_java_string/src/pattern.rs b/crates/valence_java_string/src/pattern.rs new file mode 100644 index 000000000..06cc78041 --- /dev/null +++ b/crates/valence_java_string/src/pattern.rs @@ -0,0 +1,402 @@ +use crate::{JavaCodePoint, JavaStr}; + +mod private_pattern { + use crate::{JavaCodePoint, JavaStr}; + + pub trait Sealed {} + + impl Sealed for char {} + impl Sealed for JavaCodePoint {} + impl Sealed for &str {} + impl Sealed for &JavaStr {} + impl Sealed for F where F: FnMut(JavaCodePoint) -> bool {} + impl Sealed for &[char] {} + impl Sealed for &[JavaCodePoint] {} + impl Sealed for &char {} + impl Sealed for &JavaCodePoint {} + impl Sealed for &&str {} + impl Sealed for &&JavaStr {} +} + +/// # Safety +/// +/// Methods in this trait must only return indexes that are on char boundaries +pub unsafe trait JavaStrPattern: private_pattern::Sealed { + fn prefix_len_in(&mut self, haystack: &JavaStr) -> Option; + fn suffix_len_in(&mut self, haystack: &JavaStr) -> Option; + fn find_in(&mut self, haystack: &JavaStr) -> 
Option<(usize, usize)>; + fn rfind_in(&mut self, haystack: &JavaStr) -> Option<(usize, usize)>; +} + +unsafe impl JavaStrPattern for char { + #[inline] + fn prefix_len_in(&mut self, haystack: &JavaStr) -> Option { + let ch = haystack.chars().next()?; + if ch == *self { + Some(ch.len_utf8()) + } else { + None + } + } + + #[inline] + fn suffix_len_in(&mut self, haystack: &JavaStr) -> Option { + let ch = haystack.chars().next_back()?; + if ch == *self { + Some(ch.len_utf8()) + } else { + None + } + } + + #[inline] + fn find_in(&mut self, haystack: &JavaStr) -> Option<(usize, usize)> { + let mut encoded = [0; 4]; + let encoded = self.encode_utf8(&mut encoded).as_bytes(); + find(haystack.as_bytes(), encoded).map(|index| (index, encoded.len())) + } + + #[inline] + fn rfind_in(&mut self, haystack: &JavaStr) -> Option<(usize, usize)> { + let mut encoded = [0; 4]; + let encoded = self.encode_utf8(&mut encoded).as_bytes(); + rfind(haystack.as_bytes(), encoded).map(|index| (index, encoded.len())) + } +} + +unsafe impl JavaStrPattern for JavaCodePoint { + #[inline] + fn prefix_len_in(&mut self, haystack: &JavaStr) -> Option { + let ch = haystack.chars().next()?; + if ch == *self { + Some(ch.len_utf8()) + } else { + None + } + } + + #[inline] + fn suffix_len_in(&mut self, haystack: &JavaStr) -> Option { + let ch = haystack.chars().next_back()?; + if ch == *self { + Some(ch.len_utf8()) + } else { + None + } + } + + #[inline] + fn find_in(&mut self, haystack: &JavaStr) -> Option<(usize, usize)> { + let mut encoded = [0; 4]; + let encoded = self.encode_semi_utf8(&mut encoded); + find(haystack.as_bytes(), encoded).map(|index| (index, encoded.len())) + } + + #[inline] + fn rfind_in(&mut self, haystack: &JavaStr) -> Option<(usize, usize)> { + let mut encoded = [0; 4]; + let encoded = self.encode_semi_utf8(&mut encoded); + rfind(haystack.as_bytes(), encoded).map(|index| (index, encoded.len())) + } +} + +unsafe impl JavaStrPattern for &str { + #[inline] + fn prefix_len_in(&mut self, 
haystack: &JavaStr) -> Option { + if haystack.as_bytes().starts_with(self.as_bytes()) { + Some(self.len()) + } else { + None + } + } + + #[inline] + fn suffix_len_in(&mut self, haystack: &JavaStr) -> Option { + if haystack.as_bytes().ends_with(self.as_bytes()) { + Some(self.len()) + } else { + None + } + } + + #[inline] + fn find_in(&mut self, haystack: &JavaStr) -> Option<(usize, usize)> { + find(haystack.as_bytes(), self.as_bytes()).map(|index| (index, self.len())) + } + + #[inline] + fn rfind_in(&mut self, haystack: &JavaStr) -> Option<(usize, usize)> { + rfind(haystack.as_bytes(), self.as_bytes()).map(|index| (index, self.len())) + } +} + +unsafe impl JavaStrPattern for &JavaStr { + #[inline] + fn prefix_len_in(&mut self, haystack: &JavaStr) -> Option { + if haystack.as_bytes().starts_with(self.as_bytes()) { + Some(self.len()) + } else { + None + } + } + + #[inline] + fn suffix_len_in(&mut self, haystack: &JavaStr) -> Option { + if haystack.as_bytes().ends_with(self.as_bytes()) { + Some(self.len()) + } else { + None + } + } + + #[inline] + fn find_in(&mut self, haystack: &JavaStr) -> Option<(usize, usize)> { + find(haystack.as_bytes(), self.as_bytes()).map(|index| (index, self.len())) + } + + #[inline] + fn rfind_in(&mut self, haystack: &JavaStr) -> Option<(usize, usize)> { + rfind(haystack.as_bytes(), self.as_bytes()).map(|index| (index, self.len())) + } +} + +unsafe impl JavaStrPattern for F +where + F: FnMut(JavaCodePoint) -> bool, +{ + #[inline] + fn prefix_len_in(&mut self, haystack: &JavaStr) -> Option { + let ch = haystack.chars().next()?; + if self(ch) { + Some(ch.len_utf8()) + } else { + None + } + } + + #[inline] + fn suffix_len_in(&mut self, haystack: &JavaStr) -> Option { + let ch = haystack.chars().next_back()?; + if self(ch) { + Some(ch.len_utf8()) + } else { + None + } + } + + #[inline] + fn find_in(&mut self, haystack: &JavaStr) -> Option<(usize, usize)> { + haystack + .char_indices() + .find(|(_, ch)| self(*ch)) + .map(|(index, ch)| (index, 
ch.len_utf8())) + } + + #[inline] + fn rfind_in(&mut self, haystack: &JavaStr) -> Option<(usize, usize)> { + haystack + .char_indices() + .rfind(|(_, ch)| self(*ch)) + .map(|(index, ch)| (index, ch.len_utf8())) + } +} + +unsafe impl JavaStrPattern for &[char] { + #[inline] + fn prefix_len_in(&mut self, haystack: &JavaStr) -> Option { + let ch = haystack.chars().next()?; + if self.iter().any(|c| ch == *c) { + Some(ch.len_utf8()) + } else { + None + } + } + + #[inline] + fn suffix_len_in(&mut self, haystack: &JavaStr) -> Option { + let ch = haystack.chars().next_back()?; + if self.iter().any(|c| ch == *c) { + Some(ch.len_utf8()) + } else { + None + } + } + + #[inline] + fn find_in(&mut self, haystack: &JavaStr) -> Option<(usize, usize)> { + haystack + .char_indices() + .find(|(_, ch)| self.iter().any(|c| *ch == *c)) + .map(|(index, ch)| (index, ch.len_utf8())) + } + + #[inline] + fn rfind_in(&mut self, haystack: &JavaStr) -> Option<(usize, usize)> { + haystack + .char_indices() + .rfind(|(_, ch)| self.iter().any(|c| *ch == *c)) + .map(|(index, ch)| (index, ch.len_utf8())) + } +} + +unsafe impl JavaStrPattern for &[JavaCodePoint] { + #[inline] + fn prefix_len_in(&mut self, haystack: &JavaStr) -> Option { + let ch = haystack.chars().next()?; + if self.contains(&ch) { + Some(ch.len_utf8()) + } else { + None + } + } + + #[inline] + fn suffix_len_in(&mut self, haystack: &JavaStr) -> Option { + let ch = haystack.chars().next_back()?; + if self.contains(&ch) { + Some(ch.len_utf8()) + } else { + None + } + } + + #[inline] + fn find_in(&mut self, haystack: &JavaStr) -> Option<(usize, usize)> { + haystack + .char_indices() + .find(|(_, ch)| self.contains(ch)) + .map(|(index, ch)| (index, ch.len_utf8())) + } + + #[inline] + fn rfind_in(&mut self, haystack: &JavaStr) -> Option<(usize, usize)> { + haystack + .char_indices() + .rfind(|(_, ch)| self.contains(ch)) + .map(|(index, ch)| (index, ch.len_utf8())) + } +} + +unsafe impl JavaStrPattern for &char { + #[inline] + fn 
prefix_len_in(&mut self, haystack: &JavaStr) -> Option { + let mut ch = **self; + ch.prefix_len_in(haystack) + } + + #[inline] + fn suffix_len_in(&mut self, haystack: &JavaStr) -> Option { + let mut ch = **self; + ch.suffix_len_in(haystack) + } + + #[inline] + fn find_in(&mut self, haystack: &JavaStr) -> Option<(usize, usize)> { + let mut ch = **self; + ch.find_in(haystack) + } + + #[inline] + fn rfind_in(&mut self, haystack: &JavaStr) -> Option<(usize, usize)> { + let mut ch = **self; + ch.rfind_in(haystack) + } +} + +unsafe impl JavaStrPattern for &JavaCodePoint { + #[inline] + fn prefix_len_in(&mut self, haystack: &JavaStr) -> Option { + let mut ch = **self; + ch.prefix_len_in(haystack) + } + + #[inline] + fn suffix_len_in(&mut self, haystack: &JavaStr) -> Option { + let mut ch = **self; + ch.suffix_len_in(haystack) + } + + #[inline] + fn find_in(&mut self, haystack: &JavaStr) -> Option<(usize, usize)> { + let mut ch = **self; + ch.find_in(haystack) + } + + #[inline] + fn rfind_in(&mut self, haystack: &JavaStr) -> Option<(usize, usize)> { + let mut ch = **self; + ch.rfind_in(haystack) + } +} + +unsafe impl JavaStrPattern for &&str { + #[inline] + fn prefix_len_in(&mut self, haystack: &JavaStr) -> Option { + let mut str = **self; + str.prefix_len_in(haystack) + } + + #[inline] + fn suffix_len_in(&mut self, haystack: &JavaStr) -> Option { + let mut str = **self; + str.suffix_len_in(haystack) + } + + #[inline] + fn find_in(&mut self, haystack: &JavaStr) -> Option<(usize, usize)> { + let mut str = **self; + str.find_in(haystack) + } + + #[inline] + fn rfind_in(&mut self, haystack: &JavaStr) -> Option<(usize, usize)> { + let mut str = **self; + str.rfind_in(haystack) + } +} + +unsafe impl JavaStrPattern for &&JavaStr { + #[inline] + fn prefix_len_in(&mut self, haystack: &JavaStr) -> Option { + let mut str = **self; + str.prefix_len_in(haystack) + } + + #[inline] + fn suffix_len_in(&mut self, haystack: &JavaStr) -> Option { + let mut str = **self; + 
str.suffix_len_in(haystack) + } + + #[inline] + fn find_in(&mut self, haystack: &JavaStr) -> Option<(usize, usize)> { + let mut str = **self; + str.find_in(haystack) + } + + #[inline] + fn rfind_in(&mut self, haystack: &JavaStr) -> Option<(usize, usize)> { + let mut str = **self; + str.rfind_in(haystack) + } +} + +#[inline] +fn find(haystack: &[u8], needle: &[u8]) -> Option { + if needle.is_empty() { + return Some(0); + } + haystack + .windows(needle.len()) + .position(|window| window == needle) +} + +#[inline] +fn rfind(haystack: &[u8], needle: &[u8]) -> Option { + if needle.is_empty() { + return Some(haystack.len()); + } + haystack + .windows(needle.len()) + .rposition(|window| window == needle) +} diff --git a/crates/valence_java_string/src/serde.rs b/crates/valence_java_string/src/serde.rs new file mode 100644 index 000000000..71e31c173 --- /dev/null +++ b/crates/valence_java_string/src/serde.rs @@ -0,0 +1,43 @@ +use serde::{Deserialize, Deserializer, Serialize, Serializer}; + +use crate::{JavaStr, JavaString}; + +impl Serialize for JavaString { + #[inline] + fn serialize(&self, serializer: S) -> Result + where + S: Serializer, + { + self.as_str_lossy().serialize(serializer) + } +} + +impl<'de> Deserialize<'de> for JavaString { + #[inline] + fn deserialize(deserializer: D) -> Result + where + D: Deserializer<'de>, + { + String::deserialize(deserializer).map(JavaString::from) + } +} + +impl Serialize for JavaStr { + #[inline] + fn serialize(&self, serializer: S) -> Result + where + S: Serializer, + { + self.as_str_lossy().serialize(serializer) + } +} + +impl<'de: 'a, 'a> Deserialize<'de> for &'a JavaStr { + #[inline] + fn deserialize(deserializer: D) -> Result + where + D: Deserializer<'de>, + { + <&'a str>::deserialize(deserializer).map(JavaStr::from_str) + } +} diff --git a/crates/valence_java_string/src/slice.rs b/crates/valence_java_string/src/slice.rs new file mode 100644 index 000000000..8cc1e3fda --- /dev/null +++ b/crates/valence_java_string/src/slice.rs 
@@ -0,0 +1,1488 @@ +use std::borrow::Cow; +use std::collections::Bound; +use std::fmt::{Debug, Display, Formatter, Write}; +use std::hash::{Hash, Hasher}; +use std::ops::{ + Add, AddAssign, Index, IndexMut, Range, RangeBounds, RangeFrom, RangeFull, RangeInclusive, + RangeTo, RangeToInclusive, +}; +use std::rc::Rc; +use std::str::FromStr; +use std::sync::Arc; +use std::{ptr, slice}; + +use crate::char::EscapeDebugExtArgs; +use crate::validations::{ + run_utf8_full_validation_from_semi, run_utf8_semi_validation, slice_error_fail, + str_end_index_overflow_fail, +}; +use crate::{ + Bytes, CharEscapeIter, CharIndices, Chars, EscapeDebug, EscapeDefault, EscapeUnicode, + JavaCodePoint, JavaStrPattern, JavaString, Lines, MatchIndices, Matches, ParseError, + RMatchIndices, RMatches, RSplit, RSplitN, RSplitTerminator, Split, SplitAsciiWhitespace, + SplitInclusive, SplitN, SplitTerminator, SplitWhitespace, Utf8Error, +}; + +#[repr(transparent)] +#[derive(PartialEq, Eq, PartialOrd, Ord)] +pub struct JavaStr { + inner: [u8], +} + +impl JavaStr { + #[inline] + pub const fn from_utf8(v: &[u8]) -> Result<&JavaStr, Utf8Error> { + match std::str::from_utf8(v) { + Ok(str) => Ok(JavaStr::from_str(str)), + Err(err) => Err(Utf8Error::from_std(err)), + } + } + + #[inline] + pub fn from_utf8_mut(v: &mut [u8]) -> Result<&mut JavaStr, Utf8Error> { + match std::str::from_utf8_mut(v) { + Ok(str) => Ok(JavaStr::from_mut_str(str)), + Err(err) => Err(Utf8Error::from_std(err)), + } + } + + pub fn from_semi_utf8(v: &[u8]) -> Result<&JavaStr, Utf8Error> { + match run_utf8_semi_validation(v) { + Ok(()) => Ok(unsafe { JavaStr::from_semi_utf8_unchecked(v) }), + Err(err) => Err(err), + } + } + + pub fn from_semi_utf8_mut(v: &mut [u8]) -> Result<&mut JavaStr, Utf8Error> { + match run_utf8_semi_validation(v) { + Ok(()) => Ok(unsafe { JavaStr::from_semi_utf8_unchecked_mut(v) }), + Err(err) => Err(err), + } + } + + /// # Safety + /// + /// The parameter must be in semi-valid UTF-8 format, that is, UTF-8 
plus + /// surrogate code points. + #[inline] + #[must_use] + pub const unsafe fn from_semi_utf8_unchecked(v: &[u8]) -> &JavaStr { + // SAFETY: the caller must guarantee that the bytes `v` are valid UTF-8, except + // that surrogate code points are allowed. Also relies on `&JavaStr` and `&[u8]` + // having the same layout. + std::mem::transmute(v) + } + + /// # Safety + /// + /// The parameter must be in semi-valid UTF-8 format, that is, UTF-8 plus + /// surrogate code points. + #[inline] + #[must_use] + pub unsafe fn from_semi_utf8_unchecked_mut(v: &mut [u8]) -> &mut JavaStr { + // SAFETY: see from_semi_utf8_unchecked + std::mem::transmute(v) + } + + #[inline] + #[must_use] + pub const fn from_str(str: &str) -> &JavaStr { + unsafe { + // SAFETY: the input str is guaranteed to have valid UTF-8. + JavaStr::from_semi_utf8_unchecked(str.as_bytes()) + } + } + + #[inline] + #[must_use] + pub fn from_mut_str(str: &mut str) -> &mut JavaStr { + unsafe { + // SAFETY: the input str is guaranteed to have valid UTF-8. + JavaStr::from_semi_utf8_unchecked_mut(str.as_bytes_mut()) + } + } + + #[inline] + #[must_use] + pub fn from_boxed_str(v: Box) -> Box { + unsafe { JavaStr::from_boxed_semi_utf8_unchecked(v.into_boxed_bytes()) } + } + + /// # Safety + /// + /// The parameter must be in semi-valid UTF-8 format, that is, UTF-8 plus + /// surrogate code points. + #[inline] + #[must_use] + pub unsafe fn from_boxed_semi_utf8_unchecked(v: Box<[u8]>) -> Box { + unsafe { Box::from_raw(Box::into_raw(v) as *mut JavaStr) } + } + + #[inline] + #[must_use] + pub const fn as_bytes(&self) -> &[u8] { + &self.inner + } + + /// # Safety + /// + /// The returned slice must not have invalid UTF-8 written to it, besides + /// surrogate code points.
+ #[inline] + #[must_use] + pub unsafe fn as_bytes_mut(&mut self) -> &mut [u8] { + &mut self.inner + } + + #[inline] + #[must_use] + pub fn as_mut_ptr(&mut self) -> *mut u8 { + self.inner.as_mut_ptr() + } + + #[inline] + #[must_use] + pub const fn as_ptr(&self) -> *const u8 { + self.inner.as_ptr() + } + + pub const fn as_str(&self) -> Result<&str, Utf8Error> { + // Manual implementation of Result::map since it's not const + match run_utf8_full_validation_from_semi(self.as_bytes()) { + Ok(..) => unsafe { + // SAFETY: we were already semi-valid, and full validation just succeeded. + Ok(self.as_str_unchecked()) + }, + Err(err) => Err(err), + } + } + + /// # Safety + /// + /// This string must be fully valid UTF-8, i.e. have no surrogate code + /// points. + #[inline] + #[must_use] + pub const unsafe fn as_str_unchecked(&self) -> &str { + std::str::from_utf8_unchecked(self.as_bytes()) + } + + #[must_use] + pub fn as_str_lossy(&self) -> Cow<'_, str> { + match run_utf8_full_validation_from_semi(self.as_bytes()) { + Ok(()) => unsafe { + // SAFETY: validation succeeded + Cow::Borrowed(self.as_str_unchecked()) + }, + Err(error) => unsafe { + // SAFETY: invalid parts of string are converted to replacement char + Cow::Owned( + self.transform_invalid_string(error, str::to_owned, |_| { + JavaStr::from_str("\u{FFFD}") + }) + .into_string_unchecked(), + ) + }, + } + } + + #[inline] + pub fn bytes(&self) -> Bytes<'_> { + Bytes { + inner: self.inner.iter().copied(), + } + } + + #[inline] + pub fn char_indices(&self) -> CharIndices<'_> { + CharIndices { + front_offset: 0, + inner: self.chars(), + } + } + + #[inline] + pub fn chars(&self) -> Chars<'_> { + Chars { + inner: self.inner.iter(), + } + } + + #[inline] + #[must_use] + pub fn contains

(&self, mut pat: P) -> bool + where + P: JavaStrPattern, + { + pat.find_in(self).is_some() + } + + #[inline] + #[must_use] + pub fn ends_with

(&self, mut pat: P) -> bool + where + P: JavaStrPattern, + { + pat.suffix_len_in(self).is_some() + } + + #[inline] + #[must_use] + pub fn eq_ignore_ascii_case(&self, other: &str) -> bool { + self.as_bytes().eq_ignore_ascii_case(other.as_bytes()) + } + + #[inline] + #[must_use] + pub fn eq_java_ignore_ascii_case(&self, other: &JavaStr) -> bool { + self.as_bytes().eq_ignore_ascii_case(other.as_bytes()) + } + + #[inline] + pub fn escape_debug(&self) -> EscapeDebug<'_> { + #[inline] + fn escape_first(first: JavaCodePoint) -> CharEscapeIter { + first.escape_debug_ext(EscapeDebugExtArgs::ESCAPE_ALL) + } + #[inline] + fn escape_rest(char: JavaCodePoint) -> CharEscapeIter { + char.escape_debug_ext(EscapeDebugExtArgs { + escape_single_quote: true, + escape_double_quote: true, + }) + } + + let mut chars = self.chars(); + EscapeDebug { + inner: chars + .next() + .map(escape_first as fn(JavaCodePoint) -> CharEscapeIter) + .into_iter() + .flatten() + .chain(chars.flat_map(escape_rest as fn(JavaCodePoint) -> CharEscapeIter)), + } + } + + #[inline] + pub fn escape_default(&self) -> EscapeDefault<'_> { + EscapeDefault { + inner: self.chars().flat_map(JavaCodePoint::escape_default), + } + } + + #[inline] + pub fn escape_unicode(&self) -> EscapeUnicode<'_> { + EscapeUnicode { + inner: self.chars().flat_map(JavaCodePoint::escape_unicode), + } + } + + #[inline] + #[must_use] + pub fn find

(&self, mut pat: P) -> Option + where + P: JavaStrPattern, + { + pat.find_in(self).map(|(index, _)| index) + } + + #[inline] + #[must_use] + pub fn get(&self, i: I) -> Option<&JavaStr> + where + I: JavaStrSliceIndex, + { + i.get(self) + } + + #[inline] + #[must_use] + pub fn get_mut(&mut self, i: I) -> Option<&mut JavaStr> + where + I: JavaStrSliceIndex, + { + i.get_mut(self) + } + + /// # Safety + /// + /// - The starting index must not exceed the ending index + /// - Indexes must be within bounds of the original slice + /// - Indexes must lie on UTF-8 sequence boundaries + #[inline] + #[must_use] + pub unsafe fn get_unchecked(&self, i: I) -> &JavaStr + where + I: JavaStrSliceIndex, + { + unsafe { &*i.get_unchecked(self) } + } + + /// # Safety + /// + /// - The starting index must not exceed the ending index + /// - Indexes must be within bounds of the original slice + /// - Indexes must lie on UTF-8 sequence boundaries + #[inline] + #[must_use] + pub unsafe fn get_unchecked_mut(&mut self, i: I) -> &mut JavaStr + where + I: JavaStrSliceIndex, + { + unsafe { &mut *i.get_unchecked_mut(self) } + } + + #[inline] + #[must_use] + pub fn into_boxed_bytes(self: Box) -> Box<[u8]> { + unsafe { Box::from_raw(Box::into_raw(self) as *mut [u8]) } + } + + #[inline] + #[must_use] + pub fn into_string(self: Box) -> JavaString { + let slice = self.into_boxed_bytes(); + unsafe { JavaString::from_semi_utf8_unchecked(slice.into_vec()) } + } + + #[inline] + #[must_use] + pub fn is_ascii(&self) -> bool { + self.as_bytes().is_ascii() + } + + #[inline] + #[must_use] + pub fn is_char_boundary(&self, index: usize) -> bool { + // 0 is always ok. + // Test for 0 explicitly so that it can optimize out the check + // easily and skip reading string data for that case. + // Note that optimizing `self.get(..index)` relies on this. 
+ if index == 0 { + return true; + } + + match self.as_bytes().get(index) { + // For `None` we have two options: + // + // - index == self.len() Empty strings are valid, so return true + // - index > self.len() In this case return false + // + // The check is placed exactly here, because it improves generated + // code on higher opt-levels. See https://github.com/rust-lang/rust/pull/84751 for more details. + None => index == self.len(), + + Some(&b) => { + // This is bit magic equivalent to: b < 128 || b >= 192 + (b as i8) >= -0x40 + } + } + } + + pub(crate) fn floor_char_boundary(&self, index: usize) -> usize { + if index >= self.len() { + self.len() + } else { + let lower_bound = index.saturating_sub(3); + let new_index = self.as_bytes()[lower_bound..=index].iter().rposition(|b| { + // This is bit magic equivalent to: b < 128 || b >= 192 + (*b as i8) >= -0x40 + }); + + // SAFETY: we know that the character boundary will be within four bytes + unsafe { lower_bound + new_index.unwrap_unchecked() } + } + } + + #[inline] + #[must_use] + pub fn is_empty(&self) -> bool { + self.len() == 0 + } + + #[inline] + #[must_use] + pub fn len(&self) -> usize { + self.inner.len() + } + + #[inline] + pub fn lines(&self) -> Lines<'_> { + Lines { + inner: self.split_inclusive('\n').map(|line| { + let Some(line) = line.strip_suffix('\n') else { + return line; + }; + let Some(line) = line.strip_suffix('\r') else { + return line; + }; + line + }), + } + } + + #[inline] + pub fn make_ascii_lowercase(&mut self) { + // SAFETY: changing ASCII letters only does not invalidate UTF-8. + let me = unsafe { self.as_bytes_mut() }; + me.make_ascii_lowercase() + } + + #[inline] + pub fn make_ascii_uppercase(&mut self) { + // SAFETY: changing ASCII letters only does not invalidate UTF-8. + let me = unsafe { self.as_bytes_mut() }; + me.make_ascii_uppercase() + } + + #[inline] + pub fn match_indices

(&self, pat: P) -> MatchIndices

+ where + P: JavaStrPattern, + { + MatchIndices { + str: self, + start: 0, + pat, + } + } + + #[inline] + pub fn matches

(&self, pat: P) -> Matches

+ where + P: JavaStrPattern, + { + Matches { str: self, pat } + } + + #[inline] + pub fn parse(&self) -> Result::Err>> + where + F: FromStr, + { + match self.as_str() { + Ok(str) => str.parse().map_err(ParseError::Err), + Err(err) => Err(ParseError::InvalidUtf8(err)), + } + } + + #[inline] + #[must_use] + pub fn repeat(&self, n: usize) -> JavaString { + unsafe { JavaString::from_semi_utf8_unchecked(self.as_bytes().repeat(n)) } + } + + #[inline] + #[must_use] + pub fn replace

(&self, from: P, to: &str) -> JavaString + where + P: JavaStrPattern, + { + self.replace_java(from, JavaStr::from_str(to)) + } + + #[inline] + #[must_use] + pub fn replace_java

(&self, from: P, to: &JavaStr) -> JavaString + where + P: JavaStrPattern, + { + let mut result = JavaString::new(); + let mut last_end = 0; + for (start, part) in self.match_indices(from) { + result.push_java_str(unsafe { self.get_unchecked(last_end..start) }); + result.push_java_str(to); + last_end = start + part.len(); + } + result.push_java_str(unsafe { self.get_unchecked(last_end..self.len()) }); + result + } + + #[inline] + #[must_use] + pub fn replacen

(&self, from: P, to: &str, count: usize) -> JavaString + where + P: JavaStrPattern, + { + self.replacen_java(from, JavaStr::from_str(to), count) + } + + #[inline] + #[must_use] + pub fn replacen_java

(&self, from: P, to: &JavaStr, count: usize) -> JavaString + where + P: JavaStrPattern, + { + // Pre-allocate a small buffer to reduce the number of reallocations + let mut result = JavaString::with_capacity(32); + let mut last_end = 0; + for (start, part) in self.match_indices(from).take(count) { + result.push_java_str(unsafe { self.get_unchecked(last_end..start) }); + result.push_java_str(to); + last_end = start + part.len(); + } + result.push_java_str(unsafe { self.get_unchecked(last_end..self.len()) }); + result + } + + #[inline] + #[must_use] + pub fn rfind

(&self, mut pat: P) -> Option + where + P: JavaStrPattern, + { + pat.rfind_in(self).map(|(index, _)| index) + } + + #[inline] + pub fn rmatch_indices

(&self, pat: P) -> RMatchIndices

+ where + P: JavaStrPattern, + { + RMatchIndices { + inner: self.match_indices(pat), + } + } + + #[inline] + pub fn rmatches

(&self, pat: P) -> RMatches

+ where + P: JavaStrPattern, + { + RMatches { + inner: self.matches(pat), + } + } + + #[inline] + pub fn rsplit

(&self, pat: P) -> RSplit

+ where + P: JavaStrPattern, + { + RSplit::new(self, pat) + } + + #[inline] + #[must_use] + pub fn rsplit_once

(&self, mut delimiter: P) -> Option<(&JavaStr, &JavaStr)> + where + P: JavaStrPattern, + { + let (index, len) = delimiter.rfind_in(self)?; + // SAFETY: pattern is known to return valid indices. + unsafe { + Some(( + self.get_unchecked(..index), + self.get_unchecked(index + len..), + )) + } + } + + #[inline] + pub fn rsplit_terminator

(&self, pat: P) -> RSplitTerminator

+ where + P: JavaStrPattern, + { + RSplitTerminator::new(self, pat) + } + + #[inline] + pub fn rsplitn

(&self, n: usize, pat: P) -> RSplitN

+ where + P: JavaStrPattern, + { + RSplitN::new(self, pat, n) + } + + #[inline] + pub fn split

(&self, pat: P) -> Split

+ where + P: JavaStrPattern, + { + Split::new(self, pat) + } + + #[inline] + pub fn split_ascii_whitespace(&self) -> SplitAsciiWhitespace<'_> { + #[inline] + fn is_non_empty(bytes: &&[u8]) -> bool { + !bytes.is_empty() + } + + SplitAsciiWhitespace { + inner: self + .as_bytes() + .split(u8::is_ascii_whitespace as fn(&u8) -> bool) + .filter(is_non_empty as fn(&&[u8]) -> bool) + .map(|bytes| unsafe { JavaStr::from_semi_utf8_unchecked(bytes) }), + } + } + + #[inline] + #[must_use] + pub fn split_at(&self, mid: usize) -> (&JavaStr, &JavaStr) { + // is_char_boundary checks that the index is in [0, .len()] + if self.is_char_boundary(mid) { + // SAFETY: just checked that `mid` is on a char boundary. + unsafe { + ( + self.get_unchecked(0..mid), + self.get_unchecked(mid..self.len()), + ) + } + } else { + slice_error_fail(self, 0, mid) + } + } + + #[inline] + #[must_use] + pub fn split_at_mut(&mut self, mid: usize) -> (&mut JavaStr, &mut JavaStr) { + // is_char_boundary checks that the index is in [0, .len()] + if self.is_char_boundary(mid) { + let len = self.len(); + let ptr = self.as_mut_ptr(); + // SAFETY: just checked that `mid` is on a char boundary. + unsafe { + ( + JavaStr::from_semi_utf8_unchecked_mut(slice::from_raw_parts_mut(ptr, mid)), + JavaStr::from_semi_utf8_unchecked_mut(slice::from_raw_parts_mut( + ptr.add(mid), + len - mid, + )), + ) + } + } else { + slice_error_fail(self, 0, mid) + } + } + + #[inline] + pub fn split_inclusive

(&self, pat: P) -> SplitInclusive

+ where + P: JavaStrPattern, + { + SplitInclusive::new(self, pat) + } + + #[inline] + #[must_use] + pub fn split_once

(&self, mut delimiter: P) -> Option<(&JavaStr, &JavaStr)> + where + P: JavaStrPattern, + { + let (index, len) = delimiter.find_in(self)?; + // SAFETY: pattern is known to return valid indices. + unsafe { + Some(( + self.get_unchecked(..index), + self.get_unchecked(index + len..), + )) + } + } + + #[inline] + pub fn split_terminator

(&self, pat: P) -> SplitTerminator

+ where + P: JavaStrPattern, + { + SplitTerminator::new(self, pat) + } + + #[inline] + pub fn split_whitespace(&self) -> SplitWhitespace<'_> { + SplitWhitespace { + inner: self + .split(JavaCodePoint::is_whitespace as fn(JavaCodePoint) -> bool) + .filter(|str| !str.is_empty()), + } + } + + #[inline] + pub fn splitn

(&self, n: usize, pat: P) -> SplitN

+ where + P: JavaStrPattern, + { + SplitN::new(self, pat, n) + } + + #[inline] + #[must_use] + pub fn starts_with

(&self, mut pat: P) -> bool + where + P: JavaStrPattern, + { + pat.prefix_len_in(self).is_some() + } + + #[inline] + #[must_use] + pub fn strip_prefix

(&self, mut prefix: P) -> Option<&JavaStr> + where + P: JavaStrPattern, + { + let len = prefix.prefix_len_in(self)?; + // SAFETY: pattern is known to return valid indices. + unsafe { Some(self.get_unchecked(len..)) } + } + + #[inline] + #[must_use] + pub fn strip_suffix

(&self, mut suffix: P) -> Option<&JavaStr> + where + P: JavaStrPattern, + { + let len = suffix.suffix_len_in(self)?; + // SAFETY: pattern is known to return valid indices. + unsafe { Some(self.get_unchecked(..self.len() - len)) } + } + + #[inline] + #[must_use] + pub fn to_ascii_lowercase(&self) -> JavaString { + let mut s = self.to_owned(); + s.make_ascii_lowercase(); + s + } + + #[inline] + #[must_use] + pub fn to_ascii_uppercase(&self) -> JavaString { + let mut s = self.to_owned(); + s.make_ascii_uppercase(); + s + } + + #[inline] + #[must_use] + pub fn to_lowercase(&self) -> JavaString { + self.transform_string(str::to_lowercase, |ch| ch) + } + + #[inline] + #[must_use] + pub fn to_uppercase(&self) -> JavaString { + self.transform_string(str::to_uppercase, |ch| ch) + } + + #[inline] + #[must_use] + pub fn trim(&self) -> &JavaStr { + self.trim_matches(|c: JavaCodePoint| c.is_whitespace()) + } + + #[inline] + #[must_use] + pub fn trim_end(&self) -> &JavaStr { + self.trim_end_matches(|c: JavaCodePoint| c.is_whitespace()) + } + + #[inline] + #[must_use] + pub fn trim_end_matches

(&self, mut pat: P) -> &JavaStr + where + P: JavaStrPattern, + { + let mut str = self; + while let Some(suffix_len) = pat.suffix_len_in(str) { + if suffix_len == 0 { + break; + } + // SAFETY: pattern is known to return valid indices. + str = unsafe { str.get_unchecked(..str.len() - suffix_len) }; + } + str + } + + #[inline] + #[must_use] + pub fn trim_matches

(&self, mut pat: P) -> &JavaStr + where + P: JavaStrPattern, + { + let mut str = self; + while let Some(prefix_len) = pat.prefix_len_in(str) { + if prefix_len == 0 { + break; + } + // SAFETY: pattern is known to return valid indices. + str = unsafe { str.get_unchecked(prefix_len..) }; + } + while let Some(suffix_len) = pat.suffix_len_in(str) { + if suffix_len == 0 { + break; + } + // SAFETY: pattern is known to return valid indices. + str = unsafe { str.get_unchecked(..str.len() - suffix_len) }; + } + str + } + + #[inline] + #[must_use] + pub fn trim_start(&self) -> &JavaStr { + self.trim_start_matches(|c: JavaCodePoint| c.is_whitespace()) + } + + #[inline] + #[must_use] + pub fn trim_start_matches

(&self, mut pat: P) -> &JavaStr + where + P: JavaStrPattern, + { + let mut str = self; + while let Some(prefix_len) = pat.prefix_len_in(str) { + if prefix_len == 0 { + break; + } + // SAFETY: pattern is known to return valid indices. + str = unsafe { str.get_unchecked(prefix_len..) }; + } + str + } + + #[inline] + fn transform_string( + &self, + mut string_transformer: SF, + invalid_char_transformer: ICF, + ) -> JavaString + where + SF: FnMut(&str) -> String, + ICF: FnMut(&JavaStr) -> &JavaStr, + { + let bytes = self.as_bytes(); + match run_utf8_full_validation_from_semi(bytes) { + Ok(()) => JavaString::from(string_transformer(unsafe { + // SAFETY: validation succeeded + std::str::from_utf8_unchecked(bytes) + })), + Err(error) => { + self.transform_invalid_string(error, string_transformer, invalid_char_transformer) + } + } + } + + #[inline] + fn transform_invalid_string( + &self, + error: Utf8Error, + mut string_transformer: SF, + mut invalid_char_transformer: ICF, + ) -> JavaString + where + SF: FnMut(&str) -> String, + ICF: FnMut(&JavaStr) -> &JavaStr, + { + let bytes = self.as_bytes(); + let mut result = JavaString::from(string_transformer(unsafe { + // SAFETY: validation succeeded up to this index + std::str::from_utf8_unchecked(bytes.get_unchecked(..error.valid_up_to)) + })); + result.push_java_str(invalid_char_transformer(unsafe { + // SAFETY: any UTF-8 error in semi-valid UTF-8 is a 3 byte long sequence + // representing a surrogate code point. We're pushing that sequence now + JavaStr::from_semi_utf8_unchecked( + bytes.get_unchecked(error.valid_up_to..error.valid_up_to + 3), + ) + })); + let mut index = error.valid_up_to + 3; + loop { + let remainder = unsafe { bytes.get_unchecked(index..) 
}; + match run_utf8_full_validation_from_semi(remainder) { + Ok(()) => { + result.push_str(&string_transformer(unsafe { + // SAFETY: validation succeeded + std::str::from_utf8_unchecked(remainder) + })); + return result; + } + Err(error) => { + result.push_str(&string_transformer(unsafe { + // SAFETY: validation succeeded up to this index + std::str::from_utf8_unchecked( + bytes.get_unchecked(index..index + error.valid_up_to), + ) + })); + result.push_java_str(invalid_char_transformer(unsafe { + // SAFETY: see comment above + JavaStr::from_semi_utf8_unchecked(bytes.get_unchecked( + index + error.valid_up_to..index + error.valid_up_to + 3, + )) + })); + index += error.valid_up_to + 3; + } + } + } + } +} + +impl<'a> Add<&JavaStr> for Cow<'a, JavaStr> { + type Output = Cow<'a, JavaStr>; + + #[inline] + fn add(mut self, rhs: &JavaStr) -> Self::Output { + self += rhs; + self + } +} + +impl<'a> AddAssign<&JavaStr> for Cow<'a, JavaStr> { + #[inline] + fn add_assign(&mut self, rhs: &JavaStr) { + if !rhs.is_empty() { + match self { + Cow::Borrowed(lhs) => { + let mut result = lhs.to_owned(); + result.push_java_str(rhs); + *self = Cow::Owned(result); + } + Cow::Owned(lhs) => { + lhs.push_java_str(rhs); + } + } + } + } +} + +impl AsRef<[u8]> for JavaStr { + #[inline] + fn as_ref(&self) -> &[u8] { + self.as_bytes() + } +} + +impl Clone for Box { + #[inline] + fn clone(&self) -> Self { + let buf: Box<[u8]> = self.as_bytes().into(); + unsafe { JavaStr::from_boxed_semi_utf8_unchecked(buf) } + } +} + +impl Debug for JavaStr { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + f.write_char('"')?; + let mut from = 0; + for (i, c) in self.char_indices() { + let esc = c.escape_debug_ext(EscapeDebugExtArgs { + escape_single_quote: false, + escape_double_quote: true, + }); + // If char needs escaping, flush backlog so far and write, else skip. 
+ // Also handle invalid UTF-8 here + if esc.len() != 1 || c.as_char().is_none() { + unsafe { + // SAFETY: any invalid UTF-8 should have been caught by a previous iteration + f.write_str(self[from..i].as_str_unchecked())?; + } + for c in esc { + f.write_char(c)?; + } + from = i + c.len_utf8(); + } + } + unsafe { + // SAFETY: any invalid UTF-8 should have been caught by the loop above + f.write_str(self[from..].as_str_unchecked())?; + } + f.write_char('"') + } +} + +impl Default for &JavaStr { + #[inline] + fn default() -> Self { + JavaStr::from_str("") + } +} + +impl Default for Box { + #[inline] + fn default() -> Self { + JavaStr::from_boxed_str(Box::::default()) + } +} + +impl Display for JavaStr { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + Display::fmt(&self.as_str_lossy(), f) + } +} + +impl<'a> From<&'a JavaStr> for Cow<'a, JavaStr> { + #[inline] + fn from(value: &'a JavaStr) -> Self { + Cow::Borrowed(value) + } +} + +impl From<&JavaStr> for Arc { + #[inline] + fn from(value: &JavaStr) -> Self { + let arc = Arc::<[u8]>::from(value.as_bytes()); + unsafe { Arc::from_raw(Arc::into_raw(arc) as *const JavaStr) } + } +} + +impl From<&JavaStr> for Box { + #[inline] + fn from(value: &JavaStr) -> Self { + unsafe { JavaStr::from_boxed_semi_utf8_unchecked(Box::from(value.as_bytes())) } + } +} + +impl From<&JavaStr> for Rc { + #[inline] + fn from(value: &JavaStr) -> Self { + let rc = Rc::<[u8]>::from(value.as_bytes()); + unsafe { Rc::from_raw(Rc::into_raw(rc) as *const JavaStr) } + } +} + +impl From<&JavaStr> for Vec { + #[inline] + fn from(value: &JavaStr) -> Self { + From::from(value.as_bytes()) + } +} + +impl From> for Box { + #[inline] + fn from(value: Cow<'_, JavaStr>) -> Self { + match value { + Cow::Borrowed(s) => Box::from(s), + Cow::Owned(s) => Box::from(s), + } + } +} + +impl From for Box { + #[inline] + fn from(value: JavaString) -> Self { + value.into_boxed_str() + } +} + +impl<'a> From<&'a str> for &'a JavaStr { + #[inline] + fn from(value: 
&'a str) -> Self { + JavaStr::from_str(value) + } +} + +impl Hash for JavaStr { + #[inline] + fn hash(&self, state: &mut H) { + state.write(self.as_bytes()); + state.write_u8(0xff); + } +} + +impl Index for JavaStr +where + I: JavaStrSliceIndex, +{ + type Output = JavaStr; + + #[inline] + fn index(&self, index: I) -> &Self::Output { + index.index(self) + } +} + +impl IndexMut for JavaStr +where + I: JavaStrSliceIndex, +{ + #[inline] + fn index_mut(&mut self, index: I) -> &mut Self::Output { + index.index_mut(self) + } +} + +impl<'a, 'b> PartialEq<&'b JavaStr> for Cow<'a, str> { + #[inline] + fn eq(&self, other: &&'b JavaStr) -> bool { + self == *other + } +} + +impl<'a, 'b> PartialEq<&'b JavaStr> for Cow<'a, JavaStr> { + #[inline] + fn eq(&self, other: &&'b JavaStr) -> bool { + self == *other + } +} + +impl<'a, 'b> PartialEq> for &'b JavaStr { + #[inline] + fn eq(&self, other: &Cow<'a, str>) -> bool { + *self == other + } +} + +impl<'a> PartialEq> for JavaStr { + #[inline] + fn eq(&self, other: &Cow<'a, str>) -> bool { + other == self + } +} + +impl<'a, 'b> PartialEq> for &'b JavaStr { + #[inline] + fn eq(&self, other: &Cow<'a, JavaStr>) -> bool { + *self == other + } +} + +impl<'a> PartialEq> for JavaStr { + #[inline] + fn eq(&self, other: &Cow<'a, JavaStr>) -> bool { + other == self + } +} + +impl<'a> PartialEq for &'a JavaStr { + #[inline] + fn eq(&self, other: &String) -> bool { + *self == other + } +} + +impl PartialEq for JavaStr { + #[inline] + fn eq(&self, other: &String) -> bool { + self == &other[..] + } +} + +impl PartialEq for String { + #[inline] + fn eq(&self, other: &JavaStr) -> bool { + &self[..] == other + } +} + +impl<'a> PartialEq for &'a JavaStr { + #[inline] + fn eq(&self, other: &JavaString) -> bool { + *self == other + } +} + +impl PartialEq for JavaStr { + #[inline] + fn eq(&self, other: &JavaString) -> bool { + self == other[..] 
+ } +} + +impl<'a> PartialEq for Cow<'a, str> { + #[inline] + fn eq(&self, other: &JavaStr) -> bool { + match self { + Cow::Borrowed(this) => this == other, + Cow::Owned(this) => this == other, + } + } +} + +impl<'a> PartialEq for Cow<'a, JavaStr> { + #[inline] + fn eq(&self, other: &JavaStr) -> bool { + match self { + Cow::Borrowed(this) => this == other, + Cow::Owned(this) => this == other, + } + } +} + +impl PartialEq for str { + #[inline] + fn eq(&self, other: &JavaStr) -> bool { + JavaStr::from_str(self) == other + } +} + +impl<'a> PartialEq for &'a str { + #[inline] + fn eq(&self, other: &JavaStr) -> bool { + *self == other + } +} + +impl PartialEq for JavaStr { + #[inline] + fn eq(&self, other: &str) -> bool { + self == JavaStr::from_str(other) + } +} + +impl<'a> PartialEq<&'a str> for JavaStr { + #[inline] + fn eq(&self, other: &&'a str) -> bool { + self == *other + } +} + +impl<'a> PartialEq for &'a JavaStr { + #[inline] + fn eq(&self, other: &JavaStr) -> bool { + *self == other + } +} + +impl<'a> PartialEq<&'a JavaStr> for JavaStr { + #[inline] + fn eq(&self, other: &&'a JavaStr) -> bool { + self == *other + } +} + +impl ToOwned for JavaStr { + type Owned = JavaString; + + #[inline] + fn to_owned(&self) -> Self::Owned { + unsafe { JavaString::from_semi_utf8_unchecked(self.as_bytes().to_vec()) } + } +} + +mod private_slice_index { + use std::ops; + + pub trait Sealed {} + + impl Sealed for ops::Range {} + impl Sealed for ops::RangeTo {} + impl Sealed for ops::RangeFrom {} + impl Sealed for ops::RangeFull {} + impl Sealed for ops::RangeInclusive {} + impl Sealed for ops::RangeToInclusive {} +} + +/// # Safety +/// +/// Implementations' `check_bounds` method must properly check the bounds of the +/// slice, such that calling `get_unchecked` is not UB. 
+pub unsafe trait JavaStrSliceIndex: private_slice_index::Sealed + Sized { + fn check_bounds(&self, slice: &JavaStr) -> bool; + fn check_bounds_fail(self, slice: &JavaStr) -> !; + + /// # Safety + /// + /// - The input slice must be a valid pointer + /// - This index must not be out of bounds of the input slice + /// - The indices of this slice must point to char boundaries in the input + /// slice + unsafe fn get_unchecked(self, slice: *const JavaStr) -> *const JavaStr; + + /// # Safety + /// + /// - The input slice must be a valid pointer + /// - This index must not be out of bounds of the input slice + /// - The indices of this slice must point to char boundaries in the input + /// slice + unsafe fn get_unchecked_mut(self, slice: *mut JavaStr) -> *mut JavaStr; + + #[inline] + fn get(self, slice: &JavaStr) -> Option<&JavaStr> { + if self.check_bounds(slice) { + Some(unsafe { &*self.get_unchecked(slice) }) + } else { + None + } + } + + #[inline] + fn get_mut(self, slice: &mut JavaStr) -> Option<&mut JavaStr> { + if self.check_bounds(slice) { + Some(unsafe { &mut *self.get_unchecked_mut(slice) }) + } else { + None + } + } + + #[inline] + fn index(self, slice: &JavaStr) -> &JavaStr { + if self.check_bounds(slice) { + unsafe { &*self.get_unchecked(slice) } + } else { + self.check_bounds_fail(slice) + } + } + + #[inline] + fn index_mut(self, slice: &mut JavaStr) -> &mut JavaStr { + if self.check_bounds(slice) { + unsafe { &mut *self.get_unchecked_mut(slice) } + } else { + self.check_bounds_fail(slice) + } + } +} + +unsafe impl JavaStrSliceIndex for RangeFull { + #[inline] + fn check_bounds(&self, _slice: &JavaStr) -> bool { + true + } + + #[inline] + fn check_bounds_fail(self, _slice: &JavaStr) -> ! 
{ + unreachable!() + } + + #[inline] + unsafe fn get_unchecked(self, slice: *const JavaStr) -> *const JavaStr { + slice + } + + #[inline] + unsafe fn get_unchecked_mut(self, slice: *mut JavaStr) -> *mut JavaStr { + slice + } +} + +unsafe impl JavaStrSliceIndex for Range { + #[inline] + fn check_bounds(&self, slice: &JavaStr) -> bool { + self.start <= self.end + && slice.is_char_boundary(self.start) + && slice.is_char_boundary(self.end) + } + + #[inline] + #[track_caller] + fn check_bounds_fail(self, slice: &JavaStr) -> ! { + slice_error_fail(slice, self.start, self.end) + } + + #[inline] + unsafe fn get_unchecked(self, slice: *const JavaStr) -> *const JavaStr { + let slice = slice as *const [u8]; + // SAFETY: the caller guarantees that `self` is in bounds of `slice` + // which satisfies all the conditions for `add`. + let ptr = unsafe { (slice as *const u8).add(self.start) }; + let len = self.end - self.start; + ptr::slice_from_raw_parts(ptr, len) as *const JavaStr + } + + #[inline] + unsafe fn get_unchecked_mut(self, slice: *mut JavaStr) -> *mut JavaStr { + let slice = slice as *mut [u8]; + // SAFETY: see comments for `get_unchecked`. + let ptr = unsafe { (slice as *mut u8).add(self.start) }; + let len = self.end - self.start; + ptr::slice_from_raw_parts_mut(ptr, len) as *mut JavaStr + } +} + +unsafe impl JavaStrSliceIndex for RangeTo { + #[inline] + fn check_bounds(&self, slice: &JavaStr) -> bool { + slice.is_char_boundary(self.end) + } + + #[inline] + #[track_caller] + fn check_bounds_fail(self, slice: &JavaStr) -> ! 
{ + slice_error_fail(slice, 0, self.end) + } + + #[inline] + unsafe fn get_unchecked(self, slice: *const JavaStr) -> *const JavaStr { + unsafe { (0..self.end).get_unchecked(slice) } + } + + #[inline] + unsafe fn get_unchecked_mut(self, slice: *mut JavaStr) -> *mut JavaStr { + unsafe { (0..self.end).get_unchecked_mut(slice) } + } +} + +unsafe impl JavaStrSliceIndex for RangeFrom { + #[inline] + fn check_bounds(&self, slice: &JavaStr) -> bool { + slice.is_char_boundary(self.start) + } + + #[inline] + #[track_caller] + fn check_bounds_fail(self, slice: &JavaStr) -> ! { + slice_error_fail(slice, self.start, slice.len()) + } + + #[inline] + unsafe fn get_unchecked(self, slice: *const JavaStr) -> *const JavaStr { + let len = unsafe { (*(slice as *const [u8])).len() }; + unsafe { (self.start..len).get_unchecked(slice) } + } + + #[inline] + unsafe fn get_unchecked_mut(self, slice: *mut JavaStr) -> *mut JavaStr { + let len = unsafe { (*(slice as *mut [u8])).len() }; + unsafe { (self.start..len).get_unchecked_mut(slice) } + } +} + +#[inline] +fn into_slice_range(range: RangeInclusive) -> Range { + let exclusive_end = *range.end() + 1; + let start = match range.end_bound() { + Bound::Excluded(..) => exclusive_end, // excluded + Bound::Included(..) => *range.start(), + Bound::Unbounded => unreachable!(), + }; + start..exclusive_end +} + +unsafe impl JavaStrSliceIndex for RangeInclusive { + #[inline] + fn check_bounds(&self, slice: &JavaStr) -> bool { + *self.end() != usize::MAX && into_slice_range(self.clone()).check_bounds(slice) + } + + #[inline] + #[track_caller] + fn check_bounds_fail(self, slice: &JavaStr) -> ! 
{ + if *self.end() == usize::MAX { + str_end_index_overflow_fail() + } else { + into_slice_range(self).check_bounds_fail(slice) + } + } + + #[inline] + unsafe fn get_unchecked(self, slice: *const JavaStr) -> *const JavaStr { + into_slice_range(self).get_unchecked(slice) + } + + #[inline] + unsafe fn get_unchecked_mut(self, slice: *mut JavaStr) -> *mut JavaStr { + into_slice_range(self).get_unchecked_mut(slice) + } +} + +unsafe impl JavaStrSliceIndex for RangeToInclusive { + #[inline] + fn check_bounds(&self, slice: &JavaStr) -> bool { + (0..=self.end).check_bounds(slice) + } + + #[inline] + fn check_bounds_fail(self, slice: &JavaStr) -> ! { + (0..=self.end).check_bounds_fail(slice) + } + + #[inline] + unsafe fn get_unchecked(self, slice: *const JavaStr) -> *const JavaStr { + (0..=self.end).get_unchecked(slice) + } + + #[inline] + unsafe fn get_unchecked_mut(self, slice: *mut JavaStr) -> *mut JavaStr { + (0..=self.end).get_unchecked_mut(slice) + } +} diff --git a/crates/valence_java_string/src/validations.rs b/crates/valence_java_string/src/validations.rs new file mode 100644 index 000000000..09f8dd6a5 --- /dev/null +++ b/crates/valence_java_string/src/validations.rs @@ -0,0 +1,368 @@ +use std::ops::{Bound, Range, RangeBounds, RangeTo}; + +use crate::{JavaStr, Utf8Error}; + +pub(crate) const TAG_CONT: u8 = 0b1000_0000; +pub(crate) const TAG_TWO_B: u8 = 0b1100_0000; +pub(crate) const TAG_THREE_B: u8 = 0b1110_0000; +pub(crate) const TAG_FOUR_B: u8 = 0b1111_0000; +const CONT_MASK: u8 = 0b0011_1111; + +#[inline] +const fn utf8_first_byte(byte: u8, width: u32) -> u32 { + (byte & (0x7f >> width)) as u32 +} + +#[inline] +const fn utf8_acc_cont_byte(ch: u32, byte: u8) -> u32 { + (ch << 6) | (byte & CONT_MASK) as u32 +} + +#[inline] +const fn utf8_is_cont_byte(byte: u8) -> bool { + (byte as i8) < -64 +} + +/// # Safety +/// +/// `bytes` must produce a semi-valid UTF-8 string +#[inline] +pub(crate) unsafe fn next_code_point<'a, I: Iterator>(bytes: &mut I) -> Option { + // 
Decode UTF-8 + let x = *bytes.next()?; + if x < 128 { + return Some(x as u32); + } + + // Multibyte case follows + // Decode from a byte combination out of: [[[x y] z] w] + // NOTE: Performance is sensitive to the exact formulation here + let init = utf8_first_byte(x, 2); + // SAFETY: `bytes` produces an UTF-8-like string, + // so the iterator must produce a value here. + let y = unsafe { *bytes.next().unwrap_unchecked() }; + let mut ch = utf8_acc_cont_byte(init, y); + if x >= 0xe0 { + // [[x y z] w] case + // 5th bit in 0xE0 .. 0xEF is always clear, so `init` is still valid + // SAFETY: `bytes` produces an UTF-8-like string, + // so the iterator must produce a value here. + let z = unsafe { *bytes.next().unwrap_unchecked() }; + let y_z = utf8_acc_cont_byte((y & CONT_MASK) as u32, z); + ch = init << 12 | y_z; + if x >= 0xf0 { + // [x y z w] case + // use only the lower 3 bits of `init` + // SAFETY: `bytes` produces an UTF-8-like string, + // so the iterator must produce a value here. + let w = unsafe { *bytes.next().unwrap_unchecked() }; + ch = (init & 7) << 18 | utf8_acc_cont_byte(y_z, w); + } + } + + Some(ch) +} + +/// # Safety +/// +/// `bytes` must produce a semi-valid UTF-8 string +#[inline] +pub(crate) unsafe fn next_code_point_reverse<'a, I: DoubleEndedIterator>( + bytes: &mut I, +) -> Option { + // Decode UTF-8 + let w = match *bytes.next_back()? { + next_byte if next_byte < 128 => return Some(next_byte as u32), + back_byte => back_byte, + }; + + // Multibyte case follows + // Decode from a byte combination out of: [x [y [z w]]] + let mut ch; + // SAFETY: `bytes` produces an UTF-8-like string, + // so the iterator must produce a value here. + let z = unsafe { *bytes.next_back().unwrap_unchecked() }; + ch = utf8_first_byte(z, 2); + if utf8_is_cont_byte(z) { + // SAFETY: `bytes` produces an UTF-8-like string, + // so the iterator must produce a value here. 
+ let y = unsafe { *bytes.next_back().unwrap_unchecked() }; + ch = utf8_first_byte(y, 3); + if utf8_is_cont_byte(y) { + // SAFETY: `bytes` produces an UTF-8-like string, + // so the iterator must produce a value here. + let x = unsafe { *bytes.next_back().unwrap_unchecked() }; + ch = utf8_first_byte(x, 4); + ch = utf8_acc_cont_byte(ch, y); + } + ch = utf8_acc_cont_byte(ch, z); + } + ch = utf8_acc_cont_byte(ch, w); + + Some(ch) +} + +#[inline(always)] +pub(crate) fn run_utf8_semi_validation(v: &[u8]) -> Result<(), Utf8Error> { + let mut index = 0; + let len = v.len(); + + let usize_bytes = std::mem::size_of::(); + let ascii_block_size = 2 * usize_bytes; + let blocks_end = if len >= ascii_block_size { + len - ascii_block_size + 1 + } else { + 0 + }; + let align = v.as_ptr().align_offset(usize_bytes); + + while index < len { + let old_offset = index; + macro_rules! err { + ($error_len:expr) => { + return Err(Utf8Error { + valid_up_to: old_offset, + error_len: $error_len, + }) + }; + } + + macro_rules! next { + () => {{ + index += 1; + // we needed data, but there was none: error! 
+ if index >= len { + err!(None) + } + v[index] + }}; + } + + let first = v[index]; + if first >= 128 { + let w = utf8_char_width(first); + // 2-byte encoding is for codepoints \u{0080} to \u{07ff} + // first C2 80 last DF BF + // 3-byte encoding is for codepoints \u{0800} to \u{ffff} + // first E0 A0 80 last EF BF BF + // INCLUDING surrogates codepoints \u{d800} to \u{dfff} + // ED A0 80 to ED BF BF + // 4-byte encoding is for codepoints \u{1000}0 to \u{10ff}ff + // first F0 90 80 80 last F4 8F BF BF + // + // Use the UTF-8 syntax from the RFC + // + // https://tools.ietf.org/html/rfc3629 + // UTF8-1 = %x00-7F + // UTF8-2 = %xC2-DF UTF8-tail + // UTF8-3 = %xE0 %xA0-BF UTF8-tail / %xE1-EC 2( UTF8-tail ) / + // %xED %x80-9F UTF8-tail / %xEE-EF 2( UTF8-tail ) + // UTF8-4 = %xF0 %x90-BF 2( UTF8-tail ) / %xF1-F3 3( UTF8-tail ) / + // %xF4 %x80-8F 2( UTF8-tail ) + match w { + 2 => { + if next!() as i8 >= -64 { + err!(Some(1)) + } + } + 3 => { + match (first, next!()) { + (0xe0, 0xa0..=0xbf) | (0xe1..=0xef, 0x80..=0xbf) => {} /* INCLUDING surrogate codepoints here */ + _ => err!(Some(1)), + } + if next!() as i8 >= -64 { + err!(Some(2)) + } + } + 4 => { + match (first, next!()) { + (0xf0, 0x90..=0xbf) | (0xf1..=0xf3, 0x80..=0xbf) | (0xf4, 0x80..=0x8f) => {} + _ => err!(Some(1)), + } + if next!() as i8 >= -64 { + err!(Some(2)) + } + if next!() as i8 >= -64 { + err!(Some(3)) + } + } + _ => err!(Some(1)), + } + } else { + // Ascii case, try to skip forward quickly. + // When the pointer is aligned, read 2 words of data per iteration + // until we find a word containing a non-ascii byte. + if align != usize::MAX && align.wrapping_sub(index) % usize_bytes == 0 { + let ptr = v.as_ptr(); + while index < blocks_end { + // SAFETY: since `align - index` and `ascii_block_size` are + // multiples of `usize_bytes`, `block = ptr.add(index)` is + // always aligned with a `usize` so it's safe to dereference + // both `block` and `block.add(1)`. 
+ unsafe { + let block = ptr.add(index) as *const usize; + // break if there is a nonascii byte + let zu = contains_nonascii(*block); + let zv = contains_nonascii(*block.add(1)); + if zu || zv { + break; + } + } + index += ascii_block_size; + } + // step from the point where the wordwise loop stopped + while index < len && v[index] < 128 { + index += 1; + } + } else { + index += 1; + } + } + } + + Ok(()) +} + +#[inline(always)] +pub(crate) const fn run_utf8_full_validation_from_semi(v: &[u8]) -> Result<(), Utf8Error> { + // this function checks for surrogate codepoints, between \u{d800} to \u{dfff}, + // or ED A0 80 to ED BF BF of width 3 unicode chars. The valid range of width 3 + // characters is ED 80 80 to ED BF BF, so we need to check for an ED byte + // followed by a >=A0 byte. + let mut index = 0; + while index + 3 <= v.len() { + if v[index] == 0xed && v[index + 1] >= 0xa0 { + return Err(Utf8Error { + valid_up_to: index, + error_len: Some(1), + }); + } + index += 1; + } + + Ok(()) +} + +#[inline] +const fn utf8_char_width(first_byte: u8) -> usize { + const UTF8_CHAR_WIDTH: [u8; 256] = [ + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + ]; + + UTF8_CHAR_WIDTH[first_byte as usize] as _ +} + +#[inline] +const fn contains_nonascii(x: 
usize) -> bool { + const NONASCII_MASK: usize = usize::from_ne_bytes([0x80; std::mem::size_of::()]); + (x & NONASCII_MASK) != 0 +} + +#[cold] +#[track_caller] +pub(crate) fn slice_error_fail(s: &JavaStr, begin: usize, end: usize) -> ! { + const MAX_DISPLAY_LENGTH: usize = 256; + let trunc_len = s.floor_char_boundary(MAX_DISPLAY_LENGTH); + let s_trunc = &s[..trunc_len]; + let ellipsis = if trunc_len < s.len() { "[...]" } else { "" }; + + // 1. out of bounds + if begin > s.len() || end > s.len() { + let oob_index = if begin > s.len() { begin } else { end }; + panic!("byte index {oob_index} is out of bounds of `{s_trunc}`{ellipsis}"); + } + + // 2. begin <= end + assert!( + begin <= end, + "begin <= end ({} <= {}) when slicing `{}`{}", + begin, + end, + s_trunc, + ellipsis + ); + + // 3. character boundary + let index = if !s.is_char_boundary(begin) { + begin + } else { + end + }; + // find the character + let char_start = s.floor_char_boundary(index); + // `char_start` must be less than len and a char boundary + let ch = s[char_start..].chars().next().unwrap(); + let char_range = char_start..char_start + ch.len_utf8(); + panic!( + "byte index {} is not a char boundary; it is inside {:?} (bytes {:?}) of `{}`{}", + index, ch, char_range, s_trunc, ellipsis + ); +} + +#[cold] +#[track_caller] +pub(crate) fn str_end_index_len_fail(index: usize, len: usize) -> ! { + panic!("range end index {index} out of range for JavaStr of length {len}"); +} + +#[cold] +#[track_caller] +pub(crate) fn str_index_order_fail(index: usize, end: usize) -> ! { + panic!("JavaStr index starts at {index} but ends at {end}"); +} + +#[cold] +#[track_caller] +pub(crate) fn str_start_index_overflow_fail() -> ! { + panic!("attempted to index JavaStr from after maximum usize"); +} + +#[cold] +#[track_caller] +pub(crate) fn str_end_index_overflow_fail() -> ! 
{ + panic!("attempted to index JavaStr up to maximum usize") +} + +#[inline] +#[track_caller] +pub(crate) fn to_range_checked(range: R, bounds: RangeTo) -> Range +where + R: RangeBounds, +{ + let len = bounds.end; + + let start = range.start_bound(); + let start = match start { + Bound::Included(&start) => start, + Bound::Excluded(start) => start + .checked_add(1) + .unwrap_or_else(|| str_start_index_overflow_fail()), + Bound::Unbounded => 0, + }; + + let end: Bound<&usize> = range.end_bound(); + let end = match end { + Bound::Included(end) => end + .checked_add(1) + .unwrap_or_else(|| str_end_index_overflow_fail()), + Bound::Excluded(&end) => end, + Bound::Unbounded => len, + }; + + if start > end { + str_index_order_fail(start, end); + } + if end > len { + str_end_index_len_fail(end, len); + } + + Range { start, end } +} From e48b34741ee180c24c81e0e90a9b8c4cbdb5a638 Mon Sep 17 00:00:00 2001 From: Joe Date: Mon, 2 Oct 2023 19:52:07 +0100 Subject: [PATCH 02/11] Add encoding to/from Java's modified UTF-8 format --- crates/valence_java_string/README.md | 6 +- crates/valence_java_string/src/cesu8.rs | 251 ++++++++++++++++++ crates/valence_java_string/src/lib.rs | 2 + crates/valence_java_string/src/owned.rs | 2 +- crates/valence_java_string/src/slice.rs | 4 +- crates/valence_java_string/src/validations.rs | 4 +- 6 files changed, 261 insertions(+), 8 deletions(-) create mode 100644 crates/valence_java_string/src/cesu8.rs diff --git a/crates/valence_java_string/README.md b/crates/valence_java_string/README.md index cff48a3b8..d3a960ab0 100644 --- a/crates/valence_java_string/README.md +++ b/crates/valence_java_string/README.md @@ -5,9 +5,9 @@ This allows for round-trip serialization of all Java strings, including those wh being able to perform useful operations on those strings. These Java strings use the UTF-8 encoding, with the modification that surrogate code points (code points between U+D800 -and U+DFFF inclusive) are allowed. 
This allows for zero-cost conversion from Rust strings to Java strings. Similarly, -this crate introduces a `JavaCodePoint` type which is analogous to `char`, except that surrogate code points are -allowed. +and U+DFFF inclusive) are allowed. This allows for zero-cost conversion from Rust strings to Java strings. This modified +encoding is known as "semi-UTF-8" throughout the codebase. Similarly, this crate introduces a `JavaCodePoint` type which +is analogous to `char`, except that surrogate code points are allowed. This crate is mostly undocumented, because most methods are entirely analogous to those of the same name in Rust's strings. Please refer to the `std` documentation. diff --git a/crates/valence_java_string/src/cesu8.rs b/crates/valence_java_string/src/cesu8.rs new file mode 100644 index 000000000..0ad77787c --- /dev/null +++ b/crates/valence_java_string/src/cesu8.rs @@ -0,0 +1,251 @@ +use std::borrow::Cow; + +use crate::validations::{utf8_char_width, CONT_MASK, TAG_CONT}; +use crate::{JavaStr, JavaString, Utf8Error}; + +impl JavaStr { + /// Converts from Java's [modified UTF-8](https://docs.oracle.com/javase/8/docs/api/java/io/DataInput.html#modified-utf-8) format to a `Cow`. + #[inline] + pub fn from_modified_utf8(bytes: &[u8]) -> Result, Utf8Error> { + match JavaStr::from_full_utf8(bytes) { + Ok(str) => Ok(Cow::Borrowed(str)), + Err(_) => JavaString::from_modified_utf8_iter(bytes.iter().copied()).map(Cow::Owned), + } + } + + /// Converts to Java's [modified UTF-8](https://docs.oracle.com/javase/8/docs/api/java/io/DataInput.html#modified-utf-8) format. 
+ #[inline] + #[must_use] + pub fn to_modified_utf8(&self) -> Cow<[u8]> { + if is_valid_cesu8(self) { + Cow::Borrowed(self.as_bytes()) + } else { + Cow::Owned(self.to_modified_utf8_internal()) + } + } + + #[inline] + fn to_modified_utf8_internal(&self) -> Vec { + let bytes = self.as_bytes(); + let mut encoded = Vec::with_capacity(bytes.len() + bytes.len() >> 2); + let mut i = 0; + while i < bytes.len() { + let b = bytes[i]; + if b == 0 { + encoded.extend([0xc0, 0x80].into_iter()); + i += 1; + } else if b < 128 { + // Pass ASCII through quickly. + encoded.push(b); + i += 1; + } else { + // Figure out how many bytes we need for this character. + let w = utf8_char_width(b); + let char_bytes = unsafe { + // SAFETY: input must be valid semi UTF-8, so there must be at least w more + // bytes from i + bytes.get_unchecked(i..i + w) + }; + if w != 4 { + // Pass through short UTF-8 sequences unmodified. + encoded.extend(char_bytes.iter().copied()) + } else { + // Encode 4-byte sequences as 6 bytes + let s = unsafe { + // SAFETY: input is valid semi UTF-8 + JavaStr::from_semi_utf8_unchecked(bytes) + }; + let c = unsafe { + // SAFETY: s contains a single char of width 4 + s.chars().next().unwrap_unchecked().as_u32() - 0x10000 + }; + let s = [((c >> 10) as u16) | 0xd800, ((c & 0x3ff) as u16) | 0xdc00]; + encoded.extend(enc_surrogate(s[0]).into_iter()); + encoded.extend(enc_surrogate(s[1]).into_iter()); + } + i += w; + } + } + encoded + } +} + +impl JavaString { + /// Converts from Java's [modified UTF-8](https://docs.oracle.com/javase/8/docs/api/java/io/DataInput.html#modified-utf-8) format to a `JavaString`. + #[inline] + pub fn from_modified_utf8(bytes: Vec) -> Result { + match JavaString::from_full_utf8(bytes) { + Ok(str) => Ok(str), + Err(err) => JavaString::from_modified_utf8_iter(err.bytes.into_iter()), + } + } + + /// Converts from Java's [modified UTF-8](https://docs.oracle.com/javase/8/docs/api/java/io/DataInput.html#modified-utf-8) format to a `JavaString`. 
+ pub fn from_modified_utf8_iter(mut iter: I) -> Result + where + I: Iterator, + { + let mut index = 0; + let mut decoded = Vec::with_capacity(iter.size_hint().0); + let mut surrogate_first: Option<[u8; 3]> = None; + + macro_rules! flush_first_surrogate_half { + () => { + // append any preceding first half of a surrogate pair + if let Some(surrogate_first) = surrogate_first.take() { + decoded.extend(surrogate_first.into_iter()); + } + }; + } + + while let Some(first) = iter.next() { + let old_offset = index; + + macro_rules! err { + ($error_len:expr) => { + return Err(Utf8Error { + valid_up_to: old_offset, + error_len: $error_len, + }) + }; + } + + macro_rules! next { + () => {{ + index += 1; + match iter.next() { + Some(a) => a, + None => err!(None), + } + }}; + } + + macro_rules! next_cont { + ($error_len:expr) => {{ + let byte = next!(); + if (byte) & !CONT_MASK == TAG_CONT { + byte + } else { + err!($error_len) + } + }}; + } + + if first == 0 { + // modified UTF-8 should never contain \0 directly. + err!(None); + } else if first < 128 { + flush_first_surrogate_half!(); + // Pass ASCII through directly. + decoded.push(first); + } else if first == 0xc0 { + flush_first_surrogate_half!(); + // modified UTF-8 encoding of null character + match next!() { + 0x80 => decoded.push(0), + _ => err!(Some(1)), + } + } else { + let w = utf8_char_width(first); + let second = next_cont!(Some(1)); + match w { + // Two-byte sequences can be used directly. + 2 => { + flush_first_surrogate_half!(); + decoded.extend([first, second].into_iter()); + } + 3 => { + let third = next_cont!(Some(2)); + match (first, second) { + // These are valid UTF-8, so pass them through. 
+ (0xe0, 0xa0..=0xbf) + | (0xe1..=0xec, 0x80..=0xbf) + | (0xed, 0x80..=0x9f) + | (0xee..=0xef, 0x80..=0xbf) => { + flush_first_surrogate_half!(); + decoded.extend([first, second, third].into_iter()) + } + // First half of a surrogate pair + (0xed, 0xa0..=0xaf) => { + flush_first_surrogate_half!(); + surrogate_first = Some([first, second, third]); + } + // Second half of a surrogate pair + (0xed, 0xb0..=0xbf) => { + // try to pair the second half with a preceding first half + if let Some([_, b, c]) = surrogate_first.take() { + let (fifth, sixth) = (second, third); + let (second, third) = (b, c); + let s = dec_surrogates(second, third, fifth, sixth); + decoded.extend(s.into_iter()); + } else { + // no first half, append the second half directly + decoded.extend([first, second, third].into_iter()); + } + } + _ => err!(Some(1)), + } + } + _ => err!(Some(1)), // modified UTF-8 doesn't allow width 4 + } + } + } + + flush_first_surrogate_half!(); + + unsafe { + // SAFETY: we built a semi UTF-8 encoded string + Ok(JavaString::from_semi_utf8_unchecked(decoded)) + } + } + + /// Converts to Java's [modified UTF-8](https://docs.oracle.com/javase/8/docs/api/java/io/DataInput.html#modified-utf-8) format. + #[inline] + #[must_use] + pub fn into_modified_utf8(self) -> Vec { + if is_valid_cesu8(&self) { + self.into_bytes() + } else { + self.to_modified_utf8_internal() + } + } +} + +#[inline] +fn dec_surrogate(second: u8, third: u8) -> u32 { + 0xd000 | ((second & CONT_MASK) as u32) << 6 | (third & CONT_MASK) as u32 +} + +#[inline] +fn dec_surrogates(second: u8, third: u8, fifth: u8, sixth: u8) -> [u8; 4] { + // Convert to a 32-bit code point. + let s1 = dec_surrogate(second, third); + let s2 = dec_surrogate(fifth, sixth); + let c = 0x10000 + (((s1 - 0xd800) << 10) | (s2 - 0xdc00)); + assert!((0x010000..=0x10ffff).contains(&c)); + + // Convert to UTF-8. 
+ // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx + [ + 0b1111_0000u8 | ((c & 0b1_1100_0000_0000_0000_0000) >> 18) as u8, + TAG_CONT | ((c & 0b0_0011_1111_0000_0000_0000) >> 12) as u8, + TAG_CONT | ((c & 0b0_0000_0000_1111_1100_0000) >> 6) as u8, + TAG_CONT | (c & 0b0_0000_0000_0000_0011_1111) as u8, + ] +} + +#[inline] +fn is_valid_cesu8(text: &JavaStr) -> bool { + text.bytes() + .all(|b| b != 0 && ((b & !CONT_MASK) == TAG_CONT || utf8_char_width(b) <= 3)) +} + +#[inline] +fn enc_surrogate(surrogate: u16) -> [u8; 3] { + // 1110xxxx 10xxxxxx 10xxxxxx + [ + 0b11100000 | ((surrogate & 0b11110000_00000000) >> 12) as u8, + TAG_CONT | ((surrogate & 0b00001111_11000000) >> 6) as u8, + TAG_CONT | (surrogate & 0b00000000_00111111) as u8, + ] +} diff --git a/crates/valence_java_string/src/lib.rs b/crates/valence_java_string/src/lib.rs index 82e80d7ab..57f035944 100644 --- a/crates/valence_java_string/src/lib.rs +++ b/crates/valence_java_string/src/lib.rs @@ -1,5 +1,6 @@ #![doc = include_str!("../README.md")] +mod cesu8; mod char; mod error; mod iter; @@ -10,6 +11,7 @@ mod serde; mod slice; pub(crate) mod validations; +pub use cesu8::*; pub use char::*; pub use error::*; pub use iter::*; diff --git a/crates/valence_java_string/src/owned.rs b/crates/valence_java_string/src/owned.rs index 66a055d64..83d21a62c 100644 --- a/crates/valence_java_string/src/owned.rs +++ b/crates/valence_java_string/src/owned.rs @@ -39,7 +39,7 @@ impl JavaString { } #[inline] - pub fn from_utf8(vec: Vec) -> Result { + pub fn from_full_utf8(vec: Vec) -> Result { match std::str::from_utf8(&vec) { Ok(..) 
=> Ok(JavaString { vec }), Err(e) => Err(FromUtf8Error { diff --git a/crates/valence_java_string/src/slice.rs b/crates/valence_java_string/src/slice.rs index 8cc1e3fda..49a5373c5 100644 --- a/crates/valence_java_string/src/slice.rs +++ b/crates/valence_java_string/src/slice.rs @@ -31,7 +31,7 @@ pub struct JavaStr { impl JavaStr { #[inline] - pub const fn from_utf8(v: &[u8]) -> Result<&JavaStr, Utf8Error> { + pub const fn from_full_utf8(v: &[u8]) -> Result<&JavaStr, Utf8Error> { match std::str::from_utf8(v) { Ok(str) => Ok(JavaStr::from_str(str)), Err(err) => Err(Utf8Error::from_std(err)), @@ -39,7 +39,7 @@ impl JavaStr { } #[inline] - pub fn from_utf8_mut(v: &mut [u8]) -> Result<&mut JavaStr, Utf8Error> { + pub fn from_full_utf8_mut(v: &mut [u8]) -> Result<&mut JavaStr, Utf8Error> { match std::str::from_utf8_mut(v) { Ok(str) => Ok(JavaStr::from_mut_str(str)), Err(err) => Err(Utf8Error::from_std(err)), diff --git a/crates/valence_java_string/src/validations.rs b/crates/valence_java_string/src/validations.rs index 09f8dd6a5..3ae2d2c07 100644 --- a/crates/valence_java_string/src/validations.rs +++ b/crates/valence_java_string/src/validations.rs @@ -6,7 +6,7 @@ pub(crate) const TAG_CONT: u8 = 0b1000_0000; pub(crate) const TAG_TWO_B: u8 = 0b1100_0000; pub(crate) const TAG_THREE_B: u8 = 0b1110_0000; pub(crate) const TAG_FOUR_B: u8 = 0b1111_0000; -const CONT_MASK: u8 = 0b0011_1111; +pub(crate) const CONT_MASK: u8 = 0b0011_1111; #[inline] const fn utf8_first_byte(byte: u8, width: u32) -> u32 { @@ -244,7 +244,7 @@ pub(crate) const fn run_utf8_full_validation_from_semi(v: &[u8]) -> Result<(), U } #[inline] -const fn utf8_char_width(first_byte: u8) -> usize { +pub(crate) const fn utf8_char_width(first_byte: u8) -> usize { const UTF8_CHAR_WIDTH: [u8; 256] = [ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, From 
40ea873e7d559a95b8a28dea3b02c8f5a3468caa Mon Sep 17 00:00:00 2001 From: Joe Date: Mon, 2 Oct 2023 20:42:10 +0100 Subject: [PATCH 03/11] Add docs where there is no analogous function in std --- crates/valence_java_string/src/char.rs | 2 ++ crates/valence_java_string/src/iter.rs | 4 ++-- crates/valence_java_string/src/owned.rs | 8 ++++++++ crates/valence_java_string/src/slice.rs | 12 ++++++++++++ 4 files changed, 24 insertions(+), 2 deletions(-) diff --git a/crates/valence_java_string/src/char.rs b/crates/valence_java_string/src/char.rs index 13130aaf5..466de2e1f 100644 --- a/crates/valence_java_string/src/char.rs +++ b/crates/valence_java_string/src/char.rs @@ -122,6 +122,8 @@ impl JavaCodePoint { } } + /// Encodes this `JavaCodePoint` into semi UTF-8, that is, UTF-8 with + /// surrogate code points. #[inline] pub fn encode_semi_utf8(self, dst: &mut [u8]) -> &mut [u8] { let len = self.len_utf8(); diff --git a/crates/valence_java_string/src/iter.rs b/crates/valence_java_string/src/iter.rs index cebd1993e..5c73a0b6e 100644 --- a/crates/valence_java_string/src/iter.rs +++ b/crates/valence_java_string/src/iter.rs @@ -192,7 +192,7 @@ impl<'a> Iterator for Chars<'a> { #[inline] fn next(&mut self) -> Option { - // SAFETY: `JavaStr` invariant says `self.iter` is a semi-valid UTF-8 string and + // SAFETY: `JavaStr` invariant says `self.inner` is a semi-valid UTF-8 string and // the resulting `ch` is a valid Unicode Scalar Value or surrogate code point. unsafe { next_code_point(&mut self.inner).map(|ch| JavaCodePoint::from_u32_unchecked(ch)) } } @@ -227,7 +227,7 @@ impl Debug for Chars<'_> { impl<'a> DoubleEndedIterator for Chars<'a> { #[inline] fn next_back(&mut self) -> Option { - // SAFETY: `JavaStr` invariant says `self.iter` is a semi-valid UTF-8 string and + // SAFETY: `JavaStr` invariant says `self.inner` is a semi-valid UTF-8 string and // the resulting `ch` is a valid Unicode Scalar Value or surrogate code point. 
unsafe { next_code_point_reverse(&mut self.inner).map(|ch| JavaCodePoint::from_u32_unchecked(ch)) diff --git a/crates/valence_java_string/src/owned.rs b/crates/valence_java_string/src/owned.rs index 83d21a62c..7c332f4e7 100644 --- a/crates/valence_java_string/src/owned.rs +++ b/crates/valence_java_string/src/owned.rs @@ -38,6 +38,8 @@ impl JavaString { } } + /// Converts `vec` to a `JavaString` if it is fully-valid UTF-8, i.e. UTF-8 + /// without surrogate code points. #[inline] pub fn from_full_utf8(vec: Vec) -> Result { match std::str::from_utf8(&vec) { @@ -49,6 +51,8 @@ impl JavaString { } } + /// Converts `vec` to a `JavaString` if it is semi-valid UTF-8, i.e. UTF-8 + /// with surrogate code points. pub fn from_semi_utf8(vec: Vec) -> Result { match run_utf8_semi_validation(&vec) { Ok(..) => Ok(JavaString { vec }), @@ -59,6 +63,8 @@ impl JavaString { } } + /// Converts `v` to a `Cow`, replacing invalid semi-UTF-8 with the + /// replacement character �. #[must_use] pub fn from_semi_utf8_lossy(v: &[u8]) -> Cow<'_, JavaStr> { const REPLACEMENT: &str = "\u{FFFD}"; @@ -137,6 +143,8 @@ impl JavaString { } } + /// Tries to convert this `JavaString` to a `String`, returning an error if + /// it is not fully valid UTF-8, i.e. has no surrogate code points. pub fn into_string(self) -> Result { run_utf8_full_validation_from_semi(self.as_bytes()).map(|_| unsafe { // SAFETY: validation succeeded diff --git a/crates/valence_java_string/src/slice.rs b/crates/valence_java_string/src/slice.rs index 49a5373c5..5d5d91e7d 100644 --- a/crates/valence_java_string/src/slice.rs +++ b/crates/valence_java_string/src/slice.rs @@ -30,6 +30,8 @@ pub struct JavaStr { } impl JavaStr { + /// Converts `v` to a `&JavaStr` if it is fully-valid UTF-8, i.e. UTF-8 + /// without surrogate code points. 
#[inline] pub const fn from_full_utf8(v: &[u8]) -> Result<&JavaStr, Utf8Error> { match std::str::from_utf8(v) { @@ -38,6 +40,8 @@ impl JavaStr { } } + /// Converts `v` to a `&mut JavaStr` if it is fully-valid UTF-8, i.e. UTF-8 + /// without surrogate code points. #[inline] pub fn from_full_utf8_mut(v: &mut [u8]) -> Result<&mut JavaStr, Utf8Error> { match std::str::from_utf8_mut(v) { @@ -46,6 +50,8 @@ impl JavaStr { } } + /// Converts `v` to a `&JavaStr` if it is semi-valid UTF-8, i.e. UTF-8 + /// with surrogate code points. pub fn from_semi_utf8(v: &[u8]) -> Result<&JavaStr, Utf8Error> { match run_utf8_semi_validation(v) { Ok(()) => Ok(unsafe { JavaStr::from_semi_utf8_unchecked(v) }), @@ -53,6 +59,8 @@ impl JavaStr { } } + /// Converts `v` to a `&mut JavaStr` if it is semi-valid UTF-8, i.e. UTF-8 + /// with surrogate code points. pub fn from_semi_utf8_mut(v: &mut [u8]) -> Result<&mut JavaStr, Utf8Error> { match run_utf8_semi_validation(v) { Ok(()) => Ok(unsafe { JavaStr::from_semi_utf8_unchecked_mut(v) }), @@ -146,6 +154,8 @@ impl JavaStr { self.inner.as_ptr() } + /// Tries to convert this `&JavaStr` to a `&str`, returning an error if + /// it is not fully valid UTF-8, i.e. has no surrogate code points. pub const fn as_str(&self) -> Result<&str, Utf8Error> { // Manual implementation of Option::map since it's not const match run_utf8_full_validation_from_semi(self.as_bytes()) { @@ -167,6 +177,8 @@ impl JavaStr { std::str::from_utf8_unchecked(self.as_bytes()) } + /// Converts this `&JavaStr` to a `Cow`, replacing surrogate code + /// points with the replacement character �. 
#[must_use] pub fn as_str_lossy(&self) -> Cow<'_, str> { match run_utf8_full_validation_from_semi(self.as_bytes()) { From fd5f163ee9d85e9c0e37f620fb93808bd0da3c27 Mon Sep 17 00:00:00 2001 From: Joe Date: Tue, 3 Oct 2023 01:06:22 +0100 Subject: [PATCH 04/11] Update local clippy and rustfmt --- crates/valence_java_string/src/cesu8.rs | 2 +- crates/valence_java_string/src/iter.rs | 10 ++++++---- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/crates/valence_java_string/src/cesu8.rs b/crates/valence_java_string/src/cesu8.rs index 0ad77787c..bba2ed21c 100644 --- a/crates/valence_java_string/src/cesu8.rs +++ b/crates/valence_java_string/src/cesu8.rs @@ -27,7 +27,7 @@ impl JavaStr { #[inline] fn to_modified_utf8_internal(&self) -> Vec { let bytes = self.as_bytes(); - let mut encoded = Vec::with_capacity(bytes.len() + bytes.len() >> 2); + let mut encoded = Vec::with_capacity((bytes.len() + bytes.len()) >> 2); let mut i = 0; while i < bytes.len() { let b = bytes[i]; diff --git a/crates/valence_java_string/src/iter.rs b/crates/valence_java_string/src/iter.rs index 5c73a0b6e..e8c8469e8 100644 --- a/crates/valence_java_string/src/iter.rs +++ b/crates/valence_java_string/src/iter.rs @@ -192,8 +192,9 @@ impl<'a> Iterator for Chars<'a> { #[inline] fn next(&mut self) -> Option { - // SAFETY: `JavaStr` invariant says `self.inner` is a semi-valid UTF-8 string and - // the resulting `ch` is a valid Unicode Scalar Value or surrogate code point. + // SAFETY: `JavaStr` invariant says `self.inner` is a semi-valid UTF-8 string + // and the resulting `ch` is a valid Unicode Scalar Value or surrogate + // code point. 
unsafe { next_code_point(&mut self.inner).map(|ch| JavaCodePoint::from_u32_unchecked(ch)) } } @@ -227,8 +228,9 @@ impl Debug for Chars<'_> { impl<'a> DoubleEndedIterator for Chars<'a> { #[inline] fn next_back(&mut self) -> Option { - // SAFETY: `JavaStr` invariant says `self.inner` is a semi-valid UTF-8 string and - // the resulting `ch` is a valid Unicode Scalar Value or surrogate code point. + // SAFETY: `JavaStr` invariant says `self.inner` is a semi-valid UTF-8 string + // and the resulting `ch` is a valid Unicode Scalar Value or surrogate + // code point. unsafe { next_code_point_reverse(&mut self.inner).map(|ch| JavaCodePoint::from_u32_unchecked(ch)) } From 3672a9bb1988b8e187be4d0704922098d9150e0d Mon Sep 17 00:00:00 2001 From: Joe Date: Tue, 3 Oct 2023 12:17:01 +0100 Subject: [PATCH 05/11] Rename to java_string --- Cargo.toml | 2 +- crates/{valence_java_string => java_string}/Cargo.toml | 2 +- crates/{valence_java_string => java_string}/README.md | 0 crates/{valence_java_string => java_string}/src/cesu8.rs | 0 crates/{valence_java_string => java_string}/src/char.rs | 0 crates/{valence_java_string => java_string}/src/error.rs | 0 crates/{valence_java_string => java_string}/src/iter.rs | 0 crates/{valence_java_string => java_string}/src/lib.rs | 0 crates/{valence_java_string => java_string}/src/owned.rs | 0 crates/{valence_java_string => java_string}/src/pattern.rs | 0 crates/{valence_java_string => java_string}/src/serde.rs | 0 crates/{valence_java_string => java_string}/src/slice.rs | 0 crates/{valence_java_string => java_string}/src/validations.rs | 0 13 files changed, 2 insertions(+), 2 deletions(-) rename crates/{valence_java_string => java_string}/Cargo.toml (92%) rename crates/{valence_java_string => java_string}/README.md (100%) rename crates/{valence_java_string => java_string}/src/cesu8.rs (100%) rename crates/{valence_java_string => java_string}/src/char.rs (100%) rename crates/{valence_java_string => java_string}/src/error.rs (100%) rename 
crates/{valence_java_string => java_string}/src/iter.rs (100%) rename crates/{valence_java_string => java_string}/src/lib.rs (100%) rename crates/{valence_java_string => java_string}/src/owned.rs (100%) rename crates/{valence_java_string => java_string}/src/pattern.rs (100%) rename crates/{valence_java_string => java_string}/src/serde.rs (100%) rename crates/{valence_java_string => java_string}/src/slice.rs (100%) rename crates/{valence_java_string => java_string}/src/validations.rs (100%) diff --git a/Cargo.toml b/Cargo.toml index a83d4a774..f221a2dd4 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -135,6 +135,7 @@ hmac = "0.12.1" image = "0.24.6" indexmap = "2.0.0" itertools = "0.11.0" +java_string = { path = "crates/java_string", version = "0.1.0" } lru = "0.11.0" noise = "0.8.2" num = "0.4.0" @@ -179,7 +180,6 @@ valence_generated = { path = "crates/valence_generated", version = "0.2.0-alpha. valence_ident = { path = "crates/valence_ident", version = "0.2.0-alpha.1" } valence_ident_macros = { path = "crates/valence_ident_macros", version = "0.2.0-alpha.1" } valence_inventory = { path = "crates/valence_inventory", version = "0.2.0-alpha.1" } -valence_java_string = { path = "crates/valence_java_string", version = "0.1.0" } valence_lang = { path = "crates/valence_lang", version = "0.2.0-alpha.1" } valence_math = { path = "crates/valence_math", version = "0.2.0-alpha.1" } valence_nbt = { path = "crates/valence_nbt", features = [ diff --git a/crates/valence_java_string/Cargo.toml b/crates/java_string/Cargo.toml similarity index 92% rename from crates/valence_java_string/Cargo.toml rename to crates/java_string/Cargo.toml index 81c33551b..414e22a7d 100644 --- a/crates/valence_java_string/Cargo.toml +++ b/crates/java_string/Cargo.toml @@ -1,5 +1,5 @@ [package] -name = "valence_java_string" +name = "java_string" description = "An implementation of Java strings, tolerant of invalid UTF-16 encoding" readme = "README.md" version = "0.1.0" diff --git 
a/crates/valence_java_string/README.md b/crates/java_string/README.md similarity index 100% rename from crates/valence_java_string/README.md rename to crates/java_string/README.md diff --git a/crates/valence_java_string/src/cesu8.rs b/crates/java_string/src/cesu8.rs similarity index 100% rename from crates/valence_java_string/src/cesu8.rs rename to crates/java_string/src/cesu8.rs diff --git a/crates/valence_java_string/src/char.rs b/crates/java_string/src/char.rs similarity index 100% rename from crates/valence_java_string/src/char.rs rename to crates/java_string/src/char.rs diff --git a/crates/valence_java_string/src/error.rs b/crates/java_string/src/error.rs similarity index 100% rename from crates/valence_java_string/src/error.rs rename to crates/java_string/src/error.rs diff --git a/crates/valence_java_string/src/iter.rs b/crates/java_string/src/iter.rs similarity index 100% rename from crates/valence_java_string/src/iter.rs rename to crates/java_string/src/iter.rs diff --git a/crates/valence_java_string/src/lib.rs b/crates/java_string/src/lib.rs similarity index 100% rename from crates/valence_java_string/src/lib.rs rename to crates/java_string/src/lib.rs diff --git a/crates/valence_java_string/src/owned.rs b/crates/java_string/src/owned.rs similarity index 100% rename from crates/valence_java_string/src/owned.rs rename to crates/java_string/src/owned.rs diff --git a/crates/valence_java_string/src/pattern.rs b/crates/java_string/src/pattern.rs similarity index 100% rename from crates/valence_java_string/src/pattern.rs rename to crates/java_string/src/pattern.rs diff --git a/crates/valence_java_string/src/serde.rs b/crates/java_string/src/serde.rs similarity index 100% rename from crates/valence_java_string/src/serde.rs rename to crates/java_string/src/serde.rs diff --git a/crates/valence_java_string/src/slice.rs b/crates/java_string/src/slice.rs similarity index 100% rename from crates/valence_java_string/src/slice.rs rename to crates/java_string/src/slice.rs 
diff --git a/crates/valence_java_string/src/validations.rs b/crates/java_string/src/validations.rs similarity index 100% rename from crates/valence_java_string/src/validations.rs rename to crates/java_string/src/validations.rs From 9891c373b1ce63d81f29c3faaf48ed4a442d6909 Mon Sep 17 00:00:00 2001 From: Joe Date: Tue, 3 Oct 2023 12:37:41 +0100 Subject: [PATCH 06/11] Update depgraph.svg --- assets/depgraph.svg | 366 ++++++++++++++++++++++---------------------- 1 file changed, 186 insertions(+), 180 deletions(-) diff --git a/assets/depgraph.svg b/assets/depgraph.svg index b9523f69e..8bb16f718 100644 --- a/assets/depgraph.svg +++ b/assets/depgraph.svg @@ -12,368 +12,374 @@ 0 - -valence_advancement + +java_string 1 + +valence_advancement + + + +2 valence_server - + -0->1 +1->2 - - -2 + + +3 valence_entity - + -1->2 +2->3 - - -11 + + +12 valence_registry - + -1->11 +2->12 - - -10 - -valence_server_common + + +11 + +valence_server_common - + -2->10 - - +3->11 + + - + -11->10 - - +12->11 + + - - -6 - -valence_protocol + + +7 + +valence_protocol - + -10->6 - - - - - -3 - -valence_math +11->7 + + 4 - -valence_nbt + +valence_math 5 - -valence_ident + +valence_nbt - - -7 - -valence_generated + + +6 + +valence_ident - + + +8 + +valence_generated + + -6->7 - - +7->8 + + - - -9 - -valence_text + + +10 + +valence_text - + -6->9 - - +7->10 + + - + -7->3 - - +8->4 + + - + -7->5 - - +8->6 + + - + -9->4 - - +10->5 + + - + -9->5 - - +10->6 + + - - -8 + + +9 valence_build_utils - - -12 + + +13 valence_anvil - + -12->1 +13->2 - - -13 + + +14 valence_boss_bar - + -13->1 +14->2 - - -14 + + +15 valence_inventory - + -14->1 +15->2 - - -15 - -valence_lang - 16 + +valence_lang + + + +17 valence_network - + -16->1 +17->2 - + -16->15 +17->16 - - -17 + + +18 valence_player_list - + -17->1 +18->2 - - -18 + + +19 valence_scoreboard - + -18->1 +19->2 - - -19 - -valence_spatial - 20 + +valence_spatial + + + +21 valence_weather - + -20->1 +21->2 - - -21 + + +22 valence_world_border - + -21->1 +22->2 - 
- -22 - -dump_schedule - 23 + +dump_schedule + + + +24 valence - + -22->23 +23->24 - + -23->0 +24->1 - + -23->12 +24->13 - + -23->13 +24->14 - + -23->14 +24->15 - + -23->16 +24->17 - + -23->17 +24->18 - + -23->18 +24->19 - + -23->20 +24->21 - + -23->21 +24->22 - - -24 - -packet_inspector - - - -24->6 - - - 25 + +packet_inspector + + + +25->7 + + + + + +26 playground - + -25->23 +26->24 - - -26 - -stresser + + +27 + +stresser - + -26->6 - - +27->7 + + From 1d9d6eb705e86ea174ea63ea3715eca08684d7b9 Mon Sep 17 00:00:00 2001 From: Joe Date: Tue, 3 Oct 2023 20:14:32 +0100 Subject: [PATCH 07/11] Add tests, fix a couple of bugs --- crates/java_string/src/cesu8.rs | 49 +- crates/java_string/src/char.rs | 227 +++++++- crates/java_string/src/iter.rs | 4 +- crates/java_string/src/owned.rs | 195 ++++++- crates/java_string/src/slice.rs | 743 +++++++++++++++++++++++++- crates/java_string/src/validations.rs | 1 + 6 files changed, 1210 insertions(+), 9 deletions(-) diff --git a/crates/java_string/src/cesu8.rs b/crates/java_string/src/cesu8.rs index bba2ed21c..4964bc497 100644 --- a/crates/java_string/src/cesu8.rs +++ b/crates/java_string/src/cesu8.rs @@ -5,6 +5,28 @@ use crate::{JavaStr, JavaString, Utf8Error}; impl JavaStr { /// Converts from Java's [modified UTF-8](https://docs.oracle.com/javase/8/docs/api/java/io/DataInput.html#modified-utf-8) format to a `Cow`. 
+ /// + /// ``` + /// # use std::borrow::Cow; + /// # use java_string::{JavaCodePoint, JavaStr, JavaString}; + /// + /// let result = JavaStr::from_modified_utf8("Hello World!".as_bytes()).unwrap(); + /// assert!(matches!(result, Cow::Borrowed(_))); + /// assert_eq!(JavaStr::from_str("Hello World!"), result); + /// + /// let result = JavaStr::from_modified_utf8(&[ + /// 0x61, 0x62, 0x63, 0xc0, 0x80, 0xe2, 0x84, 0x9d, 0xed, 0xa0, 0xbd, 0xed, 0xb2, 0xa3, 0xed, + /// 0xa0, 0x80, + /// ]) + /// .unwrap(); + /// assert!(matches!(result, Cow::Owned(_))); + /// let mut expected = JavaString::from("abc\0ℝ💣"); + /// expected.push_java(JavaCodePoint::from_u32(0xd800).unwrap()); + /// assert_eq!(expected, result); + /// + /// let result = JavaStr::from_modified_utf8(&[0xed]); + /// assert!(result.is_err()); + /// ``` #[inline] pub fn from_modified_utf8(bytes: &[u8]) -> Result, Utf8Error> { match JavaStr::from_full_utf8(bytes) { @@ -14,6 +36,25 @@ impl JavaStr { } /// Converts to Java's [modified UTF-8](https://docs.oracle.com/javase/8/docs/api/java/io/DataInput.html#modified-utf-8) format. 
+ /// + /// ``` + /// # use std::borrow::Cow; + /// # use java_string::{JavaCodePoint, JavaStr, JavaString}; + /// + /// let result = JavaStr::from_str("Hello World!").to_modified_utf8(); + /// assert!(matches!(result, Cow::Borrowed(_))); + /// assert_eq!(result, &b"Hello World!"[..]); + /// + /// let mut str = JavaString::from("abc\0ℝ💣"); + /// str.push_java(JavaCodePoint::from_u32(0xd800).unwrap()); + /// let result = str.to_modified_utf8(); + /// let expected = [ + /// 0x61, 0x62, 0x63, 0xc0, 0x80, 0xe2, 0x84, 0x9d, 0xed, 0xa0, 0xbd, 0xed, 0xb2, 0xa3, 0xed, + /// 0xa0, 0x80, + /// ]; + /// assert!(matches!(result, Cow::Owned(_))); + /// assert_eq!(result, &expected[..]); + /// ``` #[inline] #[must_use] pub fn to_modified_utf8(&self) -> Cow<[u8]> { @@ -53,7 +94,7 @@ impl JavaStr { // Encode 4-byte sequences as 6 bytes let s = unsafe { // SAFETY: input is valid semi UTF-8 - JavaStr::from_semi_utf8_unchecked(bytes) + JavaStr::from_semi_utf8_unchecked(char_bytes) }; let c = unsafe { // SAFETY: s contains a single char of width 4 @@ -72,6 +113,8 @@ impl JavaStr { impl JavaString { /// Converts from Java's [modified UTF-8](https://docs.oracle.com/javase/8/docs/api/java/io/DataInput.html#modified-utf-8) format to a `JavaString`. + /// + /// See [JavaStr::from_modified_utf8]. #[inline] pub fn from_modified_utf8(bytes: Vec) -> Result { match JavaString::from_full_utf8(bytes) { @@ -81,6 +124,8 @@ impl JavaString { } /// Converts from Java's [modified UTF-8](https://docs.oracle.com/javase/8/docs/api/java/io/DataInput.html#modified-utf-8) format to a `JavaString`. + /// + /// See [JavaStr::from_modified_utf8]. pub fn from_modified_utf8_iter(mut iter: I) -> Result where I: Iterator, @@ -200,6 +245,8 @@ impl JavaString { } /// Converts to Java's [modified UTF-8](https://docs.oracle.com/javase/8/docs/api/java/io/DataInput.html#modified-utf-8) format. + /// + /// See [JavaStr::to_modified_utf8]. 
#[inline] #[must_use] pub fn into_modified_utf8(self) -> Vec { diff --git a/crates/java_string/src/char.rs b/crates/java_string/src/char.rs index 466de2e1f..5bc26a4f7 100644 --- a/crates/java_string/src/char.rs +++ b/crates/java_string/src/char.rs @@ -50,6 +50,15 @@ impl JavaCodePoint { pub const REPLACEMENT_CHARACTER: JavaCodePoint = JavaCodePoint::from_char(char::REPLACEMENT_CHARACTER); + /// See [char::from_u32] + /// + /// ``` + /// # use java_string::JavaCodePoint; + /// let c = JavaCodePoint::from_u32(0x2764); + /// assert_eq!(Some(JavaCodePoint::from_char('❤')), c); + /// + /// assert_eq!(None, JavaCodePoint::from_u32(0x110000)); + /// ``` #[inline] #[must_use] pub const fn from_u32(i: u32) -> Option { @@ -70,6 +79,7 @@ impl JavaCodePoint { std::mem::transmute(i) } + /// Converts a `char` to a code point. #[inline] #[must_use] pub const fn from_char(char: char) -> JavaCodePoint { @@ -79,6 +89,13 @@ impl JavaCodePoint { } } + /// Converts this code point to a `u32`. + /// + /// ``` + /// # use java_string::JavaCodePoint; + /// assert_eq!(65, JavaCodePoint::from_char('A').as_u32()); + /// assert_eq!(0xd800, JavaCodePoint::from_u32(0xd800).unwrap().as_u32()); + /// ``` #[inline] #[must_use] pub const fn as_u32(self) -> u32 { @@ -97,6 +114,13 @@ impl JavaCodePoint { } } + /// Converts this code point to a `char`. 
+ /// + /// ``` + /// # use java_string::JavaCodePoint; + /// assert_eq!(Some('a'), JavaCodePoint::from_char('a').as_char()); + /// assert_eq!(None, JavaCodePoint::from_u32(0xd800).unwrap().as_char()); + /// ``` #[inline] #[must_use] pub const fn as_char(self) -> Option { @@ -112,6 +136,29 @@ impl JavaCodePoint { char::from_u32_unchecked(self.as_u32()) } + /// See [char::encode_utf16] + /// + /// ``` + /// # use java_string::JavaCodePoint; + /// assert_eq!( + /// 2, + /// JavaCodePoint::from_char('𝕊') + /// .encode_utf16(&mut [0; 2]) + /// .len() + /// ); + /// assert_eq!( + /// 1, + /// JavaCodePoint::from_u32(0xd800) + /// .unwrap() + /// .encode_utf16(&mut [0; 2]) + /// .len() + /// ); + /// ``` + /// ```should_panic + /// # use java_string::JavaCodePoint; + /// // Should panic + /// JavaCodePoint::from_char('𝕊').encode_utf16(&mut [0; 1]); + /// ``` #[inline] pub fn encode_utf16(self, dst: &mut [u16]) -> &mut [u16] { if let Some(char) = self.as_char() { @@ -123,7 +170,29 @@ impl JavaCodePoint { } /// Encodes this `JavaCodePoint` into semi UTF-8, that is, UTF-8 with - /// surrogate code points. + /// surrogate code points. See also [char::encode_utf8]. + /// + /// ``` + /// # use java_string::JavaCodePoint; + /// assert_eq!( + /// 2, + /// JavaCodePoint::from_char('ß') + /// .encode_semi_utf8(&mut [0; 4]) + /// .len() + /// ); + /// assert_eq!( + /// 3, + /// JavaCodePoint::from_u32(0xd800) + /// .unwrap() + /// .encode_semi_utf8(&mut [0; 4]) + /// .len() + /// ); + /// ``` + /// ```should_panic + /// # use java_string::JavaCodePoint; + /// // Should panic + /// JavaCodePoint::from_char('ß').encode_semi_utf8(&mut [0; 1]); + /// ``` #[inline] pub fn encode_semi_utf8(self, dst: &mut [u8]) -> &mut [u8] { let len = self.len_utf8(); @@ -157,6 +226,7 @@ impl JavaCodePoint { &mut dst[..len] } + /// See [char::eq_ignore_ascii_case]. 
#[inline] pub fn eq_ignore_ascii_case(&self, other: &JavaCodePoint) -> bool { match (self.as_char(), other.as_char()) { @@ -166,6 +236,26 @@ impl JavaCodePoint { } } + /// See [char::escape_debug]. + /// + /// ``` + /// # use java_string::JavaCodePoint; + /// assert_eq!( + /// "a", + /// JavaCodePoint::from_char('a').escape_debug().to_string() + /// ); + /// assert_eq!( + /// "\\n", + /// JavaCodePoint::from_char('\n').escape_debug().to_string() + /// ); + /// assert_eq!( + /// "\\u{d800}", + /// JavaCodePoint::from_u32(0xd800) + /// .unwrap() + /// .escape_debug() + /// .to_string() + /// ); + /// ``` #[inline] #[must_use] pub fn escape_debug(self) -> CharEscapeIter { @@ -213,6 +303,26 @@ impl JavaCodePoint { char.escape_debug().next() != Some('\\') } + /// See [char::escape_default]. + /// + /// ``` + /// # use java_string::JavaCodePoint; + /// assert_eq!( + /// "a", + /// JavaCodePoint::from_char('a').escape_default().to_string() + /// ); + /// assert_eq!( + /// "\\n", + /// JavaCodePoint::from_char('\n').escape_default().to_string() + /// ); + /// assert_eq!( + /// "\\u{d800}", + /// JavaCodePoint::from_u32(0xd800) + /// .unwrap() + /// .escape_default() + /// .to_string() + /// ); + /// ``` #[inline] #[must_use] pub fn escape_default(self) -> CharEscapeIter { @@ -238,6 +348,22 @@ impl JavaCodePoint { } } + /// See [char::escape_unicode]. + /// + /// ``` + /// # use java_string::JavaCodePoint; + /// assert_eq!( + /// "\\u{2764}", + /// JavaCodePoint::from_char('❤').escape_unicode().to_string() + /// ); + /// assert_eq!( + /// "\\u{d800}", + /// JavaCodePoint::from_u32(0xd800) + /// .unwrap() + /// .escape_unicode() + /// .to_string() + /// ); + /// ``` #[inline] #[must_use] pub fn escape_unicode(self) -> CharEscapeIter { @@ -266,42 +392,49 @@ impl JavaCodePoint { } } + /// See [char::is_alphabetic]. #[inline] #[must_use] pub fn is_alphabetic(self) -> bool { self.as_char().is_some_and(|char| char.is_alphabetic()) } + /// See [char::is_alphanumeric]. 
#[inline] #[must_use] pub fn is_alphanumeric(self) -> bool { self.as_char().is_some_and(|char| char.is_alphanumeric()) } + /// See [char::is_ascii]. #[inline] #[must_use] pub fn is_ascii(self) -> bool { self.as_u32() <= 0x7f } + /// See [char::is_ascii_alphabetic]. #[inline] #[must_use] pub const fn is_ascii_alphabetic(self) -> bool { self.is_ascii_lowercase() || self.is_ascii_uppercase() } + /// See [char::is_ascii_alphanumeric]. #[inline] #[must_use] pub const fn is_ascii_alphanumeric(self) -> bool { self.is_ascii_alphabetic() || self.is_ascii_digit() } + /// See [char::is_ascii_control]. #[inline] #[must_use] pub const fn is_ascii_control(self) -> bool { matches!(self.as_u32(), 0..=0x1f | 0x7f) } + /// See [char::is_ascii_digit]. #[inline] #[must_use] pub const fn is_ascii_digit(self) -> bool { @@ -310,12 +443,14 @@ impl JavaCodePoint { matches!(self.as_u32(), ZERO..=NINE) } + /// See [char::is_ascii_graphic]. #[inline] #[must_use] pub const fn is_ascii_graphic(self) -> bool { matches!(self.as_u32(), 0x21..=0x7e) } + /// See [char::is_ascii_hexdigit]. #[inline] #[must_use] pub const fn is_ascii_hexdigit(self) -> bool { @@ -326,6 +461,7 @@ impl JavaCodePoint { self.is_ascii_digit() || matches!(self.as_u32(), (LOWER_A..=LOWER_F) | (UPPER_A..=UPPER_F)) } + /// See [char::is_ascii_lowercase]. #[inline] #[must_use] pub const fn is_ascii_lowercase(self) -> bool { @@ -334,6 +470,7 @@ impl JavaCodePoint { matches!(self.as_u32(), A..=Z) } + /// See [char::is_ascii_octdigit]. #[inline] #[must_use] pub const fn is_ascii_octdigit(self) -> bool { @@ -342,6 +479,7 @@ impl JavaCodePoint { matches!(self.as_u32(), ZERO..=SEVEN) } + /// See [char::is_ascii_punctuation]. #[inline] #[must_use] pub const fn is_ascii_punctuation(self) -> bool { @@ -351,6 +489,7 @@ impl JavaCodePoint { ) } + /// See [char::is_ascii_uppercase]. 
#[inline] #[must_use] pub const fn is_ascii_uppercase(self) -> bool { @@ -359,6 +498,7 @@ impl JavaCodePoint { matches!(self.as_u32(), A..=Z) } + /// See [char::is_ascii_whitespace]. #[inline] #[must_use] pub const fn is_ascii_whitespace(self) -> bool { @@ -373,42 +513,61 @@ impl JavaCodePoint { ) } + /// See [char::is_control]. #[inline] #[must_use] pub fn is_control(self) -> bool { self.as_char().is_some_and(|char| char.is_control()) } + /// See [char::is_digit]. #[inline] #[must_use] pub fn is_digit(self, radix: u32) -> bool { self.to_digit(radix).is_some() } + /// See [char::is_lowercase]. #[inline] #[must_use] pub fn is_lowercase(self) -> bool { self.as_char().is_some_and(|char| char.is_lowercase()) } + /// See [char::is_numeric]. #[inline] #[must_use] pub fn is_numeric(self) -> bool { self.as_char().is_some_and(|char| char.is_numeric()) } + /// See [char::is_uppercase]. #[inline] #[must_use] pub fn is_uppercase(self) -> bool { self.as_char().is_some_and(|char| char.is_uppercase()) } + /// See [char::is_whitespace]. #[inline] #[must_use] pub fn is_whitespace(self) -> bool { self.as_char().is_some_and(|char| char.is_whitespace()) } + /// See [char::len_utf16]. Surrogate code points return 1. + /// + /// ``` + /// # use java_string::JavaCodePoint; + /// + /// let n = JavaCodePoint::from_char('ß').len_utf16(); + /// assert_eq!(n, 1); + /// + /// let len = JavaCodePoint::from_char('💣').len_utf16(); + /// assert_eq!(len, 2); + /// + /// assert_eq!(1, JavaCodePoint::from_u32(0xd800).unwrap().len_utf16()); + /// ``` #[inline] #[must_use] pub const fn len_utf16(self) -> usize { @@ -419,6 +578,26 @@ impl JavaCodePoint { } } + /// See [char::len_utf8]. Surrogate code points return 3. 
+ /// + /// ``` + /// # use java_string::JavaCodePoint; + /// + /// let len = JavaCodePoint::from_char('A').len_utf8(); + /// assert_eq!(len, 1); + /// + /// let len = JavaCodePoint::from_char('ß').len_utf8(); + /// assert_eq!(len, 2); + /// + /// let len = JavaCodePoint::from_char('ℝ').len_utf8(); + /// assert_eq!(len, 3); + /// + /// let len = JavaCodePoint::from_char('💣').len_utf8(); + /// assert_eq!(len, 4); + /// + /// let len = JavaCodePoint::from_u32(0xd800).unwrap().len_utf8(); + /// assert_eq!(len, 3); + /// ``` #[inline] #[must_use] pub const fn len_utf8(self) -> usize { @@ -429,16 +608,29 @@ impl JavaCodePoint { } } + /// See [char::make_ascii_lowercase]. #[inline] pub fn make_ascii_lowercase(&mut self) { *self = self.to_ascii_lowercase(); } + /// See [char::make_ascii_uppercase]. #[inline] pub fn make_ascii_uppercase(&mut self) { *self = self.to_ascii_uppercase(); } + /// See [char::to_ascii_lowercase]. + /// + /// ``` + /// # use java_string::JavaCodePoint; + /// + /// let ascii = JavaCodePoint::from_char('A'); + /// let non_ascii = JavaCodePoint::from_char('❤'); + /// + /// assert_eq!('a', ascii.to_ascii_lowercase()); + /// assert_eq!('❤', non_ascii.to_ascii_lowercase()); + /// ``` #[inline] #[must_use] pub const fn to_ascii_lowercase(self) -> JavaCodePoint { @@ -452,6 +644,17 @@ impl JavaCodePoint { } } + /// See [char::to_ascii_uppercase]. + /// + /// ``` + /// # use java_string::JavaCodePoint; + /// + /// let ascii = JavaCodePoint::from_char('a'); + /// let non_ascii = JavaCodePoint::from_char('❤'); + /// + /// assert_eq!('A', ascii.to_ascii_uppercase()); + /// assert_eq!('❤', non_ascii.to_ascii_uppercase()); + /// ``` #[inline] #[must_use] pub const fn to_ascii_uppercase(self) -> JavaCodePoint { @@ -465,6 +668,7 @@ impl JavaCodePoint { } } + /// See [char::to_digit]. #[inline] #[must_use] pub const fn to_digit(self, radix: u32) -> Option { @@ -475,6 +679,7 @@ impl JavaCodePoint { } } + /// See [char::to_lowercase]. 
#[inline] #[must_use] pub fn to_lowercase(self) -> ToLowercase { @@ -484,6 +689,7 @@ impl JavaCodePoint { } } + /// See [char::to_uppercase]. #[inline] #[must_use] pub fn to_uppercase(self) -> ToUppercase { @@ -616,6 +822,15 @@ enum EscapeIterInner { Escaped(EscapeIterEscaped), } +impl Display for EscapeIterInner { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + match self { + EscapeIterInner::Printable(char) => char.clone().try_for_each(|ch| f.write_char(ch)), + EscapeIterInner::Escaped(escaped) => Display::fmt(escaped, f), + } + } +} + impl CharEscapeIter { #[inline] fn printable(char: char) -> Self { @@ -672,6 +887,12 @@ impl ExactSizeIterator for CharEscapeIter { impl FusedIterator for CharEscapeIter {} +impl Display for CharEscapeIter { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + Display::fmt(&self.inner, f) + } +} + #[derive(Clone, Debug)] struct EscapeIterEscaped { // SAFETY: all values must be in the ASCII range @@ -711,9 +932,9 @@ impl ExactSizeIterator for EscapeIterEscaped { impl FusedIterator for EscapeIterEscaped {} -impl fmt::Display for EscapeIterEscaped { +impl Display for EscapeIterEscaped { #[inline] - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { let str = unsafe { // SAFETY: all bytes are in ASCII range, and range is in bounds for length 10 std::str::from_utf8_unchecked(self.bytes.get_unchecked(self.range.clone())) diff --git a/crates/java_string/src/iter.rs b/crates/java_string/src/iter.rs index e8c8469e8..3762f6d72 100644 --- a/crates/java_string/src/iter.rs +++ b/crates/java_string/src/iter.rs @@ -586,10 +586,10 @@ where unsafe { substr.chars().next_back().unwrap_unchecked().len_utf8() }; let popped_str = unsafe { substr.get_unchecked(..substr.len() - last_char_len) }; - self.pat.find_in(popped_str) + self.pat.rfind_in(popped_str) } } else { - self.pat.find_in(substr) + self.pat.rfind_in(substr) }; self.had_empty_match = result.is_some_and(|(_, len)| 
len == 0); diff --git a/crates/java_string/src/owned.rs b/crates/java_string/src/owned.rs index 7c332f4e7..67c700c41 100644 --- a/crates/java_string/src/owned.rs +++ b/crates/java_string/src/owned.rs @@ -39,7 +39,7 @@ impl JavaString { } /// Converts `vec` to a `JavaString` if it is fully-valid UTF-8, i.e. UTF-8 - /// without surrogate code points. + /// without surrogate code points. See [String::from_utf8]. #[inline] pub fn from_full_utf8(vec: Vec) -> Result { match std::str::from_utf8(&vec) { @@ -53,6 +53,24 @@ impl JavaString { /// Converts `vec` to a `JavaString` if it is semi-valid UTF-8, i.e. UTF-8 /// with surrogate code points. + /// + /// ``` + /// # use java_string::{JavaCodePoint, JavaString}; + /// + /// assert_eq!( + /// JavaString::from_semi_utf8(b"Hello World!".to_vec()).unwrap(), + /// "Hello World!" + /// ); + /// assert_eq!( + /// JavaString::from_semi_utf8(vec![0xf0, 0x9f, 0x92, 0x96]).unwrap(), + /// "💖" + /// ); + /// assert_eq!( + /// JavaString::from_semi_utf8(vec![0xed, 0xa0, 0x80]).unwrap(), + /// JavaString::from(JavaCodePoint::from_u32(0xd800).unwrap()) + /// ); + /// assert!(JavaString::from_semi_utf8(vec![0xed]).is_err()); + /// ``` pub fn from_semi_utf8(vec: Vec) -> Result { match run_utf8_semi_validation(&vec) { Ok(..) => Ok(JavaString { vec }), @@ -65,6 +83,21 @@ impl JavaString { /// Converts `v` to a `Cow`, replacing invalid semi-UTF-8 with the /// replacement character �. 
+ /// + /// ``` + /// # use std::borrow::Cow; + /// # use java_string::{JavaStr, JavaString}; + /// + /// let sparkle_heart = [0xf0, 0x9f, 0x92, 0x96]; + /// let result = JavaString::from_semi_utf8_lossy(&sparkle_heart); + /// assert!(matches!(result, Cow::Borrowed(_))); + /// assert_eq!(result, JavaStr::from_str("💖")); + /// + /// let foobar_with_error = [b'f', b'o', b'o', 0xed, b'b', b'a', b'r']; + /// let result = JavaString::from_semi_utf8_lossy(&foobar_with_error); + /// assert!(matches!(result, Cow::Owned(_))); + /// assert_eq!(result, JavaStr::from_str("foo�bar")); + /// ``` #[must_use] pub fn from_semi_utf8_lossy(v: &[u8]) -> Cow<'_, JavaStr> { const REPLACEMENT: &str = "\u{FFFD}"; @@ -119,12 +152,14 @@ impl JavaString { JavaString { vec: bytes } } + /// See [String::into_bytes]. #[inline] #[must_use] pub fn into_bytes(self) -> Vec { self.vec } + /// See [String::as_str]. #[inline] #[must_use] pub fn as_java_str(&self) -> &JavaStr { @@ -134,6 +169,7 @@ impl JavaString { } } + /// See [String::as_mut_str]. #[inline] #[must_use] pub fn as_mut_java_str(&mut self) -> &mut JavaStr { @@ -145,6 +181,23 @@ impl JavaString { /// Tries to convert this `JavaString` to a `String`, returning an error if /// it is not fully valid UTF-8, i.e. has no surrogate code points. + /// + /// ``` + /// # use java_string::{JavaCodePoint, JavaString}; + /// + /// assert_eq!( + /// JavaString::from("Hello World!").into_string().unwrap(), + /// "Hello World!" 
+ /// ); + /// assert_eq!( + /// JavaString::from("abc\0ℝ💣").into_string().unwrap(), + /// "abc\0ℝ💣" + /// ); + /// + /// let string_with_error = JavaString::from("abc") + /// + JavaString::from(JavaCodePoint::from_u32(0xd800).unwrap()).as_java_str(); + /// assert!(string_with_error.into_string().is_err()); + /// ``` pub fn into_string(self) -> Result { run_utf8_full_validation_from_semi(self.as_bytes()).map(|_| unsafe { // SAFETY: validation succeeded @@ -163,52 +216,62 @@ impl JavaString { String::from_utf8_unchecked(self.vec) } + /// See [String::push_str]. #[inline] pub fn push_java_str(&mut self, string: &JavaStr) { self.vec.extend_from_slice(string.as_bytes()) } + /// See [String::push_str]. #[inline] pub fn push_str(&mut self, string: &str) { self.vec.extend_from_slice(string.as_bytes()) } + /// See [String::capacity]. #[inline] #[must_use] pub fn capacity(&self) -> usize { self.vec.capacity() } + /// See [String::reserve]. #[inline] pub fn reserve(&mut self, additional: usize) { self.vec.reserve(additional) } + /// See [String::reserve_exact]. #[inline] pub fn reserve_exact(&mut self, additional: usize) { self.vec.reserve_exact(additional) } + /// See [String::try_reserve]. #[inline] pub fn try_reserve(&mut self, additional: usize) -> Result<(), TryReserveError> { self.vec.try_reserve(additional) } + /// See [String::try_reserve_exact]. #[inline] pub fn try_reserve_exact(&mut self, additional: usize) -> Result<(), TryReserveError> { self.vec.try_reserve_exact(additional) } + /// See [String::shrink_to_fit]. #[inline] pub fn shrink_to_fit(&mut self) { self.vec.shrink_to_fit() } + /// See [String::shrink_to]. #[inline] pub fn shrink_to(&mut self, min_capacity: usize) { self.vec.shrink_to(min_capacity) } + /// See [String::push]. #[inline] pub fn push(&mut self, ch: char) { match ch.len_utf8() { @@ -219,6 +282,7 @@ impl JavaString { } } + /// See [String::push]. 
#[inline] pub fn push_java(&mut self, ch: JavaCodePoint) { match ch.len_utf8() { @@ -227,12 +291,14 @@ impl JavaString { } } + /// See [String::as_bytes]. #[inline] #[must_use] pub fn as_bytes(&self) -> &[u8] { &self.vec } + /// See [String::truncate]. #[inline] pub fn truncate(&mut self, new_len: usize) { if new_len <= self.len() { @@ -241,6 +307,21 @@ impl JavaString { } } + /// See [String::pop]. + /// + /// ``` + /// # use java_string::JavaString; + /// + /// let mut str = JavaString::from("Hello World!"); + /// assert_eq!(str.pop().unwrap(), '!'); + /// assert_eq!(str, "Hello World"); + /// + /// let mut str = JavaString::from("東京"); + /// assert_eq!(str.pop().unwrap(), '京'); + /// assert_eq!(str, "東"); + /// + /// assert!(JavaString::new().pop().is_none()); + /// ``` #[inline] pub fn pop(&mut self) -> Option { let ch = self.chars().next_back()?; @@ -251,6 +332,29 @@ impl JavaString { Some(ch) } + /// See [String::remove]. + /// + /// ``` + /// # use java_string::JavaString; + /// + /// let mut str = JavaString::from("Hello World!"); + /// assert_eq!(str.remove(5), ' '); + /// assert_eq!(str, "HelloWorld!"); + /// + /// let mut str = JavaString::from("Hello 🦀 World!"); + /// assert_eq!(str.remove(6), '🦀'); + /// assert_eq!(str, "Hello World!"); + /// ``` + /// ```should_panic + /// # use java_string::JavaString; + /// // Should panic + /// JavaString::new().remove(0); + /// ``` + /// ```should_panic + /// # use java_string::JavaString; + /// // Should panic + /// JavaString::from("🦀").remove(1); + /// ``` #[inline] pub fn remove(&mut self, idx: usize) -> JavaCodePoint { let ch = match self[idx..].chars().next() { @@ -271,6 +375,17 @@ impl JavaString { ch } + /// See [String::retain]. 
+ /// + /// ``` + /// # use java_string::{JavaCodePoint, JavaString}; + /// + /// let mut str = JavaString::from("Hello 🦀 World!"); + /// str.retain(|ch| !ch.is_ascii_uppercase()); + /// assert_eq!(str, "ello 🦀 orld!"); + /// str.retain(JavaCodePoint::is_ascii); + /// assert_eq!(str, "ello orld!"); + /// ``` #[inline] pub fn retain(&mut self, mut f: F) where @@ -337,6 +452,16 @@ impl JavaString { drop(guard); } + /// See [String::insert]. + /// + /// ``` + /// # use java_string::JavaString; + /// let mut s = JavaString::from("foo"); + /// s.insert(3, 'a'); + /// s.insert(4, 'r'); + /// s.insert(3, 'b'); + /// assert_eq!(s, "foobar"); + /// ``` #[inline] pub fn insert(&mut self, idx: usize, ch: char) { assert!(self.is_char_boundary(idx)); @@ -348,6 +473,7 @@ impl JavaString { } } + /// See [String::insert]. #[inline] pub fn insert_java(&mut self, idx: usize, ch: JavaCodePoint) { assert!(self.is_char_boundary(idx)); @@ -376,6 +502,14 @@ impl JavaString { } } + /// See [String::insert_str]. + /// + /// ``` + /// # use java_string::JavaString; + /// let mut s = JavaString::from("bar"); + /// s.insert_str(0, "foo"); + /// assert_eq!(s, "foobar"); + /// ``` #[inline] pub fn insert_str(&mut self, idx: usize, string: &str) { assert!(self.is_char_boundary(idx)); @@ -385,6 +519,7 @@ impl JavaString { } } + /// See [String::insert_str]. pub fn insert_java_str(&mut self, idx: usize, string: &JavaStr) { assert!(self.is_char_boundary(idx)); @@ -393,6 +528,8 @@ impl JavaString { } } + /// See [String::as_mut_vec]. + /// /// # Safety /// /// The returned `Vec` must not have invalid UTF-8 written to it, besides @@ -402,18 +539,35 @@ impl JavaString { &mut self.vec } + /// See [String::len]. #[inline] #[must_use] pub fn len(&self) -> usize { self.vec.len() } + /// See [String::is_empty]. #[inline] #[must_use] pub fn is_empty(&self) -> bool { self.len() == 0 } + /// See [String::split_off].
+ /// + /// ``` + /// # use java_string::JavaString; + /// let mut hello = JavaString::from("Hello World!"); + /// let world = hello.split_off(6); + /// assert_eq!(hello, "Hello "); + /// assert_eq!(world, "World!"); + /// ``` + /// ```should_panic + /// # use java_string::JavaString; + /// let mut s = JavaString::from("🦀"); + /// // Should panic + /// let _ = s.split_off(1); + /// ``` #[inline] #[must_use] pub fn split_off(&mut self, at: usize) -> JavaString { @@ -422,11 +576,29 @@ impl JavaString { unsafe { JavaString::from_semi_utf8_unchecked(other) } } + /// See [String::clear]. #[inline] pub fn clear(&mut self) { self.vec.clear(); } + /// See [String::drain]. + /// + /// ``` + /// # use java_string::JavaString; + /// + /// let mut s = JavaString::from("α is alpha, β is beta"); + /// let beta_offset = s.find('β').unwrap_or(s.len()); + /// + /// // Remove the range up until the β from the string + /// let t: JavaString = s.drain(..beta_offset).collect(); + /// assert_eq!(t, "α is alpha, "); + /// assert_eq!(s, "β is beta"); + /// + /// // A full range clears the string, like `clear()` does + /// s.drain(..); + /// assert_eq!(s, ""); + /// ``` #[inline] pub fn drain(&mut self, range: R) -> Drain<'_> where @@ -452,6 +624,24 @@ impl JavaString { } } + /// See [String::replace_range]. 
+ /// + /// ``` + /// # use java_string::JavaString; + /// + /// let mut s = JavaString::from("α is alpha, β is beta"); + /// let beta_offset = s.find('β').unwrap_or(s.len()); + /// + /// // Replace the range up until the β from the string + /// s.replace_range(..beta_offset, "Α is capital alpha; "); + /// assert_eq!(s, "Α is capital alpha; β is beta"); + /// ``` + /// ```should_panic + /// # use java_string::JavaString; + /// let mut s = JavaString::from("α is alpha, β is beta"); + /// // Should panic + /// s.replace_range(..1, "Α is capital alpha; "); + /// ``` pub fn replace_range(&mut self, range: R, replace_with: &str) where R: RangeBounds, @@ -459,6 +649,7 @@ impl JavaString { self.replace_range_java(range, JavaStr::from_str(replace_with)) } + /// See [String::replace_range]. pub fn replace_range_java(&mut self, range: R, replace_with: &JavaStr) where R: RangeBounds, @@ -479,6 +670,7 @@ impl JavaString { unsafe { self.as_mut_vec() }.splice((start, end), replace_with.bytes()); } + /// See [String::into_boxed_str]. #[inline] #[must_use] pub fn into_boxed_str(self) -> Box { @@ -486,6 +678,7 @@ impl JavaString { unsafe { JavaStr::from_boxed_semi_utf8_unchecked(slice) } } + /// See [String::leak]. #[inline] pub fn leak<'a>(self) -> &'a mut JavaStr { let slice = self.vec.leak(); diff --git a/crates/java_string/src/slice.rs b/crates/java_string/src/slice.rs index 5d5d91e7d..104df4228 100644 --- a/crates/java_string/src/slice.rs +++ b/crates/java_string/src/slice.rs @@ -31,7 +31,7 @@ pub struct JavaStr { impl JavaStr { /// Converts `v` to a `&JavaStr` if it is fully-valid UTF-8, i.e. UTF-8 - /// without surrogate code points. + /// without surrogate code points. See [std::str::from_utf8]. #[inline] pub const fn from_full_utf8(v: &[u8]) -> Result<&JavaStr, Utf8Error> { match std::str::from_utf8(v) { @@ -41,7 +41,7 @@ impl JavaStr { } /// Converts `v` to a `&mut JavaStr` if it is fully-valid UTF-8, i.e. UTF-8 - /// without surrogate code points. 
+ /// without surrogate code points. See [std::str::from_utf8_mut]. #[inline] pub fn from_full_utf8_mut(v: &mut [u8]) -> Result<&mut JavaStr, Utf8Error> { match std::str::from_utf8_mut(v) { @@ -126,12 +126,15 @@ impl JavaStr { unsafe { Box::from_raw(Box::into_raw(v) as *mut JavaStr) } } + /// See [str::as_bytes]. #[inline] #[must_use] pub const fn as_bytes(&self) -> &[u8] { &self.inner } + /// See [str::as_bytes_mut]. + /// /// # Safety /// /// The returned slice must not have invalid UTF-8 written to it, besides @@ -142,12 +145,14 @@ impl JavaStr { &mut self.inner } + /// See [str::as_mut_ptr]. #[inline] #[must_use] pub fn as_mut_ptr(&mut self) -> *mut u8 { self.inner.as_mut_ptr() } + /// See [str::as_ptr]. #[inline] #[must_use] pub const fn as_ptr(&self) -> *const u8 { @@ -179,6 +184,22 @@ impl JavaStr { /// Converts this `&JavaStr` to a `Cow`, replacing surrogate code /// points with the replacement character �. + /// + /// ``` + /// # use std::borrow::Cow; + /// # use java_string::{JavaCodePoint, JavaStr, JavaString}; + /// let s = JavaStr::from_str("Hello 🦀 World!"); + /// let result = s.as_str_lossy(); + /// assert!(matches!(result, Cow::Borrowed(_))); + /// assert_eq!(result, "Hello 🦀 World!"); + /// + /// let s = JavaString::from("Hello ") + /// + JavaString::from(JavaCodePoint::from_u32(0xd800).unwrap()).as_java_str() + /// + JavaStr::from_str(" World!"); + /// let result = s.as_str_lossy(); + /// assert!(matches!(result, Cow::Owned(_))); + /// assert_eq!(result, "Hello � World!"); + /// ``` #[must_use] pub fn as_str_lossy(&self) -> Cow<'_, str> { match run_utf8_full_validation_from_semi(self.as_bytes()) { @@ -198,6 +219,7 @@ impl JavaStr { } } + /// See [str::bytes]. #[inline] pub fn bytes(&self) -> Bytes<'_> { Bytes { @@ -205,6 +227,7 @@ impl JavaStr { } } + /// See [str::char_indices]. #[inline] pub fn char_indices(&self) -> CharIndices<'_> { CharIndices { @@ -213,6 +236,7 @@ impl JavaStr { } } + /// See [str::chars]. 
#[inline] pub fn chars(&self) -> Chars<'_> { Chars { @@ -220,6 +244,15 @@ impl JavaStr { } } + /// See [str::contains]. + /// + /// ``` + /// # use java_string::JavaStr; + /// let bananas = JavaStr::from_str("bananas"); + /// + /// assert!(bananas.contains("nana")); + /// assert!(!bananas.contains("apples")); + /// ``` #[inline] #[must_use] pub fn contains

(&self, mut pat: P) -> bool @@ -229,6 +262,15 @@ impl JavaStr { pat.find_in(self).is_some() } + /// See [str::ends_with]. + /// + /// ``` + /// # use java_string::JavaStr; + /// let bananas = JavaStr::from_str("bananas"); + /// + /// assert!(bananas.ends_with("anas")); + /// assert!(!bananas.ends_with("nana")); + /// ``` #[inline] #[must_use] pub fn ends_with

(&self, mut pat: P) -> bool @@ -238,18 +280,29 @@ impl JavaStr { pat.suffix_len_in(self).is_some() } + /// See [str::eq_ignore_ascii_case]. #[inline] #[must_use] pub fn eq_ignore_ascii_case(&self, other: &str) -> bool { self.as_bytes().eq_ignore_ascii_case(other.as_bytes()) } + /// See [str::eq_ignore_ascii_case]. #[inline] #[must_use] pub fn eq_java_ignore_ascii_case(&self, other: &JavaStr) -> bool { self.as_bytes().eq_ignore_ascii_case(other.as_bytes()) } + /// See [str::escape_debug]. + /// + /// ``` + /// # use java_string::JavaStr; + /// assert_eq!( + /// JavaStr::from_str("❤\n!").escape_debug().to_string(), + /// "❤\\n!" + /// ); + /// ``` #[inline] pub fn escape_debug(&self) -> EscapeDebug<'_> { #[inline] @@ -275,6 +328,15 @@ impl JavaStr { } } + /// See [str::escape_default]. + /// + /// ``` + /// # use java_string::JavaStr; + /// assert_eq!( + /// JavaStr::from_str("❤\n!").escape_default().to_string(), + /// "\\u{2764}\\n!" + /// ); + /// ``` #[inline] pub fn escape_default(&self) -> EscapeDefault<'_> { EscapeDefault { @@ -282,6 +344,15 @@ impl JavaStr { } } + /// See [str::escape_unicode]. + /// + /// ``` + /// # use java_string::JavaStr; + /// assert_eq!( + /// JavaStr::from_str("❤\n!").escape_unicode().to_string(), + /// "\\u{2764}\\u{a}\\u{21}" + /// ); + /// ``` #[inline] pub fn escape_unicode(&self) -> EscapeUnicode<'_> { EscapeUnicode { @@ -289,6 +360,18 @@ impl JavaStr { } } + /// See [str::find]. + /// + /// ``` + /// let s = java_string::JavaStr::from_str("Löwe 老虎 Léopard Gepardi"); + /// + /// assert_eq!(s.find('L'), Some(0)); + /// assert_eq!(s.find('é'), Some(14)); + /// assert_eq!(s.find("pard"), Some(17)); + /// + /// let x: &[_] = &['1', '2']; + /// assert_eq!(s.find(x), None); + /// ``` #[inline] #[must_use] pub fn find

(&self, mut pat: P) -> Option @@ -298,6 +381,21 @@ impl JavaStr { pat.find_in(self).map(|(index, _)| index) } + /// See [str::get]. + /// + /// ``` + /// # use java_string::{JavaStr, JavaString}; + /// let v = JavaString::from("🗻∈🌏"); + /// + /// assert_eq!(Some(JavaStr::from_str("🗻")), v.get(0..4)); + /// + /// // indices not on UTF-8 sequence boundaries + /// assert!(v.get(1..).is_none()); + /// assert!(v.get(..8).is_none()); + /// + /// // out of bounds + /// assert!(v.get(..42).is_none()); + /// ``` #[inline] #[must_use] pub fn get(&self, i: I) -> Option<&JavaStr> @@ -307,6 +405,7 @@ impl JavaStr { i.get(self) } + /// See [str::get_mut]. #[inline] #[must_use] pub fn get_mut(&mut self, i: I) -> Option<&mut JavaStr> @@ -316,6 +415,8 @@ impl JavaStr { i.get_mut(self) } + /// See [str::get_unchecked]. + /// /// # Safety /// /// - The starting index must not exceed the ending index @@ -330,6 +431,8 @@ impl JavaStr { unsafe { &*i.get_unchecked(self) } } + /// See [str::get_unchecked_mut]. + /// /// # Safety /// /// - The starting index must not exceed the ending index @@ -344,12 +447,14 @@ impl JavaStr { unsafe { &mut *i.get_unchecked_mut(self) } } + /// See [str::into_boxed_bytes]. #[inline] #[must_use] pub fn into_boxed_bytes(self: Box) -> Box<[u8]> { unsafe { Box::from_raw(Box::into_raw(self) as *mut [u8]) } } + /// See [str::into_string]. #[inline] #[must_use] pub fn into_string(self: Box) -> JavaString { @@ -357,12 +462,14 @@ impl JavaStr { unsafe { JavaString::from_semi_utf8_unchecked(slice.into_vec()) } } + /// See [str::is_ascii]. #[inline] #[must_use] pub fn is_ascii(&self) -> bool { self.as_bytes().is_ascii() } + /// See [str::is_char_boundary]. #[inline] #[must_use] pub fn is_char_boundary(&self, index: usize) -> bool { @@ -406,18 +513,21 @@ impl JavaStr { } } + /// See [str::is_empty]. #[inline] #[must_use] pub fn is_empty(&self) -> bool { self.len() == 0 } + /// See [str::len]. 
#[inline] #[must_use] pub fn len(&self) -> usize { self.inner.len() } + /// See [str::lines]. #[inline] pub fn lines(&self) -> Lines<'_> { Lines { @@ -433,6 +543,7 @@ impl JavaStr { } } + /// See [str::make_ascii_lowercase]. #[inline] pub fn make_ascii_lowercase(&mut self) { // SAFETY: changing ASCII letters only does not invalidate UTF-8. @@ -440,6 +551,7 @@ impl JavaStr { me.make_ascii_lowercase() } + /// See [str::make_ascii_uppercase]. #[inline] pub fn make_ascii_uppercase(&mut self) { // SAFETY: changing ASCII letters only does not invalidate UTF-8. @@ -447,6 +559,31 @@ impl JavaStr { me.make_ascii_uppercase() } + /// See [str::match_indices]. + /// + /// ``` + /// # use java_string::JavaStr; + /// let v: Vec<_> = JavaStr::from_str("abcXXXabcYYYabc") + /// .match_indices("abc") + /// .collect(); + /// assert_eq!( + /// v, + /// [ + /// (0, JavaStr::from_str("abc")), + /// (6, JavaStr::from_str("abc")), + /// (12, JavaStr::from_str("abc")) + /// ] + /// ); + /// + /// let v: Vec<_> = JavaStr::from_str("1abcabc2").match_indices("abc").collect(); + /// assert_eq!( + /// v, + /// [(1, JavaStr::from_str("abc")), (4, JavaStr::from_str("abc"))] + /// ); + /// + /// let v: Vec<_> = JavaStr::from_str("ababa").match_indices("aba").collect(); + /// assert_eq!(v, [(0, JavaStr::from_str("aba"))]); // only the first `aba` + /// ``` #[inline] pub fn match_indices

(&self, pat: P) -> MatchIndices

where @@ -459,6 +596,34 @@ impl JavaStr { } } + /// See [str::matches]. + /// + /// ``` + /// # use java_string::{JavaCodePoint, JavaStr}; + /// let v: Vec<&JavaStr> = JavaStr::from_str("abcXXXabcYYYabc") + /// .matches("abc") + /// .collect(); + /// assert_eq!( + /// v, + /// [ + /// JavaStr::from_str("abc"), + /// JavaStr::from_str("abc"), + /// JavaStr::from_str("abc") + /// ] + /// ); + /// + /// let v: Vec<&JavaStr> = JavaStr::from_str("1abc2abc3") + /// .matches(JavaCodePoint::is_numeric) + /// .collect(); + /// assert_eq!( + /// v, + /// [ + /// JavaStr::from_str("1"), + /// JavaStr::from_str("2"), + /// JavaStr::from_str("3") + /// ] + /// ); + /// ``` #[inline] pub fn matches

(&self, pat: P) -> Matches

where @@ -467,6 +632,7 @@ impl JavaStr { Matches { str: self, pat } } + /// See [str::parse]. #[inline] pub fn parse(&self) -> Result::Err>> where @@ -478,12 +644,22 @@ impl JavaStr { } } + /// See [str::repeat]. #[inline] #[must_use] pub fn repeat(&self, n: usize) -> JavaString { unsafe { JavaString::from_semi_utf8_unchecked(self.as_bytes().repeat(n)) } } + /// See [str::replace]. + /// + /// ``` + /// # use java_string::JavaStr; + /// let s = JavaStr::from_str("this is old"); + /// + /// assert_eq!("this is new", s.replace("old", "new")); + /// assert_eq!("than an old", s.replace("is", "an")); + /// ``` #[inline] #[must_use] pub fn replace

(&self, from: P, to: &str) -> JavaString @@ -493,6 +669,7 @@ impl JavaStr { self.replace_java(from, JavaStr::from_str(to)) } + /// See [str::replace]. #[inline] #[must_use] pub fn replace_java

(&self, from: P, to: &JavaStr) -> JavaString @@ -510,6 +687,18 @@ impl JavaStr { result } + /// See [str::replacen]. + /// + /// ``` + /// # use java_string::{JavaCodePoint, JavaStr}; + /// let s = JavaStr::from_str("foo foo 123 foo"); + /// assert_eq!("new new 123 foo", s.replacen("foo", "new", 2)); + /// assert_eq!("faa fao 123 foo", s.replacen('o', "a", 3)); + /// assert_eq!( + /// "foo foo new23 foo", + /// s.replacen(JavaCodePoint::is_numeric, "new", 1) + /// ); + /// ``` #[inline] #[must_use] pub fn replacen

(&self, from: P, to: &str, count: usize) -> JavaString @@ -519,6 +708,7 @@ impl JavaStr { self.replacen_java(from, JavaStr::from_str(to), count) } + /// See [str::replacen]. #[inline] #[must_use] pub fn replacen_java

(&self, from: P, to: &JavaStr, count: usize) -> JavaString @@ -537,6 +727,19 @@ impl JavaStr { result } + /// See [str::rfind]. + /// + /// ``` + /// # use java_string::JavaStr; + /// let s = JavaStr::from_str("Löwe 老虎 Léopard Gepardi"); + /// + /// assert_eq!(s.rfind('L'), Some(13)); + /// assert_eq!(s.rfind('é'), Some(14)); + /// assert_eq!(s.rfind("pard"), Some(24)); + /// + /// let x: &[_] = &['1', '2']; + /// assert_eq!(s.rfind(x), None); + /// ``` #[inline] #[must_use] pub fn rfind

(&self, mut pat: P) -> Option @@ -546,6 +749,33 @@ impl JavaStr { pat.rfind_in(self).map(|(index, _)| index) } + /// See [str::rmatch_indices]. + /// + /// ``` + /// # use java_string::JavaStr; + /// let v: Vec<_> = JavaStr::from_str("abcXXXabcYYYabc") + /// .rmatch_indices("abc") + /// .collect(); + /// assert_eq!( + /// v, + /// [ + /// (12, JavaStr::from_str("abc")), + /// (6, JavaStr::from_str("abc")), + /// (0, JavaStr::from_str("abc")) + /// ] + /// ); + /// + /// let v: Vec<_> = JavaStr::from_str("1abcabc2") + /// .rmatch_indices("abc") + /// .collect(); + /// assert_eq!( + /// v, + /// [(4, JavaStr::from_str("abc")), (1, JavaStr::from_str("abc"))] + /// ); + /// + /// let v: Vec<_> = JavaStr::from_str("ababa").rmatch_indices("aba").collect(); + /// assert_eq!(v, [(2, JavaStr::from_str("aba"))]); // only the last `aba` + /// ``` #[inline] pub fn rmatch_indices

(&self, pat: P) -> RMatchIndices

where @@ -556,6 +786,34 @@ impl JavaStr { } } + /// See [str::rmatches]. + /// + /// ``` + /// # use java_string::{JavaCodePoint, JavaStr}; + /// let v: Vec<&JavaStr> = JavaStr::from_str("abcXXXabcYYYabc") + /// .rmatches("abc") + /// .collect(); + /// assert_eq!( + /// v, + /// [ + /// JavaStr::from_str("abc"), + /// JavaStr::from_str("abc"), + /// JavaStr::from_str("abc") + /// ] + /// ); + /// + /// let v: Vec<&JavaStr> = JavaStr::from_str("1abc2abc3") + /// .rmatches(JavaCodePoint::is_numeric) + /// .collect(); + /// assert_eq!( + /// v, + /// [ + /// JavaStr::from_str("3"), + /// JavaStr::from_str("2"), + /// JavaStr::from_str("1") + /// ] + /// ); + /// ``` #[inline] pub fn rmatches

(&self, pat: P) -> RMatches

where @@ -566,6 +824,52 @@ impl JavaStr { } } + /// See [str::rsplit]. + /// + /// ``` + /// # use java_string::JavaStr; + /// let v: Vec<&JavaStr> = JavaStr::from_str("Mary had a little lamb") + /// .rsplit(' ') + /// .collect(); + /// assert_eq!( + /// v, + /// [ + /// JavaStr::from_str("lamb"), + /// JavaStr::from_str("little"), + /// JavaStr::from_str("a"), + /// JavaStr::from_str("had"), + /// JavaStr::from_str("Mary") + /// ] + /// ); + /// + /// let v: Vec<&JavaStr> = JavaStr::from_str("").rsplit('X').collect(); + /// assert_eq!(v, [JavaStr::from_str("")]); + /// + /// let v: Vec<&JavaStr> = JavaStr::from_str("lionXXtigerXleopard") + /// .rsplit('X') + /// .collect(); + /// assert_eq!( + /// v, + /// [ + /// JavaStr::from_str("leopard"), + /// JavaStr::from_str("tiger"), + /// JavaStr::from_str(""), + /// JavaStr::from_str("lion") + /// ] + /// ); + /// + /// let v: Vec<&JavaStr> = JavaStr::from_str("lion::tiger::leopard") + /// .rsplit("::") + /// .collect(); + /// assert_eq!( + /// v, + /// [ + /// JavaStr::from_str("leopard"), + /// JavaStr::from_str("tiger"), + /// JavaStr::from_str("lion") + /// ] + /// ); + /// ``` #[inline] pub fn rsplit

(&self, pat: P) -> RSplit

where @@ -574,6 +878,20 @@ impl JavaStr { RSplit::new(self, pat) } + /// See [str::rsplit_once]. + /// + /// ``` + /// # use java_string::JavaStr; + /// assert_eq!(JavaStr::from_str("cfg").rsplit_once('='), None); + /// assert_eq!( + /// JavaStr::from_str("cfg=foo").rsplit_once('='), + /// Some((JavaStr::from_str("cfg"), JavaStr::from_str("foo"))) + /// ); + /// assert_eq!( + /// JavaStr::from_str("cfg=foo=bar").rsplit_once('='), + /// Some((JavaStr::from_str("cfg=foo"), JavaStr::from_str("bar"))) + /// ); + /// ``` #[inline] #[must_use] pub fn rsplit_once

(&self, mut delimiter: P) -> Option<(&JavaStr, &JavaStr)> @@ -590,6 +908,37 @@ impl JavaStr { } } + /// See [str::rsplit_terminator]. + /// + /// ``` + /// # use java_string::JavaStr; + /// let v: Vec<&JavaStr> = JavaStr::from_str("A.B.").rsplit_terminator('.').collect(); + /// assert_eq!(v, [JavaStr::from_str("B"), JavaStr::from_str("A")]); + /// + /// let v: Vec<&JavaStr> = JavaStr::from_str("A..B..").rsplit_terminator(".").collect(); + /// assert_eq!( + /// v, + /// [ + /// JavaStr::from_str(""), + /// JavaStr::from_str("B"), + /// JavaStr::from_str(""), + /// JavaStr::from_str("A") + /// ] + /// ); + /// + /// let v: Vec<&JavaStr> = JavaStr::from_str("A.B:C.D") + /// .rsplit_terminator(&['.', ':'][..]) + /// .collect(); + /// assert_eq!( + /// v, + /// [ + /// JavaStr::from_str("D"), + /// JavaStr::from_str("C"), + /// JavaStr::from_str("B"), + /// JavaStr::from_str("A") + /// ] + /// ); + /// ``` #[inline] pub fn rsplit_terminator

(&self, pat: P) -> RSplitTerminator

where @@ -598,6 +947,45 @@ impl JavaStr { RSplitTerminator::new(self, pat) } + /// See [str::rsplitn]. + /// + /// ``` + /// # use java_string::JavaStr; + /// let v: Vec<&JavaStr> = JavaStr::from_str("Mary had a little lamb") + /// .rsplitn(3, ' ') + /// .collect(); + /// assert_eq!( + /// v, + /// [ + /// JavaStr::from_str("lamb"), + /// JavaStr::from_str("little"), + /// JavaStr::from_str("Mary had a") + /// ] + /// ); + /// + /// let v: Vec<&JavaStr> = JavaStr::from_str("lionXXtigerXleopard") + /// .rsplitn(3, 'X') + /// .collect(); + /// assert_eq!( + /// v, + /// [ + /// JavaStr::from_str("leopard"), + /// JavaStr::from_str("tiger"), + /// JavaStr::from_str("lionX") + /// ] + /// ); + /// + /// let v: Vec<&JavaStr> = JavaStr::from_str("lion::tiger::leopard") + /// .rsplitn(2, "::") + /// .collect(); + /// assert_eq!( + /// v, + /// [ + /// JavaStr::from_str("leopard"), + /// JavaStr::from_str("lion::tiger") + /// ] + /// ); + /// ``` #[inline] pub fn rsplitn

(&self, n: usize, pat: P) -> RSplitN

where @@ -606,6 +994,76 @@ impl JavaStr { RSplitN::new(self, pat, n) } + /// See [str::split]. + /// + /// ``` + /// # use java_string::{JavaCodePoint, JavaStr}; + /// let v: Vec<&JavaStr> = JavaStr::from_str("Mary had a little lamb") + /// .split(' ') + /// .collect(); + /// assert_eq!( + /// v, + /// [ + /// JavaStr::from_str("Mary"), + /// JavaStr::from_str("had"), + /// JavaStr::from_str("a"), + /// JavaStr::from_str("little"), + /// JavaStr::from_str("lamb") + /// ] + /// ); + /// + /// let v: Vec<&JavaStr> = JavaStr::from_str("").split('X').collect(); + /// assert_eq!(v, [JavaStr::from_str("")]); + /// + /// let v: Vec<&JavaStr> = JavaStr::from_str("lionXXtigerXleopard") + /// .split('X') + /// .collect(); + /// assert_eq!( + /// v, + /// [ + /// JavaStr::from_str("lion"), + /// JavaStr::from_str(""), + /// JavaStr::from_str("tiger"), + /// JavaStr::from_str("leopard") + /// ] + /// ); + /// + /// let v: Vec<&JavaStr> = JavaStr::from_str("lion::tiger::leopard") + /// .split("::") + /// .collect(); + /// assert_eq!( + /// v, + /// [ + /// JavaStr::from_str("lion"), + /// JavaStr::from_str("tiger"), + /// JavaStr::from_str("leopard") + /// ] + /// ); + /// + /// let v: Vec<&JavaStr> = JavaStr::from_str("abc1def2ghi") + /// .split(JavaCodePoint::is_numeric) + /// .collect(); + /// assert_eq!( + /// v, + /// [ + /// JavaStr::from_str("abc"), + /// JavaStr::from_str("def"), + /// JavaStr::from_str("ghi") + /// ] + /// ); + /// + /// let v: Vec<&JavaStr> = JavaStr::from_str("lionXtigerXleopard") + /// .split(JavaCodePoint::is_uppercase) + /// .collect(); + /// assert_eq!( + /// v, + /// [ + /// JavaStr::from_str("lion"), + /// JavaStr::from_str("tiger"), + /// JavaStr::from_str("leopard") + /// ] + /// ); + /// ``` #[inline] pub fn split

(&self, pat: P) -> Split

where @@ -614,6 +1072,19 @@ impl JavaStr { Split::new(self, pat) } + /// See [str::split_ascii_whitespace]. + /// + /// ``` + /// # use java_string::JavaStr; + /// let mut iter = JavaStr::from_str(" Mary had\ta little \n\t lamb").split_ascii_whitespace(); + /// assert_eq!(Some(JavaStr::from_str("Mary")), iter.next()); + /// assert_eq!(Some(JavaStr::from_str("had")), iter.next()); + /// assert_eq!(Some(JavaStr::from_str("a")), iter.next()); + /// assert_eq!(Some(JavaStr::from_str("little")), iter.next()); + /// assert_eq!(Some(JavaStr::from_str("lamb")), iter.next()); + /// + /// assert_eq!(None, iter.next()); + /// ``` #[inline] pub fn split_ascii_whitespace(&self) -> SplitAsciiWhitespace<'_> { #[inline] @@ -630,6 +1101,23 @@ impl JavaStr { } } + /// See [str::split_at]. + /// + /// ``` + /// # use java_string::JavaStr; + /// let s = JavaStr::from_str("Per Martin-Löf"); + /// + /// let (first, last) = s.split_at(3); + /// + /// assert_eq!("Per", first); + /// assert_eq!(" Martin-Löf", last); + /// ``` + /// ```should_panic + /// # use java_string::JavaStr; + /// let s = JavaStr::from_str("Per Martin-Löf"); + /// // Should panic + /// let _ = s.split_at(13); + /// ``` #[inline] #[must_use] pub fn split_at(&self, mid: usize) -> (&JavaStr, &JavaStr) { @@ -647,6 +1135,25 @@ impl JavaStr { } } + /// See [str::split_at_mut]. 
+ /// + /// ``` + /// # use java_string::{JavaStr, JavaString}; + /// let mut s = JavaString::from("Per Martin-Löf"); + /// let s = s.as_mut_java_str(); + /// + /// let (first, last) = s.split_at_mut(3); + /// + /// assert_eq!("Per", first); + /// assert_eq!(" Martin-Löf", last); + /// ``` + /// ```should_panic + /// # use java_string::{JavaStr, JavaString}; + /// let mut s = JavaString::from("Per Martin-Löf"); + /// let s = s.as_mut_java_str(); + /// // Should panic + /// let _ = s.split_at_mut(13); + /// ``` #[inline] #[must_use] pub fn split_at_mut(&mut self, mid: usize) -> (&mut JavaStr, &mut JavaStr) { @@ -669,6 +1176,22 @@ impl JavaStr { } } + /// See [str::split_inclusive]. + /// + /// ``` + /// # use java_string::JavaStr; + /// let v: Vec<&JavaStr> = JavaStr::from_str("Mary had a little lamb\nlittle lamb\nlittle lamb.\n") + /// .split_inclusive('\n') + /// .collect(); + /// assert_eq!( + /// v, + /// [ + /// JavaStr::from_str("Mary had a little lamb\n"), + /// JavaStr::from_str("little lamb\n"), + /// JavaStr::from_str("little lamb.\n") + /// ] + /// ); + /// ``` #[inline] pub fn split_inclusive

(&self, pat: P) -> SplitInclusive

where @@ -677,6 +1200,24 @@ impl JavaStr { SplitInclusive::new(self, pat) } + /// See [str::split_once]. + /// + /// ``` + /// # use java_string::JavaStr; + /// assert_eq!(JavaStr::from_str("cfg").split_once('='), None); + /// assert_eq!( + /// JavaStr::from_str("cfg=").split_once('='), + /// Some((JavaStr::from_str("cfg"), JavaStr::from_str(""))) + /// ); + /// assert_eq!( + /// JavaStr::from_str("cfg=foo").split_once('='), + /// Some((JavaStr::from_str("cfg"), JavaStr::from_str("foo"))) + /// ); + /// assert_eq!( + /// JavaStr::from_str("cfg=foo=bar").split_once('='), + /// Some((JavaStr::from_str("cfg"), JavaStr::from_str("foo=bar"))) + /// ); + /// ``` #[inline] #[must_use] pub fn split_once

(&self, mut delimiter: P) -> Option<(&JavaStr, &JavaStr)> @@ -693,6 +1234,37 @@ impl JavaStr { } } + /// See [str::split_terminator]. + /// + /// ``` + /// # use java_string::JavaStr; + /// let v: Vec<&JavaStr> = JavaStr::from_str("A.B.").split_terminator('.').collect(); + /// assert_eq!(v, [JavaStr::from_str("A"), JavaStr::from_str("B")]); + /// + /// let v: Vec<&JavaStr> = JavaStr::from_str("A..B..").split_terminator(".").collect(); + /// assert_eq!( + /// v, + /// [ + /// JavaStr::from_str("A"), + /// JavaStr::from_str(""), + /// JavaStr::from_str("B"), + /// JavaStr::from_str("") + /// ] + /// ); + /// + /// let v: Vec<&JavaStr> = JavaStr::from_str("A.B:C.D") + /// .split_terminator(&['.', ':'][..]) + /// .collect(); + /// assert_eq!( + /// v, + /// [ + /// JavaStr::from_str("A"), + /// JavaStr::from_str("B"), + /// JavaStr::from_str("C"), + /// JavaStr::from_str("D") + /// ] + /// ); + /// ``` #[inline] pub fn split_terminator

(&self, pat: P) -> SplitTerminator

where @@ -701,6 +1273,7 @@ impl JavaStr { SplitTerminator::new(self, pat) } + /// See [str::split_whitespace]. #[inline] pub fn split_whitespace(&self) -> SplitWhitespace<'_> { SplitWhitespace { @@ -710,6 +1283,40 @@ impl JavaStr { } } + /// See [str::splitn]. + /// + /// ``` + /// # use java_string::JavaStr; + /// let v: Vec<&JavaStr> = JavaStr::from_str("Mary had a little lambda") + /// .splitn(3, ' ') + /// .collect(); + /// assert_eq!( + /// v, + /// [ + /// JavaStr::from_str("Mary"), + /// JavaStr::from_str("had"), + /// JavaStr::from_str("a little lambda") + /// ] + /// ); + /// + /// let v: Vec<&JavaStr> = JavaStr::from_str("lionXXtigerXleopard") + /// .splitn(3, "X") + /// .collect(); + /// assert_eq!( + /// v, + /// [ + /// JavaStr::from_str("lion"), + /// JavaStr::from_str(""), + /// JavaStr::from_str("tigerXleopard") + /// ] + /// ); + /// + /// let v: Vec<&JavaStr> = JavaStr::from_str("abcXdef").splitn(1, 'X').collect(); + /// assert_eq!(v, [JavaStr::from_str("abcXdef")]); + /// + /// let v: Vec<&JavaStr> = JavaStr::from_str("").splitn(1, 'X').collect(); + /// assert_eq!(v, [JavaStr::from_str("")]); + /// ``` #[inline] pub fn splitn

(&self, n: usize, pat: P) -> SplitN

where @@ -718,6 +1325,15 @@ impl JavaStr { SplitN::new(self, pat, n) } + /// See [str::starts_with]. + /// + /// ``` + /// # use java_string::JavaStr; + /// let bananas = JavaStr::from_str("bananas"); + /// + /// assert!(bananas.starts_with("bana")); + /// assert!(!bananas.starts_with("nana")); + /// ``` #[inline] #[must_use] pub fn starts_with

(&self, mut pat: P) -> bool @@ -727,6 +1343,20 @@ impl JavaStr { pat.prefix_len_in(self).is_some() } + /// See [str::strip_prefix]. + /// + /// ``` + /// # use java_string::JavaStr; + /// assert_eq!( + /// JavaStr::from_str("foo:bar").strip_prefix("foo:"), + /// Some(JavaStr::from_str("bar")) + /// ); + /// assert_eq!(JavaStr::from_str("foo:bar").strip_prefix("bar"), None); + /// assert_eq!( + /// JavaStr::from_str("foofoo").strip_prefix("foo"), + /// Some(JavaStr::from_str("foo")) + /// ); + /// ``` #[inline] #[must_use] pub fn strip_prefix

(&self, mut prefix: P) -> Option<&JavaStr> @@ -738,6 +1368,20 @@ impl JavaStr { unsafe { Some(self.get_unchecked(len..)) } } + /// See [str::strip_suffix]. + /// + /// ``` + /// # use java_string::JavaStr; + /// assert_eq!( + /// JavaStr::from_str("bar:foo").strip_suffix(":foo"), + /// Some(JavaStr::from_str("bar")) + /// ); + /// assert_eq!(JavaStr::from_str("bar:foo").strip_suffix("bar"), None); + /// assert_eq!( + /// JavaStr::from_str("foofoo").strip_suffix("foo"), + /// Some(JavaStr::from_str("foo")) + /// ); + /// ``` #[inline] #[must_use] pub fn strip_suffix

(&self, mut suffix: P) -> Option<&JavaStr> @@ -749,6 +1393,7 @@ impl JavaStr { unsafe { Some(self.get_unchecked(..self.len() - len)) } } + /// See [str::to_ascii_lowercase]. #[inline] #[must_use] pub fn to_ascii_lowercase(&self) -> JavaString { @@ -757,6 +1402,7 @@ impl JavaStr { s } + /// See [str::to_ascii_uppercase]. #[inline] #[must_use] pub fn to_ascii_uppercase(&self) -> JavaString { @@ -765,30 +1411,87 @@ impl JavaStr { s } + /// See [str::to_lowercase]. + /// + /// ``` + /// # use java_string::{JavaCodePoint, JavaStr, JavaString}; + /// let s = JavaStr::from_str("HELLO"); + /// assert_eq!("hello", s.to_lowercase()); + /// + /// let odysseus = JavaStr::from_str("ὈΔΥΣΣΕΎΣ"); + /// assert_eq!("ὀδυσσεύς", odysseus.to_lowercase()); + /// + /// let s = JavaString::from("Hello ") + /// + JavaString::from(JavaCodePoint::from_u32(0xd800).unwrap()).as_java_str() + /// + JavaStr::from_str(" World!"); + /// let expected = JavaString::from("hello ") + /// + JavaString::from(JavaCodePoint::from_u32(0xd800).unwrap()).as_java_str() + /// + JavaStr::from_str(" world!"); + /// assert_eq!(expected, s.to_lowercase()); + /// ``` #[inline] #[must_use] pub fn to_lowercase(&self) -> JavaString { self.transform_string(str::to_lowercase, |ch| ch) } + /// See [str::to_uppercase]. 
+ /// + /// ``` + /// # use java_string::{JavaCodePoint, JavaStr, JavaString}; + /// let s = JavaStr::from_str("hello"); + /// assert_eq!("HELLO", s.to_uppercase()); + /// + /// let s = JavaStr::from_str("tschüß"); + /// assert_eq!("TSCHÜSS", s.to_uppercase()); + /// + /// let s = JavaString::from("Hello ") + /// + JavaString::from(JavaCodePoint::from_u32(0xd800).unwrap()).as_java_str() + /// + JavaStr::from_str(" World!"); + /// let expected = JavaString::from("HELLO ") + /// + JavaString::from(JavaCodePoint::from_u32(0xd800).unwrap()).as_java_str() + /// + JavaStr::from_str(" WORLD!"); + /// assert_eq!(expected, s.to_uppercase()); + /// ``` #[inline] #[must_use] pub fn to_uppercase(&self) -> JavaString { self.transform_string(str::to_uppercase, |ch| ch) } + /// See [str::trim]. #[inline] #[must_use] pub fn trim(&self) -> &JavaStr { self.trim_matches(|c: JavaCodePoint| c.is_whitespace()) } + /// See [str::trim_end]. #[inline] #[must_use] pub fn trim_end(&self) -> &JavaStr { self.trim_end_matches(|c: JavaCodePoint| c.is_whitespace()) } + /// See [str::trim_end_matches]. + /// + /// ``` + /// # use java_string::{JavaCodePoint, JavaStr}; + /// assert_eq!( + /// JavaStr::from_str("11foo1bar11").trim_end_matches('1'), + /// "11foo1bar" + /// ); + /// assert_eq!( + /// JavaStr::from_str("123foo1bar123").trim_end_matches(JavaCodePoint::is_numeric), + /// "123foo1bar" + /// ); + /// + /// let x: &[_] = &['1', '2']; + /// assert_eq!( + /// JavaStr::from_str("12foo1bar12").trim_end_matches(x), + /// "12foo1bar" + /// ); + /// ``` #[inline] #[must_use] pub fn trim_end_matches

(&self, mut pat: P) -> &JavaStr @@ -806,6 +1509,22 @@ impl JavaStr { str } + /// See [str::trim_matches]. + /// + /// ``` + /// # use java_string::{JavaCodePoint, JavaStr}; + /// assert_eq!( + /// JavaStr::from_str("11foo1bar11").trim_matches('1'), + /// "foo1bar" + /// ); + /// assert_eq!( + /// JavaStr::from_str("123foo1bar123").trim_matches(JavaCodePoint::is_numeric), + /// "foo1bar" + /// ); + /// + /// let x: &[_] = &['1', '2']; + /// assert_eq!(JavaStr::from_str("12foo1bar12").trim_matches(x), "foo1bar"); + /// ``` #[inline] #[must_use] pub fn trim_matches

(&self, mut pat: P) -> &JavaStr @@ -830,12 +1549,32 @@ impl JavaStr { str } + /// See [str::trim_start]. #[inline] #[must_use] pub fn trim_start(&self) -> &JavaStr { self.trim_start_matches(|c: JavaCodePoint| c.is_whitespace()) } + /// See [str::trim_start_matches]. + /// + /// ``` + /// # use java_string::{JavaCodePoint, JavaStr}; + /// assert_eq!( + /// JavaStr::from_str("11foo1bar11").trim_start_matches('1'), + /// "foo1bar11" + /// ); + /// assert_eq!( + /// JavaStr::from_str("123foo1bar123").trim_start_matches(JavaCodePoint::is_numeric), + /// "foo1bar123" + /// ); + /// + /// let x: &[_] = &['1', '2']; + /// assert_eq!( + /// JavaStr::from_str("12foo1bar12").trim_start_matches(x), + /// "foo1bar12" + /// ); + /// ``` #[inline] #[must_use] pub fn trim_start_matches

(&self, mut pat: P) -> &JavaStr diff --git a/crates/java_string/src/validations.rs b/crates/java_string/src/validations.rs index 3ae2d2c07..102783f55 100644 --- a/crates/java_string/src/validations.rs +++ b/crates/java_string/src/validations.rs @@ -188,6 +188,7 @@ pub(crate) fn run_utf8_semi_validation(v: &[u8]) -> Result<(), Utf8Error> { } _ => err!(Some(1)), } + index += 1; } else { // Ascii case, try to skip forward quickly. // When the pointer is aligned, read 2 words of data per iteration From 285872f35343a14346a36a48e1ad437e71f0785f Mon Sep 17 00:00:00 2001 From: Joe Date: Tue, 3 Oct 2023 20:37:33 +0100 Subject: [PATCH 08/11] Fix docs --- crates/java_string/src/owned.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/java_string/src/owned.rs b/crates/java_string/src/owned.rs index 67c700c41..e03f82a7d 100644 --- a/crates/java_string/src/owned.rs +++ b/crates/java_string/src/owned.rs @@ -519,7 +519,7 @@ impl JavaString { } } - /// See [String::insert_java_str]. + /// See [String::insert_str]. 
pub fn insert_java_str(&mut self, idx: usize, string: &JavaStr) { assert!(self.is_char_boundary(idx)); From bd237357c38249c65e6cb715b5e7d8c3259a9b55 Mon Sep 17 00:00:00 2001 From: Joe Date: Wed, 4 Oct 2023 16:52:07 +0100 Subject: [PATCH 09/11] Address review comments --- .github/workflows/ci.yml | 19 +++ crates/java_string/README.md | 2 +- crates/java_string/src/cesu8.rs | 23 ++-- crates/java_string/src/serde.rs | 230 +++++++++++++++++++++++++++++++- typos.toml | 2 +- 5 files changed, 258 insertions(+), 18 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 78e12f7f5..ea46962fd 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -117,6 +117,25 @@ jobs: - name: Run valence_nbt tests without preserve_order feature run: cargo test -p valence_nbt --all-targets + valence-miri: + name: Miri Tests + runs-on: ubuntu-latest + steps: + - name: Checkout Actions Repository + uses: actions/checkout@v3 + + - name: Setup Rust toolchain and cache + uses: actions-rust-lang/setup-rust-toolchain@v1.5.0 + with: + toolchain: "nightly" + components: "miri" + + - name: Run tests + run: cargo miri test --workspace --all-features --doc + + - name: Run doctests + run: cargo miri test --workspace --all-features --doc + extractor-build: name: Build Extractor runs-on: ubuntu-latest diff --git a/crates/java_string/README.md b/crates/java_string/README.md index d3a960ab0..7135d6454 100644 --- a/crates/java_string/README.md +++ b/crates/java_string/README.md @@ -1,4 +1,4 @@ -# valence_java_string +# java_string An implementation of Java strings, tolerant of invalid UTF-16 encoding. 
This allows for round-trip serialization of all Java strings, including those which contain invalid UTF-16, while still diff --git a/crates/java_string/src/cesu8.rs b/crates/java_string/src/cesu8.rs index 4964bc497..90fd1924a 100644 --- a/crates/java_string/src/cesu8.rs +++ b/crates/java_string/src/cesu8.rs @@ -73,7 +73,7 @@ impl JavaStr { while i < bytes.len() { let b = bytes[i]; if b == 0 { - encoded.extend([0xc0, 0x80].into_iter()); + encoded.extend([0xc0, 0x80]); i += 1; } else if b < 128 { // Pass ASCII through quickly. @@ -101,8 +101,8 @@ impl JavaStr { s.chars().next().unwrap_unchecked().as_u32() - 0x10000 }; let s = [((c >> 10) as u16) | 0xd800, ((c & 0x3ff) as u16) | 0xdc00]; - encoded.extend(enc_surrogate(s[0]).into_iter()); - encoded.extend(enc_surrogate(s[1]).into_iter()); + encoded.extend(enc_surrogate(s[0])); + encoded.extend(enc_surrogate(s[1])); } i += w; } @@ -119,17 +119,18 @@ impl JavaString { pub fn from_modified_utf8(bytes: Vec) -> Result { match JavaString::from_full_utf8(bytes) { Ok(str) => Ok(str), - Err(err) => JavaString::from_modified_utf8_iter(err.bytes.into_iter()), + Err(err) => JavaString::from_modified_utf8_iter(err.bytes), } } /// Converts from Java's [modified UTF-8](https://docs.oracle.com/javase/8/docs/api/java/io/DataInput.html#modified-utf-8) format to a `JavaString`. /// /// See [JavaStr::from_modified_utf8]. - pub fn from_modified_utf8_iter(mut iter: I) -> Result + pub fn from_modified_utf8_iter(iter: I) -> Result where - I: Iterator, + I: IntoIterator, { + let mut iter = iter.into_iter(); let mut index = 0; let mut decoded = Vec::with_capacity(iter.size_hint().0); let mut surrogate_first: Option<[u8; 3]> = None; @@ -178,7 +179,7 @@ impl JavaString { if first == 0 { // modified UTF-8 should never contain \0 directly. - err!(None); + err!(Some(1)); } else if first < 128 { flush_first_surrogate_half!(); // Pass ASCII through directly. @@ -197,7 +198,7 @@ impl JavaString { // Two-byte sequences can be used directly. 
2 => { flush_first_surrogate_half!(); - decoded.extend([first, second].into_iter()); + decoded.extend([first, second]); } 3 => { let third = next_cont!(Some(2)); @@ -208,7 +209,7 @@ impl JavaString { | (0xed, 0x80..=0x9f) | (0xee..=0xef, 0x80..=0xbf) => { flush_first_surrogate_half!(); - decoded.extend([first, second, third].into_iter()) + decoded.extend([first, second, third]) } // First half of a surrogate pair (0xed, 0xa0..=0xaf) => { @@ -222,10 +223,10 @@ impl JavaString { let (fifth, sixth) = (second, third); let (second, third) = (b, c); let s = dec_surrogates(second, third, fifth, sixth); - decoded.extend(s.into_iter()); + decoded.extend(s); } else { // no first half, append the second half directly - decoded.extend([first, second, third].into_iter()); + decoded.extend([first, second, third]); } } _ => err!(Some(1)), diff --git a/crates/java_string/src/serde.rs b/crates/java_string/src/serde.rs index 71e31c173..e1c152d11 100644 --- a/crates/java_string/src/serde.rs +++ b/crates/java_string/src/serde.rs @@ -1,6 +1,11 @@ +use std::fmt::Formatter; + +use serde::de::value::SeqAccessDeserializer; +use serde::de::{Error, SeqAccess, Unexpected, Visitor}; +use serde::ser::SerializeSeq; use serde::{Deserialize, Deserializer, Serialize, Serializer}; -use crate::{JavaStr, JavaString}; +use crate::{JavaCodePoint, JavaStr, JavaString}; impl Serialize for JavaString { #[inline] @@ -8,7 +13,16 @@ impl Serialize for JavaString { where S: Serializer, { - self.as_str_lossy().serialize(serializer) + match self.as_str() { + Ok(str) => str.serialize(serializer), + Err(_) => { + let mut seq = serializer.serialize_seq(None)?; + for ch in self.chars() { + seq.serialize_element(&ch.as_u32())?; + } + seq.end() + } + } } } @@ -18,7 +32,57 @@ impl<'de> Deserialize<'de> for JavaString { where D: Deserializer<'de>, { - String::deserialize(deserializer).map(JavaString::from) + deserializer.deserialize_any(JavaStringVisitor) + } +} + +struct JavaStringVisitor; + +impl<'de> Visitor<'de> for 
JavaStringVisitor { + type Value = JavaString; + + fn expecting(&self, formatter: &mut Formatter) -> std::fmt::Result { + formatter.write_str("a JavaString") + } + + fn visit_str(self, v: &str) -> Result + where + E: Error, + { + Ok(JavaString::from(v)) + } + + fn visit_string(self, v: String) -> Result + where + E: Error, + { + Ok(JavaString::from(v)) + } + + fn visit_bytes(self, v: &[u8]) -> Result + where + E: Error, + { + match JavaStr::from_semi_utf8(v) { + Ok(str) => Ok(str.to_owned()), + Err(_) => Err(Error::invalid_value(Unexpected::Bytes(v), &self)), + } + } + + fn visit_byte_buf(self, v: Vec) -> Result + where + E: Error, + { + JavaString::from_semi_utf8(v) + .map_err(|err| Error::invalid_value(Unexpected::Bytes(&err.into_bytes()), &self)) + } + + fn visit_seq(self, seq: A) -> Result + where + A: SeqAccess<'de>, + { + let vec = Vec::::deserialize(SeqAccessDeserializer::new(seq))?; + JavaString::from_semi_utf8(vec).map_err(|_| Error::invalid_value(Unexpected::Seq, &self)) } } @@ -28,7 +92,16 @@ impl Serialize for JavaStr { where S: Serializer, { - self.as_str_lossy().serialize(serializer) + match self.as_str() { + Ok(str) => str.serialize(serializer), + Err(_) => { + let mut seq = serializer.serialize_seq(None)?; + for ch in self.chars() { + seq.serialize_element(&ch.as_u32())?; + } + seq.end() + } + } } } @@ -38,6 +111,153 @@ impl<'de: 'a, 'a> Deserialize<'de> for &'a JavaStr { where D: Deserializer<'de>, { - <&'a str>::deserialize(deserializer).map(JavaStr::from_str) + deserializer.deserialize_any(JavaStrVisitor) + } +} + +struct JavaStrVisitor; + +impl<'de> Visitor<'de> for JavaStrVisitor { + type Value = &'de JavaStr; + + fn expecting(&self, formatter: &mut Formatter) -> std::fmt::Result { + formatter.write_str("a borrowed JavaStr") + } + + fn visit_borrowed_str(self, v: &'de str) -> Result + where + E: Error, + { + Ok(JavaStr::from_str(v)) + } + + fn visit_borrowed_bytes(self, v: &'de [u8]) -> Result + where + E: Error, + { + 
JavaStr::from_semi_utf8(v).map_err(|_| Error::invalid_value(Unexpected::Bytes(v), &self)) + } +} + +impl Serialize for JavaCodePoint { + #[inline] + fn serialize(&self, serializer: S) -> Result + where + S: Serializer, + { + match self.as_char() { + Some(ch) => ch.serialize(serializer), + None => self.as_u32().serialize(serializer), + } + } +} + +impl<'de> Deserialize<'de> for JavaCodePoint { + fn deserialize(deserializer: D) -> Result + where + D: Deserializer<'de>, + { + deserializer.deserialize_any(JavaCodePointVisitor) + } +} + +struct JavaCodePointVisitor; + +impl<'de> Visitor<'de> for JavaCodePointVisitor { + type Value = JavaCodePoint; + + fn expecting(&self, formatter: &mut Formatter) -> std::fmt::Result { + formatter.write_str("a character") + } + + #[inline] + fn visit_i8(self, v: i8) -> Result + where + E: Error, + { + self.visit_i32(v as i32) + } + + #[inline] + fn visit_i16(self, v: i16) -> Result + where + E: Error, + { + self.visit_i32(v as i32) + } + + fn visit_i32(self, v: i32) -> Result + where + E: Error, + { + if v < 0 { + Err(Error::invalid_value(Unexpected::Signed(v as i64), &self)) + } else { + self.visit_u32(v as u32) + } + } + + fn visit_i64(self, v: i64) -> Result + where + E: Error, + { + if v < 0 { + Err(Error::invalid_value(Unexpected::Signed(v), &self)) + } else { + self.visit_u64(v as u64) + } + } + + #[inline] + fn visit_u8(self, v: u8) -> Result + where + E: Error, + { + self.visit_u32(v as u32) + } + + #[inline] + fn visit_u16(self, v: u16) -> Result + where + E: Error, + { + self.visit_u32(v as u32) + } + + fn visit_u32(self, v: u32) -> Result + where + E: Error, + { + JavaCodePoint::from_u32(v) + .ok_or_else(|| Error::invalid_value(Unexpected::Unsigned(v as u64), &self)) + } + + fn visit_u64(self, v: u64) -> Result + where + E: Error, + { + if v > u32::MAX as u64 { + Err(Error::invalid_value(Unexpected::Unsigned(v), &self)) + } else { + self.visit_u32(v as u32) + } + } + + fn visit_char(self, v: char) -> Result + where + E: 
Error, + { + Ok(JavaCodePoint::from_char(v)) + } + + fn visit_str(self, v: &str) -> Result + where + E: Error, + { + let mut iter = v.chars(); + match (iter.next(), iter.next()) { + (Some(c), None) => Ok(JavaCodePoint::from_char(c)), + _ => Err(Error::invalid_value(Unexpected::Str(v), &self)), + } } } diff --git a/typos.toml b/typos.toml index 8e54543f5..c59189146 100644 --- a/typos.toml +++ b/typos.toml @@ -1,5 +1,5 @@ [files] -extend-exclude = ["*.svg", "*.json"] +extend-exclude = ["*.svg", "*.json", "crates/java_string/src/slice.rs"] [default] extend-ignore-re = ['\d+ths', 'CC BY-NC-ND'] From ea0f0cb80c484c358d1a306bb0758ed58157786e Mon Sep 17 00:00:00 2001 From: Joe Date: Wed, 4 Oct 2023 17:00:13 +0100 Subject: [PATCH 10/11] Return the correct error index in `JavaString::from_modified_utf8_iter` --- crates/java_string/src/cesu8.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/crates/java_string/src/cesu8.rs b/crates/java_string/src/cesu8.rs index 90fd1924a..31fce28ef 100644 --- a/crates/java_string/src/cesu8.rs +++ b/crates/java_string/src/cesu8.rs @@ -235,6 +235,7 @@ impl JavaString { _ => err!(Some(1)), // modified UTF-8 doesn't allow width 4 } } + index += 1; } flush_first_surrogate_half!(); From 4751693790239d651e102fe3531dffdd1d453d3b Mon Sep 17 00:00:00 2001 From: Joe Date: Thu, 5 Oct 2023 14:01:33 +0100 Subject: [PATCH 11/11] Change `JavaString::from_modified_utf8_iter` to slice version --- crates/java_string/src/cesu8.rs | 83 ++++++++++++--------------------- 1 file changed, 31 insertions(+), 52 deletions(-) diff --git a/crates/java_string/src/cesu8.rs b/crates/java_string/src/cesu8.rs index 31fce28ef..eb94ee6c1 100644 --- a/crates/java_string/src/cesu8.rs +++ b/crates/java_string/src/cesu8.rs @@ -31,7 +31,7 @@ impl JavaStr { pub fn from_modified_utf8(bytes: &[u8]) -> Result, Utf8Error> { match JavaStr::from_full_utf8(bytes) { Ok(str) => Ok(Cow::Borrowed(str)), - Err(_) => JavaString::from_modified_utf8_iter(bytes.iter().copied()).map(Cow::Owned), + 
Err(_) => JavaString::from_modified_utf8_internal(bytes).map(Cow::Owned), } } @@ -119,33 +119,17 @@ impl JavaString { pub fn from_modified_utf8(bytes: Vec) -> Result { match JavaString::from_full_utf8(bytes) { Ok(str) => Ok(str), - Err(err) => JavaString::from_modified_utf8_iter(err.bytes), + Err(err) => JavaString::from_modified_utf8_internal(&err.bytes), } } - /// Converts from Java's [modified UTF-8](https://docs.oracle.com/javase/8/docs/api/java/io/DataInput.html#modified-utf-8) format to a `JavaString`. - /// - /// See [JavaStr::from_modified_utf8]. - pub fn from_modified_utf8_iter(iter: I) -> Result - where - I: IntoIterator, - { - let mut iter = iter.into_iter(); - let mut index = 0; - let mut decoded = Vec::with_capacity(iter.size_hint().0); - let mut surrogate_first: Option<[u8; 3]> = None; - - macro_rules! flush_first_surrogate_half { - () => { - // append any preceding first half of a surrogate pair - if let Some(surrogate_first) = surrogate_first.take() { - decoded.extend(surrogate_first.into_iter()); - } - }; - } + fn from_modified_utf8_internal(slice: &[u8]) -> Result { + let mut offset = 0; + let mut decoded = Vec::with_capacity(slice.len() + 1); - while let Some(first) = iter.next() { - let old_offset = index; + while let Some(&first) = slice.get(offset) { + let old_offset = offset; + offset += 1; macro_rules! err { ($error_len:expr) => { @@ -158,10 +142,11 @@ impl JavaString { macro_rules! next { () => {{ - index += 1; - match iter.next() { - Some(a) => a, - None => err!(None), + if let Some(&b) = slice.get(offset) { + offset += 1; + b + } else { + err!(None) } }}; } @@ -181,11 +166,9 @@ impl JavaString { // modified UTF-8 should never contain \0 directly. err!(Some(1)); } else if first < 128 { - flush_first_surrogate_half!(); // Pass ASCII through directly. 
decoded.push(first); } else if first == 0xc0 { - flush_first_surrogate_half!(); // modified UTF-8 encoding of null character match next!() { 0x80 => decoded.push(0), @@ -197,7 +180,6 @@ impl JavaString { match w { // Two-byte sequences can be used directly. 2 => { - flush_first_surrogate_half!(); decoded.extend([first, second]); } 3 => { @@ -207,26 +189,26 @@ impl JavaString { (0xe0, 0xa0..=0xbf) | (0xe1..=0xec, 0x80..=0xbf) | (0xed, 0x80..=0x9f) - | (0xee..=0xef, 0x80..=0xbf) => { - flush_first_surrogate_half!(); - decoded.extend([first, second, third]) - } + | (0xee..=0xef, 0x80..=0xbf) + // Second half of a surrogate pair without a preceding first half, also pass this through. + | (0xed, 0xb0..=0xbf) + => decoded.extend([first, second, third]), // First half of a surrogate pair (0xed, 0xa0..=0xaf) => { - flush_first_surrogate_half!(); - surrogate_first = Some([first, second, third]); - } - // Second half of a surrogate pair - (0xed, 0xb0..=0xbf) => { - // try to pair the second half with a preceding first half - if let Some([_, b, c]) = surrogate_first.take() { - let (fifth, sixth) = (second, third); - let (second, third) = (b, c); - let s = dec_surrogates(second, third, fifth, sixth); - decoded.extend(s); - } else { - // no first half, append the second half directly - decoded.extend([first, second, third]); + // Peek ahead and try to pair the first half of surrogate pair with + // second. + match &slice[offset..] { + [0xed, fifth @ 0xb0..=0xbf, sixth, ..] + if *sixth & !CONT_MASK == TAG_CONT => + { + let s = dec_surrogates(second, third, *fifth, *sixth); + decoded.extend(s); + offset += 3; + } + _ => { + // No second half, append the first half directly. 
+ decoded.extend([first, second, third]); + } } } _ => err!(Some(1)), @@ -235,11 +217,8 @@ impl JavaString { _ => err!(Some(1)), // modified UTF-8 doesn't allow width 4 } } - index += 1; } - flush_first_surrogate_half!(); - unsafe { // SAFETY: we built a semi UTF-8 encoded string Ok(JavaString::from_semi_utf8_unchecked(decoded))