Add encoding to/from Java's modified UTF-8 format

valence-rs · Oct 2, 2023 · e48b347 · e48b347
1 parent e9011d9
commit e48b347
Show file tree

Hide file tree

Showing 6 changed files with 261 additions and 8 deletions.
diff --git a/crates/valence_java_string/README.md b/crates/valence_java_string/README.md
@@ -5,9 +5,9 @@ This allows for round-trip serialization of all Java strings, including those wh
 being able to perform useful operations on those strings. 
 
 These Java strings use the UTF-8 encoding, with the modification that surrogate code points (code points between U+D800 
-and U+DFFF inclusive) are allowed. This allows for zero-cost conversion from Rust strings to Java strings. Similarly,
-this crate introduces a `JavaCodePoint` type which is analogous to `char`, except that surrogate code points are 
-allowed.
+and U+DFFF inclusive) are allowed. This allows for zero-cost conversion from Rust strings to Java strings. This modified
+encoding is known as "semi-UTF-8" throughout the codebase. Similarly, this crate introduces a `JavaCodePoint` type which
+is analogous to `char`, except that surrogate code points are allowed.
 
 This crate is mostly undocumented, because most methods are entirely analogous to those of the same name in Rust's
 strings. Please refer to the `std` documentation.

diff --git a/crates/valence_java_string/src/cesu8.rs b/crates/valence_java_string/src/cesu8.rs
@@ -0,0 +1,251 @@
+use std::borrow::Cow;
+
+use crate::validations::{utf8_char_width, CONT_MASK, TAG_CONT};
+use crate::{JavaStr, JavaString, Utf8Error};
+
+impl JavaStr {
+    /// Converts from Java's [modified UTF-8](https://docs.oracle.com/javase/8/docs/api/java/io/DataInput.html#modified-utf-8) format to a `Cow<JavaStr>`.
+    ///
+    /// The input is first validated as ordinary UTF-8; when that succeeds it is borrowed
+    /// unchanged. Otherwise it is decoded with [`JavaString::from_modified_utf8_iter`] and
+    /// returned as an owned string.
+    ///
+    /// # Errors
+    ///
+    /// Returns the [`Utf8Error`] produced by the fallback decoder when `bytes` is neither
+    /// valid UTF-8 nor valid modified UTF-8.
+    #[inline]
+    pub fn from_modified_utf8(bytes: &[u8]) -> Result<Cow<JavaStr>, Utf8Error> {
+        match JavaStr::from_full_utf8(bytes) {
+            Ok(str) => Ok(Cow::Borrowed(str)),
+            Err(_) => JavaString::from_modified_utf8_iter(bytes.iter().copied()).map(Cow::Owned),
+        }
+    }
+
+    /// Converts to Java's [modified UTF-8](https://docs.oracle.com/javase/8/docs/api/java/io/DataInput.html#modified-utf-8) format.
+    ///
+    /// The string's own bytes are borrowed when they contain neither a NUL byte nor a
+    /// 4-byte sequence; otherwise a re-encoded copy is returned.
+    #[inline]
+    #[must_use]
+    pub fn to_modified_utf8(&self) -> Cow<[u8]> {
+        if is_valid_cesu8(self) {
+            Cow::Borrowed(self.as_bytes())
+        } else {
+            Cow::Owned(self.to_modified_utf8_internal())
+        }
+    }
+
+    /// Re-encodes the string byte by byte: NUL becomes `0xC0 0x80`, every 4-byte sequence
+    /// becomes two 3-byte encoded surrogates, and everything else is copied verbatim.
+    #[inline]
+    fn to_modified_utf8_internal(&self) -> Vec<u8> {
+        let bytes = self.as_bytes();
+        // Reserve the input length plus a quarter. `+` binds tighter than `>>`, so the
+        // shift has to be parenthesised to get `len + len / 4`.
+        let mut encoded = Vec::with_capacity(bytes.len() + (bytes.len() >> 2));
+        let mut i = 0;
+        while i < bytes.len() {
+            let b = bytes[i];
+            if b == 0 {
+                // NUL is written as the two-byte sequence 0xC0 0x80.
+                encoded.extend_from_slice(&[0xc0, 0x80]);
+                i += 1;
+            } else if b < 128 {
+                // Pass ASCII through quickly.
+                encoded.push(b);
+                i += 1;
+            } else {
+                // Figure out how many bytes we need for this character.
+                let w = utf8_char_width(b);
+                let char_bytes = unsafe {
+                    // SAFETY: input must be valid semi UTF-8, so there must be at least w more
+                    // bytes from i
+                    bytes.get_unchecked(i..i + w)
+                };
+                if w != 4 {
+                    // Pass through short UTF-8 sequences unmodified.
+                    encoded.extend_from_slice(char_bytes);
+                } else {
+                    // Encode 4-byte sequences as 6 bytes. Decode only the bytes of the
+                    // current character, not the whole input.
+                    let s = unsafe {
+                        // SAFETY: char_bytes is one complete sequence of valid semi UTF-8
+                        JavaStr::from_semi_utf8_unchecked(char_bytes)
+                    };
+                    let c = unsafe {
+                        // SAFETY: s contains a single char of width 4
+                        s.chars().next().unwrap_unchecked()
+                    }
+                    .as_u32()
+                        - 0x10000;
+                    // Split the 20-bit offset into a high and a low surrogate.
+                    let s = [((c >> 10) as u16) | 0xd800, ((c & 0x3ff) as u16) | 0xdc00];
+                    encoded.extend_from_slice(&enc_surrogate(s[0]));
+                    encoded.extend_from_slice(&enc_surrogate(s[1]));
+                }
+                i += w;
+            }
+        }
+        encoded
+    }
+}
+
+impl JavaString {
+    /// Converts from Java's [modified UTF-8](https://docs.oracle.com/javase/8/docs/api/java/io/DataInput.html#modified-utf-8) format to a `JavaString`.
+    ///
+    /// The buffer is first validated as ordinary UTF-8 and reused as-is when that succeeds.
+    /// Otherwise the bytes are taken back out of the error and decoded with
+    /// [`JavaString::from_modified_utf8_iter`].
+    ///
+    /// # Errors
+    ///
+    /// Returns the [`Utf8Error`] produced by the fallback decoder when `bytes` is neither
+    /// valid UTF-8 nor valid modified UTF-8.
+    #[inline]
+    pub fn from_modified_utf8(bytes: Vec<u8>) -> Result<JavaString, Utf8Error> {
+        match JavaString::from_full_utf8(bytes) {
+            Ok(str) => Ok(str),
+            Err(err) => JavaString::from_modified_utf8_iter(err.bytes.into_iter()),
+        }
+    }
+
+    /// Converts from Java's [modified UTF-8](https://docs.oracle.com/javase/8/docs/api/java/io/DataInput.html#modified-utf-8) format to a `JavaString`.
+    ///
+    /// `0xC0 0x80` decodes to NUL, an encoded high surrogate immediately followed by an
+    /// encoded low surrogate is merged into one 4-byte sequence, and unpaired surrogates
+    /// are kept as they are.
+    ///
+    /// # Errors
+    ///
+    /// Returns a [`Utf8Error`] on a raw NUL byte, a 4-byte sequence, a malformed sequence,
+    /// or input that ends in the middle of a sequence. `valid_up_to` is the offset of the
+    /// first byte of the offending sequence; `error_len` is `None` only when the input
+    /// ended in the middle of a sequence.
+    pub fn from_modified_utf8_iter<I>(mut iter: I) -> Result<JavaString, Utf8Error>
+    where
+        I: Iterator<Item = u8>,
+    {
+        // Number of input bytes consumed so far.
+        let mut index = 0;
+        let mut decoded = Vec::with_capacity(iter.size_hint().0);
+        // An encoded high surrogate that has been read but not yet written, because the
+        // next sequence may turn out to be its low half.
+        let mut surrogate_first: Option<[u8; 3]> = None;
+
+        macro_rules! flush_first_surrogate_half {
+            () => {
+                // append any preceding first half of a surrogate pair
+                if let Some(half) = surrogate_first.take() {
+                    decoded.extend_from_slice(&half);
+                }
+            };
+        }
+
+        while let Some(first) = iter.next() {
+            // Offset of the sequence currently being decoded, used for error reporting.
+            let old_offset = index;
+            // Account for the lead byte that was just consumed.
+            index += 1;
+
+            macro_rules! err {
+                ($error_len:expr) => {
+                    return Err(Utf8Error {
+                        valid_up_to: old_offset,
+                        error_len: $error_len,
+                    })
+                };
+            }
+
+            // Reads the next byte, or fails with "unexpected end of input".
+            macro_rules! next {
+                () => {{
+                    match iter.next() {
+                        Some(a) => {
+                            index += 1;
+                            a
+                        }
+                        None => err!(None),
+                    }
+                }};
+            }
+
+            // Reads the next byte and requires it to be a continuation byte.
+            macro_rules! next_cont {
+                ($error_len:expr) => {{
+                    let byte = next!();
+                    if (byte) & !CONT_MASK == TAG_CONT {
+                        byte
+                    } else {
+                        err!($error_len)
+                    }
+                }};
+            }
+
+            if first == 0 {
+                // modified UTF-8 should never contain \0 directly. This is an unexpected
+                // byte, not a truncated input, so report its length.
+                err!(Some(1));
+            } else if first < 128 {
+                flush_first_surrogate_half!();
+                // Pass ASCII through directly.
+                decoded.push(first);
+            } else if first == 0xc0 {
+                flush_first_surrogate_half!();
+                // modified UTF-8 encoding of null character
+                match next!() {
+                    0x80 => decoded.push(0),
+                    _ => err!(Some(1)),
+                }
+            } else {
+                // Check the width before reading further, so an invalid lead byte at the
+                // very end of the input is not mistaken for a truncated sequence.
+                match utf8_char_width(first) {
+                    // Two-byte sequences can be used directly.
+                    2 => {
+                        let second = next_cont!(Some(1));
+                        flush_first_surrogate_half!();
+                        decoded.extend_from_slice(&[first, second]);
+                    }
+                    3 => {
+                        let second = next_cont!(Some(1));
+                        let third = next_cont!(Some(2));
+                        match (first, second) {
+                            // These are valid UTF-8, so pass them through.
+                            (0xe0, 0xa0..=0xbf)
+                            | (0xe1..=0xec, 0x80..=0xbf)
+                            | (0xed, 0x80..=0x9f)
+                            | (0xee..=0xef, 0x80..=0xbf) => {
+                                flush_first_surrogate_half!();
+                                decoded.extend_from_slice(&[first, second, third]);
+                            }
+                            // First half of a surrogate pair
+                            (0xed, 0xa0..=0xaf) => {
+                                flush_first_surrogate_half!();
+                                surrogate_first = Some([first, second, third]);
+                            }
+                            // Second half of a surrogate pair
+                            (0xed, 0xb0..=0xbf) => {
+                                // try to pair the second half with a preceding first half
+                                if let Some([_, high_second, high_third]) = surrogate_first.take()
+                                {
+                                    let merged =
+                                        dec_surrogates(high_second, high_third, second, third);
+                                    decoded.extend_from_slice(&merged);
+                                } else {
+                                    // no first half, append the second half directly
+                                    decoded.extend_from_slice(&[first, second, third]);
+                                }
+                            }
+                            _ => err!(Some(1)),
+                        }
+                    }
+                    // Stray continuation bytes, invalid lead bytes, and width 4 (which
+                    // modified UTF-8 doesn't allow).
+                    _ => err!(Some(1)),
+                }
+            }
+        }
+
+        // A high surrogate at the very end of the input has no partner; keep it as-is.
+        flush_first_surrogate_half!();
+
+        unsafe {
+            // SAFETY: we built a semi UTF-8 encoded string
+            Ok(JavaString::from_semi_utf8_unchecked(decoded))
+        }
+    }
+
+    /// Converts to Java's [modified UTF-8](https://docs.oracle.com/javase/8/docs/api/java/io/DataInput.html#modified-utf-8) format.
+    ///
+    /// The string's own buffer is returned when it contains neither a NUL byte nor a
+    /// 4-byte sequence; otherwise a re-encoded copy is built.
+    #[inline]
+    #[must_use]
+    pub fn into_modified_utf8(self) -> Vec<u8> {
+        if is_valid_cesu8(&self) {
+            self.into_bytes()
+        } else {
+            self.to_modified_utf8_internal()
+        }
+    }
+}
+
+/// Rebuilds a UTF-16 surrogate code unit from the second and third bytes of its 3-byte
+/// encoding. The lead byte is not passed in; its contribution is the fixed `0xd000`.
+#[inline]
+fn dec_surrogate(second: u8, third: u8) -> u32 {
+    let upper_payload = u32::from(second & CONT_MASK) << 6;
+    let lower_payload = u32::from(third & CONT_MASK);
+    0xd000 | upper_payload | lower_payload
+}
+
+/// Combines the payload bytes of an encoded high surrogate (`second`, `third`) and an
+/// encoded low surrogate (`fifth`, `sixth`) into the 4-byte UTF-8 encoding of the
+/// supplementary code point they represent.
+///
+/// # Panics
+///
+/// Panics if the combined code point falls outside `0x10000..=0x10ffff`.
+#[inline]
+fn dec_surrogates(second: u8, third: u8, fifth: u8, sixth: u8) -> [u8; 4] {
+    // Recover the two UTF-16 code units, then the code point they stand for.
+    let high = dec_surrogate(second, third);
+    let low = dec_surrogate(fifth, sixth);
+    let code_point = 0x10000 + (((high - 0xd800) << 10) | (low - 0xdc00));
+    assert!((0x010000..=0x10ffff).contains(&code_point));
+
+    // Lay the 21 bits out as 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx.
+    let lead = 0b1111_0000u8 | ((code_point >> 18) & 0b111) as u8;
+    let cont_a = TAG_CONT | ((code_point >> 12) as u8 & CONT_MASK);
+    let cont_b = TAG_CONT | ((code_point >> 6) as u8 & CONT_MASK);
+    let cont_c = TAG_CONT | (code_point as u8 & CONT_MASK);
+    [lead, cont_a, cont_b, cont_c]
+}
+
+/// Returns `true` when the bytes of `text` can be used as modified UTF-8 without any
+/// re-encoding, i.e. there is no NUL byte and no lead byte of a sequence wider than 3.
+#[inline]
+fn is_valid_cesu8(text: &JavaStr) -> bool {
+    for byte in text.bytes() {
+        if byte == 0 {
+            return false;
+        }
+        let is_continuation = (byte & !CONT_MASK) == TAG_CONT;
+        if !is_continuation && utf8_char_width(byte) > 3 {
+            return false;
+        }
+    }
+    true
+}
+
+/// Encodes a 16-bit surrogate code unit as a 3-byte sequence of the form
+/// `1110xxxx 10xxxxxx 10xxxxxx`.
+#[inline]
+fn enc_surrogate(surrogate: u16) -> [u8; 3] {
+    // Top four bits go into the lead byte, then two groups of six bits follow.
+    let lead_bits = (surrogate >> 12) as u8;
+    let middle_bits = (surrogate >> 6) as u8 & CONT_MASK;
+    let low_bits = surrogate as u8 & CONT_MASK;
+    [
+        0b1110_0000 | lead_bits,
+        TAG_CONT | middle_bits,
+        TAG_CONT | low_bits,
+    ]
+}
diff --git a/crates/valence_java_string/src/lib.rs b/crates/valence_java_string/src/lib.rs
@@ -1,5 +1,6 @@
 #![doc = include_str!("../README.md")]
 
+mod cesu8;
 mod char;
 mod error;
 mod iter;
@@ -10,6 +11,7 @@ mod serde;
 mod slice;
 pub(crate) mod validations;
 
+pub use cesu8::*;
 pub use char::*;
 pub use error::*;
 pub use iter::*;

diff --git a/crates/valence_java_string/src/owned.rs b/crates/valence_java_string/src/owned.rs
@@ -39,7 +39,7 @@ impl JavaString {
     }
 
     #[inline]
-    pub fn from_utf8(vec: Vec<u8>) -> Result<JavaString, FromUtf8Error> {
+    pub fn from_full_utf8(vec: Vec<u8>) -> Result<JavaString, FromUtf8Error> {
         match std::str::from_utf8(&vec) {
             Ok(..) => Ok(JavaString { vec }),
             Err(e) => Err(FromUtf8Error {

diff --git a/crates/valence_java_string/src/slice.rs b/crates/valence_java_string/src/slice.rs
@@ -31,15 +31,15 @@ pub struct JavaStr {
 
 impl JavaStr {
     #[inline]
-    pub const fn from_utf8(v: &[u8]) -> Result<&JavaStr, Utf8Error> {
+    pub const fn from_full_utf8(v: &[u8]) -> Result<&JavaStr, Utf8Error> {
         match std::str::from_utf8(v) {
             Ok(str) => Ok(JavaStr::from_str(str)),
             Err(err) => Err(Utf8Error::from_std(err)),
         }
     }
 
     #[inline]
-    pub fn from_utf8_mut(v: &mut [u8]) -> Result<&mut JavaStr, Utf8Error> {
+    pub fn from_full_utf8_mut(v: &mut [u8]) -> Result<&mut JavaStr, Utf8Error> {
         match std::str::from_utf8_mut(v) {
             Ok(str) => Ok(JavaStr::from_mut_str(str)),
             Err(err) => Err(Utf8Error::from_std(err)),

diff --git a/crates/valence_java_string/src/validations.rs b/crates/valence_java_string/src/validations.rs
@@ -6,7 +6,7 @@ pub(crate) const TAG_CONT: u8 = 0b1000_0000;
 pub(crate) const TAG_TWO_B: u8 = 0b1100_0000;
 pub(crate) const TAG_THREE_B: u8 = 0b1110_0000;
 pub(crate) const TAG_FOUR_B: u8 = 0b1111_0000;
-const CONT_MASK: u8 = 0b0011_1111;
+pub(crate) const CONT_MASK: u8 = 0b0011_1111;
 
 #[inline]
 const fn utf8_first_byte(byte: u8, width: u32) -> u32 {
@@ -244,7 +244,7 @@ pub(crate) const fn run_utf8_full_validation_from_semi(v: &[u8]) -> Result<(), U
 }
 
 #[inline]
-const fn utf8_char_width(first_byte: u8) -> usize {
+pub(crate) const fn utf8_char_width(first_byte: u8) -> usize {
     const UTF8_CHAR_WIDTH: [u8; 256] = [
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,