Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement JavaString #540

Merged
merged 11 commits into from
Oct 6, 2023
1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,7 @@ hmac = "0.12.1"
image = "0.24.6"
indexmap = "2.0.0"
itertools = "0.11.0"
java_string = { path = "crates/java_string", version = "0.1.0" }
lru = "0.11.0"
noise = "0.8.2"
num = "0.4.0"
Expand Down
366 changes: 186 additions & 180 deletions assets/depgraph.svg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
16 changes: 16 additions & 0 deletions crates/java_string/Cargo.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
[package]
name = "java_string"
description = "An implementation of Java strings, tolerant of invalid UTF-16 encoding"
readme = "README.md"
version = "0.1.0"
keywords = ["java", "string", "utf16"]
edition.workspace = true
repository.workspace = true
documentation.workspace = true
license.workspace = true

[features]
serde = ["dep:serde"]

[dependencies]
serde = { workspace = true, optional = true }
17 changes: 17 additions & 0 deletions crates/java_string/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# java_string
Earthcomputer marked this conversation as resolved.
Show resolved Hide resolved

An implementation of Java strings, tolerant of invalid UTF-16 encoding.
This allows for round-trip serialization of all Java strings, including those which contain invalid UTF-16, while still
being able to perform useful operations on those strings.

These Java strings use the UTF-8 encoding, with the modification that surrogate code points (code points between U+D800
and U+DFFF inclusive) are allowed. This allows for zero-cost conversion from Rust strings to Java strings. This modified
encoding is known as "semi-UTF-8" throughout the codebase. Similarly, this crate introduces a `JavaCodePoint` type which
is analogous to `char`, except that surrogate code points are allowed.

This crate is mostly undocumented, because most methods are entirely analogous to those of the same name in Rust's
strings. Please refer to the `std` documentation.

# Features

- `serde`: Adds support for [`serde`](https://docs.rs/serde/latest/serde/).
298 changes: 298 additions & 0 deletions crates/java_string/src/cesu8.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,298 @@
use std::borrow::Cow;

use crate::validations::{utf8_char_width, CONT_MASK, TAG_CONT};
use crate::{JavaStr, JavaString, Utf8Error};

impl JavaStr {
    /// Converts from Java's [modified UTF-8](https://docs.oracle.com/javase/8/docs/api/java/io/DataInput.html#modified-utf-8) format to a `Cow<JavaStr>`.
    ///
    /// The input is borrowed as-is when `JavaStr::from_full_utf8` accepts it; otherwise the
    /// bytes are re-decoded with `JavaString::from_modified_utf8_iter` and an owned string is
    /// returned.
    ///
    /// # Errors
    ///
    /// Returns a [`Utf8Error`] if the fallback decoder rejects `bytes`.
    ///
    /// ```
    /// # use std::borrow::Cow;
    /// # use java_string::{JavaCodePoint, JavaStr, JavaString};
    ///
    /// let result = JavaStr::from_modified_utf8("Hello World!".as_bytes()).unwrap();
    /// assert!(matches!(result, Cow::Borrowed(_)));
    /// assert_eq!(JavaStr::from_str("Hello World!"), result);
    ///
    /// let result = JavaStr::from_modified_utf8(&[
    ///     0x61, 0x62, 0x63, 0xc0, 0x80, 0xe2, 0x84, 0x9d, 0xed, 0xa0, 0xbd, 0xed, 0xb2, 0xa3, 0xed,
    ///     0xa0, 0x80,
    /// ])
    /// .unwrap();
    /// assert!(matches!(result, Cow::Owned(_)));
    /// let mut expected = JavaString::from("abc\0ℝ💣");
    /// expected.push_java(JavaCodePoint::from_u32(0xd800).unwrap());
    /// assert_eq!(expected, result);
    ///
    /// let result = JavaStr::from_modified_utf8(&[0xed]);
    /// assert!(result.is_err());
    /// ```
    #[inline]
    pub fn from_modified_utf8(bytes: &[u8]) -> Result<Cow<JavaStr>, Utf8Error> {
        match JavaStr::from_full_utf8(bytes) {
            Ok(str) => Ok(Cow::Borrowed(str)),
            Err(_) => JavaString::from_modified_utf8_iter(bytes.iter().copied()).map(Cow::Owned),
        }
    }

    /// Converts to Java's [modified UTF-8](https://docs.oracle.com/javase/8/docs/api/java/io/DataInput.html#modified-utf-8) format.
    ///
    /// The string's own bytes are borrowed when they contain no NUL byte and no 4-byte
    /// sequence; otherwise a re-encoded copy is returned.
    ///
    /// ```
    /// # use std::borrow::Cow;
    /// # use java_string::{JavaCodePoint, JavaStr, JavaString};
    ///
    /// let result = JavaStr::from_str("Hello World!").to_modified_utf8();
    /// assert!(matches!(result, Cow::Borrowed(_)));
    /// assert_eq!(result, &b"Hello World!"[..]);
    ///
    /// let mut str = JavaString::from("abc\0ℝ💣");
    /// str.push_java(JavaCodePoint::from_u32(0xd800).unwrap());
    /// let result = str.to_modified_utf8();
    /// let expected = [
    ///     0x61, 0x62, 0x63, 0xc0, 0x80, 0xe2, 0x84, 0x9d, 0xed, 0xa0, 0xbd, 0xed, 0xb2, 0xa3, 0xed,
    ///     0xa0, 0x80,
    /// ];
    /// assert!(matches!(result, Cow::Owned(_)));
    /// assert_eq!(result, &expected[..]);
    /// ```
    #[inline]
    #[must_use]
    pub fn to_modified_utf8(&self) -> Cow<[u8]> {
        if is_valid_cesu8(self) {
            Cow::Borrowed(self.as_bytes())
        } else {
            Cow::Owned(self.to_modified_utf8_internal())
        }
    }

    /// Re-encodes the string as modified UTF-8 into a fresh buffer.
    ///
    /// NUL becomes the two bytes `0xc0 0x80`, each 4-byte sequence becomes a pair of 3-byte
    /// encoded surrogates, and every other sequence is copied unchanged.
    #[inline]
    fn to_modified_utf8_internal(&self) -> Vec<u8> {
        let bytes = self.as_bytes();
        // Every input byte yields at least one output byte (NUL grows 1 -> 2 and 4-byte
        // sequences grow 4 -> 6), so reserve the input length plus 25% headroom.
        let mut encoded = Vec::with_capacity(bytes.len() + (bytes.len() >> 2));
        let mut i = 0;
        while i < bytes.len() {
            let b = bytes[i];
            if b == 0 {
                // Modified UTF-8 never contains a raw NUL byte.
                encoded.extend_from_slice(&[0xc0, 0x80]);
                i += 1;
            } else if b < 128 {
                // Pass ASCII through quickly.
                encoded.push(b);
                i += 1;
            } else {
                // Figure out how many bytes we need for this character.
                let w = utf8_char_width(b);
                let char_bytes = unsafe {
                    // SAFETY: input must be valid semi UTF-8, so there must be at least w more
                    // bytes from i
                    bytes.get_unchecked(i..i + w)
                };
                if w != 4 {
                    // Pass through short UTF-8 sequences unmodified.
                    encoded.extend_from_slice(char_bytes);
                } else {
                    // Encode 4-byte sequences as 6 bytes
                    let s = unsafe {
                        // SAFETY: input is valid semi UTF-8
                        JavaStr::from_semi_utf8_unchecked(char_bytes)
                    };
                    let c = unsafe {
                        // SAFETY: s contains a single char of width 4
                        s.chars().next().unwrap_unchecked().as_u32() - 0x10000
                    };
                    // Split the 20-bit offset into a high and a low surrogate code unit.
                    let high = ((c >> 10) as u16) | 0xd800;
                    let low = ((c & 0x3ff) as u16) | 0xdc00;
                    encoded.extend_from_slice(&enc_surrogate(high));
                    encoded.extend_from_slice(&enc_surrogate(low));
                }
                i += w;
            }
        }
        encoded
    }
}

impl JavaString {
/// Converts from Java's [modified UTF-8](https://docs.oracle.com/javase/8/docs/api/java/io/DataInput.html#modified-utf-8) format to a `JavaString`.
///
/// See [JavaStr::from_modified_utf8].
#[inline]
pub fn from_modified_utf8(bytes: Vec<u8>) -> Result<JavaString, Utf8Error> {
match JavaString::from_full_utf8(bytes) {
Ok(str) => Ok(str),
Err(err) => JavaString::from_modified_utf8_iter(err.bytes.into_iter()),
}
}

/// Converts from Java's [modified UTF-8](https://docs.oracle.com/javase/8/docs/api/java/io/DataInput.html#modified-utf-8) format to a `JavaString`.
///
/// See [JavaStr::from_modified_utf8].
pub fn from_modified_utf8_iter<I>(mut iter: I) -> Result<JavaString, Utf8Error>
where
I: Iterator<Item = u8>,
Earthcomputer marked this conversation as resolved.
Show resolved Hide resolved
{
let mut index = 0;
let mut decoded = Vec::with_capacity(iter.size_hint().0);
let mut surrogate_first: Option<[u8; 3]> = None;

macro_rules! flush_first_surrogate_half {
() => {
// append any preceding first half of a surrogate pair
if let Some(surrogate_first) = surrogate_first.take() {
decoded.extend(surrogate_first.into_iter());
}
};
}

while let Some(first) = iter.next() {
let old_offset = index;

macro_rules! err {
($error_len:expr) => {
return Err(Utf8Error {
valid_up_to: old_offset,
error_len: $error_len,
})
};
}

macro_rules! next {
() => {{
index += 1;
match iter.next() {
Some(a) => a,
None => err!(None),
}
}};
}

macro_rules! next_cont {
($error_len:expr) => {{
let byte = next!();
if (byte) & !CONT_MASK == TAG_CONT {
byte
} else {
err!($error_len)
}
}};
}

if first == 0 {
// modified UTF-8 should never contain \0 directly.
err!(None);
Earthcomputer marked this conversation as resolved.
Show resolved Hide resolved
} else if first < 128 {
flush_first_surrogate_half!();
// Pass ASCII through directly.
decoded.push(first);
} else if first == 0xc0 {
flush_first_surrogate_half!();
// modified UTF-8 encoding of null character
match next!() {
0x80 => decoded.push(0),
_ => err!(Some(1)),
}
} else {
let w = utf8_char_width(first);
let second = next_cont!(Some(1));
match w {
// Two-byte sequences can be used directly.
2 => {
flush_first_surrogate_half!();
decoded.extend([first, second].into_iter());
Earthcomputer marked this conversation as resolved.
Show resolved Hide resolved
}
3 => {
let third = next_cont!(Some(2));
match (first, second) {
// These are valid UTF-8, so pass them through.
(0xe0, 0xa0..=0xbf)
| (0xe1..=0xec, 0x80..=0xbf)
| (0xed, 0x80..=0x9f)
| (0xee..=0xef, 0x80..=0xbf) => {
flush_first_surrogate_half!();
decoded.extend([first, second, third].into_iter())
Earthcomputer marked this conversation as resolved.
Show resolved Hide resolved
}
// First half of a surrogate pair
(0xed, 0xa0..=0xaf) => {
flush_first_surrogate_half!();
surrogate_first = Some([first, second, third]);
}
Earthcomputer marked this conversation as resolved.
Show resolved Hide resolved
// Second half of a surrogate pair
(0xed, 0xb0..=0xbf) => {
// try to pair the second half with a preceding first half
if let Some([_, b, c]) = surrogate_first.take() {
let (fifth, sixth) = (second, third);
let (second, third) = (b, c);
let s = dec_surrogates(second, third, fifth, sixth);
decoded.extend(s.into_iter());
Earthcomputer marked this conversation as resolved.
Show resolved Hide resolved
} else {
// no first half, append the second half directly
decoded.extend([first, second, third].into_iter());
Earthcomputer marked this conversation as resolved.
Show resolved Hide resolved
}
}
_ => err!(Some(1)),
}
}
_ => err!(Some(1)), // modified UTF-8 doesn't allow width 4
}
}
}

flush_first_surrogate_half!();

unsafe {
// SAFETY: we built a semi UTF-8 encoded string
Ok(JavaString::from_semi_utf8_unchecked(decoded))
}
}

/// Converts to Java's [modified UTF-8](https://docs.oracle.com/javase/8/docs/api/java/io/DataInput.html#modified-utf-8) format.
///
/// See [JavaStr::to_modified_utf8].
#[inline]
#[must_use]
pub fn into_modified_utf8(self) -> Vec<u8> {
if is_valid_cesu8(&self) {
self.into_bytes()
} else {
self.to_modified_utf8_internal()
}
}
}

/// Rebuilds one surrogate code unit from the second and third bytes of its 3-byte encoding.
///
/// The top bits are fixed at `0xd000`; the payload bits of `second` and `third` (selected by
/// `CONT_MASK`) fill in the rest.
#[inline]
fn dec_surrogate(second: u8, third: u8) -> u32 {
    let upper_payload = u32::from(second & CONT_MASK) << 6;
    let lower_payload = u32::from(third & CONT_MASK);
    0xd000 | upper_payload | lower_payload
}

/// Combines an encoded surrogate pair into the 4-byte encoding of the code point it denotes.
///
/// `second`/`third` are the trailing bytes of the first half, `fifth`/`sixth` those of the
/// second half.
///
/// # Panics
///
/// Panics if the combined code point falls outside `0x010000..=0x10ffff`.
#[inline]
fn dec_surrogates(second: u8, third: u8, fifth: u8, sixth: u8) -> [u8; 4] {
    // Recover both code units, then merge them into a single code point.
    let high = dec_surrogate(second, third);
    let low = dec_surrogate(fifth, sixth);
    let c = 0x10000 + (((high - 0xd800) << 10) | (low - 0xdc00));
    assert!((0x010000..=0x10ffff).contains(&c));

    // Layout: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    let lead = 0b1111_0000u8 | ((c >> 18) & 0b0000_0111) as u8;
    let continuation = |shift: u32| TAG_CONT | ((c >> shift) & 0b0011_1111) as u8;
    [lead, continuation(12), continuation(6), continuation(0)]
}

/// Reports whether `text`'s bytes can be used as modified UTF-8 without re-encoding.
///
/// That holds when no byte is NUL and no non-continuation byte has a width greater than 3.
#[inline]
fn is_valid_cesu8(text: &JavaStr) -> bool {
    let needs_reencoding = |b: u8| {
        let is_continuation = (b & !CONT_MASK) == TAG_CONT;
        b == 0 || (!is_continuation && utf8_char_width(b) > 3)
    };
    !text.bytes().any(needs_reencoding)
}

/// Encodes one 16-bit surrogate code unit as a 3-byte sequence.
///
/// Layout: `1110xxxx 10xxxxxx 10xxxxxx`.
#[inline]
fn enc_surrogate(surrogate: u16) -> [u8; 3] {
    let top_four = (surrogate >> 12) as u8;
    let middle_six = ((surrogate >> 6) & 0b0011_1111) as u8;
    let bottom_six = (surrogate & 0b0011_1111) as u8;
    [
        0b1110_0000 | top_four,
        TAG_CONT | middle_six,
        TAG_CONT | bottom_six,
    ]
}
Loading
Loading