Skip to content

Commit

Permalink
Browse files: browse the repository at this point in the history
  • Loading branch information
rj00a committed Jun 9, 2024
1 parent 3a98063 commit 394ece3
Show file tree
Hide file tree
Showing 9 changed files with 154 additions and 211 deletions.
13 changes: 6 additions & 7 deletions crates/java_string/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,19 +1,18 @@
[package]
name = "java_string"
description = "An implementation of Java strings, tolerant of invalid UTF-16 encoding"
readme = "README.md"
version = "0.1.2"
description = "An implementation of Java strings, tolerant of invalid UTF-16 encoding"
keywords = ["java", "string", "utf16"]
edition.workspace = true
repository.workspace = true
documentation.workspace = true
license.workspace = true

[lints]
workspace = true

[features]
serde = ["dep:serde"]

[dependencies]
serde = { workspace = true, optional = true }
serde = { version = "1.0.200", optional = true }

[lints]
workspace = true

33 changes: 17 additions & 16 deletions crates/java_string/src/cesu8.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,16 +15,16 @@ impl JavaStr {
/// assert_eq!(JavaStr::from_str("Hello World!"), result);

Check warning on line 15 in crates/java_string/src/cesu8.rs

View workflow job for this annotation

GitHub Actions / Formatting

Diff in /home/runner/work/valence/valence/crates/java_string/src/cesu8.rs
///
/// let result = JavaStr::from_modified_utf8(&[
/// 0x61, 0x62, 0x63, 0xc0, 0x80, 0xe2, 0x84, 0x9d, 0xed, 0xa0, 0xbd, 0xed, 0xb2, 0xa3, 0xed,
/// 0xa0, 0x80,
/// 0x61, 0x62, 0x63, 0xC0, 0x80, 0xE2, 0x84, 0x9D, 0xED, 0xA0, 0xBD, 0xED, 0xB2, 0xA3, 0xED,
/// 0xA0, 0x80,
/// ])
/// .unwrap();
/// assert!(matches!(result, Cow::Owned(_)));
/// let mut expected = JavaString::from("abc\0ℝ💣");

Check warning on line 23 in crates/java_string/src/cesu8.rs

View workflow job for this annotation

GitHub Actions / Formatting

Diff in /home/runner/work/valence/valence/crates/java_string/src/cesu8.rs
/// expected.push_java(JavaCodePoint::from_u32(0xd800).unwrap());
/// expected.push_java(JavaCodePoint::from_u32(0xD800).unwrap());
/// assert_eq!(expected, result);
///
/// let result = JavaStr::from_modified_utf8(&[0xed]);
/// let result = JavaStr::from_modified_utf8(&[0xED]);
/// assert!(result.is_err());
/// ```
#[inline]
Expand All @@ -46,11 +46,11 @@ impl JavaStr {
/// assert_eq!(result, &b"Hello World!"[..]);

Check warning on line 46 in crates/java_string/src/cesu8.rs

View workflow job for this annotation

GitHub Actions / Formatting

Diff in /home/runner/work/valence/valence/crates/java_string/src/cesu8.rs
///
/// let mut str = JavaString::from("abc\0ℝ💣");
/// str.push_java(JavaCodePoint::from_u32(0xd800).unwrap());
/// str.push_java(JavaCodePoint::from_u32(0xD800).unwrap());
/// let result = str.to_modified_utf8();
/// let expected = [
/// 0x61, 0x62, 0x63, 0xc0, 0x80, 0xe2, 0x84, 0x9d, 0xed, 0xa0, 0xbd, 0xed, 0xb2, 0xa3, 0xed,
/// 0xa0, 0x80,
/// 0x61, 0x62, 0x63, 0xC0, 0x80, 0xE2, 0x84, 0x9D, 0xED, 0xA0, 0xBD, 0xED, 0xB2, 0xA3, 0xED,
/// 0xA0, 0x80,
/// ];
/// assert!(matches!(result, Cow::Owned(_)));
/// assert_eq!(result, &expected[..]);
Expand All @@ -73,7 +73,7 @@ impl JavaStr {
while i < bytes.len() {

Check warning on line 73 in crates/java_string/src/cesu8.rs

View workflow job for this annotation

GitHub Actions / Formatting

Diff in /home/runner/work/valence/valence/crates/java_string/src/cesu8.rs
let b = bytes[i];
if b == 0 {
encoded.extend([0xc0, 0x80]);
encoded.extend([0xC0, 0x80]);
i += 1;
} else if b < 128 {
// Pass ASCII through quickly.
Expand All @@ -100,7 +100,7 @@ impl JavaStr {
// SAFETY: s contains a single char of width 4

Check warning on line 100 in crates/java_string/src/cesu8.rs

View workflow job for this annotation

GitHub Actions / Formatting

Diff in /home/runner/work/valence/valence/crates/java_string/src/cesu8.rs
s.chars().next().unwrap_unchecked().as_u32() - 0x10000
};
let s = [((c >> 10) as u16) | 0xd800, ((c & 0x3ff) as u16) | 0xdc00];
let s = [((c >> 10) as u16) | 0xD800, ((c & 0x3FF) as u16) | 0xDC00];
encoded.extend(enc_surrogate(s[0]));
encoded.extend(enc_surrogate(s[1]));
}
Expand All @@ -114,7 +114,7 @@ impl JavaStr {
impl JavaString {
/// Converts from Java's [modified UTF-8](https://docs.oracle.com/javase/8/docs/api/java/io/DataInput.html#modified-utf-8) format to a `JavaString`.
///
/// See [JavaStr::from_modified_utf8].
/// See [`JavaStr::from_modified_utf8`].
#[inline]
pub fn from_modified_utf8(bytes: Vec<u8>) -> Result<JavaString, Utf8Error> {
match JavaString::from_full_utf8(bytes) {
Expand Down Expand Up @@ -168,7 +168,7 @@ impl JavaString {
} else if first < 128 {

Check warning on line 168 in crates/java_string/src/cesu8.rs

View workflow job for this annotation

GitHub Actions / Formatting

Diff in /home/runner/work/valence/valence/crates/java_string/src/cesu8.rs
// Pass ASCII through directly.
decoded.push(first);
} else if first == 0xc0 {
} else if first == 0xC0 {
// modified UTF-8 encoding of null character
match next!() {
0x80 => decoded.push(0),
Expand All @@ -184,6 +184,7 @@ impl JavaString {
}
3 => {
let third = next_cont!(Some(2));
#[allow(clippy::unnested_or_patterns)] // Justification: readability
match (first, second) {
// These are valid UTF-8, so pass them through.
(0xe0, 0xa0..=0xbf)
Expand Down Expand Up @@ -227,7 +228,7 @@ impl JavaString {

/// Converts to Java's [modified UTF-8](https://docs.oracle.com/javase/8/docs/api/java/io/DataInput.html#modified-utf-8) format.
///
/// See [JavaStr::to_modified_utf8].
/// See [`JavaStr::to_modified_utf8`].
#[inline]
#[must_use]
pub fn into_modified_utf8(self) -> Vec<u8> {
Expand All @@ -241,21 +242,21 @@ impl JavaString {

Check warning on line 242 in crates/java_string/src/cesu8.rs

View workflow job for this annotation

GitHub Actions / Formatting

Diff in /home/runner/work/valence/valence/crates/java_string/src/cesu8.rs
#[inline]
fn dec_surrogate(second: u8, third: u8) -> u32 {
0xd000 | ((second & CONT_MASK) as u32) << 6 | (third & CONT_MASK) as u32
0xD000 | u32::from(second & CONT_MASK) << 6 | u32::from(third & CONT_MASK)
}

#[inline]
fn dec_surrogates(second: u8, third: u8, fifth: u8, sixth: u8) -> [u8; 4] {
// Convert to a 32-bit code point.

Check warning on line 250 in crates/java_string/src/cesu8.rs

View workflow job for this annotation

GitHub Actions / Formatting

Diff in /home/runner/work/valence/valence/crates/java_string/src/cesu8.rs
let s1 = dec_surrogate(second, third);
let s2 = dec_surrogate(fifth, sixth);
let c = 0x10000 + (((s1 - 0xd800) << 10) | (s2 - 0xdc00));
assert!((0x010000..=0x10ffff).contains(&c));
let c = 0x10000 + (((s1 - 0xD800) << 10) | (s2 - 0xDC00));
assert!((0x010000..=0x10FFFF).contains(&c));

// Convert to UTF-8.
// 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
[
0b1111_0000u8 | ((c & 0b1_1100_0000_0000_0000_0000) >> 18) as u8,
0b1111_0000_u8 | ((c & 0b1_1100_0000_0000_0000_0000) >> 18) as u8,
TAG_CONT | ((c & 0b0_0011_1111_0000_0000_0000) >> 12) as u8,
TAG_CONT | ((c & 0b0_0000_0000_1111_1100_0000) >> 6) as u8,
TAG_CONT | (c & 0b0_0000_0000_0000_0011_1111) as u8,
Expand Down
52 changes: 26 additions & 26 deletions crates/java_string/src/char.rs
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ impl JavaCodePoint {
#[inline]

Check warning on line 62 in crates/java_string/src/char.rs

View workflow job for this annotation

GitHub Actions / Formatting

Diff in /home/runner/work/valence/valence/crates/java_string/src/char.rs
#[must_use]
pub const fn from_u32(i: u32) -> Option<JavaCodePoint> {
if i <= 0x10ffff {
if i <= 0x10FFFF {
unsafe { Some(Self::from_u32_unchecked(i)) }
} else {
None
Expand Down Expand Up @@ -94,7 +94,7 @@ impl JavaCodePoint {
/// ```

Check warning on line 94 in crates/java_string/src/char.rs

View workflow job for this annotation

GitHub Actions / Formatting

Diff in /home/runner/work/valence/valence/crates/java_string/src/char.rs
/// # use java_string::JavaCodePoint;
/// assert_eq!(65, JavaCodePoint::from_char('A').as_u32());
/// assert_eq!(0xd800, JavaCodePoint::from_u32(0xd800).unwrap().as_u32());
/// assert_eq!(0xD800, JavaCodePoint::from_u32(0xD800).unwrap().as_u32());
/// ```
#[inline]
#[must_use]
Expand All @@ -103,7 +103,7 @@ impl JavaCodePoint {
// SAFETY: JavaCodePoint has the same repr as a u32
let result = std::mem::transmute(self);

if result > 0x10ffff {
if result > 0x10FFFF {
// SAFETY: JavaCodePoint can never have a value > 0x10FFFF.
// This statement may allow the optimizer to remove branches in the calling code
// associated with out of bounds chars.
Expand All @@ -119,7 +119,7 @@ impl JavaCodePoint {
/// ```
/// # use java_string::JavaCodePoint;
/// assert_eq!(Some('a'), JavaCodePoint::from_char('a').as_char());
/// assert_eq!(None, JavaCodePoint::from_u32(0xd800).unwrap().as_char());
/// assert_eq!(None, JavaCodePoint::from_u32(0xD800).unwrap().as_char());
/// ```
#[inline]
#[must_use]
Expand Down Expand Up @@ -148,7 +148,7 @@ impl JavaCodePoint {
/// );
/// assert_eq!(
/// 1,
/// JavaCodePoint::from_u32(0xd800)
/// JavaCodePoint::from_u32(0xD800)
/// .unwrap()
/// .encode_utf16(&mut [0; 2])
/// .len()
Expand All @@ -170,7 +170,7 @@ impl JavaCodePoint {
}

/// Encodes this `JavaCodePoint` into semi UTF-8, that is, UTF-8 with
/// surrogate code points. See also [char::encode_utf8].
/// surrogate code points. See also [`char::encode_utf8`].
///
/// ```
/// # use java_string::JavaCodePoint;
Expand All @@ -182,7 +182,7 @@ impl JavaCodePoint {
/// );
/// assert_eq!(
/// 3,
/// JavaCodePoint::from_u32(0xd800)
/// JavaCodePoint::from_u32(0xD800)
/// .unwrap()
/// .encode_semi_utf8(&mut [0; 4])
/// .len()
Expand All @@ -202,19 +202,19 @@ impl JavaCodePoint {
*a = code as u8;
}
(2, [a, b, ..]) => {
*a = (code >> 6 & 0x1f) as u8 | TAG_TWO_B;
*b = (code & 0x3f) as u8 | TAG_CONT;
*a = (code >> 6 & 0x1F) as u8 | TAG_TWO_B;
*b = (code & 0x3F) as u8 | TAG_CONT;
}
(3, [a, b, c, ..]) => {
*a = (code >> 12 & 0x0f) as u8 | TAG_THREE_B;
*b = (code >> 6 & 0x3f) as u8 | TAG_CONT;
*c = (code & 0x3f) as u8 | TAG_CONT;
*a = (code >> 12 & 0x0F) as u8 | TAG_THREE_B;
*b = (code >> 6 & 0x3F) as u8 | TAG_CONT;
*c = (code & 0x3F) as u8 | TAG_CONT;
}
(4, [a, b, c, d, ..]) => {
*a = (code >> 18 & 0x07) as u8 | TAG_FOUR_B;
*b = (code >> 12 & 0x3f) as u8 | TAG_CONT;
*c = (code >> 6 & 0x3f) as u8 | TAG_CONT;
*d = (code & 0x3f) as u8 | TAG_CONT;
*b = (code >> 12 & 0x3F) as u8 | TAG_CONT;
*c = (code >> 6 & 0x3F) as u8 | TAG_CONT;
*d = (code & 0x3F) as u8 | TAG_CONT;
}
_ => panic!(
"encode_utf8: need {} bytes to encode U+{:X}, but the buffer has {}",
Expand Down Expand Up @@ -250,7 +250,7 @@ impl JavaCodePoint {
/// );
/// assert_eq!(
/// "\\u{d800}",
/// JavaCodePoint::from_u32(0xd800)
/// JavaCodePoint::from_u32(0xD800)
/// .unwrap()
/// .escape_debug()
/// .to_string()
Expand Down Expand Up @@ -317,7 +317,7 @@ impl JavaCodePoint {
/// );
/// assert_eq!(
/// "\\u{d800}",
/// JavaCodePoint::from_u32(0xd800)
/// JavaCodePoint::from_u32(0xD800)
/// .unwrap()
/// .escape_default()
/// .to_string()
Expand All @@ -342,7 +342,7 @@ impl JavaCodePoint {
SINGLE_QUOTE => CharEscapeIter::new([b'\\', b'\'']),
DOUBLE_QUOTE => CharEscapeIter::new([b'\\', b'"']),
BACKSLASH => CharEscapeIter::new([b'\\', b'\\']),
0x20..=0x7e => CharEscapeIter::new([self.as_u32() as u8]),
0x20..=0x7E => CharEscapeIter::new([self.as_u32() as u8]),
_ => self.escape_unicode(),
}
}
Expand All @@ -358,7 +358,7 @@ impl JavaCodePoint {
/// );
/// assert_eq!(
/// "\\u{d800}",
/// JavaCodePoint::from_u32(0xd800)
/// JavaCodePoint::from_u32(0xD800)
/// .unwrap()
/// .escape_unicode()
/// .to_string()
Expand Down Expand Up @@ -410,7 +410,7 @@ impl JavaCodePoint {
#[inline]
#[must_use]
pub fn is_ascii(self) -> bool {
self.as_u32() <= 0x7f
self.as_u32() <= 0x7F
}

/// See [`char::is_ascii_alphabetic`].
Expand All @@ -431,7 +431,7 @@ impl JavaCodePoint {
#[inline]
#[must_use]
pub const fn is_ascii_control(self) -> bool {
matches!(self.as_u32(), 0..=0x1f | 0x7f)
matches!(self.as_u32(), 0..=0x1F | 0x7F)
}

/// See [`char::is_ascii_digit`].
Expand All @@ -447,7 +447,7 @@ impl JavaCodePoint {
#[inline]
#[must_use]
pub const fn is_ascii_graphic(self) -> bool {
matches!(self.as_u32(), 0x21..=0x7e)
matches!(self.as_u32(), 0x21..=0x7E)
}

/// See [`char::is_ascii_hexdigit`].
Expand Down Expand Up @@ -485,7 +485,7 @@ impl JavaCodePoint {
pub const fn is_ascii_punctuation(self) -> bool {
matches!(
self.as_u32(),
(0x21..=0x2f) | (0x3a..=0x40) | (0x5b..=0x60) | (0x7b..=0x7e)
(0x21..=0x2F) | (0x3A..=0x40) | (0x5B..=0x60) | (0x7B..=0x7E)
)
}

Expand All @@ -505,7 +505,7 @@ impl JavaCodePoint {
const SPACE: u32 = ' ' as u32;
const HORIZONTAL_TAB: u32 = '\t' as u32;
const LINE_FEED: u32 = '\n' as u32;
const FORM_FEED: u32 = 0xc;
const FORM_FEED: u32 = 0xC;
const CARRIAGE_RETURN: u32 = '\r' as u32;
matches!(
self.as_u32(),
Expand Down Expand Up @@ -566,7 +566,7 @@ impl JavaCodePoint {
/// let len = JavaCodePoint::from_char('💣').len_utf16();
/// assert_eq!(len, 2);
///
/// assert_eq!(1, JavaCodePoint::from_u32(0xd800).unwrap().len_utf16());
/// assert_eq!(1, JavaCodePoint::from_u32(0xD800).unwrap().len_utf16());
/// ```
#[inline]
#[must_use]
Expand Down Expand Up @@ -595,7 +595,7 @@ impl JavaCodePoint {
/// let len = JavaCodePoint::from_char('💣').len_utf8();
/// assert_eq!(len, 4);
///
/// let len = JavaCodePoint::from_u32(0xd800).unwrap().len_utf8();
/// let len = JavaCodePoint::from_u32(0xD800).unwrap().len_utf8();
/// assert_eq!(len, 3);
/// ```
#[inline]
Expand Down
14 changes: 7 additions & 7 deletions crates/java_string/src/iter.rs
Original file line number Diff line number Diff line change
Expand Up @@ -649,17 +649,17 @@ where
}

match self.next_match_back() {
Some((index, len)) => unsafe {
Some((index, len)) => {
// SAFETY: pattern guarantees valid indices
let elt = self.haystack.get_unchecked(index + len..self.end);
let elt = unsafe { self.haystack.get_unchecked(index + len..self.end) };
self.end = index + len;
Some(elt)
},
None => unsafe {
// SAFETY: `self.start` and `self.end` always lie on unicode boundaries.
}
None => {
self.finished = true;
Some(self.haystack.get_unchecked(self.start..self.end))
},
// SAFETY: `self.start` and `self.end` always lie on unicode boundaries.
Some(unsafe { self.haystack.get_unchecked(self.start..self.end) })
}
}
}
}
Expand Down
Loading

0 comments on commit 394ece3

Please sign in to comment.