From bd237357c38249c65e6cb715b5e7d8c3259a9b55 Mon Sep 17 00:00:00 2001 From: Joe Date: Wed, 4 Oct 2023 16:52:07 +0100 Subject: [PATCH] Address review comments --- .github/workflows/ci.yml | 19 +++ crates/java_string/README.md | 2 +- crates/java_string/src/cesu8.rs | 23 ++-- crates/java_string/src/serde.rs | 230 +++++++++++++++++++++++++++++++- typos.toml | 2 +- 5 files changed, 258 insertions(+), 18 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 78e12f7f5..ea46962fd 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -117,6 +117,25 @@ jobs: - name: Run valence_nbt tests without preserve_order feature run: cargo test -p valence_nbt --all-targets + valence-miri: + name: Miri Tests + runs-on: ubuntu-latest + steps: + - name: Checkout Actions Repository + uses: actions/checkout@v3 + + - name: Setup Rust toolchain and cache + uses: actions-rust-lang/setup-rust-toolchain@v1.5.0 + with: + toolchain: "nightly" + components: "miri" + + - name: Run tests + run: cargo miri test --workspace --all-features --doc + + - name: Run doctests + run: cargo miri test --workspace --all-features --doc + extractor-build: name: Build Extractor runs-on: ubuntu-latest diff --git a/crates/java_string/README.md b/crates/java_string/README.md index d3a960ab0..7135d6454 100644 --- a/crates/java_string/README.md +++ b/crates/java_string/README.md @@ -1,4 +1,4 @@ -# valence_java_string +# java_string An implementation of Java strings, tolerant of invalid UTF-16 encoding. This allows for round-trip serialization of all Java strings, including those which contain invalid UTF-16, while still diff --git a/crates/java_string/src/cesu8.rs b/crates/java_string/src/cesu8.rs index 4964bc497..90fd1924a 100644 --- a/crates/java_string/src/cesu8.rs +++ b/crates/java_string/src/cesu8.rs @@ -73,7 +73,7 @@ impl JavaStr { while i < bytes.len() { let b = bytes[i]; if b == 0 { - encoded.extend([0xc0, 0x80].into_iter()); + encoded.extend([0xc0, 0x80]); i += 1; } else if b < 128 { // Pass ASCII through quickly. @@ -101,8 +101,8 @@ impl JavaStr { s.chars().next().unwrap_unchecked().as_u32() - 0x10000 }; let s = [((c >> 10) as u16) | 0xd800, ((c & 0x3ff) as u16) | 0xdc00]; - encoded.extend(enc_surrogate(s[0]).into_iter()); - encoded.extend(enc_surrogate(s[1]).into_iter()); + encoded.extend(enc_surrogate(s[0])); + encoded.extend(enc_surrogate(s[1])); } i += w; } @@ -119,17 +119,18 @@ impl JavaString { pub fn from_modified_utf8(bytes: Vec) -> Result { match JavaString::from_full_utf8(bytes) { Ok(str) => Ok(str), - Err(err) => JavaString::from_modified_utf8_iter(err.bytes.into_iter()), + Err(err) => JavaString::from_modified_utf8_iter(err.bytes), } } /// Converts from Java's [modified UTF-8](https://docs.oracle.com/javase/8/docs/api/java/io/DataInput.html#modified-utf-8) format to a `JavaString`. /// /// See [JavaStr::from_modified_utf8]. - pub fn from_modified_utf8_iter(mut iter: I) -> Result + pub fn from_modified_utf8_iter(iter: I) -> Result where - I: Iterator, + I: IntoIterator, { + let mut iter = iter.into_iter(); let mut index = 0; let mut decoded = Vec::with_capacity(iter.size_hint().0); let mut surrogate_first: Option<[u8; 3]> = None; @@ -178,7 +179,7 @@ impl JavaString { if first == 0 { // modified UTF-8 should never contain \0 directly. - err!(None); + err!(Some(1)); } else if first < 128 { flush_first_surrogate_half!(); // Pass ASCII through directly. @@ -197,7 +198,7 @@ impl JavaString { // Two-byte sequences can be used directly. 2 => { flush_first_surrogate_half!(); - decoded.extend([first, second].into_iter()); + decoded.extend([first, second]); } 3 => { let third = next_cont!(Some(2)); @@ -208,7 +209,7 @@ impl JavaString { | (0xed, 0x80..=0x9f) | (0xee..=0xef, 0x80..=0xbf) => { flush_first_surrogate_half!(); - decoded.extend([first, second, third].into_iter()) + decoded.extend([first, second, third]) } // First half of a surrogate pair (0xed, 0xa0..=0xaf) => { @@ -222,10 +223,10 @@ impl JavaString { let (fifth, sixth) = (second, third); let (second, third) = (b, c); let s = dec_surrogates(second, third, fifth, sixth); - decoded.extend(s.into_iter()); + decoded.extend(s); } else { // no first half, append the second half directly - decoded.extend([first, second, third].into_iter()); + decoded.extend([first, second, third]); } } _ => err!(Some(1)), diff --git a/crates/java_string/src/serde.rs b/crates/java_string/src/serde.rs index 71e31c173..e1c152d11 100644 --- a/crates/java_string/src/serde.rs +++ b/crates/java_string/src/serde.rs @@ -1,6 +1,11 @@ +use std::fmt::Formatter; + +use serde::de::value::SeqAccessDeserializer; +use serde::de::{Error, SeqAccess, Unexpected, Visitor}; +use serde::ser::SerializeSeq; use serde::{Deserialize, Deserializer, Serialize, Serializer}; -use crate::{JavaStr, JavaString}; +use crate::{JavaCodePoint, JavaStr, JavaString}; impl Serialize for JavaString { #[inline] @@ -8,7 +13,16 @@ impl Serialize for JavaString { where S: Serializer, { - self.as_str_lossy().serialize(serializer) + match self.as_str() { + Ok(str) => str.serialize(serializer), + Err(_) => { + let mut seq = serializer.serialize_seq(None)?; + for ch in self.chars() { + seq.serialize_element(&ch.as_u32())?; + } + seq.end() + } + } } } @@ -18,7 +32,57 @@ impl<'de> Deserialize<'de> for JavaString { where D: Deserializer<'de>, { - String::deserialize(deserializer).map(JavaString::from) + deserializer.deserialize_any(JavaStringVisitor) + } +} + +struct JavaStringVisitor; + +impl<'de> Visitor<'de> for JavaStringVisitor { + type Value = JavaString; + + fn expecting(&self, formatter: &mut Formatter) -> std::fmt::Result { + formatter.write_str("a JavaString") + } + + fn visit_str(self, v: &str) -> Result + where + E: Error, + { + Ok(JavaString::from(v)) + } + + fn visit_string(self, v: String) -> Result + where + E: Error, + { + Ok(JavaString::from(v)) + } + + fn visit_bytes(self, v: &[u8]) -> Result + where + E: Error, + { + match JavaStr::from_semi_utf8(v) { + Ok(str) => Ok(str.to_owned()), + Err(_) => Err(Error::invalid_value(Unexpected::Bytes(v), &self)), + } + } + + fn visit_byte_buf(self, v: Vec) -> Result + where + E: Error, + { + JavaString::from_semi_utf8(v) + .map_err(|err| Error::invalid_value(Unexpected::Bytes(&err.into_bytes()), &self)) + } + + fn visit_seq(self, seq: A) -> Result + where + A: SeqAccess<'de>, + { + let vec = Vec::::deserialize(SeqAccessDeserializer::new(seq))?; + JavaString::from_semi_utf8(vec).map_err(|_| Error::invalid_value(Unexpected::Seq, &self)) } } @@ -28,7 +92,16 @@ impl Serialize for JavaStr { where S: Serializer, { - self.as_str_lossy().serialize(serializer) + match self.as_str() { + Ok(str) => str.serialize(serializer), + Err(_) => { + let mut seq = serializer.serialize_seq(None)?; + for ch in self.chars() { + seq.serialize_element(&ch.as_u32())?; + } + seq.end() + } + } } } @@ -38,6 +111,153 @@ impl<'de: 'a, 'a> Deserialize<'de> for &'a JavaStr { where D: Deserializer<'de>, { - <&'a str>::deserialize(deserializer).map(JavaStr::from_str) + deserializer.deserialize_any(JavaStrVisitor) + } +} + +struct JavaStrVisitor; + +impl<'de> Visitor<'de> for JavaStrVisitor { + type Value = &'de JavaStr; + + fn expecting(&self, formatter: &mut Formatter) -> std::fmt::Result { + formatter.write_str("a borrowed JavaStr") + } + + fn visit_borrowed_str(self, v: &'de str) -> Result + where + E: Error, + { + Ok(JavaStr::from_str(v)) + } + + fn visit_borrowed_bytes(self, v: &'de [u8]) -> Result + where + E: Error, + { + JavaStr::from_semi_utf8(v).map_err(|_| Error::invalid_value(Unexpected::Bytes(v), &self)) + } +} + +impl Serialize for JavaCodePoint { + #[inline] + fn serialize(&self, serializer: S) -> Result + where + S: Serializer, + { + match self.as_char() { + Some(ch) => ch.serialize(serializer), + None => self.as_u32().serialize(serializer), + } + } +} + +impl<'de> Deserialize<'de> for JavaCodePoint { + fn deserialize(deserializer: D) -> Result + where + D: Deserializer<'de>, + { + deserializer.deserialize_any(JavaCodePointVisitor) + } +} + +struct JavaCodePointVisitor; + +impl<'de> Visitor<'de> for JavaCodePointVisitor { + type Value = JavaCodePoint; + + fn expecting(&self, formatter: &mut Formatter) -> std::fmt::Result { + formatter.write_str("a character") + } + + #[inline] + fn visit_i8(self, v: i8) -> Result + where + E: Error, + { + self.visit_i32(v as i32) + } + + #[inline] + fn visit_i16(self, v: i16) -> Result + where + E: Error, + { + self.visit_i32(v as i32) + } + + fn visit_i32(self, v: i32) -> Result + where + E: Error, + { + if v < 0 { + Err(Error::invalid_value(Unexpected::Signed(v as i64), &self)) + } else { + self.visit_u32(v as u32) + } + } + + fn visit_i64(self, v: i64) -> Result + where + E: Error, + { + if v < 0 { + Err(Error::invalid_value(Unexpected::Signed(v), &self)) + } else { + self.visit_u64(v as u64) + } + } + + #[inline] + fn visit_u8(self, v: u8) -> Result + where + E: Error, + { + self.visit_u32(v as u32) + } + + #[inline] + fn visit_u16(self, v: u16) -> Result + where + E: Error, + { + self.visit_u32(v as u32) + } + + fn visit_u32(self, v: u32) -> Result + where + E: Error, + { + JavaCodePoint::from_u32(v) + .ok_or_else(|| Error::invalid_value(Unexpected::Unsigned(v as u64), &self)) + } + + fn visit_u64(self, v: u64) -> Result + where + E: Error, + { + if v > u32::MAX as u64 { + Err(Error::invalid_value(Unexpected::Unsigned(v), &self)) + } else { + self.visit_u32(v as u32) + } + } + + fn visit_char(self, v: char) -> Result + where + E: Error, + { + Ok(JavaCodePoint::from_char(v)) + } + + fn visit_str(self, v: &str) -> Result + where + E: Error, + { + let mut iter = v.chars(); + match (iter.next(), iter.next()) { + (Some(c), None) => Ok(JavaCodePoint::from_char(c)), + _ => Err(Error::invalid_value(Unexpected::Str(v), &self)), + } } } diff --git a/typos.toml b/typos.toml index 8e54543f5..c59189146 100644 --- a/typos.toml +++ b/typos.toml @@ -1,5 +1,5 @@ [files] -extend-exclude = ["*.svg", "*.json"] +extend-exclude = ["*.svg", "*.json", "crates/java_string/src/slice.rs"] [default] extend-ignore-re = ['\d+ths', 'CC BY-NC-ND']