Skip to content

Commit

Permalink
Address review comments
Browse files Browse the repository at this point in the history
  • Loading branch information
Earthcomputer committed Oct 4, 2023
1 parent 285872f commit bd23735
Show file tree
Hide file tree
Showing 5 changed files with 258 additions and 18 deletions.
19 changes: 19 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,25 @@ jobs:
- name: Run valence_nbt tests without preserve_order feature
run: cargo test -p valence_nbt --all-targets

valence-miri:
name: Miri Tests
runs-on: ubuntu-latest
steps:
- name: Checkout Actions Repository
uses: actions/checkout@v3

- name: Setup Rust toolchain and cache
uses: actions-rust-lang/[email protected]
with:
toolchain: "nightly"
components: "miri"

- name: Run tests
run: cargo miri test --workspace --all-features --doc

- name: Run doctests
run: cargo miri test --workspace --all-features --doc

extractor-build:
name: Build Extractor
runs-on: ubuntu-latest
Expand Down
2 changes: 1 addition & 1 deletion crates/java_string/README.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# valence_java_string
# java_string

An implementation of Java strings, tolerant of invalid UTF-16 encoding.
This allows for round-trip serialization of all Java strings, including those which contain invalid UTF-16, while still
Expand Down
23 changes: 12 additions & 11 deletions crates/java_string/src/cesu8.rs
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ impl JavaStr {
while i < bytes.len() {
let b = bytes[i];
if b == 0 {
encoded.extend([0xc0, 0x80].into_iter());
encoded.extend([0xc0, 0x80]);
i += 1;
} else if b < 128 {
// Pass ASCII through quickly.
Expand Down Expand Up @@ -101,8 +101,8 @@ impl JavaStr {
s.chars().next().unwrap_unchecked().as_u32() - 0x10000
};
let s = [((c >> 10) as u16) | 0xd800, ((c & 0x3ff) as u16) | 0xdc00];
encoded.extend(enc_surrogate(s[0]).into_iter());
encoded.extend(enc_surrogate(s[1]).into_iter());
encoded.extend(enc_surrogate(s[0]));
encoded.extend(enc_surrogate(s[1]));
}
i += w;
}
Expand All @@ -119,17 +119,18 @@ impl JavaString {
pub fn from_modified_utf8(bytes: Vec<u8>) -> Result<JavaString, Utf8Error> {
match JavaString::from_full_utf8(bytes) {
Ok(str) => Ok(str),
Err(err) => JavaString::from_modified_utf8_iter(err.bytes.into_iter()),
Err(err) => JavaString::from_modified_utf8_iter(err.bytes),
}
}

/// Converts from Java's [modified UTF-8](https://docs.oracle.com/javase/8/docs/api/java/io/DataInput.html#modified-utf-8) format to a `JavaString`.
///
/// See [JavaStr::from_modified_utf8].
pub fn from_modified_utf8_iter<I>(mut iter: I) -> Result<JavaString, Utf8Error>
pub fn from_modified_utf8_iter<I>(iter: I) -> Result<JavaString, Utf8Error>
where
I: Iterator<Item = u8>,
I: IntoIterator<Item = u8>,
{
let mut iter = iter.into_iter();
let mut index = 0;
let mut decoded = Vec::with_capacity(iter.size_hint().0);
let mut surrogate_first: Option<[u8; 3]> = None;
Expand Down Expand Up @@ -178,7 +179,7 @@ impl JavaString {

if first == 0 {
// modified UTF-8 should never contain \0 directly.
err!(None);
err!(Some(1));
} else if first < 128 {
flush_first_surrogate_half!();
// Pass ASCII through directly.
Expand All @@ -197,7 +198,7 @@ impl JavaString {
// Two-byte sequences can be used directly.
2 => {
flush_first_surrogate_half!();
decoded.extend([first, second].into_iter());
decoded.extend([first, second]);
}
3 => {
let third = next_cont!(Some(2));
Expand All @@ -208,7 +209,7 @@ impl JavaString {
| (0xed, 0x80..=0x9f)
| (0xee..=0xef, 0x80..=0xbf) => {
flush_first_surrogate_half!();
decoded.extend([first, second, third].into_iter())
decoded.extend([first, second, third])
}
// First half of a surrogate pair
(0xed, 0xa0..=0xaf) => {
Expand All @@ -222,10 +223,10 @@ impl JavaString {
let (fifth, sixth) = (second, third);
let (second, third) = (b, c);
let s = dec_surrogates(second, third, fifth, sixth);
decoded.extend(s.into_iter());
decoded.extend(s);
} else {
// no first half, append the second half directly
decoded.extend([first, second, third].into_iter());
decoded.extend([first, second, third]);
}
}
_ => err!(Some(1)),
Expand Down
230 changes: 225 additions & 5 deletions crates/java_string/src/serde.rs
Original file line number Diff line number Diff line change
@@ -1,14 +1,28 @@
use std::fmt::Formatter;

use serde::de::value::SeqAccessDeserializer;
use serde::de::{Error, SeqAccess, Unexpected, Visitor};
use serde::ser::SerializeSeq;
use serde::{Deserialize, Deserializer, Serialize, Serializer};

use crate::{JavaStr, JavaString};
use crate::{JavaCodePoint, JavaStr, JavaString};

impl Serialize for JavaString {
#[inline]
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
where
S: Serializer,
{
self.as_str_lossy().serialize(serializer)
match self.as_str() {
Ok(str) => str.serialize(serializer),
Err(_) => {
let mut seq = serializer.serialize_seq(None)?;
for ch in self.chars() {
seq.serialize_element(&ch.as_u32())?;
}
seq.end()
}
}
}
}

Expand All @@ -18,7 +32,57 @@ impl<'de> Deserialize<'de> for JavaString {
where
D: Deserializer<'de>,
{
String::deserialize(deserializer).map(JavaString::from)
deserializer.deserialize_any(JavaStringVisitor)
}
}

struct JavaStringVisitor;

impl<'de> Visitor<'de> for JavaStringVisitor {
type Value = JavaString;

fn expecting(&self, formatter: &mut Formatter) -> std::fmt::Result {
formatter.write_str("a JavaString")
}

fn visit_str<E>(self, v: &str) -> Result<Self::Value, E>
where
E: Error,
{
Ok(JavaString::from(v))
}

fn visit_string<E>(self, v: String) -> Result<Self::Value, E>
where
E: Error,
{
Ok(JavaString::from(v))
}

fn visit_bytes<E>(self, v: &[u8]) -> Result<Self::Value, E>
where
E: Error,
{
match JavaStr::from_semi_utf8(v) {
Ok(str) => Ok(str.to_owned()),
Err(_) => Err(Error::invalid_value(Unexpected::Bytes(v), &self)),
}
}

fn visit_byte_buf<E>(self, v: Vec<u8>) -> Result<Self::Value, E>
where
E: Error,
{
JavaString::from_semi_utf8(v)
.map_err(|err| Error::invalid_value(Unexpected::Bytes(&err.into_bytes()), &self))
}

fn visit_seq<A>(self, seq: A) -> Result<Self::Value, A::Error>
where
A: SeqAccess<'de>,
{
let vec = Vec::<u8>::deserialize(SeqAccessDeserializer::new(seq))?;
JavaString::from_semi_utf8(vec).map_err(|_| Error::invalid_value(Unexpected::Seq, &self))
}
}

Expand All @@ -28,7 +92,16 @@ impl Serialize for JavaStr {
where
S: Serializer,
{
self.as_str_lossy().serialize(serializer)
match self.as_str() {
Ok(str) => str.serialize(serializer),
Err(_) => {
let mut seq = serializer.serialize_seq(None)?;
for ch in self.chars() {
seq.serialize_element(&ch.as_u32())?;
}
seq.end()
}
}
}
}

Expand All @@ -38,6 +111,153 @@ impl<'de: 'a, 'a> Deserialize<'de> for &'a JavaStr {
where
D: Deserializer<'de>,
{
<&'a str>::deserialize(deserializer).map(JavaStr::from_str)
deserializer.deserialize_any(JavaStrVisitor)
}
}

struct JavaStrVisitor;

impl<'de> Visitor<'de> for JavaStrVisitor {
type Value = &'de JavaStr;

fn expecting(&self, formatter: &mut Formatter) -> std::fmt::Result {
formatter.write_str("a borrowed JavaStr")
}

fn visit_borrowed_str<E>(self, v: &'de str) -> Result<Self::Value, E>
where
E: Error,
{
Ok(JavaStr::from_str(v))
}

fn visit_borrowed_bytes<E>(self, v: &'de [u8]) -> Result<Self::Value, E>
where
E: Error,
{
JavaStr::from_semi_utf8(v).map_err(|_| Error::invalid_value(Unexpected::Bytes(v), &self))
}
}

impl Serialize for JavaCodePoint {
#[inline]
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
where
S: Serializer,
{
match self.as_char() {
Some(ch) => ch.serialize(serializer),
None => self.as_u32().serialize(serializer),
}
}
}

impl<'de> Deserialize<'de> for JavaCodePoint {
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
where
D: Deserializer<'de>,
{
deserializer.deserialize_any(JavaCodePointVisitor)
}
}

struct JavaCodePointVisitor;

impl<'de> Visitor<'de> for JavaCodePointVisitor {
type Value = JavaCodePoint;

fn expecting(&self, formatter: &mut Formatter) -> std::fmt::Result {
formatter.write_str("a character")
}

#[inline]
fn visit_i8<E>(self, v: i8) -> Result<Self::Value, E>
where
E: Error,
{
self.visit_i32(v as i32)
}

#[inline]
fn visit_i16<E>(self, v: i16) -> Result<Self::Value, E>
where
E: Error,
{
self.visit_i32(v as i32)
}

fn visit_i32<E>(self, v: i32) -> Result<Self::Value, E>
where
E: Error,
{
if v < 0 {
Err(Error::invalid_value(Unexpected::Signed(v as i64), &self))
} else {
self.visit_u32(v as u32)
}
}

fn visit_i64<E>(self, v: i64) -> Result<Self::Value, E>
where
E: Error,
{
if v < 0 {
Err(Error::invalid_value(Unexpected::Signed(v), &self))
} else {
self.visit_u64(v as u64)
}
}

#[inline]
fn visit_u8<E>(self, v: u8) -> Result<Self::Value, E>
where
E: Error,
{
self.visit_u32(v as u32)
}

#[inline]
fn visit_u16<E>(self, v: u16) -> Result<Self::Value, E>
where
E: Error,
{
self.visit_u32(v as u32)
}

fn visit_u32<E>(self, v: u32) -> Result<Self::Value, E>
where
E: Error,
{
JavaCodePoint::from_u32(v)
.ok_or_else(|| Error::invalid_value(Unexpected::Unsigned(v as u64), &self))
}

fn visit_u64<E>(self, v: u64) -> Result<Self::Value, E>
where
E: Error,
{
if v > u32::MAX as u64 {
Err(Error::invalid_value(Unexpected::Unsigned(v), &self))
} else {
self.visit_u32(v as u32)
}
}

fn visit_char<E>(self, v: char) -> Result<Self::Value, E>
where
E: Error,
{
Ok(JavaCodePoint::from_char(v))
}

fn visit_str<E>(self, v: &str) -> Result<Self::Value, E>
where
E: Error,
{
let mut iter = v.chars();
match (iter.next(), iter.next()) {
(Some(c), None) => Ok(JavaCodePoint::from_char(c)),
_ => Err(Error::invalid_value(Unexpected::Str(v), &self)),
}
}
}
2 changes: 1 addition & 1 deletion typos.toml
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
[files]
extend-exclude = ["*.svg", "*.json"]
extend-exclude = ["*.svg", "*.json", "crates/java_string/src/slice.rs"]

[default]
extend-ignore-re = ['\d+ths', 'CC BY-NC-ND']

0 comments on commit bd23735

Please sign in to comment.