Skip to content

Commit

Permalink
Make utf16_iter/utf8_iter deps optional in normalizer (unicode-org#5927)
Browse files Browse the repository at this point in the history
Fixes unicode-org#4988


<!--
Thank you for your pull request to ICU4X!

Reminder: try to use [Conventional Comments](https://conventionalcomments.org/)
to make comments clearer.

Please see https://github.com/unicode-org/icu4x/blob/main/CONTRIBUTING.md
for general information on contributing to ICU4X.
-->
  • Loading branch information
Manishearth authored Dec 19, 2024
1 parent 3d25e72 commit 3e22528
Show file tree
Hide file tree
Showing 4 changed files with 60 additions and 17 deletions.
14 changes: 10 additions & 4 deletions components/normalizer/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -25,9 +25,9 @@ icu_collections = { workspace = true }
icu_properties = { workspace = true, optional = true }
icu_provider = { workspace = true, features = ["macros"] }
smallvec = { workspace = true }
utf16_iter = { workspace = true }
utf8_iter = { workspace = true }
write16 = { workspace = true, features = ["alloc"] }
utf16_iter = { workspace = true, optional = true }
utf8_iter = { workspace = true, optional = true }
write16 = { workspace = true, features = ["alloc"], optional = true }
zerovec = { workspace = true }

databake = { workspace = true, features = ["derive"], optional = true }
Expand All @@ -47,7 +47,7 @@ write16 = { workspace = true, features = ["arrayvec"] }
criterion = { workspace = true }

[features]
default = ["compiled_data"]
default = ["compiled_data", "utf8_iter", "utf16_iter"]
std = ["icu_collections/std", "icu_properties?/std", "icu_provider/std"]
serde = ["dep:serde", "icu_collections/serde", "zerovec/serde", "icu_properties?/serde", "icu_provider/serde"]
# n.b. "icu_properties" + "icu_properties?/datagen" is equivalent to "icu_properties/datagen", however
Expand All @@ -57,6 +57,12 @@ experimental = []
compiled_data = ["dep:icu_normalizer_data", "icu_properties?/compiled_data"]
icu_properties = ["dep:icu_properties"]

# For dealing with UTF16 strings
utf16_iter = ["dep:utf16_iter", "write16"]
# For dealing with potentially ill-formed UTF8 strings
utf8_iter = ["dep:utf8_iter"]

[[bench]]
name = "bench"
harness = false
# Only build this bench when both optional string-iterator features are enabled.
# Cargo spells this target key with a hyphen; the underscore form is reported as
# an unused manifest key and has no effect.
required-features = ["utf16_iter", "utf8_iter"]
2 changes: 1 addition & 1 deletion components/normalizer/fuzz/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ rust_icu_unorm2 = { version = "3", features = ["use-bindgen", "icu_config"] }
rust_icu_sys = { version = "3", features = ["use-bindgen", "icu_config"] }
rust_icu_ustring = { version = "3", features = ["use-bindgen", "icu_config"] }
encoding_rs = "0.8.31"
icu_normalizer = { path = ".." }
icu_normalizer = { path = "..", features = ["utf16_iter"] }
utf8_iter = "1.0.1"
utf16_iter = "1.0.3"

Expand Down
59 changes: 48 additions & 11 deletions components/normalizer/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -85,9 +85,7 @@ use crate::provider::DecompositionDataV2;
use crate::provider::Uts46DecompositionDataV2Marker;
use alloc::borrow::Cow;
use alloc::string::String;
use alloc::vec::Vec;
use core::char::REPLACEMENT_CHARACTER;
use core::str::from_utf8_unchecked;
use icu_collections::char16trie::Char16Trie;
use icu_collections::char16trie::Char16TrieIterator;
use icu_collections::char16trie::TrieResult;
Expand All @@ -100,9 +98,10 @@ use provider::CanonicalDecompositionTablesV1Marker;
use provider::CompatibilityDecompositionTablesV1Marker;
use provider::DecompositionTablesV1;
use smallvec::SmallVec;
#[cfg(feature = "utf16_iter")]
use utf16_iter::Utf16CharsEx;
#[cfg(feature = "utf8_iter")]
use utf8_iter::Utf8CharsEx;
use write16::Write16;
use zerovec::{zeroslice, ZeroSlice};

/// This type exists as a shim for icu_properties CanonicalCombiningClass when the crate is disabled
Expand Down Expand Up @@ -263,6 +262,7 @@ fn in_inclusive_range(c: char, start: char, end: char) -> bool {
}

#[inline(always)]
#[cfg(feature = "utf16_iter")]
fn in_inclusive_range16(u: u16, start: u16, end: u16) -> bool {
    // Range test with a single comparison: a `u` below `start` wraps around to a
    // large offset, so it fails the same `<=` check that rejects values above `end`.
    let width = end - start;
    let offset = u.wrapping_sub(start);
    offset <= width
}
Expand Down Expand Up @@ -372,6 +372,7 @@ impl CharacterAndTrieValue {

/// See trie-value-format.md
#[inline(always)]
#[cfg(feature = "utf8_iter")]
pub fn starter_and_decomposes_to_self_except_replacement(&self) -> bool {
// This intentionally leaves `NON_ROUND_TRIP_MARKER` in the value
// to be compared with zero. U+FFFD has that flag set despite really
Expand Down Expand Up @@ -1469,19 +1470,25 @@ macro_rules! normalizer_methods {
///
/// Unpaired surrogates are mapped to the REPLACEMENT CHARACTER
/// before normalizing.
///
/// ✨ *Enabled with the `utf16_iter` Cargo feature.*
#[cfg(feature = "utf16_iter")]
pub fn normalize_utf16<'a>(&self, text: &'a [u16]) -> Cow<'a, [u16]> {
let up_to = self.is_normalized_utf16_up_to(text);
if up_to == text.len() {
return Cow::Borrowed(text);
}
let mut ret = Vec::with_capacity(text.len());
let mut ret = alloc::vec::Vec::with_capacity(text.len());
let (head, tail) = text.split_at(up_to);
ret.extend_from_slice(head);
let _ = self.normalize_utf16_to(tail, &mut ret);
Cow::Owned(ret)
}

/// Return the index a slice of potentially-invalid UTF-16 is normalized up to.
///
/// ✨ *Enabled with the `utf16_iter` Cargo feature.*
#[cfg(feature = "utf16_iter")]
pub fn is_normalized_utf16_up_to(&self, text: &[u16]) -> usize {
let mut sink = IsNormalizedSinkUtf16::new(text);
let _ = self.normalize_utf16_to(text, &mut sink);
Expand All @@ -1491,6 +1498,9 @@ macro_rules! normalizer_methods {
/// Checks whether a slice of potentially-invalid UTF-16 is normalized.
///
/// Unpaired surrogates are treated as the REPLACEMENT CHARACTER.
///
/// ✨ *Enabled with the `utf16_iter` Cargo feature.*
#[cfg(feature = "utf16_iter")]
pub fn is_normalized_utf16(&self, text: &[u16]) -> bool {
let mut sink = IsNormalizedSinkUtf16::new(text);
if self.normalize_utf16_to(text, &mut sink).is_err() {
Expand All @@ -1503,6 +1513,9 @@ macro_rules! normalizer_methods {
///
/// Ill-formed byte sequences are mapped to the REPLACEMENT CHARACTER
/// according to the WHATWG Encoding Standard.
///
/// ✨ *Enabled with the `utf8_iter` Cargo feature.*
#[cfg(feature = "utf8_iter")]
pub fn normalize_utf8<'a>(&self, text: &'a [u8]) -> Cow<'a, str> {
let up_to = self.is_normalized_utf8_up_to(text);
if up_to == text.len() {
Expand All @@ -1521,6 +1534,9 @@ macro_rules! normalizer_methods {
}

/// Return the index a slice of potentially-invalid UTF-8 is normalized up to.
///
/// ✨ *Enabled with the `utf8_iter` Cargo feature.*
#[cfg(feature = "utf8_iter")]
pub fn is_normalized_utf8_up_to(&self, text: &[u8]) -> usize {
let mut sink = IsNormalizedSinkUtf8::new(text);
let _ = self.normalize_utf8_to(text, &mut sink);
Expand All @@ -1531,6 +1547,9 @@ macro_rules! normalizer_methods {
///
/// Ill-formed byte sequences are mapped to the REPLACEMENT CHARACTER
/// according to the WHATWG Encoding Standard before checking.
///
/// ✨ *Enabled with the `utf8_iter` Cargo feature.*
#[cfg(feature = "utf8_iter")]
pub fn is_normalized_utf8(&self, text: &[u8]) -> bool {
let mut sink = IsNormalizedSinkUtf8::new(text);
if self.normalize_utf8_to(text, &mut sink).is_err() {
Expand Down Expand Up @@ -1807,6 +1826,9 @@ impl DecomposingNormalizerBorrowed<'_> {
///
/// Ill-formed byte sequences are mapped to the REPLACEMENT CHARACTER
/// according to the WHATWG Encoding Standard.
///
/// ✨ *Enabled with the `utf8_iter` Cargo feature.*
#[cfg(feature = "utf8_iter")]
,
normalize_utf8_to,
core::fmt::Write,
Expand All @@ -1829,7 +1851,7 @@ impl DecomposingNormalizerBorrowed<'_> {
break 'fastest;
}
// End of stream
sink.write_str(unsafe { from_utf8_unchecked(pending_slice) })?;
sink.write_str(unsafe { core::str::from_utf8_unchecked(pending_slice) })?;
return Ok(());
}
decomposition.delegate = pending_slice[pending_slice.len() - code_unit_iter.as_slice().len() - 1..].chars();
Expand Down Expand Up @@ -1858,7 +1880,7 @@ impl DecomposingNormalizerBorrowed<'_> {
let back = consumed_so_far.next_back();
debug_assert_eq!(back, Some(REPLACEMENT_CHARACTER));
let consumed_so_far_slice = consumed_so_far.as_slice();
sink.write_str(unsafe{from_utf8_unchecked(consumed_so_far_slice)})?;
sink.write_str(unsafe { core::str::from_utf8_unchecked(consumed_so_far_slice) } )?;

// We could call `gather_and_sort_combining` here and
// `continue 'outer`, but this should be better for code
Expand All @@ -1871,7 +1893,7 @@ impl DecomposingNormalizerBorrowed<'_> {
let consumed_so_far_slice = &pending_slice[..pending_slice.len()
- decomposition.delegate.as_slice().len()
- upcoming.len_utf8()];
sink.write_str(unsafe{from_utf8_unchecked(consumed_so_far_slice)})?;
sink.write_str(unsafe { core::str::from_utf8_unchecked(consumed_so_far_slice) } )?;

// Now let's figure out if we got a starter or a non-starter.
if decomposition_starts_with_non_starter(
Expand Down Expand Up @@ -1902,6 +1924,9 @@ impl DecomposingNormalizerBorrowed<'_> {
///
/// Unpaired surrogates are mapped to the REPLACEMENT CHARACTER
/// before normalizing.
///
/// ✨ *Enabled with the `utf16_iter` Cargo feature.*
#[cfg(feature = "utf16_iter")]
,
normalize_utf16_to,
write16::Write16,
Expand Down Expand Up @@ -2383,6 +2408,9 @@ impl ComposingNormalizerBorrowed<'_> {
///
/// Ill-formed byte sequences are mapped to the REPLACEMENT CHARACTER
/// according to the WHATWG Encoding Standard.
///
/// ✨ *Enabled with the `utf8_iter` Cargo feature.*
#[cfg(feature = "utf8_iter")]
,
normalize_utf8_to,
core::fmt::Write,
Expand Down Expand Up @@ -2422,7 +2450,7 @@ impl ComposingNormalizerBorrowed<'_> {
let back = consumed_so_far.next_back();
debug_assert_eq!(back, Some(REPLACEMENT_CHARACTER));
let consumed_so_far_slice = consumed_so_far.as_slice();
sink.write_str(unsafe{ from_utf8_unchecked(consumed_so_far_slice)})?;
sink.write_str(unsafe { core::str::from_utf8_unchecked(consumed_so_far_slice) })?;
undecomposed_starter = CharacterAndTrieValue::new(REPLACEMENT_CHARACTER, 0);
composition.decomposition.pending = None;
break 'fast;
Expand All @@ -2442,11 +2470,11 @@ impl ComposingNormalizerBorrowed<'_> {
undecomposed_starter = composition.decomposition.attach_trie_value(consumed_so_far.next_back().unwrap());
}
let consumed_so_far_slice = consumed_so_far.as_slice();
sink.write_str(unsafe { from_utf8_unchecked(consumed_so_far_slice)})?;
sink.write_str(unsafe { core::str::from_utf8_unchecked(consumed_so_far_slice)})?;
break 'fast;
}
// End of stream
sink.write_str(unsafe {from_utf8_unchecked(pending_slice) })?;
sink.write_str(unsafe { core::str::from_utf8_unchecked(pending_slice) })?;
return Ok(());
}
},
Expand All @@ -2464,6 +2492,9 @@ impl ComposingNormalizerBorrowed<'_> {
///
/// Unpaired surrogates are mapped to the REPLACEMENT CHARACTER
/// before normalizing.
///
/// ✨ *Enabled with the `utf16_iter` Cargo feature.*
#[cfg(feature = "utf16_iter")]
,
normalize_utf16_to,
write16::Write16,
Expand Down Expand Up @@ -2695,10 +2726,12 @@ impl ComposingNormalizer {
}
}

/// Sink that the `is_normalized_utf16*` checks pass to `normalize_utf16_to`
/// in place of a real output buffer.
///
/// Only compiled when the `utf16_iter` Cargo feature is enabled.
#[cfg(feature = "utf16_iter")]
struct IsNormalizedSinkUtf16<'a> {
// Set by `new` to the UTF-16 text being checked.
expect: &'a [u16],
}

#[cfg(feature = "utf16_iter")]
impl<'a> IsNormalizedSinkUtf16<'a> {
pub fn new(slice: &'a [u16]) -> Self {
IsNormalizedSinkUtf16 { expect: slice }
Expand All @@ -2711,7 +2744,8 @@ impl<'a> IsNormalizedSinkUtf16<'a> {
}
}

impl Write16 for IsNormalizedSinkUtf16<'_> {
#[cfg(feature = "utf16_iter")]
impl write16::Write16 for IsNormalizedSinkUtf16<'_> {
fn write_slice(&mut self, s: &[u16]) -> core::fmt::Result {
// We know that if we get a slice, it's a pass-through,
// so we can compare addresses. Indexing is OK, because
Expand All @@ -2737,10 +2771,12 @@ impl Write16 for IsNormalizedSinkUtf16<'_> {
}
}

/// Sink that the `is_normalized_utf8*` checks pass to `normalize_utf8_to`
/// in place of a real output buffer.
///
/// Only compiled when the `utf8_iter` Cargo feature is enabled.
#[cfg(feature = "utf8_iter")]
struct IsNormalizedSinkUtf8<'a> {
// Set by `new` to the (potentially ill-formed) UTF-8 bytes being checked.
expect: &'a [u8],
}

#[cfg(feature = "utf8_iter")]
impl<'a> IsNormalizedSinkUtf8<'a> {
pub fn new(slice: &'a [u8]) -> Self {
IsNormalizedSinkUtf8 { expect: slice }
Expand All @@ -2753,6 +2789,7 @@ impl<'a> IsNormalizedSinkUtf8<'a> {
}
}

#[cfg(feature = "utf8_iter")]
impl core::fmt::Write for IsNormalizedSinkUtf8<'_> {
fn write_str(&mut self, s: &str) -> core::fmt::Result {
// We know that if we get a slice, it's a pass-through,
Expand Down
2 changes: 1 addition & 1 deletion ffi/capi/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ decimal = ["dep:icu_decimal", "dep:fixed_decimal"]
experimental = ["dep:icu_experimental"]
list = ["dep:icu_list"]
locale = ["dep:icu_locale"]
normalizer = ["dep:icu_normalizer"]
normalizer = ["dep:icu_normalizer", "icu_normalizer?/utf8_iter", "icu_normalizer?/utf16_iter"]
plurals = ["dep:icu_plurals", "dep:fixed_decimal"]
properties = ["dep:icu_properties", "dep:icu_collections", "dep:unicode-bidi"]
segmenter = ["dep:icu_segmenter"]
Expand Down

0 comments on commit 3e22528

Please sign in to comment.