Skip to content

Commit

Permalink
Return Cow instead of String/Vec in the normalizer
Browse files (browse the repository at this point in the history)
  • Loading branch information
hsivonen committed Dec 16, 2024
1 parent 6f10eaa commit d1b33d8
Show file tree
Hide file tree
Showing 2 changed files with 38 additions and 14 deletions.
48 changes: 36 additions & 12 deletions components/normalizer/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,7 @@ use crate::provider::CanonicalDecompositionDataV1Marker;
use crate::provider::CompatibilityDecompositionSupplementV1Marker;
use crate::provider::DecompositionDataV1;
use crate::provider::Uts46DecompositionSupplementV1Marker;
use alloc::borrow::Cow;
use alloc::string::String;
use alloc::vec::Vec;
use core::char::REPLACEMENT_CHARACTER;
Expand Down Expand Up @@ -1529,12 +1530,18 @@ macro_rules! decomposing_normalize_to {

macro_rules! normalizer_methods {
() => {
/// Normalize a string slice into a `String`.
pub fn normalize(&self, text: &str) -> String {
/// Normalize a string slice into a `Cow<'a, str>`.
pub fn normalize<'a>(&self, text: &'a str) -> Cow<'a, str> {
let up_to = self.is_normalized_up_to(text);
if up_to == text.len() {
return Cow::Borrowed(text);
}
let mut ret = String::new();
ret.reserve(text.len());
let _ = self.normalize_to(text, &mut ret);
ret
let (head, tail) = text.split_at(up_to);
ret.push_str(head);
let _ = self.normalize_to(tail, &mut ret);
Cow::Owned(ret)
}

/// Return the index a string slice is normalized up to.
Expand All @@ -1553,14 +1560,21 @@ macro_rules! normalizer_methods {
sink.finished()
}

/// Normalize a slice of potentially-invalid UTF-16 into a `Vec`.
/// Normalize a slice of potentially-invalid UTF-16 into a `Cow<'a, [u16]>`.
///
/// Unpaired surrogates are mapped to the REPLACEMENT CHARACTER
/// before normalizing.
pub fn normalize_utf16(&self, text: &[u16]) -> Vec<u16> {
pub fn normalize_utf16<'a>(&self, text: &'a [u16]) -> Cow<'a, [u16]> {
let up_to = self.is_normalized_utf16_up_to(text);
if up_to == text.len() {
return Cow::Borrowed(text);
}
let mut ret = Vec::new();
let _ = self.normalize_utf16_to(text, &mut ret);
ret
ret.reserve(text.len());
let (head, tail) = text.split_at(up_to);
ret.extend_from_slice(head);
let _ = self.normalize_utf16_to(tail, &mut ret);
Cow::Owned(ret)
}

/// Return the index a slice of potentially-invalid UTF-16 is normalized up to.
Expand All @@ -1581,15 +1595,25 @@ macro_rules! normalizer_methods {
sink.finished()
}

/// Normalize a slice of potentially-invalid UTF-8 into a `String`.
/// Normalize a slice of potentially-invalid UTF-8 into a `Cow<'a, str>`.
///
/// Ill-formed byte sequences are mapped to the REPLACEMENT CHARACTER
/// according to the WHATWG Encoding Standard.
pub fn normalize_utf8(&self, text: &[u8]) -> String {
pub fn normalize_utf8<'a>(&self, text: &'a [u8]) -> Cow<'a, str> {
let up_to = self.is_normalized_utf8_up_to(text);
if up_to == text.len() {
// SAFETY: The normalization check also checks for
// UTF-8 well-formedness.
return Cow::Borrowed(unsafe { core::str::from_utf8_unchecked(text) });
}
let mut ret = String::new();
ret.reserve(text.len());
let _ = self.normalize_utf8_to(text, &mut ret);
ret
let (head, tail) = text.split_at(up_to);
// SAFETY: The normalization check also checks for
// UTF-8 well-formedness.
ret.push_str(unsafe { core::str::from_utf8_unchecked(head) });
let _ = self.normalize_utf8_to(tail, &mut ret);
Cow::Owned(ret)
}

/// Return the index a slice of potentially-invalid UTF-8 is normalized up to
Expand Down
4 changes: 2 additions & 2 deletions components/normalizer/tests/tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1670,11 +1670,11 @@ fn test_utf16_basic() {
let normalizer = ComposingNormalizerBorrowed::new_nfc();

assert_eq!(
normalizer.normalize_utf16(&[0x0061]).as_slice(),
normalizer.normalize_utf16(&[0x0061]).as_ref(),
[0x0061].as_slice()
);
assert_eq!(
normalizer.normalize_utf16(&[0x0300, 0x0323]).as_slice(),
normalizer.normalize_utf16(&[0x0300, 0x0323]).as_ref(),
[0x0323, 0x0300].as_slice()
);
}
Expand Down

0 comments on commit d1b33d8

Please sign in to comment.