Skip to content

Commit

Permalink
Mention trie-value-format.md in various places
Browse files Browse the repository at this point in the history
  • Loading branch information
hsivonen committed Nov 14, 2024
1 parent 380df55 commit e5faa23
Show file tree
Hide file tree
Showing 2 changed files with 50 additions and 4 deletions.
26 changes: 22 additions & 4 deletions components/collator/src/elements.rs
Original file line number Diff line number Diff line change
Expand Up @@ -32,48 +32,61 @@ use zerovec::{zeroslice, ZeroSlice};
use crate::provider::CollationDataV1;

/// Flag bit (bit 30) marking that the decomposition does not
/// round trip via NFC.
///
/// See components/normalizer/trie-value-format.md
const NON_ROUND_TRIP_MARKER: u32 = 0x4000_0000;

/// Flag bit (bit 31) marking that the first character of the
/// decomposition can combine backwards.
///
/// See components/normalizer/trie-value-format.md
const BACKWARD_COMBINING_MARKER: u32 = 0x8000_0000;

/// Mask (bits 16 through 29) for the bits that have to be zero
/// for a trie value to be a BMP singleton decomposition, or a
/// value baked into the surrogate range.
///
/// See components/normalizer/trie-value-format.md
const HIGH_ZEROS_MASK: u32 = 0x3FFF_0000;

/// Mask (bits 5 through 15) for the bits that have to be zero
/// for a trie value to be a complex decomposition.
///
/// See components/normalizer/trie-value-format.md
const LOW_ZEROS_MASK: u32 = 0x0000_FFE0;

/// Marker value for U+FDFA in NFKD. (Unified with the Hangul syllable
/// marker, but the two differ by `NON_ROUND_TRIP_MARKER`.)
///
/// See components/normalizer/trie-value-format.md
const FDFA_MARKER: u16 = 1;

/// Checks if a trie value carries a (non-zero) canonical
/// combining class.
///
/// The two highest bits (30 and 31) are ignored. With those cleared,
/// the check succeeds exactly for values in `0xD800..=0xD9FF`.
///
/// See components/normalizer/trie-value-format.md
fn trie_value_has_ccc(trie_value: u32) -> bool {
    // Clear bits 30 and 31 together with the low nine bits, then
    // compare what is left against the 0xD800 base.
    let masked = trie_value & 0x3FFF_FE00;
    masked == 0xD800
}

/// Checks if the trie value signifies a special non-starter decomposition.
///
/// The two highest bits (30 and 31) are ignored. With those cleared,
/// the check succeeds exactly for values in `0xD900..=0xD9FF`.
///
/// See components/normalizer/trie-value-format.md
fn trie_value_indicates_special_non_starter_decomposition(trie_value: u32) -> bool {
    // Clear bits 30 and 31 together with the low eight bits, then
    // compare what is left against the 0xD900 base.
    let masked = trie_value & 0x3FFF_FF00;
    masked == 0xD900
}

/// Checks if a trie value signifies a character whose decomposition
/// starts with a non-starter.
///
/// This delegates to `trie_value_has_ccc` and returns its result
/// unchanged for the same trie value.
///
/// See components/normalizer/trie-value-format.md
fn decomposition_starts_with_non_starter(trie_value: u32) -> bool {
trie_value_has_ccc(trie_value)
}

/// Extracts a canonical combining class (possibly zero) from a trie value.
///
/// # Panics
///
/// The trie value must not be one that signifies a special non-starter
/// decomposition. (Debug-only)
/// See components/normalizer/trie-value-format.md
fn ccc_from_trie_value(trie_value: u32) -> CanonicalCombiningClass {
if trie_value_has_ccc(trie_value) {
CanonicalCombiningClass(trie_value as u8)
Expand Down Expand Up @@ -795,6 +808,8 @@ where
/// The `CollationElement32` mapping for the Combining Diacritical Marks block.
diacritics: &'data ZeroSlice<u16>,
/// NFD main trie.
///
/// See components/normalizer/trie-value-format.md
trie: &'data CodePointTrie<'data, u32>,
/// NFD complex decompositions on the BMP
scalars16: &'data ZeroSlice<u16>,
Expand Down Expand Up @@ -1026,6 +1041,8 @@ where
// Hangul syllables in lookahead, because Hangul isn't allowed to
// participate in contractions, and the trie default is that a character
// is its own decomposition.

// See components/normalizer/trie-value-format.md
let decomposition = c.trie_val;
if (decomposition & !(BACKWARD_COMBINING_MARKER | NON_ROUND_TRIP_MARKER)) <= 1 {
// The character is its own decomposition (or Hangul syllable)
Expand Down Expand Up @@ -1261,6 +1278,7 @@ where
// optimize based on that bet.
let hangul_offset = u32::from(c).wrapping_sub(HANGUL_S_BASE); // SIndex in the spec
if hangul_offset >= HANGUL_S_COUNT {
// See components/normalizer/trie-value-format.md
let decomposition = c_c_tv.trie_val;
if (decomposition & !(BACKWARD_COMBINING_MARKER | NON_ROUND_TRIP_MARKER)) == 0 {
// The character is its own decomposition
Expand Down
28 changes: 28 additions & 0 deletions components/normalizer/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -134,42 +134,60 @@ enum IgnorableBehavior {
}

/// Marker for UTS 46 ignorables. (All 32 bits set.)
///
/// See trie-value-format.md
const IGNORABLE_MARKER: u32 = u32::MAX;

/// Flag bit (bit 30) marking that the decomposition does not
/// round trip via NFC.
///
/// See trie-value-format.md
const NON_ROUND_TRIP_MARKER: u32 = 0x4000_0000;

/// Flag bit (bit 31) marking that the first character of the
/// decomposition can combine backwards.
///
/// See trie-value-format.md
const BACKWARD_COMBINING_MARKER: u32 = 0x8000_0000;

/// Mask (bits 16 through 29) for the bits that have to be zero
/// for a trie value to be a BMP singleton decomposition, or a
/// value baked into the surrogate range.
///
/// See trie-value-format.md
const HIGH_ZEROS_MASK: u32 = 0x3FFF_0000;

/// Mask (bits 5 through 15) for the bits that have to be zero
/// for a trie value to be a complex decomposition.
///
/// See trie-value-format.md
const LOW_ZEROS_MASK: u32 = 0x0000_FFE0;

/// Checks if a trie value carries a (non-zero) canonical
/// combining class.
///
/// The two highest bits (30 and 31) are ignored. With those cleared,
/// the check succeeds exactly for values in `0xD800..=0xD9FF`.
///
/// See trie-value-format.md
fn trie_value_has_ccc(trie_value: u32) -> bool {
    // Clear bits 30 and 31 together with the low nine bits, then
    // compare what is left against the 0xD800 base.
    let masked = trie_value & 0x3FFF_FE00;
    masked == 0xD800
}

/// Checks if the trie value signifies a special non-starter decomposition.
///
/// The two highest bits (30 and 31) are ignored. With those cleared,
/// the check succeeds exactly for values in `0xD900..=0xD9FF`.
///
/// See trie-value-format.md
fn trie_value_indicates_special_non_starter_decomposition(trie_value: u32) -> bool {
    // Clear bits 30 and 31 together with the low eight bits, then
    // compare what is left against the 0xD900 base.
    let masked = trie_value & 0x3FFF_FF00;
    masked == 0xD900
}

/// Checks if a trie value signifies a character whose decomposition
/// starts with a non-starter.
///
/// This delegates to `trie_value_has_ccc` and returns its result
/// unchanged for the same trie value.
///
/// See trie-value-format.md
fn decomposition_starts_with_non_starter(trie_value: u32) -> bool {
trie_value_has_ccc(trie_value)
}

/// Extracts a canonical combining class (possibly zero) from a trie value.
///
/// See trie-value-format.md
fn ccc_from_trie_value(trie_value: u32) -> CanonicalCombiningClass {
if trie_value_has_ccc(trie_value) {
CanonicalCombiningClass(trie_value as u8)
Expand All @@ -187,6 +205,8 @@ static FDFA_NFKD: [u16; 17] = [

/// Marker value for U+FDFA in NFKD. (Unified with the Hangul syllable
/// marker, but the two differ by `NON_ROUND_TRIP_MARKER`.)
///
/// See trie-value-format.md
const FDFA_MARKER: u16 = 1;

// These constants originate from page 143 of Unicode 14.0
Expand Down Expand Up @@ -318,13 +338,15 @@ fn compose_non_hangul(mut iter: Char16TrieIterator, starter: char, second: char)
}
}

/// Returns `true` when `trie_val` is zero once the
/// `BACKWARD_COMBINING_MARKER` and `NON_ROUND_TRIP_MARKER` bits
/// have been masked out.
///
/// See trie-value-format.md
#[inline(always)]
fn starter_and_decomposes_to_self_impl(trie_val: u32) -> bool {
    // The REPLACEMENT CHARACTER has `NON_ROUND_TRIP_MARKER` set,
    // and this function needs to ignore that, so both marker bits
    // are removed before comparing against zero.
    let marker_bits = BACKWARD_COMBINING_MARKER | NON_ROUND_TRIP_MARKER;
    let without_markers = trie_val & !marker_bits;
    without_markers == 0
}

/// See trie-value-format.md
#[inline(always)]
fn potential_passthrough_and_cannot_combine_backwards_impl(trie_val: u32) -> bool {
(trie_val & (NON_ROUND_TRIP_MARKER | BACKWARD_COMBINING_MARKER)) == 0
Expand All @@ -337,6 +359,7 @@ fn potential_passthrough_and_cannot_combine_backwards_impl(trie_val: u32) -> boo
#[derive(Debug, PartialEq, Eq)]
struct CharacterAndTrieValue {
/// The character half of the pair.
// NOTE(review): presumably the character that `trie_val` was looked
// up for; confirm against the code that constructs this struct.
character: char,
/// The `u32` trie value half of the pair.
///
/// See trie-value-format.md
trie_val: u32,
}

Expand All @@ -354,6 +377,7 @@ impl CharacterAndTrieValue {
starter_and_decomposes_to_self_impl(self.trie_val)
}

/// See trie-value-format.md
#[inline(always)]
pub fn starter_and_decomposes_to_self_except_replacement(&self) -> bool {
// This intentionally leaves `NON_ROUND_TRIP_MARKER` in the value
Expand All @@ -363,14 +387,17 @@ impl CharacterAndTrieValue {
(self.trie_val & !BACKWARD_COMBINING_MARKER) == 0
}

/// Returns `true` if the `BACKWARD_COMBINING_MARKER` bit is set
/// in `self.trie_val`.
///
/// See trie-value-format.md
#[inline(always)]
pub fn can_combine_backwards(&self) -> bool {
    let backward_bit = self.trie_val & BACKWARD_COMBINING_MARKER;
    backward_bit != 0
}
/// Returns `true` if the `NON_ROUND_TRIP_MARKER` bit is clear
/// in `self.trie_val`.
///
/// See trie-value-format.md
#[inline(always)]
pub fn potential_passthrough(&self) -> bool {
    let non_round_trip_bit = self.trie_val & NON_ROUND_TRIP_MARKER;
    non_round_trip_bit == 0
}
/// See trie-value-format.md
#[inline(always)]
pub fn potential_passthrough_and_cannot_combine_backwards(&self) -> bool {
potential_passthrough_and_cannot_combine_backwards_impl(self.trie_val)
Expand Down Expand Up @@ -696,6 +723,7 @@ where
fn decomposing_next(&mut self, c_and_trie_val: CharacterAndTrieValue) -> char {
let (starter, combining_start) = {
let c = c_and_trie_val.character;
// See trie-value-format.md
let decomposition = c_and_trie_val.trie_val;
// The REPLACEMENT CHARACTER has `NON_ROUND_TRIP_MARKER` set,
// and that flag needs to be ignored here.
Expand Down

0 comments on commit e5faa23

Please sign in to comment.