diff --git a/components/experimental/src/transliterate/provider.rs b/components/experimental/src/transliterate/provider.rs index 1387bf709f4..319ab99897a 100644 --- a/components/experimental/src/transliterate/provider.rs +++ b/components/experimental/src/transliterate/provider.rs @@ -31,7 +31,6 @@ use zerovec::*; /// The data struct representing [UTS #35 transform rules](https://unicode.org/reports/tr35/tr35-general.html#Transforms). #[icu_provider::data_struct(TransliteratorRulesV1Marker = "transliterator/rules@1")] #[derive(Debug, Clone, PartialEq, Eq)] -#[cfg_attr(feature = "serde", derive(serde::Deserialize))] #[cfg_attr(feature = "datagen", derive(serde::Serialize, databake::Bake))] #[cfg_attr(feature = "datagen", databake(path = icu_experimental::transliterate::provider))] pub struct RuleBasedTransliterator<'a> { @@ -40,19 +39,57 @@ pub struct RuleBasedTransliterator<'a> { /// see, e.g., [Devanagari-Latin](https://github.com/unicode-org/cldr/blob/main/common/transforms/Devanagari-Latin.xml) pub visibility: bool, /// The [`VarTable`] containing any special matchers (variables, UnicodeSets, ...) used by this transliterator. - #[cfg_attr(feature = "serde", serde(borrow))] pub variable_table: VarTable<'a>, /// The filter for this transliterator. If there is none, the set of all code points is used. - #[cfg_attr(feature = "serde", serde(borrow))] pub filter: CodePointInversionList<'a>, /// The list of transform rule groups this transliterator uses. - #[cfg_attr(feature = "serde", serde(borrow))] pub id_group_list: VarZeroVec<'a, VarZeroSlice>, /// The list of conversion rule groups this transliterator uses. - #[cfg_attr(feature = "serde", serde(borrow))] pub rule_group_list: VarZeroVec<'a, VarZeroSlice>, } +#[cfg(feature = "serde")] +impl<'de> serde::Deserialize<'de> for RuleBasedTransliterator<'de> { + fn deserialize(deserializer: D) -> Result + where + D: serde::Deserializer<'de>, + { + use serde::de::Error; + #[derive(serde::Deserialize)] + pub struct Raw<'a> { + pub visibility: bool, + #[serde(borrow)] + pub variable_table: VarTable<'a>, + #[serde(borrow)] + pub filter: CodePointInversionList<'a>, + #[serde(borrow)] + pub id_group_list: VarZeroVec<'a, VarZeroSlice>, + #[serde(borrow)] + pub rule_group_list: VarZeroVec<'a, VarZeroSlice>, + } + + let Raw { + visibility, + variable_table, + filter, + id_group_list, + rule_group_list, + } = Raw::deserialize(deserializer)?; + if id_group_list.len() != rule_group_list.len() { + return Err(D::Error::custom( + "invalid data: id_group_list and rule_group_list have different lengths", + )); + } + Ok(Self { + visibility, + variable_table, + filter, + id_group_list, + rule_group_list, + }) + } +} + impl RuleBasedTransliterator<'_> { /// Returns an iterator of dependencies on other transliterators. /// diff --git a/components/experimental/src/transliterate/transliterator/mod.rs b/components/experimental/src/transliterate/transliterator/mod.rs index 57a5ff6ce83..b211ffce4a3 100644 --- a/components/experimental/src/transliterate/transliterator/mod.rs +++ b/components/experimental/src/transliterate/transliterator/mod.rs @@ -322,39 +322,32 @@ impl Transliterator { + ?Sized, F: Fn(&Locale) -> Option, DataError>>, { - let payload = Transliterator::load_rbt( + let mut env = LiteMap::new(); + + let transliterator = Transliterator::load_rbt( #[allow(clippy::unwrap_used)] // infallible DataMarkerAttributes::try_from_str(&locale.to_string().to_ascii_lowercase()).unwrap(), - transliterator_provider, - )?; - let rbt = payload.get(); - if !rbt.visibility { - // transliterator is internal - return Err(DataError::custom("internal only transliterator")); - } - let mut env = LiteMap::new(); - // Avoid recursive load - env.insert(locale.to_string(), InternalTransliterator::Null); - Transliterator::load_dependencies_recursive( - rbt, - &mut env, lookup, transliterator_provider, normalizer_provider, + false, + &mut env, )?; + Ok(Transliterator { - transliterator: payload, + transliterator, env, }) } - fn load_dependencies_recursive( - rbt: &RuleBasedTransliterator<'_>, - env: &mut LiteMap, + fn load_rbt( + marker_attributes: &DataMarkerAttributes, lookup: Option<&F>, transliterator_provider: &PT, normalizer_provider: &PN, - ) -> Result<(), DataError> + allow_internal: bool, + env: &mut LiteMap, + ) -> Result, DataError> where PT: DataProvider + ?Sized, PN: DataProvider @@ -365,11 +358,19 @@ impl Transliterator { + ?Sized, F: Fn(&Locale) -> Option, DataError>>, { - for dep in rbt.deps() { + let req = DataRequest { + id: DataIdentifierBorrowed::for_marker_attributes(marker_attributes), + ..Default::default() + }; + let transliterator = transliterator_provider.load(req)?.payload; + if !allow_internal && !transliterator.get().visibility { + return Err(DataError::custom("internal only transliterator")); + } + // Avoid recursive load + env.insert(marker_attributes.to_string(), InternalTransliterator::Null); + for dep in transliterator.get().deps() { if !env.contains_key(&*dep) { - // 1. Insert a placeholder to avoid infinite recursion. - env.insert(dep.to_string(), InternalTransliterator::Null); - // 2. Load the transliterator, by checking + // Load the transliterator, by checking let internal_t = // a) hardcoded specials dep.strip_prefix("x-").and_then(|special| Transliterator::load_special(special, normalizer_provider)) @@ -377,28 +378,20 @@ impl Transliterator { .or_else(|| Some(lookup?(&dep.parse().ok()?)?.map(InternalTransliterator::Dyn))) // c) the data .unwrap_or_else(|| { - let rbt = Transliterator::load_rbt( + Transliterator::load_rbt( #[allow(clippy::unwrap_used)] // infallible DataMarkerAttributes::try_from_str(&dep.to_ascii_lowercase()).unwrap(), + lookup, transliterator_provider, - )?; - Ok(InternalTransliterator::RuleBased(rbt)) + normalizer_provider, + true, + env, + ).map(InternalTransliterator::RuleBased) })?; - if let InternalTransliterator::RuleBased(rbt) = &internal_t { - // 3. Recursively load the dependencies of the dependency. - Self::load_dependencies_recursive( - rbt.get(), - env, - lookup, - transliterator_provider, - normalizer_provider, - )?; - } - // 4. Replace the placeholder with the loaded transliterator. env.insert(dep.to_string(), internal_t); } } - Ok(()) + Ok(transliterator) } fn load_special

( @@ -452,27 +445,6 @@ impl Transliterator { } } - fn load_rbt

( - marker_attributes: &DataMarkerAttributes, - provider: &P, - ) -> Result, DataError> - where - P: DataProvider + ?Sized, - { - let req = DataRequest { - id: DataIdentifierBorrowed::for_marker_attributes(marker_attributes), - ..Default::default() - }; - let payload = provider.load(req)?.payload; - let rbt = payload.get(); - if rbt.id_group_list.len() != rbt.rule_group_list.len() { - return Err(DataError::custom( - "invalid data: id_group_list and rule_group_list have different lengths", - )); - } - Ok(payload) - } - // Before stabilization, consider the input type we want to accept here. We might want to // use a data structure internally that works nicely with a &str, but if we don't, a String // is good to accept because the user might already have one.