Skip to content

Commit

Permalink
Move transliterator data struct invariant (#5705)
Browse files Browse the repository at this point in the history
  • Loading branch information
robertbastian authored Oct 18, 2024
1 parent 5ad31bf commit c717cb0
Show file tree
Hide file tree
Showing 2 changed files with 73 additions and 64 deletions.
47 changes: 42 additions & 5 deletions components/experimental/src/transliterate/provider.rs
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,6 @@ use zerovec::*;
/// The data struct representing [UTS #35 transform rules](https://unicode.org/reports/tr35/tr35-general.html#Transforms).
#[icu_provider::data_struct(TransliteratorRulesV1Marker = "transliterator/rules@1")]
#[derive(Debug, Clone, PartialEq, Eq)]
#[cfg_attr(feature = "serde", derive(serde::Deserialize))]
#[cfg_attr(feature = "datagen", derive(serde::Serialize, databake::Bake))]
#[cfg_attr(feature = "datagen", databake(path = icu_experimental::transliterate::provider))]
pub struct RuleBasedTransliterator<'a> {
Expand All @@ -40,19 +39,57 @@ pub struct RuleBasedTransliterator<'a> {
/// see, e.g., [Devanagari-Latin](https://github.com/unicode-org/cldr/blob/main/common/transforms/Devanagari-Latin.xml)
pub visibility: bool,
/// The [`VarTable`] containing any special matchers (variables, UnicodeSets, ...) used by this transliterator.
#[cfg_attr(feature = "serde", serde(borrow))]
pub variable_table: VarTable<'a>,
/// The filter for this transliterator. If there is none, the set of all code points is used.
#[cfg_attr(feature = "serde", serde(borrow))]
pub filter: CodePointInversionList<'a>,
/// The list of transform rule groups this transliterator uses.
#[cfg_attr(feature = "serde", serde(borrow))]
pub id_group_list: VarZeroVec<'a, VarZeroSlice<SimpleIdULE>>,
/// The list of conversion rule groups this transliterator uses.
#[cfg_attr(feature = "serde", serde(borrow))]
pub rule_group_list: VarZeroVec<'a, VarZeroSlice<RuleULE>>,
}

#[cfg(feature = "serde")]
impl<'de> serde::Deserialize<'de> for RuleBasedTransliterator<'de> {
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
where
D: serde::Deserializer<'de>,
{
use serde::de::Error;
#[derive(serde::Deserialize)]
pub struct Raw<'a> {
pub visibility: bool,
#[serde(borrow)]
pub variable_table: VarTable<'a>,
#[serde(borrow)]
pub filter: CodePointInversionList<'a>,
#[serde(borrow)]
pub id_group_list: VarZeroVec<'a, VarZeroSlice<SimpleIdULE>>,
#[serde(borrow)]
pub rule_group_list: VarZeroVec<'a, VarZeroSlice<RuleULE>>,
}

let Raw {
visibility,
variable_table,
filter,
id_group_list,
rule_group_list,
} = Raw::deserialize(deserializer)?;
if id_group_list.len() != rule_group_list.len() {
return Err(D::Error::custom(
"invalid data: id_group_list and rule_group_list have different lengths",
));
}
Ok(Self {
visibility,
variable_table,
filter,
id_group_list,
rule_group_list,
})
}
}

impl RuleBasedTransliterator<'_> {
/// Returns an iterator of dependencies on other transliterators.
///
Expand Down
90 changes: 31 additions & 59 deletions components/experimental/src/transliterate/transliterator/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -322,39 +322,32 @@ impl Transliterator {
+ ?Sized,
F: Fn(&Locale) -> Option<Result<Box<dyn CustomTransliterator>, DataError>>,
{
let payload = Transliterator::load_rbt(
let mut env = LiteMap::new();

let transliterator = Transliterator::load_rbt(
#[allow(clippy::unwrap_used)] // infallible
DataMarkerAttributes::try_from_str(&locale.to_string().to_ascii_lowercase()).unwrap(),
transliterator_provider,
)?;
let rbt = payload.get();
if !rbt.visibility {
// transliterator is internal
return Err(DataError::custom("internal only transliterator"));
}
let mut env = LiteMap::new();
// Avoid recursive load
env.insert(locale.to_string(), InternalTransliterator::Null);
Transliterator::load_dependencies_recursive(
rbt,
&mut env,
lookup,
transliterator_provider,
normalizer_provider,
false,
&mut env,
)?;

Ok(Transliterator {
transliterator: payload,
transliterator,
env,
})
}

fn load_dependencies_recursive<PT, PN, F>(
rbt: &RuleBasedTransliterator<'_>,
env: &mut LiteMap<String, InternalTransliterator>,
fn load_rbt<PT, PN, F>(
marker_attributes: &DataMarkerAttributes,
lookup: Option<&F>,
transliterator_provider: &PT,
normalizer_provider: &PN,
) -> Result<(), DataError>
allow_internal: bool,
env: &mut LiteMap<String, InternalTransliterator>,
) -> Result<DataPayload<TransliteratorRulesV1Marker>, DataError>
where
PT: DataProvider<TransliteratorRulesV1Marker> + ?Sized,
PN: DataProvider<CanonicalDecompositionDataV1Marker>
Expand All @@ -365,40 +358,40 @@ impl Transliterator {
+ ?Sized,
F: Fn(&Locale) -> Option<Result<Box<dyn CustomTransliterator>, DataError>>,
{
for dep in rbt.deps() {
let req = DataRequest {
id: DataIdentifierBorrowed::for_marker_attributes(marker_attributes),
..Default::default()
};
let transliterator = transliterator_provider.load(req)?.payload;
if !allow_internal && !transliterator.get().visibility {
return Err(DataError::custom("internal only transliterator"));
}
// Avoid recursive load
env.insert(marker_attributes.to_string(), InternalTransliterator::Null);
for dep in transliterator.get().deps() {
if !env.contains_key(&*dep) {
// 1. Insert a placeholder to avoid infinite recursion.
env.insert(dep.to_string(), InternalTransliterator::Null);
// 2. Load the transliterator, by checking
// Load the transliterator, by checking
let internal_t =
// a) hardcoded specials
dep.strip_prefix("x-").and_then(|special| Transliterator::load_special(special, normalizer_provider))
// b) the user-provided override
.or_else(|| Some(lookup?(&dep.parse().ok()?)?.map(InternalTransliterator::Dyn)))
// c) the data
.unwrap_or_else(|| {
let rbt = Transliterator::load_rbt(
Transliterator::load_rbt(
#[allow(clippy::unwrap_used)] // infallible
DataMarkerAttributes::try_from_str(&dep.to_ascii_lowercase()).unwrap(),
lookup,
transliterator_provider,
)?;
Ok(InternalTransliterator::RuleBased(rbt))
normalizer_provider,
true,
env,
).map(InternalTransliterator::RuleBased)
})?;
if let InternalTransliterator::RuleBased(rbt) = &internal_t {
// 3. Recursively load the dependencies of the dependency.
Self::load_dependencies_recursive(
rbt.get(),
env,
lookup,
transliterator_provider,
normalizer_provider,
)?;
}
// 4. Replace the placeholder with the loaded transliterator.
env.insert(dep.to_string(), internal_t);
}
}
Ok(())
Ok(transliterator)
}

fn load_special<P>(
Expand Down Expand Up @@ -452,27 +445,6 @@ impl Transliterator {
}
}

fn load_rbt<P>(
marker_attributes: &DataMarkerAttributes,
provider: &P,
) -> Result<DataPayload<TransliteratorRulesV1Marker>, DataError>
where
P: DataProvider<TransliteratorRulesV1Marker> + ?Sized,
{
let req = DataRequest {
id: DataIdentifierBorrowed::for_marker_attributes(marker_attributes),
..Default::default()
};
let payload = provider.load(req)?.payload;
let rbt = payload.get();
if rbt.id_group_list.len() != rbt.rule_group_list.len() {
return Err(DataError::custom(
"invalid data: id_group_list and rule_group_list have different lengths",
));
}
Ok(payload)
}

// Before stabilization, consider the input type we want to accept here. We might want to
// use a data structure internally that works nicely with a &str, but if we don't, a String
// is good to accept because the user might already have one.
Expand Down

0 comments on commit c717cb0

Please sign in to comment.