From c18bb78e57e038656a23405079d924501f1a8508 Mon Sep 17 00:00:00 2001 From: "Shane F. Carr" Date: Wed, 24 Apr 2024 10:27:05 -0700 Subject: [PATCH 01/28] Start adding without_ancestors (#4629) --- provider/datagen/src/driver.rs | 34 +++++++++++++++++++++++++++++++--- 1 file changed, 31 insertions(+), 3 deletions(-) diff --git a/provider/datagen/src/driver.rs b/provider/datagen/src/driver.rs index 9c36ced0c86..939fc2479a6 100644 --- a/provider/datagen/src/driver.rs +++ b/provider/datagen/src/driver.rs @@ -122,6 +122,26 @@ impl LocaleFamily { } } + /// The family containing all descendants of the selected locale. + /// + /// This family is primarily useful if the root locale is not desired. + /// + /// For example, the family `::without_ancestors("en-001")` contains: + /// + /// - Self: "en-001" + /// - Descendants: "en-GB", "en-ZA", ... + /// + /// but it does _not_ contain the ancestors "en" and "und". + /// + /// Stylized on the CLI as: "%en-US" + pub fn without_ancestors(langid: LanguageIdentifier) -> Self { + Self { + langid: Some(langid), + include_ancestors: false, + include_descendants: true, + } + } + /// The family containing only the selected locale. /// /// For example, the family `::single("en-001")` contains only "en-001". 
@@ -159,11 +179,14 @@ impl Writeable for LocaleFamily { sink.write_char('^')?; langid.write_to(sink) } + (Some(langid), false, true) => { + sink.write_char('%')?; + langid.write_to(sink) + } (Some(langid), false, false) => { sink.write_char('@')?; langid.write_to(sink) } - (Some(_), false, true) => unreachable!(), (None, _, _) => sink.write_str("full"), } } @@ -176,8 +199,8 @@ impl Writeable for LocaleFamily { ) { (Some(langid), true, true) => langid.writeable_length_hint(), (Some(langid), true, false) => langid.writeable_length_hint() + 1, + (Some(langid), false, true) => langid.writeable_length_hint() + 1, (Some(langid), false, false) => langid.writeable_length_hint() + 1, - (Some(_), false, true) => unreachable!(), (None, _, _) => writeable::LengthHint::exact(4), } } @@ -221,6 +244,11 @@ impl FromStr for LocaleFamily { include_ancestors: true, include_descendants: false, }), + b'%' => Ok(Self { + langid: Some(LanguageIdentifier::try_from_bytes(remainder)?), + include_ancestors: false, + include_descendants: true, + }), b'@' => Ok(Self { langid: Some(LanguageIdentifier::try_from_bytes(remainder)?), include_ancestors: false, @@ -238,7 +266,7 @@ impl FromStr for LocaleFamily { #[test] fn test_locale_family_parsing() { - let valid_families = ["und", "de-CH", "^es", "@pt-BR", "full"]; + let valid_families = ["und", "de-CH", "^es", "@pt-BR", "%en-001", "full"]; let invalid_families = ["invalid", "@invalid", "-foo", "@full", "full-001"]; for family_str in valid_families { let family = family_str.parse::<LocaleFamily>().unwrap(); From 5c5b9908a21a94bcea63ba3ff836e9511a603fc5 Mon Sep 17 00:00:00 2001 From: "Shane F. 
Carr" Date: Wed, 24 Apr 2024 17:39:03 -0700 Subject: [PATCH 02/28] Mostly working --- provider/datagen/src/driver.rs | 253 ++++++++++--------------- provider/datagen/tests/test-options.rs | 45 +++++ 2 files changed, 148 insertions(+), 150 deletions(-) diff --git a/provider/datagen/src/driver.rs b/provider/datagen/src/driver.rs index 939fc2479a6..dd751dc640b 100644 --- a/provider/datagen/src/driver.rs +++ b/provider/datagen/src/driver.rs @@ -5,6 +5,7 @@ use crate::rayon_prelude::*; use crate::FallbackMode; use displaydoc::Display; +use either::Either; use icu_locid::extensions::unicode::key; use icu_locid::LanguageIdentifier; use icu_locid::ParserError; @@ -95,7 +96,7 @@ impl LocaleFamily { /// - Descendants: "en-GB", "en-ZA", ... /// /// Stylized on the CLI as: "en-US" - pub fn with_descendants(langid: LanguageIdentifier) -> Self { + pub const fn with_descendants(langid: LanguageIdentifier) -> Self { Self { langid: Some(langid), include_ancestors: true, @@ -114,7 +115,7 @@ impl LocaleFamily { /// - Ancestors: "und", "en" /// /// Stylized on the CLI as: "^en-US" - pub fn without_descendants(langid: LanguageIdentifier) -> Self { + pub const fn without_descendants(langid: LanguageIdentifier) -> Self { Self { langid: Some(langid), include_ancestors: true, @@ -134,7 +135,7 @@ impl LocaleFamily { /// but it does _not_ contain the ancestors "en" and "und". /// /// Stylized on the CLI as: "%en-US" - pub fn without_ancestors(langid: LanguageIdentifier) -> Self { + pub const fn without_ancestors(langid: LanguageIdentifier) -> Self { Self { langid: Some(langid), include_ancestors: false, @@ -147,7 +148,7 @@ impl LocaleFamily { /// For example, the family `::single("en-001")` contains only "en-001". 
/// /// Stylized on the CLI as: "@en-US" - pub fn single(langid: LanguageIdentifier) -> Self { + pub const fn single(langid: LanguageIdentifier) -> Self { Self { langid: Some(langid), include_ancestors: false, @@ -158,7 +159,7 @@ impl LocaleFamily { /// The family containing all locales. /// /// Stylized on the CLI as: "full" - pub fn full() -> Self { + pub const fn full() -> Self { Self { langid: None, include_ancestors: false, @@ -305,6 +306,20 @@ enum LocalesWithOrWithoutFallback { }, } +impl LocalesWithOrWithoutFallback { + fn langid_families(&self) -> impl Iterator)> + '_ { + match self { + Self::WithFallback { locales, .. } => { + Either::Left(locales.iter().filter_map(|family| family.langid.as_ref().map(|langid| { + (langid, Some(family)) + } + ))) + } + Self::WithoutFallback { locales } => Either::Right(locales.iter().map(|langid| (langid, None))), + } + } +} + /// Configuration for a data export operation. /// /// Note that this only configures *which data* is exported. The input provider, usually @@ -856,65 +871,6 @@ impl DatagenDriver { } } -struct ExplicitImplicitLocaleSets { - explicit: HashSet, - implicit: HashSet, -} - -/// Resolves the set of explicit langids and the supported locales into two sets of locales: -/// -/// - `explicit` contains the explicit langids but with aux keys and extension keywords included. -/// For example, if `ar-SA` is requested (explicit langid), and `ar` and `ar-u-nu-latn` are supported, -/// then `ar-SA` and `ar-SA-u-nu-latn` will be returned as `explicit`. -/// - `implcit` contains all supported locales reachable by fallback from an `explicit` locale. -/// These locales can be included without increasing data payload size. 
-fn make_explicit_implicit_sets( - key: DataKey, - explicit_langids: &mut dyn Iterator, - supported_map: &HashMap>, - fallbacker: &Lazy< - Result, - impl FnOnce() -> Result, - >, -) -> Result { - let mut implicit = HashSet::new(); - let mut explicit: HashSet = Default::default(); - for (explicit_langid, include_ancestors) in explicit_langids { - explicit.insert(explicit_langid.into()); - if let Some(locales) = supported_map.get(explicit_langid) { - explicit.extend(locales.iter().cloned()); // adds ar-EG-u-nu-latn - } - if explicit_langid == &LanguageIdentifier::UND { - continue; - } - let fallbacker = fallbacker.as_ref().map_err(|e| *e)?; - let fallbacker_with_config = fallbacker.for_config(key.fallback_config()); - let mut iter = fallbacker_with_config.fallback_for(explicit_langid.into()); - loop { - if include_ancestors { - implicit.insert(iter.get().clone()); - } - if iter.get().is_und() { - break; - } - // Inherit aux keys and extension keywords from parent locales - let iter_langid = iter.get().get_langid(); - if let Some(locales) = supported_map.get(&iter_langid) { - if include_ancestors { - implicit.extend(locales.iter().cloned()); // adds ar-u-nu-latn - } - for locale in locales { - let mut morphed_locale = locale.clone(); - morphed_locale.set_langid(explicit_langid.clone()); - explicit.insert(morphed_locale); // adds ar-SA-u-nu-latn - } - } - iter.step(); - } - } - Ok(ExplicitImplicitLocaleSets { explicit, implicit }) -} - /// Selects the maximal set of locales to export based on a [`DataKey`] and this datagen /// provider's options bag. The locales may be later optionally deduplicated for fallback. fn select_locales_for_key( @@ -930,41 +886,47 @@ fn select_locales_for_key( ) -> Result, DataError> { // A map from langid to data locales. Keys that have aux keys or extension keywords // may have multiple data locales per langid. 
- let mut supported_map: HashMap> = Default::default(); + #[derive(Default)] + struct LocalesMapValue<'a> { + family: Option<&'a LocaleFamily>, + is_selected: bool, + data_locales: HashSet + } + let mut locales_map: HashMap = Default::default(); for locale in provider .supported_locales_for_key(key) .map_err(|e| e.with_key(key))? { use std::collections::hash_map::Entry; - match supported_map.entry(locale.get_langid()) { - Entry::Occupied(mut entry) => entry.get_mut().insert(locale), - Entry::Vacant(entry) => entry.insert(Default::default()).insert(locale), + match locales_map.entry(locale.get_langid()) { + Entry::Occupied(mut entry) => entry.get_mut().data_locales.insert(locale), + Entry::Vacant(entry) => entry.insert(Default::default()).data_locales.insert(locale), }; } if key.path().get().starts_with("segmenter/dictionary/") { - supported_map.retain(|_, locales| { - locales.retain(|locale| { + locales_map.retain(|_, value| { + value.data_locales.retain(|locale| { let model = crate::dictionary_data_locale_to_model_name(locale); segmenter_models.iter().any(|m| Some(m.as_ref()) == model) }); - !locales.is_empty() + !value.data_locales.is_empty() }); // Don't perform additional locale filtering - return Ok(supported_map.into_values().flatten().collect()); + return Ok(locales_map.into_values().map(|value| value.data_locales).flatten().collect()); } else if key.path().get().starts_with("segmenter/lstm/") { - supported_map.retain(|_, locales| { - locales.retain(|locale| { + locales_map.retain(|_, value| { + value.data_locales.retain(|locale| { let model = crate::lstm_data_locale_to_model_name(locale); segmenter_models.iter().any(|m| Some(m.as_ref()) == model) }); - !locales.is_empty() + !value.data_locales.is_empty() }); // Don't perform additional locale filtering - return Ok(supported_map.into_values().flatten().collect()); + return Ok(locales_map.into_values().map(|value| value.data_locales).flatten().collect()); } else if key.path().get().starts_with("collator/") { 
- supported_map.retain(|_, locales| { - locales.retain(|locale| { + locales_map.retain(|_, value| { + value.data_locales.retain(|locale| { let Some(collation) = locale .get_unicode_ext(&key!("co")) .and_then(|co| co.as_single_subtag().copied()) @@ -978,88 +940,79 @@ fn select_locales_for_key( !["big5han", "gb2312"].contains(&collation.as_str()) } }); - !locales.is_empty() + !value.data_locales.is_empty() }); } - let locale_families = match locales_fallback { - // `FallbackMode::Preresolved` exports all supported locales whose langid matches - // one of the explicit locales. This ensures extensions are included. In addition, any - // explicit locales are added to the list, even if they themselves don't contain data; - // fallback should be performed upon exporting. - LocalesWithOrWithoutFallback::WithoutFallback { locales, .. } => { - let mut it = locales.iter().map(|langid| (langid, false)); - let ExplicitImplicitLocaleSets { explicit, .. } = - make_explicit_implicit_sets(key, &mut it, &supported_map, fallbacker)?; - return Ok(explicit); + // Add the explicit langids to the map + for (langid, maybe_family) in locales_fallback.langid_families() { + let value = locales_map + .entry(langid.clone()) + .or_default(); + value.is_selected = true; + if *langid != LanguageIdentifier::UND { + value.family = maybe_family; } - // All other modes resolve to fallback-aware inclusion. - LocalesWithOrWithoutFallback::WithFallback { locales, .. } => locales, - }; - - let mut it = locale_families.iter().filter_map(|x| { - x.langid - .as_ref() - .map(|langid| (langid, x.include_ancestors)) - }); - let ExplicitImplicitLocaleSets { explicit, implicit } = - make_explicit_implicit_sets(key, &mut it, &supported_map, fallbacker)?; - - let supported_and_explicit = supported_map - .into_values() - .flatten() - .chain(explicit.iter().cloned()); - - // Need to check this ahead of time because we can avoid loading the fallbacker. 
- if locale_families.contains(&LocaleFamily::full()) { - return Ok(supported_and_explicit.collect()); } - let fallbacker = fallbacker.as_ref().map_err(|e| *e)?; - let fallbacker_with_config = fallbacker.for_config(key.fallback_config()); - - // TODO(#4629): Allow the exclusion of und - let include_und = true; + let locale_family_everything = LocaleFamily::with_descendants(LanguageIdentifier::UND); + if let LocalesWithOrWithoutFallback::WithFallback { locales, .. } = locales_fallback { + let value = locales_map.entry(LanguageIdentifier::UND).or_default(); + if locales.is_empty() { + // If no locales are selected but fallback is enabled, select the root locale + value.is_selected = true; + } + if locales.contains(&LocaleFamily::full()) { + // Include all locales by including all descendants of the root locale + value.family = Some(&locale_family_everything); + value.is_selected = true; + } + } - let result = supported_and_explicit - .filter(|locale_orig| { - let mut locale = locale_orig.clone(); - locale.remove_aux(); - if implicit.contains(&locale) { - return true; - } - if explicit.contains(&locale) { - return true; - } - if locale.is_langid_und() && include_und { - return true; - } - if locale.language().is_empty() - && matches!( - key.fallback_config().priority, - icu_provider::FallbackPriority::Region - ) - { - return true; - } - // Special case: skeletons *require* the -u-ca keyword, so don't export locales that don't have it - // This would get caught later on, but it makes datagen faster and quieter to catch it here - if key.path().get() == "datetime/skeletons@1" && !locale.has_unicode_ext() { - return false; - } - let mut iter = fallbacker_with_config.fallback_for(locale); - while !iter.get().is_und() { - if explicit.contains(iter.get()) { - return true; + // Fill in missing extensions and aux keys from parent locales, + // and calculate which langids are ancestors and descendants. 
+ for current_langid in locales_map.keys().cloned().collect::<Vec<_>>() { + if current_langid == LanguageIdentifier::UND { + continue; + } + let current_value = locales_map.get(&current_langid).unwrap(); + let include_ancestors = current_value.family.map(|family| family.include_ancestors).unwrap_or(false); + let fallbacker = fallbacker.as_ref().map_err(|e| *e)?; + let fallbacker_with_config = fallbacker.for_config(key.fallback_config()); + let mut iter = fallbacker_with_config.fallback_for((&current_langid).into()); + loop { + // Inherit aux keys and extension keywords from parent locales + let parent_langid: LanguageIdentifier = iter.get().get_langid(); + if let Some(parent_value) = locales_map.get_mut(&parent_langid) { + if include_ancestors && !parent_value.is_selected { + log::trace!("Including {parent_langid}: ancestor of {current_langid}"); + parent_value.is_selected = true; + } + let include_descendants = parent_value.family.map(|family| family.include_descendants).unwrap_or(false); + let parent_locales = parent_value.data_locales.clone(); + let current_value = locales_map.get_mut(&current_langid).unwrap(); + if include_descendants && !current_value.is_selected { + log::trace!("Including {current_langid}: descendant of {parent_langid}"); + current_value.is_selected = true; + } + for mut morphed_locale in parent_locales { + morphed_locale.set_langid(current_langid.clone()); + current_value.data_locales.insert(morphed_locale); } - iter.step(); } - log::trace!("Filtered out: {key}/{locale_orig}"); // this will print aux keys too but it avoids a clone - false - }) - .collect(); + if iter.get().is_und() { + break; + } + iter.step(); + } + } - Ok(result) + let selected_locales = locales_map + .into_iter() + .filter(|(_, value)| value.is_selected) + .flat_map(|(_, value)| value.data_locales) + .collect(); + return Ok(selected_locales); } fn deduplicate_payloads( diff --git a/provider/datagen/tests/test-options.rs b/provider/datagen/tests/test-options.rs index fd711dc648b..e6abe2e771b 
100644 --- a/provider/datagen/tests/test-options.rs +++ b/provider/datagen/tests/test-options.rs @@ -653,6 +653,51 @@ fn explicit_preresolved() { assert_eq!(exported.keys().collect::>(), locales); } +#[test] +fn explicit_hybrid_without_descendants() { + const SELECTED_LOCALES: [LocaleFamily; 7] = [ + LocaleFamily::without_descendants(langid!("arc")), // Aramaic, not in supported list + LocaleFamily::without_descendants(langid!("ar-EG")), + LocaleFamily::without_descendants(langid!("ar-SA")), + LocaleFamily::without_descendants(langid!("en-GB")), + LocaleFamily::without_descendants(langid!("es")), + LocaleFamily::without_descendants(langid!("sr-ME")), + LocaleFamily::without_descendants(langid!("ru-Cyrl-RU")), + ]; + let exported = export_to_map_1_5( + DatagenDriver::new() + .with_keys([HelloWorldV1Marker::KEY]) + .with_locales_and_fallback(SELECTED_LOCALES, Default::default()), + &TestingProvider::with_decimal_symbol_like_data(), + ); + + // Explicit locales are "arc", "ar-EG", "ar-SA", "en-GB", "es", "sr-ME", "ru-Cyrl-RU" + let locales = [ + "ar", // ancestor of ar-EG + "ar-EG", // explicit locale + "ar-EG-u-nu-latn", // explicit with extensions + "ar-SA", // explicit locale, inheriting from ar + "ar-SA-u-nu-latn", // extensions should be included (#4533) + "ar-u-nu-latn", // extensions should be included (#4533) + "arc", // Aramaic, inheriting from und + "en", // ancestor of en-GB + "en-001", // ancestor of en-GB + "en-GB", // explicit locale not in supported locales + // "en-ZA", // not reachable + "es", // explicit and supported + // "es-AR", // excluded: descendant of es + "ru", // ancestor of ru-Cyrl-RU + "ru-Cyrl-RU", // explicit locale, even though it is not normalized + // "sr", // not reachable from sr-ME + "sr-Latn", // ancestor of sr-ME + "sr-ME", // explicit locale not in supported locales + "und", // ancestor of everything + ]; + + // Should return the exact explicit locales set. 
+ assert_eq!(exported.keys().collect::>(), locales); +} + #[test] fn explicit_runtime_und() { let exported = export_to_map( From 8cb704cca66e11c23e8eea6f27667f118fe21245 Mon Sep 17 00:00:00 2001 From: "Shane F. Carr" Date: Wed, 24 Apr 2024 17:42:33 -0700 Subject: [PATCH 03/28] Add test for without_ancestors --- provider/datagen/tests/test-options.rs | 45 ++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/provider/datagen/tests/test-options.rs b/provider/datagen/tests/test-options.rs index e6abe2e771b..78fa43932aa 100644 --- a/provider/datagen/tests/test-options.rs +++ b/provider/datagen/tests/test-options.rs @@ -698,6 +698,51 @@ fn explicit_hybrid_without_descendants() { assert_eq!(exported.keys().collect::>(), locales); } +#[test] +fn explicit_hybrid_without_ancestors() { + const SELECTED_LOCALES: [LocaleFamily; 7] = [ + LocaleFamily::without_ancestors(langid!("arc")), // Aramaic, not in supported list + LocaleFamily::without_ancestors(langid!("ar-EG")), + LocaleFamily::without_ancestors(langid!("ar-SA")), + LocaleFamily::without_ancestors(langid!("en-GB")), + LocaleFamily::without_ancestors(langid!("es")), + LocaleFamily::without_ancestors(langid!("sr-ME")), + LocaleFamily::without_ancestors(langid!("ru-Cyrl-RU")), + ]; + let exported = export_to_map_1_5( + DatagenDriver::new() + .with_keys([HelloWorldV1Marker::KEY]) + .with_locales_and_fallback(SELECTED_LOCALES, Default::default()), + &TestingProvider::with_decimal_symbol_like_data(), + ); + + // Explicit locales are "arc", "ar-EG", "ar-SA", "en-GB", "es", "sr-ME", "ru-Cyrl-RU" + let locales = [ + // "ar", // excluded: ancestor of ar-EG + "ar-EG", // explicit locale + "ar-EG-u-nu-latn", // explicit with extensions + "ar-SA", // explicit locale, inheriting from ar + "ar-SA-u-nu-latn", // extensions should be included (#4533) + // "ar-u-nu-latn", // excluded: ancestor of ar-EG + "arc", // Aramaic, inheriting from und + // "en", // excluded: ancestor of en-GB + // "en-001", // excluded: 
ancestor of en-GB + "en-GB", // explicit locale not in supported locales + // "en-ZA", // not reachable + "es", // explicit and supported + "es-AR", // descendant of es + // "ru", // excluded: ancestor of ru-Cyrl-RU + "ru-Cyrl-RU", // explicit locale, even though it is not normalized + // "sr", // not reachable from sr-ME + // "sr-Latn", // excluded: ancestor of sr-ME + "sr-ME", // explicit locale not in supported locales + // "und", // excluded: ancestor of everything + ]; + + // Should return the exact explicit locales set. + assert_eq!(exported.keys().collect::>(), locales); +} + #[test] fn explicit_runtime_und() { let exported = export_to_map( From 1c99e75db64fc257b5b72a78ff9adc611084e1ee Mon Sep 17 00:00:00 2001 From: "Shane F. Carr" Date: Wed, 24 Apr 2024 17:42:37 -0700 Subject: [PATCH 04/28] fmt --- provider/datagen/src/driver.rs | 43 +++++++++++++++++++++++----------- 1 file changed, 29 insertions(+), 14 deletions(-) diff --git a/provider/datagen/src/driver.rs b/provider/datagen/src/driver.rs index dd751dc640b..4e136b3cb6f 100644 --- a/provider/datagen/src/driver.rs +++ b/provider/datagen/src/driver.rs @@ -307,15 +307,18 @@ enum LocalesWithOrWithoutFallback { } impl LocalesWithOrWithoutFallback { - fn langid_families(&self) -> impl Iterator)> + '_ { + fn langid_families( + &self, + ) -> impl Iterator)> + '_ { match self { Self::WithFallback { locales, .. 
} => { - Either::Left(locales.iter().filter_map(|family| family.langid.as_ref().map(|langid| { - (langid, Some(family)) - } - ))) + Either::Left(locales.iter().filter_map(|family| { + family.langid.as_ref().map(|langid| (langid, Some(family))) + })) + } + Self::WithoutFallback { locales } => { + Either::Right(locales.iter().map(|langid| (langid, None))) } - Self::WithoutFallback { locales } => Either::Right(locales.iter().map(|langid| (langid, None))), } } } @@ -890,7 +893,7 @@ fn select_locales_for_key( struct LocalesMapValue<'a> { family: Option<&'a LocaleFamily>, is_selected: bool, - data_locales: HashSet + data_locales: HashSet, } let mut locales_map: HashMap = Default::default(); for locale in provider @@ -913,7 +916,11 @@ fn select_locales_for_key( !value.data_locales.is_empty() }); // Don't perform additional locale filtering - return Ok(locales_map.into_values().map(|value| value.data_locales).flatten().collect()); + return Ok(locales_map + .into_values() + .map(|value| value.data_locales) + .flatten() + .collect()); } else if key.path().get().starts_with("segmenter/lstm/") { locales_map.retain(|_, value| { value.data_locales.retain(|locale| { @@ -923,7 +930,11 @@ fn select_locales_for_key( !value.data_locales.is_empty() }); // Don't perform additional locale filtering - return Ok(locales_map.into_values().map(|value| value.data_locales).flatten().collect()); + return Ok(locales_map + .into_values() + .map(|value| value.data_locales) + .flatten() + .collect()); } else if key.path().get().starts_with("collator/") { locales_map.retain(|_, value| { value.data_locales.retain(|locale| { @@ -946,9 +957,7 @@ fn select_locales_for_key( // Add the explicit langids to the map for (langid, maybe_family) in locales_fallback.langid_families() { - let value = locales_map - .entry(langid.clone()) - .or_default(); + let value = locales_map.entry(langid.clone()).or_default(); value.is_selected = true; if *langid != LanguageIdentifier::UND { value.family = maybe_family; @@ 
-976,7 +985,10 @@ fn select_locales_for_key( continue; } let current_value = locales_map.get(&current_langid).unwrap(); - let include_ancestors = current_value.family.map(|family| family.include_ancestors).unwrap_or(false); + let include_ancestors = current_value + .family + .map(|family| family.include_ancestors) + .unwrap_or(false); let fallbacker = fallbacker.as_ref().map_err(|e| *e)?; let fallbacker_with_config = fallbacker.for_config(key.fallback_config()); let mut iter = fallbacker_with_config.fallback_for((&current_langid).into()); @@ -988,7 +1000,10 @@ fn select_locales_for_key( log::trace!("Including {parent_langid}: ancestor of {current_langid}"); parent_value.is_selected = true; } - let include_descendants = parent_value.family.map(|family| family.include_descendants).unwrap_or(false); + let include_descendants = parent_value + .family + .map(|family| family.include_descendants) + .unwrap_or(false); let parent_locales = parent_value.data_locales.clone(); let current_value = locales_map.get_mut(&current_langid).unwrap(); if include_descendants && !current_value.is_selected { From c0d9149b063043f2096b7b8c855fc36df4ba6254 Mon Sep 17 00:00:00 2001 From: "Shane F. 
Carr" Date: Wed, 24 Apr 2024 17:49:46 -0700 Subject: [PATCH 05/28] clippy --- provider/datagen/src/driver.rs | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/provider/datagen/src/driver.rs b/provider/datagen/src/driver.rs index 4e136b3cb6f..4991fcb487b 100644 --- a/provider/datagen/src/driver.rs +++ b/provider/datagen/src/driver.rs @@ -918,8 +918,7 @@ fn select_locales_for_key( // Don't perform additional locale filtering return Ok(locales_map .into_values() - .map(|value| value.data_locales) - .flatten() + .flat_map(|value| value.data_locales) .collect()); } else if key.path().get().starts_with("segmenter/lstm/") { locales_map.retain(|_, value| { @@ -932,8 +931,7 @@ fn select_locales_for_key( // Don't perform additional locale filtering return Ok(locales_map .into_values() - .map(|value| value.data_locales) - .flatten() + .flat_map(|value| value.data_locales) .collect()); } else if key.path().get().starts_with("collator/") { locales_map.retain(|_, value| { @@ -1027,7 +1025,7 @@ fn select_locales_for_key( .filter(|(_, value)| value.is_selected) .flat_map(|(_, value)| value.data_locales) .collect(); - return Ok(selected_locales); + Ok(selected_locales) } fn deduplicate_payloads( From 9e760711464f839d2c0fa9544b6caf08901cb007 Mon Sep 17 00:00:00 2001 From: "Shane F. Carr" Date: Wed, 24 Apr 2024 18:03:38 -0700 Subject: [PATCH 06/28] Add collation special case for backwards compatibility --- provider/datagen/src/driver.rs | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/provider/datagen/src/driver.rs b/provider/datagen/src/driver.rs index 4991fcb487b..4acb6f6b8b4 100644 --- a/provider/datagen/src/driver.rs +++ b/provider/datagen/src/driver.rs @@ -1009,6 +1009,14 @@ fn select_locales_for_key( current_value.is_selected = true; } for mut morphed_locale in parent_locales { + // Special case: don't pull -u-co up from the root. + // TODO(#1964): Consider changing this behavior. 
+ if parent_langid == LanguageIdentifier::UND + && morphed_locale + .contains_unicode_ext(&icu_locid::extensions::unicode::key!("co")) + { + continue; + } morphed_locale.set_langid(current_langid.clone()); current_value.data_locales.insert(morphed_locale); } From 78c923bc6519fed4a801c1c91ff163c2bc60e377 Mon Sep 17 00:00:00 2001 From: "Shane F. Carr" Date: Wed, 24 Apr 2024 18:11:59 -0700 Subject: [PATCH 07/28] Special case for --locales full; fixes --- provider/blob/src/export/mod.rs | 2 +- provider/blob/tests/test_versions.rs | 2 +- provider/datagen/Cargo.toml | 3 +-- provider/datagen/src/driver.rs | 8 ++++++++ 4 files changed, 11 insertions(+), 4 deletions(-) diff --git a/provider/blob/src/export/mod.rs b/provider/blob/src/export/mod.rs index ad4886617bc..4292b0c50b7 100644 --- a/provider/blob/src/export/mod.rs +++ b/provider/blob/src/export/mod.rs @@ -23,7 +23,7 @@ //! DatagenDriver::new() //! .with_keys([icu_provider::hello_world::HelloWorldV1Marker::KEY]) //! .with_all_locales() -//! .export(&DatagenProvider::new_latest_tested(), exporter) +//! .export(&icu_provider::hello_world::HelloWorldProvider, exporter) //! .unwrap(); //! //! // communicate the blob to the client application (network, disk, etc.) 
diff --git a/provider/blob/tests/test_versions.rs b/provider/blob/tests/test_versions.rs index f254638b74b..427c7e00c41 100644 --- a/provider/blob/tests/test_versions.rs +++ b/provider/blob/tests/test_versions.rs @@ -16,7 +16,7 @@ fn run_driver(exporter: BlobExporter) -> Result<(), DataError> { DatagenDriver::new() .with_keys([icu_provider::hello_world::HelloWorldV1Marker::KEY]) .with_locales_and_fallback([LocaleFamily::full()], Default::default()) - .export(&DatagenProvider::new_custom(), exporter) + .export(&icu_provider::hello_world::HelloWorldProvider, exporter) } fn check_hello_world(blob_provider: impl DataProvider) { diff --git a/provider/datagen/Cargo.toml b/provider/datagen/Cargo.toml index 7a6e257873d..db5aca3d9e0 100644 --- a/provider/datagen/Cargo.toml +++ b/provider/datagen/Cargo.toml @@ -35,6 +35,7 @@ all-features = true # DatagenDriver displaydoc = { workspace = true } +either = { workspace = true } icu_locid = { workspace = true, features = ["std"] } icu_provider = { workspace = true, features = ["std", "logging", "datagen", "experimental"]} log = { workspace = true } @@ -79,7 +80,6 @@ zerotrie = { workspace = true, features = ["alloc"], optional = true } zerovec = { workspace = true, features = ["serde", "yoke"], optional = true } ## External dependencies -either = { workspace = true, optional = true } elsa = { workspace = true, optional = true } itertools = { workspace = true, features = ["use_alloc"], optional = true } ndarray = { workspace = true, optional = true } @@ -132,7 +132,6 @@ provider = [ "dep:tinystr", "dep:zerotrie", "dep:zerovec", - "dep:either", "dep:elsa", "dep:itertools", "dep:ndarray", diff --git a/provider/datagen/src/driver.rs b/provider/datagen/src/driver.rs index 4acb6f6b8b4..e710e09544b 100644 --- a/provider/datagen/src/driver.rs +++ b/provider/datagen/src/driver.rs @@ -973,6 +973,14 @@ fn select_locales_for_key( // Include all locales by including all descendants of the root locale value.family = 
Some(&locale_family_everything); value.is_selected = true; + if locales.len() == 1 { + // Special case: return now so we don't need the fallbacker + let selected_locales = locales_map + .into_iter() + .flat_map(|(_, value)| value.data_locales) + .collect(); + return Ok(selected_locales); + } } } From a36b095a6df984aecc94afd9dcda17d87e050243 Mon Sep 17 00:00:00 2001 From: "Shane F. Carr" Date: Wed, 24 Apr 2024 18:20:52 -0700 Subject: [PATCH 08/28] Generalize special case (also impacts transliterator) --- provider/datagen/src/driver.rs | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/provider/datagen/src/driver.rs b/provider/datagen/src/driver.rs index e710e09544b..072bafb0fc2 100644 --- a/provider/datagen/src/driver.rs +++ b/provider/datagen/src/driver.rs @@ -1017,12 +1017,8 @@ fn select_locales_for_key( current_value.is_selected = true; } for mut morphed_locale in parent_locales { - // Special case: don't pull -u-co up from the root. - // TODO(#1964): Consider changing this behavior. - if parent_langid == LanguageIdentifier::UND - && morphed_locale - .contains_unicode_ext(&icu_locid::extensions::unicode::key!("co")) - { + // Special case: don't pull extensions or aux keys up from the root. + if morphed_locale.is_langid_und() && !morphed_locale.is_empty() { continue; } morphed_locale.set_langid(current_langid.clone()); From 4b2362e6a0bf301ee8b10e33d449da99b967bbbd Mon Sep 17 00:00:00 2001 From: "Shane F. 
Carr" Date: Wed, 24 Apr 2024 18:26:23 -0700 Subject: [PATCH 09/28] Return to the old way of handling --locales full: needs to work with region fallback --- provider/datagen/src/driver.rs | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/provider/datagen/src/driver.rs b/provider/datagen/src/driver.rs index 072bafb0fc2..52379ebaa3b 100644 --- a/provider/datagen/src/driver.rs +++ b/provider/datagen/src/driver.rs @@ -962,17 +962,14 @@ fn select_locales_for_key( } } - let locale_family_everything = LocaleFamily::with_descendants(LanguageIdentifier::UND); + let mut include_full = false; if let LocalesWithOrWithoutFallback::WithFallback { locales, .. } = locales_fallback { - let value = locales_map.entry(LanguageIdentifier::UND).or_default(); if locales.is_empty() { // If no locales are selected but fallback is enabled, select the root locale + let value = locales_map.entry(LanguageIdentifier::UND).or_default(); value.is_selected = true; } if locales.contains(&LocaleFamily::full()) { - // Include all locales by including all descendants of the root locale - value.family = Some(&locale_family_everything); - value.is_selected = true; if locales.len() == 1 { // Special case: return now so we don't need the fallbacker let selected_locales = locales_map @@ -981,6 +978,7 @@ fn select_locales_for_key( .collect(); return Ok(selected_locales); } + include_full = true; } } @@ -990,7 +988,11 @@ fn select_locales_for_key( if current_langid == LanguageIdentifier::UND { continue; } - let current_value = locales_map.get(&current_langid).unwrap(); + let current_value = locales_map.get_mut(&current_langid).unwrap(); + if include_full && !current_value.is_selected { + log::trace!("Including {current_langid}: the full locale family is present"); + current_value.is_selected = true; + } let include_ancestors = current_value .family .map(|family| family.include_ancestors) From fc27d6cf0f844e51d623a4ee3b73a9032671e1fc Mon Sep 17 00:00:00 2001 From: "Shane F. 
Carr" Date: Wed, 24 Apr 2024 18:48:01 -0700 Subject: [PATCH 10/28] Always include variants of und for region fallback --- provider/datagen/src/driver.rs | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/provider/datagen/src/driver.rs b/provider/datagen/src/driver.rs index 52379ebaa3b..8300cc047f5 100644 --- a/provider/datagen/src/driver.rs +++ b/provider/datagen/src/driver.rs @@ -985,14 +985,19 @@ fn select_locales_for_key( // Fill in missing extensions and aux keys from parent locales, // and calculate which langids are ancestors and descendants. for current_langid in locales_map.keys().cloned().collect::<Vec<_>>() { - if current_langid == LanguageIdentifier::UND { - continue; - } let current_value = locales_map.get_mut(&current_langid).unwrap(); if include_full && !current_value.is_selected { log::trace!("Including {current_langid}: the full locale family is present"); current_value.is_selected = true; } + if current_langid.language.is_empty() { + if current_langid != LanguageIdentifier::UND { + log::trace!("Including {current_langid}: variants of und are always included"); + current_value.is_selected = true; + } + // Nothing left to be done here + continue; + } let include_ancestors = current_value .family .map(|family| family.include_ancestors) From ad6f135c0cb3ed7192c65304f3b6ef9e5752567b Mon Sep 17 00:00:00 2001 From: "Shane F. 
Carr" Date: Wed, 24 Apr 2024 18:56:37 -0700 Subject: [PATCH 11/28] Inline langid_families --- provider/datagen/src/driver.rs | 85 +++++++++++++++------------------- 1 file changed, 38 insertions(+), 47 deletions(-) diff --git a/provider/datagen/src/driver.rs b/provider/datagen/src/driver.rs index 8300cc047f5..5ccefec90c7 100644 --- a/provider/datagen/src/driver.rs +++ b/provider/datagen/src/driver.rs @@ -5,7 +5,6 @@ use crate::rayon_prelude::*; use crate::FallbackMode; use displaydoc::Display; -use either::Either; use icu_locid::extensions::unicode::key; use icu_locid::LanguageIdentifier; use icu_locid::ParserError; @@ -298,31 +297,14 @@ pub struct FallbackOptions { #[derive(Debug, Clone)] enum LocalesWithOrWithoutFallback { WithFallback { - locales: HashSet, + families: HashSet, options: FallbackOptions, }, WithoutFallback { - locales: HashSet, + langids: HashSet, }, } -impl LocalesWithOrWithoutFallback { - fn langid_families( - &self, - ) -> impl Iterator)> + '_ { - match self { - Self::WithFallback { locales, .. } => { - Either::Left(locales.iter().filter_map(|family| { - family.langid.as_ref().map(|langid| (langid, Some(family))) - })) - } - Self::WithoutFallback { locales } => { - Either::Right(locales.iter().map(|langid| (langid, None))) - } - } - } -} - /// Configuration for a data export operation. /// /// Note that this only configures *which data* is exported. 
The input provider, usually @@ -427,7 +409,7 @@ impl DatagenDriver { ) -> Self { Self { locales_fallback: Some(LocalesWithOrWithoutFallback::WithoutFallback { - locales: locales.into_iter().collect(), + langids: locales.into_iter().collect(), }), ..self } @@ -447,7 +429,7 @@ impl DatagenDriver { ) -> Self { Self { locales_fallback: Some(LocalesWithOrWithoutFallback::WithFallback { - locales: locales.into_iter().collect(), + families: locales.into_iter().collect(), options, }), ..self @@ -582,7 +564,7 @@ impl DatagenDriver { // 1.4 API (_, Some(legacy_locales), FallbackMode::PreferredForExporter) => { LocalesWithOrWithoutFallback::WithFallback { - locales: map_legacy_locales_to_locales_with_expansion(legacy_locales), + families: map_legacy_locales_to_locales_with_expansion(legacy_locales), options: FallbackOptions { runtime_fallback_location: None, deduplication_strategy: None, @@ -591,7 +573,7 @@ impl DatagenDriver { } (_, Some(legacy_locales), FallbackMode::Runtime) => { LocalesWithOrWithoutFallback::WithFallback { - locales: map_legacy_locales_to_locales_with_expansion(legacy_locales), + families: map_legacy_locales_to_locales_with_expansion(legacy_locales), options: FallbackOptions { runtime_fallback_location: Some(RuntimeFallbackLocation::Internal), deduplication_strategy: Some(DeduplicationStrategy::Maximal), @@ -600,7 +582,7 @@ impl DatagenDriver { } (_, Some(legacy_locales), FallbackMode::RuntimeManual) => { LocalesWithOrWithoutFallback::WithFallback { - locales: map_legacy_locales_to_locales_with_expansion(legacy_locales), + families: map_legacy_locales_to_locales_with_expansion(legacy_locales), options: FallbackOptions { runtime_fallback_location: Some(RuntimeFallbackLocation::External), deduplication_strategy: Some(DeduplicationStrategy::Maximal), @@ -609,7 +591,7 @@ impl DatagenDriver { } (_, Some(Some(locales)), FallbackMode::Preresolved) => { LocalesWithOrWithoutFallback::WithoutFallback { - locales: locales.into_iter().collect(), + langids: 
locales.into_iter().collect(), } } (_, Some(None), FallbackMode::Preresolved) => { @@ -619,7 +601,7 @@ impl DatagenDriver { } (_, Some(legacy_locales), FallbackMode::Hybrid) => { LocalesWithOrWithoutFallback::WithFallback { - locales: map_legacy_locales_to_locales_with_expansion(legacy_locales), + families: map_legacy_locales_to_locales_with_expansion(legacy_locales), options: FallbackOptions { runtime_fallback_location: Some(RuntimeFallbackLocation::External), deduplication_strategy: Some(DeduplicationStrategy::None), @@ -639,7 +621,7 @@ impl DatagenDriver { } let (uses_internal_fallback, deduplication_strategy) = match &locales_fallback { - LocalesWithOrWithoutFallback::WithoutFallback { locales } => { + LocalesWithOrWithoutFallback::WithoutFallback { langids: locales } => { let mut sorted_locales = locales.iter().map(ToString::to_string).collect::>(); sorted_locales.sort(); @@ -649,7 +631,7 @@ impl DatagenDriver { ); (false, DeduplicationStrategy::None) } - LocalesWithOrWithoutFallback::WithFallback { options, locales } => { + LocalesWithOrWithoutFallback::WithFallback { options, families: locales } => { let uses_internal_fallback = match options.runtime_fallback_location { None => sink.supports_built_in_fallback(), Some(RuntimeFallbackLocation::Internal) => true, @@ -954,23 +936,27 @@ fn select_locales_for_key( } // Add the explicit langids to the map - for (langid, maybe_family) in locales_fallback.langid_families() { - let value = locales_map.entry(langid.clone()).or_default(); - value.is_selected = true; - if *langid != LanguageIdentifier::UND { - value.family = maybe_family; - } - } - let mut include_full = false; - if let LocalesWithOrWithoutFallback::WithFallback { locales, .. 
} = locales_fallback { - if locales.is_empty() { - // If no locales are selected but fallback is enabled, select the root locale - let value = locales_map.entry(LanguageIdentifier::UND).or_default(); - value.is_selected = true; - } - if locales.contains(&LocaleFamily::full()) { - if locales.len() == 1 { + match locales_fallback { + LocalesWithOrWithoutFallback::WithFallback { families, .. } => { + if families.is_empty() { + // If no locales are selected but fallback is enabled, select the root locale + let value = locales_map.entry(LanguageIdentifier::UND).or_default(); + value.is_selected = true; + } + for family in families { + let Some(langid) = family.langid.as_ref() else { + debug_assert_eq!(*family, LocaleFamily::full()); + include_full = true; + continue; + }; + let value = locales_map.entry(langid.clone()).or_default(); + value.is_selected = true; + if *langid != LanguageIdentifier::UND { + value.family = Some(family); + } + } + if include_full && families.len() == 1 { // Special case: return now so we don't need the fallbacker let selected_locales = locales_map .into_iter() @@ -978,7 +964,12 @@ fn select_locales_for_key( .collect(); return Ok(selected_locales); } - include_full = true; + } + LocalesWithOrWithoutFallback::WithoutFallback { langids: locales } => { + for langid in locales { + let value = locales_map.entry(langid.clone()).or_default(); + value.is_selected = true; + } } } @@ -1215,7 +1206,7 @@ fn test_collation_filtering() { &crate::provider::DatagenProvider::new_testing(), icu_collator::provider::CollationDataV1Marker::KEY, &LocalesWithOrWithoutFallback::WithoutFallback { - locales: [cas.language.clone()].into_iter().collect(), + langids: [cas.language.clone()].into_iter().collect(), }, &HashSet::from_iter(cas.include_collations.iter().copied().map(String::from)), &[], From ac0b97d4590f66541fe574e43388471d92409766 Mon Sep 17 00:00:00 2001 From: "Shane F. 
Carr" Date: Wed, 24 Apr 2024 19:06:32 -0700 Subject: [PATCH 12/28] de-AT-u-co-phonebk is now included to prevent fallback to de-AT --- provider/baked/collator/data/macros/collator_meta_v1.rs.data | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/provider/baked/collator/data/macros/collator_meta_v1.rs.data b/provider/baked/collator/data/macros/collator_meta_v1.rs.data index 821f1c5774d..20abe13104e 100644 --- a/provider/baked/collator/data/macros/collator_meta_v1.rs.data +++ b/provider/baked/collator/data/macros/collator_meta_v1.rs.data @@ -20,8 +20,8 @@ macro_rules! __impl_collator_meta_v1 { static AR: ::Yokeable = icu::collator::provider::CollationMetadataV1 { bits: 41u32 }; static LT: ::Yokeable = icu::collator::provider::CollationMetadataV1 { bits: 73u32 }; static AF: ::Yokeable = icu::collator::provider::CollationMetadataV1 { bits: 9u32 }; - static VALUES: [&::Yokeable; 87usize] = [&AF, &AM, &AR, &AR, &AR, &AR, &AM, &AR, &AF, &AR, &AF, &AM, &AF, &AF, &DA, &AF, &AF, &AM, &AF, &AF, &AF, &AF, &AR, &AF, &AF, &AF, &AF, &FR_CA, &AF, &AF, &AR, &AF, &AM, &AR, &AR, &AF, &AF, &AR, &AF, &AF, &AR, &AM, &AR, &AR, &AR, &AR, &AR, &AR, &AR, &AM, &LT, &AF, &AR, &AR, &AM, &AR, &DA, &AR, &AM, &AF, &AR, &AR, &AF, &AR, &AF, &AM, &AR, &AF, &AF, &AF, &AR, &AF, &AR, &AR, &TH, &AF, &AF, &AF, &AR, &AR, &UND, &AR, &AF, &VI, &AF, &AF, &AR]; - static KEYS: [&str; 87usize] = ["af", "am", "ar", "as", "az", "be", "bg", "bn", "br", "bs", "ceb", "chr", "cs", "cy", "da", "de-u-co-phonebk", "dsb", "el", "en-US-posix", "eo", "es", "et", "fa", "ff-Adlm", "fi", "fil", "fo", "fr-CA", "fy", "gl", "gu", "ha", "he", "hi", "hr", "hsb", "hu", "hy", "ig", "is", "ja", "ka", "kk", "km", "kn", "ko", "kok", "ku", "ky", "lo", "lt", "lv", "mk", "ml", "mn", "mr", "mt", "my", "ne", "no", "or", "pa", "pl", "ps", "ro", "ru", "si", "sk", "sl", "sq", "sr", "sv", "ta", "te", "th", "tk", "to", "tr", "ug", "uk", "und", "ur", "uz", "vi", "wo", "yo", "zh"]; + static VALUES: [&::Yokeable; 88usize] = [&AF, 
&AM, &AR, &AR, &AR, &AR, &AM, &AR, &AF, &AR, &AF, &AM, &AF, &AF, &DA, &AF, &AF, &AF, &AM, &AF, &AF, &AF, &AF, &AR, &AF, &AF, &AF, &AF, &FR_CA, &AF, &AF, &AR, &AF, &AM, &AR, &AR, &AF, &AF, &AR, &AF, &AF, &AR, &AM, &AR, &AR, &AR, &AR, &AR, &AR, &AR, &AM, &LT, &AF, &AR, &AR, &AM, &AR, &DA, &AR, &AM, &AF, &AR, &AR, &AF, &AR, &AF, &AM, &AR, &AF, &AF, &AF, &AR, &AF, &AR, &AR, &TH, &AF, &AF, &AF, &AR, &AR, &UND, &AR, &AF, &VI, &AF, &AF, &AR]; + static KEYS: [&str; 88usize] = ["af", "am", "ar", "as", "az", "be", "bg", "bn", "br", "bs", "ceb", "chr", "cs", "cy", "da", "de-AT-u-co-phonebk", "de-u-co-phonebk", "dsb", "el", "en-US-posix", "eo", "es", "et", "fa", "ff-Adlm", "fi", "fil", "fo", "fr-CA", "fy", "gl", "gu", "ha", "he", "hi", "hr", "hsb", "hu", "hy", "ig", "is", "ja", "ka", "kk", "km", "kn", "ko", "kok", "ku", "ky", "lo", "lt", "lv", "mk", "ml", "mn", "mr", "mt", "my", "ne", "no", "or", "pa", "pl", "ps", "ro", "ru", "si", "sk", "sl", "sq", "sr", "sv", "ta", "te", "th", "tk", "to", "tr", "ug", "uk", "und", "ur", "uz", "vi", "wo", "yo", "zh"]; let mut metadata = icu_provider::DataResponseMetadata::default(); let payload = if let Ok(payload) = KEYS.binary_search_by(|k| req.locale.strict_cmp(k.as_bytes()).reverse()).map(|i| *unsafe { VALUES.get_unchecked(i) }) { payload From 5eb0f7807cb8a52c075733f83585c8d8d68a9f1d Mon Sep 17 00:00:00 2001 From: "Shane F. 
Carr" Date: Wed, 24 Apr 2024 19:07:42 -0700 Subject: [PATCH 13/28] fmt/rename --- provider/datagen/src/driver.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/provider/datagen/src/driver.rs b/provider/datagen/src/driver.rs index 5ccefec90c7..b36429b1bb5 100644 --- a/provider/datagen/src/driver.rs +++ b/provider/datagen/src/driver.rs @@ -631,7 +631,7 @@ impl DatagenDriver { ); (false, DeduplicationStrategy::None) } - LocalesWithOrWithoutFallback::WithFallback { options, families: locales } => { + LocalesWithOrWithoutFallback::WithFallback { options, families } => { let uses_internal_fallback = match options.runtime_fallback_location { None => sink.supports_built_in_fallback(), Some(RuntimeFallbackLocation::Internal) => true, @@ -649,7 +649,7 @@ impl DatagenDriver { Some(x) => x, }; let mut sorted_locales = - locales.iter().map(ToString::to_string).collect::>(); + families.iter().map(ToString::to_string).collect::>(); sorted_locales.sort(); log::info!( "Datagen configured with {}, {}, and these locales: {:?}", From 0600a812b80b140c19ff8832136b655fdd56b333 Mon Sep 17 00:00:00 2001 From: "Shane F. 
Carr" Date: Wed, 24 Apr 2024 19:10:03 -0700 Subject: [PATCH 14/28] Undo addition of either dep --- provider/datagen/Cargo.toml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/provider/datagen/Cargo.toml b/provider/datagen/Cargo.toml index db5aca3d9e0..7a6e257873d 100644 --- a/provider/datagen/Cargo.toml +++ b/provider/datagen/Cargo.toml @@ -35,7 +35,6 @@ all-features = true # DatagenDriver displaydoc = { workspace = true } -either = { workspace = true } icu_locid = { workspace = true, features = ["std"] } icu_provider = { workspace = true, features = ["std", "logging", "datagen", "experimental"]} log = { workspace = true } @@ -80,6 +79,7 @@ zerotrie = { workspace = true, features = ["alloc"], optional = true } zerovec = { workspace = true, features = ["serde", "yoke"], optional = true } ## External dependencies +either = { workspace = true, optional = true } elsa = { workspace = true, optional = true } itertools = { workspace = true, features = ["use_alloc"], optional = true } ndarray = { workspace = true, optional = true } @@ -132,6 +132,7 @@ provider = [ "dep:tinystr", "dep:zerotrie", "dep:zerovec", + "dep:either", "dep:elsa", "dep:itertools", "dep:ndarray", From d98027023b2730bb5671f48b9282bc2023aa2727 Mon Sep 17 00:00:00 2001 From: "Shane F. 
Carr" Date: Mon, 29 Apr 2024 16:37:23 -0700 Subject: [PATCH 15/28] Define precedence ordering for LocaleFamily; refactor to support it --- provider/datagen/src/driver.rs | 238 +++++++++++++++++++------ provider/datagen/tests/test-options.rs | 45 +++++ 2 files changed, 232 insertions(+), 51 deletions(-) diff --git a/provider/datagen/src/driver.rs b/provider/datagen/src/driver.rs index b36429b1bb5..266cf5bb383 100644 --- a/provider/datagen/src/driver.rs +++ b/provider/datagen/src/driver.rs @@ -16,6 +16,7 @@ use once_cell::sync::Lazy; use std::collections::HashMap; use std::collections::HashSet; use std::fmt; +use std::hash::Hash; use std::str::FromStr; use std::time::Duration; use std::time::Instant; @@ -75,12 +76,52 @@ pub enum DeduplicationStrategy { None, } +/// Inner fields of a [`LocaleFamily`]. +#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)] +pub(crate) struct LocaleFamilyAnnotations { + include_ancestors: bool, + include_descendants: bool, +} + +impl LocaleFamilyAnnotations { + #[inline] + pub(crate) const fn with_descendants() -> Self { + Self { + include_ancestors: true, + include_descendants: true, + } + } + + #[inline] + pub(crate) const fn without_descendants() -> Self { + Self { + include_ancestors: true, + include_descendants: false, + } + } + + #[inline] + pub(crate) const fn without_ancestors() -> Self { + Self { + include_ancestors: false, + include_descendants: true, + } + } + + #[inline] + pub(crate) const fn single() -> Self { + Self { + include_ancestors: false, + include_descendants: false, + } + } +} + /// A family of locales to export. 
#[derive(Debug, Clone, PartialEq, Eq, Hash)] pub struct LocaleFamily { langid: Option, - include_ancestors: bool, - include_descendants: bool, + annotations: LocaleFamilyAnnotations, } impl LocaleFamily { @@ -98,8 +139,7 @@ impl LocaleFamily { pub const fn with_descendants(langid: LanguageIdentifier) -> Self { Self { langid: Some(langid), - include_ancestors: true, - include_descendants: true, + annotations: LocaleFamilyAnnotations::with_descendants(), } } @@ -117,8 +157,7 @@ impl LocaleFamily { pub const fn without_descendants(langid: LanguageIdentifier) -> Self { Self { langid: Some(langid), - include_ancestors: true, - include_descendants: false, + annotations: LocaleFamilyAnnotations::without_descendants(), } } @@ -137,8 +176,7 @@ impl LocaleFamily { pub const fn without_ancestors(langid: LanguageIdentifier) -> Self { Self { langid: Some(langid), - include_ancestors: false, - include_descendants: true, + annotations: LocaleFamilyAnnotations::without_ancestors(), } } @@ -150,8 +188,7 @@ impl LocaleFamily { pub const fn single(langid: LanguageIdentifier) -> Self { Self { langid: Some(langid), - include_ancestors: false, - include_descendants: false, + annotations: LocaleFamilyAnnotations::single(), } } @@ -161,18 +198,62 @@ impl LocaleFamily { pub const fn full() -> Self { Self { langid: None, - include_ancestors: false, - include_descendants: true, + annotations: LocaleFamilyAnnotations { + include_ancestors: false, + include_descendants: true, + }, + } + } + + pub(crate) fn into_inner(self) -> (Option, LocaleFamilyAnnotations) { + (self.langid, self.annotations) + } + + pub(crate) fn as_borrowed(&self) -> LocaleFamilyBorrowed { + LocaleFamilyBorrowed { + langid: self.langid.as_ref(), + annotations: self.annotations, } } } impl Writeable for LocaleFamily { + #[inline] + fn write_to(&self, sink: &mut W) -> core::fmt::Result { + self.as_borrowed().write_to(sink) + } + + #[inline] + fn writeable_length_hint(&self) -> writeable::LengthHint { + 
self.as_borrowed().writeable_length_hint() + } +} + +writeable::impl_display_with_writeable!(LocaleFamily); + +/// A [`LocaleFamily`] that does not own its [`LanguageIdentifier`]. +pub(crate) struct LocaleFamilyBorrowed<'a> { + langid: Option<&'a LanguageIdentifier>, + annotations: LocaleFamilyAnnotations, +} + +impl<'a> LocaleFamilyBorrowed<'a> { + pub(crate) fn from_inner( + inner: (&'a Option, &LocaleFamilyAnnotations), + ) -> Self { + Self { + langid: inner.0.as_ref(), + annotations: *inner.1, + } + } +} + +impl Writeable for LocaleFamilyBorrowed<'_> { fn write_to(&self, sink: &mut W) -> core::fmt::Result { match ( &self.langid, - self.include_ancestors, - self.include_descendants, + self.annotations.include_ancestors, + self.annotations.include_descendants, ) { (Some(langid), true, true) => langid.write_to(sink), (Some(langid), true, false) => { @@ -194,8 +275,8 @@ impl Writeable for LocaleFamily { fn writeable_length_hint(&self) -> writeable::LengthHint { match ( &self.langid, - self.include_ancestors, - self.include_descendants, + self.annotations.include_ancestors, + self.annotations.include_descendants, ) { (Some(langid), true, true) => langid.writeable_length_hint(), (Some(langid), true, false) => langid.writeable_length_hint() + 1, @@ -206,8 +287,6 @@ impl Writeable for LocaleFamily { } } -writeable::impl_display_with_writeable!(LocaleFamily); - /// An error while parsing a [`LocaleFamily`]. 
#[derive(Debug, Copy, Clone, PartialEq, Display)] #[non_exhaustive] @@ -241,23 +320,31 @@ impl FromStr for LocaleFamily { match first { b'^' => Ok(Self { langid: Some(LanguageIdentifier::try_from_bytes(remainder)?), - include_ancestors: true, - include_descendants: false, + annotations: LocaleFamilyAnnotations { + include_ancestors: true, + include_descendants: false, + }, }), b'%' => Ok(Self { langid: Some(LanguageIdentifier::try_from_bytes(remainder)?), - include_ancestors: false, - include_descendants: true, + annotations: LocaleFamilyAnnotations { + include_ancestors: false, + include_descendants: true, + }, }), b'@' => Ok(Self { langid: Some(LanguageIdentifier::try_from_bytes(remainder)?), - include_ancestors: false, - include_descendants: false, + annotations: LocaleFamilyAnnotations { + include_ancestors: false, + include_descendants: false, + }, }), b if b.is_ascii_alphanumeric() => Ok(Self { langid: Some(s.parse()?), - include_ancestors: true, - include_descendants: true, + annotations: LocaleFamilyAnnotations { + include_ancestors: true, + include_descendants: true, + }, }), _ => Err(LocaleFamilyParseError::InvalidFamily), } @@ -297,7 +384,7 @@ pub struct FallbackOptions { #[derive(Debug, Clone)] enum LocalesWithOrWithoutFallback { WithFallback { - families: HashSet, + families: HashMap, LocaleFamilyAnnotations>, options: FallbackOptions, }, WithoutFallback { @@ -420,6 +507,9 @@ impl DatagenDriver { /// Use the [`langid!`] macro from the prelude to create an /// explicit list, or [`DatagenProvider::locales_for_coverage_levels`] for CLDR coverage levels. /// + /// If there are multiple [`LocaleFamily`]s for the same [`LanguageIdentifier`], the last entry + /// in the iterator takes precedence. 
+ /// /// [`langid!`]: crate::prelude::langid /// [`DatagenProvider::locales_for_coverage_levels`]: crate::DatagenProvider::locales_for_coverage_levels pub fn with_locales_and_fallback( @@ -429,7 +519,7 @@ impl DatagenDriver { ) -> Self { Self { locales_fallback: Some(LocalesWithOrWithoutFallback::WithFallback { - families: locales.into_iter().collect(), + families: locales.into_iter().map(LocaleFamily::into_inner).collect(), options, }), ..self @@ -554,8 +644,15 @@ impl DatagenDriver { let map_legacy_locales_to_locales_with_expansion = |legacy_locales: Option>| match legacy_locales { - Some(v) => v.into_iter().map(LocaleFamily::with_descendants).collect(), - None => [LocaleFamily::full()].into_iter().collect(), + Some(v) => v + .into_iter() + .map(LocaleFamily::with_descendants) + .map(LocaleFamily::into_inner) + .collect(), + None => [LocaleFamily::full()] + .into_iter() + .map(LocaleFamily::into_inner) + .collect(), }; let locales_fallback = match (locales_fallback, legacy_locales, legacy_fallback_mode) { @@ -622,9 +719,11 @@ impl DatagenDriver { let (uses_internal_fallback, deduplication_strategy) = match &locales_fallback { LocalesWithOrWithoutFallback::WithoutFallback { langids: locales } => { - let mut sorted_locales = - locales.iter().map(ToString::to_string).collect::>(); - sorted_locales.sort(); + let mut sorted_locales = locales + .iter() + .map(|x| x.write_to_string()) + .collect::>(); + sorted_locales.sort_unstable(); log::info!( "Datagen configured without fallback with these locales: {:?}", sorted_locales @@ -648,15 +747,18 @@ impl DatagenDriver { } Some(x) => x, }; - let mut sorted_locales = - families.iter().map(ToString::to_string).collect::>(); - sorted_locales.sort(); + let mut sorted_locales = families + .iter() + .map(LocaleFamilyBorrowed::from_inner) + .map(|family| family.write_to_string().into_owned()) + .collect::>(); + sorted_locales.sort_unstable(); log::info!( "Datagen configured with {}, {}, and these locales: {:?}", if 
uses_internal_fallback { "internal fallback" } else { - "external fallback " + "external fallback" }, match deduplication_strategy { DeduplicationStrategy::Maximal => "maximal deduplication", @@ -869,24 +971,28 @@ fn select_locales_for_key( impl FnOnce() -> Result, >, ) -> Result, DataError> { - // A map from langid to data locales. Keys that have aux keys or extension keywords - // may have multiple data locales per langid. + /// Values for the map from langid to data locales. Keys that have aux keys or extension + /// keywords may have multiple data locales per langid. #[derive(Default)] - struct LocalesMapValue<'a> { - family: Option<&'a LocaleFamily>, + struct LocalesMapValue { + /// The [`LocaleFamilyAnnotations`] associated with this language identifier if it was + /// explicitly requested. If `None`, the langid is supported but not requested. + family: Option, + /// Whether this language identifier should be included in the export. is_selected: bool, + /// The set of exportable [`DataLocale`]s associated with this language identifier. data_locales: HashSet, } - let mut locales_map: HashMap = Default::default(); + let mut locales_map = HashMap::::new(); for locale in provider .supported_locales_for_key(key) .map_err(|e| e.with_key(key))? 
{ - use std::collections::hash_map::Entry; - match locales_map.entry(locale.get_langid()) { - Entry::Occupied(mut entry) => entry.get_mut().data_locales.insert(locale), - Entry::Vacant(entry) => entry.insert(Default::default()).data_locales.insert(locale), - }; + locales_map + .entry(locale.get_langid()) + .or_default() + .data_locales + .insert(locale); } if key.path().get().starts_with("segmenter/dictionary/") { @@ -944,16 +1050,16 @@ fn select_locales_for_key( let value = locales_map.entry(LanguageIdentifier::UND).or_default(); value.is_selected = true; } - for family in families { - let Some(langid) = family.langid.as_ref() else { - debug_assert_eq!(*family, LocaleFamily::full()); + for (langid, annotations) in families { + let Some(langid) = langid.as_ref() else { + debug_assert_eq!(annotations, &LocaleFamily::full().annotations); include_full = true; continue; }; let value = locales_map.entry(langid.clone()).or_default(); value.is_selected = true; if *langid != LanguageIdentifier::UND { - value.family = Some(family); + value.family = Some(*annotations); } } if include_full && families.len() == 1 { @@ -1225,3 +1331,33 @@ fn test_collation_filtering() { assert_eq!(resolved_locales, expected_locales, "{cas:?}"); } } + +/// Test that the last option with multiple conflicting families wins. +#[test] +fn test_family_precedence() { + let driver = DatagenDriver::new().with_locales_and_fallback( + [ + "en".parse().unwrap(), + "%en".parse().unwrap(), + "@en".parse().unwrap(), + "%zh-TW".parse().unwrap(), + "^zh-TW".parse().unwrap(), + ], + Default::default(), + ); + + let Some(LocalesWithOrWithoutFallback::WithFallback { families, .. 
}) = driver.locales_fallback + else { + panic!("expected locales with fallback") + }; + + assert_eq!( + families, + [ + "@en".parse::().unwrap().into_inner(), + "^zh-TW".parse::().unwrap().into_inner() + ] + .into_iter() + .collect::>() + ); +} diff --git a/provider/datagen/tests/test-options.rs b/provider/datagen/tests/test-options.rs index 78fa43932aa..ba5c7db895b 100644 --- a/provider/datagen/tests/test-options.rs +++ b/provider/datagen/tests/test-options.rs @@ -743,6 +743,51 @@ fn explicit_hybrid_without_ancestors() { assert_eq!(exported.keys().collect::>(), locales); } +#[test] +fn explicit_hybrid_mixed_families() { + const SELECTED_LOCALES: [LocaleFamily; 8] = [ + LocaleFamily::without_ancestors(langid!("arc")), // Aramaic, not in supported list + LocaleFamily::with_descendants(langid!("ar-EG")), + LocaleFamily::without_ancestors(langid!("ar-EG")), // duplicate entry for ar-EG + LocaleFamily::with_descendants(langid!("en")), + LocaleFamily::single(langid!("en")), // duplicate entry for en + LocaleFamily::without_ancestors(langid!("en-GB")), + LocaleFamily::without_descendants(langid!("es")), + LocaleFamily::with_descendants(langid!("es")), // duplicate entry for es + ]; + let exported = export_to_map_1_5( + DatagenDriver::new() + .with_keys([HelloWorldV1Marker::KEY]) + .with_locales_and_fallback(SELECTED_LOCALES, Default::default()), + &TestingProvider::with_decimal_symbol_like_data(), + ); + + let locales = [ + // "ar", // excluded: ancestor of ar-EG + "ar-EG", // explicit locale + "ar-EG-u-nu-latn", // explicit with extensions + // "ar-SA", // explicit locale, inheriting from ar + // "ar-SA-u-nu-latn", // not reachable + // "ar-u-nu-latn", // not reachable + "arc", // Aramaic, inheriting from und + "en", // included as a singleton + // "en-001", // excluded: ancestor of en-GB + "en-GB", // included without ancestors + // "en-ZA", // not reachable + "es", // explicit and supported + "es-AR", // descendant of es + // "ru", // not requested + // "ru-Cyrl-RU", 
// not requested + // "sr", // not requested + // "sr-Latn", // not requested + // "sr-ME", // not requested + "und", + ]; + + // Should return the exact explicit locales set. + assert_eq!(exported.keys().collect::>(), locales); +} + #[test] fn explicit_runtime_und() { let exported = export_to_map( From 8d18020140022308f249cdbc68354b59fdd55d13 Mon Sep 17 00:00:00 2001 From: "Shane F. Carr" Date: Mon, 29 Apr 2024 17:20:58 -0700 Subject: [PATCH 16/28] Switch data structures to use less mutation --- provider/datagen/src/driver.rs | 125 ++++++++++++++++----------------- 1 file changed, 59 insertions(+), 66 deletions(-) diff --git a/provider/datagen/src/driver.rs b/provider/datagen/src/driver.rs index 266cf5bb383..7bebde4efd6 100644 --- a/provider/datagen/src/driver.rs +++ b/provider/datagen/src/driver.rs @@ -971,59 +971,48 @@ fn select_locales_for_key( impl FnOnce() -> Result, >, ) -> Result, DataError> { - /// Values for the map from langid to data locales. Keys that have aux keys or extension - /// keywords may have multiple data locales per langid. - #[derive(Default)] - struct LocalesMapValue { - /// The [`LocaleFamilyAnnotations`] associated with this language identifier if it was - /// explicitly requested. If `None`, the langid is supported but not requested. - family: Option, - /// Whether this language identifier should be included in the export. - is_selected: bool, - /// The set of exportable [`DataLocale`]s associated with this language identifier. - data_locales: HashSet, - } - let mut locales_map = HashMap::::new(); + // Map from all supported LanguageIdentifiers to their + // corresponding supported DataLocales. + let mut supported_map = HashMap::>::new(); for locale in provider .supported_locales_for_key(key) .map_err(|e| e.with_key(key))? 
{ - locales_map + supported_map .entry(locale.get_langid()) .or_default() - .data_locales .insert(locale); } if key.path().get().starts_with("segmenter/dictionary/") { - locales_map.retain(|_, value| { - value.data_locales.retain(|locale| { + supported_map.retain(|_, locales| { + locales.retain(|locale| { let model = crate::dictionary_data_locale_to_model_name(locale); segmenter_models.iter().any(|m| Some(m.as_ref()) == model) }); - !value.data_locales.is_empty() + !locales.is_empty() }); // Don't perform additional locale filtering - return Ok(locales_map + return Ok(supported_map .into_values() - .flat_map(|value| value.data_locales) + .flat_map(|locales| locales) .collect()); } else if key.path().get().starts_with("segmenter/lstm/") { - locales_map.retain(|_, value| { - value.data_locales.retain(|locale| { + supported_map.retain(|_, locales| { + locales.retain(|locale| { let model = crate::lstm_data_locale_to_model_name(locale); segmenter_models.iter().any(|m| Some(m.as_ref()) == model) }); - !value.data_locales.is_empty() + !locales.is_empty() }); // Don't perform additional locale filtering - return Ok(locales_map + return Ok(supported_map .into_values() - .flat_map(|value| value.data_locales) + .flat_map(|locales| locales) .collect()); } else if key.path().get().starts_with("collator/") { - locales_map.retain(|_, value| { - value.data_locales.retain(|locale| { + supported_map.retain(|_, locales| { + locales.retain(|locale| { let Some(collation) = locale .get_unicode_ext(&key!("co")) .and_then(|co| co.as_single_subtag().copied()) @@ -1037,18 +1026,22 @@ fn select_locales_for_key( !["big5han", "gb2312"].contains(&collation.as_str()) } }); - !value.data_locales.is_empty() + !locales.is_empty() }); } + // The explicitly requested families, except for the `full` family. + // In without-fallback mode, langids are mapped to `single`. 
+ let mut requested_families = HashMap::::new(); + // Add the explicit langids to the map let mut include_full = false; match locales_fallback { LocalesWithOrWithoutFallback::WithFallback { families, .. } => { if families.is_empty() { // If no locales are selected but fallback is enabled, select the root locale - let value = locales_map.entry(LanguageIdentifier::UND).or_default(); - value.is_selected = true; + requested_families + .insert(LanguageIdentifier::UND, LocaleFamilyAnnotations::single()); } for (langid, annotations) in families { let Some(langid) = langid.as_ref() else { @@ -1056,77 +1049,77 @@ fn select_locales_for_key( include_full = true; continue; }; - let value = locales_map.entry(langid.clone()).or_default(); - value.is_selected = true; if *langid != LanguageIdentifier::UND { - value.family = Some(*annotations); + requested_families.insert(langid.clone(), *annotations); + } else { + requested_families + .insert(LanguageIdentifier::UND, LocaleFamilyAnnotations::single()); } } if include_full && families.len() == 1 { // Special case: return now so we don't need the fallbacker - let selected_locales = locales_map - .into_iter() - .flat_map(|(_, value)| value.data_locales) - .collect(); + let selected_locales = supported_map.into_values().flatten().collect(); return Ok(selected_locales); } } LocalesWithOrWithoutFallback::WithoutFallback { langids: locales } => { for langid in locales { - let value = locales_map.entry(langid.clone()).or_default(); - value.is_selected = true; + requested_families.insert(langid.clone(), LocaleFamilyAnnotations::single()); } } } + // Set of all selected language identifiers after resolving ancestors and descendants. + let mut selected_langids = requested_families.keys().cloned().collect::>(); + + // Map from LanguageIdentifiers to DataLocales, including auxiliary keys and extensions + // inherited from their parent locales. 
+ let mut expansion_map = supported_map.clone(); + // Fill in missing extensions and aux keys from parent locales, // and calculate which langids are ancestors and descendants. - for current_langid in locales_map.keys().cloned().collect::>() { - let current_value = locales_map.get_mut(&current_langid).unwrap(); - if include_full && !current_value.is_selected { + for current_langid in supported_map.keys().chain(requested_families.keys()) { + if include_full && !selected_langids.contains(current_langid) { log::trace!("Including {current_langid}: the full locale family is present"); - current_value.is_selected = true; + selected_langids.insert(current_langid.clone()); } if current_langid.language.is_empty() { - if current_langid != LanguageIdentifier::UND { + if current_langid != &LanguageIdentifier::UND { log::trace!("Including {current_langid}: variants of und are always included"); - current_value.is_selected = true; + selected_langids.insert(current_langid.clone()); } - // Nothing left to be done here - continue; } - let include_ancestors = current_value - .family + let current_data_locales = expansion_map.entry(current_langid.clone()).or_default(); + let include_ancestors: bool = requested_families + .get(current_langid) .map(|family| family.include_ancestors) .unwrap_or(false); let fallbacker = fallbacker.as_ref().map_err(|e| *e)?; let fallbacker_with_config = fallbacker.for_config(key.fallback_config()); - let mut iter = fallbacker_with_config.fallback_for((&current_langid).into()); + let mut iter = fallbacker_with_config.fallback_for(current_langid.into()); loop { // Inherit aux keys and extension keywords from parent locales let parent_langid: LanguageIdentifier = iter.get().get_langid(); - if let Some(parent_value) = locales_map.get_mut(&parent_langid) { - if include_ancestors && !parent_value.is_selected { - log::trace!("Including {parent_langid}: ancestor of {current_langid}"); - parent_value.is_selected = true; - } - let include_descendants = parent_value - .family + 
if let Some(parent_locales) = supported_map.get(&parent_langid) { + let include_descendants = requested_families + .get(&parent_langid) .map(|family| family.include_descendants) .unwrap_or(false); - let parent_locales = parent_value.data_locales.clone(); - let current_value = locales_map.get_mut(&current_langid).unwrap(); - if include_descendants && !current_value.is_selected { + if include_descendants && !selected_langids.contains(current_langid) { log::trace!("Including {current_langid}: descendant of {parent_langid}"); - current_value.is_selected = true; + selected_langids.insert(current_langid.clone()); + } + if include_ancestors && !selected_langids.contains(&parent_langid) { + log::trace!("Including {parent_langid}: ancestor of {current_langid}"); + selected_langids.insert(parent_langid); } - for mut morphed_locale in parent_locales { + for mut morphed_locale in parent_locales.iter().cloned() { // Special case: don't pull extensions or aux keys up from the root. if morphed_locale.is_langid_und() && !morphed_locale.is_empty() { continue; } morphed_locale.set_langid(current_langid.clone()); - current_value.data_locales.insert(morphed_locale); + current_data_locales.insert(morphed_locale); } } if iter.get().is_und() { @@ -1136,10 +1129,10 @@ fn select_locales_for_key( } } - let selected_locales = locales_map + let selected_locales = expansion_map .into_iter() - .filter(|(_, value)| value.is_selected) - .flat_map(|(_, value)| value.data_locales) + .filter(|(langid, _)| selected_langids.contains(langid)) + .flat_map(|(_, data_locales)| data_locales) .collect(); Ok(selected_locales) } From 04b22d00c72e3deae76e6e1b99df43cdc9aa6613 Mon Sep 17 00:00:00 2001 From: "Shane F. 
Carr" Date: Mon, 29 Apr 2024 17:30:47 -0700 Subject: [PATCH 17/28] Even less mutation --- provider/datagen/src/driver.rs | 69 ++++++++++++++++++---------------- 1 file changed, 36 insertions(+), 33 deletions(-) diff --git a/provider/datagen/src/driver.rs b/provider/datagen/src/driver.rs index 7bebde4efd6..fd799838f1a 100644 --- a/provider/datagen/src/driver.rs +++ b/provider/datagen/src/driver.rs @@ -1031,42 +1031,45 @@ fn select_locales_for_key( } // The explicitly requested families, except for the `full` family. - // In without-fallback mode, langids are mapped to `single`. - let mut requested_families = HashMap::::new(); - - // Add the explicit langids to the map let mut include_full = false; - match locales_fallback { - LocalesWithOrWithoutFallback::WithFallback { families, .. } => { - if families.is_empty() { + let requested_families: HashMap = + match locales_fallback { + LocalesWithOrWithoutFallback::WithFallback { families, .. } if families.is_empty() => { // If no locales are selected but fallback is enabled, select the root locale - requested_families - .insert(LanguageIdentifier::UND, LocaleFamilyAnnotations::single()); - } - for (langid, annotations) in families { - let Some(langid) = langid.as_ref() else { - debug_assert_eq!(annotations, &LocaleFamily::full().annotations); - include_full = true; - continue; - }; - if *langid != LanguageIdentifier::UND { - requested_families.insert(langid.clone(), *annotations); - } else { - requested_families - .insert(LanguageIdentifier::UND, LocaleFamilyAnnotations::single()); - } - } - if include_full && families.len() == 1 { - // Special case: return now so we don't need the fallbacker - let selected_locales = supported_map.into_values().flatten().collect(); - return Ok(selected_locales); - } - } - LocalesWithOrWithoutFallback::WithoutFallback { langids: locales } => { - for langid in locales { - requested_families.insert(langid.clone(), LocaleFamilyAnnotations::single()); + [(LanguageIdentifier::UND, 
LocaleFamilyAnnotations::single())] + .into_iter() + .collect() } - } + LocalesWithOrWithoutFallback::WithFallback { families, .. } => families + .iter() + .filter_map(|(langid, annotations)| { + if let Some(langid) = langid.as_ref() { + if *langid == LanguageIdentifier::UND { + // Root locale: do not include descendants (use `full` for that) + Some((LanguageIdentifier::UND, LocaleFamilyAnnotations::single())) + } else { + // All other locales: copy the requested annotations + Some((langid.clone(), *annotations)) + } + } else { + // Full locale family: set the bit instead of adding to the set + debug_assert_eq!(annotations, &LocaleFamily::full().annotations); + include_full = true; + None + } + }) + .collect(), + LocalesWithOrWithoutFallback::WithoutFallback { langids } => langids + .iter() + // Map langids without fallback to the `single` family + .map(|langid| (langid.clone(), LocaleFamilyAnnotations::single())) + .collect(), + }; + + if include_full && requested_families.is_empty() { + // Special case: return now so we don't need the fallbacker + let selected_locales = supported_map.into_values().flatten().collect(); + return Ok(selected_locales); } // Set of all selected language identifiers after resolving ancestors and descendants. From 23810a7343aa819edfe44f7a76486042d1536165 Mon Sep 17 00:00:00 2001 From: "Shane F. 
Carr" Date: Mon, 29 Apr 2024 17:40:35 -0700 Subject: [PATCH 18/28] Even less mutation --- provider/datagen/src/driver.rs | 27 ++++++++++++--------------- 1 file changed, 12 insertions(+), 15 deletions(-) diff --git a/provider/datagen/src/driver.rs b/provider/datagen/src/driver.rs index fd799838f1a..c2f2f2f4ef6 100644 --- a/provider/datagen/src/driver.rs +++ b/provider/datagen/src/driver.rs @@ -1067,21 +1067,20 @@ fn select_locales_for_key( }; if include_full && requested_families.is_empty() { - // Special case: return now so we don't need the fallbacker + // Special case: return now so we don't need the fallbacker (and its requisite CLDR data) let selected_locales = supported_map.into_values().flatten().collect(); return Ok(selected_locales); } - // Set of all selected language identifiers after resolving ancestors and descendants. - let mut selected_langids = requested_families.keys().cloned().collect::>(); - - // Map from LanguageIdentifiers to DataLocales, including auxiliary keys and extensions - // inherited from their parent locales. - let mut expansion_map = supported_map.clone(); + // Need the fallbacker now. + let fallbacker = fallbacker.as_ref().map_err(|e| *e)?; + let fallbacker_with_config = fallbacker.for_config(key.fallback_config()); - // Fill in missing extensions and aux keys from parent locales, - // and calculate which langids are ancestors and descendants. - for current_langid in supported_map.keys().chain(requested_families.keys()) { + // Compute a map from LanguageIdentifiers to DataLocales, including inherited auxiliary keys + // and extensions. Also resolve the ancestors and descendants while building this map. 
+ let mut selected_langids = requested_families.keys().cloned().collect::>(); + let expansion_map: HashMap<&LanguageIdentifier, HashSet> = supported_map.keys().chain(requested_families.keys()).map(|current_langid| { + let mut expansion = HashSet::new(); if include_full && !selected_langids.contains(current_langid) { log::trace!("Including {current_langid}: the full locale family is present"); selected_langids.insert(current_langid.clone()); @@ -1092,13 +1091,10 @@ fn select_locales_for_key( selected_langids.insert(current_langid.clone()); } } - let current_data_locales = expansion_map.entry(current_langid.clone()).or_default(); let include_ancestors: bool = requested_families .get(current_langid) .map(|family| family.include_ancestors) .unwrap_or(false); - let fallbacker = fallbacker.as_ref().map_err(|e| *e)?; - let fallbacker_with_config = fallbacker.for_config(key.fallback_config()); let mut iter = fallbacker_with_config.fallback_for(current_langid.into()); loop { // Inherit aux keys and extension keywords from parent locales @@ -1122,7 +1118,7 @@ fn select_locales_for_key( continue; } morphed_locale.set_langid(current_langid.clone()); - current_data_locales.insert(morphed_locale); + expansion.insert(morphed_locale); } } if iter.get().is_und() { @@ -1130,7 +1126,8 @@ fn select_locales_for_key( } iter.step(); } - } + (current_langid, expansion) + }).collect(); let selected_locales = expansion_map .into_iter() From f955ef10ac1bcdb06e662aaf83d84645a96debc2 Mon Sep 17 00:00:00 2001 From: "Shane F. 
Carr" Date: Mon, 29 Apr 2024 17:43:59 -0700 Subject: [PATCH 19/28] docs, clippy --- provider/datagen/src/driver.rs | 100 ++++++++++++++++----------------- 1 file changed, 48 insertions(+), 52 deletions(-) diff --git a/provider/datagen/src/driver.rs b/provider/datagen/src/driver.rs index c2f2f2f4ef6..2ce1d1c5306 100644 --- a/provider/datagen/src/driver.rs +++ b/provider/datagen/src/driver.rs @@ -163,7 +163,7 @@ impl LocaleFamily { /// The family containing all descendants of the selected locale. /// - /// This family is primarily useful if the root locale is not desired. + /// This family may be useful if the root locale is not desired. /// /// For example, the family `::without_ancestors("en-001")` contains: /// @@ -993,10 +993,7 @@ fn select_locales_for_key( !locales.is_empty() }); // Don't perform additional locale filtering - return Ok(supported_map - .into_values() - .flat_map(|locales| locales) - .collect()); + return Ok(supported_map.into_values().flatten().collect()); } else if key.path().get().starts_with("segmenter/lstm/") { supported_map.retain(|_, locales| { locales.retain(|locale| { @@ -1006,10 +1003,7 @@ fn select_locales_for_key( !locales.is_empty() }); // Don't perform additional locale filtering - return Ok(supported_map - .into_values() - .flat_map(|locales| locales) - .collect()); + return Ok(supported_map.into_values().flatten().collect()); } else if key.path().get().starts_with("collator/") { supported_map.retain(|_, locales| { locales.retain(|locale| { @@ -1079,55 +1073,57 @@ fn select_locales_for_key( // Compute a map from LanguageIdentifiers to DataLocales, including inherited auxiliary keys // and extensions. Also resolve the ancestors and descendants while building this map. 
let mut selected_langids = requested_families.keys().cloned().collect::>(); - let expansion_map: HashMap<&LanguageIdentifier, HashSet> = supported_map.keys().chain(requested_families.keys()).map(|current_langid| { - let mut expansion = HashSet::new(); - if include_full && !selected_langids.contains(current_langid) { - log::trace!("Including {current_langid}: the full locale family is present"); - selected_langids.insert(current_langid.clone()); - } - if current_langid.language.is_empty() { - if current_langid != &LanguageIdentifier::UND { + let expansion_map: HashMap<&LanguageIdentifier, HashSet> = supported_map + .keys() + .chain(requested_families.keys()) + .map(|current_langid| { + let mut expansion = HashSet::new(); + if include_full && !selected_langids.contains(current_langid) { + log::trace!("Including {current_langid}: the full locale family is present"); + selected_langids.insert(current_langid.clone()); + } + if current_langid.language.is_empty() && current_langid != &LanguageIdentifier::UND { log::trace!("Including {current_langid}: variants of und are always included"); selected_langids.insert(current_langid.clone()); } - } - let include_ancestors: bool = requested_families - .get(current_langid) - .map(|family| family.include_ancestors) - .unwrap_or(false); - let mut iter = fallbacker_with_config.fallback_for(current_langid.into()); - loop { - // Inherit aux keys and extension keywords from parent locales - let parent_langid: LanguageIdentifier = iter.get().get_langid(); - if let Some(parent_locales) = supported_map.get(&parent_langid) { - let include_descendants = requested_families - .get(&parent_langid) - .map(|family| family.include_descendants) - .unwrap_or(false); - if include_descendants && !selected_langids.contains(current_langid) { - log::trace!("Including {current_langid}: descendant of {parent_langid}"); - selected_langids.insert(current_langid.clone()); - } - if include_ancestors && !selected_langids.contains(&parent_langid) { - 
log::trace!("Including {parent_langid}: ancestor of {current_langid}"); - selected_langids.insert(parent_langid); - } - for mut morphed_locale in parent_locales.iter().cloned() { - // Special case: don't pull extensions or aux keys up from the root. - if morphed_locale.is_langid_und() && !morphed_locale.is_empty() { - continue; + let include_ancestors: bool = requested_families + .get(current_langid) + .map(|family| family.include_ancestors) + .unwrap_or(false); + let mut iter = fallbacker_with_config.fallback_for(current_langid.into()); + loop { + // Inherit aux keys and extension keywords from parent locales + let parent_langid: LanguageIdentifier = iter.get().get_langid(); + if let Some(parent_locales) = supported_map.get(&parent_langid) { + let include_descendants = requested_families + .get(&parent_langid) + .map(|family| family.include_descendants) + .unwrap_or(false); + if include_descendants && !selected_langids.contains(current_langid) { + log::trace!("Including {current_langid}: descendant of {parent_langid}"); + selected_langids.insert(current_langid.clone()); + } + if include_ancestors && !selected_langids.contains(&parent_langid) { + log::trace!("Including {parent_langid}: ancestor of {current_langid}"); + selected_langids.insert(parent_langid); + } + for mut morphed_locale in parent_locales.iter().cloned() { + // Special case: don't pull extensions or aux keys up from the root. 
+ if morphed_locale.is_langid_und() && !morphed_locale.is_empty() { + continue; + } + morphed_locale.set_langid(current_langid.clone()); + expansion.insert(morphed_locale); } - morphed_locale.set_langid(current_langid.clone()); - expansion.insert(morphed_locale); } + if iter.get().is_und() { + break; + } + iter.step(); } - if iter.get().is_und() { - break; - } - iter.step(); - } - (current_langid, expansion) - }).collect(); + (current_langid, expansion) + }) + .collect(); let selected_locales = expansion_map .into_iter() From 5a4b115780fc65d9e19c41ec3ede3c14a57b494d Mon Sep 17 00:00:00 2001 From: "Shane F. Carr" Date: Mon, 29 Apr 2024 17:49:33 -0700 Subject: [PATCH 20/28] Cleanup --- provider/datagen/src/driver.rs | 20 ++++---------------- 1 file changed, 4 insertions(+), 16 deletions(-) diff --git a/provider/datagen/src/driver.rs b/provider/datagen/src/driver.rs index 2ce1d1c5306..9c200fe3e06 100644 --- a/provider/datagen/src/driver.rs +++ b/provider/datagen/src/driver.rs @@ -320,31 +320,19 @@ impl FromStr for LocaleFamily { match first { b'^' => Ok(Self { langid: Some(LanguageIdentifier::try_from_bytes(remainder)?), - annotations: LocaleFamilyAnnotations { - include_ancestors: true, - include_descendants: false, - }, + annotations: LocaleFamilyAnnotations::without_descendants(), }), b'%' => Ok(Self { langid: Some(LanguageIdentifier::try_from_bytes(remainder)?), - annotations: LocaleFamilyAnnotations { - include_ancestors: false, - include_descendants: true, - }, + annotations: LocaleFamilyAnnotations::without_ancestors(), }), b'@' => Ok(Self { langid: Some(LanguageIdentifier::try_from_bytes(remainder)?), - annotations: LocaleFamilyAnnotations { - include_ancestors: false, - include_descendants: false, - }, + annotations: LocaleFamilyAnnotations::single(), }), b if b.is_ascii_alphanumeric() => Ok(Self { langid: Some(s.parse()?), - annotations: LocaleFamilyAnnotations { - include_ancestors: true, - include_descendants: true, - }, + annotations: 
LocaleFamilyAnnotations::with_descendants(), }), _ => Err(LocaleFamilyParseError::InvalidFamily), } From 29603786a3cedc66ece7373b8a89fe49a88c99a7 Mon Sep 17 00:00:00 2001 From: "Shane F. Carr" Date: Mon, 29 Apr 2024 18:05:44 -0700 Subject: [PATCH 21/28] Fix --- provider/datagen/src/driver.rs | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/provider/datagen/src/driver.rs b/provider/datagen/src/driver.rs index 9c200fe3e06..558d9178ad4 100644 --- a/provider/datagen/src/driver.rs +++ b/provider/datagen/src/driver.rs @@ -1065,7 +1065,10 @@ fn select_locales_for_key( .keys() .chain(requested_families.keys()) .map(|current_langid| { - let mut expansion = HashSet::new(); + let mut expansion = supported_map + .get(&current_langid) + .cloned() + .unwrap_or_default(); if include_full && !selected_langids.contains(current_langid) { log::trace!("Including {current_langid}: the full locale family is present"); selected_langids.insert(current_langid.clone()); @@ -1283,6 +1286,11 @@ fn test_collation_filtering() { language: langid!("ko"), expected: &["ko", "ko-u-co-search", "ko-u-co-searchjl", "ko-u-co-unihan"], }, + TestCase { + include_collations: &[], + language: langid!("und"), + expected: &["und", "und-u-co-emoji", "und-u-co-eor"], + }, ]; for cas in cases { let resolved_locales = select_locales_for_key( From 85380a0e292c7a4cb2a55cdc7cb8d1e75adc6ccd Mon Sep 17 00:00:00 2001 From: "Shane F. 
Carr" Date: Mon, 29 Apr 2024 18:12:59 -0700 Subject: [PATCH 22/28] Clippy --- provider/datagen/src/driver.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/provider/datagen/src/driver.rs b/provider/datagen/src/driver.rs index 558d9178ad4..817c00f87e0 100644 --- a/provider/datagen/src/driver.rs +++ b/provider/datagen/src/driver.rs @@ -1066,7 +1066,7 @@ fn select_locales_for_key( .chain(requested_families.keys()) .map(|current_langid| { let mut expansion = supported_map - .get(&current_langid) + .get(current_langid) .cloned() .unwrap_or_default(); if include_full && !selected_langids.contains(current_langid) { From 48a13557ce3e199355484c71b7526b1e1b2ba262 Mon Sep 17 00:00:00 2001 From: "Shane F. Carr" Date: Mon, 29 Apr 2024 18:38:56 -0700 Subject: [PATCH 23/28] Fix --- provider/datagen/src/driver.rs | 34 ++++++++++++++++++---------------- 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/provider/datagen/src/driver.rs b/provider/datagen/src/driver.rs index 817c00f87e0..5d94c54507d 100644 --- a/provider/datagen/src/driver.rs +++ b/provider/datagen/src/driver.rs @@ -1070,11 +1070,11 @@ fn select_locales_for_key( .cloned() .unwrap_or_default(); if include_full && !selected_langids.contains(current_langid) { - log::trace!("Including {current_langid}: the full locale family is present"); + log::trace!("Including {current_langid}: full locale family: {key}"); selected_langids.insert(current_langid.clone()); } if current_langid.language.is_empty() && current_langid != &LanguageIdentifier::UND { - log::trace!("Including {current_langid}: variants of und are always included"); + log::trace!("Including {current_langid}: und variant: {key}"); selected_langids.insert(current_langid.clone()); } let include_ancestors: bool = requested_families @@ -1085,24 +1085,26 @@ fn select_locales_for_key( loop { // Inherit aux keys and extension keywords from parent locales let parent_langid: LanguageIdentifier = iter.get().get_langid(); - if let Some(parent_locales) 
= supported_map.get(&parent_langid) { - let include_descendants = requested_families - .get(&parent_langid) - .map(|family| family.include_descendants) - .unwrap_or(false); - if include_descendants && !selected_langids.contains(current_langid) { - log::trace!("Including {current_langid}: descendant of {parent_langid}"); - selected_langids.insert(current_langid.clone()); - } - if include_ancestors && !selected_langids.contains(&parent_langid) { - log::trace!("Including {parent_langid}: ancestor of {current_langid}"); - selected_langids.insert(parent_langid); - } - for mut morphed_locale in parent_locales.iter().cloned() { + let maybe_parent_locales = supported_map.get(&parent_langid); + let include_descendants = requested_families + .get(&parent_langid) + .map(|family| family.include_descendants) + .unwrap_or(false); + if include_descendants && !selected_langids.contains(current_langid) { + log::trace!("Including {current_langid}: descendant of {parent_langid}: {key}"); + selected_langids.insert(current_langid.clone()); + } + if include_ancestors && !selected_langids.contains(&parent_langid) { + log::trace!("Including {parent_langid}: ancestor of {current_langid}: {key}"); + selected_langids.insert(parent_langid); + } + if let Some(parent_locales) = maybe_parent_locales { + for morphed_locale in parent_locales.iter() { // Special case: don't pull extensions or aux keys up from the root. if morphed_locale.is_langid_und() && !morphed_locale.is_empty() { continue; } + let mut morphed_locale = morphed_locale.clone(); morphed_locale.set_langid(current_langid.clone()); expansion.insert(morphed_locale); } From 3f9434114c058e337e01a4744e4ffc758e1e8fb6 Mon Sep 17 00:00:00 2001 From: "Shane F. 
Carr" Date: Tue, 30 Apr 2024 09:22:38 -0700 Subject: [PATCH 24/28] Update provider/datagen/src/driver.rs Co-authored-by: Robert Bastian <4706271+robertbastian@users.noreply.github.com> --- provider/datagen/src/driver.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/provider/datagen/src/driver.rs b/provider/datagen/src/driver.rs index 5d94c54507d..31fef154343 100644 --- a/provider/datagen/src/driver.rs +++ b/provider/datagen/src/driver.rs @@ -1077,7 +1077,7 @@ fn select_locales_for_key( log::trace!("Including {current_langid}: und variant: {key}"); selected_langids.insert(current_langid.clone()); } - let include_ancestors: bool = requested_families + let include_ancestors = requested_families .get(current_langid) .map(|family| family.include_ancestors) .unwrap_or(false); From c7b792492e30a72c9cfced93c3fd2b6f611166d4 Mon Sep 17 00:00:00 2001 From: "Shane F. Carr" Date: Tue, 30 Apr 2024 09:23:26 -0700 Subject: [PATCH 25/28] inner -> parts --- provider/datagen/src/driver.rs | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/provider/datagen/src/driver.rs b/provider/datagen/src/driver.rs index 31fef154343..a844bf0a9fb 100644 --- a/provider/datagen/src/driver.rs +++ b/provider/datagen/src/driver.rs @@ -205,7 +205,7 @@ impl LocaleFamily { } } - pub(crate) fn into_inner(self) -> (Option, LocaleFamilyAnnotations) { + pub(crate) fn into_parts(self) -> (Option, LocaleFamilyAnnotations) { (self.langid, self.annotations) } @@ -238,7 +238,7 @@ pub(crate) struct LocaleFamilyBorrowed<'a> { } impl<'a> LocaleFamilyBorrowed<'a> { - pub(crate) fn from_inner( + pub(crate) fn from_parts( inner: (&'a Option, &LocaleFamilyAnnotations), ) -> Self { Self { @@ -507,7 +507,7 @@ impl DatagenDriver { ) -> Self { Self { locales_fallback: Some(LocalesWithOrWithoutFallback::WithFallback { - families: locales.into_iter().map(LocaleFamily::into_inner).collect(), + families: locales.into_iter().map(LocaleFamily::into_parts).collect(), options, }), 
..self @@ -635,11 +635,11 @@ impl DatagenDriver { Some(v) => v .into_iter() .map(LocaleFamily::with_descendants) - .map(LocaleFamily::into_inner) + .map(LocaleFamily::into_parts) .collect(), None => [LocaleFamily::full()] .into_iter() - .map(LocaleFamily::into_inner) + .map(LocaleFamily::into_parts) .collect(), }; @@ -737,7 +737,7 @@ impl DatagenDriver { }; let mut sorted_locales = families .iter() - .map(LocaleFamilyBorrowed::from_inner) + .map(LocaleFamilyBorrowed::from_parts) .map(|family| family.write_to_string().into_owned()) .collect::>(); sorted_locales.sort_unstable(); @@ -1341,8 +1341,8 @@ fn test_family_precedence() { assert_eq!( families, [ - "@en".parse::().unwrap().into_inner(), - "^zh-TW".parse::().unwrap().into_inner() + "@en".parse::().unwrap().into_parts(), + "^zh-TW".parse::().unwrap().into_parts() ] .into_iter() .collect::>() From 7a6ed81730ff1f5209373f901c439de4f03e7ed1 Mon Sep 17 00:00:00 2001 From: "Shane F. Carr" Date: Tue, 30 Apr 2024 09:23:55 -0700 Subject: [PATCH 26/28] langids --- provider/datagen/src/driver.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/provider/datagen/src/driver.rs b/provider/datagen/src/driver.rs index a844bf0a9fb..63f710dcd74 100644 --- a/provider/datagen/src/driver.rs +++ b/provider/datagen/src/driver.rs @@ -706,8 +706,8 @@ impl DatagenDriver { } let (uses_internal_fallback, deduplication_strategy) = match &locales_fallback { - LocalesWithOrWithoutFallback::WithoutFallback { langids: locales } => { - let mut sorted_locales = locales + LocalesWithOrWithoutFallback::WithoutFallback { langids } => { + let mut sorted_locales = langids .iter() .map(|x| x.write_to_string()) .collect::>(); From bddaa9b4796ce0f055afe3f486c47d9ec9b0e801 Mon Sep 17 00:00:00 2001 From: "Shane F. 
Carr" Date: Tue, 30 Apr 2024 09:25:15 -0700 Subject: [PATCH 27/28] comment about false --- provider/datagen/src/driver.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/provider/datagen/src/driver.rs b/provider/datagen/src/driver.rs index 63f710dcd74..9994cbed93c 100644 --- a/provider/datagen/src/driver.rs +++ b/provider/datagen/src/driver.rs @@ -1080,6 +1080,7 @@ fn select_locales_for_key( let include_ancestors = requested_families .get(current_langid) .map(|family| family.include_ancestors) + // default to `false` if the langid was not requested .unwrap_or(false); let mut iter = fallbacker_with_config.fallback_for(current_langid.into()); loop { @@ -1089,6 +1090,7 @@ fn select_locales_for_key( let include_descendants = requested_families .get(&parent_langid) .map(|family| family.include_descendants) + // default to `false` if the langid was not requested .unwrap_or(false); if include_descendants && !selected_langids.contains(current_langid) { log::trace!("Including {current_langid}: descendant of {parent_langid}: {key}"); From 2b9638b14b267161685f295f6ba7f402ba7c1af7 Mon Sep 17 00:00:00 2001 From: "Shane F. Carr" Date: Tue, 30 Apr 2024 09:27:32 -0700 Subject: [PATCH 28/28] Deduplicate candidate langids --- provider/datagen/src/driver.rs | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/provider/datagen/src/driver.rs b/provider/datagen/src/driver.rs index 9994cbed93c..2c7517e4ca7 100644 --- a/provider/datagen/src/driver.rs +++ b/provider/datagen/src/driver.rs @@ -1058,12 +1058,17 @@ fn select_locales_for_key( let fallbacker = fallbacker.as_ref().map_err(|e| *e)?; let fallbacker_with_config = fallbacker.for_config(key.fallback_config()); + // The "candidate" langids that could be exported is the union of requested and supported. 
+ let all_candidate_langids = supported_map + .keys() + .chain(requested_families.keys()) + .collect::>(); + // Compute a map from LanguageIdentifiers to DataLocales, including inherited auxiliary keys // and extensions. Also resolve the ancestors and descendants while building this map. let mut selected_langids = requested_families.keys().cloned().collect::>(); - let expansion_map: HashMap<&LanguageIdentifier, HashSet> = supported_map - .keys() - .chain(requested_families.keys()) + let expansion_map: HashMap<&LanguageIdentifier, HashSet> = all_candidate_langids + .into_iter() .map(|current_langid| { let mut expansion = supported_map .get(current_langid)