Skip to content

Commit

Permalink
Collator preferences (#5573)
Browse files Browse the repository at this point in the history
  • Loading branch information
robertbastian authored Nov 7, 2024
1 parent 2d26b19 commit 6ad31fb
Show file tree
Hide file tree
Showing 10 changed files with 268 additions and 239 deletions.
1 change: 1 addition & 0 deletions components/collator/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ all-features = true
displaydoc = { workspace = true }
icu_collections = { workspace = true }
icu_normalizer = { workspace = true }
icu_locale_core = { workspace = true }
icu_properties = { workspace = true }
icu_provider = { workspace = true, features = ["macros"] }
utf8_iter = { workspace = true }
Expand Down
30 changes: 14 additions & 16 deletions components/collator/README.md

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

32 changes: 13 additions & 19 deletions components/collator/benches/bench.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,15 +5,7 @@
use criterion::{black_box, criterion_group, criterion_main, BatchSize, BenchmarkId, Criterion};

use icu::collator::*;
use icu::locale::Locale;
use icu_provider::DataLocale;

/// Builds the [`DataLocale`] used as a benchmark parameter from its string form.
///
/// The string is first parsed as a [`Locale`] and then converted.
///
/// # Panics
///
/// Panics if `locale_str` cannot be parsed as a [`Locale`].
fn to_data_locale(locale_str: &str) -> DataLocale {
    let parsed: Locale = locale_str.parse().expect("Failed to parse locale");
    parsed.into()
}
use icu::locale::locale;

pub fn collator_with_locale(criterion: &mut Criterion) {
// Load file content in reverse order vector.
Expand Down Expand Up @@ -99,36 +91,36 @@ pub fn collator_with_locale(criterion: &mut Criterion) {
Strength::Identical,
];
let performance_parameters = [
(to_data_locale("en_US"), vec![&content_latin], &all_strength),
(to_data_locale("da_DK"), vec![&content_latin], &all_strength),
(to_data_locale("fr_CA"), vec![&content_latin], &all_strength),
(locale!("en-US"), vec![&content_latin], &all_strength),
(locale!("da-DK"), vec![&content_latin], &all_strength),
(locale!("fr-CA"), vec![&content_latin], &all_strength),
(
to_data_locale("ja_JP"),
locale!("ja-JP"),
vec![&content_latin, &content_jp_h, &content_jp_k, &content_asian],
&all_strength,
),
(
to_data_locale("zh-u-co-pinyin"),
locale!("zh-u-co-pinyin"),
vec![&content_latin, &content_chinese],
&all_strength,
), // zh_CN
(
to_data_locale("zh-u-co-stroke"),
locale!("zh-u-co-stroke"),
vec![&content_latin, &content_chinese],
&all_strength,
), // zh_TW
(
to_data_locale("ru_RU"),
locale!("ru-RU"),
vec![&content_latin, &content_russian],
&all_strength,
),
(
to_data_locale("th"),
locale!("th"),
vec![&content_latin, &content_thai],
&all_strength,
),
(
to_data_locale("ko_KR"),
locale!("ko-KR"),
vec![&content_latin, &content_korean],
&all_strength,
),
Expand Down Expand Up @@ -156,7 +148,9 @@ pub fn collator_with_locale(criterion: &mut Criterion) {
for (index, strength) in benched_strength.iter().enumerate() {
let mut options = CollatorOptions::default();
options.strength = Some(*strength);
let collator = Collator::try_new(&locale_under_bench, options).unwrap();
let collator =
Collator::try_new(CollatorPreferences::from(&locale_under_bench), options)
.unwrap();
// ICU4X collator performance, sort is locale-aware
group.bench_function(
BenchmarkId::new(
Expand Down
58 changes: 40 additions & 18 deletions components/collator/src/comparison.rs
Original file line number Diff line number Diff line change
Expand Up @@ -70,11 +70,22 @@ struct LocaleSpecificDataHolder {
lithuanian_dot_above: bool,
}

// Declares the `CollatorPreferences` type via the `define_preferences!` macro from
// `icu_locale_core`. Only one collation-specific preference is declared here:
// `collation_type`.
// NOTE(review): `LocaleSpecificDataHolder::try_new_unstable_internal` also reads
// `prefs.locale_prefs`, which is presumably generated by the macro itself — confirm
// against the macro's documentation.
icu_locale_core::preferences::define_preferences!(
/// The preferences for collation.
CollatorPreferences,
{
/// The collation type. This corresponds to the `-u-co` BCP-47 tag.
collation_type: icu_locale_core::preferences::extensions::unicode::keywords::CollationType
}
);

// `CollatorPreferences` is taken by value by the collator constructors in this file;
// `Copy` lets it be handed from one constructor to the next without an explicit clone.
// NOTE(review): this relies on the macro-generated type implementing `Clone` and on
// every field being `Copy` — the compiler enforces both, but confirm this is intended
// to remain true as preferences are added.
impl Copy for CollatorPreferences {}

impl LocaleSpecificDataHolder {
/// The constructor code reused between owned and borrowed cases.
fn try_new_unstable_internal<D>(
provider: &D,
locale: &DataLocale,
prefs: CollatorPreferences,
options: CollatorOptions,
) -> Result<Self, DataError>
where
Expand All @@ -84,24 +95,31 @@ impl LocaleSpecificDataHolder {
+ DataProvider<CollationReorderingV1Marker>
+ ?Sized,
{
let id = DataIdentifierBorrowed::for_marker_attributes_and_locale(
DataMarkerAttributes::from_str_or_panic(
locale.get_single_unicode_ext("co").unwrap_or_default(),
),
locale,
);
let marker_attributes = prefs
.collation_type
.as_ref()
// all collation types are valid marker attributes
.map(|c| DataMarkerAttributes::from_str_or_panic(c.as_str()))
.unwrap_or_default();

let data_locale =
DataLocale::from_preferences_locale::<CollationTailoringV1Marker>(prefs.locale_prefs);
let id = DataIdentifierCow::from_borrowed_and_owned(marker_attributes, data_locale.clone());

let req = DataRequest {
id,
id: id.as_borrowed(),
metadata: {
let mut metadata = DataRequestMetadata::default();
metadata.silent = true;
metadata
},
};

let fallback_id =
DataIdentifierCow::from_borrowed_and_owned(Default::default(), data_locale);

let fallback_req = DataRequest {
id: DataIdentifierBorrowed::for_locale(locale),
id: fallback_id.as_borrowed(),
..Default::default()
};

Expand Down Expand Up @@ -228,14 +246,14 @@ impl Collator {
/// Creates a `CollatorBorrowed` for the given locale preferences and options from
/// compiled data.
///
/// This is a thin wrapper: both arguments are forwarded unchanged to
/// `CollatorBorrowed::try_new`, and its `Result` is returned as-is.
#[cfg(feature = "compiled_data")]
pub fn try_new(
locale: &DataLocale,
prefs: CollatorPreferences,
options: CollatorOptions,
) -> Result<CollatorBorrowed<'static>, DataError> {
CollatorBorrowed::try_new(locale, options)
CollatorBorrowed::try_new(prefs, options)
}

icu_provider::gen_any_buffer_data_constructors!(
(locale, options: CollatorOptions) -> error: DataError,
(prefs: CollatorPreferences, options: CollatorOptions) -> error: DataError,
functions: [
try_new: skip,
try_new_with_any_provider,
Expand All @@ -248,7 +266,7 @@ impl Collator {
#[doc = icu_provider::gen_any_buffer_unstable_docs!(UNSTABLE, Self::try_new)]
pub fn try_new_unstable<D>(
provider: &D,
locale: &DataLocale,
prefs: CollatorPreferences,
options: CollatorOptions,
) -> Result<Self, DataError>
where
Expand All @@ -270,7 +288,7 @@ impl Collator {
provider.load(Default::default())?.payload,
provider.load(Default::default())?.payload,
|| provider.load(Default::default()).map(|r| r.payload),
locale,
prefs,
options,
)
}
Expand All @@ -286,7 +304,7 @@ impl Collator {
DataPayload<CollationSpecialPrimariesV1Marker>,
DataError,
>,
locale: &DataLocale,
prefs: CollatorPreferences,
options: CollatorOptions,
) -> Result<Self, DataError>
where
Expand All @@ -298,7 +316,7 @@ impl Collator {
+ ?Sized,
{
let locale_dependent =
LocaleSpecificDataHolder::try_new_unstable_internal(provider, locale, options)?;
LocaleSpecificDataHolder::try_new_unstable_internal(provider, prefs, options)?;

// TODO: redesign Korean search collation handling
if jamo.get().ce32s.len() != JAMO_COUNT {
Expand Down Expand Up @@ -355,9 +373,13 @@ pub struct CollatorBorrowed<'a> {
impl CollatorBorrowed<'static> {
/// Creates a collator for the given locale and options from compiled data.
#[cfg(feature = "compiled_data")]
pub fn try_new(locale: &DataLocale, options: CollatorOptions) -> Result<Self, DataError> {
pub fn try_new(
prefs: CollatorPreferences,
options: CollatorOptions,
) -> Result<Self, DataError> {
// These are assigned to locals in order to keep the code after these assignments
// copypaste-compatible with `Collator::try_new_unstable_internal`.

let provider = &crate::provider::Baked;
let decompositions =
icu_normalizer::provider::Baked::SINGLETON_CANONICAL_DECOMPOSITION_DATA_V1_MARKER;
Expand All @@ -367,7 +389,7 @@ impl CollatorBorrowed<'static> {
let jamo = crate::provider::Baked::SINGLETON_COLLATION_JAMO_V1_MARKER;

let locale_dependent =
LocaleSpecificDataHolder::try_new_unstable_internal(provider, locale, options)?;
LocaleSpecificDataHolder::try_new_unstable_internal(provider, prefs, options)?;

// TODO: redesign Korean search collation handling
if jamo.ce32s.len() != JAMO_COUNT {
Expand Down
Loading

0 comments on commit 6ad31fb

Please sign in to comment.