From 3c6b7e49e6ce672ce5d47f6ba67cabfa46981520 Mon Sep 17 00:00:00 2001 From: "Shane F. Carr" Date: Sat, 21 Oct 2023 18:57:47 -0700 Subject: [PATCH] ZeroTrie: Add some tests for locale with aux key --- Cargo.lock | 2 + experimental/zerotrie/Cargo.toml | 6 + experimental/zerotrie/tests/data/data.rs | 6 + .../zerotrie/tests/locale_aux_test.rs | 143 ++++++++++++++++++ 4 files changed, 157 insertions(+) create mode 100644 experimental/zerotrie/tests/locale_aux_test.rs diff --git a/Cargo.lock b/Cargo.lock index 6eabb28687f..14bea638b69 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4430,12 +4430,14 @@ dependencies = [ "databake", "displaydoc", "icu_benchmark_macros", + "icu_locid", "litemap", "postcard", "rand", "rand_pcg", "serde", "serde_json", + "writeable", "yoke", "zerofrom", "zerovec", diff --git a/experimental/zerotrie/Cargo.toml b/experimental/zerotrie/Cargo.toml index d7ee49cb84e..44e97448736 100644 --- a/experimental/zerotrie/Cargo.toml +++ b/experimental/zerotrie/Cargo.toml @@ -42,6 +42,8 @@ rand_pcg = "0.3" serde = { version = "1.0", default-features = false } serde_json = "1.0" zerovec = { path = "../../utils/zerovec", features = ["serde", "hashmap"] } +icu_locid = { path = "../../components/locid" } +writeable = { path = "../../utils/writeable" } [target.'cfg(not(target_arch = "wasm32"))'.dev-dependencies] criterion = "0.4" @@ -71,3 +73,7 @@ required-features = ["alloc", "litemap"] [[test]] name = "builder_test" required-features = ["alloc", "litemap"] + +[[test]] +name = "locale_aux_test" +required-features = ["alloc", "litemap"] diff --git a/experimental/zerotrie/tests/data/data.rs b/experimental/zerotrie/tests/data/data.rs index 9de102251b0..af631fe2c1d 100644 --- a/experimental/zerotrie/tests/data/data.rs +++ b/experimental/zerotrie/tests/data/data.rs @@ -2202,3 +2202,9 @@ pub mod short_subtags_10pct { "zia", ]; } + +#[allow(dead_code)] +pub mod locales_with_aux { + pub static NUM_UNIQUE_BLOBS: usize = 411; + pub static STRINGS: &[&str] = &["af-x-3", 
"af-x-3s", "af-x-4", "af-x-4s", "af-x-5", "af-x-5s", "am-x-3", "am-x-3s", "am-x-4", "am-x-4s", "am-x-5", "am-x-5s", "ar-DZ-x-3", "ar-DZ-x-3s", "ar-DZ-x-4", "ar-DZ-x-4s", "ar-DZ-x-5", "ar-DZ-x-5s", "ar-IQ-x-3", "ar-IQ-x-3s", "ar-IQ-x-4", "ar-IQ-x-4s", "ar-IQ-x-5", "ar-IQ-x-5s", "ar-JO-x-3", "ar-JO-x-3s", "ar-JO-x-4", "ar-JO-x-4s", "ar-JO-x-5", "ar-JO-x-5s", "ar-LB-x-3", "ar-LB-x-3s", "ar-LB-x-4", "ar-LB-x-4s", "ar-LB-x-5", "ar-LB-x-5s", "ar-MA-x-3", "ar-MA-x-3s", "ar-MA-x-4", "ar-MA-x-4s", "ar-MA-x-5", "ar-MA-x-5s", "ar-MR-x-3", "ar-MR-x-3s", "ar-MR-x-4", "ar-MR-x-4s", "ar-MR-x-5", "ar-MR-x-5s", "ar-PS-x-3", "ar-PS-x-3s", "ar-PS-x-4", "ar-PS-x-4s", "ar-PS-x-5", "ar-PS-x-5s", "ar-SY-x-3", "ar-SY-x-3s", "ar-SY-x-4", "ar-SY-x-4s", "ar-SY-x-5", "ar-SY-x-5s", "ar-TN-x-3", "ar-TN-x-3s", "ar-TN-x-4", "ar-TN-x-4s", "ar-TN-x-5", "ar-TN-x-5s", "ar-x-3", "ar-x-3s", "ar-x-4", "ar-x-4s", "ar-x-5", "ar-x-5s", "as-x-3", "as-x-3s", "as-x-4", "as-x-4s", "as-x-5", "as-x-5s", "ast-x-3", "ast-x-3s", "ast-x-4", "ast-x-4s", "ast-x-5", "ast-x-5s", "az-x-3", "az-x-3s", "az-x-5", "az-x-5s", "be-x-3", "be-x-3s", "be-x-4", "be-x-4s", "be-x-5", "be-x-5s", "bg-x-3", "bg-x-3s", "bg-x-4", "bg-x-4s", "bg-x-5", "bg-x-5s", "bgc-x-3", "bgc-x-3s", "bgc-x-5", "bgc-x-5s", "bho-x-3", "bho-x-3s", "bho-x-5", "bho-x-5s", "bn-IN-x-3", "bn-IN-x-3s", "bn-IN-x-4", "bn-IN-x-4s", "bn-x-3", "bn-x-3s", "bn-x-4", "bn-x-4s", "bn-x-5", "bn-x-5s", "br-x-3", "br-x-3s", "br-x-4", "br-x-4s", "br-x-5", "br-x-5s", "brx-x-3", "brx-x-3s", "brx-x-4", "brx-x-4s", "brx-x-5", "brx-x-5s", "bs-Cyrl-x-3", "bs-Cyrl-x-3s", "bs-Cyrl-x-4", "bs-Cyrl-x-4s", "bs-Cyrl-x-5", "bs-Cyrl-x-5s", "bs-x-3", "bs-x-3s", "bs-x-4", "bs-x-4s", "bs-x-5", "bs-x-5s", "ca-x-3", "ca-x-3s", "ca-x-4", "ca-x-4s", "ca-x-5", "ca-x-5s", "ceb-x-3", "ceb-x-3s", "ceb-x-4", "ceb-x-4s", "ceb-x-5", "ceb-x-5s", "chr-x-3", "chr-x-3s", "chr-x-4", "chr-x-4s", "chr-x-5", "chr-x-5s", "cs-x-3", "cs-x-3s", "cs-x-5", "cs-x-5s", "cv-x-3", "cv-x-3s", "cv-x-4", "cv-x-4s", "cv-x-5", 
"cv-x-5s", "cy-x-3", "cy-x-3s", "cy-x-4", "cy-x-4s", "cy-x-5", "cy-x-5s", "da-x-3", "da-x-3s", "da-x-4", "da-x-4s", "da-x-5", "da-x-5s", "de-AT-x-3", "de-AT-x-3s", "de-AT-x-5", "de-AT-x-5s", "de-IT-x-3", "de-IT-x-3s", "de-IT-x-5", "de-IT-x-5s", "de-x-3", "de-x-3s", "de-x-4", "de-x-4s", "de-x-5", "de-x-5s", "doi-x-3", "doi-x-3s", "doi-x-4", "doi-x-4s", "doi-x-5", "doi-x-5s", "dsb-x-3", "dsb-x-3s", "dsb-x-4", "dsb-x-4s", "dsb-x-5", "dsb-x-5s", "el-polyton-x-3", "el-polyton-x-5", "el-polyton-x-5s", "el-x-3", "el-x-3s", "el-x-4", "el-x-4s", "el-x-5", "el-x-5s", "en-001-x-3", "en-001-x-3s", "en-150-x-3", "en-150-x-3s", "en-AG-x-3", "en-AG-x-3s", "en-AI-x-3", "en-AI-x-3s", "en-AT-x-3", "en-AT-x-3s", "en-AU-x-3", "en-AU-x-3s", "en-BB-x-3", "en-BB-x-3s", "en-BE-x-3", "en-BE-x-3s", "en-BM-x-3", "en-BM-x-3s", "en-BS-x-3", "en-BS-x-3s", "en-BW-x-3", "en-BW-x-3s", "en-BZ-x-3", "en-BZ-x-3s", "en-CC-x-3", "en-CC-x-3s", "en-CH-x-3", "en-CH-x-3s", "en-CK-x-3", "en-CK-x-3s", "en-CM-x-3", "en-CM-x-3s", "en-CX-x-3", "en-CX-x-3s", "en-CY-x-3", "en-CY-x-3s", "en-DE-x-3", "en-DE-x-3s", "en-DG-x-3", "en-DG-x-3s", "en-DK-x-3", "en-DK-x-3s", "en-DM-x-3", "en-DM-x-3s", "en-ER-x-3", "en-ER-x-3s", "en-FI-x-3", "en-FI-x-3s", "en-FJ-x-3", "en-FJ-x-3s", "en-FK-x-3", "en-FK-x-3s", "en-FM-x-3", "en-FM-x-3s", "en-GB-x-3", "en-GB-x-3s", "en-GD-x-3", "en-GD-x-3s", "en-GG-x-3", "en-GG-x-3s", "en-GH-x-3", "en-GH-x-3s", "en-GI-x-3", "en-GI-x-3s", "en-GM-x-3", "en-GM-x-3s", "en-GY-x-3", "en-GY-x-3s", "en-HK-x-3", "en-HK-x-3s", "en-IE-x-3", "en-IE-x-3s", "en-IL-x-3", "en-IL-x-3s", "en-IM-x-3", "en-IM-x-3s", "en-IN-x-3", "en-IN-x-3s", "en-IO-x-3", "en-IO-x-3s", "en-JE-x-3", "en-JE-x-3s", "en-JM-x-3", "en-JM-x-3s", "en-KE-x-3", "en-KE-x-3s", "en-KI-x-3", "en-KI-x-3s", "en-KN-x-3", "en-KN-x-3s", "en-KY-x-3", "en-KY-x-3s", "en-LC-x-3", "en-LC-x-3s", "en-LR-x-3", "en-LR-x-3s", "en-LS-x-3", "en-LS-x-3s", "en-MG-x-3", "en-MG-x-3s", "en-MO-x-3", "en-MO-x-3s", "en-MS-x-3", "en-MS-x-3s", "en-MT-x-3", "en-MT-x-3s", 
"en-MU-x-3", "en-MU-x-3s", "en-MV-x-3", "en-MV-x-3s", "en-MW-x-3", "en-MW-x-3s", "en-MY-x-3", "en-MY-x-3s", "en-NA-x-3", "en-NA-x-3s", "en-NF-x-3", "en-NF-x-3s", "en-NG-x-3", "en-NG-x-3s", "en-NL-x-3", "en-NL-x-3s", "en-NR-x-3", "en-NR-x-3s", "en-NU-x-3", "en-NU-x-3s", "en-NZ-x-3", "en-NZ-x-3s", "en-PG-x-3", "en-PG-x-3s", "en-PK-x-3", "en-PK-x-3s", "en-PN-x-3", "en-PN-x-3s", "en-PW-x-3", "en-PW-x-3s", "en-RW-x-3", "en-RW-x-3s", "en-SB-x-3", "en-SB-x-3s", "en-SC-x-3", "en-SC-x-3s", "en-SD-x-3", "en-SD-x-3s", "en-SE-x-3", "en-SE-x-3s", "en-SG-x-3", "en-SG-x-3s", "en-SH-x-3", "en-SH-x-3s", "en-SI-x-3", "en-SI-x-3s", "en-SL-x-3", "en-SL-x-3s", "en-SS-x-3", "en-SS-x-3s", "en-SX-x-3", "en-SX-x-3s", "en-SZ-x-3", "en-SZ-x-3s", "en-TC-x-3", "en-TC-x-3s", "en-TK-x-3", "en-TK-x-3s", "en-TO-x-3", "en-TO-x-3s", "en-TT-x-3", "en-TT-x-3s", "en-TV-x-3", "en-TV-x-3s", "en-TZ-x-3", "en-TZ-x-3s", "en-UG-x-3", "en-UG-x-3s", "en-VC-x-3", "en-VC-x-3s", "en-VG-x-3", "en-VG-x-3s", "en-VU-x-3", "en-VU-x-3s", "en-WS-x-3", "en-WS-x-3s", "en-ZA-x-3", "en-ZA-x-3s", "en-ZM-x-3", "en-ZM-x-3s", "en-ZW-x-3", "en-ZW-x-3s", "en-x-3", "en-x-3s", "en-x-4", "en-x-4s", "en-x-5", "en-x-5s", "es-CL-x-3s", "es-CO-x-3s", "es-PE-x-3", "es-PE-x-3s", "es-PE-x-5", "es-PE-x-5s", "es-PY-x-3", "es-PY-x-3s", "es-UY-x-3", "es-UY-x-3s", "es-UY-x-5", "es-UY-x-5s", "es-VE-x-3", "es-VE-x-3s", "es-x-3", "es-x-3s", "es-x-4", "es-x-4s", "es-x-5", "es-x-5s", "et-x-3", "et-x-3s", "et-x-4", "et-x-4s", "et-x-5", "et-x-5s", "eu-x-3", "eu-x-3s", "eu-x-4", "eu-x-4s", "eu-x-5", "eu-x-5s", "fa-AF-x-3", "fa-AF-x-3s", "fa-AF-x-4", "fa-AF-x-4s", "fa-AF-x-5", "fa-AF-x-5s", "fa-x-3", "fa-x-3s", "fa-x-4", "fa-x-4s", "fa-x-5", "fa-x-5s", "ff-Adlm-x-3", "ff-Adlm-x-3s", "ff-Adlm-x-4", "ff-Adlm-x-4s", "ff-Adlm-x-5", "ff-Adlm-x-5s", "fi-x-3", "fi-x-3s", "fi-x-4", "fi-x-4s", "fi-x-5", "fi-x-5s", "fil-x-3", "fil-x-3s", "fil-x-4", "fil-x-4s", "fil-x-5", "fil-x-5s", "fo-x-3", "fo-x-3s", "fo-x-4", "fo-x-4s", "fo-x-5", "fo-x-5s", "fr-CA-x-3", 
"fr-CA-x-3s", "fr-MA-x-3", "fr-MA-x-3s", "fr-x-3", "fr-x-3s", "fr-x-4", "fr-x-4s", "fr-x-5", "fr-x-5s", "ga-x-3", "ga-x-3s", "ga-x-4", "ga-x-4s", "ga-x-5", "ga-x-5s", "gd-x-3", "gd-x-3s", "gd-x-4", "gd-x-4s", "gd-x-5", "gd-x-5s", "gl-x-3", "gl-x-3s", "gl-x-4", "gl-x-4s", "gl-x-5", "gl-x-5s", "gu-x-3", "gu-x-3s", "gu-x-4", "gu-x-4s", "gu-x-5", "gu-x-5s", "ha-x-3", "ha-x-3s", "ha-x-4", "ha-x-4s", "ha-x-5", "ha-x-5s", "he-x-3", "he-x-3s", "he-x-5", "he-x-5s", "hi-Latn-x-3", "hi-Latn-x-3s", "hi-Latn-x-4", "hi-Latn-x-4s", "hi-Latn-x-5", "hi-Latn-x-5s", "hi-x-3", "hi-x-3s", "hi-x-4", "hi-x-4s", "hi-x-5", "hi-x-5s", "hr-x-3", "hr-x-3s", "hr-x-4", "hr-x-4s", "hr-x-5", "hr-x-5s", "hsb-x-3", "hsb-x-3s", "hsb-x-4", "hsb-x-4s", "hsb-x-5", "hsb-x-5s", "hu-x-3", "hu-x-3s", "hu-x-4", "hu-x-4s", "hu-x-5", "hu-x-5s", "hy-x-3", "hy-x-3s", "hy-x-4", "hy-x-4s", "hy-x-5", "hy-x-5s", "ia-x-3", "ia-x-3s", "ia-x-4", "ia-x-5", "ia-x-5s", "id-x-3", "id-x-3s", "id-x-4", "id-x-4s", "id-x-5", "id-x-5s", "ig-x-3", "ig-x-3s", "ig-x-4", "ig-x-4s", "ig-x-5", "ig-x-5s", "is-x-3", "is-x-3s", "is-x-4", "is-x-4s", "is-x-5", "is-x-5s", "it-x-3", "it-x-3s", "it-x-4", "it-x-4s", "it-x-5", "it-x-5s", "ja-x-3", "ja-x-3s", "ja-x-5", "ja-x-5s", "jv-x-3", "jv-x-3s", "jv-x-4", "jv-x-4s", "jv-x-5", "jv-x-5s", "ka-x-3", "ka-x-3s", "ka-x-4", "ka-x-4s", "ka-x-5", "ka-x-5s", "kea-x-3", "kea-x-3s", "kea-x-4", "kea-x-4s", "kea-x-5", "kea-x-5s", "kgp-x-3", "kgp-x-3s", "kgp-x-4", "kgp-x-4s", "kgp-x-5", "kgp-x-5s", "kk-x-3", "kk-x-3s", "kk-x-4", "kk-x-4s", "kk-x-5", "kk-x-5s", "km-x-3", "km-x-3s", "km-x-4", "km-x-4s", "km-x-5", "km-x-5s", "kn-x-3", "kn-x-3s", "kn-x-4", "kn-x-4s", "kn-x-5", "kn-x-5s", "ko-x-3", "ko-x-3s", "ko-x-4", "ko-x-4s", "ko-x-5", "ko-x-5s", "kok-x-3", "kok-x-3s", "kok-x-5", "kok-x-5s", "ks-Deva-x-3", "ks-Deva-x-3s", "ks-Deva-x-4", "ks-Deva-x-4s", "ks-Deva-x-5", "ks-Deva-x-5s", "ks-x-3", "ks-x-3s", "ks-x-4", "ks-x-4s", "ks-x-5", "ks-x-5s", "ky-x-3", "ky-x-3s", "ky-x-4", "ky-x-4s", "ky-x-5", 
"ky-x-5s", "lo-x-3", "lo-x-3s", "lo-x-5", "lo-x-5s", "lt-x-3", "lt-x-3s", "lt-x-4", "lt-x-4s", "lt-x-5", "lt-x-5s", "lv-x-3", "lv-x-3s", "lv-x-4", "lv-x-4s", "lv-x-5", "lv-x-5s", "mai-x-3", "mai-x-3s", "mai-x-4", "mai-x-4s", "mai-x-5", "mai-x-5s", "mi-x-3", "mi-x-3s", "mi-x-4", "mi-x-4s", "mi-x-5", "mi-x-5s", "mk-x-3", "mk-x-3s", "mk-x-4", "mk-x-4s", "mk-x-5", "mk-x-5s", "ml-x-3", "ml-x-3s", "ml-x-4", "ml-x-4s", "ml-x-5", "ml-x-5s", "mn-x-3", "mn-x-3s", "mn-x-4", "mn-x-4s", "mn-x-5", "mn-x-5s", "mni-x-3", "mni-x-3s", "mni-x-4", "mni-x-4s", "mni-x-5", "mni-x-5s", "mr-x-3", "mr-x-3s", "mr-x-4", "mr-x-4s", "mr-x-5", "mr-x-5s", "ms-x-3", "ms-x-3s", "ms-x-4", "ms-x-4s", "ms-x-5", "ms-x-5s", "my-x-3", "my-x-3s", "my-x-4", "my-x-4s", "my-x-5", "my-x-5s", "nb-x-3", "nb-x-3s", "nb-x-4", "nb-x-4s", "nb-x-5", "nb-x-5s", "ne-x-3", "ne-x-3s", "ne-x-4", "ne-x-4s", "ne-x-5", "ne-x-5s", "nl-x-3", "nl-x-3s", "nl-x-4", "nl-x-4s", "nl-x-5", "nl-x-5s", "nn-x-3", "nn-x-3s", "nn-x-4", "nn-x-4s", "nn-x-5", "nn-x-5s", "no-x-3", "no-x-3s", "no-x-4", "no-x-4s", "no-x-5", "no-x-5s", "or-x-3", "or-x-3s", "or-x-4", "or-x-4s", "or-x-5", "or-x-5s", "pa-x-3", "pa-x-3s", "pa-x-4", "pa-x-4s", "pa-x-5", "pa-x-5s", "pcm-x-3", "pcm-x-3s", "pcm-x-4", "pcm-x-4s", "pcm-x-5", "pcm-x-5s", "pl-x-3", "pl-x-3s", "pl-x-4", "pl-x-4s", "pl-x-5", "pl-x-5s", "ps-x-3", "ps-x-3s", "ps-x-4", "ps-x-5", "ps-x-5s", "pt-x-3", "pt-x-3s", "pt-x-4", "pt-x-4s", "pt-x-5", "pt-x-5s", "qu-x-3", "qu-x-3s", "qu-x-5", "qu-x-5s", "raj-x-3", "raj-x-3s", "raj-x-5", "raj-x-5s", "rm-x-3", "rm-x-3s", "rm-x-4", "rm-x-4s", "rm-x-5", "rm-x-5s", "ro-x-3", "ro-x-3s", "ro-x-4", "ro-x-4s", "ro-x-5", "ro-x-5s", "ru-x-3", "ru-x-3s", "ru-x-4", "ru-x-4s", "ru-x-5", "ru-x-5s", "sa-x-3", "sa-x-3s", "sa-x-4", "sa-x-5", "sa-x-5s", "sat-x-3", "sat-x-3s", "sat-x-4", "sat-x-4s", "sat-x-5", "sat-x-5s", "sc-x-3", "sc-x-3s", "sc-x-4", "sc-x-4s", "sc-x-5", "sc-x-5s", "sd-Deva-x-3", "sd-Deva-x-3s", "sd-Deva-x-4", "sd-Deva-x-4s", "sd-Deva-x-5", "sd-Deva-x-5s", 
"sd-x-3", "sd-x-3s", "sd-x-4", "sd-x-4s", "sd-x-5", "sd-x-5s", "si-x-3", "si-x-3s", "si-x-4", "si-x-4s", "si-x-5", "si-x-5s", "sk-x-3", "sk-x-3s", "sk-x-4", "sk-x-4s", "sk-x-5", "sk-x-5s", "sl-x-3", "sl-x-3s", "sl-x-4", "sl-x-4s", "sl-x-5", "sl-x-5s", "so-x-3", "so-x-3s", "so-x-4", "so-x-4s", "so-x-5", "so-x-5s", "sq-x-3", "sq-x-3s", "sq-x-4", "sq-x-4s", "sq-x-5", "sq-x-5s", "sr-Latn-XK-x-3", "sr-Latn-XK-x-3s", "sr-Latn-x-3", "sr-Latn-x-3s", "sr-Latn-x-4", "sr-Latn-x-4s", "sr-Latn-x-5", "sr-Latn-x-5s", "sr-ME-x-3", "sr-ME-x-3s", "sr-XK-x-3", "sr-XK-x-3s", "sr-x-3", "sr-x-3s", "sr-x-4", "sr-x-4s", "sr-x-5", "sr-x-5s", "su-x-3", "su-x-3s", "su-x-4", "su-x-4s", "su-x-5", "su-x-5s", "sv-x-3", "sv-x-3s", "sv-x-4", "sv-x-4s", "sv-x-5", "sv-x-5s", "sw-x-3", "sw-x-3s", "sw-x-4", "sw-x-4s", "sw-x-5", "sw-x-5s", "ta-x-3", "ta-x-3s", "ta-x-4", "ta-x-4s", "ta-x-5", "ta-x-5s", "te-x-3", "te-x-3s", "te-x-4", "te-x-4s", "te-x-5", "te-x-5s", "tg-x-3", "tg-x-3s", "tg-x-4", "tg-x-4s", "tg-x-5", "tg-x-5s", "th-x-3", "th-x-3s", "th-x-4", "th-x-4s", "th-x-5", "th-x-5s", "ti-x-3", "ti-x-3s", "ti-x-4", "ti-x-4s", "ti-x-5", "ti-x-5s", "tk-x-3", "tk-x-3s", "tk-x-4", "tk-x-4s", "tk-x-5", "tk-x-5s", "to-x-3", "to-x-3s", "to-x-4", "to-x-4s", "to-x-5", "to-x-5s", "tr-x-3", "tr-x-3s", "tr-x-4", "tr-x-4s", "tr-x-5", "tr-x-5s", "tt-x-3", "tt-x-3s", "tt-x-5", "tt-x-5s", "uk-x-3", "uk-x-3s", "uk-x-4", "uk-x-4s", "uk-x-5", "uk-x-5s", "und-x-3", "und-x-3s", "und-x-4", "und-x-4s", "und-x-5", "und-x-5s", "ur-x-3", "ur-x-3s", "ur-x-4", "ur-x-4s", "ur-x-5", "ur-x-5s", "uz-Cyrl-x-3", "uz-Cyrl-x-3s", "uz-Cyrl-x-4", "uz-Cyrl-x-4s", "uz-Cyrl-x-5", "uz-Cyrl-x-5s", "uz-x-3", "uz-x-3s", "uz-x-4", "uz-x-4s", "uz-x-5", "uz-x-5s", "vi-x-3", "vi-x-3s", "vi-x-5", "vi-x-5s", "wo-x-3", "wo-x-3s", "wo-x-5", "wo-x-5s", "xh-x-3", "xh-x-3s", "xh-x-5", "xh-x-5s", "yo-BJ-x-3", "yo-BJ-x-3s", "yo-BJ-x-4", "yo-BJ-x-4s", "yo-BJ-x-5", "yo-BJ-x-5s", "yo-x-3", "yo-x-3s", "yo-x-4", "yo-x-4s", "yo-x-5", "yo-x-5s", "yrl-x-3", 
"yrl-x-3s", "yrl-x-4", "yrl-x-4s", "yrl-x-5", "yrl-x-5s", "yue-Hans-x-3", "yue-Hans-x-3s", "yue-Hans-x-5", "yue-Hans-x-5s", "yue-x-3", "yue-x-3s", "yue-x-5", "yue-x-5s", "zh-Hant-x-3", "zh-Hant-x-3s", "zh-Hant-x-5", "zh-Hant-x-5s", "zh-x-3", "zh-x-3s", "zh-x-5", "zh-x-5s", "zu-x-3", "zu-x-3s", "zu-x-4", "zu-x-4s", "zu-x-5", "zu-x-5s"]; +} diff --git a/experimental/zerotrie/tests/locale_aux_test.rs b/experimental/zerotrie/tests/locale_aux_test.rs new file mode 100644 index 00000000000..9b3d54ea116 --- /dev/null +++ b/experimental/zerotrie/tests/locale_aux_test.rs @@ -0,0 +1,143 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). + +use icu_locid::extensions::private::Private; +use icu_locid::Locale; +use litemap::LiteMap; +use std::collections::BTreeSet; +use writeable::Writeable; +use zerotrie::ZeroTriePerfectHash; +use zerotrie::ZeroTrieSimpleAscii; + +mod testdata { + include!("data/data.rs"); +} + +use testdata::locales_with_aux::{NUM_UNIQUE_BLOBS, STRINGS}; +use testdata::strings_to_litemap; + +#[test] +fn test_combined() { + let litemap = strings_to_litemap(STRINGS); + + let trie = ZeroTrieSimpleAscii::try_from(&litemap).unwrap(); + + // Lookup table size: + assert_eq!(trie.byte_len(), 5104); + + // Size including pointer array: + assert_eq!( + trie.byte_len() + NUM_UNIQUE_BLOBS * core::mem::size_of::<usize>(), + 8392 + ); + + let trie = ZeroTriePerfectHash::try_from(&litemap).unwrap(); + + // Lookup table size: + assert_eq!(trie.byte_len(), 5157); + + // Size including pointer array: + assert_eq!( + trie.byte_len() + NUM_UNIQUE_BLOBS * core::mem::size_of::<usize>(), + 8445 + ); + + let total_str_len = litemap.iter_keys().map(|k| k.len()).sum::<usize>(); + assert_eq!(total_str_len, 8115); + + // Lookup table size: + assert_eq!( + total_str_len + STRINGS.len() * core::mem::size_of::<usize>(), + 16531 + ); + + // Size including pointer
array: (2x for the lookup array and value array) + assert_eq!( + total_str_len + 2 * STRINGS.len() * core::mem::size_of::<usize>(), + 24947 + ); + + // Size including u16 pointer array: + assert_eq!( + total_str_len + + STRINGS.len() * core::mem::size_of::<usize>() + + STRINGS.len() * core::mem::size_of::<u16>() + + NUM_UNIQUE_BLOBS * core::mem::size_of::<usize>(), + 21923 + ); +} + +#[test] +fn test_aux_split() { + let locales: Vec<Locale> = STRINGS.iter().map(|s| s.parse().unwrap()).collect(); + + let aux_keys: BTreeSet<&Private> = locales.iter().map(|l| &l.extensions.private).collect(); + assert_eq!(aux_keys.len(), 6); + + let mut cumulative_index = 0; + let mut total_simpleascii_len = 0; + let mut total_perfecthash_len = 0; + let mut unique_locales = BTreeSet::new(); + for private in aux_keys.iter() { + let current_locales: Vec<Locale> = locales + .iter() + .filter(|l| l.extensions.private == **private) + .map(|l| { + let mut l = l.clone(); + l.extensions.private = Private::default(); + l + }) + .collect(); + let litemap: LiteMap<Vec<u8>, usize> = current_locales + .iter() + .map(|l| { + (l.write_to_string().into_owned().into_bytes(), { + cumulative_index += 1; + cumulative_index - 1 + }) + }) + .collect(); + + let trie = ZeroTrieSimpleAscii::try_from(&litemap).unwrap(); + total_simpleascii_len += trie.byte_len(); + + let trie = ZeroTriePerfectHash::try_from(&litemap).unwrap(); + total_perfecthash_len += trie.byte_len(); + + for k in litemap.iter_keys() { + unique_locales.insert(k.clone()); + } + } + assert_eq!(cumulative_index, locales.len()); + + assert_eq!(total_simpleascii_len, 5098); + assert_eq!(total_perfecthash_len, 5302); + + let total_unique_locale_str_len = unique_locales.iter().map(|v| v.len()).sum::<usize>(); + assert_eq!(total_unique_locale_str_len, 945); + + // Size including pointer array: + assert_eq!( + total_simpleascii_len + NUM_UNIQUE_BLOBS * core::mem::size_of::<usize>(), + 8386 + ); + assert_eq!( + total_perfecthash_len + NUM_UNIQUE_BLOBS * core::mem::size_of::<usize>(), + 8590 + ); + // 2x for the lookup
arrays and value arrays + assert_eq!( + total_unique_locale_str_len + 2 * STRINGS.len() * core::mem::size_of::<usize>(), + 17777 + ); + + // Size including u16 pointer array: + assert_eq!( + total_unique_locale_str_len + + STRINGS.len() * core::mem::size_of::<usize>() + + STRINGS.len() * core::mem::size_of::<u16>() + + NUM_UNIQUE_BLOBS * core::mem::size_of::<usize>(), + 14753 + ); +}