[lipi] Support the ISO-15919 `:` separator and improve runtime

Runtime improvement is approximately 2x.

Benchmark: `make profile-time-osx`
Before: 22.52s
After: 11.21s

The main improvements are:
- using `unicode_normalization` instead of our hand-rolled function.
- fixing a bug in the `len_longest_key` calculation that made the value ~3x larger than it should be.

The algorithm itself is still slow, but this change quickly puts out the fire.

In addition, this commit adds `Ord`/`PartialOrd` to a variety of types and improves various comments.
ambuda-org · Nov 20, 2024 · c426cdf · c426cdf
1 parent 6022952
commit c426cdf
Show file tree

Hide file tree

Showing 14 changed files with 202 additions and 43 deletions.
diff --git a/vidyut-chandas/src/akshara.rs b/vidyut-chandas/src/akshara.rs
@@ -1,7 +1,7 @@
 use crate::sounds;
 
 /// The weight of an akshara.
-#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
+#[derive(Clone, Copy, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)]
 pub enum Weight {
     /// A *guru* or heavy syllable.
     G,
@@ -18,7 +18,7 @@ pub enum Weight {
 /// - It must not start with an anusvara or visarga.
 ///
 /// Together, these three rules mean that an input string has exactly one division into aksharas.
-#[derive(Debug, Clone, Eq, PartialEq)]
+#[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)]
 pub struct Akshara {
     pub(crate) text: String,
     pub(crate) weight: Weight,

diff --git a/vidyut-chandas/src/padya.rs b/vidyut-chandas/src/padya.rs
@@ -13,7 +13,7 @@ pub enum PatternWeight {
 }
 
 /// Describes how a vrtta matches some input.
-#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq, PartialOrd)]
+#[derive(Clone, Copy, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)]
 pub enum MatchType {
     /// No match.
     None,
@@ -26,7 +26,7 @@ pub enum MatchType {
 }
 
 /// A traditional shorthand for vrtta weights.
-#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
+#[derive(Clone, Copy, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)]
 pub enum Gana {
     /// *ya* (L G G)
     Ya,
@@ -257,7 +257,7 @@ impl TryFrom<&str> for Vrtta {
     }
 }
 
-#[derive(Copy, Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)]
+#[derive(Clone, Copy, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)]
 pub(crate) enum JatiKind {
     /// A default jati.
     Basic,

diff --git a/vidyut-lipi/Cargo.toml b/vidyut-lipi/Cargo.toml
@@ -18,10 +18,10 @@ clap = { version = "4.0.12", features = ["derive"] }
 wasm-bindgen = "0.2"
 serde-wasm-bindgen = "0.4"
 console_error_panic_hook = "0.1.7"
+unicode-normalization = "0.1.22"
 
 [lib]
 crate-type = ["cdylib", "rlib"]
 
 [dev-dependencies]
 codes-iso-15924 = { version = "0.1.3", default-features = false }
-unicode-normalization = "0.1.22"
diff --git a/vidyut-lipi/examples/sample.rs b/vidyut-lipi/examples/sample.rs
@@ -10,6 +10,6 @@ fn main() {
     }
 
     let mut lipika = Lipika::new();
-    let output = lipika.transliterate(input, Scheme::Slp1, Scheme::Tibetan);
-    println!("{output}");
+    let output = lipika.transliterate(input, Scheme::Slp1, Scheme::Devanagari);
+    _ = lipika.transliterate(output, Scheme::Devanagari, Scheme::Slp1);
 }
diff --git a/vidyut-lipi/scripts/create_schemes.py b/vidyut-lipi/scripts/create_schemes.py
@@ -14,6 +14,7 @@
 
 CRATE_DIR = Path(__file__).parent.parent
 
+# Scripts to use from `common_maps.git`
 ALLOWED = {
     "AHOM",
     "ASSAMESE",
@@ -71,6 +72,7 @@
 }
 
 
+# Human-readable names for Unicode combos
 KEY_NAMES = {
     "\u0905": "A",
     "\u0906": "AA",
@@ -216,6 +218,7 @@ def __init__(self, d):
         for k, v in d.items():
             setattr(self, k, v)
 
+# Mapping from names to Unicode sequences
 C = AttributeDict({v: k for k, v in KEY_NAMES.items()})
 
 
@@ -238,15 +241,16 @@ def __init__(self, d):
 }
 
 
+# Tweaks to the defaults in common_map
 OVERRIDES = {
     "BARAHA":
     # Existing accent marks seem to be mostly wrong -- delete so that we
     # can redefine them elsewhere.
     {
         "\u1ce1": None,
-        "\ua8e1": None,
-        "\ua8e2": None,
-        "\ua8e3": None,
+        C.COMBINING_DIGIT_1: None,
+        C.COMBINING_DIGIT_2: None,
+        C.COMBINING_DIGIT_3: None,
     },
     "GRANTHA": {
         # vowel sign AU
@@ -447,6 +451,7 @@ def __init__(self, d):
 }
 
 
+# Additional characters not present in common_map (or deleted in OVERRIDES)
 EXTENSIONS = {
     "ASSAMESE": [
         (C.CANDRABINDU_VIRAMA, "\u09fc"),
@@ -541,6 +546,16 @@ def __init__(self, d):
         (C.QA, "q"),
         (C.JIHVAMULIYA, "ẖ"),
         (C.UPADHMANIYA, "ḫ"),
+        (C.KA + C.VIRAMA + C.HA, "k:h"),
+        (C.GA + C.VIRAMA + C.HA, "g:h"),
+        (C.CA + C.VIRAMA + C.HA, "c:h"),
+        (C.JA + C.VIRAMA + C.HA, "j:h"),
+        (C.TTA + C.VIRAMA + C.HA, "ṭ:h"),
+        (C.DDA + C.VIRAMA + C.HA, "ḍ:h"),
+        (C.TA + C.VIRAMA + C.HA, "t:h"),
+        (C.DA + C.VIRAMA + C.HA, "d:h"),
+        (C.PA + C.VIRAMA + C.HA, "p:h"),
+        (C.BA + C.VIRAMA + C.HA, "b:h"),
     ],
     "KANNADA": [
         (C.JIHVAMULIYA, "\u0cf1"),
@@ -640,6 +655,7 @@ def _sanitize(s: str) -> str:
 
 
 def to_unique(xs: list) -> list:
+    """Remove duplicates from `xs`."""
     seen = set()
     ret = []
     for x in xs:

diff --git a/vidyut-lipi/src/autogen_schemes.rs b/vidyut-lipi/src/autogen_schemes.rs
@@ -5096,6 +5096,16 @@ pub const ISO_15919: &[(&str, &str)] = &[
     (TAMIL_AYTHAM, "ḳ"),
     (QA, "q"),
     (UPADHMANIYA, "ḫ"),
+    ("क्ह", "k:h"),
+    ("ग्ह", "g:h"),
+    ("च्ह", "c:h"),
+    ("ज्ह", "j:h"),
+    ("ट्ह", "ṭ:h"),
+    ("ड्ह", "ḍ:h"),
+    ("त्ह", "t:h"),
+    ("द्ह", "d:h"),
+    ("प्ह", "p:h"),
+    ("ब्ह", "b:h"),
     (E, "e"),
     (O, "o"),
     (SIGN_E, "e"),

diff --git a/vidyut-lipi/src/mapping.rs b/vidyut-lipi/src/mapping.rs
@@ -379,7 +379,9 @@ impl Mapping {
         for (k, v) in &b_map.numeral_to_int {
             int_to_numeral.insert(*v, k.to_string());
         }
-        let len_longest_key = all.keys().map(|a| a.len()).max().unwrap_or(0);
+        // Take length in *chars*, not in *bytes*.
+        // (Using chars over bytes offers a ~3x speedup in the core transliterate loop.)
+        let len_longest_key = all.keys().map(|a| a.chars().count()).max().unwrap_or(0);
         let numeral_to_int = a_map.numeral_to_int.clone();
 
         Self {
@@ -409,6 +411,7 @@ impl Mapping {
         self.all.get(key)
     }
 
+    /// Dumps this mapping's data to stdout.
     #[allow(unused)]
     pub(crate) fn dump(&self) {
         let mut items: Vec<_> = self.all.iter().collect();

diff --git a/vidyut-lipi/src/scheme.rs b/vidyut-lipi/src/scheme.rs
@@ -613,6 +613,7 @@ impl Scheme {
         }
     }
 
+    #[cfg(target_arch = "wasm32")]
     pub(crate) fn unicode_composition_exclusions(&self) -> &[&str] {
         use crate::unicode_norm as u;
         use Scheme::*;

diff --git a/vidyut-lipi/src/transliterate.rs b/vidyut-lipi/src/transliterate.rs
@@ -70,6 +70,29 @@ fn transliterate_inner(input: &str, mapping: &Mapping) -> String {
             }
         }
 
+        // Special case: from ISO-15919 separator logic for a:i, a:u
+        //
+        // (consonants are handled in the mapping. We can't do the same for a:i and a:u because the
+        // implicit 'a' vowel causes problems.)
+        // TODO: is there a better place to put this?
+        if mapping.from == Scheme::Iso15919
+            && (input[i..].starts_with("a:i") || input[i..].starts_with("a:u"))
+        {
+            if is_to_abugida && had_virama {
+                // 'a' means we should pop virama.
+                output.pop();
+                had_virama = false;
+            } else {
+                // Otherwise, add independent 'a' vowel.
+                if let Some(x) = mapping.get("a") {
+                    output.push_str(x.text());
+                }
+            }
+            // Increment past "a:"
+            i += 2;
+            continue;
+        }
+
         // 1. Find the largest prefix of `input[i..]` that is defined in `mapping`.
         //
         // We must check for the *largest* match to distinguish between `b` and `bh`, `R` and `RR`,
@@ -141,6 +164,20 @@ fn transliterate_inner(input: &str, mapping: &Mapping) -> String {
                     }
                 }
             }
+
+            // Special case: to ISO-15919 separator logic for a:i, a:u
+            //
+            // (consonants are handled in the mapping. We can't do the same for a:i and a:u because
+            // the implicit 'a' vowel causes problems.)
+            // TODO: is there a better place to put this?
+            if mapping.to == Scheme::Iso15919
+                && (output.ends_with("ai") || output.ends_with("au"))
+                && matches!(token.text(), "i" | "u")
+            {
+                output.pop();
+                output.push(':');
+                output.push_str(token.text());
+            }
         } else {
             // ITRANS: `\` skips the next character.
             if is_from_itrans {

diff --git a/vidyut-lipi/src/unicode_norm.rs b/vidyut-lipi/src/unicode_norm.rs
@@ -27,6 +27,7 @@
 
 use crate::scheme::Scheme;
 use rustc_hash::FxHashMap;
+use unicode_normalization::UnicodeNormalization;
 
 type Table = &'static [(&'static str, &'static str)];
 
@@ -117,6 +118,7 @@ pub const DEVANAGARI_NFD: Table = &[
 ];
 
 /// Characters that should not be created during NFD --> NFC.
+#[cfg(target_arch = "wasm32")]
 pub const DEVANAGARI_COMPOSITION_EXCLUSIONS: &[&str] = &[
     "\u{0958}", // ka
     "\u{0959}", // kha
@@ -139,6 +141,7 @@ pub const BENGALI_NFD: Table = &[
 ];
 
 /// Characters that should not be created during NFD --> NFC.
+#[cfg(target_arch = "wasm32")]
 pub const BENGALI_COMPOSITION_EXCLUSIONS: &[&str] = &["\u{09dc}", "\u{09dd}", "\u{09df}"];
 
 /// Spec: <https://unicode.org/charts/PDF/U1000.pdf>
@@ -178,6 +181,7 @@ pub const GURMUKHI_NFD: Table = &[
 ];
 
 /// Spec: <https://unicode.org/charts/PDF/U0A00.pdf>
+#[cfg(target_arch = "wasm32")]
 pub const GURMUKHI_COMPOSITION_EXCLUSIONS: &[&str] = &[
     "\u{0a33}", "\u{0a36}", "\u{0a59}", "\u{0a5a}", "\u{0a5b}", "\u{0a5e}",
 ];
@@ -207,6 +211,7 @@ pub const ORIYA_NFD: Table = &[
     ("\u{0b5d}", "\u{0b22}\u{0b3c}"), // letter rha
 ];
 
+#[cfg(target_arch = "wasm32")]
 pub const ORIYA_COMPOSITION_EXCLUSIONS: &[&str] = &["\u{0b5c}", "\u{0b5d}"];
 
 /// Spec: <https://unicode.org/charts/PDF/U11580.pdf>
@@ -253,9 +258,17 @@ pub const TIRHUTA_NFD: Table = &[
 ///
 /// Only characters that appear in one of our `Scheme`s will be converted. All other characters
 /// will be left as-is.
+#[cfg(not(target_arch = "wasm32"))]
+pub(crate) fn to_nfc(s: &str) -> String {
+    s.nfc().collect()
+}
+
+/// WASM-only version of `to_nfc`.
 ///
-/// TODO: consider using `unicode_normalization` in non-WASM with conditional compilation. Leaning
-/// against due to having to reason about two different systems.
+/// The `unicode_normalization` implementation of this logic is substantially faster (which
+/// motivates using it in non-WASM builds) but also much larger (which motivates avoiding it in
+/// WASM builds).
+#[cfg(target_arch = "wasm32")]
 pub(crate) fn to_nfc(s: &str) -> String {
     let mut map = FxHashMap::default();
     let mut len_longest_key = 0;
@@ -305,6 +318,7 @@ pub(crate) fn to_nfc(s: &str) -> String {
 ///
 /// Our version of `to_nfd` supports only those characters that are part of a `Scheme`. All other
 /// characters are left unchanged.
+#[allow(unused)]
 pub(crate) fn to_nfd(s: &str) -> String {
     let mut map: FxHashMap<String, String> = FxHashMap::default();
 

diff --git a/vidyut-lipi/tests/basic.rs b/vidyut-lipi/tests/basic.rs
@@ -982,6 +982,56 @@ fn iso_15919_bug_no_greedy_match_on_nfd() {
     );
 }
 
+#[test]
+fn iso_15919_colon_separator() {
+    // Consonants
+    assert_two_way_pairwise(&[
+        (
+            Iso15919,
+            "k:ha g:ha c:ha j:ha ṭ:ha ḍ:ha t:ha d:ha p:ha b:ha",
+        ),
+        (Slp1, "kha gha cha jha wha qha tha dha pha bha"),
+        (Devanagari, "क्ह ग्ह च्ह ज्ह ट्ह ड्ह त्ह द्ह प्ह ब्ह"),
+        (Kannada, "ಕ್ಹ ಗ್ಹ ಚ್ಹ ಜ್ಹ ಟ್ಹ ಡ್ಹ ತ್ಹ ದ್ಹ ಪ್ಹ ಬ್ಹ"),
+    ]);
+
+    // Consonants with marks
+    assert_two_way_pairwise(&[
+        (
+            Iso15919,
+            "k:hā g:hā c:hā j:hā ṭ:hā ḍ:hā t:hā d:hā p:hā b:hā",
+        ),
+        (Slp1, "khA ghA chA jhA whA qhA thA dhA phA bhA"),
+        (Devanagari, "क्हा ग्हा च्हा ज्हा ट्हा ड्हा त्हा द्हा प्हा ब्हा"),
+        (Kannada, "ಕ್ಹಾ ಗ್ಹಾ ಚ್ಹಾ ಜ್ಹಾ ಟ್ಹಾ ಡ್ಹಾ ತ್ಹಾ ದ್ಹಾ ಪ್ಹಾ ಬ್ಹಾ"),
+    ]);
+
+    // Consonants with viramas
+    assert_two_way_pairwise(&[
+        (Iso15919, "k:h g:h c:h j:h ṭ:h ḍ:h t:h d:h p:h b:h"),
+        (Slp1, "kh gh ch jh wh qh th dh ph bh"),
+        (Devanagari, "क्ह् ग्ह् च्ह् ज्ह् ट्ह् ड्ह् त्ह् द्ह् प्ह् ब्ह्"),
+        (Kannada, "ಕ್ಹ್ ಗ್ಹ್ ಚ್ಹ್ ಜ್ಹ್ ಟ್ಹ್ ಡ್ಹ್ ತ್ಹ್ ದ್ಹ್ ಪ್ಹ್ ಬ್ಹ್"),
+    ]);
+
+    // Vowels
+    assert_two_way_pairwise(&[
+        (Iso15919, "a:i a:u ka:i ka:u"),
+        (Slp1, "ai au kai kau"),
+        (Devanagari, "अइ अउ कइ कउ"),
+        (Kannada, "ಅಇ ಅಉ ಕಇ ಕಉ"),
+    ]);
+
+    // Regular colons -- ignore
+    // TODO: what's the best policy for handling these?
+    assert_two_way_pairwise(&[
+        (Iso15919, "a: ka: k: a:ā k:ta"),
+        (Slp1, "a: ka: k: a:A k:ta"),
+        (Devanagari, "अ: क: क्: अ:आ क्:त"),
+        (Kannada, "ಅ: ಕ: ಕ್: ಅ:ಆ ಕ್:ತ"),
+    ]);
+}
+
 #[test]
 fn iso_15919_tamil_aytam() {
     assert_transliterate("ஃ", Tamil, Iso15919, "ḳ");