Merge pull request #24 from popcornell/txtnorm

Made whisper text norm idempotent
microsoft · Feb 19, 2024 · 144fed3
2 parents bda171d + ea38dbe
commit 144fed3
Show file tree

Hide file tree

Showing 4 changed files with 13 additions and 5 deletions.
diff --git a/utils/text_norm_whisper_like/__init__.py b/utils/text_norm_whisper_like/__init__.py
@@ -15,4 +15,4 @@ def get_txt_norm(txt_norm):
     elif txt_norm == "chime8":
         return EnglishTextNormalizer()
     else:
-        raise NotImplementedError
+        raise NotImplementedError
diff --git a/utils/text_norm_whisper_like/english.json b/utils/text_norm_whisper_like/english.json
@@ -86,7 +86,7 @@
     "archaeologically": "archeologically",
     "archaeologist": "archeologist",
     "archaeologists": "archeologists",
-    "archaeology": "archeology</span>",
+    "archaeology": "archeology",
     "ardour": "ardor",
     "armour": "armor",
     "armoured": "armored",

diff --git a/utils/text_norm_whisper_like/english.py b/utils/text_norm_whisper_like/english.py
@@ -455,8 +455,8 @@ class EnglishSpellingNormalizer:
     [1] https://www.tysto.com/uk-us-spelling-list.html
     """
 
-    def __init__(self):
-        mapping_path = os.path.join(os.path.dirname(__file__), "english.json")
+    def __init__(self, mapping_name="english.json"):
+        mapping_path = os.path.join(os.path.dirname(__file__), mapping_name)
         self.mapping = json.load(open(mapping_path))
 
     def __call__(self, s: str):
@@ -471,6 +471,7 @@ def __init__(self, standardize_numbers=False):
                 "hmm"
             ),
             r"\b(a+h+)\b|\b(ha+)\b": "ah",
+            r"[!?.]+(?=$|\s)": "",  # Okay.. --> okay
             r"\b(o+h+)\b|\b(h+o+)\b": "oh",
             r"\b(u+h+)\b|\b(h+u+)\b|\b(h+u+h+)\b": "uh",
             # common contractions
@@ -534,6 +535,7 @@ def __init__(self, standardize_numbers=False):
         else:
             self.standardize_numbers = None
         self.standardize_spellings = EnglishSpellingNormalizer()
+        self.pre_standardize_spellings = EnglishSpellingNormalizer("pre_english.json")
 
     def __call__(self, s: str):
         s = s.lower()
@@ -542,6 +544,7 @@ def __call__(self, s: str):
         # remove words between brackets
         s = re.sub(r"\(([^)]+?)\)", "", s)
         # remove words between parenthesis
+        s = self.pre_standardize_spellings(s)
         s = re.sub(r"\s+'", "'", s)
         # when there's a space before an apostrophe
 
@@ -557,8 +560,8 @@ def __call__(self, s: str):
 
         if self.standardize_numbers is not None:
             s = self.standardize_numbers(s)
-        s = self.standardize_spellings(s)
 
+        s = self.standardize_spellings(s)
         # now remove prefix/suffix symbols
         # that are not preceded/followed by numbers
         s = re.sub(r"[.$¢€£]([^0-9])", r" \1", s)

diff --git a/utils/text_norm_whisper_like/pre_english.json b/utils/text_norm_whisper_like/pre_english.json
@@ -0,0 +1,5 @@
+{
+  "shan't": "shall not",
+  "han't": "has not",
+  "ain't": "ain not"
+}