Skip to content

Commit

Permalink
Merge pull request #24 from popcornell/txtnorm
Browse files: browse the repository at this point in the history
Made whisper text norm idempotent
  • Loading branch information
nidleo authored Feb 19, 2024
2 parents bda171d + ea38dbe commit 144fed3
Show file tree
Hide file tree
Showing 4 changed files with 13 additions and 5 deletions.
2 changes: 1 addition & 1 deletion utils/text_norm_whisper_like/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,4 +15,4 @@ def get_txt_norm(txt_norm):
elif txt_norm == "chime8":
return EnglishTextNormalizer()
else:
raise NotImplementedError
raise NotImplementedError
2 changes: 1 addition & 1 deletion utils/text_norm_whisper_like/english.json
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@
"archaeologically": "archeologically",
"archaeologist": "archeologist",
"archaeologists": "archeologists",
"archaeology": "archeology</span>",
"archaeology": "archeology",
"ardour": "ardor",
"armour": "armor",
"armoured": "armored",
Expand Down
9 changes: 6 additions & 3 deletions utils/text_norm_whisper_like/english.py
Original file line number Diff line number Diff line change
Expand Up @@ -455,8 +455,8 @@ class EnglishSpellingNormalizer:
[1] https://www.tysto.com/uk-us-spelling-list.html
"""

def __init__(self):
mapping_path = os.path.join(os.path.dirname(__file__), "english.json")
def __init__(self, mapping_name="english.json"):
mapping_path = os.path.join(os.path.dirname(__file__), mapping_name)
self.mapping = json.load(open(mapping_path))

def __call__(self, s: str):
Expand All @@ -471,6 +471,7 @@ def __init__(self, standardize_numbers=False):
"hmm"
),
r"\b(a+h+)\b|\b(ha+)\b": "ah",
r"[!?.]+(?=$|\s)": "", # Okay.. --> okay
r"\b(o+h+)\b|\b(h+o+)\b": "oh",
r"\b(u+h+)\b|\b(h+u+)\b|\b(h+u+h+)\b": "uh",
# common contractions
Expand Down Expand Up @@ -534,6 +535,7 @@ def __init__(self, standardize_numbers=False):
else:
self.standardize_numbers = None
self.standardize_spellings = EnglishSpellingNormalizer()
self.pre_standardize_spellings = EnglishSpellingNormalizer("pre_english.json")

def __call__(self, s: str):
s = s.lower()
Expand All @@ -542,6 +544,7 @@ def __call__(self, s: str):
# remove words between brackets
s = re.sub(r"\(([^)]+?)\)", "", s)
# remove words between parentheses
s = self.pre_standardize_spellings(s)
s = re.sub(r"\s+'", "'", s)
# when there's a space before an apostrophe

Expand All @@ -557,8 +560,8 @@ def __call__(self, s: str):

if self.standardize_numbers is not None:
s = self.standardize_numbers(s)
s = self.standardize_spellings(s)

s = self.standardize_spellings(s)
# now remove prefix/suffix symbols
# that are not preceded/followed by numbers
s = re.sub(r"[.$¢€£]([^0-9])", r" \1", s)
Expand Down
5 changes: 5 additions & 0 deletions utils/text_norm_whisper_like/pre_english.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
{
"shan't": "shall not",
"han't": "has not",
"ain't": "ain not"
}

0 comments on commit 144fed3

Please sign in to comment.