Skip to content

Commit

Permalink
Merge pull request #27 from microsoft/alvinn/textnorm_remove_fillers
Browse files Browse the repository at this point in the history
Normalization update: remove fillers and more
  • Loading branch information
nidleo authored Mar 4, 2024
2 parents f3c33b4 + 8fbea20 commit b2d2483
Show file tree
Hide file tree
Showing 9 changed files with 205 additions and 6 deletions.
41 changes: 41 additions & 0 deletions configs/train_css/local/conformer_v0.51_mc.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
# Note there's newer and better data. Do not download v1.2
train_dir: ./v1.2/100hrs/train
val_dir: ./v1.2/100hrs/val
out_dir: ./

train_set_cfg:
sample_frac: 1.0
max_urls: null # null means no limit
val_set_cfg:
sample_frac: 1.0
max_urls: null # null means no limit

clip_gt_to_mixture: True

log_params_mlflow: True
log_metrics_mlflow: True

scheduler_step_every: [1, iterations]
scheduler_name: step_lr
scheduler_linear_warmup_decay_cfg:
warmup: 10000
decay: 260000
scheduler_step_lr_cfg:
step_size: 1
gamma: 1.0 # no decay

stop_after: [260000, iterations]
eval_every: [1000, iterations]
save_every: [1000, iterations]

global_batch_size: 256
learning_rate: 1e-5
weight_decay: 1e-2 # according to the paper set to 1e-2

# Large model per CSS with Conformer definition
conformer_css_cfg:
nnet_conf:
conformer_conf:
attention_dim: 512 # default 256
attention_heads: 8 # default 4
num_blocks: 18 # default 16
46 changes: 46 additions & 0 deletions configs/train_css/local/conformer_v0.51_sc.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
# Note there's newer and better data. Do not download v1.2
train_dir: ./v1.2/100hrs/train
val_dir: ./v1.2/100hrs/val
out_dir: ./

single_channel: True

train_set_cfg:
sample_frac: 1.0
max_urls: null # null means no limit
val_set_cfg:
sample_frac: 1.0
max_urls: null # null means no limit

clip_gt_to_mixture: True

log_params_mlflow: True
log_metrics_mlflow: True

scheduler_step_every: [1, iterations]
scheduler_name: step_lr
scheduler_linear_warmup_decay_cfg:
warmup: 10000
decay: 260000
scheduler_step_lr_cfg:
step_size: 1
gamma: 1.0 # no decay

stop_after: [260000, iterations]
eval_every: [1000, iterations]
save_every: [1000, iterations]

global_batch_size: 256
learning_rate: 1e-5
weight_decay: 1e-2 # according to the paper set to 1e-2

# Large model per CSS with Conformer definition
conformer_css_cfg:
extractor_conf:
ipd_index: '' # For MC '1,0;2,0;3,0;4,0;5,0;6,0'. For SC ''.
nnet_conf:
conformer_conf:
attention_dim: 512 # default 256
attention_heads: 8 # default 4
num_blocks: 18 # default 16
in_features: 257 # For MC 1799. For SC 257.
3 changes: 2 additions & 1 deletion configs/train_css/local/conformer_v0.5_mc.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,8 @@ val_set_cfg:
sample_frac: 1.0
max_urls: null # null means no limit

# This model was actually trained with clip_gt_to_mixture: False, but we recommend to set it to True.
# This model was trained with clip_gt_to_mixture=False, but we recommend to set it to True.
clip_gt_to_mixture: False

log_params_mlflow: True
log_metrics_mlflow: True
Expand Down
3 changes: 2 additions & 1 deletion configs/train_css/local/conformer_v0.5_sc.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,8 @@ val_set_cfg:
sample_frac: 1.0
max_urls: null # null means no limit

# This model was actually trained with clip_gt_to_mixture: False, but we recommend to set it to True.
# This model was trained with clip_gt_to_mixture=False, but we recommend to set it to True.
clip_gt_to_mixture: False

log_params_mlflow: True
log_metrics_mlflow: True
Expand Down
2 changes: 2 additions & 0 deletions configs/train_css/local/debug_mc.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@ val_set_cfg:
sample_frac: 1.0
max_urls: 2 # null means no limit

clip_gt_to_mixture: True

log_params_mlflow: False
log_metrics_mlflow: False

Expand Down
2 changes: 2 additions & 0 deletions configs/train_css/local/debug_sc.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@ val_set_cfg:
sample_frac: 1.0
max_urls: 2 # null means no limit

clip_gt_to_mixture: True

log_params_mlflow: False
log_metrics_mlflow: False

Expand Down
3 changes: 2 additions & 1 deletion css/training/train.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,8 @@ class TrainCfg:
learning_rate: float = 1e-3
global_batch_size: int = 32 # global means across all GPUs, local means per GPU
clip_grad_norm: float = 0.01
clip_gt_to_mixture: bool = True # clips the ground truth to the mixture to avoid trying to drive the mask above 1
# clips the ground truth to the mixture to avoid trying to drive the mask above 1. "True" is recommended.
clip_gt_to_mixture: bool = False
weight_decay: float = 1e-4
is_debug: bool = False # no data workers, no DataParallel, etc.
log_params_mlflow: bool = True
Expand Down
6 changes: 4 additions & 2 deletions utils/text_norm_whisper_like/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,15 @@

from .basic import BasicTextNormalizer as BasicTextNormalizer
from .english import EnglishTextNormalizer as EnglishTextNormalizer
from whisper.normalizers import EnglishTextNormalizer as OriginalEnglishTextNormalizer


def get_txt_norm(txt_norm):
    """Return the text normalizer selected by ``txt_norm``.

    Args:
        txt_norm: Name of the normalizer, or ``None`` for no normalization.
            Supported names are ``"chime8"`` (this package's
            ``EnglishTextNormalizer``) and ``"whisper"`` (the
            ``EnglishTextNormalizer`` imported from ``whisper.normalizers``).

    Returns:
        A new normalizer instance, or ``None`` when ``txt_norm`` is ``None``.

    Raises:
        NotImplementedError: If ``txt_norm`` is not a supported name.
    """
    # NOTE: the previous leading `assert txt_norm in ["chime8", None]` made the
    # "whisper" branch unreachable (and asserts vanish under `python -O`), so
    # validation is done by the explicit fall-through below instead.
    if txt_norm is None:
        return None
    if txt_norm == "chime8":
        return EnglishTextNormalizer()
    if txt_norm == "whisper":
        return OriginalEnglishTextNormalizer()
    raise NotImplementedError(f"Unsupported text normalizer: {txt_norm!r}")
105 changes: 104 additions & 1 deletion utils/text_norm_whisper_like/english.py
Original file line number Diff line number Diff line change
Expand Up @@ -448,6 +448,82 @@ def __call__(self, s: str):
return s


class EnglishReverseNumberNormalizer(EnglishNumberNormalizer):
    """
    This is an approximate inverse of EnglishNumberNormalizer that converts arabic numerals
    into spelled-out numbers.
    Motivation: Whisper's original EnglishNumberNormalizer produces numerals that match Whisper's rich
    token set, which many ASRs cannot output.
    This class takes an alternative normalization approach, converting Whisper's numerals back to
    spelled-out numbers. This ensures compatibility with the token sets of other ASR systems while
    avoiding penalizing Whisper for outputting numerals.
    Examples of cases handled:
    - "365" -> "three hundred sixty five"
    - "$20" -> "twenty dollars"
    - "50%" -> "fifty percent"
    - "12th" -> "twelfth", "12s" -> "twelves"
    - "90th" -> "ninetieth", "90s" -> "nineties"
    - The special cases of "70 000" -> "seventy thousand" but not larger numbers.
    Caveats: this class takes care of the majority of cases, but it is not perfect.
    - Only numerals within the 0-1000 range are handled; anything else is left as-is.
    - Minus/plus signs are not handled.
    - There is inherent ambiguity e.g. "100" -> "one hundred" or "a hundred"; a standalone
      "100" (and "1000") is emitted as the bare word "hundred" ("thousand").
    """

    def __init__(self):
        super().__init__()
        # Reverse dictionaries (word -> value becomes value -> word).
        # self.ones / self.tens / self.*_suffixed are expected to be set up by the
        # parent initializer, which is not visible in this block.
        self.int_to_ones = {v: k for k, v in self.ones.items()}
        self.int_to_tens = {v: k for k, v in self.tens.items()}

        # 11th -> eleventh etc.
        self.str_to_ones_suffixed = {
            str(n) + s: k for k, (n, s) in self.ones_suffixed.items()
        }
        # 20s -> twenties etc.
        self.str_to_tens_suffixed = {
            str(n) + s: k for k, (n, s) in self.tens_suffixed.items()
        }

    def _number_to_words(self, w: str) -> str:
        """Spell out a single whitespace-delimited token, or return it unchanged."""
        # str.isdecimal() rather than str.isdigit(): isdigit() is also true for
        # characters such as superscript digits ("²"), which int() cannot parse
        # and would raise ValueError on.
        if not w.isdecimal():
            # suffixed numbers ("12th", "90s", ...); unknown tokens pass through
            w = self.str_to_ones_suffixed.get(w, w)
            return self.str_to_tens_suffixed.get(w, w)

        if w == '000':
            return 'thousand'  # will work in case of "70 000" -> "seventy thousand"

        num = int(w)
        if num == 0:
            return "zero"
        if num == 100:
            return "hundred"
        if num == 1000:
            return "thousand"
        if not 0 < num < 1000:
            return w  # case not handled

        hundreds, remainder = divmod(num, 100)
        words = []
        if hundreds > 0:
            words.append(f"{self.int_to_ones[hundreds]} hundred")
        if 0 < remainder <= 19:
            # one..nineteen are looked up directly in the "ones" table
            words.append(self.int_to_ones[remainder])
        else:
            tens, ones = divmod(remainder, 10)
            if tens > 0:
                words.append(self.int_to_tens[tens * 10])
            if ones > 0:
                words.append(self.int_to_ones[ones])
        return " ".join(words)

    def __call__(self, s: str):
        """Return ``s`` with supported numerals replaced by spelled-out words.

        The result is re-joined with single spaces, so runs of whitespace in
        ``s`` are collapsed.
        """
        # "$x[.y]" -> "x[.y] dollars"
        s = re.sub(r'\$(\d+(\.\d+)?)', r'\1 dollars', s)
        # "x[.y]"% -> "x[.y] percent"
        s = re.sub(r'(\d+(\.\d+)?)%', r'\1 percent', s)
        # note this doesn't handle cases such as -x or +x.

        return " ".join(self._number_to_words(w) for w in s.split())


class EnglishSpellingNormalizer:
"""
Applies British-American spelling mappings as listed in [1].
Expand All @@ -464,7 +540,7 @@ def __call__(self, s: str):


class EnglishTextNormalizer:
def __init__(self, standardize_numbers=False):
def __init__(self, standardize_numbers=False, standardize_numbers_rev=True, remove_fillers=True):
self.replacers = {
# common non verbal sounds are mapped to the similar ones
r"\b(hm+)\b|\b(mhm)\b|\b(mm+)\b|\b(m+h)\b|\b(hm+)\b|\b(um+)\b|\b(uhm+)\b": ( # noqa e501
Expand Down Expand Up @@ -492,6 +568,9 @@ def __init__(self, standardize_numbers=False):
r"\bcoulda\b": "could have",
r"\bshoulda\b": "should have",
r"\bma'am\b": "madam",
r"\bokay\b": "ok",
r"\bsetup\b": "set up",
r"\beveryday\b": "every day",
# contractions in titles/prefixes
r"\bmr\b": "mister ",
r"\bmrs\b": "missus ",
Expand Down Expand Up @@ -532,11 +611,23 @@ def __init__(self, standardize_numbers=False):
}
if standardize_numbers:
self.standardize_numbers = EnglishNumberNormalizer()
assert not standardize_numbers_rev
else:
self.standardize_numbers = None

if standardize_numbers_rev:
self.standardize_numbers_rev = EnglishReverseNumberNormalizer()
else:
self.standardize_numbers_rev = None

self.standardize_spellings = EnglishSpellingNormalizer()
self.pre_standardize_spellings = EnglishSpellingNormalizer("pre_english.json")

if remove_fillers:
self.fillers = ['hmm', 'uh', 'ah', 'eh'] # assumes replacers have been applied
else:
self.fillers = None

def __call__(self, s: str):
s = s.lower()

Expand All @@ -561,13 +652,25 @@ def __call__(self, s: str):
if self.standardize_numbers is not None:
s = self.standardize_numbers(s)

if self.standardize_numbers_rev is not None:
s = self.standardize_numbers_rev(s)

s = self.standardize_spellings(s)
# now remove prefix/suffix symbols
# that are not preceded/followed by numbers
s = re.sub(r"[.$¢€£]([^0-9])", r" \1", s)
s = re.sub(r"([^0-9])%", r"\1 ", s)

# remove filler words
# motivation: these words are very common, yet hold little information in the majority of cases.
# some ASR systems may ignore them by convention and will be penalized unfairly.
if self.fillers:
s = re.sub(r'\b(' + '|'.join(self.fillers) + r')\b', "", s)

s = re.sub(r"\s+", " ", s)
# replace any successive whitespaces with a space

s = re.sub(r"^\s+|\s+$", "", s)
# remove leading and trailing whitespaces

return s

0 comments on commit b2d2483

Please sign in to comment.