Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/main' into ted-apostropf-merge
Browse files Browse the repository at this point in the history
  • Loading branch information
Marvin84 committed Jun 21, 2024
2 parents cf5884a + aa2b3fb commit 6acec99
Show file tree
Hide file tree
Showing 528 changed files with 54,982 additions and 17,037 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ def bpe_ls960_1023_base():
}

from ...pytorch_networks.ctc.decoder.flashlight_ctc_v1 import DecoderConfig
from ...pytorch_networks.ctc.decoder.greedy_bpe_ctc_v3 import DecoderConfig as GreedyDecoderConfig

def tune_and_evaluate_helper(
training_name: str,
Expand Down Expand Up @@ -121,6 +122,22 @@ def tune_and_evaluate_helper(
**default_returnn,
)

def greedy_search_helper(training_name: str, asr_model: ASRModel, decoder_config: GreedyDecoderConfig):
    """
    Run ``search`` with the greedy BPE CTC decoder on the dev datasets.

    :param training_name: name prefix of the experiment, the search is registered
        under ``<training_name>/search_greedy``
    :param asr_model: model to decode with; it is deep-copied, the caller's object
        is left untouched
    :param decoder_config: greedy decoder settings, passed on as a plain dict
    """
    # the greedy decoder is run without a prior, so clear it on a private copy
    model_without_prior = copy.deepcopy(asr_model)
    model_without_prior.prior_file = None

    search(
        training_name + "/search_greedy",
        forward_config={},
        asr_model=model_without_prior,
        decoder_module="ctc.decoder.greedy_bpe_ctc_v3",
        decoder_args={"config": asdict(decoder_config)},
        test_dataset_tuples=dev_dataset_tuples,
        **default_returnn,
    )

default_decoder_config_bpe5000 = DecoderConfig(
lexicon=get_text_lexicon(prefix=prefix_name, librispeech_key="train-other-960", bpe_size=5000),
returnn_vocab=label_datastream_bpe5000.vocab,
Expand Down Expand Up @@ -200,6 +217,7 @@ def tune_and_evaluate_helper(
"max_seq_length": {"audio_features": 35 * 16000},
"accum_grad_multiple_step": 1,
"torch_amp_options": {"dtype": "bfloat16"},
"gradient_clip": 1.0,
}

network_module = "ctc.conformer_1023.i6modelsV1_VGG4LayerActFrontendV1_v6"
Expand All @@ -224,3 +242,8 @@ def tune_and_evaluate_helper(
lm_scales=[1.6, 1.8, 2.0],
prior_scales=[0.2, 0.3, 0.4],
)

greedy_decoder_config = GreedyDecoderConfig(
returnn_vocab=label_datastream_bpe5000.vocab,
)
greedy_search_helper(training_name=training_name, asr_model=asr_model, decoder_config=greedy_decoder_config)
Original file line number Diff line number Diff line change
Expand Up @@ -195,6 +195,7 @@ def tune_and_evaluate_helper(
"max_seq_length": {"audio_features": 35 * 16000},
"accum_grad_multiple_step": 1,
"torch_amp_options": {"dtype": "bfloat16"},
"gradient_clip": 1.0,
}

network_module = "ctc.conformer_1023.i6modelsV1_VGG4LayerActFrontendV1_v6"
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
"""
Greedy CTC decoder without any extras
v3: add config objects
"""
from dataclasses import dataclass
import time
import torch


@dataclass
class DecoderConfig:
    """
    Settings of the greedy CTC decoder.
    """

    # vocabulary file, handed to ``Vocabulary.create_vocab(vocab_file=...)`` in the init hook
    returnn_vocab: str


@dataclass
class ExtraConfig:
    """
    Optional logging settings of the greedy CTC decoder.

    All fields have defaults, so ``ExtraConfig()`` is a valid configuration.
    """

    # --- RTF (real-time-factor) logging ---
    # print timing information per batch and at the end of the forward pass
    print_rtf: bool = True
    # sample rate of the raw audio, used for RTF logging
    sample_rate: int = 16000

    # --- hypothesis logging ---
    # print every recognized text to stdout
    print_hypothesis: bool = True


def _real_time_factor(elapsed_s, audio_len_s):
    """Return ``elapsed_s / audio_len_s``, or NaN when no audio was processed."""
    if audio_len_s > 0:
        return elapsed_s / audio_len_s
    return float("nan")


def forward_init_hook(run_ctx, **kwargs):
    """
    Prepare ``run_ctx`` for greedy CTC decoding.

    Opens ``search_out.py`` for the recognition results, loads the label list from
    the RETURNN vocabulary and copies the logging options onto ``run_ctx``.

    :param run_ctx: mutable context object shared by the forward hooks
    :param kwargs: must contain ``config`` (dict with the ``DecoderConfig`` fields),
        may contain ``extra_config`` (dict with ``ExtraConfig`` fields)
    """
    config = DecoderConfig(**kwargs["config"])
    extra_config = ExtraConfig(**kwargs.get("extra_config", {}))

    # The hypotheses are written as a python dict literal {seq_tag: text, ...}.
    # The file stays open across forward_step calls and is closed in forward_finish_hook.
    run_ctx.recognition_file = open("search_out.py", "wt")
    run_ctx.recognition_file.write("{\n")

    from returnn.datasets.util.vocabulary import Vocabulary

    vocab = Vocabulary.create_vocab(vocab_file=config.returnn_vocab, unknown_label=None)
    run_ctx.labels = vocab.labels

    run_ctx.print_rtf = extra_config.print_rtf
    # forward_step converts sample counts to seconds with this value
    run_ctx.sample_rate = extra_config.sample_rate
    # Initialised unconditionally so that no hook can hit a missing attribute,
    # even when RTF logging is switched off.
    run_ctx.running_audio_len_s = 0
    run_ctx.total_time = 0

    run_ctx.print_hypothesis = extra_config.print_hypothesis


def forward_finish_hook(run_ctx, **kwargs):
    """
    Finalize the recognition file and, if enabled, print the overall timing.

    :param run_ctx: context object prepared by ``forward_init_hook``
    :param kwargs: unused
    """
    run_ctx.recognition_file.write("}\n")
    run_ctx.recognition_file.close()

    # The timing summary is only meaningful (and the counters only maintained)
    # when RTF logging is enabled.
    if not getattr(run_ctx, "print_rtf", False):
        return

    print(
        "Total-time: %.2f, Batch-RTF: %.3f"
        % (run_ctx.total_time, _real_time_factor(run_ctx.total_time, run_ctx.running_audio_len_s))
    )


def forward_step(*, model, data, run_ctx, **kwargs):
    """
    Decode one batch greedily and append the hypotheses to the recognition file.

    Per sequence the frame-wise argmax is taken, consecutive repetitions are
    collapsed, the remaining indices are mapped to labels and joined into text.

    :param model: callable taking ``raw_audio`` and ``raw_audio_len`` and returning
        ``(logprobs, audio_features_len)``
    :param data: batch dict with the keys ``raw_audio``, ``raw_audio:size1`` and ``seq_tag``
    :param run_ctx: context object prepared by ``forward_init_hook``
    :param kwargs: unused
    """
    raw_audio = data["raw_audio"]  # [B, T', F]
    raw_audio_len = data["raw_audio:size1"]  # [B]

    if run_ctx.print_rtf:
        # Only needed for the timing output; computing it unconditionally would
        # force a device-to-host copy on every batch for nothing.
        audio_len_batch = torch.sum(raw_audio_len).item() / run_ctx.sample_rate
        run_ctx.running_audio_len_s += audio_len_batch
        am_start = time.time()

    logprobs, audio_features_len = model(
        raw_audio=raw_audio,
        raw_audio_len=raw_audio_len,
    )
    batch_indices = [
        # cut off the padding frames, then collapse repeated frame labels
        torch.unique_consecutive(torch.argmax(seq_logprobs[:seq_len], dim=-1), dim=0).detach().cpu().tolist()
        for seq_logprobs, seq_len in zip(logprobs, audio_features_len)
    ]

    if run_ctx.print_rtf:
        am_time = time.time() - am_start
        run_ctx.total_time += am_time
        print("Batch-time: %.2f, Batch-RTF: %.3f" % (am_time, _real_time_factor(am_time, audio_len_batch)))

    tags = data["seq_tag"]

    num_labels = len(run_ctx.labels)
    for indices, tag in zip(batch_indices, tags):
        # Indices outside of the vocabulary are dropped
        # (presumably the CTC blank appended after the vocabulary - confirm against the model).
        sequence = [run_ctx.labels[idx] for idx in indices if idx < num_labels]
        # drop special labels such as "<...>" and "[...]"
        sequence = [s for s in sequence if not s.startswith(("<", "["))]
        # undo the BPE splitting: "@@ " marks a non-final sub-word
        text = " ".join(sequence).replace("@@ ", "")
        if run_ctx.print_hypothesis:
            print(text)
        run_ctx.recognition_file.write("%s: %s,\n" % (repr(tag), repr(text)))
2 changes: 1 addition & 1 deletion users/berger/args/experiments/ctc.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ def get_ctc_recog_step_args(num_classes: int, reduction_factor: int = 4, **kwarg
"mem_rqmt": 16,
},
"rtf": 20,
"mem": 4,
"mem": 8,
}

return recursive_update(default_args, kwargs)
Expand Down
2 changes: 1 addition & 1 deletion users/berger/args/experiments/transducer.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ def get_transducer_recog_step_args(
"mem_rqmt": 16,
},
"rtf": 50,
"mem": 4,
"mem": 8,
}

return recursive_update(default_args, kwargs)
Expand Down
25 changes: 25 additions & 0 deletions users/berger/args/jobs/rasr_init_args.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,7 @@ def get_feature_extraction_args_16kHz(
gt_args: Optional[Dict] = None,
) -> Dict:
mfcc_filter_width = features.filter_width_from_channels(channels=20, f_max=8000) # = 16000 / 2
filterbank_filter_width = features.filter_width_from_channels(channels=80, f_max=8000) # = 16000 / 2

if mfcc_cepstrum_options is None:
mfcc_cepstrum_options = {
Expand Down Expand Up @@ -142,6 +143,30 @@ def get_feature_extraction_args_16kHz(
"normalization_options": {},
}
},
"filterbank": {
"filterbank_options": {
"warping_function": "mel",
"filter_width": filterbank_filter_width,
"normalize": False,
"normalization_options": {},
"without_samples": False,
"samples_options": {
"audio_format": "wav",
# "scale_input": 2**-15,
"dc_detection": dc_detection,
},
"fft_options": {
"preemphasis": 0.97,
"window_type": "hanning",
"window_shift": 0.01,
"window_length": 0.025,
},
"apply_log": True,
"add_epsilon": True,
"add_features_output": True,
# "warp_differential_unit": False,
},
},
"energy": {
"energy_options": {
"without_samples": False,
Expand Down
1 change: 1 addition & 0 deletions users/berger/args/returnn/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ def get_base_config(backend: Backend) -> Dict[str, Any]:
elif backend == Backend.PYTORCH:
result["backend"] = "torch"
result["use_lovely_tensors"] = True
# result["torch_amp"] = {"dtype": "bfloat16"}
else:
raise NotImplementedError
return result
Expand Down
55 changes: 55 additions & 0 deletions users/berger/args/returnn/learning_rates.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ class LearningRateSchedules(Enum):
NewbobAbs = auto()
OCLR = auto()
OCLR_STEP = auto()
OCLR_STEP_TORCH = auto()
CONST_DECAY = auto()
CONST_DECAY_STEP = auto()

Expand Down Expand Up @@ -38,6 +39,8 @@ def get_learning_rate_config(
config.update(get_oclr_config(**kwargs))
elif schedule == LearningRateSchedules.OCLR_STEP:
extra_python.append(get_oclr_function(**kwargs))
elif schedule == LearningRateSchedules.OCLR_STEP_TORCH:
extra_python.append(get_oclr_function_torch(**kwargs))
elif schedule == LearningRateSchedules.CONST_DECAY:
config.update(get_const_decay_config(**kwargs))
elif schedule == LearningRateSchedules.CONST_DECAY_STEP:
Expand Down Expand Up @@ -184,6 +187,58 @@ def get_oclr_function(
)


def get_oclr_function_torch(
    num_epochs: int,
    n_steps_per_epoch: int,
    peak_lr: float = 1e-03,
    inc_epochs: Optional[int] = None,
    dec_epochs: Optional[int] = None,
    initial_lr: Optional[float] = None,
    decayed_lr: Optional[float] = None,
    final_lr: Optional[float] = None,
    **kwargs,
) -> str:
    """
    Generate the source code of a step-based one-cycle ``dynamic_learning_rate`` function.

    The generated function increases the learning rate linearly from ``initial_lr`` to
    ``peak_lr``, decreases it linearly to ``decayed_lr`` and finally to ``final_lr``.

    :param num_epochs: total number of epochs of the schedule
    :param n_steps_per_epoch: number of train steps that make up one epoch
    :param peak_lr: learning rate at the end of the increase phase
    :param inc_epochs: length of the increase phase, defaults to ``(num_epochs * 9) // 20``
    :param dec_epochs: length of the first decrease phase, defaults to ``inc_epochs``
    :param initial_lr: learning rate at step 0, defaults to ``peak_lr / 10``
    :param decayed_lr: learning rate after the first decrease phase, defaults to ``initial_lr``
    :param final_lr: lower bound reached in the last phase, defaults to ``initial_lr / 5``
    :param kwargs: ignored
    :return: python source defining ``dynamic_learning_rate(*, global_train_step, **_)``
    :raises ValueError: if one of the three phases or ``n_steps_per_epoch`` is zero
    """
    # Explicit "is None" checks: 0.0 is a valid learning rate (e.g. final_lr=0.0)
    # and must not be replaced by the default.
    if initial_lr is None:
        initial_lr = peak_lr / 10
    if decayed_lr is None:
        decayed_lr = initial_lr
    if final_lr is None:
        final_lr = initial_lr / 5
    # A phase of 0 epochs cannot be expressed by the generated function, so a
    # falsy epoch count keeps falling back to the default.
    inc_epochs = inc_epochs or (num_epochs * 9) // 20
    dec_epochs = dec_epochs or inc_epochs

    # The generated function divides by the number of steps of every phase on each
    # call; fail here with a clear message instead of a ZeroDivisionError in training.
    final_epochs = num_epochs - inc_epochs - dec_epochs
    if 0 in (n_steps_per_epoch, inc_epochs, dec_epochs, final_epochs):
        raise ValueError(
            "OCLR schedule needs a non-empty increase, decrease and final phase, got "
            f"inc_epochs={inc_epochs}, dec_epochs={dec_epochs}, remaining epochs={final_epochs}, "
            f"n_steps_per_epoch={n_steps_per_epoch}"
        )

    return dedent(
        f"""def dynamic_learning_rate(*, global_train_step: int, **_):
            # Increase linearly from initial_lr to peak_lr over the first inc_epoch epochs
            # Decrease linearly from peak_lr to decayed_lr over the next dec_epoch epochs
            # Decrease linearly from decayed_lr to final_lr over the remaining epochs
            initial_lr = {initial_lr}
            peak_lr = {peak_lr}
            decayed_lr = {decayed_lr}
            final_lr = {final_lr}
            inc_epochs = {inc_epochs}
            dec_epochs = {dec_epochs}
            total_epochs = {num_epochs}
            n_steps_per_epoch = {n_steps_per_epoch}
            # -- derived -- #
            steps_increase = inc_epochs * n_steps_per_epoch
            steps_decay = dec_epochs * n_steps_per_epoch
            steps_final = (total_epochs - inc_epochs - dec_epochs) * n_steps_per_epoch
            step_size_increase = (peak_lr - initial_lr) / steps_increase
            step_size_decay = (peak_lr - decayed_lr) / steps_decay
            step_size_final = (decayed_lr - final_lr) / steps_final
            if global_train_step <= steps_increase:
                return initial_lr + step_size_increase * global_train_step
            if global_train_step <= steps_increase + steps_decay:
                return peak_lr - step_size_decay * (global_train_step - steps_increase)
            return max(
                decayed_lr - step_size_final * (global_train_step - steps_increase - steps_decay),
                final_lr
            )"""
    )


def get_const_decay_config(
num_epochs: int,
const_lr: float = 1e-03,
Expand Down
2 changes: 1 addition & 1 deletion users/berger/args/returnn/regularization.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ def get_chunking_config(

if isinstance(chunking_factors, list):
chunking_factors = {key: 1 for key in chunking_factors}
assert isinstance(chunking_factors, Dict)
assert isinstance(chunking_factors, dict)
return {
"chunking": (
{key: base_chunk_size // factor for key, factor in chunking_factors.items()},
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
from .config_02c_transducer_rasr_features_wei_lex import py as py_02c
from .config_02e_transducer_rasr_features_tinaconf import py as py_02e
from .config_02e_transducer_rasr_features_tinaconf_rtf import py as py_02e_rtf
from .config_02f_transducer_rasr_features_am_scales import py as py_02f
from .config_03a_transducer_fullsum_raw_samples import py as py_03a
from .config_03b_transducer_fullsum_rasr_features import py as py_03b
from .config_03c_transducer_fullsum_rasr_features_wei_lex import py as py_03c
Expand All @@ -37,6 +38,7 @@ def main() -> SummaryReport:
sub_reports.append(copy.deepcopy(py_02c()[0]))
sub_reports.append(copy.deepcopy(py_02e()))
sub_reports.append(copy.deepcopy(py_02e_rtf()))
sub_reports.append(copy.deepcopy(py_02f()))
sub_reports.append(copy.deepcopy(py_03a()))
sub_reports.append(copy.deepcopy(py_03b()))
sub_reports.append(copy.deepcopy(py_03c()))
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -138,8 +138,8 @@ def run_exp() -> Tuple[SummaryReport, Checkpoint, Dict[str, AlignmentData]]:
)

recog_args = exp_args.get_ctc_recog_step_args(num_classes)
align_args = exp_args.get_ctc_align_step_args(num_classes)
recog_args["epochs"] = [320, 400, 480, 500, "best"]
align_args = exp_args.get_ctc_align_step_args(num_classes, epoch=500)
recog_args["epochs"] = [320, 500, "best"]
recog_args["prior_scales"] = [0.3]
recog_args["lm_scales"] = [0.9]

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -142,8 +142,8 @@ def run_exp() -> Tuple[SummaryReport, Checkpoint, Dict[str, AlignmentData]]:
)

recog_args = exp_args.get_ctc_recog_step_args(num_classes)
align_args = exp_args.get_ctc_align_step_args(num_classes)
recog_args["epochs"] = [320, 400, 480, 500, "best"]
align_args = exp_args.get_ctc_align_step_args(num_classes, epoch=500)
recog_args["epochs"] = [320, 500, "best"]
recog_args["prior_scales"] = [0.3]
recog_args["lm_scales"] = [0.9]

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,7 @@ def run_exp() -> Tuple[SummaryReport, Checkpoint, Dict[str, AlignmentData]]:
recog_args["feature_type"] = FeatureType.GAMMATONE_16K
recog_args["prior_scales"] = [0.3]
recog_args["lm_scales"] = [0.9]
recog_args["search_stats"] = True
align_args["feature_type"] = FeatureType.GAMMATONE_16K

# ********** System **********
Expand All @@ -146,6 +147,7 @@ def run_exp() -> Tuple[SummaryReport, Checkpoint, Dict[str, AlignmentData]]:
SummaryKey.PRIOR,
SummaryKey.LM,
SummaryKey.WER,
SummaryKey.RTF,
SummaryKey.SUB,
SummaryKey.INS,
SummaryKey.DEL,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -47,10 +47,7 @@ def generate_returnn_config(
**kwargs,
) -> ReturnnConfig:
if train:
(
network_dict,
extra_python,
) = transducer_model.make_context_1_conformer_transducer(
(network_dict, extra_python,) = transducer_model.make_context_1_conformer_transducer(
num_outputs=num_classes,
gt_args={
"sample_rate": 16000,
Expand Down Expand Up @@ -87,10 +84,7 @@ def generate_returnn_config(
},
)
else:
(
network_dict,
extra_python,
) = transducer_model.make_context_1_conformer_transducer_recog(
(network_dict, extra_python,) = transducer_model.make_context_1_conformer_transducer_recog(
num_outputs=num_classes,
gt_args={
"sample_rate": 16000,
Expand Down
Loading

0 comments on commit 6acec99

Please sign in to comment.