Merge remote-tracking branch 'origin/main' into ted-apostropf-merge

rwth-i6 · Jun 21, 2024 · 6acec99 · 6acec99
2 parents cf5884a + aa2b3fb
commit 6acec99
Show file tree

Hide file tree

Showing 528 changed files with 54,982 additions and 17,037 deletions.
diff --git a/example_setups/librispeech/ctc_rnnt_standalone_2024/experiments/ctc_bpe/baseline.py b/example_setups/librispeech/ctc_rnnt_standalone_2024/experiments/ctc_bpe/baseline.py
@@ -61,6 +61,7 @@ def bpe_ls960_1023_base():
     }
 
     from ...pytorch_networks.ctc.decoder.flashlight_ctc_v1 import DecoderConfig
+    from ...pytorch_networks.ctc.decoder.greedy_bpe_ctc_v3 import DecoderConfig as GreedyDecoderConfig
 
     def tune_and_evaluate_helper(
         training_name: str,
@@ -121,6 +122,22 @@ def tune_and_evaluate_helper(
                 **default_returnn,
             )
 
+    def greedy_search_helper(training_name: str, asr_model: ASRModel, decoder_config: GreedyDecoderConfig):
+        """
+        Run a search with the ``ctc.decoder.greedy_bpe_ctc_v3`` decoder module on ``dev_dataset_tuples``.
+
+        :param training_name: prefix of the search name, which becomes ``<training_name>/search_greedy``
+        :param asr_model: model to decode with; it is deep-copied, so the caller's object is not modified
+        :param decoder_config: greedy decoder settings, handed to the search as a plain dict
+        """
+        # work on a private copy so that dropping the prior file does not leak back to the caller
+        prior_free_model = copy.deepcopy(asr_model)
+        prior_free_model.prior_file = None
+
+        greedy_decoder_args = {"config": asdict(decoder_config)}
+        _search_jobs, _wers = search(
+            training_name + "/search_greedy",
+            forward_config={},
+            asr_model=prior_free_model,
+            decoder_module="ctc.decoder.greedy_bpe_ctc_v3",
+            decoder_args=greedy_decoder_args,
+            test_dataset_tuples=dev_dataset_tuples,
+            **default_returnn,
+        )
+
     default_decoder_config_bpe5000 = DecoderConfig(
         lexicon=get_text_lexicon(prefix=prefix_name, librispeech_key="train-other-960", bpe_size=5000),
         returnn_vocab=label_datastream_bpe5000.vocab,
@@ -200,6 +217,7 @@ def tune_and_evaluate_helper(
         "max_seq_length": {"audio_features": 35 * 16000},
         "accum_grad_multiple_step": 1,
         "torch_amp_options": {"dtype": "bfloat16"},
+        "gradient_clip": 1.0,
     }
 
     network_module = "ctc.conformer_1023.i6modelsV1_VGG4LayerActFrontendV1_v6"
@@ -224,3 +242,8 @@ def tune_and_evaluate_helper(
         lm_scales=[1.6, 1.8, 2.0],
         prior_scales=[0.2, 0.3, 0.4],
     )
+
+    greedy_decoder_config = GreedyDecoderConfig(
+        returnn_vocab=label_datastream_bpe5000.vocab,
+    )
+    greedy_search_helper(training_name=training_name, asr_model=asr_model, decoder_config=greedy_decoder_config)
diff --git a/example_setups/librispeech/ctc_rnnt_standalone_2024/experiments/ctc_phon/baseline.py b/example_setups/librispeech/ctc_rnnt_standalone_2024/experiments/ctc_phon/baseline.py
@@ -195,6 +195,7 @@ def tune_and_evaluate_helper(
         "max_seq_length": {"audio_features": 35 * 16000},
         "accum_grad_multiple_step": 1,
         "torch_amp_options": {"dtype": "bfloat16"},
+        "gradient_clip": 1.0,
     }
 
     network_module = "ctc.conformer_1023.i6modelsV1_VGG4LayerActFrontendV1_v6"

diff --git a/...ps/librispeech/ctc_rnnt_standalone_2024/pytorch_networks/ctc/decoder/greedy_bpe_ctc_v3.py b/...ps/librispeech/ctc_rnnt_standalone_2024/pytorch_networks/ctc/decoder/greedy_bpe_ctc_v3.py
@@ -0,0 +1,87 @@
+"""
+Greedy CTC decoder without any extras
+
+v3: add config objects
+"""
+from dataclasses import dataclass
+import time
+import torch
+
+
+@dataclass
+class DecoderConfig:
+    """
+    Settings of the greedy CTC decoder, built in ``forward_init_hook`` from the ``config`` keyword argument.
+    """
+
+    # vocabulary file, passed to RETURNN's ``Vocabulary.create_vocab`` as ``vocab_file``
+    returnn_vocab: str
+
+
+@dataclass
+class ExtraConfig:
+    """
+    Optional logging switches, built in ``forward_init_hook`` from the ``extra_config`` keyword argument.
+    """
+
+    # used for RTF logging
+    # enables the "Batch-time"/"Total-time" real-time-factor printouts
+    print_rtf: bool = True
+    # sample rate of the input audio (presumably samples per second - confirm against the data pipeline)
+    sample_rate: int = 16000
+
+    # Hypothesis logging
+    # when set, every recognized text is printed to stdout
+    print_hypothesis: bool = True
+
+
+def forward_init_hook(run_ctx, **kwargs):
+    """
+    Prepare the run context for greedy CTC search.
+
+    Loads the label vocabulary, opens the recognition output file ``search_out.py``
+    (written as a Python dict literal and closed again in ``forward_finish_hook``)
+    and copies the logging options onto ``run_ctx``.
+
+    :param run_ctx: mutable context object shared between the forward hooks
+    :param kwargs: must contain ``config`` (dict with the ``DecoderConfig`` fields) and may
+        contain ``extra_config`` (dict with ``ExtraConfig`` fields; missing or ``None`` means defaults)
+    """
+    config = DecoderConfig(**kwargs["config"])
+    # `or {}` also covers an explicitly passed `extra_config=None`
+    extra_config = ExtraConfig(**(kwargs.get("extra_config") or {}))
+
+    from returnn.datasets.util.vocabulary import Vocabulary
+
+    # load the vocabulary before opening the output file, so that a failure here
+    # does not leave a dangling open file handle behind
+    vocab = Vocabulary.create_vocab(vocab_file=config.returnn_vocab, unknown_label=None)
+    run_ctx.labels = vocab.labels
+
+    # intentionally kept open across hooks: forward_step appends one entry per sequence
+    run_ctx.recognition_file = open("search_out.py", "wt")
+    run_ctx.recognition_file.write("{\n")
+
+    run_ctx.print_rtf = extra_config.print_rtf
+    # make the configured sample rate available to the other hooks instead of dropping it
+    run_ctx.sample_rate = extra_config.sample_rate
+    if run_ctx.print_rtf:
+        # accumulators for the overall real-time factor
+        run_ctx.running_audio_len_s = 0
+        run_ctx.total_time = 0
+
+    run_ctx.print_hypothesis = extra_config.print_hypothesis
+
+
+def forward_finish_hook(run_ctx, **kwargs):
+    """
+    Finalize the recognition output and, if enabled, print the overall timing statistics.
+
+    :param run_ctx: context object prepared by ``forward_init_hook``
+    :param kwargs: unused
+    """
+    # terminate the dict literal that was opened in forward_init_hook
+    run_ctx.recognition_file.write("}\n")
+    run_ctx.recognition_file.close()
+
+    # the timing accumulators are only created when RTF logging is enabled,
+    # so they must not be touched otherwise
+    if not run_ctx.print_rtf:
+        return
+
+    total_time = run_ctx.total_time
+    audio_len_s = run_ctx.running_audio_len_s
+    # no audio processed at all: report NaN instead of failing with a division by zero
+    overall_rtf = total_time / audio_len_s if audio_len_s > 0 else float("nan")
+    print("Total-time: %.2f, Batch-RTF: %.3f" % (total_time, overall_rtf))
+
+
+def forward_step(*, model, data, run_ctx, **kwargs):
+    """
+    Greedy CTC decoding of a single batch.
+
+    For every sequence the frame-wise argmax is taken, consecutive repetitions are collapsed,
+    indices outside of the vocabulary and special labels are dropped, and the remaining BPE
+    tokens are merged into text which is appended to ``run_ctx.recognition_file``.
+
+    :param model: callable taking ``raw_audio`` and ``raw_audio_len`` and returning
+        ``(logprobs, audio_features_len)``
+    :param data: batch dict with the keys ``raw_audio``, ``raw_audio:size1`` and ``seq_tag``
+    :param run_ctx: context object prepared by ``forward_init_hook``
+    :param kwargs: unused
+    """
+    raw_audio = data["raw_audio"]  # [B, T', F]
+    raw_audio_len = data["raw_audio:size1"]  # [B]
+
+    if run_ctx.print_rtf:
+        # Only compute the audio duration when it is actually reported.
+        # Falls back to 16 kHz if the run context carries no sample rate.
+        sample_rate = getattr(run_ctx, "sample_rate", 16000)
+        audio_len_batch = torch.sum(raw_audio_len).item() / sample_rate
+        run_ctx.running_audio_len_s += audio_len_batch
+        am_start = time.time()
+
+    logprobs, audio_features_len = model(
+        raw_audio=raw_audio,
+        raw_audio_len=raw_audio_len,
+    )
+    # per sequence: cut to its valid length, take the best label per frame, collapse repetitions
+    batch_indices = [
+        torch.unique_consecutive(torch.argmax(seq_logprobs[:seq_len], dim=-1), dim=0).detach().cpu().numpy()
+        for seq_logprobs, seq_len in zip(logprobs, audio_features_len)
+    ]
+
+    if run_ctx.print_rtf:
+        am_time = time.time() - am_start
+        run_ctx.total_time += am_time
+        # an empty batch has no duration: report NaN instead of dividing by zero
+        batch_rtf = am_time / audio_len_batch if audio_len_batch > 0 else float("nan")
+        print("Batch-time: %.2f, Batch-RTF: %.3f" % (am_time, batch_rtf))
+
+    tags = data["seq_tag"]
+    num_labels = len(run_ctx.labels)
+
+    for indices, tag in zip(batch_indices, tags):
+        # indices beyond the vocabulary are skipped (presumably the CTC blank - confirm against the model)
+        sequence = [run_ctx.labels[idx] for idx in indices if idx < num_labels]
+        # drop special labels such as "<...>" or "[...]"
+        sequence = [s for s in sequence if not s.startswith(("<", "["))]
+        # "@@ " marks a BPE token that continues into the next one
+        text = " ".join(sequence).replace("@@ ", "")
+        if run_ctx.print_hypothesis:
+            print(text)
+        run_ctx.recognition_file.write("%s: %s,\n" % (repr(tag), repr(text)))
diff --git a/users/berger/args/experiments/ctc.py b/users/berger/args/experiments/ctc.py
@@ -67,7 +67,7 @@ def get_ctc_recog_step_args(num_classes: int, reduction_factor: int = 4, **kwarg
             "mem_rqmt": 16,
         },
         "rtf": 20,
-        "mem": 4,
+        "mem": 8,
     }
 
     return recursive_update(default_args, kwargs)

diff --git a/users/berger/args/experiments/transducer.py b/users/berger/args/experiments/transducer.py
@@ -68,7 +68,7 @@ def get_transducer_recog_step_args(
             "mem_rqmt": 16,
         },
         "rtf": 50,
-        "mem": 4,
+        "mem": 8,
     }
 
     return recursive_update(default_args, kwargs)

diff --git a/users/berger/args/jobs/rasr_init_args.py b/users/berger/args/jobs/rasr_init_args.py
@@ -91,6 +91,7 @@ def get_feature_extraction_args_16kHz(
     gt_args: Optional[Dict] = None,
 ) -> Dict:
     mfcc_filter_width = features.filter_width_from_channels(channels=20, f_max=8000)  # = 16000 / 2
+    filterbank_filter_width = features.filter_width_from_channels(channels=80, f_max=8000)  # = 16000 / 2
 
     if mfcc_cepstrum_options is None:
         mfcc_cepstrum_options = {
@@ -142,6 +143,30 @@ def get_feature_extraction_args_16kHz(
                 "normalization_options": {},
             }
         },
+        "filterbank": {
+            "filterbank_options": {
+                "warping_function": "mel",
+                "filter_width": filterbank_filter_width,
+                "normalize": False,
+                "normalization_options": {},
+                "without_samples": False,
+                "samples_options": {
+                    "audio_format": "wav",
+                    # "scale_input": 2**-15,
+                    "dc_detection": dc_detection,
+                },
+                "fft_options": {
+                    "preemphasis": 0.97,
+                    "window_type": "hanning",
+                    "window_shift": 0.01,
+                    "window_length": 0.025,
+                },
+                "apply_log": True,
+                "add_epsilon": True,
+                "add_features_output": True,
+                # "warp_differential_unit": False,
+            },
+        },
         "energy": {
             "energy_options": {
                 "without_samples": False,

diff --git a/users/berger/args/returnn/config.py b/users/berger/args/returnn/config.py
@@ -21,6 +21,7 @@ def get_base_config(backend: Backend) -> Dict[str, Any]:
     elif backend == Backend.PYTORCH:
         result["backend"] = "torch"
         result["use_lovely_tensors"] = True
+        # result["torch_amp"] = {"dtype": "bfloat16"}
     else:
         raise NotImplementedError
     return result

diff --git a/users/berger/args/returnn/learning_rates.py b/users/berger/args/returnn/learning_rates.py
@@ -10,6 +10,7 @@ class LearningRateSchedules(Enum):
     NewbobAbs = auto()
     OCLR = auto()
     OCLR_STEP = auto()
+    OCLR_STEP_TORCH = auto()
     CONST_DECAY = auto()
     CONST_DECAY_STEP = auto()
 
@@ -38,6 +39,8 @@ def get_learning_rate_config(
         config.update(get_oclr_config(**kwargs))
     elif schedule == LearningRateSchedules.OCLR_STEP:
         extra_python.append(get_oclr_function(**kwargs))
+    elif schedule == LearningRateSchedules.OCLR_STEP_TORCH:
+        extra_python.append(get_oclr_function_torch(**kwargs))
     elif schedule == LearningRateSchedules.CONST_DECAY:
         config.update(get_const_decay_config(**kwargs))
     elif schedule == LearningRateSchedules.CONST_DECAY_STEP:
@@ -184,6 +187,58 @@ def get_oclr_function(
     )
 
 
+def get_oclr_function_torch(
+    num_epochs: int,
+    n_steps_per_epoch: int,
+    peak_lr: float = 1e-03,
+    inc_epochs: Optional[int] = None,
+    dec_epochs: Optional[int] = None,
+    initial_lr: Optional[float] = None,
+    decayed_lr: Optional[float] = None,
+    final_lr: Optional[float] = None,
+    **kwargs,
+) -> str:
+    """
+    Generate the source code of a step-based one-cycle learning rate function.
+
+    The returned string defines ``dynamic_learning_rate(*, global_train_step, **_)`` with all
+    schedule parameters baked in as literals.
+
+    :param num_epochs: total number of epochs of the schedule
+    :param n_steps_per_epoch: number of train steps that make up one epoch
+    :param peak_lr: learning rate at the end of the increase phase
+    :param inc_epochs: length of the increase phase, defaults to ``(num_epochs * 9) // 20``
+    :param dec_epochs: length of the decay phase, defaults to ``inc_epochs``
+    :param initial_lr: learning rate at step 0, defaults to ``peak_lr / 10``
+    :param decayed_lr: learning rate at the end of the decay phase, defaults to ``initial_lr``
+    :param final_lr: lower bound reached in the last phase, defaults to ``initial_lr / 5``
+    :param kwargs: ignored
+    :return: Python source code of the learning rate function
+    """
+    # Compare against None instead of relying on `or`: 0 and 0.0 are meaningful explicit values
+    # (e.g. warm-up from zero or decay to zero) and must not be replaced by the defaults.
+    if initial_lr is None:
+        initial_lr = peak_lr / 10
+    if decayed_lr is None:
+        decayed_lr = initial_lr
+    if final_lr is None:
+        final_lr = initial_lr / 5
+    if inc_epochs is None:
+        inc_epochs = (num_epochs * 9) // 20
+    if dec_epochs is None:
+        dec_epochs = inc_epochs
+
+    # NOTE(review): the generated text differs from the previous version in the three
+    # step-size lines - confirm that nothing downstream depends on the exact string.
+    return dedent(
+        f"""def dynamic_learning_rate(*, global_train_step: int, **_):
+            # Increase linearly from initial_lr to peak_lr over the first inc_epoch epochs
+            # Decrease linearly from peak_lr to decayed_lr over the next dec_epoch epochs
+            # Decrease linearly from decayed_lr to final_lr over the remaining epochs
+            initial_lr = {initial_lr}
+            peak_lr = {peak_lr}
+            decayed_lr = {decayed_lr}
+            final_lr = {final_lr}
+            inc_epochs = {inc_epochs}
+            dec_epochs = {dec_epochs}
+            total_epochs = {num_epochs}
+            n_steps_per_epoch = {n_steps_per_epoch}
+
+            # -- derived -- #
+            steps_increase = inc_epochs * n_steps_per_epoch
+            steps_decay = dec_epochs * n_steps_per_epoch
+            steps_final = (total_epochs - inc_epochs - dec_epochs) * n_steps_per_epoch
+
+            # max(..., 1) keeps phases without any steps from causing a division by zero
+            step_size_increase = (peak_lr - initial_lr) / max(steps_increase, 1)
+            step_size_decay = (peak_lr - decayed_lr) / max(steps_decay, 1)
+            step_size_final = (decayed_lr - final_lr) / max(steps_final, 1)
+
+            if global_train_step <= steps_increase:
+                return initial_lr + step_size_increase * global_train_step
+            if global_train_step <= steps_increase + steps_decay:
+                return peak_lr - step_size_decay * (global_train_step - steps_increase)
+            
+            return max(
+                decayed_lr - step_size_final * (global_train_step - steps_increase - steps_decay),
+                final_lr
+            )"""
+    )
+
+
 def get_const_decay_config(
     num_epochs: int,
     const_lr: float = 1e-03,

diff --git a/users/berger/args/returnn/regularization.py b/users/berger/args/returnn/regularization.py
@@ -13,7 +13,7 @@ def get_chunking_config(
 
     if isinstance(chunking_factors, list):
         chunking_factors = {key: 1 for key in chunking_factors}
-    assert isinstance(chunking_factors, Dict)
+    assert isinstance(chunking_factors, dict)
     return {
         "chunking": (
             {key: base_chunk_size // factor for key, factor in chunking_factors.items()},

diff --git a/users/berger/configs/librispeech/20230210_baselines/__init__.py b/users/berger/configs/librispeech/20230210_baselines/__init__.py
@@ -15,6 +15,7 @@
 from .config_02c_transducer_rasr_features_wei_lex import py as py_02c
 from .config_02e_transducer_rasr_features_tinaconf import py as py_02e
 from .config_02e_transducer_rasr_features_tinaconf_rtf import py as py_02e_rtf
+from .config_02f_transducer_rasr_features_am_scales import py as py_02f
 from .config_03a_transducer_fullsum_raw_samples import py as py_03a
 from .config_03b_transducer_fullsum_rasr_features import py as py_03b
 from .config_03c_transducer_fullsum_rasr_features_wei_lex import py as py_03c
@@ -37,6 +38,7 @@ def main() -> SummaryReport:
     sub_reports.append(copy.deepcopy(py_02c()[0]))
     sub_reports.append(copy.deepcopy(py_02e()))
     sub_reports.append(copy.deepcopy(py_02e_rtf()))
+    sub_reports.append(copy.deepcopy(py_02f()))
     sub_reports.append(copy.deepcopy(py_03a()))
     sub_reports.append(copy.deepcopy(py_03b()))
     sub_reports.append(copy.deepcopy(py_03c()))

diff --git a/users/berger/configs/librispeech/20230210_baselines/config_01a_ctc_blstm_raw_samples.py b/users/berger/configs/librispeech/20230210_baselines/config_01a_ctc_blstm_raw_samples.py
@@ -138,8 +138,8 @@ def run_exp() -> Tuple[SummaryReport, Checkpoint, Dict[str, AlignmentData]]:
     )
 
     recog_args = exp_args.get_ctc_recog_step_args(num_classes)
-    align_args = exp_args.get_ctc_align_step_args(num_classes)
-    recog_args["epochs"] = [320, 400, 480, 500, "best"]
+    align_args = exp_args.get_ctc_align_step_args(num_classes, epoch=500)
+    recog_args["epochs"] = [320, 500, "best"]
     recog_args["prior_scales"] = [0.3]
     recog_args["lm_scales"] = [0.9]
 

diff --git a/users/berger/configs/librispeech/20230210_baselines/config_01c_ctc_conformer_raw_samples.py b/users/berger/configs/librispeech/20230210_baselines/config_01c_ctc_conformer_raw_samples.py
@@ -142,8 +142,8 @@ def run_exp() -> Tuple[SummaryReport, Checkpoint, Dict[str, AlignmentData]]:
     )
 
     recog_args = exp_args.get_ctc_recog_step_args(num_classes)
-    align_args = exp_args.get_ctc_align_step_args(num_classes)
-    recog_args["epochs"] = [320, 400, 480, 500, "best"]
+    align_args = exp_args.get_ctc_align_step_args(num_classes, epoch=500)
+    recog_args["epochs"] = [320, 500, "best"]
     recog_args["prior_scales"] = [0.3]
     recog_args["lm_scales"] = [0.9]
 

diff --git a/...s/berger/configs/librispeech/20230210_baselines/config_01d_ctc_conformer_rasr_features.py b/...s/berger/configs/librispeech/20230210_baselines/config_01d_ctc_conformer_rasr_features.py
@@ -133,6 +133,7 @@ def run_exp() -> Tuple[SummaryReport, Checkpoint, Dict[str, AlignmentData]]:
     recog_args["feature_type"] = FeatureType.GAMMATONE_16K
     recog_args["prior_scales"] = [0.3]
     recog_args["lm_scales"] = [0.9]
+    recog_args["search_stats"] = True
     align_args["feature_type"] = FeatureType.GAMMATONE_16K
 
     # ********** System **********
@@ -146,6 +147,7 @@ def run_exp() -> Tuple[SummaryReport, Checkpoint, Dict[str, AlignmentData]]:
             SummaryKey.PRIOR,
             SummaryKey.LM,
             SummaryKey.WER,
+            SummaryKey.RTF,
             SummaryKey.SUB,
             SummaryKey.INS,
             SummaryKey.DEL,

diff --git a/users/berger/configs/librispeech/20230210_baselines/config_02a_transducer_raw_samples.py b/users/berger/configs/librispeech/20230210_baselines/config_02a_transducer_raw_samples.py
@@ -47,10 +47,7 @@ def generate_returnn_config(
     **kwargs,
 ) -> ReturnnConfig:
     if train:
-        (
-            network_dict,
-            extra_python,
-        ) = transducer_model.make_context_1_conformer_transducer(
+        (network_dict, extra_python,) = transducer_model.make_context_1_conformer_transducer(
             num_outputs=num_classes,
             gt_args={
                 "sample_rate": 16000,
@@ -87,10 +84,7 @@ def generate_returnn_config(
             },
         )
     else:
-        (
-            network_dict,
-            extra_python,
-        ) = transducer_model.make_context_1_conformer_transducer_recog(
+        (network_dict, extra_python,) = transducer_model.make_context_1_conformer_transducer_recog(
             num_outputs=num_classes,
             gt_args={
                 "sample_rate": 16000,