From 3b45a1634819b31163e22e8ce92aa67f1e7ae8aa Mon Sep 17 00:00:00 2001 From: Judyxujj Date: Wed, 15 May 2024 17:26:48 +0800 Subject: [PATCH 001/227] Add swb PyTorch ctc setup (#219) * add initial setup * rm binary files * rm binary files --------- Co-authored-by: Jingjing Xu --- .../ctc/lbs_960/configs/baseline_config.py | 2 +- users/jxu/experiments/ctc/swb/__init__.py | 0 .../experiments/ctc/swb/configs/__init__.py | 0 .../ctc/swb/configs/baseline/__init__.py | 0 .../baseline/config_01_ctc_torch_conformer.py | 168 +++++++++++ .../baseline/config_01b_ctc_conformer.py | 263 ++++++++++++++++++ .../jxu/experiments/ctc/swb/data/__init__.py | 0 .../jxu/experiments/ctc/swb/data/ctc_data.py | 161 +++++++++++ users/jxu/experiments/ctc/swb/data/data.py | 160 +++++++++++ users/jxu/experiments/ctc/swb/data/lm_data.py | 13 + ...er_ctc_d_model_386_num_layers_12_logmel.py | 262 +++++++++++++++++ .../train_steps/baseline_ctc.py | 55 ++++ .../ctc/swb/pytorch_networks/util.py | 14 + .../._i6_conformer_downsample_3.py | Bin 4096 -> 0 bytes 14 files changed, 1097 insertions(+), 1 deletion(-) create mode 100644 users/jxu/experiments/ctc/swb/__init__.py create mode 100644 users/jxu/experiments/ctc/swb/configs/__init__.py create mode 100644 users/jxu/experiments/ctc/swb/configs/baseline/__init__.py create mode 100644 users/jxu/experiments/ctc/swb/configs/baseline/config_01_ctc_torch_conformer.py create mode 100644 users/jxu/experiments/ctc/swb/configs/baseline/config_01b_ctc_conformer.py create mode 100644 users/jxu/experiments/ctc/swb/data/__init__.py create mode 100644 users/jxu/experiments/ctc/swb/data/ctc_data.py create mode 100644 users/jxu/experiments/ctc/swb/data/data.py create mode 100644 users/jxu/experiments/ctc/swb/data/lm_data.py create mode 100644 users/jxu/experiments/ctc/swb/pytorch_networks/baseline/conformer_ctc_d_model_386_num_layers_12_logmel.py create mode 100644 users/jxu/experiments/ctc/swb/pytorch_networks/train_steps/baseline_ctc.py create mode 100644 
users/jxu/experiments/ctc/swb/pytorch_networks/util.py delete mode 100644 users/jxu/experiments/hybrid/switchboard/pytorch_networks/._i6_conformer_downsample_3.py diff --git a/users/jxu/experiments/ctc/lbs_960/configs/baseline_config.py b/users/jxu/experiments/ctc/lbs_960/configs/baseline_config.py index 9b684a96b..8d9f991a5 100644 --- a/users/jxu/experiments/ctc/lbs_960/configs/baseline_config.py +++ b/users/jxu/experiments/ctc/lbs_960/configs/baseline_config.py @@ -33,7 +33,7 @@ # tools.rasr_binary_path = tk.Path("/u/berger/repositories/rasr_versions/onnx/arch/linux-x86_64-standard") # tools.returnn_root = tk.Path("/u/berger/repositories/MiniReturnn") SCTK_BINARY_PATH = compile_sctk() # use last published version -SCTK_BINARY_PATH.hash_overwrite = "LBS_DEFAULT_SCTK_BINARY_PATH" +SCTK_BINARY_PATH.hash_overwrite = "SWB_DEFAULT_SCTK_BINARY_PATH" # ********** Return Config generators ********** diff --git a/users/jxu/experiments/ctc/swb/__init__.py b/users/jxu/experiments/ctc/swb/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/users/jxu/experiments/ctc/swb/configs/__init__.py b/users/jxu/experiments/ctc/swb/configs/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/users/jxu/experiments/ctc/swb/configs/baseline/__init__.py b/users/jxu/experiments/ctc/swb/configs/baseline/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/users/jxu/experiments/ctc/swb/configs/baseline/config_01_ctc_torch_conformer.py b/users/jxu/experiments/ctc/swb/configs/baseline/config_01_ctc_torch_conformer.py new file mode 100644 index 000000000..fdbe2c492 --- /dev/null +++ b/users/jxu/experiments/ctc/swb/configs/baseline/config_01_ctc_torch_conformer.py @@ -0,0 +1,168 @@ +import copy +from typing import Dict, Tuple + +from i6_core.returnn.config import ReturnnConfig +import i6_core.rasr as rasr +from i6_core.returnn import Checkpoint +from i6_core.recognition import Hub5ScoreJob +from i6_experiments.common.tools.sctk import 
compile_sctk + +from i6_experiments.users.berger.args.experiments import ctc as exp_args +from i6_experiments.users.berger.args.returnn.config import get_returnn_config, Backend +from i6_experiments.users.berger.util import default_tools_v2 +from i6_experiments.users.berger.args.returnn.learning_rates import LearningRateSchedules, Optimizers +from i6_experiments.users.berger.systems.dataclasses import AlignmentData, FeatureType, ReturnnConfigs, ConfigVariant + +from i6_experiments.users.berger.recipe.summary.report import SummaryReport +import i6_experiments.users.jxu.experiments.ctc.swb.pytorch_networks.baseline.conformer_ctc_d_model_386_num_layers_12_logmel as conformer_ctc +from i6_experiments.users.jxu.experiments.ctc.swb.data.ctc_data import get_switchboard_data +from i6_experiments.users.berger.systems.returnn_seq2seq_system import ( + ReturnnSeq2SeqSystem, +) +from sisyphus import gs, tk + + +# ********** Settings ********** +rasr.flow.FlowNetwork.default_flags = {"cache_mode": "task_dependent"} + +num_classes = 88 +num_subepochs = 600 + +tools = copy.deepcopy(default_tools_v2) +tools.rasr_binary_path = tk.Path( + "/u/berger/repositories/rasr_versions/gen_seq2seq_dev/arch/linux-x86_64-standard", + hash_overwrite="/u/berger/repositories/rasr_versions/gen_seq2seq_dev/arch/linux-x86_64-standard" +) +tools.returnn_root = tk.Path("/u/jxu/setups/tedlium2/2023-07-11--ctc-tedlium/tools/20240509_returnn/returnn", + hash_overwrite="/u/berger/repositories/returnn") +SCTK_BINARY_PATH = compile_sctk() # use last published version +SCTK_BINARY_PATH.hash_overwrite = "LBS_DEFAULT_SCTK_BINARY_PATH" + +# ********** Return Config generators ********** + + +def generate_returnn_config(variant: ConfigVariant, train_data_config: dict, dev_data_config: dict, lr: dict, + batch_size: int, network_args:dict) -> ReturnnConfig: + network_args["num_outputs"] = num_classes + model_config = conformer_ctc.get_default_config_v1(**network_args) + + extra_config = { + "train": 
train_data_config, + "dev": dev_data_config, + } + if variant == ConfigVariant.RECOG: + extra_config["model_outputs"] = {"classes": {"dim": num_classes}} + + return get_returnn_config( + num_epochs=num_subepochs, + num_inputs=1, + num_outputs=num_classes, + target="targets", + extra_python=[conformer_ctc.get_serializer(model_config, variant=variant)], + extern_data_config=True, + backend=Backend.PYTORCH, + grad_noise=0.0, + grad_clip=0.0, + optimizer=Optimizers.AdamW, + schedule=LearningRateSchedules.OCLR, + max_seqs=60, + initial_lr=lr["initial_lr"], + peak_lr=lr["peak_lr"], + final_lr=lr["final_lr"], + batch_size=batch_size, + use_chunking=False, + extra_config=extra_config, + ) + + +def run_exp() -> Tuple[SummaryReport, Checkpoint, Dict[str, AlignmentData]]: + assert tools.returnn_root is not None + assert tools.returnn_python_exe is not None + assert tools.rasr_binary_path is not None + + data = get_switchboard_data( + returnn_root=tools.returnn_root, + feature_type=FeatureType.SAMPLES, + augmented_lexicon=True, + test_keys=["hub5e01"], + ) + + # ********** Step args ********** + + train_args = exp_args.get_ctc_train_step_args( + num_epochs=300, + # gpu_mem_rqmt=11, + ) + + recog_args = exp_args.get_ctc_recog_step_args(num_classes) + align_args = exp_args.get_ctc_align_step_args(num_classes) + recog_args["epochs"] = [160, 300, "best"] + recog_args["feature_type"] = FeatureType.SAMPLES + recog_args["prior_scales"] = [0.3, 0.5] + recog_args["lm_scales"] = [0.5, 0.7, 0.9] + align_args["feature_type"] = FeatureType.SAMPLES + + recog_am_args = copy.deepcopy(exp_args.ctc_recog_am_args) + recog_am_args.update( + { + "tying_type": "global-and-nonword", + "nonword_phones": ["[NOISE]", "[VOCALIZEDNOISE]", "[LAUGHTER]"], + } + ) + # ********** System ********** + + system = ReturnnSeq2SeqSystem(tools) + + system.init_corpora( + dev_keys=data.dev_keys, + test_keys=data.test_keys, + align_keys=data.align_keys, + corpus_data=data.data_inputs, + am_args=recog_am_args, + ) + 
system.setup_scoring( + scorer_type=Hub5ScoreJob, + # stm_kwargs={"non_speech_tokens": ["[NOISE]", "[LAUGHTER]", "[VOCALIZED-NOISE]"]}, + stm_kwargs={"stm_paths": {key: tk.Path("/u/corpora/speech/hub5e_00/xml/hub5e_00.stm") for key in data.dev_keys}}, + score_kwargs={ + "glm": tk.Path("/u/corpora/speech/hub5e_00/xml/glm"), + # "glm": tk.Path("/u/corpora/speech/hub-5-00/raw/transcriptions/reference/en20000405_hub5.glm"), + }, + ) + + # ********** Returnn Configs ********** + + network_args = {} + lr ={"initial_lr": 7e-6, "peak_lr": 7e-4, "final_lr": 1e-7} + + for ordering in [ + # "laplace:.1000", + "laplace:.384", + # "laplace:.100", + # "laplace:.50", + # "laplace:.25", + # "laplace:.10", + # "random", + ]: + mod_train_data_config = copy.deepcopy(data.train_data_config) + mod_train_data_config["seq_ordering"] = ordering + + train_config = generate_returnn_config( + ConfigVariant.TRAIN, train_data_config=data.train_data_config, dev_data_config=data.cv_data_config, lr=lr, batch_size=18000*160, network_args=network_args + ) + recog_config = generate_returnn_config( + ConfigVariant.RECOG, train_data_config=data.train_data_config, dev_data_config=data.cv_data_config, lr=lr, batch_size=18000*160, network_args=network_args + ) + + returnn_configs = ReturnnConfigs( + train_config=train_config, + recog_configs={"recog": recog_config}, + ) + + system.add_experiment_configs(f"Conformer_CTC_order-{ordering}", returnn_configs) + + system.run_train_step(**train_args) + system.run_dev_recog_step(**recog_args) + # system.run_test_recog_step(**recog_args) + assert system.summary_report + return system.summary_report diff --git a/users/jxu/experiments/ctc/swb/configs/baseline/config_01b_ctc_conformer.py b/users/jxu/experiments/ctc/swb/configs/baseline/config_01b_ctc_conformer.py new file mode 100644 index 000000000..ff2d7734c --- /dev/null +++ b/users/jxu/experiments/ctc/swb/configs/baseline/config_01b_ctc_conformer.py @@ -0,0 +1,263 @@ +import copy +import os +from typing import 
Dict, Tuple + +import i6_core.rasr as rasr +from i6_core.recognition import Hub5ScoreJob +from i6_core.returnn import Checkpoint +from i6_core.returnn.config import ReturnnConfig +from i6_experiments.users.berger.args.experiments import ctc as exp_args +from i6_experiments.users.berger.args.returnn.config import get_returnn_config +from i6_experiments.users.berger.args.returnn.learning_rates import ( + LearningRateSchedules, +) +import i6_experiments.users.berger.network.models.fullsum_ctc as ctc_model +from i6_experiments.users.berger.recipe.summary.report import SummaryReport +from i6_experiments.users.berger.systems.returnn_seq2seq_system import ( + ReturnnSeq2SeqSystem, +) +from i6_experiments.users.berger.systems.dataclasses import AlignmentData, FeatureType, ReturnnConfigs +from i6_experiments.users.berger.util import default_tools_v2 as tools +from i6_private.users.vieting.helpers.returnn import serialize_dim_tags +from i6_experiments.users.berger.corpus.switchboard.ctc_data import ( + get_switchboard_data, +) +from sisyphus import gs, tk + +# ********** Settings ********** + +rasr.flow.FlowNetwork.default_flags = {"cache_mode": "task_dependent"} + + +num_classes = 88 + + +# ********** Return Config generators ********** + + +def generate_returnn_config( + train: bool, + *, + loss_corpus: tk.Path, + loss_lexicon: tk.Path, + am_args: dict, + train_data_config: dict, + dev_data_config: dict, + num_epochs: int = 300, +) -> ReturnnConfig: + if train: + network_dict, extra_python = ctc_model.make_conformer_fullsum_ctc_model( + num_outputs=num_classes, + specaug_args={ + "max_time_num": 1, + "max_time": 15, + "max_feature_num": 5, + "max_feature": 4, + }, + conformer_args={ + "num_blocks": 12, + "size": 512, + "dropout": 0.1, + "l2": 1e-04, + }, + output_args={ + "rasr_binary_path": tools.rasr_binary_path, + "loss_corpus_path": loss_corpus, + "loss_lexicon_path": loss_lexicon, + "am_args": am_args, + }, + ) + else: + network_dict, extra_python = 
ctc_model.make_conformer_ctc_recog_model( + num_outputs=num_classes, + conformer_args={ + "num_blocks": 12, + "size": 512, + }, + ) + + returnn_config = get_returnn_config( + network=network_dict, + target=None, + num_epochs=num_epochs, + num_inputs=40, + python_prolog=[ + "import sys", + "sys.setrecursionlimit(10 ** 6)", + ], + extra_python=extra_python, + extern_data_config=True, + grad_noise=0.0, + grad_clip=0.0, + schedule=LearningRateSchedules.OCLR, + initial_lr=1e-05, + peak_lr=4e-04, + final_lr=1e-05, + cycle_epoch=160 if num_epochs == 400 else 135, + batch_size=10000, + use_chunking=False, + extra_config={ + "train": train_data_config, + "dev": dev_data_config, + }, + ) + returnn_config = serialize_dim_tags(returnn_config) + + return returnn_config + + +def run_exp() -> Tuple[SummaryReport, Checkpoint, Dict[str, AlignmentData]]: + assert tools.returnn_root is not None + assert tools.returnn_python_exe is not None + assert tools.rasr_binary_path is not None + + data = get_switchboard_data( + returnn_root=tools.returnn_root, + returnn_python_exe=tools.returnn_python_exe, + rasr_binary_path=tools.rasr_binary_path, + feature_type=FeatureType.GAMMATONE_8K, + augmented_lexicon=True, + test_keys=["hub5e01"], + ) + + # ********** Step args ********** + + train_args = exp_args.get_ctc_train_step_args( + num_epochs=300, + # gpu_mem_rqmt=11, + ) + + recog_args = exp_args.get_ctc_recog_step_args(num_classes) + align_args = exp_args.get_ctc_align_step_args(num_classes) + recog_args["epochs"] = [160, 300, "best"] + recog_args["feature_type"] = FeatureType.GAMMATONE_8K + recog_args["prior_scales"] = [0.3, 0.5] + recog_args["lm_scales"] = [0.5, 0.7, 0.9] + align_args["feature_type"] = FeatureType.GAMMATONE_8K + + recog_am_args = copy.deepcopy(exp_args.ctc_recog_am_args) + recog_am_args.update( + { + "tying_type": "global-and-nonword", + "nonword_phones": ["[NOISE]", "[VOCALIZEDNOISE]", "[LAUGHTER]"], + } + ) + loss_am_args = copy.deepcopy(exp_args.ctc_loss_am_args) + 
loss_am_args.update( + { + "state_tying": "lookup", + "state_tying_file": tk.Path("/work/asr4/berger/dependencies/switchboard/state_tying/eow-state-tying"), + "tying_type": "global-and-nonword", + "nonword_phones": ["[NOISE]", "[VOCALIZEDNOISE]", "[LAUGHTER]"], + "phon_history_length": 0, + "phon_future_length": 0, + } + ) + + # ********** System ********** + + system = ReturnnSeq2SeqSystem(tools) + + system.init_corpora( + dev_keys=data.dev_keys, + test_keys=data.test_keys, + align_keys=data.align_keys, + corpus_data=data.data_inputs, + am_args=recog_am_args, + ) + system.setup_scoring( + scorer_type=Hub5ScoreJob, + # stm_kwargs={"non_speech_tokens": ["[NOISE]", "[LAUGHTER]", "[VOCALIZED-NOISE]"]}, + stm_kwargs={"stm_paths": {key: tk.Path("/u/corpora/speech/hub5e_00/xml/hub5e_00.stm") for key in data.dev_keys}}, + score_kwargs={ + "glm": tk.Path("/u/corpora/speech/hub5e_00/xml/glm"), + # "glm": tk.Path("/u/corpora/speech/hub-5-00/raw/transcriptions/reference/en20000405_hub5.glm"), + }, + ) + + # ********** Returnn Configs ********** + + config_generator_kwargs = { + "loss_corpus": data.loss_corpus, + "loss_lexicon": data.loss_lexicon, + "am_args": loss_am_args, + "dev_data_config": data.cv_data_config, + } + + for ordering in [ + # "laplace:.1000", + "laplace:.384", + # "laplace:.100", + # "laplace:.50", + # "laplace:.25", + # "laplace:.10", + # "random", + ]: + mod_train_data_config = copy.deepcopy(data.train_data_config) + mod_train_data_config["seq_ordering"] = ordering + + train_config = generate_returnn_config( + train=True, train_data_config=mod_train_data_config, **config_generator_kwargs + ) + recog_config = generate_returnn_config( + train=False, train_data_config=mod_train_data_config, **config_generator_kwargs + ) + + returnn_configs = ReturnnConfigs( + train_config=train_config, + recog_configs={"recog": recog_config}, + ) + + system.add_experiment_configs(f"Conformer_CTC_order-{ordering}", returnn_configs) + + system.run_train_step(**train_args) + 
system.run_dev_recog_step(**recog_args) + # system.run_test_recog_step(**recog_args) + alignments = next( + iter(system.run_align_step(exp_names=["Conformer_CTC_order-laplace:.384"], **align_args).values()) + ) + + model = system.get_train_job("Conformer_CTC_order-laplace:.384").out_checkpoints[300] + assert isinstance(model, Checkpoint) + + system.cleanup_experiments() + + mod_train_data_config = copy.deepcopy(data.train_data_config) + mod_train_data_config["seq_ordering"] = "laplace:.384" + train_config = generate_returnn_config( + train=True, + train_data_config=mod_train_data_config, + num_epochs=400, + **config_generator_kwargs, + ) + recog_config = generate_returnn_config( + train=False, train_data_config=mod_train_data_config, **config_generator_kwargs + ) + + returnn_configs = ReturnnConfigs( + train_config=train_config, + recog_configs={"recog": recog_config}, + ) + + system.add_experiment_configs(f"Conformer_CTC_order-laplace:.384_400ep", returnn_configs) + train_args = exp_args.get_ctc_train_step_args( + num_epochs=400, + # gpu_mem_rqmt=11, + ) + system.run_train_step(**train_args) + recog_args["epochs"] = [160, 320, 400, "best"] + system.run_dev_recog_step(**recog_args) + + assert system.summary_report + return system.summary_report, model, alignments + + +def py() -> Tuple[SummaryReport, Checkpoint, Dict[str, AlignmentData]]: + filename_handle = os.path.splitext(os.path.basename(__file__))[0][len("config_") :] + gs.ALIAS_AND_OUTPUT_SUBDIR = f"{filename_handle}/" + + summary_report, model, alignments = run_exp() + + tk.register_report(f"{gs.ALIAS_AND_OUTPUT_SUBDIR}/summary.report", summary_report) + + return summary_report, model, alignments diff --git a/users/jxu/experiments/ctc/swb/data/__init__.py b/users/jxu/experiments/ctc/swb/data/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/users/jxu/experiments/ctc/swb/data/ctc_data.py b/users/jxu/experiments/ctc/swb/data/ctc_data.py new file mode 100644 index 000000000..238abd0f7 --- 
/dev/null +++ b/users/jxu/experiments/ctc/swb/data/ctc_data.py @@ -0,0 +1,161 @@ +from typing import List, Optional +import copy + +from i6_core import corpus +from i6_core.returnn.hdf import BlissToPcmHDFJob +from i6_core.lexicon.modification import AddEowPhonemesToLexiconJob +from i6_experiments.users.berger.systems.dataclasses import FeatureType +from i6_experiments.users.jxu.experiments.ctc.swb.data import data +from i6_experiments.users.berger.args.returnn.dataset import MetaDatasetBuilder, hdf_config_dict_for_files +from i6_experiments.users.berger.recipe.returnn.hdf import BlissCorpusToTargetHdfJob +from i6_experiments.users.berger.corpus.general.experiment_data import PytorchCTCSetupData + +from sisyphus import tk + + +def get_switchboard_data( + returnn_root: tk.Path, + train_key: str = "train", + cv_keys: Optional[List[str]] = None, + dev_keys: Optional[List[str]] = None, + test_keys: Optional[List[str]] = None, + feature_type: FeatureType = FeatureType.SAMPLES, + dc_detection: bool = False, + add_unknown: bool = False, + augmented_lexicon: bool = False, + **kwargs, +) -> PytorchCTCSetupData: + if cv_keys is None: + cv_keys = ["hub5e00"] + if dev_keys is None: + dev_keys = ["hub5e00"] + if test_keys is None: + test_keys = ["hub5e01", "rt03s"] + + # ********** Data inputs ********** + + train_data_inputs, cv_data_inputs, dev_data_inputs, test_data_inputs = data.get_data_inputs( + train_key=train_key, + cv_keys=cv_keys, + dev_keys=dev_keys, + test_keys=test_keys, + ctc_lexicon=True, + add_all_allophones=True, + augmented_lexicon=augmented_lexicon, + **kwargs, + ) + + # ********** Train data ********** + train_corpus_object = train_data_inputs[train_key].corpus_object + eow_lexicon = AddEowPhonemesToLexiconJob(train_data_inputs[train_key].lexicon.filename, + nonword_phones=["[NOISE]", "[VOCALIZEDNOISE]", "[LAUGHTER]"]).out_lexicon + assert train_corpus_object is not None + + if not add_unknown and not augmented_lexicon: + train_corpus_object.corpus_file = 
corpus.FilterCorpusRemoveUnknownWordSegmentsJob( + train_corpus_object.corpus_file, + eow_lexicon, + all_unknown=False, + ).out_corpus + + train_dataset_builder = MetaDatasetBuilder() + + if feature_type is not FeatureType.SAMPLES: + raise NotImplementedError("Currently only support feature types sample") + + bliss_to_pcm_hdf_job = BlissToPcmHDFJob( + train_corpus_object.corpus_file, + rounding=BlissToPcmHDFJob.RoundingScheme.rasr_compatible, + returnn_root=returnn_root, + ) + bliss_to_pcm_hdf_job.rqmt = {"cpu": 2, "mem": 8, "time": 8} + train_feature_hdf = bliss_to_pcm_hdf_job.out_hdf + + train_dataset_builder.add_dataset( + name="features", + key_mapping={"data": "data"}, + dataset_config=hdf_config_dict_for_files( + [train_feature_hdf], + { + "partition_epoch": 6, + "seq_ordering": "laplace:.1000", + }), + control=True, + ) + + train_targets_hdf = BlissCorpusToTargetHdfJob( + train_corpus_object.corpus_file, + bliss_lexicon=eow_lexicon, + returnn_root=returnn_root, + ).out_hdf + train_dataset_builder.add_dataset( + name="targets", + dataset_config=hdf_config_dict_for_files([train_targets_hdf]), + key_mapping={"data": "targets"}, + control=False, + ) + train_data_config = train_dataset_builder.get_dict() + + # ********** CV data ********** + cv_dataset_builder = MetaDatasetBuilder() + cv_feature_hdf = [] + + cv_data_inputs = copy.deepcopy(cv_data_inputs) + for key in dev_keys: + if not add_unknown: + for corpus_object in [cv_data_inputs[key].corpus_object for key in dev_keys]: + assert corpus_object.corpus_file is not None + corpus_object.corpus_file = corpus.FilterCorpusRemoveUnknownWordSegmentsJob( + corpus_object.corpus_file, + eow_lexicon, + all_unknown=False, + ).out_corpus + + bliss_to_pcm_hdf_job = BlissToPcmHDFJob( + cv_data_inputs[key].corpus_object.corpus_file, + rounding=BlissToPcmHDFJob.RoundingScheme.rasr_compatible, + returnn_root=returnn_root, + ) + cv_feature_hdf.append(bliss_to_pcm_hdf_job.out_hdf) + + cv_dataset_builder.add_dataset( + 
name="features", + key_mapping={"data": "data"}, + dataset_config=hdf_config_dict_for_files( + cv_feature_hdf, + { + "partition_epoch": 1, + "seq_ordering": "laplace:.1000", + }), + control=True, + ) + + cv_targets_hdf = [] + for key in dev_keys: + cv_targets_hdf.append(BlissCorpusToTargetHdfJob( + cv_data_inputs[key].corpus_object.corpus_file, + bliss_lexicon=eow_lexicon, + returnn_root=returnn_root, + ).out_hdf) + + cv_dataset_builder.add_dataset( + name="targets", + dataset_config=hdf_config_dict_for_files(cv_targets_hdf), + key_mapping={"data": "targets"}, + control=False, + ) + cv_data_config = cv_dataset_builder.get_dict() + + return PytorchCTCSetupData( + train_key=train_key, + dev_keys=list(dev_data_inputs.keys()), + test_keys=list(test_data_inputs.keys()), + align_keys=["train", "hub5e00_zoltan_4gram", "hub5e01_zoltan_4gram"], + train_data_config=train_data_config, + cv_data_config=cv_data_config, + data_inputs={ + **train_data_inputs, + **dev_data_inputs, + **test_data_inputs, + }, + ) diff --git a/users/jxu/experiments/ctc/swb/data/data.py b/users/jxu/experiments/ctc/swb/data/data.py new file mode 100644 index 000000000..d02af37ca --- /dev/null +++ b/users/jxu/experiments/ctc/swb/data/data.py @@ -0,0 +1,160 @@ +from sisyphus import tk +import copy +from typing import Dict, List, Tuple, Optional +from i6_core.corpus import FilterCorpusBySegmentsJob, FilterSegmentsByListJob, SegmentCorpusJob +import i6_experiments.common.datasets.switchboard as swb_dataset +from i6_experiments.common.datasets.switchboard.constants import concurrent +from i6_experiments.common.setups.rasr import util as rasr_util + +from i6_experiments.users.berger.recipe.corpus.transform import TransformTranscriptionsJob, TranscriptionTransform +from i6_experiments.users.berger.corpus.general.helpers import filter_unk_in_corpus_object +from i6_experiments.users.berger.helpers.rasr import convert_legacy_corpus_object_dict_to_scorable +from .lm_data import get_lm +from 
i6_experiments.users.berger import helpers +from i6_experiments.users.berger.recipe.lexicon.modification import ( + EnsureSilenceFirstJob, + DeleteEmptyOrthJob, + MakeBlankLexiconJob, +) + + +def get_data_inputs( + train_key: str = "train", + cv_keys: Optional[List[str]] = None, + dev_keys: Optional[List[str]] = None, + test_keys: Optional[List[str]] = None, + lm_names: Optional[List[str]] = None, + ctc_lexicon: bool = False, + augmented_lexicon: bool = False, + add_all_allophones: bool = False, +) -> Tuple[Dict[str, helpers.RasrDataInput], ...]: + if cv_keys is None: + cv_keys = ["hub5e00"] + if dev_keys is None: + dev_keys = ["hub5e00"] + if test_keys is None: + test_keys = ["hub5e01"] + if lm_names is None: + lm_names = ["zoltan_4gram"] + + corpus_object_dict = { + "train": swb_dataset.get_train_corpus_object_i6_legacy(), + "hub5e00": swb_dataset.get_hub5e00_corpus_object(), + "hub5e01": swb_dataset.get_hub5e01_corpus_object(), + "rt03s": swb_dataset.get_rt03s_corpus_object(), + } + + corpus_object_dict = convert_legacy_corpus_object_dict_to_scorable(corpus_object_dict) + corpus_object_dict["hub5e00"].stm = tk.Path("/u/corpora/speech/hub5e_00/xml/hub5e_00.stm") + corpus_object_dict["hub5e00"].glm = tk.Path("/u/corpora/speech/hub5e_00/xml/glm") + # corpus_object_dict["hub5e00"].stm = swb_dataset.get_hub5e00().stm + # corpus_object_dict["hub5e00"].glm = swb_dataset.get_hub5e00().glm + corpus_object_dict["hub5e01"].stm = swb_dataset.get_hub5e01().stm + corpus_object_dict["hub5e01"].glm = tk.Path("/u/corpora/speech/hub5e_00/xml/glm") + # corpus_object_dict["hub5e01"].glm = swb_dataset.get_hub5e01().glm + + lms = {lm_name: get_lm(lm_name) for lm_name in lm_names} + + if augmented_lexicon: + bliss_lexicon = tk.Path("/work/asr4/berger/dependencies/switchboard/lexicon/wei_train_ctc.lexicon.orig.xml") + else: + bliss_lexicon = swb_dataset.get_bliss_lexicon() + bliss_lexicon = EnsureSilenceFirstJob(bliss_lexicon).out_lexicon + + if ctc_lexicon: + bliss_lexicon = 
DeleteEmptyOrthJob(bliss_lexicon).out_lexicon + bliss_lexicon = MakeBlankLexiconJob(bliss_lexicon).out_lexicon + + lexicon_config = helpers.LexiconConfig( + filename=bliss_lexicon, + normalize_pronunciation=False, + add_all_allophones=add_all_allophones, + add_allophones_from_lexicon=not add_all_allophones, + ) + + train_data_inputs = {} + cv_data_inputs = {} + dev_data_inputs = {} + test_data_inputs = {} + + train_corpus_object = copy.deepcopy(corpus_object_dict[train_key]) + # if filter_unk_from_corpus: + # filter_unk_in_corpus_object(train_corpus_object, bliss_lexicon) + + segment_files = SegmentCorpusJob(train_corpus_object.corpus_file, 1).out_single_segment_files + filtered_segment_files = FilterSegmentsByListJob( + segment_files, + filter_list=[ + "switchboard-1/sw04118A/sw4118A-ms98-a-0045", + "switchboard-1/sw02663A/sw2663A-ms98-a-0022", + "switchboard-1/sw02986A/sw2986A-ms98-a-0013", + ], + ).out_single_segment_files + train_corpus_object.corpus_file = FilterCorpusBySegmentsJob( + train_corpus_object.corpus_file, segment_file=filtered_segment_files[1] + ).out_corpus + + train_data_inputs[train_key] = helpers.RasrDataInput( + corpus_object=train_corpus_object, + concurrent=concurrent[train_key], + lexicon=lexicon_config, + ) + + for cv_key in cv_keys: + cv_corpus_object = copy.deepcopy(corpus_object_dict[cv_key]) + filter_unk_in_corpus_object(cv_corpus_object, bliss_lexicon) + + segment_files = SegmentCorpusJob(cv_corpus_object.corpus_file, 1).out_single_segment_files + filtered_segment_files = FilterSegmentsByListJob( + segment_files, + filter_list=[ + "hub5e_00/en_6189a/36", + "hub5e_00/en_4852b/77", + "hub5e_00/en_6189b/66", + ], + ).out_single_segment_files + cv_corpus_object.corpus_file = FilterCorpusBySegmentsJob( + cv_corpus_object.corpus_file, segment_file=filtered_segment_files[1] + ).out_corpus + + cv_corpus_object.corpus_file = TransformTranscriptionsJob( + cv_corpus_object.corpus_file, TranscriptionTransform.ALL_LOWERCASE + ).out_corpus_file + 
cv_data_inputs[cv_key] = helpers.RasrDataInput( + corpus_object=cv_corpus_object, + concurrent=concurrent[cv_key], + lexicon=lexicon_config, + ) + + for dev_key in dev_keys: + for lm_name, lm in lms.items(): + dev_data_inputs[f"{dev_key}_{lm_name}"] = helpers.RasrDataInput( + corpus_object=corpus_object_dict[dev_key], + concurrent=concurrent[dev_key], + lexicon=lexicon_config, + lm=lm, + ) + + for test_key in test_keys: + for lm_name, lm in lms.items(): + test_data_inputs[f"{test_key}_{lm_name}"] = helpers.RasrDataInput( + corpus_object=corpus_object_dict[test_key], + concurrent=concurrent[test_key], + lexicon=lexicon_config, + lm=lm, + ) + + return train_data_inputs, cv_data_inputs, dev_data_inputs, test_data_inputs + + +def get_final_gmm_output(): + output_args = rasr_util.OutputArgs("final") + + output_args.define_corpus_type("train", "train") + output_args.define_corpus_type("hub5e00", "dev") + for tc in ("hub5e01", "rt03s"): + output_args.define_corpus_type(tc, "test") + + output_args.add_feature_to_extract("gt") + + return output_args diff --git a/users/jxu/experiments/ctc/swb/data/lm_data.py b/users/jxu/experiments/ctc/swb/data/lm_data.py new file mode 100644 index 000000000..cd566bd73 --- /dev/null +++ b/users/jxu/experiments/ctc/swb/data/lm_data.py @@ -0,0 +1,13 @@ +from sisyphus import tk + +from i6_experiments.users.berger.helpers import rasr_lm_config + + +def get_lm(name: str) -> rasr_lm_config.LMData: + if name == "zoltan_4gram": + return rasr_lm_config.ArpaLMData(10, tk.Path("/work/asr4/berger/dependencies/switchboard/lm/zoltan_4gram.gz")) + elif name == "fisher_4gram": + return rasr_lm_config.ArpaLMData( + 10, tk.Path("/home/tuske/work/ASR/switchboard/corpus/lm/data/mylm/swb.fsh.4gr.voc30k.LM.gz") + ) + raise ValueError diff --git a/users/jxu/experiments/ctc/swb/pytorch_networks/baseline/conformer_ctc_d_model_386_num_layers_12_logmel.py b/users/jxu/experiments/ctc/swb/pytorch_networks/baseline/conformer_ctc_d_model_386_num_layers_12_logmel.py new 
file mode 100644 index 000000000..4f5f2f321 --- /dev/null +++ b/users/jxu/experiments/ctc/swb/pytorch_networks/baseline/conformer_ctc_d_model_386_num_layers_12_logmel.py @@ -0,0 +1,262 @@ +from dataclasses import dataclass +from enum import Enum, auto +from i6_core.returnn.config import CodeWrapper +from i6_experiments.users.berger.systems.dataclasses import ConfigVariant + +import torch +from i6_experiments.common.setups.returnn_pytorch.serialization import Collection +from i6_experiments.common.setups.serialization import Import, PartialImport +from i6_experiments.users.berger.pytorch.serializers.basic import ( + get_basic_pt_network_serializer, +) +from i6_models.parts.frontend.vgg_act import VGG4LayerActFrontendV1, VGG4LayerActFrontendV1Config + +from i6_models.primitives.feature_extraction import LogMelFeatureExtractionV1, LogMelFeatureExtractionV1Config +from i6_models.parts.frontend.generic_frontend import ( + GenericFrontendV1, + GenericFrontendV1Config, + FrontendLayerType, +) +import i6_models.parts.conformer as conformer_parts_i6 +import i6_models.assemblies.conformer as conformer_i6 +import i6_experiments.users.berger.pytorch.custom_parts as custom_parts +from i6_models.config import ModelConfiguration, ModuleFactoryV1 +from i6_models.parts.conformer.norm import LayerNormNC +from ..util import lengths_to_padding_mask +from i6_models.primitives.specaugment import specaugment_v1_by_length +from i6_models.assemblies.conformer import ( + ConformerEncoderV1Config, +) + +@dataclass +class SpecaugmentByLengthConfigV1(ModelConfiguration): + time_min_num_masks: int + time_max_mask_per_n_frames: int + time_mask_max_size: int + freq_min_num_masks: int + freq_max_num_masks: int + freq_mask_max_size: int + + +@dataclass +class ConformerCTCConfig(ModelConfiguration): + feature_extraction: ModuleFactoryV1 + specaugment_cfg: SpecaugmentByLengthConfigV1 + conformer: ModuleFactoryV1 + dim: int + target_size: int + dropout: float + + +class 
ConformerCTCModel(torch.nn.Module): + def __init__(self, cfg: ConformerCTCConfig, **_): + super().__init__() + self.feature_extraction = cfg.feature_extraction() + self.specaugment_cfg = cfg.specaugment_cfg + self.conformer = cfg.conformer() + self.dropout = torch.nn.Dropout(cfg.dropout) + self.final_linear = torch.nn.Linear(cfg.dim, cfg.target_size) + + def forward(self, audio_features: torch.Tensor, audio_features_len: torch.Tensor): + with torch.no_grad(): + audio_features = audio_features.squeeze(-1) + x, input_len = self.feature_extraction(audio_features, audio_features_len) + sequence_mask = lengths_to_padding_mask(input_len) + x = specaugment_v1_by_length(x, + time_min_num_masks=self.specaugment_cfg.time_min_num_masks, + time_max_mask_per_n_frames=self.specaugment_cfg.time_max_mask_per_n_frames, + time_mask_max_size=self.specaugment_cfg.time_mask_max_size, + freq_min_num_masks=self.specaugment_cfg.freq_min_num_masks, + freq_max_num_masks=self.specaugment_cfg.freq_max_num_masks, + freq_mask_max_size=self.specaugment_cfg.freq_mask_max_size) # [B, T, F] + + x, sequence_mask = self.conformer(x, sequence_mask) # [B, T, F] + x = self.dropout(x) + logits = self.final_linear(x) # [B, T, F] + log_probs = torch.log_softmax(logits, dim=2) + + return log_probs, torch.sum(sequence_mask, dim=1).type(torch.int32) + + +def get_train_serializer( + model_config: ConformerCTCConfig, + **_, +) -> Collection: + return get_basic_pt_network_serializer( + module_import_path=f"{__name__}.{ConformerCTCModel.__name__}", + model_config=model_config, + additional_serializer_objects=[ + Import("i6_experiments.users.jxu.experiments.ctc.swb.pytorch_networks.train_steps.baseline_ctc.train_step"), + ], + ) + + +def get_prior_serializer( + model_config: ConformerCTCConfig, + **_, +) -> Collection: + pytorch_package = __package__.rpartition(".")[0] + return get_basic_pt_network_serializer( + module_import_path=f"{__name__}.{ConformerCTCModel.__name__}", + model_config=model_config, + 
additional_serializer_objects=[ + Import(f"{pytorch_package}.forward.basic.forward_step"), + Import(f"{pytorch_package}.forward.prior_callback.ComputePriorCallback", import_as="forward_callback"), + ], + ) + + +class RecogType(Enum): + RASR = auto() + FLASHLIGHT = auto() + + +def get_rasr_recog_serializer( + model_config: ConformerCTCConfig, + **_, +) -> Collection: + pytorch_package = __package__.rpartition(".")[0] + return get_basic_pt_network_serializer( + module_import_path=f"{__name__}.{ConformerCTCModel.__name__}", + model_config=model_config, + additional_serializer_objects=[Import(f"{pytorch_package}.forward.basic.forward_step")], + ) + + +def get_recog_serializer( + model_config: ConformerCTCConfig, +) -> Collection: + pytorch_package = __package__.rpartition(".")[0] + return get_basic_pt_network_serializer( + module_import_path=f"{__name__}.{ConformerCTCModel.__name__}", + model_config=model_config, + additional_serializer_objects=[ + Import(f"{__name__}.export"), + ], + ) + + +def get_serializer(model_config: ConformerCTCConfig, variant: ConfigVariant, **kwargs) -> Collection: + if variant == ConfigVariant.TRAIN: + return get_train_serializer(model_config, **kwargs) + if variant == ConfigVariant.PRIOR: + return get_prior_serializer(model_config, **kwargs) + if variant == ConfigVariant.ALIGN: + return get_recog_serializer(model_config, **kwargs) + if variant == ConfigVariant.RECOG: + return get_recog_serializer(model_config, **kwargs) + raise NotImplementedError + + +def get_default_config_v1(num_outputs: int) -> ConformerCTCConfig: + feature_extraction = ModuleFactoryV1( + module_class=LogMelFeatureExtractionV1, + cfg=LogMelFeatureExtractionV1Config( + sample_rate=16000, + win_size=0.025, + hop_size=0.01, + f_min=60, + f_max=7600, + min_amp=1e-10, + num_filters=80, + center=False, + n_fft=400, + ), + ) + + specaug_cfg = SpecaugmentByLengthConfigV1( + time_min_num_masks=2, + time_max_mask_per_n_frames=25, + time_mask_max_size=20, + freq_min_num_masks=2, + 
freq_max_num_masks=16, + freq_mask_max_size=5, + ) + + # frontend = ModuleFactoryV1( + # GenericFrontendV1, + # GenericFrontendV1Config( + # in_features=80, + # layer_ordering=[ + # FrontendLayerType.Conv2d, + # FrontendLayerType.Conv2d, + # FrontendLayerType.Pool2d, + # FrontendLayerType.Conv2d, + # FrontendLayerType.Conv2d, + # FrontendLayerType.Pool2d, + # FrontendLayerType.Activation, + # ], + # conv_kernel_sizes=[(3, 3), (3, 3), (3, 3), (3, 3)], + # conv_paddings=None, + # conv_out_dims=[32, 64, 64, 32], + # conv_strides=[(1, 1), (1, 1), (1, 1), (1, 1)], + # pool_kernel_sizes=[(2, 1), (2, 1)], + # pool_strides=None, + # pool_paddings=None, + # activations=[torch.nn.ReLU()], + # out_features=384, + # ), + # ) + + frontend_cfg = VGG4LayerActFrontendV1Config( + in_features=80, + conv1_channels=32, + conv2_channels=64, + conv3_channels=64, + conv4_channels=32, + conv_kernel_size=(3, 3), + conv_padding=None, + pool1_kernel_size=(2, 1), + pool1_stride=(2, 1), + pool1_padding=None, + pool2_kernel_size=(2, 1), + pool2_stride=(2, 1), + pool2_padding=None, + activation=torch.nn.ReLU(), + out_features=384, + ) + + frontend = ModuleFactoryV1(VGG4LayerActFrontendV1, frontend_cfg) + + ff_cfg = conformer_parts_i6.ConformerPositionwiseFeedForwardV1Config( + input_dim=384, + hidden_dim=1536, + dropout=0.2, + activation=torch.nn.SiLU(), + ) + + mhsa_cfg = conformer_parts_i6.ConformerMHSAV1Config( + input_dim=384, + num_att_heads=6, + att_weights_dropout=0.1, + dropout=0.1, + ) + + conv_cfg = conformer_parts_i6.ConformerConvolutionV1Config( + channels=384, + kernel_size=31, + dropout=0.2, + activation=torch.nn.SiLU(), + norm=LayerNormNC(384), + ) + + block_cfg = conformer_i6.ConformerBlockV1Config( + ff_cfg=ff_cfg, + mhsa_cfg=mhsa_cfg, + conv_cfg=conv_cfg, + ) + + conformer_cfg = conformer_i6.ConformerEncoderV1Config( + num_layers=12, + frontend=frontend, + block_cfg=block_cfg, + ) + + return ConformerCTCConfig( + feature_extraction=feature_extraction, + 
specaugment_cfg=specaug_cfg, + conformer=ModuleFactoryV1(conformer_i6.ConformerEncoderV1, cfg=conformer_cfg), + dim=384, + target_size=num_outputs, + dropout=0.1, + ) \ No newline at end of file diff --git a/users/jxu/experiments/ctc/swb/pytorch_networks/train_steps/baseline_ctc.py b/users/jxu/experiments/ctc/swb/pytorch_networks/train_steps/baseline_ctc.py new file mode 100644 index 000000000..bfd3a8b24 --- /dev/null +++ b/users/jxu/experiments/ctc/swb/pytorch_networks/train_steps/baseline_ctc.py @@ -0,0 +1,55 @@ +import torch +from returnn.tensor.tensor_dict import TensorDict + + +def map_tensor_to_minus1_plus1_interval(tensor: torch.Tensor) -> torch.Tensor: + if torch.is_floating_point(tensor): + return tensor + + dtype = tensor.dtype + info = torch.iinfo(dtype) + min_val = info.min + max_val = info.max + + return 2.0 * (tensor.float() - min_val) / (max_val - min_val) - 1.0 + +def train_step(*, model: torch.nn.Module, extern_data: TensorDict, **kwargs): + audio_features = extern_data["data"].raw_tensor + audio_features = audio_features.squeeze(-1) + audio_features = map_tensor_to_minus1_plus1_interval(audio_features) + assert extern_data["data"].dims[1].dyn_size_ext is not None + + audio_features_len = extern_data["data"].dims[1].dyn_size_ext.raw_tensor + assert audio_features_len is not None + + assert extern_data["targets"].raw_tensor is not None + targets = extern_data["targets"].raw_tensor.long() + + targets_len_rf = extern_data["targets"].dims[1].dyn_size_ext + assert targets_len_rf is not None + targets_len = targets_len_rf.raw_tensor + assert targets_len is not None + + log_probs, sequence_lengths = model( + audio_features=audio_features, + audio_features_len=audio_features_len.to("cuda"), + ) + + log_probs = torch.transpose(log_probs, 0, 1) # [T, B, F] + + loss = torch.nn.functional.ctc_loss( + log_probs=log_probs, + targets=targets, + input_lengths=sequence_lengths, + target_lengths=targets_len, + blank=0, + reduction="sum", + zero_infinity=True, + ) + 
+ from returnn.tensor import batch_dim + import returnn.frontend as rf + + rf.get_run_ctx().mark_as_loss( + name="CTC", loss=loss, custom_inv_norm_factor=rf.reduce_sum(targets_len_rf, axis=batch_dim) + ) \ No newline at end of file diff --git a/users/jxu/experiments/ctc/swb/pytorch_networks/util.py b/users/jxu/experiments/ctc/swb/pytorch_networks/util.py new file mode 100644 index 000000000..082865328 --- /dev/null +++ b/users/jxu/experiments/ctc/swb/pytorch_networks/util.py @@ -0,0 +1,14 @@ +import torch + +def lengths_to_padding_mask(lengths: torch.Tensor) -> torch.Tensor: + """ + Convert lengths to an equivalent boolean mask + + :param lengths: [B] + :return: B x T, where 1 means within sequence and 0 means outside sequence + """ + max_length = torch.max(lengths) + index_range = torch.arange(max_length, device=lengths.device, dtype=lengths.dtype) + sequence_mask = torch.less(index_range[None, :], lengths[:, None]) + + return sequence_mask \ No newline at end of file diff --git a/users/jxu/experiments/hybrid/switchboard/pytorch_networks/._i6_conformer_downsample_3.py b/users/jxu/experiments/hybrid/switchboard/pytorch_networks/._i6_conformer_downsample_3.py deleted file mode 100644 index 9347483ceb674f5fed371bbe88004c2d3b429271..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 4096 zcmZQz6=P>$Vqox1Ojhs@R)|o50+1L3ClDJkFz{^v(m+1nBL)UWIUt(=a103vVqnmB zgy>+H0aVV7riBs6hl-0P=jZAr78K;9>J=2_m!;+<<|U^x02QqGA$-d2Xr2|C)=} Date: Wed, 15 May 2024 14:58:23 +0000 Subject: [PATCH 002/227] add conformer enc with more weight dropout --- .../asr/encoder/conformer_encoder_v2.py | 1391 +++++++++++++++++ 1 file changed, 1391 insertions(+) create mode 100644 users/zeineldeen/models/asr/encoder/conformer_encoder_v2.py diff --git a/users/zeineldeen/models/asr/encoder/conformer_encoder_v2.py b/users/zeineldeen/models/asr/encoder/conformer_encoder_v2.py new file mode 100644 index 000000000..4633e7665 --- /dev/null +++ 
b/users/zeineldeen/models/asr/encoder/conformer_encoder_v2.py @@ -0,0 +1,1391 @@ +""" +Conformer encoder + +Other implementations: + +https://github.com/facebookresearch/fairseq/blob/main/fairseq/models/speech_to_text/modules/emformer.py +https://pytorch.org/audio/main/_modules/torchaudio/models/emformer.html#Emformer +""" + +from __future__ import annotations +from typing import Optional, Union, List, Tuple +from dataclasses import dataclass +from returnn.tensor import Dim +from returnn.tf.util.data import SpatialDim, FeatureDim +from i6_experiments.users.zeineldeen.modules.network import ReturnnNetwork + +from i6_core.returnn.config import CodeWrapper + + +class ConformerEncoder: + """ + Represents Conformer Encoder Architecture + + * Conformer: Convolution-augmented Transformer for Speech Recognition + * Ref: https://arxiv.org/abs/2005.08100 + """ + + def __init__( + self, + input="data", + input_layer="lstm-6", + input_layer_conv_act="relu", + add_abs_pos_enc_to_input=False, + frontend_conv_l2=0.0, + num_blocks=16, + conv_kernel_size=32, + specaug=True, + pos_enc="rel", + activation="swish", + block_final_norm=True, + ff_dim=512, + ff_bias=True, + ctc_loss_scale=None, + dropout=0.1, + att_dropout=0.1, + enc_key_dim=256, + att_num_heads=4, + target="bpe", + l2=0.0, + lstm_dropout=0.1, + rec_weight_dropout=0.0, + with_ctc=False, + native_ctc=False, + ctc_dropout=0.0, + ctc_l2=0.0, + ctc_opts=None, + ctc_self_align_delay: Optional[int] = None, + ctc_self_align_scale: float = 0.5, + subsample=None, + start_conv_init=None, + conv_module_init=None, + mhsa_init=None, + mhsa_out_init=None, + ff_init=None, + rel_pos_clipping=16, + dropout_in=0.1, + batch_norm_opts=None, + use_ln=False, + pooling_str=None, + self_att_l2=0.0, + sandwich_conv=False, + add_to_prefix_name=None, + output_layer_name="encoder", + create_only_blocks=False, + no_mhsa_module=False, + proj_input=False, + use_sqrd_relu=False, + use_causal_layers=False, + use_causal_conv=None, + 
conv_alternative_name: Optional[str] = None, + fix_merge_dims=False, + weight_noise=None, + weight_noise_layers=None, + convolution_first=False, + ff_weight_dropout=None, + mhsa_weight_dropout=None, + conv_weight_dropout=None, + memory_variant_opts: Optional[ConformerMemoryVariantOpts] = None, + ): + """ + :param str input: input layer name + :param str|None input_layer: type of input layer which does subsampling + :param int num_blocks: number of Conformer blocks + :param int conv_kernel_size: kernel size for conv layers in Convolution module + :param bool|None specaug: If true, then SpecAug is applied + :param str|None activation: activation used to sandwich modules + :param bool block_final_norm: if True, apply layer norm at the end of each conformer block + :param bool final_norm: if True, apply layer norm to the output of the encoder + :param int|None ff_dim: dimension of the first linear layer in FF module + :param str|None ff_init: FF layers initialization + :param bool|None ff_bias: If true, then bias is used for the FF layers + :param float embed_dropout: dropout applied to the source embedding + :param float dropout: general dropout + :param float att_dropout: dropout applied to attention weights + :param int enc_key_dim: encoder key dimension, also denoted as d_model, or d_key + :param int att_num_heads: the number of attention heads + :param str target: target labels key name + :param float l2: add L2 regularization for trainable weights parameters + :param float lstm_dropout: dropout applied to the input of the LSTMs in case they are used + :param float rec_weight_dropout: dropout applied to the hidden-to-hidden weight matrices of the LSTM in case used + :param bool with_ctc: if true, CTC loss is used + :param bool native_ctc: if true, use returnn native ctc implementation instead of TF implementation + :param float ctc_dropout: dropout applied on input to ctc + :param float ctc_l2: L2 applied to the weight matrix of CTC softmax + :param dict[str]
ctc_opts: options for CTC + """ + + self.input = input + self.input_layer = input_layer + self.input_layer_conv_act = input_layer_conv_act + self.add_abs_pos_enc_to_input = add_abs_pos_enc_to_input + self.frontend_conv_l2 = frontend_conv_l2 + + self.num_blocks = num_blocks + self.conv_kernel_size = conv_kernel_size + + self.pos_enc = pos_enc + self.rel_pos_clipping = rel_pos_clipping + + self.ff_bias = ff_bias + + self.specaug = specaug + + self.activation = activation + + self.block_final_norm = block_final_norm + + self.dropout = dropout + self.att_dropout = att_dropout + self.lstm_dropout = lstm_dropout + + self.dropout_in = dropout_in + + # key and value dimensions are the same + self.enc_key_dim = enc_key_dim + self.enc_value_dim = enc_key_dim + self.att_num_heads = att_num_heads + self.enc_key_per_head_dim = enc_key_dim // att_num_heads + self.enc_val_per_head_dim = enc_key_dim // att_num_heads + + self.ff_dim = ff_dim + if self.ff_dim is None: + self.ff_dim = 2 * self.enc_key_dim + + self.target = target + + self.l2 = l2 + self.self_att_l2 = self_att_l2 + self.rec_weight_dropout = rec_weight_dropout + + if batch_norm_opts is None: + batch_norm_opts = {} + + bn_momentum = batch_norm_opts.pop("momentum", 0.1) + bn_eps = batch_norm_opts.pop("epsilon", 1e-3) + bn_update_sample_only_in_train = batch_norm_opts.pop("update_sample_only_in_training", True) + bn_delay_sample_update = batch_norm_opts.pop("delay_sample_update", True) + self.batch_norm_opts = { + "momentum": bn_momentum, + "epsilon": bn_eps, + "update_sample_only_in_training": bn_update_sample_only_in_train, + "delay_sample_update": bn_delay_sample_update, + } + self.batch_norm_opts.update(**batch_norm_opts) + + self.with_ctc = with_ctc + self.native_ctc = native_ctc + self.ctc_dropout = ctc_dropout + self.ctc_loss_scale = ctc_loss_scale + self.ctc_l2 = ctc_l2 + self.ctc_opts = ctc_opts + if not self.ctc_opts: + self.ctc_opts = {} + self.ctc_self_align_delay = ctc_self_align_delay + 
self.ctc_self_align_scale = ctc_self_align_scale + + self.start_conv_init = start_conv_init + self.conv_module_init = conv_module_init + self.mhsa_init = mhsa_init + self.mhsa_out_init = mhsa_out_init + self.ff_init = ff_init + + self.sandwich_conv = sandwich_conv + + # add maxpooling layers + self.subsample = subsample + self.subsample_list = [1] * num_blocks + if subsample: + for idx, s in enumerate(map(int, subsample.split("_")[:num_blocks])): + self.subsample_list[idx] = s + + self.network = ReturnnNetwork() + + self.use_ln = use_ln + + self.pooling_str = pooling_str + + self.add_to_prefix_name = add_to_prefix_name + self.output_layer_name = output_layer_name + + self.create_only_blocks = create_only_blocks + + self.no_mhsa_module = no_mhsa_module + self.proj_input = proj_input + + self.use_sqrd_relu = use_sqrd_relu + + self.use_causal_layers = use_causal_layers + self.use_causal_conv = use_causal_conv if use_causal_conv is not None else self.use_causal_layers + + self.conv_alternative_name = conv_alternative_name + self.fix_merge_dims = fix_merge_dims + + self.weight_noise = weight_noise + self.weight_noise_layers = weight_noise_layers + if self.weight_noise_layers is None: + self.weight_noise_layers = [] + for layer in self.weight_noise_layers: + assert layer in ["mhsa", "conv", "frontend_conv"] + + self.convolution_first = convolution_first + + self.ff_weight_drop = ff_weight_dropout + self.conv_weight_drop = conv_weight_dropout + self.mhsa_weight_drop = mhsa_weight_dropout + + self.memory_variant_opts = memory_variant_opts + if self.memory_variant_opts: + self.concat_window_dim = SpatialDim("concat-window") # W*N + self.enc_att_num_heads_dim = SpatialDim("enc-att-num-heads", att_num_heads) + self.enc_per_head_dim = FeatureDim("enc-dim-per-head", self.enc_key_per_head_dim) + if self.memory_variant_opts.conv_cache_size: + self.conv_cache_concat_dim = SpatialDim("conv-cache-concat") + if self.memory_variant_opts.use_emformer_mem: + self.emformer_mem_bank_dim = 
SpatialDim("emformer-mem-bank") # M, the same as C but different tag + self.emformer_ext_query_dim = SpatialDim("emformer-ext-query") # W+1 + self.concat_window_with_mem_dim = SpatialDim("concat-window-with-mem") # W*N+M + + def _create_ff_module(self, prefix_name, i, source, layer_index): + """ + Add Feed Forward Module: + LN -> FFN -> Swish -> Dropout -> FFN -> Dropout + + :param str prefix_name: some prefix name + :param int i: FF module index + :param str source: name of source layer + :param int layer_index: index of layer + :return: last layer name of this module + :rtype: str + """ + prefix_name = prefix_name + "_ffmod_{}".format(i) + + ln = self.network.add_layer_norm_layer("{}_ln".format(prefix_name), source) + + ff1 = self.network.add_linear_layer( + "{}_ff1".format(prefix_name), + ln, + n_out=self.ff_dim, + l2=self.l2, + forward_weights_init=self.ff_init, + with_bias=self.ff_bias, + param_dropout=self.ff_weight_drop, + param_dropout_min_ndim=2, + ) + + if self.use_sqrd_relu: + swish_act = self.network.add_activation_layer("{}_relu".format(prefix_name), ff1, activation="relu") + swish_act = self.network.add_eval_layer( + "{}_square_relu".format(prefix_name), swish_act, eval="source(0) ** 2" + ) + else: + swish_act = self.network.add_activation_layer( + "{}_swish".format(prefix_name), ff1, activation=self.activation + ) + + drop1 = self.network.add_dropout_layer("{}_drop1".format(prefix_name), swish_act, dropout=self.dropout) + + ff2 = self.network.add_linear_layer( + "{}_ff2".format(prefix_name), + drop1, + n_out=self.enc_key_dim, + l2=self.l2, + forward_weights_init=self.ff_init, + with_bias=self.ff_bias, + param_dropout=self.ff_weight_drop, + param_dropout_min_ndim=2, + ) + + drop2 = self.network.add_dropout_layer("{}_drop2".format(prefix_name), ff2, dropout=self.dropout) + + half_step_ff = self.network.add_eval_layer("{}_half_step".format(prefix_name), drop2, eval="0.5 * source(0)") + + res_inputs = [half_step_ff, source] + + ff_module_res = 
self.network.add_combine_layer( + "{}_res".format(prefix_name), kind="add", source=res_inputs, n_out=self.enc_key_dim + ) + + return ff_module_res + + def _get_mem_chunks(self, prefix_name: str, input_layer: str, mem_size: int) -> List[Tuple[str, Union[str, Dim]]]: + """ + :param prefix_name: layer prefix name + :param input_layer: name of input layer to shift of shape [B*C, W, D] + :return: arg for the ConcatLayer, i.e. list of tuple (layer_name, axis) + """ + input_layer_splitted = self.network.add_generic_layer( + f"{prefix_name}_split_chunk", + cls="split_batch_time", + source=input_layer, + base=self.memory_variant_opts.split_batch_time_base, + ) # [B, C, W, D], C = chunked_time_dim + mem_chunks = [] + for mem_idx in range(mem_size): + chunk_shifted = self.network.add_generic_layer( + f"{prefix_name}_chunk_shifted" + (f"_{mem_idx}" if mem_idx > 0 else ""), + cls="shift_axis", + source=input_layer_splitted, + axis=self.memory_variant_opts.chunked_time_dim, # C + amount=mem_idx + 1, + pad=True, + adjust_size_info=False, # no change in dim tag, C stays the same + ) # [B, C, W, D] + # Merge batch and chunk dim again. + chunk_shifted = self.network.add_generic_layer( + f"{prefix_name}_chunk_shifted_" + (f"_{mem_idx}" if mem_idx > 0 else ""), + cls="merge_dims", + source=chunk_shifted, + axes=("B", self.memory_variant_opts.chunked_time_dim), + ) # [B*C, W, D] + # Make sure the time_dim_axis (T) is set to the correct dim (W).
+ chunk_shifted = self.network.add_generic_layer( + f"{prefix_name}_chunk_shifted__" + (f"_{mem_idx}" if mem_idx > 0 else ""), + cls="reinterpret_data", + source=chunk_shifted, + set_axes={"T": "spatial"}, + ) # [B*C, W, D] but now time_dim_axis is set to W + + if self.memory_variant_opts.mem_slice_start is not None: + assert self.memory_variant_opts.mem_slice_size is not None + chunk_shifted = self.network.add_generic_layer( + f"{prefix_name}_chunk_shifted__" + (f"_{mem_idx}" if mem_idx > 0 else "") + "_sliced", + cls="slice", + source=chunk_shifted, + axis="T", + slice_start=self.memory_variant_opts.mem_slice_start, + slice_end=self.memory_variant_opts.mem_slice_start + self.memory_variant_opts.mem_slice_size, + ) + + mem_chunks.append((chunk_shifted, "T")) + + # reverse to concat left-most first + mem_chunks.reverse() + return mem_chunks + + def _self_att_v2( + self, prefix_name: str, *, input_layer: str, concat_prev_chunks_inputs: str, layer_index: int + ) -> str: + """ + Self-Attention implementation via RETURNN layers instead of using RETURNN SelfAttentionLayer + + :param prefix_name: layer prefix name + :param input_layer: name of input layer + :param concat_prev_chunks_inputs: name of the layer used as source of the key/value projections, [B*C, W*N, D]; must be None when use_cached_prev_kv is set + """ + + if self.memory_variant_opts.use_cached_prev_kv: + assert concat_prev_chunks_inputs is None, "Should use cached keys and values instead."
+ + K = self.network.add_generic_layer( + f"{prefix_name}_ln_K", + cls="linear", + source=input_layer if self.memory_variant_opts.use_cached_prev_kv else concat_prev_chunks_inputs, + n_out=self.enc_key_dim, + forward_weights_init=self.mhsa_init, + with_bias=False, + L2=self.self_att_l2, + param_dropout=self.mhsa_weight_drop, + param_dropout_min_ndim=2, + ) # [B*C, W*N, D] or [B*C, W, D] (use_cached_prev_kv) + + V = self.network.add_generic_layer( + f"{prefix_name}_ln_V", + cls="linear", + source=input_layer if self.memory_variant_opts.use_cached_prev_kv else concat_prev_chunks_inputs, + n_out=self.enc_value_dim, + forward_weights_init=self.mhsa_init, + with_bias=False, + L2=self.self_att_l2, + param_dropout=self.mhsa_weight_drop, + param_dropout_min_ndim=2, + ) # [B*C, W*N, D] or [B*C, W, D] (use_cached_prev_kv) + + if self.memory_variant_opts.use_emformer_mem and layer_index > 1: + # Take memory from the previous layer. + # f"{prefix_name}_emformer_mem" has shape [B*C, D] + # I.e. one vector per batch and chunk. + # We effectively convert it to [B,M,D] with M=C but being a separate tag, + # and then expand it to [B*C,M,D]. + # This expansion is maybe not optimal, however, the attention still is only per chunk, + # which would be difficult otherwise. + # C is approx 15-20. + # Then we can concat it to K and V. + # Note on prefix_name: The outer _create_mhsa_module adds the additional "_self_att" prefix. + mem_bank = self._block_prefix_name(layer_index - 1) + "_self_att_emformer_mem" # [B*C, D] + + # Same projection which is usually applied to get back to the residual stream. 
+ mem_bank = self.network.add_generic_layer( + f"{prefix_name}_emformer_mem_proj", + cls="linear", + source=mem_bank, + n_out=self.enc_key_dim, + with_bias=False, + reuse_params=self._block_prefix_name(layer_index - 1) + "_self_att_linear", + param_dropout=self.ff_weight_drop, + param_dropout_min_ndim=2, + ) # [B*C, D] + mem_bank = self.network.add_dropout_layer( + f"{prefix_name}_emformer_mem_proj_drop", mem_bank, dropout=self.dropout + ) + + if self.memory_variant_opts.apply_tanh_on_emformer_mem: + mem_bank = self.network.add_generic_layer( + f"{prefix_name}_emformer_mem_tanh", cls="activation", source=mem_bank, activation="tanh" + ) + else: + mem_bank = self.network.add_generic_layer( + f"{prefix_name}_emformer_mem_clipped", + cls="eval", + source=mem_bank, + eval="tf.clip_by_value(source(0), -10, 10)", + ) + + mem_bank = self.network.add_generic_layer( + f"{prefix_name}_emformer_mem_split_batch_time", + cls="split_batch_time", + source=mem_bank, + base=self.memory_variant_opts.split_batch_time_base, + ) # [B, C, D] + mem_bank = self.network.add_generic_layer( + f"{prefix_name}_emformer_mem_set_new_dim", + cls="reinterpret_data", + source=mem_bank, + set_dim_tags=[(self.memory_variant_opts.chunked_time_dim, self.emformer_mem_bank_dim)], + ) # [B, M, D] + + mem_bank_K = self.network.add_generic_layer( + f"{prefix_name}_emformer_mem_K", + cls="linear", + source=mem_bank, + n_out=self.enc_key_dim, + with_bias=False, + reuse_params=K, + ) # [B, M, D] + mem_bank_V = self.network.add_generic_layer( + f"{prefix_name}_emformer_mem_V", + cls="linear", + source=mem_bank, + n_out=self.enc_value_dim, + with_bias=False, + reuse_params=V, + ) # [B, M, D] + + mem_bank_K = self.network.add_generic_layer( + f"{prefix_name}_emformer_mem_K_expand_chunks", + cls="expand_dims", + source=mem_bank_K, + axis="T", + dim=self.memory_variant_opts.chunked_time_dim, + ) # [B, M, C, D] + mem_bank_K = self.network.add_generic_layer( + f"{prefix_name}_emformer_mem_K_expanded_merged", + 
cls="merge_dims", + source=mem_bank_K, + axes=("B", self.memory_variant_opts.chunked_time_dim), + ) # [B*C, M, D] + + mem_bank_V = self.network.add_generic_layer( + f"{prefix_name}_emformer_mem_V_expand_chunks", + cls="expand_dims", + source=mem_bank_V, + axis="T", + dim=self.memory_variant_opts.chunked_time_dim, + ) # [B, M, C, D] + mem_bank_V = self.network.add_generic_layer( + f"{prefix_name}_emformer_mem_V_expanded_merged", + cls="merge_dims", + source=mem_bank_V, + axes=("B", self.memory_variant_opts.chunked_time_dim), + ) # [B*C, M, D] + else: + mem_bank_K, mem_bank_V = None, None + + kv_dim = self.concat_window_with_mem_dim if mem_bank_K else self.concat_window_dim # W*N [+M] + + if self.memory_variant_opts.use_cached_prev_kv or mem_bank_K: + # concat previous cached keys and values + concat_keys = self._get_mem_chunks(f"{prefix_name}_ln_K_", K, self.memory_variant_opts.mem_size) + concat_keys.append((K, "T")) + if mem_bank_K: + concat_keys.append((mem_bank_K, self.emformer_mem_bank_dim)) + + K = self.network.add_generic_layer( + f"{prefix_name}_ln_K_concat", + cls="concat", + source=concat_keys, + out_dim=kv_dim, + ) # [B*C, W*N [+M], D] + + K_H = self.network.add_generic_layer( + f"{prefix_name}_ln_K_H", + cls="split_dims", + source=K, + axis="f", + dims=(self.enc_att_num_heads_dim, self.enc_per_head_dim), + ) # [B*C, W*N, H, D/H] + + if self.memory_variant_opts.use_cached_prev_kv or mem_bank_V: + concat_values = self._get_mem_chunks(f"{prefix_name}_ln_V_", V, self.memory_variant_opts.mem_size) + concat_values.append((V, "T")) + if mem_bank_V: + concat_values.append((mem_bank_V, self.emformer_mem_bank_dim)) + + V = self.network.add_generic_layer( + f"{prefix_name}_ln_V_concat", + cls="concat", + source=concat_values, + out_dim=kv_dim, + ) # [B*C, W*N, D] + + V_H = self.network.add_generic_layer( + f"{prefix_name}_ln_V_H", + cls="split_dims", + source=V, + axis="f", + dims=(self.enc_att_num_heads_dim, self.enc_per_head_dim), + ) # [B*C, W*N, H, D/H] + Q = 
self.network.add_generic_layer( + f"{prefix_name}_ln_Q", + cls="linear", + source=input_layer, + n_out=self.enc_key_dim, + forward_weights_init=self.mhsa_init, + with_bias=False, + L2=self.self_att_l2, + param_dropout=self.mhsa_weight_drop, + param_dropout_min_ndim=2, + ) # second half of shape [B*C, W, D] + query_dim = self.memory_variant_opts.chunk_size_dim # W + + if self.memory_variant_opts.use_emformer_mem: + assert ( + self.memory_variant_opts.mem_slice_start is not None + and self.memory_variant_opts.mem_slice_size is not None + ) + # Q = Wq * [C,R] of shape [B*C, W, D] + + # s_mean = mean(Q[:C]) = mean(Wq * C) <=> Wq * mean(C) + Q_summary_slice = self.network.add_generic_layer( + f"{prefix_name}_ln_Q_summary_slice", + cls="slice", + source=Q, + axis="T", # W + slice_start=self.memory_variant_opts.mem_slice_start, # skip L context if it exists + slice_end=self.memory_variant_opts.mem_slice_size, # this should be always equal to C + ) + Q_summary_mean = self.network.add_generic_layer( + f"{prefix_name}_ln_Q_summary_mean", + cls="reduce", + source=Q_summary_slice, + mode="mean", + axis="T", + keep_dims=True, + ) # [B*C, 1, D] + + Q = self.network.add_generic_layer( + f"{prefix_name}_ln_Q_concat_summary_mean", + cls="concat", + source=[(Q, "T"), (Q_summary_mean, "T")], + out_dim=self.emformer_ext_query_dim, + ) # [B*C, W+1, D] + query_dim = self.emformer_ext_query_dim # W+1 + + Q_H_ = self.network.add_generic_layer( + f"{prefix_name}_ln_Q_H_", + cls="split_dims", + source=Q, + axis="f", + dims=(self.enc_att_num_heads_dim, self.enc_per_head_dim), + ) # [B*C, W [+1], H, D/H] + + # this scaling is actually a bug in self_attention layer. so to be comparable, we do same here. 
+ dim_per_head_const = self.network.add_generic_layer( + f"{prefix_name}_dim_per_head_const", + cls="constant", + value=self.enc_key_per_head_dim, + source=None, + dtype="float32", + ) # [1] + Q_energy_factor = self.network.add_generic_layer( + f"{prefix_name}_Q_energy_factor", cls="eval", source=dim_per_head_const, eval="source(0) ** -0.5" + ) # [1] + Q_H = self.network.add_generic_layer( + f"{prefix_name}_ln_Q_H", + cls="combine", + kind="mul", + source=[Q_H_, Q_energy_factor], + ) # [B*C, W [+1], H, D/H] + + energy = self.network.add_generic_layer( + f"{prefix_name}_ln_energy", + cls="dot", + source=[K_H, Q_H], + reduce=self.enc_per_head_dim, # D/H + var1="auto", + var2="auto", + ) # [B*C, H, W*N [+M], W [+1]] + + if self.memory_variant_opts.use_cached_prev_kv: + # input does not matter for rel pos enc, so we need to make sure the shape is correct + rel_pos_inputs = K # [B*C, W*N [+M], D] + else: + # just to not break hashes... + rel_pos_inputs = concat_prev_chunks_inputs # [B*C, W*N, D] + + ln_rel_pos_enc = self.network.add_generic_layer( + f"{prefix_name}_ln_rel_pos_enc", + cls="relative_positional_encoding", + source=rel_pos_inputs, + out_dim=self.enc_per_head_dim, # D/H + forward_weights_init=self.ff_init, + clipping=self.rel_pos_clipping, + query_spatial_dim=query_dim, # W+1 or W + key_value_spatial_dim=kv_dim, # W*N [+M] + query_offset=self.memory_variant_opts.chunk_size * self.memory_variant_opts.mem_size, + ) # [queries (W [+1]), kvs (W*N [+M]), D/H] + + if self.memory_variant_opts.use_emformer_mem: # -> have summary, i.e. 
[W+1] + mask_query = "emformer_mask_query_dim" + if mask_query not in self.network: + range_in_query_dim = self.network.add_generic_layer( + "emformer_range_query_dim", cls="range_in_axis", source=Q, axis=self.emformer_ext_query_dim + ) # [W+1] + mask_query = self.network.add_eval_layer( + mask_query, + range_in_query_dim, + eval=f"tf.less(source(0), {self.memory_variant_opts.chunk_size})", + out_type={"dtype": "bool"}, + ) # [W+1] + ln_rel_pos_enc = self.network.add_eval_layer( + f"{prefix_name}_ln_rel_pos_enc_masked_query", + [ln_rel_pos_enc, mask_query], + eval="tf.where(source(1), source(0), 0.)", + ) + + if mem_bank_K: + mask_kv = "emformer_mask_kv_dim" + if mask_kv not in self.network: + range_in_kv_dim = self.network.add_generic_layer( + "emformer_range_kv_dim", cls="range_in_axis", source=K, axis=kv_dim + ) # [W*N + M] + kv_dim_len = self.network.add_generic_layer("kv_dim_len", cls="length", source=K, axis=kv_dim) + mem_len = self.network.add_generic_layer( + "mem_len", cls="length", source=mem_bank_K, axis=self.emformer_mem_bank_dim + ) + mask_kv = self.network.add_eval_layer( + mask_kv, + [range_in_kv_dim, kv_dim_len, mem_len], + eval=f"tf.less(source(0), source(1) - source(2))", + out_type={"dtype": "bool"}, + ) + ln_rel_pos_enc = self.network.add_eval_layer( + f"{prefix_name}_ln_rel_pos_enc_masked_kv", + [ln_rel_pos_enc, mask_kv], + eval="tf.where(source(1), source(0), 0.)", + ) + + energy_rel_pos = self.network.add_generic_layer( + f"{prefix_name}_ln_energy_rel_pos", + cls="dot", + source=[Q_H, ln_rel_pos_enc], + reduce=self.enc_per_head_dim, # D/H + var1="auto", + var2="auto", + ) # [B*C, H, W[+1], W*N [+M]] + + energy = self.network.add_generic_layer( + f"{prefix_name}_ln_energy_", + cls="combine", + source=[energy, energy_rel_pos], + kind="add", + allow_broadcast_all_sources=False, + ) # [B*C, H, W*N [+M], W [+1]] + + if mem_bank_K: + energy = self.network.add_eval_layer( + f"{prefix_name}_ln_energy_emformer_mem_masked", + energy, + 
eval=_energy_mask_emformer_mem, + eval_locals=dict( + chunked_time_dim=self.memory_variant_opts.chunked_time_dim, # C + chunk_size_dim=self.memory_variant_opts.chunk_size_dim, # W + att_num_heads_dim=self.enc_att_num_heads_dim, # H + query_dim=query_dim, # W+1 + kv_dim=kv_dim, # W*N+M + mem_bank_dim=self.emformer_mem_bank_dim, # M + neg_inf=-1e8, + ), + ) + + weights = self.network.add_generic_layer( + f"{prefix_name}_ln_weights", + cls="softmax_over_spatial", + source=energy, + ) # [B*C, H, W, W*N] + weights_drop = self.network.add_generic_layer( + f"{prefix_name}_ln_weights_drop", + cls="dropout", + source=weights, + dropout=self.att_dropout, + dropout_noise_shape={"*": None}, + ) # [B*C, H, W, W*N] + att0 = self.network.add_generic_layer( + f"{prefix_name}_ln_att0", + cls="dot", + source=[weights_drop, V_H], + reduce=kv_dim, # W*N + var1="auto", + var2="auto", + ) # [B*C, H, W [+1], D/H] + mhsa_ = self.network.add_generic_layer( + f"{prefix_name}_ln_att_", + cls="merge_dims", + source=att0, + axes=(self.enc_att_num_heads_dim, self.enc_per_head_dim), + ) # [B*C, W [+1], D] + mhsa = self.network.add_generic_layer( + f"{prefix_name}_ln_att", + cls="reinterpret_data", + source=mhsa_, + # TODO: is this safe? 
find a better way + set_axes={ + "T": f"dim:" + + str(self.memory_variant_opts.chunk_size + (1 if self.memory_variant_opts.use_emformer_mem else 0)) + }, + ) # [B*C, W [+1], D] + + if self.memory_variant_opts.use_emformer_mem: + # used later by shift layer to collect a memory bank + self.network.add_generic_layer( + f"{prefix_name}_emformer_mem", + cls="gather", + source=mhsa, + axis="T", + position=self.memory_variant_opts.chunk_size, + ) # [B*C, D] + mhsa = self.network.add_generic_layer( + f"{prefix_name}_ln_att_slice", + cls="slice", + source=mhsa, + axis="T", + slice_start=0, + slice_end=self.memory_variant_opts.chunk_size, + ) # [B*C, W, D] + + return mhsa + + def _create_mhsa_module(self, prefix_name, source, layer_index): + """ + Add Multi-Headed Selft-Attention Module: + LN + MHSA + Dropout + + :param str prefix: some prefix name + :param str source: name of source layer + :param int layer_index: index of layer + :return: last layer name of this module + :rtype: str + """ + prefix_name = "{}_self_att".format(prefix_name) + ln = self.network.add_layer_norm_layer("{}_ln".format(prefix_name), source) + ln_rel_pos_enc = None + + if self.pos_enc == "rel": + ln_rel_pos_enc = self.network.add_relative_pos_encoding_layer( + "{}_ln_rel_pos_enc".format(prefix_name), + ln, + n_out=self.enc_key_per_head_dim, + forward_weights_init=self.ff_init, + clipping=self.rel_pos_clipping, + ) + + if self.memory_variant_opts is not None: + # ln: [B*C, W, D] + + if self.memory_variant_opts.use_cached_prev_kv is False: + # shifted inputs + current chunk + ln_concat_chunks = self._get_mem_chunks( + prefix_name=f"{prefix_name}_ln", input_layer=ln, mem_size=self.memory_variant_opts.mem_size + ) + ln_concat_chunks += [(ln, "T")] + ln_ = self.network.add_generic_layer( + f"{prefix_name}_ln_concat", + cls="concat", + source=ln_concat_chunks, + out_dim=self.concat_window_dim, + ) # [B*C, W*N, D] + else: + ln_ = None + + if self.memory_variant_opts.self_att_version == 0: + assert 
self.memory_variant_opts.use_cached_prev_kv is False, "Not implemented." + # this implementation is not efficient. + ln_rel_pos_enc = self.network.add_relative_pos_encoding_layer( + f"{prefix_name}_ln_rel_pos_enc", + ln_, + n_out=self.enc_key_per_head_dim, + forward_weights_init=self.ff_init, + clipping=self.rel_pos_clipping, + ) # [B*C, W*N, D/H] + # same param name as before + mhsa_ = self.network.add_self_att_layer( + name=prefix_name, + source=ln_, + n_out=self.enc_value_dim, + num_heads=self.att_num_heads, + total_key_dim=self.enc_key_dim, + att_dropout=self.att_dropout, + forward_weights_init=self.mhsa_init, + key_shift=ln_rel_pos_enc if ln_rel_pos_enc is not None else None, + l2=self.self_att_l2, + attention_left_only=self.use_causal_layers, + param_variational_noise=self.weight_noise if "mhsa" in self.weight_noise_layers else None, + param_dropout=self.mhsa_weight_drop, + param_dropout_min_ndim=2, + ) # [B*C, W*N, D] + mhsa_splits = self.network.add_generic_layer( + f"{prefix_name}_splits", + cls="split", + source=mhsa_, + axis="T", + num_splits=2, + ) # two tensors of shape [B*C, W, D] + mhsa = self.network.add_generic_layer( + f"{prefix_name}_split_2", + cls="copy", + source=mhsa_splits + "/1", # select second half + ) # [B*C, W, D] + else: + # For efficient computation: + # We cannot use SelfAttentionLayer on the concatenated tensor, + # because it would waste compute time for the frames of the last chunk. + # So reimplement the SelfAttentionLayer here explicitly. 
+ # key, value: via concatenated, [B*C, W*N, D] + # + # in case use_cached_prev_kv is enabled: + # - ln_ contains the cached keys and values only + # - project only current chunk + mhsa = self._self_att_v2( + prefix_name, input_layer=ln, concat_prev_chunks_inputs=ln_, layer_index=layer_index + ) + else: + mhsa = self.network.add_self_att_layer( + name=prefix_name, + source=ln, + n_out=self.enc_value_dim, + num_heads=self.att_num_heads, + total_key_dim=self.enc_key_dim, + att_dropout=self.att_dropout, + forward_weights_init=self.mhsa_init, + key_shift=ln_rel_pos_enc if ln_rel_pos_enc is not None else None, + l2=self.self_att_l2, + attention_left_only=self.use_causal_layers, + param_variational_noise=self.weight_noise if "mhsa" in self.weight_noise_layers else None, + param_dropout=self.mhsa_weight_drop, + param_dropout_min_ndim=2, + ) + + mhsa_linear = self.network.add_linear_layer( + "{}_linear".format(prefix_name), + mhsa, + n_out=self.enc_key_dim, + l2=self.l2, + forward_weights_init=self.mhsa_out_init, + with_bias=False, + param_dropout=self.mhsa_weight_drop, + param_dropout_min_ndim=2, + ) + + drop = self.network.add_dropout_layer("{}_dropout".format(prefix_name), mhsa_linear, dropout=self.dropout) + + res_inputs = [drop, source] + + mhsa_res = self.network.add_combine_layer( + "{}_res".format(prefix_name), kind="add", source=res_inputs, n_out=self.enc_value_dim + ) + return mhsa_res + + def _create_convolution_module(self, prefix_name, source, layer_index, half_step=False): + """ + Add Convolution Module: + LN + point-wise-conv + GLU + depth-wise-conv + BN + Swish + point-wise-conv + Dropout + + :param str prefix_name: some prefix name + :param str source: name of source layer + :param int layer_index: index of layer + :return: last layer name of this module + :rtype: str + """ + prefix_name = "{}_conv_mod".format(prefix_name) + + ln = self.network.add_layer_norm_layer("{}_ln".format(prefix_name), source) + + pointwise_conv1 = self.network.add_linear_layer( + 
"{}_pointwise_conv1".format(prefix_name), + ln, + n_out=2 * self.enc_key_dim, + activation=None, + l2=self.l2, + with_bias=self.ff_bias, + forward_weights_init=self.conv_module_init, + param_dropout=self.conv_weight_drop, + param_dropout_min_ndim=2, + ) + + glu_act = self.network.add_gating_layer("{}_glu".format(prefix_name), pointwise_conv1) + + if self.memory_variant_opts is not None and self.memory_variant_opts.conv_cache_size: + mem_chunks = self._get_mem_chunks( + prefix_name=f"{prefix_name}_glu_act", + input_layer=glu_act, + mem_size=self.memory_variant_opts.conv_cache_size, + ) + glu_act_ = self.network.add_generic_layer( + f"{prefix_name}_glu_act_concat", + cls="concat", + source=[*mem_chunks, (glu_act, "T")], + out_dim=self.conv_cache_concat_dim, + ) # [B*C, W*N, D] + else: + glu_act_ = glu_act + + if self.use_causal_conv: + # pad to the left to make it causal + depthwise_conv_input_padded = self.network.add_pad_layer( + "{}_depthwise_conv_input_padded".format(prefix_name), + glu_act_, + axes="T", + padding=(self.conv_kernel_size - 1, 0), + ) + + depthwise_conv = self.network.add_conv_layer( + prefix_name + "_" + (self.conv_alternative_name or "depthwise_conv2"), + depthwise_conv_input_padded, + n_out=self.enc_key_dim, + filter_size=(self.conv_kernel_size,), + groups=self.enc_key_dim, + l2=self.l2, + forward_weights_init=self.conv_module_init, + padding="valid", + param_dropout=self.conv_weight_drop, + param_dropout_min_ndim=2, + ) + + # Fix time dim, match with the original input + depthwise_conv = self.network.add_reinterpret_data_layer( + "{}_depthwise_conv2_".format(prefix_name), + depthwise_conv, + size_base=glu_act, + ) + else: + depthwise_conv = self.network.add_conv_layer( + prefix_name + "_" + (self.conv_alternative_name or "depthwise_conv2"), + glu_act_, + n_out=self.enc_key_dim, + filter_size=(self.conv_kernel_size,), + groups=self.enc_key_dim, + l2=self.l2, + forward_weights_init=self.conv_module_init, + param_dropout=self.conv_weight_drop, + 
param_dropout_min_ndim=2, + ) + + if self.memory_variant_opts is not None and self.memory_variant_opts.conv_cache_size: + # we apply convolution over the concatenated chunks but we only need the output of the current + # chunk, thus, we need to slice from [B*C, W*N, D] to [B*C, W, D] + assert self.memory_variant_opts.mem_slice_size, "mem_slice_size must be set." + depthwise_conv = self.network.add_generic_layer( + f"{prefix_name}_depthwise_conv_slice", + cls="slice", + source=depthwise_conv, + axis="T", + slice_start=self.memory_variant_opts.mem_slice_size * self.memory_variant_opts.conv_cache_size, + ) + + if self.use_ln: + bn = self.network.add_layer_norm_layer("{}_layer_norm".format(prefix_name), depthwise_conv) + else: + bn = self.network.add_batch_norm_layer( + "{}_bn".format(prefix_name), depthwise_conv, opts=self.batch_norm_opts + ) + + swish_act = self.network.add_activation_layer("{}_swish".format(prefix_name), bn, activation="swish") + + pointwise_conv2 = self.network.add_linear_layer( + "{}_pointwise_conv2".format(prefix_name), + swish_act, + n_out=self.enc_key_dim, + activation=None, + l2=self.l2, + with_bias=self.ff_bias, + forward_weights_init=self.conv_module_init, + param_dropout=self.conv_weight_drop, + param_dropout_min_ndim=2, + ) + + drop = self.network.add_dropout_layer("{}_drop".format(prefix_name), pointwise_conv2, dropout=self.dropout) + + if half_step: + drop = self.network.add_eval_layer("{}_half_step".format(prefix_name), drop, eval="0.5 * source(0)") + + res_inputs = [drop, source] + + res = self.network.add_combine_layer( + "{}_res".format(prefix_name), kind="add", source=res_inputs, n_out=self.enc_key_dim + ) + return res + + def _block_prefix_name(self, layer_index: int) -> str: + assert layer_index >= 1 + if self.add_to_prefix_name: + prefix_name = "conformer_block_%s_%02i" % (self.add_to_prefix_name, layer_index) + else: + prefix_name = "conformer_block_%02i" % layer_index + return prefix_name + + def _create_conformer_block(self, 
i, source): + """ + Add the whole Conformer block: + x1 = x0 + 1/2 * FFN(x0) (FFN module 1) + x2 = x1 + MHSA(x1) (MHSA) + x3 = x2 + Conv(x2) (Conv module) + x4 = LayerNorm(x3 + 1/2 * FFN(x3)) (FFN module 2) + + :param int i: layer index + :param str source: name of source layer + :return: last layer name of this module + :rtype: str + """ + prefix_name = self._block_prefix_name(i) + ff_module1 = self._create_ff_module(prefix_name, 1, source, i) + + if self.convolution_first: + conv_module_ = self._create_convolution_module(prefix_name, ff_module1, i) + mhsa_module = self._create_mhsa_module(prefix_name, conv_module_, i) + ff_module2_input = mhsa_module + else: + if self.no_mhsa_module: + mhsa = ff_module1 # use FF1 module output as input to conv module + else: + mhsa_input = ff_module1 + if self.sandwich_conv: + conv_module1 = self._create_convolution_module( + prefix_name + "_sandwich", ff_module1, i, half_step=True + ) + mhsa_input = conv_module1 + mhsa = self._create_mhsa_module(prefix_name, mhsa_input, i) + + conv_module = self._create_convolution_module(prefix_name, mhsa, i, half_step=self.sandwich_conv) + ff_module2_input = conv_module + + ff_module2 = self._create_ff_module(prefix_name, 2, ff_module2_input, i) + res = ff_module2 + if self.block_final_norm: + res = self.network.add_layer_norm_layer("{}_ln".format(prefix_name), res) + if self.subsample: + assert 0 <= i - 1 < len(self.subsample) + subsample_factor = self.subsample_list[i - 1] + if subsample_factor > 1: + res = self.network.add_pool_layer(res + "_pool{}".format(i), res, pool_size=(subsample_factor,)) + res = self.network.add_copy_layer(prefix_name, res) + return res + + def _create_all_network_parts(self): + """ + ConvSubsampling/LSTM -> Linear -> Dropout -> [Conformer Blocks] x N + """ + data = self.input + if self.specaug: + data = self.network.add_eval_layer( + "source", + data, + eval="self.network.get_config().typed_value('transform')(source(0, as_data=True), network=self.network)", + ) + + 
subsampled_input = None + if self.input_layer is None: + subsampled_input = data + elif self.input_layer.startswith("stack"): + stack_size = int(self.input_layer.split("-")[1]) + stack_window = self.network.add_window_layer( + "stack_window", data, window_size=stack_size, stride=stack_size + ) # [B,C,W,F] + subsampled_input = self.network.add_merge_dims_layer( + "stack_window_merge_dim", + stack_window, + axes=["static:0", "f"], + keep_order=True, + ) # [B,C,W*F] + elif "lstm" in self.input_layer: + sample_factor = int(self.input_layer.split("-")[1]) + pool_sizes = None + if sample_factor == 2: + pool_sizes = [2, 1] + elif sample_factor == 4: + pool_sizes = [2, 2] + elif sample_factor == 6: + pool_sizes = [3, 2] + # add 2 LSTM layers with max pooling to subsample and encode positional information + subsampled_input = self.network.add_lstm_layers( + data, + num_layers=2, + lstm_dim=self.enc_key_dim, + dropout=self.lstm_dropout, + bidirectional=True, + rec_weight_dropout=self.rec_weight_dropout, + l2=self.l2, + pool_sizes=pool_sizes, + ) + elif self.input_layer == "conv-4": + # conv-layer-1: 3x3x32 followed by max pool layer on feature axis (1, 2) + # conv-layer-2: 3x3x64 with striding (2, 1) on time axis + # conv-layer-3: 3x3x64 with striding (2, 1) on time axis + + # TODO: make this more generic + + conv_input = self.network.add_conv_block( + "conv_out", + data, + hwpc_sizes=[((3, 3), (1, 2), 32)], + l2=self.frontend_conv_l2, + activation=self.input_layer_conv_act, + init=self.start_conv_init, + merge_out=False, + param_variational_noise=self.weight_noise if "frontend_conv" in self.weight_noise_layers else None, + ) + + subsampled_input = self.network.add_conv_block( + "conv_merged", + conv_input, + hwpc_sizes=[((3, 3), (2, 1), 64), ((3, 3), (2, 1), 64)], + l2=self.frontend_conv_l2, + activation=self.input_layer_conv_act, + init=self.start_conv_init, + use_striding=True, + split_input=False, + prefix_name="subsample_", + merge_out_fixed=self.fix_merge_dims, + 
param_variational_noise=self.weight_noise if "frontend_conv" in self.weight_noise_layers else None, + ) + elif self.input_layer == "conv-6": + conv_input = self.network.add_conv_block( + "conv_out", + data, + hwpc_sizes=[((3, 3), (1, 2), 32)], + l2=self.frontend_conv_l2, + activation=self.input_layer_conv_act, + init=self.start_conv_init, + merge_out=False, + param_variational_noise=self.weight_noise if "frontend_conv" in self.weight_noise_layers else None, + ) + + subsampled_input = self.network.add_conv_block( + "conv_merged", + conv_input, + hwpc_sizes=[((3, 3), (3, 1), 64), ((3, 3), (2, 1), 64)], + l2=self.frontend_conv_l2, + activation=self.input_layer_conv_act, + init=self.start_conv_init, + use_striding=True, + split_input=False, + prefix_name="subsample_", + merge_out_fixed=self.fix_merge_dims, + param_variational_noise=self.weight_noise if "frontend_conv" in self.weight_noise_layers else None, + ) + + assert subsampled_input is not None + + source_linear = self.network.add_linear_layer( + "source_linear", + subsampled_input, + n_out=self.enc_key_dim, + l2=self.l2, + forward_weights_init=self.ff_init, + with_bias=False, + param_dropout=self.ff_weight_drop, + param_dropout_min_ndim=2, + ) + + if self.add_abs_pos_enc_to_input: + source_linear = self.network.add_pos_encoding_layer("input_abs_pos_enc", source_linear, add_to_input=True) + + if self.dropout_in: + source_linear = self.network.add_dropout_layer("source_dropout", source_linear, dropout=self.dropout_in) + + conformer_block_src = source_linear + for i in range(1, self.num_blocks + 1): + conformer_block_src = self._create_conformer_block(i, conformer_block_src) + + encoder = self.network.add_copy_layer(self.output_layer_name, conformer_block_src) + + if self.with_ctc: + default_ctc_loss_opts = {"beam_width": 1} + if self.native_ctc: + default_ctc_loss_opts["use_native"] = True + else: + self.ctc_opts.update({"ignore_longer_outputs_than_inputs": True}) # always enable + if self.ctc_opts: + 
default_ctc_loss_opts["ctc_opts"] = self.ctc_opts + self.network.add_softmax_layer( + "ctc", + encoder, + l2=self.ctc_l2, + target=self.target, + loss="ctc", + dropout=self.ctc_dropout, + loss_opts=default_ctc_loss_opts, + ) + if self.ctc_loss_scale or self.ctc_self_align_delay: + self.network["ctc"]["loss_scale"] = (self.ctc_loss_scale or 1.0) * ( + (1.0 - self.ctc_self_align_scale) if self.ctc_self_align_delay else 1.0 + ) + + if self.ctc_self_align_delay: + # http://arxiv.org/abs/2105.05005 + assert self.ctc_self_align_delay > 0 # not implemented otherwise, but also not sure if meaningful + self.network["ctc_log_prob"] = {"class": "activation", "from": "ctc", "activation": "safe_log"} + # Cut off first N frames. + self.network[f"ctc_log_prob_slice{self.ctc_self_align_delay}"] = { + "class": "slice", + "from": "ctc_log_prob", + "axis": "T", + "slice_start": self.ctc_self_align_delay, + } + # Forced alignment using that. + self.network[f"ctc_forced_alignment_slice{self.ctc_self_align_delay}"] = { + "class": "forced_align", + "align_target": f"data:{self.target}", + "topology": "ctc", + "from": f"ctc_log_prob_slice{self.ctc_self_align_delay}", + "input_type": "log_prob", + } + # Add blanks at the end. + self.network["_blank_idx"] = { + "class": "length", + "from": f"data:{self.target}", + "axis": "sparse_dim", + "sparse": True, + } + self.network[f"ctc_forced_alignment_shift{self.ctc_self_align_delay}"] = { + "class": "postfix_in_time", + "from": f"ctc_forced_alignment_slice{self.ctc_self_align_delay}", + "postfix": "_blank_idx", + "repeat": self.ctc_self_align_delay, + } + # Now CE loss to those targets. 
+ self.network[f"ctc_ce_shift{self.ctc_self_align_delay}"] = { + "class": "copy", + "from": "ctc", + "loss": "ce", + "loss_scale": (self.ctc_loss_scale or 1.0) * self.ctc_self_align_scale, + "target": f"layer:ctc_forced_alignment_shift{self.ctc_self_align_delay}", + } + + return encoder + + def _create_conformer_blocks(self, input): + if self.proj_input: + conformer_block_src = self.network.add_linear_layer( + "encoder_proj", input, n_out=self.enc_key_dim, activation=None, with_bias=False + ) + else: + conformer_block_src = input + for i in range(1, self.num_blocks + 1): + conformer_block_src = self._create_conformer_block(i, conformer_block_src) + encoder = self.network.add_copy_layer(self.output_layer_name, conformer_block_src) + return encoder + + def create_network(self): + # create only conformer blocks without front-end, etc + if self.create_only_blocks: + return self._create_conformer_blocks(input=self.input) + return self._create_all_network_parts() + + +@dataclass +class ConformerMemoryVariantOpts: + split_batch_time_base: str + chunked_time_dim: Dim # C, number of chunks + chunk_size_dim: Dim # W, including extended left/right, excluding Emformer summary or so + chunk_size: int + self_att_version: int # TODO: just for testing + mem_size: int + mem_slice_start: int + mem_slice_size: int + conv_cache_size: int + use_cached_prev_kv: bool + use_emformer_mem: bool # https://arxiv.org/abs/2010.10759 + apply_tanh_on_emformer_mem: bool + + +def _energy_mask_emformer_mem( + *, + self, + source, + chunked_time_dim: Dim, # C + chunk_size_dim: Dim, # W + att_num_heads_dim: Dim, # H + query_dim: Dim, # W+1 + kv_dim: Dim, # W*N+M, M=C + mem_bank_dim: Dim, # M, M=C + neg_inf: float, + **_kwargs, +): + import numpy + import tensorflow as tf + from returnn.tensor import Tensor, Dim, batch_dim + from returnn.tf.util.data import BatchInfo + + chunk_size_dim # unused # noqa + + energy_data: Tensor = source(0, as_data=True) # [B*C, H, W*N+M, W+1], M=C, dims not necessarily 
that order + + assert len(energy_data.batch.virtual_dims) == 2 + batch_virtual_dim0, batch_virtual_dim1 = energy_data.batch.virtual_dims + assert isinstance(batch_virtual_dim0, BatchInfo.GlobalBatchDim) + assert isinstance(batch_virtual_dim1, BatchInfo.FixedDim) + assert batch_virtual_dim1.dim_tag == chunked_time_dim + + energy_shape = [] + energy_dims = [] + for d in energy_data.dims: + if d.is_batch_dim(): + energy_dims += [batch_dim, chunked_time_dim] + energy_shape += [batch_virtual_dim0.size, chunked_time_dim.get_dim_value()] + continue + energy_dims.append(d) + energy_shape.append(d.get_dim_value()) + energy: tf.Tensor = tf.reshape(energy_data.raw_tensor, energy_shape) # [B, C, H, W*N [+M], W+1] + assert set(energy_dims) == {batch_dim, chunked_time_dim, att_num_heads_dim, query_dim, kv_dim} + assert len(energy_dims) == len(energy_shape) == energy.shape.rank == 5 + + def _bc_shape(d_: Dim): + ls = [(a_, d__) for a_, d__ in enumerate(energy_dims) if d__ == d_] + if not ls: + raise Exception(f"dim {d_} not found in {energy_dims}") + if len(ls) > 1: + raise Exception(f"dim {d_} found multiple times in {energy_dims}: {ls}") + a = ls[0][0] + return [1] * a + [d_.get_dim_value()] + [1] * (len(energy_dims) - a - 1) + + c_range: tf.Tensor = tf.range(chunked_time_dim.get_dim_value()) # [C] + c_range: tf.Tensor = tf.reshape(c_range, _bc_shape(chunked_time_dim)) # [..C..] + q_range: tf.Tensor = tf.range(query_dim.get_dim_value()) # [W+1] + q_range: tf.Tensor = tf.reshape(q_range, _bc_shape(query_dim)) # [..W+1..] + q_s_idx = query_dim.get_dim_value() - 1 # W + kv_range: tf.Tensor = tf.range(kv_dim.get_dim_value()) # [W*N+M] + kv_range: tf.Tensor = tf.reshape(kv_range, _bc_shape(kv_dim)) # [..W*N+M..] + kv_m_start_idx = kv_dim.get_dim_value() - mem_bank_dim.get_dim_value() # W*N + # In chunk c, we only allow to attend to the previous chunk memories m < c. + mask0 = tf.less(kv_range, kv_m_start_idx + c_range) # [..C.., ..W*N+M..] 
+ # In summary, we do not attend to any memories. + mask1 = tf.less(q_range, q_s_idx) | ( + tf.equal(q_range, q_s_idx) & tf.less(kv_range, kv_m_start_idx) + ) # [..W+1.., ..W*N+M..] + mask = mask0 & mask1 # [..C.., ..W+1.., ..W*N+M..] + energy = tf.where(mask, energy, neg_inf) + + energy = tf.reshape(energy, [d.get_dim_value() for d in energy_data.dims]) # [B*C, H, W*N+M, W+1] + if numpy.isinf(neg_inf): + self.allow_inf_in_output = True + return energy From fe549d67e71f4f292cfa36186668ed343027d64a Mon Sep 17 00:00:00 2001 From: Mohammad Zeineldeen Date: Wed, 15 May 2024 14:59:06 +0000 Subject: [PATCH 003/227] fix --- users/zeineldeen/models/asr/encoder/conformer_encoder_v2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/users/zeineldeen/models/asr/encoder/conformer_encoder_v2.py b/users/zeineldeen/models/asr/encoder/conformer_encoder_v2.py index 4633e7665..3c31072f9 100644 --- a/users/zeineldeen/models/asr/encoder/conformer_encoder_v2.py +++ b/users/zeineldeen/models/asr/encoder/conformer_encoder_v2.py @@ -17,7 +17,7 @@ from i6_core.returnn.config import CodeWrapper -class ConformerEncoder: +class ConformerEncoderV2: """ Represents Conformer Encoder Architecture From f1c35d15b4881bc4f87703e1eae6ea71e9dcff4a Mon Sep 17 00:00:00 2001 From: Mohammad Zeineldeen Date: Wed, 15 May 2024 15:12:21 +0000 Subject: [PATCH 004/227] add more weight noise opts --- .../asr/encoder/conformer_encoder_v2.py | 54 +++++++++++++------ users/zeineldeen/modules/network.py | 3 ++ 2 files changed, 40 insertions(+), 17 deletions(-) diff --git a/users/zeineldeen/models/asr/encoder/conformer_encoder_v2.py b/users/zeineldeen/models/asr/encoder/conformer_encoder_v2.py index 3c31072f9..110b74553 100644 --- a/users/zeineldeen/models/asr/encoder/conformer_encoder_v2.py +++ b/users/zeineldeen/models/asr/encoder/conformer_encoder_v2.py @@ -1,5 +1,6 @@ """ Conformer encoder +Same as conformer_encoder.py but with more regularizations Other implementations: @@ -14,8 +15,6 @@ from 
returnn.tf.util.data import SpatialDim, FeatureDim from i6_experiments.users.zeineldeen.modules.network import ReturnnNetwork -from i6_core.returnn.config import CodeWrapper - class ConformerEncoderV2: """ @@ -28,7 +27,7 @@ class ConformerEncoderV2: def __init__( self, input="data", - input_layer="lstm-6", + input_layer="conv-6", input_layer_conv_act="relu", add_abs_pos_enc_to_input=False, frontend_conv_l2=0.0, @@ -79,12 +78,15 @@ def __init__( use_causal_conv=None, conv_alternative_name: Optional[str] = None, fix_merge_dims=False, - weight_noise=None, - weight_noise_layers=None, + ff_weight_noise=None, + mhsa_weight_noise=None, + conv_weight_noise=None, + frontend_conv_weight_noise=None, convolution_first=False, ff_weight_dropout=None, mhsa_weight_dropout=None, conv_weight_dropout=None, + frontend_conv_weight_dropout=None, memory_variant_opts: Optional[ConformerMemoryVariantOpts] = None, ): """ @@ -221,18 +223,17 @@ def __init__( self.conv_alternative_name = conv_alternative_name self.fix_merge_dims = fix_merge_dims - self.weight_noise = weight_noise - self.weight_noise_layers = weight_noise_layers - if self.weight_noise_layers is None: - self.weight_noise_layers = [] - for layer in self.weight_noise_layers: - assert layer in ["mhsa", "conv", "frontend_conv"] + self.ff_weight_noise = ff_weight_noise + self.conv_weight_noise = conv_weight_noise + self.mhsa_weight_noise = mhsa_weight_noise + self.frontend_conv_weight_noise = frontend_conv_weight_noise self.convolution_first = convolution_first self.ff_weight_drop = ff_weight_dropout self.conv_weight_drop = conv_weight_dropout self.mhsa_weight_drop = mhsa_weight_dropout + self.frontend_conv_weight_drop = frontend_conv_weight_dropout self.memory_variant_opts = memory_variant_opts if self.memory_variant_opts: @@ -271,6 +272,7 @@ def _create_ff_module(self, prefix_name, i, source, layer_index): with_bias=self.ff_bias, param_dropout=self.ff_weight_drop, param_dropout_min_ndim=2, + 
param_variational_noise=self.ff_weight_noise, ) if self.use_sqrd_relu: @@ -294,6 +296,7 @@ def _create_ff_module(self, prefix_name, i, source, layer_index): with_bias=self.ff_bias, param_dropout=self.ff_weight_drop, param_dropout_min_ndim=2, + param_variational_noise=self.ff_weight_noise, ) drop2 = self.network.add_dropout_layer("{}_drop2".format(prefix_name), ff2, dropout=self.dropout) @@ -387,6 +390,7 @@ def _self_att_v2( L2=self.self_att_l2, param_dropout=self.mhsa_weight_drop, param_dropout_min_ndim=2, + param_variational_noise=self.mhsa_weight_noise, ) # [B*C, W*N, D] or [B*C, W, D] (use_cached_prev_kv) V = self.network.add_generic_layer( @@ -399,6 +403,7 @@ def _self_att_v2( L2=self.self_att_l2, param_dropout=self.mhsa_weight_drop, param_dropout_min_ndim=2, + param_variational_noise=self.mhsa_weight_noise, ) # [B*C, W*N, D] or [B*C, W, D] (use_cached_prev_kv) if self.memory_variant_opts.use_emformer_mem and layer_index > 1: @@ -555,6 +560,7 @@ def _self_att_v2( L2=self.self_att_l2, param_dropout=self.mhsa_weight_drop, param_dropout_min_ndim=2, + param_variational_noise=self.mhsa_weight_noise, ) # second half of shape [B*C, W, D] query_dim = self.memory_variant_opts.chunk_size_dim # W @@ -839,7 +845,7 @@ def _create_mhsa_module(self, prefix_name, source, layer_index): key_shift=ln_rel_pos_enc if ln_rel_pos_enc is not None else None, l2=self.self_att_l2, attention_left_only=self.use_causal_layers, - param_variational_noise=self.weight_noise if "mhsa" in self.weight_noise_layers else None, + param_variational_noise=self.mhsa_weight_noise, param_dropout=self.mhsa_weight_drop, param_dropout_min_ndim=2, ) # [B*C, W*N, D] @@ -880,7 +886,7 @@ def _create_mhsa_module(self, prefix_name, source, layer_index): key_shift=ln_rel_pos_enc if ln_rel_pos_enc is not None else None, l2=self.self_att_l2, attention_left_only=self.use_causal_layers, - param_variational_noise=self.weight_noise if "mhsa" in self.weight_noise_layers else None, + 
param_variational_noise=self.mhsa_weight_noise, param_dropout=self.mhsa_weight_drop, param_dropout_min_ndim=2, ) @@ -894,6 +900,7 @@ def _create_mhsa_module(self, prefix_name, source, layer_index): with_bias=False, param_dropout=self.mhsa_weight_drop, param_dropout_min_ndim=2, + param_variational_noise=self.mhsa_weight_noise, ) drop = self.network.add_dropout_layer("{}_dropout".format(prefix_name), mhsa_linear, dropout=self.dropout) @@ -930,6 +937,7 @@ def _create_convolution_module(self, prefix_name, source, layer_index, half_step forward_weights_init=self.conv_module_init, param_dropout=self.conv_weight_drop, param_dropout_min_ndim=2, + param_variational_noise=self.conv_weight_noise, ) glu_act = self.network.add_gating_layer("{}_glu".format(prefix_name), pointwise_conv1) @@ -969,6 +977,7 @@ def _create_convolution_module(self, prefix_name, source, layer_index, half_step padding="valid", param_dropout=self.conv_weight_drop, param_dropout_min_ndim=2, + param_variational_noise=self.conv_weight_noise, ) # Fix time dim, match with the original input @@ -988,6 +997,7 @@ def _create_convolution_module(self, prefix_name, source, layer_index, half_step forward_weights_init=self.conv_module_init, param_dropout=self.conv_weight_drop, param_dropout_min_ndim=2, + param_variational_noise=self.conv_weight_noise, ) if self.memory_variant_opts is not None and self.memory_variant_opts.conv_cache_size: @@ -1021,6 +1031,7 @@ def _create_convolution_module(self, prefix_name, source, layer_index, half_step forward_weights_init=self.conv_module_init, param_dropout=self.conv_weight_drop, param_dropout_min_ndim=2, + param_variational_noise=self.conv_weight_noise, ) drop = self.network.add_dropout_layer("{}_drop".format(prefix_name), pointwise_conv2, dropout=self.dropout) @@ -1151,7 +1162,9 @@ def _create_all_network_parts(self): activation=self.input_layer_conv_act, init=self.start_conv_init, merge_out=False, - param_variational_noise=self.weight_noise if "frontend_conv" in 
self.weight_noise_layers else None, + param_dropout=self.frontend_conv_weight_drop, + param_dropout_min_ndim=2, + param_variational_noise=self.frontend_conv_weight_noise, ) subsampled_input = self.network.add_conv_block( @@ -1165,7 +1178,9 @@ def _create_all_network_parts(self): split_input=False, prefix_name="subsample_", merge_out_fixed=self.fix_merge_dims, - param_variational_noise=self.weight_noise if "frontend_conv" in self.weight_noise_layers else None, + param_dropout=self.frontend_conv_weight_drop, + param_dropout_min_ndim=2, + param_variational_noise=self.frontend_conv_weight_noise, ) elif self.input_layer == "conv-6": conv_input = self.network.add_conv_block( @@ -1176,7 +1191,9 @@ def _create_all_network_parts(self): activation=self.input_layer_conv_act, init=self.start_conv_init, merge_out=False, - param_variational_noise=self.weight_noise if "frontend_conv" in self.weight_noise_layers else None, + param_dropout=self.frontend_conv_weight_drop, + param_dropout_min_ndim=2, + param_variational_noise=self.frontend_conv_weight_noise, ) subsampled_input = self.network.add_conv_block( @@ -1190,7 +1207,9 @@ def _create_all_network_parts(self): split_input=False, prefix_name="subsample_", merge_out_fixed=self.fix_merge_dims, - param_variational_noise=self.weight_noise if "frontend_conv" in self.weight_noise_layers else None, + param_dropout=self.frontend_conv_weight_drop, + param_dropout_min_ndim=2, + param_variational_noise=self.frontend_conv_weight_noise, ) assert subsampled_input is not None @@ -1204,6 +1223,7 @@ def _create_all_network_parts(self): with_bias=False, param_dropout=self.ff_weight_drop, param_dropout_min_ndim=2, + param_variational_noise=self.ff_weight_noise, ) if self.add_abs_pos_enc_to_input: diff --git a/users/zeineldeen/modules/network.py b/users/zeineldeen/modules/network.py index 1278f1029..f763da614 100644 --- a/users/zeineldeen/modules/network.py +++ b/users/zeineldeen/modules/network.py @@ -85,6 +85,7 @@ def add_linear_layer( 
forward_weights_init=None, param_dropout=None, param_dropout_min_ndim=None, + param_variational_noise=None, **kwargs, ): d = {"class": "linear", "activation": activation, "with_bias": with_bias, "from": source} @@ -102,6 +103,8 @@ def add_linear_layer( d["param_dropout"] = param_dropout if param_dropout_min_ndim is not None: d["param_dropout_min_ndim"] = param_dropout_min_ndim + if param_variational_noise: + d["param_variational_noise"] = param_variational_noise d.update(kwargs) self._net[name] = d return name From 4fe365a15f2c78f7f2b3b710cd12053022f529fc Mon Sep 17 00:00:00 2001 From: Mohammad Zeineldeen Date: Wed, 15 May 2024 15:15:53 +0000 Subject: [PATCH 005/227] black formatting --- .../models/asr/decoder/conformer_decoder.py | 827 ++++++++++-------- .../models/asr/decoder/transformer_decoder.py | 624 +++++++------ 2 files changed, 850 insertions(+), 601 deletions(-) diff --git a/users/zeineldeen/models/asr/decoder/conformer_decoder.py b/users/zeineldeen/models/asr/decoder/conformer_decoder.py index 0bca298ec..1ba8cd4bb 100644 --- a/users/zeineldeen/models/asr/decoder/conformer_decoder.py +++ b/users/zeineldeen/models/asr/decoder/conformer_decoder.py @@ -2,351 +2,488 @@ class ConformerDecoder: - """ - Represents Conformer Decoder with causal convolution modules and masked self-attention - """ - - def __init__(self, - base_model, target='bpe', num_layers=6, beam_size=12, ff_init=None, ff_dim=2048, - ff_bias=True, activation='swish', use_sqrd_relu=False, conv_kernel_size=32, conv_module_init=None, - att_num_heads=8, dropout=0.1, att_dropout=0.1, softmax_dropout=0.0, embed_dropout=0.1, l2=0.0, - self_att_l2=0.0, apply_embed_weight=False, label_smoothing=0.1, mhsa_init=None, half_step=True, - mhsa_out_init=None, pos_enc=None, rel_pos_clipping=16, length_normalization=True, - replace_cross_att_w_masked_self_att=False, create_ilm_decoder=False, ilm_type=None, ilm_args=None): - - self.base_model = base_model - self.enc_value_dim = base_model.enc_value_dim - 
self.enc_key_dim = base_model.enc_key_dim - self.enc_att_num_heads = base_model.att_num_heads - self.enc_key_per_head_dim = base_model.enc_key_per_head_dim - self.enc_val_per_head_dim = base_model.enc_val_per_head_dim - - self.att_num_heads = att_num_heads - - self.target = target - self.num_layers = num_layers - self.beam_size = beam_size - - self.ff_init = ff_init - self.ff_dim = ff_dim - self.ff_bias = ff_bias - - self.conv_kernel_size = conv_kernel_size - self.conv_module_init = conv_module_init - - self.activation = activation - self.use_sqrd_relu = use_sqrd_relu - - self.mhsa_init = mhsa_init - self.mhsa_out_init = mhsa_out_init - - self.pos_enc = pos_enc - self.rel_pos_clipping = rel_pos_clipping - self.half_step = half_step - - self.dropout = dropout - self.softmax_dropout = softmax_dropout - self.att_dropout = att_dropout - self.label_smoothing = label_smoothing - - self.l2 = l2 - self.self_att_l2 = self_att_l2 - - self.embed_dropout = embed_dropout - self.embed_weight = None - - if apply_embed_weight: - self.embed_weight = self.enc_value_dim ** 0.5 - - self.decision_layer_name = None - self.length_normalization = length_normalization - - self.replace_cross_att_w_masked_self_att = replace_cross_att_w_masked_self_att # used to train ILM - - # used for recognition with ILM - self.create_ilm_decoder = create_ilm_decoder - self.ilm_type = ilm_type - self.ilm_args = ilm_args or {} - if self.create_ilm_decoder: - self.replace_cross_att_w_masked_self_att = False # keep original decoder as-is - - self.network = ReturnnNetwork() - self.subnet_unit = ReturnnNetwork() - self.output_prob = None - - def _create_masked_mhsa(self, prefix, source, **kwargs): - prefix_name = '{}_self_att'.format(prefix) - - # for tuning mini-self-att ILM - att_num_heads = kwargs.get('att_num_heads', self.att_num_heads) - enc_key_dim = kwargs.get('enc_key_dim', self.enc_key_dim) - enc_key_per_head_dim = enc_key_dim // att_num_heads - - ln = 
self.subnet_unit.add_layer_norm_layer('{}_ln'.format(prefix_name), source) - ln_rel_pos_enc = None - - if self.pos_enc == 'rel': - ln_rel_pos_enc = self.subnet_unit.add_relative_pos_encoding_layer( - '{}_ln_rel_pos_enc'.format(prefix_name), ln, n_out=enc_key_per_head_dim, forward_weights_init=self.ff_init, - clipping=self.rel_pos_clipping) - - mhsa = self.subnet_unit.add_self_att_layer( - '{}'.format(prefix_name), ln, n_out=self.enc_value_dim, num_heads=att_num_heads, attention_left_only=True, - total_key_dim=enc_key_dim, att_dropout=self.att_dropout, forward_weights_init=self.mhsa_init, - key_shift=ln_rel_pos_enc if ln_rel_pos_enc is not None else None, l2=self.self_att_l2) - - mhsa_linear = self.subnet_unit.add_linear_layer( - '{}_linear'.format(prefix_name), mhsa, n_out=enc_key_dim, l2=self.l2, - forward_weights_init=self.mhsa_out_init, - with_bias=False) - - drop = self.subnet_unit.add_dropout_layer('{}_dropout'.format(prefix_name), mhsa_linear, dropout=self.dropout) - - res_inputs = [drop, source] - - mhsa_res = self.subnet_unit.add_combine_layer( - '{}_res'.format(prefix_name), kind='add', source=res_inputs, n_out=self.enc_value_dim) - return mhsa_res - - - def _create_mhsa(self, prefix, source): - ln = self.subnet_unit.add_layer_norm_layer('{}_att_ln'.format(prefix), source) - - att_query0 = self.subnet_unit.add_linear_layer( - '{}_att_query0'.format(prefix), ln, with_bias=False, n_out=self.enc_value_dim, - forward_weights_init=self.mhsa_init, l2=self.l2) - - # (B, H, D/H) - att_query = self.subnet_unit.add_split_dim_layer( - '{}_att_query'.format(prefix), att_query0, axis='F', dims=(self.enc_att_num_heads, self.enc_key_per_head_dim)) - - # --------------- Add to the encoder network --------------- # - att_key0 = self.base_model.network.add_linear_layer( - '{}_att_key0'.format(prefix), 'encoder', with_bias=False, n_out=self.enc_key_dim, - forward_weights_init=self.mhsa_init, l2=self.l2) - - # (B, enc-T, H, D/H) - att_key = 
self.base_model.network.add_split_dim_layer( - '{}_att_key'.format(prefix), att_key0, axis='F', dims=(self.enc_att_num_heads, self.enc_key_per_head_dim)) - - att_value0 = self.base_model.network.add_linear_layer( - '{}_att_value0'.format(prefix), 'encoder', with_bias=False, n_out=self.enc_value_dim, - forward_weights_init=self.mhsa_init, l2=self.l2) - - # (B, enc-T, H, D'/H) - att_value = self.base_model.network.add_split_dim_layer( - '{}_att_value'.format(prefix), att_value0, axis='F', dims=(self.enc_att_num_heads, self.enc_val_per_head_dim)) - # ----------------------------------------------------------- # - - # (B, H, enc-T, 1) - att_energy = self.subnet_unit.add_dot_layer( - '{}_att_energy'.format(prefix), source=['base:' + att_key, att_query], red1=-1, red2=-1, var1='T', var2='T?') - - att_weights = self.subnet_unit.add_softmax_over_spatial_layer( - '{}_att_weights'.format(prefix), att_energy, energy_factor=self.enc_key_per_head_dim ** -0.5) - - att_weights_drop = self.subnet_unit.add_dropout_layer( - '{}_att_weights_drop'.format(prefix), att_weights, dropout=self.att_dropout, dropout_noise_shape={"*": None}) - - # (B, H, V) - att0 = self.subnet_unit.add_generic_att_layer( - '{}_att0'.format(prefix), weights=att_weights_drop, base='base:' + att_value) - - att = self.subnet_unit.add_merge_dims_layer('{}_att'.format(prefix), att0, axes='static') # (B, H*V) except_batch - - # output projection - att_linear = self.subnet_unit.add_linear_layer( - '{}_att_linear'.format(prefix), att, with_bias=False, n_out=self.enc_value_dim, - forward_weights_init=self.mhsa_out_init, l2=self.l2) - - att_drop = self.subnet_unit.add_dropout_layer('{}_att_drop'.format(prefix), att_linear, dropout=self.dropout) - - out = self.subnet_unit.add_combine_layer( - '{}_att_out'.format(prefix), [att_drop, source], kind='add', n_out=self.enc_value_dim) - return out - - def _create_convolution_module(self, prefix_name, source): """ - Add Convolution Module: - LN + point-wise-conv + GLU + 
depth-wise-conv + Swish + point-wise-conv + Dropout - Note that BN is disabled here because it uses full sequence. - - :param str prefix_name: some prefix name - :param str source: name of source layer - :return: last layer name of this module - :rtype: str + Represents Conformer Decoder with causal convolution modules and masked self-attention """ - prefix_name = '{}_conv_mod'.format(prefix_name) - - ln = self.subnet_unit.add_layer_norm_layer('{}_ln'.format(prefix_name), source) - - pointwise_conv1 = self.subnet_unit.add_linear_layer( - '{}_pointwise_conv1'.format(prefix_name), ln, n_out=2 * self.enc_key_dim, activation=None, l2=self.l2, - with_bias=self.ff_bias, forward_weights_init=self.conv_module_init) - - glu_act = self.subnet_unit.add_gating_layer('{}_glu'.format(prefix_name), pointwise_conv1) - - # Pad to make causal conv - # TODO: This currently does not work inside a recurrent subnetwork. Need to be fixed. - depthwise_conv_input_padded = self.subnet_unit.add_pad_layer( - '{}_depthwise_conv_input_padded'.format(prefix_name), - glu_act, axes='T', padding=(self.conv_kernel_size - 1, 0) - ) - - depthwise_conv = self.subnet_unit.add_conv_layer( - '{}_depthwise_conv2'.format(prefix_name), depthwise_conv_input_padded, n_out=self.enc_key_dim, - filter_size=(self.conv_kernel_size,), groups=self.enc_key_dim, l2=self.l2, - forward_weights_init=self.conv_module_init, padding='valid') - - swish_act = self.subnet_unit.add_activation_layer( - '{}_swish'.format(prefix_name), depthwise_conv, activation='swish') - - pointwise_conv2 = self.subnet_unit.add_linear_layer( - '{}_pointwise_conv2'.format(prefix_name), swish_act, n_out=self.enc_key_dim, activation=None, l2=self.l2, - with_bias=self.ff_bias, forward_weights_init=self.conv_module_init) - - drop = self.subnet_unit.add_dropout_layer('{}_drop'.format(prefix_name), pointwise_conv2, dropout=self.dropout) - - res_inputs = [drop, source] - - res = self.subnet_unit.add_combine_layer( - '{}_res'.format(prefix_name), 
kind='add', source=res_inputs, n_out=self.enc_key_dim) - return res - - def _create_ff_module(self, prefix_name, i, source): - """ - Add Feed Forward Module: - LN -> FFN -> Swish -> Dropout -> FFN -> Dropout - - :param str prefix_name: some prefix name - :param int i: FF module index - :param str source: name of source layer - :return: last layer name of this module - :rtype: str - """ - prefix_name = prefix_name + '_ffmod_{}'.format(i) - - ln = self.subnet_unit.add_layer_norm_layer('{}_ln'.format(prefix_name), source) - - ff1 = self.subnet_unit.add_linear_layer( - '{}_ff1'.format(prefix_name), ln, n_out=self.ff_dim, l2=self.l2, forward_weights_init=self.ff_init, - with_bias=self.ff_bias) - - if self.use_sqrd_relu: - swish_act = self.subnet_unit.add_activation_layer('{}_relu'.format(prefix_name), ff1, activation='relu') - swish_act = self.subnet_unit.add_eval_layer('{}_square_relu'.format(prefix_name), swish_act, eval='source(0) ** 2') - else: - swish_act = self.subnet_unit.add_activation_layer('{}_swish'.format(prefix_name), ff1, activation=self.activation) - - drop1 = self.subnet_unit.add_dropout_layer('{}_drop1'.format(prefix_name), swish_act, dropout=self.dropout) - - ff2 = self.subnet_unit.add_linear_layer( - '{}_ff2'.format(prefix_name), drop1, n_out=self.enc_key_dim, l2=self.l2, forward_weights_init=self.ff_init, - with_bias=self.ff_bias) - - drop2 = self.subnet_unit.add_dropout_layer('{}_drop2'.format(prefix_name), ff2, dropout=self.dropout) - - if self.half_step: - drop2 = self.subnet_unit.add_eval_layer('{}_half_step'.format(prefix_name), drop2, eval='0.5 * source(0)') - - res_inputs = [drop2, source] - - ff_module_res = self.subnet_unit.add_combine_layer( - '{}_res'.format(prefix_name), kind='add', source=res_inputs, n_out=self.enc_key_dim) - - return ff_module_res - - def _create_decoder_block(self, source, i): - """FF + Masked-MHSA + Causal-Conv + Cross-MHA + FF""" - - prefix = 'conformer_decoder_%02i' % i - - ff1 = self._create_ff_module(prefix, 1, 
source) - masked_mhsa = self._create_masked_mhsa(prefix, ff1) - conv_module = self._create_convolution_module(prefix, masked_mhsa) - - if self.replace_cross_att_w_masked_self_att: - mhsa = self._create_masked_mhsa('ilm_' + prefix, conv_module, **self.ilm_args) - else: - mhsa = self._create_mhsa(prefix, conv_module) - - ff2 = self._create_ff_module(prefix, 2, mhsa) - ff2_norm = self.subnet_unit.add_layer_norm_layer('{}_ln'.format(prefix), ff2) - out = self.subnet_unit.add_copy_layer(prefix, ff2_norm) - return out - - def _create_ilm_decoder_block(self, source, i): - prefix = 'conformer_decoder_%02i' % i - - ff1 = self._create_ff_module('prior_' + prefix, 1, source) - masked_mhsa = self._create_masked_mhsa('prior_' + prefix, ff1) - conv_module = self._create_convolution_module('prior_' + prefix, masked_mhsa) - - if self.ilm_type == 'mini_lstm': - mhsa = self._create_masked_mhsa('mini_ilm_' + prefix, conv_module, **self.ilm_args) - else: - assert self.ilm_type == 'zero' - mhsa = self.subnet_unit.add_eval_layer('zero_att_%02i' % i, conv_module, eval='tf.zeros_like(source(0))') - - ff2 = self._create_ff_module('prior_' + prefix, 2, mhsa) - ff2_norm = self.subnet_unit.add_layer_norm_layer('{}_ln'.format('prior_' + prefix), ff2) - out = self.subnet_unit.add_copy_layer('prior_' + prefix, ff2_norm) - return out - - def _create_decoder(self): - - self.output_prob = self.subnet_unit.add_softmax_layer( - 'output_prob', 'decoder', loss='ce', - loss_opts={'label_smoothing': self.label_smoothing}, target=self.target, dropout=self.softmax_dropout, - forward_weights_init=self.ff_init, l2=self.l2) - - if self.length_normalization: - output = self.subnet_unit.add_choice_layer( - 'output', self.output_prob, target=self.target, beam_size=self.beam_size, initial_output=0) - else: - output = self.subnet_unit.add_choice_layer( - 'output', self.output_prob, target=self.target, beam_size=self.beam_size, initial_output=0, - length_normalization=self.length_normalization) - - 
self.subnet_unit.add_compare_layer('end', output, value=0) - - target_embed_raw = self.subnet_unit.add_linear_layer( - 'target_embed_raw', 'prev:' + output, with_bias=False, n_out=self.enc_value_dim, - forward_weights_init=self.ff_init, l2=self.l2) - - if self.embed_weight: - target_embed_raw = self.subnet_unit.add_eval_layer( - 'target_embed_weighted', target_embed_raw, eval='source(0) * %f' % self.embed_weight) - - target_embed = self.subnet_unit.add_dropout_layer( - 'target_embed', target_embed_raw, dropout=self.embed_dropout, dropout_noise_shape={"*": None}) - - x = target_embed - for i in range(1, self.num_layers + 1): - x = self._create_decoder_block(x, i) - self.subnet_unit.add_copy_layer('decoder', x) - - if self.create_ilm_decoder: - x = target_embed - for i in range(1, self.num_layers + 1): - x = self._create_ilm_decoder_block( x, i) - self.subnet_unit.add_copy_layer('prior_decoder', x) - - self.subnet_unit.add_softmax_layer( - 'prior_output_prob', 'prior_decoder', loss='ce', - loss_opts={'label_smoothing': self.label_smoothing}, target=self.target, dropout=self.softmax_dropout, - forward_weights_init=self.ff_init, l2=self.l2 - ) - - dec_output = self.network.add_subnet_rec_layer('output', unit=self.subnet_unit.get_net(), target=self.target) - - return dec_output - - def create_network(self): - dec_output = self._create_decoder() - - # recurrent subnetwork - decision_layer_name = self.base_model.network.add_decide_layer('decision', dec_output, target=self.target) - self.decision_layer_name = decision_layer_name - return dec_output + def __init__( + self, + base_model, + target="bpe", + num_layers=6, + beam_size=12, + ff_init=None, + ff_dim=2048, + ff_bias=True, + activation="swish", + use_sqrd_relu=False, + conv_kernel_size=32, + conv_module_init=None, + att_num_heads=8, + dropout=0.1, + att_dropout=0.1, + softmax_dropout=0.0, + embed_dropout=0.1, + l2=0.0, + self_att_l2=0.0, + apply_embed_weight=False, + label_smoothing=0.1, + mhsa_init=None, + 
half_step=True, + mhsa_out_init=None, + pos_enc=None, + rel_pos_clipping=16, + length_normalization=True, + replace_cross_att_w_masked_self_att=False, + create_ilm_decoder=False, + ilm_type=None, + ilm_args=None, + ): + self.base_model = base_model + self.enc_value_dim = base_model.enc_value_dim + self.enc_key_dim = base_model.enc_key_dim + self.enc_att_num_heads = base_model.att_num_heads + self.enc_key_per_head_dim = base_model.enc_key_per_head_dim + self.enc_val_per_head_dim = base_model.enc_val_per_head_dim + + self.att_num_heads = att_num_heads + + self.target = target + self.num_layers = num_layers + self.beam_size = beam_size + + self.ff_init = ff_init + self.ff_dim = ff_dim + self.ff_bias = ff_bias + + self.conv_kernel_size = conv_kernel_size + self.conv_module_init = conv_module_init + + self.activation = activation + self.use_sqrd_relu = use_sqrd_relu + + self.mhsa_init = mhsa_init + self.mhsa_out_init = mhsa_out_init + + self.pos_enc = pos_enc + self.rel_pos_clipping = rel_pos_clipping + self.half_step = half_step + + self.dropout = dropout + self.softmax_dropout = softmax_dropout + self.att_dropout = att_dropout + self.label_smoothing = label_smoothing + + self.l2 = l2 + self.self_att_l2 = self_att_l2 + + self.embed_dropout = embed_dropout + self.embed_weight = None + + if apply_embed_weight: + self.embed_weight = self.enc_value_dim**0.5 + + self.decision_layer_name = None + self.length_normalization = length_normalization + + self.replace_cross_att_w_masked_self_att = replace_cross_att_w_masked_self_att # used to train ILM + + # used for recognition with ILM + self.create_ilm_decoder = create_ilm_decoder + self.ilm_type = ilm_type + self.ilm_args = ilm_args or {} + if self.create_ilm_decoder: + self.replace_cross_att_w_masked_self_att = False # keep original decoder as-is + + self.network = ReturnnNetwork() + self.subnet_unit = ReturnnNetwork() + self.output_prob = None + + def _create_masked_mhsa(self, prefix, source, **kwargs): + prefix_name = 
"{}_self_att".format(prefix) + + # for tuning mini-self-att ILM + att_num_heads = kwargs.get("att_num_heads", self.att_num_heads) + enc_key_dim = kwargs.get("enc_key_dim", self.enc_key_dim) + enc_key_per_head_dim = enc_key_dim // att_num_heads + + ln = self.subnet_unit.add_layer_norm_layer("{}_ln".format(prefix_name), source) + ln_rel_pos_enc = None + + if self.pos_enc == "rel": + ln_rel_pos_enc = self.subnet_unit.add_relative_pos_encoding_layer( + "{}_ln_rel_pos_enc".format(prefix_name), + ln, + n_out=enc_key_per_head_dim, + forward_weights_init=self.ff_init, + clipping=self.rel_pos_clipping, + ) + + mhsa = self.subnet_unit.add_self_att_layer( + "{}".format(prefix_name), + ln, + n_out=self.enc_value_dim, + num_heads=att_num_heads, + attention_left_only=True, + total_key_dim=enc_key_dim, + att_dropout=self.att_dropout, + forward_weights_init=self.mhsa_init, + key_shift=ln_rel_pos_enc if ln_rel_pos_enc is not None else None, + l2=self.self_att_l2, + ) + + mhsa_linear = self.subnet_unit.add_linear_layer( + "{}_linear".format(prefix_name), + mhsa, + n_out=enc_key_dim, + l2=self.l2, + forward_weights_init=self.mhsa_out_init, + with_bias=False, + ) + + drop = self.subnet_unit.add_dropout_layer("{}_dropout".format(prefix_name), mhsa_linear, dropout=self.dropout) + + res_inputs = [drop, source] + + mhsa_res = self.subnet_unit.add_combine_layer( + "{}_res".format(prefix_name), kind="add", source=res_inputs, n_out=self.enc_value_dim + ) + return mhsa_res + + def _create_mhsa(self, prefix, source): + ln = self.subnet_unit.add_layer_norm_layer("{}_att_ln".format(prefix), source) + + att_query0 = self.subnet_unit.add_linear_layer( + "{}_att_query0".format(prefix), + ln, + with_bias=False, + n_out=self.enc_value_dim, + forward_weights_init=self.mhsa_init, + l2=self.l2, + ) + + # (B, H, D/H) + att_query = self.subnet_unit.add_split_dim_layer( + "{}_att_query".format(prefix), + att_query0, + axis="F", + dims=(self.enc_att_num_heads, self.enc_key_per_head_dim), + ) + + # 
--------------- Add to the encoder network --------------- # + att_key0 = self.base_model.network.add_linear_layer( + "{}_att_key0".format(prefix), + "encoder", + with_bias=False, + n_out=self.enc_key_dim, + forward_weights_init=self.mhsa_init, + l2=self.l2, + ) + + # (B, enc-T, H, D/H) + att_key = self.base_model.network.add_split_dim_layer( + "{}_att_key".format(prefix), att_key0, axis="F", dims=(self.enc_att_num_heads, self.enc_key_per_head_dim) + ) + + att_value0 = self.base_model.network.add_linear_layer( + "{}_att_value0".format(prefix), + "encoder", + with_bias=False, + n_out=self.enc_value_dim, + forward_weights_init=self.mhsa_init, + l2=self.l2, + ) + + # (B, enc-T, H, D'/H) + att_value = self.base_model.network.add_split_dim_layer( + "{}_att_value".format(prefix), + att_value0, + axis="F", + dims=(self.enc_att_num_heads, self.enc_val_per_head_dim), + ) + # ----------------------------------------------------------- # + + # (B, H, enc-T, 1) + att_energy = self.subnet_unit.add_dot_layer( + "{}_att_energy".format(prefix), source=["base:" + att_key, att_query], red1=-1, red2=-1, var1="T", var2="T?" 
+ ) + + att_weights = self.subnet_unit.add_softmax_over_spatial_layer( + "{}_att_weights".format(prefix), att_energy, energy_factor=self.enc_key_per_head_dim**-0.5 + ) + + att_weights_drop = self.subnet_unit.add_dropout_layer( + "{}_att_weights_drop".format(prefix), att_weights, dropout=self.att_dropout, dropout_noise_shape={"*": None} + ) + + # (B, H, V) + att0 = self.subnet_unit.add_generic_att_layer( + "{}_att0".format(prefix), weights=att_weights_drop, base="base:" + att_value + ) + + att = self.subnet_unit.add_merge_dims_layer( + "{}_att".format(prefix), att0, axes="static" + ) # (B, H*V) except_batch + + # output projection + att_linear = self.subnet_unit.add_linear_layer( + "{}_att_linear".format(prefix), + att, + with_bias=False, + n_out=self.enc_value_dim, + forward_weights_init=self.mhsa_out_init, + l2=self.l2, + ) + + att_drop = self.subnet_unit.add_dropout_layer("{}_att_drop".format(prefix), att_linear, dropout=self.dropout) + + out = self.subnet_unit.add_combine_layer( + "{}_att_out".format(prefix), [att_drop, source], kind="add", n_out=self.enc_value_dim + ) + return out + + def _create_convolution_module(self, prefix_name, source): + """ + Add Convolution Module: + LN + point-wise-conv + GLU + depth-wise-conv + Swish + point-wise-conv + Dropout + Note that BN is disabled here because it uses full sequence. 
+ + :param str prefix_name: some prefix name + :param str source: name of source layer + :return: last layer name of this module + :rtype: str + """ + prefix_name = "{}_conv_mod".format(prefix_name) + + ln = self.subnet_unit.add_layer_norm_layer("{}_ln".format(prefix_name), source) + + pointwise_conv1 = self.subnet_unit.add_linear_layer( + "{}_pointwise_conv1".format(prefix_name), + ln, + n_out=2 * self.enc_key_dim, + activation=None, + l2=self.l2, + with_bias=self.ff_bias, + forward_weights_init=self.conv_module_init, + ) + + glu_act = self.subnet_unit.add_gating_layer("{}_glu".format(prefix_name), pointwise_conv1) + + # Pad to make causal conv + # TODO: This currently does not work inside a recurrent subnetwork. Need to be fixed. + depthwise_conv_input_padded = self.subnet_unit.add_pad_layer( + "{}_depthwise_conv_input_padded".format(prefix_name), + glu_act, + axes="T", + padding=(self.conv_kernel_size - 1, 0), + ) + + depthwise_conv = self.subnet_unit.add_conv_layer( + "{}_depthwise_conv2".format(prefix_name), + depthwise_conv_input_padded, + n_out=self.enc_key_dim, + filter_size=(self.conv_kernel_size,), + groups=self.enc_key_dim, + l2=self.l2, + forward_weights_init=self.conv_module_init, + padding="valid", + ) + + swish_act = self.subnet_unit.add_activation_layer( + "{}_swish".format(prefix_name), depthwise_conv, activation="swish" + ) + + pointwise_conv2 = self.subnet_unit.add_linear_layer( + "{}_pointwise_conv2".format(prefix_name), + swish_act, + n_out=self.enc_key_dim, + activation=None, + l2=self.l2, + with_bias=self.ff_bias, + forward_weights_init=self.conv_module_init, + ) + + drop = self.subnet_unit.add_dropout_layer("{}_drop".format(prefix_name), pointwise_conv2, dropout=self.dropout) + + res_inputs = [drop, source] + + res = self.subnet_unit.add_combine_layer( + "{}_res".format(prefix_name), kind="add", source=res_inputs, n_out=self.enc_key_dim + ) + return res + + def _create_ff_module(self, prefix_name, i, source): + """ + Add Feed Forward Module: 
+ LN -> FFN -> Swish -> Dropout -> FFN -> Dropout + + :param str prefix_name: some prefix name + :param int i: FF module index + :param str source: name of source layer + :return: last layer name of this module + :rtype: str + """ + prefix_name = prefix_name + "_ffmod_{}".format(i) + + ln = self.subnet_unit.add_layer_norm_layer("{}_ln".format(prefix_name), source) + + ff1 = self.subnet_unit.add_linear_layer( + "{}_ff1".format(prefix_name), + ln, + n_out=self.ff_dim, + l2=self.l2, + forward_weights_init=self.ff_init, + with_bias=self.ff_bias, + ) + + if self.use_sqrd_relu: + swish_act = self.subnet_unit.add_activation_layer("{}_relu".format(prefix_name), ff1, activation="relu") + swish_act = self.subnet_unit.add_eval_layer( + "{}_square_relu".format(prefix_name), swish_act, eval="source(0) ** 2" + ) + else: + swish_act = self.subnet_unit.add_activation_layer( + "{}_swish".format(prefix_name), ff1, activation=self.activation + ) + + drop1 = self.subnet_unit.add_dropout_layer("{}_drop1".format(prefix_name), swish_act, dropout=self.dropout) + + ff2 = self.subnet_unit.add_linear_layer( + "{}_ff2".format(prefix_name), + drop1, + n_out=self.enc_key_dim, + l2=self.l2, + forward_weights_init=self.ff_init, + with_bias=self.ff_bias, + ) + + drop2 = self.subnet_unit.add_dropout_layer("{}_drop2".format(prefix_name), ff2, dropout=self.dropout) + + if self.half_step: + drop2 = self.subnet_unit.add_eval_layer("{}_half_step".format(prefix_name), drop2, eval="0.5 * source(0)") + + res_inputs = [drop2, source] + + ff_module_res = self.subnet_unit.add_combine_layer( + "{}_res".format(prefix_name), kind="add", source=res_inputs, n_out=self.enc_key_dim + ) + + return ff_module_res + + def _create_decoder_block(self, source, i): + """FF + Masked-MHSA + Causal-Conv + Cross-MHA + FF""" + + prefix = "conformer_decoder_%02i" % i + + ff1 = self._create_ff_module(prefix, 1, source) + masked_mhsa = self._create_masked_mhsa(prefix, ff1) + conv_module = self._create_convolution_module(prefix, 
masked_mhsa) + + if self.replace_cross_att_w_masked_self_att: + mhsa = self._create_masked_mhsa("ilm_" + prefix, conv_module, **self.ilm_args) + else: + mhsa = self._create_mhsa(prefix, conv_module) + + ff2 = self._create_ff_module(prefix, 2, mhsa) + ff2_norm = self.subnet_unit.add_layer_norm_layer("{}_ln".format(prefix), ff2) + out = self.subnet_unit.add_copy_layer(prefix, ff2_norm) + return out + + def _create_ilm_decoder_block(self, source, i): + prefix = "conformer_decoder_%02i" % i + + ff1 = self._create_ff_module("prior_" + prefix, 1, source) + masked_mhsa = self._create_masked_mhsa("prior_" + prefix, ff1) + conv_module = self._create_convolution_module("prior_" + prefix, masked_mhsa) + + if self.ilm_type == "mini_lstm": + mhsa = self._create_masked_mhsa("mini_ilm_" + prefix, conv_module, **self.ilm_args) + else: + assert self.ilm_type == "zero" + mhsa = self.subnet_unit.add_eval_layer("zero_att_%02i" % i, conv_module, eval="tf.zeros_like(source(0))") + + ff2 = self._create_ff_module("prior_" + prefix, 2, mhsa) + ff2_norm = self.subnet_unit.add_layer_norm_layer("{}_ln".format("prior_" + prefix), ff2) + out = self.subnet_unit.add_copy_layer("prior_" + prefix, ff2_norm) + return out + + def _create_decoder(self): + self.output_prob = self.subnet_unit.add_softmax_layer( + "output_prob", + "decoder", + loss="ce", + loss_opts={"label_smoothing": self.label_smoothing}, + target=self.target, + dropout=self.softmax_dropout, + forward_weights_init=self.ff_init, + l2=self.l2, + ) + + if self.length_normalization: + output = self.subnet_unit.add_choice_layer( + "output", self.output_prob, target=self.target, beam_size=self.beam_size, initial_output=0 + ) + else: + output = self.subnet_unit.add_choice_layer( + "output", + self.output_prob, + target=self.target, + beam_size=self.beam_size, + initial_output=0, + length_normalization=self.length_normalization, + ) + + self.subnet_unit.add_compare_layer("end", output, value=0) + + target_embed_raw = 
self.subnet_unit.add_linear_layer( + "target_embed_raw", + "prev:" + output, + with_bias=False, + n_out=self.enc_value_dim, + forward_weights_init=self.ff_init, + l2=self.l2, + ) + + if self.embed_weight: + target_embed_raw = self.subnet_unit.add_eval_layer( + "target_embed_weighted", target_embed_raw, eval="source(0) * %f" % self.embed_weight + ) + + target_embed = self.subnet_unit.add_dropout_layer( + "target_embed", target_embed_raw, dropout=self.embed_dropout, dropout_noise_shape={"*": None} + ) + + x = target_embed + for i in range(1, self.num_layers + 1): + x = self._create_decoder_block(x, i) + self.subnet_unit.add_copy_layer("decoder", x) + + if self.create_ilm_decoder: + x = target_embed + for i in range(1, self.num_layers + 1): + x = self._create_ilm_decoder_block(x, i) + self.subnet_unit.add_copy_layer("prior_decoder", x) + + self.subnet_unit.add_softmax_layer( + "prior_output_prob", + "prior_decoder", + loss="ce", + loss_opts={"label_smoothing": self.label_smoothing}, + target=self.target, + dropout=self.softmax_dropout, + forward_weights_init=self.ff_init, + l2=self.l2, + ) + + dec_output = self.network.add_subnet_rec_layer("output", unit=self.subnet_unit.get_net(), target=self.target) + + return dec_output + + def create_network(self): + dec_output = self._create_decoder() + + # recurrent subnetwork + decision_layer_name = self.base_model.network.add_decide_layer("decision", dec_output, target=self.target) + self.decision_layer_name = decision_layer_name + + return dec_output diff --git a/users/zeineldeen/models/asr/decoder/transformer_decoder.py b/users/zeineldeen/models/asr/decoder/transformer_decoder.py index 42a74331f..da4dc7850 100644 --- a/users/zeineldeen/models/asr/decoder/transformer_decoder.py +++ b/users/zeineldeen/models/asr/decoder/transformer_decoder.py @@ -2,259 +2,371 @@ class TransformerDecoder: - """ - Represents standard Transformer decoder - - * Attention Is All You Need - * Ref: https://arxiv.org/abs/1706.03762 - """ - - def 
__init__(self, - base_model, target='bpe', num_layers=6, beam_size=12, ff_init=None, ff_dim=2048, ff_act='relu', att_num_heads=8, - dropout=0.1, att_dropout=0.0, softmax_dropout=0.0, embed_dropout=0.1, l2=0.0, embed_pos_enc=False, - apply_embed_weight=False, label_smoothing=0.1, mhsa_init=None, mhsa_out_init=None, - pos_enc=None, rel_pos_clipping=16, length_normalization=True, - replace_cross_att_w_masked_self_att=False, create_ilm_decoder=False, ilm_type=None, ilm_args=None): - - self.base_model = base_model - self.enc_value_dim = base_model.enc_value_dim - self.enc_key_dim = base_model.enc_key_dim - self.enc_att_num_heads = base_model.att_num_heads - self.enc_key_per_head_dim = base_model.enc_key_per_head_dim - self.enc_val_per_head_dim = base_model.enc_val_per_head_dim - - self.att_num_heads = att_num_heads - - self.target = target - self.num_layers = num_layers - self.beam_size = beam_size - - self.ff_init = ff_init - self.ff_dim = ff_dim - self.ff_act = ff_act - - self.mhsa_init = mhsa_init - self.mhsa_init_out = mhsa_out_init - - self.pos_enc = pos_enc - self.rel_pos_clipping = rel_pos_clipping - - self.dropout = dropout - self.softmax_dropout = softmax_dropout - self.att_dropout = att_dropout - self.label_smoothing = label_smoothing - - self.l2 = l2 - - self.embed_dropout = embed_dropout - self.embed_pos_enc = embed_pos_enc - - self.embed_weight = None - - if apply_embed_weight: - self.embed_weight = self.enc_value_dim ** 0.5 - - self.decision_layer_name = None - - self.length_normalization = length_normalization - - self.replace_cross_att_w_masked_self_att = replace_cross_att_w_masked_self_att # used to train ILM - - # used for recognition with ILM - self.create_ilm_decoder = create_ilm_decoder - self.ilm_type = ilm_type - self.ilm_args = ilm_args or {} - if self.create_ilm_decoder: - self.replace_cross_att_w_masked_self_att = False # keep original decoder as-is - - self.network = ReturnnNetwork() - self.subnet_unit = ReturnnNetwork() - self.output_prob = 
None - - def _create_masked_mhsa(self, subnet_unit: ReturnnNetwork, prefix, source, **kwargs): - prefix = '{}_self_att'.format(prefix) - - # for tuning mini-self-att ILM - att_num_heads = kwargs.get('att_num_heads', self.att_num_heads) - enc_key_dim = kwargs.get('enc_key_dim', self.enc_key_dim) - enc_key_per_head_dim = enc_key_dim // att_num_heads - - ln = subnet_unit.add_layer_norm_layer('{}_ln'.format(prefix), source) - - ln_rel_pos_enc = None - if self.pos_enc == 'rel': - ln_rel_pos_enc = self.subnet_unit.add_relative_pos_encoding_layer( - '{}_ln_rel_pos_enc'.format(prefix), ln, n_out=enc_key_per_head_dim, forward_weights_init=self.ff_init, - clipping=self.rel_pos_clipping) - - att = subnet_unit.add_self_att_layer( - '{}_att'.format(prefix), ln, num_heads=att_num_heads, total_key_dim=enc_key_dim, - n_out=self.enc_value_dim, attention_left_only=True, att_dropout=self.att_dropout, - forward_weights_init=self.mhsa_init, l2=self.l2, key_shift=ln_rel_pos_enc if ln_rel_pos_enc is not None else None) - - linear = subnet_unit.add_linear_layer( - '{}_linear'.format(prefix), att, activation=None, with_bias=False, n_out=self.enc_value_dim, - forward_weights_init=self.mhsa_init_out, l2=self.l2) - - drop = subnet_unit.add_dropout_layer('{}_drop'.format(prefix), linear, dropout=self.dropout) - - out = subnet_unit.add_combine_layer('{}_out'.format(prefix), [drop, source], kind='add', n_out=self.enc_value_dim) - - return out - - def _create_mhsa(self, subnet_unit: ReturnnNetwork, prefix, source): - ln = subnet_unit.add_layer_norm_layer('{}_att_ln'.format(prefix), source) - - att_query0 = subnet_unit.add_linear_layer( - '{}_att_query0'.format(prefix), ln, with_bias=False, n_out=self.enc_value_dim, - forward_weights_init=self.mhsa_init, l2=self.l2) - - # (B, H, D/H) - att_query = subnet_unit.add_split_dim_layer( - '{}_att_query'.format(prefix), att_query0, axis='F', dims=(self.enc_att_num_heads, self.enc_key_per_head_dim)) - - # --------------- Add to the encoder network 
--------------- # - att_key0 = self.base_model.network.add_linear_layer( - '{}_att_key0'.format(prefix), 'encoder', with_bias=False, n_out=self.enc_key_dim, - forward_weights_init=self.mhsa_init, l2=self.l2) - - # (B, enc-T, H, D/H) - att_key = self.base_model.network.add_split_dim_layer( - '{}_att_key'.format(prefix), att_key0, axis='F', dims=(self.enc_att_num_heads, self.enc_key_per_head_dim)) - - att_value0 = self.base_model.network.add_linear_layer( - '{}_att_value0'.format(prefix), 'encoder', with_bias=False, n_out=self.enc_value_dim, - forward_weights_init=self.mhsa_init, l2=self.l2) - - # (B, enc-T, H, D'/H) - att_value = self.base_model.network.add_split_dim_layer( - '{}_att_value'.format(prefix), att_value0, axis='F', dims=(self.enc_att_num_heads, self.enc_val_per_head_dim)) - # ----------------------------------------------------------- # - - # (B, H, enc-T, 1) - att_energy = subnet_unit.add_dot_layer( - '{}_att_energy'.format(prefix), source=['base:' + att_key, att_query], red1=-1, red2=-1, var1='T', var2='T?') - - att_weights = subnet_unit.add_softmax_over_spatial_layer( - '{}_att_weights'.format(prefix), att_energy, energy_factor=self.enc_key_per_head_dim ** -0.5) - - att_weights_drop = subnet_unit.add_dropout_layer( - '{}_att_weights_drop'.format(prefix), att_weights, dropout=self.att_dropout, dropout_noise_shape={"*": None}) - - # (B, H, V) - att0 = subnet_unit.add_generic_att_layer( - '{}_att0'.format(prefix), weights=att_weights_drop, base='base:' + att_value) - - att = subnet_unit.add_merge_dims_layer('{}_att'.format(prefix), att0, axes='static') # (B, H*V) except_batch - - # output projection - att_linear = subnet_unit.add_linear_layer( - '{}_att_linear'.format(prefix), att, with_bias=False, n_out=self.enc_value_dim, - forward_weights_init=self.mhsa_init_out, l2=self.l2) - - att_drop = subnet_unit.add_dropout_layer('{}_att_drop'.format(prefix), att_linear, dropout=self.dropout) - - out = subnet_unit.add_combine_layer( - 
'{}_att_out'.format(prefix), [att_drop, source], kind='add', n_out=self.enc_value_dim) - return out - - def _create_ff_module(self, subnet_unit: ReturnnNetwork, prefix, source): - ff_ln = subnet_unit.add_layer_norm_layer('{}_ff_ln'.format(prefix), source) - - ff1 = subnet_unit.add_linear_layer( - '{}_ff_conv1'.format(prefix), ff_ln, activation=self.ff_act, forward_weights_init=self.ff_init, n_out=self.ff_dim, - with_bias=True, l2=self.l2) - - ff2 = subnet_unit.add_linear_layer( - '{}_ff_conv2'.format(prefix), ff1, activation=None, forward_weights_init=self.ff_init, n_out=self.enc_value_dim, - dropout=self.dropout, with_bias=True, l2=self.l2) - - drop = subnet_unit.add_dropout_layer('{}_ff_drop'.format(prefix), ff2, dropout=self.dropout) - - out = subnet_unit.add_combine_layer( - '{}_ff_out'.format(prefix), [drop, source], kind='add', n_out=self.enc_value_dim) - return out - - def _create_decoder_block(self, subnet_unit: ReturnnNetwork, source, i): - prefix = 'transformer_decoder_%02i' % i - masked_mhsa = self._create_masked_mhsa(subnet_unit, prefix, source) - if self.replace_cross_att_w_masked_self_att: - mhsa = self._create_masked_mhsa(subnet_unit, 'ilm_' + prefix, masked_mhsa, **self.ilm_args) - else: - mhsa = self._create_mhsa(subnet_unit, prefix, masked_mhsa) - ff = self._create_ff_module(subnet_unit, prefix, mhsa) - out = subnet_unit.add_copy_layer(prefix, ff) - return out - - def _create_ilm_decoder_block(self, subnet_unit: ReturnnNetwork, source, i): - prefix = 'transformer_decoder_%02i' % i - masked_mhsa = self._create_masked_mhsa(subnet_unit, 'prior_' + prefix, source) - if self.ilm_type == 'mini_lstm': - mhsa = self._create_masked_mhsa(subnet_unit, 'mini_ilm_' + prefix, masked_mhsa, **self.ilm_args) - else: - assert self.ilm_type == 'zero' - mhsa = subnet_unit.add_eval_layer('zero_att_%02i' % i, masked_mhsa, eval='tf.zeros_like(source(0))') - ff = self._create_ff_module(subnet_unit, 'prior_' + prefix, mhsa) - out = subnet_unit.add_copy_layer('prior_' + 
prefix, ff) - return out - - def _create_decoder(self, subnet_unit: ReturnnNetwork): - - self.output_prob = subnet_unit.add_softmax_layer( - 'output_prob', 'decoder', loss='ce', - loss_opts={'label_smoothing': self.label_smoothing}, target=self.target, dropout=self.softmax_dropout, - forward_weights_init=self.ff_init, l2=self.l2) - - if self.length_normalization: - output = subnet_unit.add_choice_layer( - 'output', self.output_prob, target=self.target, beam_size=self.beam_size, initial_output=0) - else: - output = subnet_unit.add_choice_layer( - 'output', self.output_prob, target=self.target, beam_size=self.beam_size, initial_output=0, - length_normalization=self.length_normalization) - - subnet_unit.add_compare_layer('end', output, value=0) - - target_embed_raw = subnet_unit.add_linear_layer( - 'target_embed_raw', 'prev:' + output, with_bias=False, n_out=self.enc_value_dim, - forward_weights_init=self.ff_init, l2=self.l2) - - if self.embed_weight: - target_embed_raw = subnet_unit.add_eval_layer( - 'target_embed_weighted', target_embed_raw, eval='source(0) * %f' % self.embed_weight) - - if self.embed_pos_enc: - target_embed_raw = subnet_unit.add_pos_encoding_layer('target_embed_pos_enc', target_embed_raw) - - target_embed = subnet_unit.add_dropout_layer( - 'target_embed', target_embed_raw, dropout=self.embed_dropout, dropout_noise_shape={"*": None}) - - x = target_embed - for i in range(1, self.num_layers + 1): - x = self._create_decoder_block(subnet_unit, x, i) - subnet_unit.add_layer_norm_layer('decoder', x) - - if self.create_ilm_decoder: - x = target_embed - for i in range(1, self.num_layers + 1): - x = self._create_ilm_decoder_block(subnet_unit, x, i) - subnet_unit.add_layer_norm_layer('prior_decoder', x) - - subnet_unit.add_softmax_layer( - 'prior_output_prob', 'prior_decoder', loss='ce', - loss_opts={'label_smoothing': self.label_smoothing}, target=self.target, dropout=self.softmax_dropout, - forward_weights_init=self.ff_init, l2=self.l2 - ) - - dec_output = 
self.network.add_subnet_rec_layer('output', unit=subnet_unit.get_net(), target=self.target) - - return dec_output - - def create_network(self): - dec_output = self._create_decoder(self.subnet_unit) - - # recurrent subnetwork - decision_layer_name = self.base_model.network.add_decide_layer('decision', dec_output, target=self.target) - self.decision_layer_name = decision_layer_name - - return dec_output + """ + Represents standard Transformer decoder + + * Attention Is All You Need + * Ref: https://arxiv.org/abs/1706.03762 + """ + + def __init__( + self, + base_model, + target="bpe", + num_layers=6, + beam_size=12, + ff_init=None, + ff_dim=2048, + ff_act="relu", + att_num_heads=8, + dropout=0.1, + att_dropout=0.0, + softmax_dropout=0.0, + embed_dropout=0.1, + l2=0.0, + embed_pos_enc=False, + apply_embed_weight=False, + label_smoothing=0.1, + mhsa_init=None, + mhsa_out_init=None, + pos_enc=None, + rel_pos_clipping=16, + length_normalization=True, + replace_cross_att_w_masked_self_att=False, + create_ilm_decoder=False, + ilm_type=None, + ilm_args=None, + ): + self.base_model = base_model + self.enc_value_dim = base_model.enc_value_dim + self.enc_key_dim = base_model.enc_key_dim + self.enc_att_num_heads = base_model.att_num_heads + self.enc_key_per_head_dim = base_model.enc_key_per_head_dim + self.enc_val_per_head_dim = base_model.enc_val_per_head_dim + + self.att_num_heads = att_num_heads + + self.target = target + self.num_layers = num_layers + self.beam_size = beam_size + + self.ff_init = ff_init + self.ff_dim = ff_dim + self.ff_act = ff_act + + self.mhsa_init = mhsa_init + self.mhsa_init_out = mhsa_out_init + + self.pos_enc = pos_enc + self.rel_pos_clipping = rel_pos_clipping + + self.dropout = dropout + self.softmax_dropout = softmax_dropout + self.att_dropout = att_dropout + self.label_smoothing = label_smoothing + + self.l2 = l2 + + self.embed_dropout = embed_dropout + self.embed_pos_enc = embed_pos_enc + + self.embed_weight = None + + if apply_embed_weight: + 
self.embed_weight = self.enc_value_dim**0.5 + + self.decision_layer_name = None + + self.length_normalization = length_normalization + + self.replace_cross_att_w_masked_self_att = replace_cross_att_w_masked_self_att # used to train ILM + + # used for recognition with ILM + self.create_ilm_decoder = create_ilm_decoder + self.ilm_type = ilm_type + self.ilm_args = ilm_args or {} + if self.create_ilm_decoder: + self.replace_cross_att_w_masked_self_att = False # keep original decoder as-is + + self.network = ReturnnNetwork() + self.subnet_unit = ReturnnNetwork() + self.output_prob = None + + def _create_masked_mhsa(self, subnet_unit: ReturnnNetwork, prefix, source, **kwargs): + prefix = "{}_self_att".format(prefix) + + # for tuning mini-self-att ILM + att_num_heads = kwargs.get("att_num_heads", self.att_num_heads) + enc_key_dim = kwargs.get("enc_key_dim", self.enc_key_dim) + enc_key_per_head_dim = enc_key_dim // att_num_heads + + ln = subnet_unit.add_layer_norm_layer("{}_ln".format(prefix), source) + + ln_rel_pos_enc = None + if self.pos_enc == "rel": + ln_rel_pos_enc = self.subnet_unit.add_relative_pos_encoding_layer( + "{}_ln_rel_pos_enc".format(prefix), + ln, + n_out=enc_key_per_head_dim, + forward_weights_init=self.ff_init, + clipping=self.rel_pos_clipping, + ) + + att = subnet_unit.add_self_att_layer( + "{}_att".format(prefix), + ln, + num_heads=att_num_heads, + total_key_dim=enc_key_dim, + n_out=self.enc_value_dim, + attention_left_only=True, + att_dropout=self.att_dropout, + forward_weights_init=self.mhsa_init, + l2=self.l2, + key_shift=ln_rel_pos_enc if ln_rel_pos_enc is not None else None, + ) + + linear = subnet_unit.add_linear_layer( + "{}_linear".format(prefix), + att, + activation=None, + with_bias=False, + n_out=self.enc_value_dim, + forward_weights_init=self.mhsa_init_out, + l2=self.l2, + ) + + drop = subnet_unit.add_dropout_layer("{}_drop".format(prefix), linear, dropout=self.dropout) + + out = subnet_unit.add_combine_layer( + "{}_out".format(prefix), 
[drop, source], kind="add", n_out=self.enc_value_dim + ) + + return out + + def _create_mhsa(self, subnet_unit: ReturnnNetwork, prefix, source): + ln = subnet_unit.add_layer_norm_layer("{}_att_ln".format(prefix), source) + + att_query0 = subnet_unit.add_linear_layer( + "{}_att_query0".format(prefix), + ln, + with_bias=False, + n_out=self.enc_value_dim, + forward_weights_init=self.mhsa_init, + l2=self.l2, + ) + + # (B, H, D/H) + att_query = subnet_unit.add_split_dim_layer( + "{}_att_query".format(prefix), + att_query0, + axis="F", + dims=(self.enc_att_num_heads, self.enc_key_per_head_dim), + ) + + # --------------- Add to the encoder network --------------- # + att_key0 = self.base_model.network.add_linear_layer( + "{}_att_key0".format(prefix), + "encoder", + with_bias=False, + n_out=self.enc_key_dim, + forward_weights_init=self.mhsa_init, + l2=self.l2, + ) + + # (B, enc-T, H, D/H) + att_key = self.base_model.network.add_split_dim_layer( + "{}_att_key".format(prefix), att_key0, axis="F", dims=(self.enc_att_num_heads, self.enc_key_per_head_dim) + ) + + att_value0 = self.base_model.network.add_linear_layer( + "{}_att_value0".format(prefix), + "encoder", + with_bias=False, + n_out=self.enc_value_dim, + forward_weights_init=self.mhsa_init, + l2=self.l2, + ) + + # (B, enc-T, H, D'/H) + att_value = self.base_model.network.add_split_dim_layer( + "{}_att_value".format(prefix), + att_value0, + axis="F", + dims=(self.enc_att_num_heads, self.enc_val_per_head_dim), + ) + # ----------------------------------------------------------- # + + # (B, H, enc-T, 1) + att_energy = subnet_unit.add_dot_layer( + "{}_att_energy".format(prefix), source=["base:" + att_key, att_query], red1=-1, red2=-1, var1="T", var2="T?" 
+ ) + + att_weights = subnet_unit.add_softmax_over_spatial_layer( + "{}_att_weights".format(prefix), att_energy, energy_factor=self.enc_key_per_head_dim**-0.5 + ) + + att_weights_drop = subnet_unit.add_dropout_layer( + "{}_att_weights_drop".format(prefix), att_weights, dropout=self.att_dropout, dropout_noise_shape={"*": None} + ) + + # (B, H, V) + att0 = subnet_unit.add_generic_att_layer( + "{}_att0".format(prefix), weights=att_weights_drop, base="base:" + att_value + ) + + att = subnet_unit.add_merge_dims_layer("{}_att".format(prefix), att0, axes="static") # (B, H*V) except_batch + + # output projection + att_linear = subnet_unit.add_linear_layer( + "{}_att_linear".format(prefix), + att, + with_bias=False, + n_out=self.enc_value_dim, + forward_weights_init=self.mhsa_init_out, + l2=self.l2, + ) + + att_drop = subnet_unit.add_dropout_layer("{}_att_drop".format(prefix), att_linear, dropout=self.dropout) + + out = subnet_unit.add_combine_layer( + "{}_att_out".format(prefix), [att_drop, source], kind="add", n_out=self.enc_value_dim + ) + return out + + def _create_ff_module(self, subnet_unit: ReturnnNetwork, prefix, source): + ff_ln = subnet_unit.add_layer_norm_layer("{}_ff_ln".format(prefix), source) + + ff1 = subnet_unit.add_linear_layer( + "{}_ff_conv1".format(prefix), + ff_ln, + activation=self.ff_act, + forward_weights_init=self.ff_init, + n_out=self.ff_dim, + with_bias=True, + l2=self.l2, + ) + + ff2 = subnet_unit.add_linear_layer( + "{}_ff_conv2".format(prefix), + ff1, + activation=None, + forward_weights_init=self.ff_init, + n_out=self.enc_value_dim, + dropout=self.dropout, + with_bias=True, + l2=self.l2, + ) + + drop = subnet_unit.add_dropout_layer("{}_ff_drop".format(prefix), ff2, dropout=self.dropout) + + out = subnet_unit.add_combine_layer( + "{}_ff_out".format(prefix), [drop, source], kind="add", n_out=self.enc_value_dim + ) + return out + + def _create_decoder_block(self, subnet_unit: ReturnnNetwork, source, i): + prefix = "transformer_decoder_%02i" % i + 
masked_mhsa = self._create_masked_mhsa(subnet_unit, prefix, source) + if self.replace_cross_att_w_masked_self_att: + mhsa = self._create_masked_mhsa(subnet_unit, "ilm_" + prefix, masked_mhsa, **self.ilm_args) + else: + mhsa = self._create_mhsa(subnet_unit, prefix, masked_mhsa) + ff = self._create_ff_module(subnet_unit, prefix, mhsa) + out = subnet_unit.add_copy_layer(prefix, ff) + return out + + def _create_ilm_decoder_block(self, subnet_unit: ReturnnNetwork, source, i): + prefix = "transformer_decoder_%02i" % i + masked_mhsa = self._create_masked_mhsa(subnet_unit, "prior_" + prefix, source) + if self.ilm_type == "mini_lstm": + mhsa = self._create_masked_mhsa(subnet_unit, "mini_ilm_" + prefix, masked_mhsa, **self.ilm_args) + else: + assert self.ilm_type == "zero" + mhsa = subnet_unit.add_eval_layer("zero_att_%02i" % i, masked_mhsa, eval="tf.zeros_like(source(0))") + ff = self._create_ff_module(subnet_unit, "prior_" + prefix, mhsa) + out = subnet_unit.add_copy_layer("prior_" + prefix, ff) + return out + + def _create_decoder(self, subnet_unit: ReturnnNetwork): + self.output_prob = subnet_unit.add_softmax_layer( + "output_prob", + "decoder", + loss="ce", + loss_opts={"label_smoothing": self.label_smoothing}, + target=self.target, + dropout=self.softmax_dropout, + forward_weights_init=self.ff_init, + l2=self.l2, + ) + + if self.length_normalization: + output = subnet_unit.add_choice_layer( + "output", self.output_prob, target=self.target, beam_size=self.beam_size, initial_output=0 + ) + else: + output = subnet_unit.add_choice_layer( + "output", + self.output_prob, + target=self.target, + beam_size=self.beam_size, + initial_output=0, + length_normalization=self.length_normalization, + ) + + subnet_unit.add_compare_layer("end", output, value=0) + + target_embed_raw = subnet_unit.add_linear_layer( + "target_embed_raw", + "prev:" + output, + with_bias=False, + n_out=self.enc_value_dim, + forward_weights_init=self.ff_init, + l2=self.l2, + ) + + if self.embed_weight: + 
target_embed_raw = subnet_unit.add_eval_layer( + "target_embed_weighted", target_embed_raw, eval="source(0) * %f" % self.embed_weight + ) + + if self.embed_pos_enc: + target_embed_raw = subnet_unit.add_pos_encoding_layer("target_embed_pos_enc", target_embed_raw) + + target_embed = subnet_unit.add_dropout_layer( + "target_embed", target_embed_raw, dropout=self.embed_dropout, dropout_noise_shape={"*": None} + ) + + x = target_embed + for i in range(1, self.num_layers + 1): + x = self._create_decoder_block(subnet_unit, x, i) + subnet_unit.add_layer_norm_layer("decoder", x) + + if self.create_ilm_decoder: + x = target_embed + for i in range(1, self.num_layers + 1): + x = self._create_ilm_decoder_block(subnet_unit, x, i) + subnet_unit.add_layer_norm_layer("prior_decoder", x) + + subnet_unit.add_softmax_layer( + "prior_output_prob", + "prior_decoder", + loss="ce", + loss_opts={"label_smoothing": self.label_smoothing}, + target=self.target, + dropout=self.softmax_dropout, + forward_weights_init=self.ff_init, + l2=self.l2, + ) + + dec_output = self.network.add_subnet_rec_layer("output", unit=subnet_unit.get_net(), target=self.target) + + return dec_output + + def create_network(self): + dec_output = self._create_decoder(self.subnet_unit) + + # recurrent subnetwork + decision_layer_name = self.base_model.network.add_decide_layer("decision", dec_output, target=self.target) + self.decision_layer_name = decision_layer_name + + return dec_output From e7d6f351e3d2e2c35cf29d5ca785de418975f289 Mon Sep 17 00:00:00 2001 From: Judyxujj Date: Wed, 15 May 2024 23:25:24 +0800 Subject: [PATCH 006/227] Jing tedlium independent softmax (#220) * tedlium ctc pytorch * rm empty files --------- Co-authored-by: Jingjing Xu --- users/jxu/corpus/__init__.py | 0 users/jxu/corpus/general/__init__.py | 4 + .../jxu/corpus/general/corpus_preparation.py | 77 ++++++ users/jxu/corpus/general/experiment_data.py | 43 ++++ users/jxu/corpus/general/hdf.py | 216 +++++++++++++++++ 
users/jxu/corpus/general/helpers.py | 37 +++ users/jxu/corpus/tedlium2/__init__.py | 0 users/jxu/corpus/tedlium2/data.py | 136 +++++++++++ users/jxu/corpus/tedlium2/lm_data.py | 13 + .../baseline/config_01_ctc_torch_conformer.py | 2 +- .../conformer_baseline_with_log_mel.py | 135 +++++++++++ .../ctc/tedlium2/configs/configs_helper.py | 93 ++++++++ .../experiments/ctc/tedlium2/data/ctc_data.py | 155 ++++++++++++ .../baseline/conformer_size_384_log_mel.py | 224 ++++++++++++++++++ .../utils/dump_log_prob_forward_callback.py | 152 ++++++++++++ 15 files changed, 1286 insertions(+), 1 deletion(-) create mode 100644 users/jxu/corpus/__init__.py create mode 100644 users/jxu/corpus/general/__init__.py create mode 100644 users/jxu/corpus/general/corpus_preparation.py create mode 100644 users/jxu/corpus/general/experiment_data.py create mode 100644 users/jxu/corpus/general/hdf.py create mode 100644 users/jxu/corpus/general/helpers.py create mode 100644 users/jxu/corpus/tedlium2/__init__.py create mode 100644 users/jxu/corpus/tedlium2/data.py create mode 100644 users/jxu/corpus/tedlium2/lm_data.py create mode 100644 users/jxu/experiments/ctc/tedlium2/configs/baseline/conformer_baseline_with_log_mel.py create mode 100644 users/jxu/experiments/ctc/tedlium2/configs/configs_helper.py create mode 100644 users/jxu/experiments/ctc/tedlium2/data/ctc_data.py create mode 100644 users/jxu/experiments/ctc/tedlium2/pytorch_networks/baseline/conformer_size_384_log_mel.py create mode 100644 users/jxu/experiments/ctc/tedlium2/utils/dump_log_prob_forward_callback.py diff --git a/users/jxu/corpus/__init__.py b/users/jxu/corpus/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/users/jxu/corpus/general/__init__.py b/users/jxu/corpus/general/__init__.py new file mode 100644 index 000000000..15ec5f2b1 --- /dev/null +++ b/users/jxu/corpus/general/__init__.py @@ -0,0 +1,4 @@ +from .corpus_preparation import * +from .experiment_data import * +from .hdf import * +from .helpers 
import * diff --git a/users/jxu/corpus/general/corpus_preparation.py b/users/jxu/corpus/general/corpus_preparation.py new file mode 100644 index 000000000..0153f5ada --- /dev/null +++ b/users/jxu/corpus/general/corpus_preparation.py @@ -0,0 +1,77 @@ +import re +from typing import List, Tuple, Optional + + +def clean_string(s: str, custom_subs: Optional[List[Tuple[str, str]]] = None) -> str: + for pattern, substitution in list( + { + r"[\!\"\%\,\/\:\;\?\{\}\&]": "", + "`": "'", + r"\.(\w)": r"\g<1>", + r"(\s|\A)\'": r"\g<1>", + r"(\s|\A)\(": r"\g<1>", + r"(\s|\A)\)": r"\g<1>", + r"\(\S*\)": "", + r"\[\S*\]": "", + "-HYPHEN": "HYPHEN", + "--DASH": "DASH", + r" *": "", + r" *": "", + r" *<.*> *": "", + r" *< *": "", + r" *> *": "", + r" *\* *": "", + r" *, *": "", + r" *\^ *": "", + r" *\\ *": "", + r" *\| *": "", + r" *~ *": "", + r" *\[.*\] *": "", + r" *\[ *": "", + r" *\] *": "", + r" *\. *": "", + r" *# *": "", + r"\$": "dollars", + r"(.)\1+": r"\1\1", + }.items() + ) + (custom_subs or []): + s = re.sub(pattern, substitution, s) + + s = " ".join(s.split()) + return s + + +def lm_cleaning(s: str): + from returnn.datasets.lm import english_cleaners + + remove_regexes = [ + re.compile(expr) + for expr in [ + r" *", + r" *", + r" *<.*> *", + r" *< *", + r" *> *", + r" *\* *", + r" *, *", + r" *\^ *", + r" *\\ *", + r" *\| *", + r" *~ *", + r" *\[.*\] *", + r" *\[ *", + r" *\] *", + r" *\. 
*", + r" *# *", + ] + ] + replace_regexes = [ + (re.compile(r"\$"), "dollars"), + (r"(.)\1+", r"\1\1"), + ] + sentence_clean = english_cleaners(s) + for expr in remove_regexes: + sentence_clean = re.sub(expr, "", sentence_clean) + for expr, repl in replace_regexes: + sentence_clean = re.sub(expr, repl, sentence_clean) + return sentence_clean diff --git a/users/jxu/corpus/general/experiment_data.py b/users/jxu/corpus/general/experiment_data.py new file mode 100644 index 000000000..1811e96f5 --- /dev/null +++ b/users/jxu/corpus/general/experiment_data.py @@ -0,0 +1,43 @@ +from dataclasses import dataclass, field +from typing import Dict, List +from i6_experiments.users.berger import helpers +from sisyphus import tk + + +@dataclass +class BasicSetupData: + train_key: str + dev_keys: List[str] + test_keys: List[str] + align_keys: List[str] + train_data_config: Dict + cv_data_config: Dict + data_inputs: Dict[str, helpers.RasrDataInput] + + +@dataclass +class CTCSetupData(BasicSetupData): + loss_corpus: tk.Path + loss_lexicon: tk.Path + + +@dataclass +class PytorchCTCSetupData(BasicSetupData): + pass + + +@dataclass +class HybridSetupData(BasicSetupData): + pass + + +@dataclass +class SMSHybridSetupData(BasicSetupData): + scoring_corpora: Dict[str, tk.Path] + python_prolog: Dict + num_classes: int + + +@dataclass +class ReturnnSearchSetupData(BasicSetupData): + forward_data_config: Dict[str, Dict] diff --git a/users/jxu/corpus/general/hdf.py b/users/jxu/corpus/general/hdf.py new file mode 100644 index 000000000..41df1e7e8 --- /dev/null +++ b/users/jxu/corpus/general/hdf.py @@ -0,0 +1,216 @@ +from typing import Optional, List +from i6_core.corpus import SegmentCorpusJob + +from i6_core.returnn.hdf import BlissToPcmHDFJob +from i6_experiments.users.berger.recipe.returnn.hdf import BlissCorpusToTargetHdfJob +from i6_experiments.users.berger.args.returnn.dataset import MetaDatasetBuilder, hdf_config_dict_for_files +from i6_experiments.users.berger.systems.dataclasses import 
AlignmentData, FeatureType +from sisyphus import tk +from i6_experiments.users.berger.args.jobs.rasr_init_args import ( + get_feature_extraction_args_16kHz, + get_feature_extraction_args_8kHz, +) +from i6_experiments.users.berger.helpers import build_rasr_feature_hdfs, RasrDataInput, SeparatedCorpusObject +from i6_experiments.users.berger.recipe.corpus.transform import ReplaceUnknownWordsJob + + +def build_feature_hdf_dataset_config( + data_inputs: List[RasrDataInput], + feature_type: FeatureType, + returnn_root: tk.Path, + returnn_python_exe: tk.Path, + rasr_binary_path: tk.Path, + rasr_arch: str = "linux-x86_64-standard", + dc_detection: bool = False, + single_hdf: bool = False, + extra_config: Optional[dict] = None, +) -> dict: + feature_hdfs = [] + + if feature_type in { + FeatureType.GAMMATONE_16K, + FeatureType.GAMMATONE_CACHED_16K, + FeatureType.GAMMATONE_8K, + FeatureType.GAMMATONE_CACHED_8K, + }: + if feature_type == FeatureType.GAMMATONE_16K or feature_type == FeatureType.GAMMATONE_CACHED_16K: + gt_args = get_feature_extraction_args_16kHz(dc_detection=dc_detection)["gt"] + elif feature_type == FeatureType.GAMMATONE_8K or feature_type == FeatureType.GAMMATONE_CACHED_8K: + gt_args = get_feature_extraction_args_8kHz(dc_detection=dc_detection)["gt"] + else: + raise NotImplementedError + + for data_input in data_inputs: + feature_hdfs += build_rasr_feature_hdfs( + data_input.corpus_object, + split=data_input.concurrent, + feature_type="gt", + feature_extraction_args=gt_args, + returnn_python_exe=returnn_python_exe, + returnn_root=returnn_root, + rasr_binary_path=rasr_binary_path, + rasr_arch=rasr_arch, + single_hdf=single_hdf, + ) + + elif feature_type == FeatureType.SAMPLES: + for data_input in data_inputs: + if single_hdf: + segment_files = [None] + else: + segment_files = list( + SegmentCorpusJob( + data_input.corpus_object.corpus_file, data_input.concurrent + ).out_single_segment_files.values() + ) + + for segment_file in segment_files: + feature_hdf_job = 
BlissToPcmHDFJob( + data_input.corpus_object.corpus_file, + segment_file=segment_file, + rounding=BlissToPcmHDFJob.RoundingScheme.rasr_compatible, + returnn_root=returnn_root, + ) + feature_hdf_job.rqmt["mem"] = 8 + feature_hdf_job.rqmt["time"] = 24 + feature_hdfs.append(feature_hdf_job.out_hdf) + else: + raise NotImplementedError + + return hdf_config_dict_for_files(files=feature_hdfs, extra_config=extra_config) + + +def build_feature_alignment_meta_dataset_config( + data_inputs: List[RasrDataInput], + feature_type: FeatureType, + alignments: List[AlignmentData], + returnn_root: tk.Path, + returnn_python_exe: tk.Path, + rasr_binary_path: tk.Path, + rasr_arch: str = "linux-x86_64-standard", + dc_detection: bool = False, + single_hdf: bool = False, + extra_config: Optional[dict] = None, +) -> dict: + feature_hdf_config = build_feature_hdf_dataset_config( + data_inputs=data_inputs, + feature_type=feature_type, + returnn_root=returnn_root, + returnn_python_exe=returnn_python_exe, + rasr_binary_path=rasr_binary_path, + rasr_arch=rasr_arch, + dc_detection=dc_detection, + single_hdf=single_hdf, + ) + + dataset_builder = MetaDatasetBuilder() + dataset_builder.add_dataset( + name="data", dataset_config=feature_hdf_config, key_mapping={"data": "data"}, control=False + ) + + alignment_hdf_files = [ + alignment.get_hdf(returnn_python_exe=returnn_python_exe, returnn_root=returnn_root) for alignment in alignments + ] + alignment_hdf_config = hdf_config_dict_for_files(files=alignment_hdf_files, extra_config=extra_config) + dataset_builder.add_dataset( + name="classes", dataset_config=alignment_hdf_config, key_mapping={"data": "classes"}, control=True + ) + return dataset_builder.get_dict() + + +def build_multi_speaker_feature_hdf_files( + data_inputs: List[RasrDataInput], + feature_type: FeatureType, + returnn_root: tk.Path, + returnn_python_exe: tk.Path, + rasr_binary_path: tk.Path, + rasr_arch: str = "linux-x86_64-standard", + dc_detection: bool = False, + single_hdf: bool = 
False, +) -> dict: + feature_hdfs = {} + + if feature_type in { + FeatureType.CONCAT_SEC_GAMMATONE_16K, + FeatureType.CONCAT_MIX_GAMMATONE_16K, + FeatureType.CONCAT_SEC_MIX_GAMMATONE_16K, + }: + gt_args = get_feature_extraction_args_16kHz(dc_detection=dc_detection)["gt"] + + feature_hdfs_prim = [] + feature_hdfs_sec = [] + feature_hdfs_mix = [] + + for data_input in data_inputs: + assert isinstance(data_input.corpus_object, SeparatedCorpusObject) + for hdfs_list, subobject in [ + (feature_hdfs_prim, data_input.corpus_object.get_primary_corpus_object()), + (feature_hdfs_sec, data_input.corpus_object.get_secondary_corpus_object()), + (feature_hdfs_mix, data_input.corpus_object.get_mix_corpus_object()), + ]: + hdfs_list += build_rasr_feature_hdfs( + subobject, + split=data_input.concurrent, + feature_type="gt", + feature_extraction_args=gt_args, + returnn_python_exe=returnn_python_exe, + returnn_root=returnn_root, + rasr_binary_path=rasr_binary_path, + rasr_arch=rasr_arch, + single_hdf=single_hdf, + ) + feature_hdfs["primary"] = feature_hdfs_prim + if feature_type in {FeatureType.CONCAT_SEC_GAMMATONE_16K, FeatureType.CONCAT_SEC_MIX_GAMMATONE_16K}: + feature_hdfs["secondary"] = feature_hdfs_sec + if feature_type in {FeatureType.CONCAT_MIX_GAMMATONE_16K, FeatureType.CONCAT_SEC_MIX_GAMMATONE_16K}: + feature_hdfs["mix"] = feature_hdfs_mix # HDFs built from get_mix_corpus_object() above + else: + raise NotImplementedError + + return feature_hdfs + + +def build_feature_label_meta_dataset_config( + data_inputs: List[RasrDataInput], + feature_type: FeatureType, + lexicon: tk.Path, + label_dim: int, + returnn_root: tk.Path, + returnn_python_exe: tk.Path, + rasr_binary_path: tk.Path, + rasr_arch: str = "linux-x86_64-standard", + dc_detection: bool = False, + single_hdf: bool = False, + extra_config: Optional[dict] = None, +) -> dict: + feature_hdf_config = build_feature_hdf_dataset_config( + data_inputs=data_inputs, + feature_type=feature_type, + returnn_root=returnn_root, + returnn_python_exe=returnn_python_exe, +
rasr_binary_path=rasr_binary_path, + rasr_arch=rasr_arch, + dc_detection=dc_detection, + single_hdf=single_hdf, + extra_config=extra_config, + ) + + dataset_builder = MetaDatasetBuilder() + dataset_builder.add_dataset( + name="data", dataset_config=feature_hdf_config, key_mapping={"data": "data"}, control=True + ) + + label_hdf_files = [ + BlissCorpusToTargetHdfJob( + ReplaceUnknownWordsJob(data_input.corpus_object.corpus_file, lexicon_file=lexicon).out_corpus_file, + bliss_lexicon=lexicon, + returnn_root=returnn_root, + dim=label_dim, + ).out_hdf + for data_input in data_inputs + ] + label_hdf_config = hdf_config_dict_for_files(files=label_hdf_files) + dataset_builder.add_dataset( + name="classes", dataset_config=label_hdf_config, key_mapping={"data": "classes"}, control=False + ) + return dataset_builder.get_dict() diff --git a/users/jxu/corpus/general/helpers.py b/users/jxu/corpus/general/helpers.py new file mode 100644 index 000000000..f4aabc02c --- /dev/null +++ b/users/jxu/corpus/general/helpers.py @@ -0,0 +1,37 @@ +from typing import Union +from sisyphus import tk +from i6_core.corpus import FilterCorpusRemoveUnknownWordSegmentsJob +from i6_experiments.users.berger.helpers import SeparatedCorpusObject, ScorableCorpusObject +from i6_core.meta.system import CorpusObject as metaCorpusObject +from i6_experiments.common.datasets.util import CorpusObject as i6CorpusObject + + +def filter_unk_in_corpus_object( + corpus_object: Union[metaCorpusObject, i6CorpusObject, ScorableCorpusObject, SeparatedCorpusObject], + lexicon: tk.Path, +) -> None: + if isinstance(corpus_object, (metaCorpusObject, i6CorpusObject, ScorableCorpusObject)): + assert corpus_object.corpus_file is not None + corpus_object.corpus_file = FilterCorpusRemoveUnknownWordSegmentsJob( + bliss_corpus=corpus_object.corpus_file, + bliss_lexicon=lexicon, + all_unknown=False, + ).out_corpus + elif isinstance(corpus_object, SeparatedCorpusObject): + corpus_object.primary_corpus_file = 
FilterCorpusRemoveUnknownWordSegmentsJob( + bliss_corpus=corpus_object.primary_corpus_file, + bliss_lexicon=lexicon, + all_unknown=False, + ).out_corpus + corpus_object.secondary_corpus_file = FilterCorpusRemoveUnknownWordSegmentsJob( + bliss_corpus=corpus_object.secondary_corpus_file, + bliss_lexicon=lexicon, + all_unknown=False, + ).out_corpus + corpus_object.mix_corpus_file = FilterCorpusRemoveUnknownWordSegmentsJob( + bliss_corpus=corpus_object.mix_corpus_file, + bliss_lexicon=lexicon, + all_unknown=False, + ).out_corpus + else: + raise NotImplementedError diff --git a/users/jxu/corpus/tedlium2/__init__.py b/users/jxu/corpus/tedlium2/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/users/jxu/corpus/tedlium2/data.py b/users/jxu/corpus/tedlium2/data.py new file mode 100644 index 000000000..445c927bc --- /dev/null +++ b/users/jxu/corpus/tedlium2/data.py @@ -0,0 +1,136 @@ +import copy +from sisyphus import tk +from typing import Dict, List, Optional, Tuple + +from i6_core.bpe.train import ReturnnTrainBpeJob +from i6_core.tools import CloneGitRepositoryJob +from i6_experiments.common.datasets.tedlium2.textual_data import get_text_data_dict +import i6_experiments.common.datasets.tedlium2.lexicon as tdl_lexicon +from i6_experiments.common.baselines.tedlium2.data import get_corpus_data_inputs +from i6_experiments.common.setups.rasr import util as rasr_util +from i6_experiments.users.berger import helpers +from i6_experiments.users.berger.recipe import lexicon +from i6_experiments.users.berger.corpus.general.helpers import filter_unk_in_corpus_object +from .lm_data import get_lm + + +def get_data_inputs( + train_key: str = "train", + cv_keys: Optional[List[str]] = None, + dev_keys: Optional[List[str]] = None, + test_keys: Optional[List[str]] = None, + lm_names: Optional[List[str]] = None, + add_unknown_phoneme_and_mapping: bool = False, + ctc_lexicon: bool = False, + filter_unk_from_corpus: bool = False, + use_augmented_lexicon: bool = True, + 
add_all_allophones: bool = False, +) -> Tuple[Dict[str, helpers.RasrDataInput], ...]: + if cv_keys is None: + cv_keys = ["dev"] + if dev_keys is None: + dev_keys = ["dev"] + if test_keys is None: + test_keys = ["test"] + if lm_names is None: + lm_names = ["4gram"] + + data_inputs = get_corpus_data_inputs(add_unknown_phoneme_and_mapping=add_unknown_phoneme_and_mapping) + + assert data_inputs["dev"]["dev"].lm + + lms = {key: get_lm(key) for key in lm_names} + + original_bliss_lexicon = tdl_lexicon.get_bliss_lexicon( + add_unknown_phoneme_and_mapping=add_unknown_phoneme_and_mapping + ) + + if use_augmented_lexicon: + bliss_lexicon = tdl_lexicon.get_g2p_augmented_bliss_lexicon( + add_unknown_phoneme_and_mapping=add_unknown_phoneme_and_mapping + ) + else: + bliss_lexicon = original_bliss_lexicon + + bliss_lexicon = lexicon.EnsureSilenceFirstJob(bliss_lexicon).out_lexicon + + if ctc_lexicon: + bliss_lexicon = lexicon.DeleteEmptyOrthJob(bliss_lexicon).out_lexicon + bliss_lexicon = lexicon.MakeBlankLexiconJob(bliss_lexicon).out_lexicon + + lexicon_config = helpers.LexiconConfig( + filename=bliss_lexicon, + normalize_pronunciation=False, + add_all_allophones=add_all_allophones, + add_allophones_from_lexicon=not add_all_allophones, + ) + + train_data_inputs = {} + cv_data_inputs = {} + dev_data_inputs = {} + test_data_inputs = {} + + train_corpus_object = data_inputs[train_key][train_key].corpus_object + if filter_unk_from_corpus: + train_corpus_object = copy.deepcopy(train_corpus_object) + filter_unk_in_corpus_object(train_corpus_object, bliss_lexicon) + + train_data_inputs[train_key] = helpers.RasrDataInput( + corpus_object=helpers.convert_legacy_corpus_object_to_scorable(train_corpus_object), + concurrent=data_inputs[train_key][train_key].concurrent, + lexicon=lexicon_config, + ) + + for cv_key in cv_keys: + cv_corpus_object = data_inputs[cv_key][cv_key].corpus_object + if filter_unk_from_corpus: + cv_corpus_object = copy.deepcopy(cv_corpus_object) + 
filter_unk_in_corpus_object(cv_corpus_object, bliss_lexicon) + cv_data_inputs[cv_key] = helpers.RasrDataInput( + corpus_object=helpers.convert_legacy_corpus_object_to_scorable(cv_corpus_object), + concurrent=data_inputs[cv_key][cv_key].concurrent, + lexicon=lexicon_config, + ) + + for dev_key in dev_keys: + for lm_name, lm in lms.items(): + dev_data_inputs[f"{dev_key}_{lm_name}"] = helpers.RasrDataInput( + corpus_object=helpers.convert_legacy_corpus_object_to_scorable( + data_inputs[dev_key][dev_key].corpus_object + ), + concurrent=data_inputs[dev_key][dev_key].concurrent, + lexicon=lexicon_config, + lm=lm, + ) + + for test_key in test_keys: + for lm_name, lm in lms.items(): + test_data_inputs[f"{test_key}_{lm_name}"] = helpers.RasrDataInput( + corpus_object=helpers.convert_legacy_corpus_object_to_scorable( + data_inputs[test_key][test_key].corpus_object + ), + concurrent=data_inputs[test_key][test_key].concurrent, + lexicon=lexicon_config, + lm=lm, + ) + + return train_data_inputs, cv_data_inputs, dev_data_inputs, test_data_inputs + + +def get_final_gmm_output(): + output_args = rasr_util.OutputArgs("final") + + for ck in ["train", "dev", "test"]: + output_args.define_corpus_type(ck, ck) + + output_args.add_feature_to_extract("gt") + + return output_args + + +def get_bpe(size: int, subword_nmt_repo: Optional[tk.Path] = None) -> ReturnnTrainBpeJob: + txt_file = get_text_data_dict()["background-data"] + if subword_nmt_repo is None: + subword_nmt_repo = CloneGitRepositoryJob("https://github.com/albertz/subword-nmt.git").out_repository + + return ReturnnTrainBpeJob(txt_file, size, subword_nmt_repo=subword_nmt_repo) diff --git a/users/jxu/corpus/tedlium2/lm_data.py b/users/jxu/corpus/tedlium2/lm_data.py new file mode 100644 index 000000000..6253f9830 --- /dev/null +++ b/users/jxu/corpus/tedlium2/lm_data.py @@ -0,0 +1,13 @@ +from i6_experiments.users.berger.helpers import rasr_lm_config +from i6_experiments.common.baselines.tedlium2.data import get_corpus_data_inputs + 
+ +def get_lm(name: str) -> rasr_lm_config.LMData: + lm_dict = {} + + ted_4gram = get_corpus_data_inputs()["dev"]["dev"].lm + assert ted_4gram is not None + + lm_dict["4gram"] = rasr_lm_config.ArpaLMData(filename=ted_4gram["filename"], scale=ted_4gram.get("scale", 1.0)) + + return lm_dict[name] diff --git a/users/jxu/experiments/ctc/swb/configs/baseline/config_01_ctc_torch_conformer.py b/users/jxu/experiments/ctc/swb/configs/baseline/config_01_ctc_torch_conformer.py index fdbe2c492..d55b45da1 100644 --- a/users/jxu/experiments/ctc/swb/configs/baseline/config_01_ctc_torch_conformer.py +++ b/users/jxu/experiments/ctc/swb/configs/baseline/config_01_ctc_torch_conformer.py @@ -33,7 +33,7 @@ "/u/berger/repositories/rasr_versions/gen_seq2seq_dev/arch/linux-x86_64-standard", hash_overwrite="/u/berger/repositories/rasr_versions/gen_seq2seq_dev/arch/linux-x86_64-standard" ) -tools.returnn_root = tk.Path("/u/jxu/setups/tedlium2/2023-07-11--ctc-tedlium/tools/20240509_returnn/returnn", +tools.returnn_root = tk.Path("/u/jxu/setups/tedlium2/2023-07-11--ctc-tedlium2/tools/20240509_returnn/returnn", hash_overwrite="/u/berger/repositories/returnn") SCTK_BINARY_PATH = compile_sctk() # use last published version SCTK_BINARY_PATH.hash_overwrite = "LBS_DEFAULT_SCTK_BINARY_PATH" diff --git a/users/jxu/experiments/ctc/tedlium2/configs/baseline/conformer_baseline_with_log_mel.py b/users/jxu/experiments/ctc/tedlium2/configs/baseline/conformer_baseline_with_log_mel.py new file mode 100644 index 000000000..20281ceb7 --- /dev/null +++ b/users/jxu/experiments/ctc/tedlium2/configs/baseline/conformer_baseline_with_log_mel.py @@ -0,0 +1,135 @@ +import functools +from typing import Any, Dict, List, Optional, Union +import copy + +import numpy as np + +import i6_core.returnn as returnn +import i6_experiments.users.jxu.experiments.ctc.tedlium2.configs.configs_helper as configs_helper +from i6_experiments.users.berger.systems.dataclasses import ReturnnConfigs +from 
i6_experiments.common.setups.returnn_pytorch.serialization import Collection +from i6_experiments.users.berger.systems.dataclasses import ConfigVariant + +# ********** Constant values ********** + +num_outputs = 79 +num_subepochs = 250 + + +# ********** Settings ********** + +def get_returnn_config( + network: Optional[Dict] = None, + *, + target: Optional[str] = "classes", + num_inputs: Optional[int] = None, + num_outputs: Optional[int] = None, + python_prolog: Optional[Union[List, Dict]] = None, + extern_data_config: bool = False, + extra_python: Optional[List] = None, + extra_config: Optional[Dict] = None, + hash_full_python_code: bool = False, + **kwargs, +) -> returnn.ReturnnConfig: + python_prolog = python_prolog or ["import numpy as np"] + extra_python = extra_python or [] + config_dict: dict[str, Any] = {"target": target} + if num_inputs is not None: + config_dict["num_inputs"] = num_inputs + if num_outputs is not None: + config_dict["num_outputs"] = {target: num_outputs} + if extern_data_config: + config_dict.update( + configs_helper.get_extern_data_config(num_inputs=1, num_outputs=num_outputs, target=target, + **kwargs) + ) + config_dict.update(configs_helper.get_base_config()) + + if network: + config_dict.update({"network:": network}) + + lrate_config = configs_helper.get_oclr_config(**kwargs) + config_dict.update(lrate_config) + + config_dict.update(configs_helper.get_base_regularization_config(**kwargs)) + + if extra_config: + config_dict.update(extra_config) + + post_config_dict = {} + post_config_dict.update(configs_helper.get_base_post_config(**kwargs)) + + return returnn.ReturnnConfig( + config=config_dict, + post_config=post_config_dict, + hash_full_python_code=hash_full_python_code, + python_prolog=python_prolog, + python_epilog=extra_python, + pprint_kwargs={"sort_dicts": False}, + ) + +def get_serializer(model_config, variant: ConfigVariant, in_dim: int = 1) -> Collection: + from 
i6_experiments.users.jxu.experiments.ctc.tedlium2.pytorch_networks.baseline.conformer_size_384_log_mel import \ + get_train_serializer, get_recog_serializer, get_prior_serializer + if variant == ConfigVariant.TRAIN: + return get_train_serializer(model_config) + if variant == ConfigVariant.PRIOR: + return get_prior_serializer(model_config) + if variant == ConfigVariant.RECOG: + return get_recog_serializer(model_config) + raise NotImplementedError + +def returnn_config_generator(train_data_config: dict, dev_data_config: dict, peak_lr: float) -> dict: + from i6_experiments.users.jxu.experiments.ctc.tedlium2.pytorch_networks.baseline.conformer_size_384_log_mel import get_default_config_v1 + + extra_config = { + "train": train_data_config, + "dev": dev_data_config, + } + recog_extra_config = copy.deepcopy(extra_config) + recog_extra_config["model_outputs"] = {"classes": {"dim": num_outputs}} + + config_partial = functools.partial( + get_returnn_config, + num_epochs=num_subepochs, + num_inputs=50, + num_outputs=num_outputs, + target="targets", + extern_data_config=True, + grad_noise=0.0, + grad_clip=0.0, + cycle_epoch=110, + lr_1= peak_lr / 100, + lr_2 = peak_lr / 10, + peak_lr=peak_lr, + final_lr=1e-08, + batch_size=18000 * 160, + extra_config=extra_config, + ) + + def get_returnn_configs(train_config, recog_config): + return ReturnnConfigs( + train_config=config_partial( + extra_python=[get_serializer(train_config, ConfigVariant.TRAIN)], + extra_config=extra_config), + prior_config=config_partial(extra_python=[get_serializer(recog_config, ConfigVariant.PRIOR)], + extra_config=extra_config), + recog_configs={ + "recog": config_partial(extra_python=[get_serializer(recog_config, ConfigVariant.RECOG)], + extra_config=recog_extra_config)}, + ) + + experiments = {} + network_args = {"specaug_args": {"time_min_num_masks": 2, + "time_max_mask_per_n_frames": 25, + "time_mask_max_size": 20, + "freq_min_num_masks": 2, + "freq_mask_max_size": 5, + "freq_max_num_masks": 16}, 
"final_dropout": 0.2, "att_weights_dropout": 0.2} + + config = get_default_config_v1(num_inputs=80, num_outputs=num_outputs, + network_args=network_args) + num_layers_12_experiment_name = f"conformer_logmel_{peak_lr}" + experiments[num_layers_12_experiment_name] = get_returnn_configs(config, config) + + return experiments \ No newline at end of file diff --git a/users/jxu/experiments/ctc/tedlium2/configs/configs_helper.py b/users/jxu/experiments/ctc/tedlium2/configs/configs_helper.py new file mode 100644 index 000000000..f1a6e5d04 --- /dev/null +++ b/users/jxu/experiments/ctc/tedlium2/configs/configs_helper.py @@ -0,0 +1,93 @@ +from typing import Any, Dict, List, Optional, Union +import numpy as np + +def get_base_config() -> Dict[str, Any]: + result = { + # "debug_print_layer_output_template": True, + "log_batch_size": True, + "tf_log_memory_usage": True, + "cache_size": "0", + "batching": "random", + "window": 1, + "update_on_device": True, + "backend": "torch", + "optimizer": {"class": "adamw", "epsilon": 1e-16, "weight_decay": 1e-3} + } + + return result + + +def get_extern_data_config( + num_inputs: Optional[int], + num_outputs: Optional[int], + extern_data_kwargs: Dict = {}, + extern_target_kwargs: Dict = {}, + target: Optional[str] = "classes", + **kwargs, +) -> Dict[str, Any]: + result = {} + if num_inputs is not None: + result["data"] = {"dim": num_inputs, **extern_data_kwargs} + if num_outputs is not None and target is not None: + result[target] = {"dim": num_outputs, "sparse": True, **extern_target_kwargs} + return {"extern_data": result} + + +def get_base_regularization_config( + batch_size: int = 10000, + max_seqs: int = 128, + accum_grad: int = 1, + grad_noise: Optional[float] = 0.1, + grad_clip: Optional[float] = None, + grad_clip_global_norm: Optional[float] = None, + **kwargs, +) -> Dict[str, Any]: + result = {"batch_size": batch_size, "max_seqs": max_seqs} + if grad_noise is not None: + result["gradient_noise"] = grad_noise + if grad_clip is not 
None: + result["gradient_clip"] = grad_clip + if grad_clip_global_norm is not None: + result["gradient_clip_global_norm"] = grad_clip_global_norm + if accum_grad > 1: + result["accum_grad_multiple_step"] = accum_grad + return result + + +def get_base_post_config(keep_last_n: Optional[int] = None, keep_best_n: Optional[int] = None, keep: Optional[List[int]] = None, **kwargs) -> Dict[str, Any]: + if keep_last_n is None and keep_best_n is None and keep is None: + post_config = {"cleanup_old_models": True} + else: + cleanup_opts = {} + if keep_last_n is not None: + cleanup_opts["keep_last_n"] = keep_last_n + if keep_best_n is not None: + cleanup_opts["keep_best_n"] = keep_best_n + if keep is not None: + cleanup_opts["keep"] = keep + post_config = {"cleanup_old_models": cleanup_opts} + return post_config + + +def get_oclr_config( + num_epochs: int, + peak_lr: float = 1e-03, + cycle_epoch: Optional[int] = None, + lr_1: Optional[float] = None, + lr_2: Optional[float] = None, + final_lr: Optional[float] = None, + **kwargs, +) -> dict: + lr_1 = lr_1 or peak_lr / 10 + lr_2 = lr_2 or peak_lr / 10 + final_lr = final_lr or lr_1 / 5 + cycle_epoch = cycle_epoch or (num_epochs * 9) // 20 # 45% of the training + lr_list = ( + list(np.linspace(lr_1, peak_lr, cycle_epoch, endpoint=False)) + + list(np.linspace(peak_lr, lr_2, cycle_epoch, endpoint=False)) + + list(np.linspace(lr_2, final_lr, num_epochs - 2 * cycle_epoch)) + ) + + return { + "learning_rates": lr_list, + } \ No newline at end of file diff --git a/users/jxu/experiments/ctc/tedlium2/data/ctc_data.py b/users/jxu/experiments/ctc/tedlium2/data/ctc_data.py new file mode 100644 index 000000000..447517e77 --- /dev/null +++ b/users/jxu/experiments/ctc/tedlium2/data/ctc_data.py @@ -0,0 +1,155 @@ +import copy +from i6_core import corpus +from i6_core.lexicon.modification import AddEowPhonemesToLexiconJob +from i6_core.returnn.hdf import BlissToPcmHDFJob +from i6_experiments.users.berger.args.jobs.rasr_init_args import ( + 
get_feature_extraction_args_16kHz, +) +from i6_experiments.users.berger.args.returnn.dataset import MetaDatasetBuilder, hdf_config_dict_for_files +from i6_experiments.users.berger.corpus.general.experiment_data import ( + CTCSetupData, + PytorchCTCSetupData, +) +from i6_experiments.users.berger.helpers.hdf import build_rasr_feature_hdfs +from i6_experiments.users.berger.recipe.returnn.hdf import BlissCorpusToTargetHdfJob +from i6_experiments.users.berger.systems.dataclasses import FeatureType +from i6_experiments.users.jxu.corpus.tedlium2 import data +from sisyphus import tk + + +def get_tedlium2_pytorch_data( + returnn_root: tk.Path, + returnn_python_exe: tk.Path, + rasr_binary_path: tk.Path, + rasr_arch: str = "linux-x86_64-standard", + add_unknown: bool = False, + augmented_lexicon: bool = True, + feature_type: FeatureType = FeatureType.GAMMATONE_16K, +) -> PytorchCTCSetupData: + # ********** Data inputs ********** + train_data_inputs, cv_data_inputs, dev_data_inputs, test_data_inputs = copy.deepcopy( + data.get_data_inputs( + ctc_lexicon=True, + use_augmented_lexicon=augmented_lexicon, + add_all_allophones=True, + add_unknown_phoneme_and_mapping=add_unknown, + ) + ) + + # ********** Train data ********** + train_corpus_object = train_data_inputs["train"].corpus_object + eow_lexicon = AddEowPhonemesToLexiconJob(train_data_inputs["train"].lexicon.filename).out_lexicon + assert train_corpus_object.corpus_file is not None + + if not add_unknown and not augmented_lexicon: + train_corpus_object.corpus_file = corpus.FilterCorpusRemoveUnknownWordSegmentsJob( + train_corpus_object.corpus_file, + eow_lexicon, + all_unknown=False, + ).out_corpus + + train_dataset_builder = MetaDatasetBuilder() + if feature_type == FeatureType.GAMMATONE_16K: + gt_args = get_feature_extraction_args_16kHz()["gt"] + train_feature_hdf = build_rasr_feature_hdfs( + train_corpus_object, + split=train_data_inputs["train"].concurrent, + feature_type="gt", + feature_extraction_args=gt_args, + 
returnn_python_exe=returnn_python_exe, + returnn_root=returnn_root, + rasr_binary_path=rasr_binary_path, + rasr_arch=rasr_arch, + ) + elif feature_type == FeatureType.SAMPLES: + train_feature_hdf = BlissToPcmHDFJob(train_corpus_object.corpus_file, returnn_root=returnn_root).out_hdf + else: + raise NotImplementedError + + train_dataset_builder.add_dataset( + dataset_config=hdf_config_dict_for_files([train_feature_hdf]), + name="features", + key_mapping={"data": "data"}, + ) + + train_targets_hdf = BlissCorpusToTargetHdfJob( + train_corpus_object.corpus_file, + bliss_lexicon=eow_lexicon, + returnn_root=returnn_root, + ).out_hdf + train_dataset_builder.add_dataset( + dataset_config=hdf_config_dict_for_files([train_targets_hdf], {"partition_epoch": 5, "seq_ordering":"laplace:.1000"}), + name="targets", + key_mapping={"data": "targets"}, + control=True, + ) + + train_data_config = train_dataset_builder.get_dict() + + # ********** CV data ********** + cv_corpus_object = copy.deepcopy(dev_data_inputs["dev_4gram"].corpus_object) + + if not add_unknown: + cv_corpus_object.corpus_file = corpus.FilterCorpusRemoveUnknownWordSegmentsJob( + cv_corpus_object.corpus_file, + eow_lexicon, + all_unknown=False, + ).out_corpus + + cv_dataset_builder = MetaDatasetBuilder() + + if feature_type == FeatureType.GAMMATONE_16K: + gt_args = get_feature_extraction_args_16kHz()["gt"] + cv_feature_hdf = build_rasr_feature_hdfs( + cv_corpus_object, + split=1, + feature_type="gt", + feature_extraction_args=gt_args, + returnn_python_exe=returnn_python_exe, + returnn_root=returnn_root, + rasr_binary_path=rasr_binary_path, + rasr_arch=rasr_arch, + ) + elif feature_type == FeatureType.SAMPLES: + cv_feature_hdf = BlissToPcmHDFJob(cv_corpus_object.corpus_file, returnn_root=returnn_root).out_hdf + else: + raise NotImplementedError + + cv_dataset_builder.add_dataset( + dataset_config=hdf_config_dict_for_files([cv_feature_hdf]), + name="features", + key_mapping={"data": "data"}, + ) + + cv_targets_hdf = 
BlissCorpusToTargetHdfJob( + cv_corpus_object.corpus_file, + bliss_lexicon=eow_lexicon, + returnn_root=returnn_root, + ).out_hdf + cv_dataset_builder.add_dataset( + dataset_config=hdf_config_dict_for_files([cv_targets_hdf], {"partition_epoch": 1, "seq_ordering": "sorted"}), + name="targets", + key_mapping={"data": "targets"}, + control=True, + ) + + cv_data_config = cv_dataset_builder.get_dict() + + # ********** Recog lexicon ********** + + for rasr_input in {**dev_data_inputs, **test_data_inputs}.values(): + rasr_input.lexicon.filename = eow_lexicon + + return PytorchCTCSetupData( + train_key="train", + dev_keys=["dev_4gram"], + test_keys=["test_4gram"], + align_keys=["train", "dev"], + train_data_config=train_data_config, + cv_data_config=cv_data_config, + data_inputs={ + **train_data_inputs, + **dev_data_inputs, + **test_data_inputs, + }, + ) diff --git a/users/jxu/experiments/ctc/tedlium2/pytorch_networks/baseline/conformer_size_384_log_mel.py b/users/jxu/experiments/ctc/tedlium2/pytorch_networks/baseline/conformer_size_384_log_mel.py new file mode 100644 index 000000000..1b44b4463 --- /dev/null +++ b/users/jxu/experiments/ctc/tedlium2/pytorch_networks/baseline/conformer_size_384_log_mel.py @@ -0,0 +1,224 @@ +from dataclasses import dataclass +from typing import Optional +from i6_experiments.users.berger.systems.dataclasses import ConfigVariant + +import torch +from torch import nn + +from i6_models.parts.frontend.vgg_act import VGG4LayerActFrontendV1, VGG4LayerActFrontendV1Config +from i6_models.parts.conformer.convolution import ConformerConvolutionV1Config, ConformerConvolutionV1 +from i6_models.parts.conformer.mhsa import ConformerMHSAV1Config, ConformerMHSAV1 +from i6_models.parts.conformer.feedforward import ConformerPositionwiseFeedForwardV1Config, \ + ConformerPositionwiseFeedForwardV1 +from i6_experiments.users.berger.pytorch.models.util import lengths_to_padding_mask +from i6_experiments.common.setups.serialization import Import +from 
i6_experiments.common.setups.returnn_pytorch.serialization import Collection +from i6_experiments.users.berger.pytorch.serializers.basic import ( + get_basic_pt_network_serializer, +) +from i6_models.assemblies.conformer.conformer_v2 import ConformerBlockV2Config, ConformerEncoderV2Config, ConformerBlockV2, ConformerEncoderV2 +from i6_models.primitives.specaugment import specaugment_v1_by_length +from i6_models.primitives.feature_extraction import LogMelFeatureExtractionV1, LogMelFeatureExtractionV1Config +from i6_models.config import ModelConfiguration, ModuleFactoryV1 + + +@dataclass +class ConformerCTCConfig(ModelConfiguration): + feature_extraction_cfg: LogMelFeatureExtractionV1Config + specaug_args: dict + conformer_cfg: ConformerEncoderV2Config + final_dropout: float + target_size: int + + +class ConformerCTCModel(torch.nn.Module): + def __init__(self, step: int, cfg: ConformerCTCConfig, **kwargs): + super().__init__() + self.logmel_feat_extraction = LogMelFeatureExtractionV1(cfg=cfg.feature_extraction_cfg) + self.specaug_args = cfg.specaug_args + self.conformer = ConformerEncoderV2(cfg.conformer_cfg) + self.final_linear = torch.nn.Linear(cfg.conformer_cfg.block_cfg.ff_cfg.input_dim, cfg.target_size) + self.final_dropout = nn.Dropout(p=cfg.final_dropout) + self.export_mode = False + + def forward( + self, + audio_features: torch.Tensor, + audio_features_len: Optional[torch.Tensor] = None, + ): + with torch.no_grad(): + squeezed_features = torch.squeeze(audio_features) + if self.export_mode: + squeezed_features = squeezed_features.type(torch.FloatTensor) + else: + squeezed_features = squeezed_features.type(torch.cuda.FloatTensor) + + audio_features, audio_features_len = self.logmel_feat_extraction(squeezed_features, audio_features_len) + + if self.training: + x = specaugment_v1_by_length(audio_features,**self.specaug_args) # [B, T, F] + else: + x = audio_features + # sequence_mask = None if self.export_mode else lengths_to_padding_mask(audio_features_len) + 
sequence_mask = lengths_to_padding_mask(audio_features_len) + # sequence_mask = lengths_to_padding_mask((audio_features_len + 2) // 3) + x, sequence_mask = self.conformer(x, sequence_mask) # [B, T, F] + x = self.final_dropout(x) + logits = self.final_linear(x) # [B, T, F] + log_probs = torch.log_softmax(logits, dim=2) + + if self.training: + return log_probs, sequence_mask + return log_probs + + +def get_default_config_v1(num_inputs: int, num_outputs: int, network_args: dict) -> ConformerCTCConfig: + dropout = 0.2 if "dropout" not in network_args else network_args["dropout"] + num_att_heads = 6 if "num_att_heads" not in network_args else network_args["num_att_heads"] + att_weights_dropout = 0.1 if "att_weights_dropout" not in network_args else network_args["att_weights_dropout"] + num_layers = 12 if "num_layers" not in network_args else network_args["num_layers"] + kernel_size = 31 if "kernel_size" not in network_args else network_args["kernel_size"] + specaug_args = {"time_min_num_masks": 2, + "time_max_mask_per_n_frames": 25, + "time_mask_max_size": 20, + "freq_min_num_masks": 2, + "freq_mask_max_size": 5, + "freq_max_num_masks": 10} if "specaug_args" not in network_args else network_args["specaug_args"] + final_dropout = 0 if "final_dropout" not in network_args else network_args["final_dropout"] + + feature_extraction_cfg = LogMelFeatureExtractionV1Config( + sample_rate=16000, + win_size=0.025, + hop_size=0.01, + f_min=60, + f_max=7600, + min_amp=1e-10, + num_filters=80, + center=False, + ) + + frontend_cfg = VGG4LayerActFrontendV1Config( + in_features=num_inputs, + conv1_channels=32, + conv2_channels=64, + conv3_channels=64, + conv4_channels=32, + conv_kernel_size=(3, 3), + conv_padding=None, + pool1_kernel_size=(2, 1), + pool1_stride=(2, 1), + pool1_padding=None, + pool2_kernel_size=(2, 1), + pool2_stride=(2, 1), + pool2_padding=None, + activation=torch.nn.ReLU(), + out_features=384, + ) + + frontend = ModuleFactoryV1(VGG4LayerActFrontendV1, frontend_cfg) + + 
ff_cfg = ConformerPositionwiseFeedForwardV1Config( + input_dim=384, + hidden_dim=1536, + dropout=dropout, + activation=torch.nn.SiLU(), + ) + + mhsa_cfg = ConformerMHSAV1Config( + input_dim=384, + num_att_heads=num_att_heads, + att_weights_dropout=att_weights_dropout, + dropout=dropout, + ) + + conv_cfg = ConformerConvolutionV1Config( + channels=384, + kernel_size=kernel_size, + dropout=dropout, + activation=torch.nn.SiLU(), + norm=torch.nn.LayerNorm(384), + ) + + block_cfg = ConformerBlockV2Config( + ff_cfg=ff_cfg, + mhsa_cfg=mhsa_cfg, + conv_cfg=conv_cfg, + modules=["ff", "conv", "mhsa", "ff"], + ) + + conformer_cfg = ConformerEncoderV2Config( + num_layers=num_layers, + frontend=frontend, + block_cfg=block_cfg, + ) + + return ConformerCTCConfig( + feature_extraction_cfg=feature_extraction_cfg, + specaug_args=specaug_args, + conformer_cfg=conformer_cfg, + target_size=num_outputs, + final_dropout=final_dropout + ) + + +def export(*, model: torch.nn.Module, model_filename: str): + dummy_data = torch.randn(1, 30 * 160, 1, device="cpu") + dummy_data_len = torch.ones((1,), dtype=torch.int32) * 30 * 160 + + model.export_mode = True + torch.onnx.export( + model=model.eval(), + args=(dummy_data, dummy_data_len), + f=model_filename, + verbose=True, + input_names=["data", "data_len"], + output_names=["classes"], + opset_version=17, + dynamic_axes={ + # dict value: manually named axes + "data": {0: "batch", 1: "time"}, + "data_len": {0: "batch"}, + "targets": {0: "batch", 1: "time"}, + }, + ) + + +def get_recog_serializer( + model_config: ConformerCTCConfig, +) -> Collection: + return get_basic_pt_network_serializer( + module_import_path=f"{__name__}.ConformerCTCModel", + model_config=model_config, + additional_serializer_objects=[ + Import(f"{__name__}.export"), + ], + ) + + +def get_prior_serializer( + model_config: ConformerCTCConfig, +) -> Collection: + berger_pytorch_package = "i6_experiments.users.berger.pytorch" + return get_basic_pt_network_serializer( + 
module_import_path=f"{__name__}.ConformerCTCModel", + model_config=model_config, + additional_serializer_objects=[ + Import(f"{berger_pytorch_package}.forward.basic.forward_step"), + Import(f"{berger_pytorch_package}.forward.prior_callback.ComputePriorCallback", + import_as="forward_callback"), + ], + ) + + +def get_train_serializer( + model_config: ConformerCTCConfig, +) -> Collection: + # pytorch_package = __package__.rpartition(".")[0] + pytorch_package = "i6_experiments.users.berger.pytorch" + return get_basic_pt_network_serializer( + module_import_path=f"{__name__}.{ConformerCTCModel.__name__}", + model_config=model_config, + additional_serializer_objects=[ + Import(f"{__name__}.train_step"), + ], + ) \ No newline at end of file diff --git a/users/jxu/experiments/ctc/tedlium2/utils/dump_log_prob_forward_callback.py b/users/jxu/experiments/ctc/tedlium2/utils/dump_log_prob_forward_callback.py new file mode 100644 index 000000000..407a05ec2 --- /dev/null +++ b/users/jxu/experiments/ctc/tedlium2/utils/dump_log_prob_forward_callback.py @@ -0,0 +1,152 @@ +import copy +import os +import subprocess as sp +from typing import Optional, Union + +import numpy +import numpy as np +import torch +import h5py + +from returnn.forward_iface import ForwardCallbackIface +from returnn.tensor.tensor_dict import TensorDict +from sisyphus import Job, Task, tk + +from i6_core import util +from i6_core.returnn.config import ReturnnConfig +from i6_core.returnn.training import Checkpoint, PtCheckpoint + + + +class DumpLogProbCallback(ForwardCallbackIface): + def init(self, *, model: torch.nn.Module): + self.n = 1 + self.inputs = [] + self.seqLengths = [] + self.seqTags = [] + + def process_seq(self, *, seq_tag: str, outputs: TensorDict): + log_prob_tensor = outputs["log_probs"].raw_tensor + assert log_prob_tensor is not None + self.inputs.extend(list(log_prob_tensor.cpu().numpy())) + self.seqLengths.extend([[len(list(log_prob_tensor.cpu().numpy())), 0]]) + self.seqTags.extend([seq_tag]) 
+ + + def finish(self): + with h5py.File("../output/log_probs.hdf", 'w') as hf: + hf.create_dataset('inputs', data=np.array(self.inputs)) + hf.create_dataset('seqLengths', data=np.array(self.seqLengths)) + utf8_type = h5py.string_dtype('utf-8', 100) + asciiList = numpy.array([n.encode("utf-8") for n in self.seqTags], dtype=utf8_type) + hf.create_dataset('seqTags', data=asciiList) + + + +class ReturnnForwardComputePriorJob(Job): + def __init__( + self, + model_checkpoint: Optional[Union[Checkpoint, PtCheckpoint]], + returnn_config: ReturnnConfig, + returnn_python_exe: tk.Path, + returnn_root: tk.Path, + *, # args below are keyword only + log_verbosity: int = 5, + device: str = "gpu", + time_rqmt: float = 4, + mem_rqmt: float = 4, + cpu_rqmt: int = 2, + ): + self.returnn_config = returnn_config + self.model_checkpoint = model_checkpoint + self.returnn_python_exe = returnn_python_exe + self.returnn_root = returnn_root + self.log_verbosity = log_verbosity + self.device = device + + self.out_returnn_config_file = self.output_path("returnn.config") + + self.out_log_prob = self.output_path("log_probs.hdf") + + self.rqmt = { + "gpu": 1 if device == "gpu" else 0, + "cpu": cpu_rqmt, + "mem": mem_rqmt, + "time": time_rqmt, + } + + def tasks(self): + yield Task("create_files", mini_task=True) + yield Task("run", resume="run", rqmt=self.rqmt) + + def create_files(self): + config = self.create_returnn_config( + model_checkpoint=self.model_checkpoint, + returnn_config=self.returnn_config, + log_verbosity=self.log_verbosity, + device=self.device, + ) + config.write(self.out_returnn_config_file.get_path()) + + cmd = [ + self.returnn_python_exe.get_path(), + os.path.join(self.returnn_root.get_path(), "rnn.py"), + self.out_returnn_config_file.get_path(), + ] + util.create_executable("rnn.sh", cmd) + + # check here if model actually exists + if self.model_checkpoint is not None: + assert self.model_checkpoint.exists(), "Provided model does not exists: %s" % str(self.model_checkpoint) 
+ + def run(self): + sp.check_call( + [ + self.returnn_python_exe.get_path(), + self.returnn_root.join_right("rnn.py").get_path(), + self.out_returnn_config_file.get_path(), + ] + ) + + @classmethod + def create_returnn_config( + cls, + model_checkpoint: Optional[Union[Checkpoint, PtCheckpoint]], + returnn_config: ReturnnConfig, + log_verbosity: int, + device: str, + **kwargs, + ): + assert device in ["gpu", "cpu"] + assert "task" not in returnn_config.config + assert "load" not in returnn_config.config + assert "model" not in returnn_config.config + + res = copy.deepcopy(returnn_config) + + config = {"load": model_checkpoint, "task": "forward", "forward_data": "train"} + + post_config = { + "device": device, + "log": ["./returnn.log"], + "log_verbosity": log_verbosity, + } + + config.update(returnn_config.config) + post_config.update(returnn_config.post_config) + + res.config = config + res.post_config = post_config + res.check_consistency() + + return res + + @classmethod + def hash(cls, kwargs): + d = { + "returnn_config": ReturnnForwardComputePriorJob.create_returnn_config(**kwargs), + "returnn_python_exe": kwargs["returnn_python_exe"], + "returnn_root": kwargs["returnn_root"], + } + + return super().hash(d) From 2042ccaad8c3759d433e0fafa0673157cd47ccf2 Mon Sep 17 00:00:00 2001 From: Mohammad Zeineldeen Date: Wed, 15 May 2024 15:36:06 +0000 Subject: [PATCH 007/227] use ff regs for mhsa out --- users/zeineldeen/models/asr/encoder/conformer_encoder_v2.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/users/zeineldeen/models/asr/encoder/conformer_encoder_v2.py b/users/zeineldeen/models/asr/encoder/conformer_encoder_v2.py index 110b74553..bef0dad64 100644 --- a/users/zeineldeen/models/asr/encoder/conformer_encoder_v2.py +++ b/users/zeineldeen/models/asr/encoder/conformer_encoder_v2.py @@ -898,9 +898,9 @@ def _create_mhsa_module(self, prefix_name, source, layer_index): l2=self.l2, forward_weights_init=self.mhsa_out_init, with_bias=False, - 
param_dropout=self.mhsa_weight_drop, + param_dropout=self.ff_weight_drop, param_dropout_min_ndim=2, - param_variational_noise=self.mhsa_weight_noise, + param_variational_noise=self.ff_weight_noise, ) drop = self.network.add_dropout_layer("{}_dropout".format(prefix_name), mhsa_linear, dropout=self.dropout) From 0ab509fe7dc0208b4f700e35abd5216df7f096c7 Mon Sep 17 00:00:00 2001 From: Mohammad Zeineldeen Date: Wed, 15 May 2024 15:40:13 +0000 Subject: [PATCH 008/227] add more regularized trafo dec --- .../asr/decoder/transformer_decoder_v2.py | 414 ++++++++++++++++++ 1 file changed, 414 insertions(+) create mode 100644 users/zeineldeen/models/asr/decoder/transformer_decoder_v2.py diff --git a/users/zeineldeen/models/asr/decoder/transformer_decoder_v2.py b/users/zeineldeen/models/asr/decoder/transformer_decoder_v2.py new file mode 100644 index 000000000..30d8624f2 --- /dev/null +++ b/users/zeineldeen/models/asr/decoder/transformer_decoder_v2.py @@ -0,0 +1,414 @@ +from i6_experiments.users.zeineldeen.modules.network import ReturnnNetwork + + +class TransformerDecoder: + """ + Represents standard Transformer decoder + + * Attention Is All You Need + * Ref: https://arxiv.org/abs/1706.03762 + """ + + def __init__( + self, + base_model, + target="bpe", + num_layers=6, + beam_size=12, + ff_init=None, + ff_dim=2048, + ff_act="relu", + att_num_heads=8, + dropout=0.1, + att_dropout=0.0, + softmax_dropout=0.0, + embed_dropout=0.1, + l2=0.0, + self_att_l2=0.0, + embed_pos_enc=False, + apply_embed_weight=False, + label_smoothing=0.1, + mhsa_init=None, + mhsa_out_init=None, + pos_enc=None, + rel_pos_clipping=16, + length_normalization=True, + replace_cross_att_w_masked_self_att=False, + create_ilm_decoder=False, + ilm_type=None, + ilm_args=None, + ff_weight_noise=None, + mhsa_weight_noise=None, + ff_weight_dropout=None, + mhsa_weight_dropout=None, + ): + self.base_model = base_model + self.enc_value_dim = base_model.enc_value_dim + self.enc_key_dim = base_model.enc_key_dim + 
self.enc_att_num_heads = base_model.att_num_heads + self.enc_key_per_head_dim = base_model.enc_key_per_head_dim + self.enc_val_per_head_dim = base_model.enc_val_per_head_dim + + self.att_num_heads = att_num_heads + + self.target = target + self.num_layers = num_layers + self.beam_size = beam_size + + self.ff_init = ff_init + self.ff_dim = ff_dim + self.ff_act = ff_act + + self.mhsa_init = mhsa_init + self.mhsa_init_out = mhsa_out_init + + self.pos_enc = pos_enc + self.rel_pos_clipping = rel_pos_clipping + + self.dropout = dropout + self.softmax_dropout = softmax_dropout + self.att_dropout = att_dropout + self.label_smoothing = label_smoothing + + self.l2 = l2 + self.self_att_l2 = self_att_l2 + + self.embed_dropout = embed_dropout + self.embed_pos_enc = embed_pos_enc + + self.embed_weight = None + + if apply_embed_weight: + self.embed_weight = self.enc_value_dim**0.5 + + self.decision_layer_name = None + + self.length_normalization = length_normalization + + self.replace_cross_att_w_masked_self_att = replace_cross_att_w_masked_self_att # used to train ILM + + self.ff_weight_drop = ff_weight_dropout + self.mhsa_weight_drop = mhsa_weight_dropout + self.ff_weight_noise = ff_weight_noise + self.mhsa_weight_noise = mhsa_weight_noise + + # used for recognition with ILM + self.create_ilm_decoder = create_ilm_decoder + self.ilm_type = ilm_type + self.ilm_args = ilm_args or {} + if self.create_ilm_decoder: + self.replace_cross_att_w_masked_self_att = False # keep original decoder as-is + + self.network = ReturnnNetwork() + self.subnet_unit = ReturnnNetwork() + self.output_prob = None + + def _create_masked_mhsa(self, subnet_unit: ReturnnNetwork, prefix, source, **kwargs): + prefix = "{}_self_att".format(prefix) + + # for tuning mini-self-att ILM + att_num_heads = kwargs.get("att_num_heads", self.att_num_heads) + enc_key_dim = kwargs.get("enc_key_dim", self.enc_key_dim) + enc_key_per_head_dim = enc_key_dim // att_num_heads + + ln = 
subnet_unit.add_layer_norm_layer("{}_ln".format(prefix), source) + + ln_rel_pos_enc = None + if self.pos_enc == "rel": + ln_rel_pos_enc = self.subnet_unit.add_relative_pos_encoding_layer( + "{}_ln_rel_pos_enc".format(prefix), + ln, + n_out=enc_key_per_head_dim, + forward_weights_init=self.ff_init, + clipping=self.rel_pos_clipping, + ) + + att = subnet_unit.add_self_att_layer( + "{}_att".format(prefix), + ln, + num_heads=att_num_heads, + total_key_dim=enc_key_dim, + n_out=self.enc_value_dim, + attention_left_only=True, + att_dropout=self.att_dropout, + forward_weights_init=self.mhsa_init, + l2=self.self_att_l2, + key_shift=ln_rel_pos_enc if ln_rel_pos_enc is not None else None, + param_dropout=self.mhsa_weight_drop, + param_dropout_min_ndim=2, + param_variational_noise=self.mhsa_weight_noise, + ) + + linear = subnet_unit.add_linear_layer( + "{}_linear".format(prefix), + att, + activation=None, + with_bias=False, + n_out=self.enc_value_dim, + forward_weights_init=self.mhsa_init_out, + l2=self.l2, + param_dropout=self.ff_weight_drop, + param_dropout_min_ndim=2, + param_variational_noise=self.ff_weight_noise, + ) + + drop = subnet_unit.add_dropout_layer("{}_drop".format(prefix), linear, dropout=self.dropout) + + out = subnet_unit.add_combine_layer( + "{}_out".format(prefix), [drop, source], kind="add", n_out=self.enc_value_dim + ) + + return out + + def _create_mhsa(self, subnet_unit: ReturnnNetwork, prefix, source): + ln = subnet_unit.add_layer_norm_layer("{}_att_ln".format(prefix), source) + + att_query0 = subnet_unit.add_linear_layer( + "{}_att_query0".format(prefix), + ln, + with_bias=False, + n_out=self.enc_value_dim, + forward_weights_init=self.mhsa_init, + l2=self.self_att_l2, + param_dropout=self.mhsa_weight_drop, + param_dropout_min_ndim=2, + param_variational_noise=self.mhsa_weight_noise, + ) + + # (B, H, D/H) + att_query = subnet_unit.add_split_dim_layer( + "{}_att_query".format(prefix), + att_query0, + axis="F", + dims=(self.enc_att_num_heads, 
self.enc_key_per_head_dim), + ) + + # --------------- Add to the encoder network --------------- # + att_key0 = self.base_model.network.add_linear_layer( + "{}_att_key0".format(prefix), + "encoder", + with_bias=False, + n_out=self.enc_key_dim, + forward_weights_init=self.mhsa_init, + l2=self.self_att_l2, + param_dropout=self.mhsa_weight_drop, + param_dropout_min_ndim=2, + param_variational_noise=self.mhsa_weight_noise, + ) + + # (B, enc-T, H, D/H) + att_key = self.base_model.network.add_split_dim_layer( + "{}_att_key".format(prefix), att_key0, axis="F", dims=(self.enc_att_num_heads, self.enc_key_per_head_dim) + ) + + att_value0 = self.base_model.network.add_linear_layer( + "{}_att_value0".format(prefix), + "encoder", + with_bias=False, + n_out=self.enc_value_dim, + forward_weights_init=self.mhsa_init, + l2=self.self_att_l2, + param_dropout=self.mhsa_weight_drop, + param_dropout_min_ndim=2, + param_variational_noise=self.mhsa_weight_noise, + ) + + # (B, enc-T, H, D'/H) + att_value = self.base_model.network.add_split_dim_layer( + "{}_att_value".format(prefix), + att_value0, + axis="F", + dims=(self.enc_att_num_heads, self.enc_val_per_head_dim), + ) + # ----------------------------------------------------------- # + + # (B, H, enc-T, 1) + att_energy = subnet_unit.add_dot_layer( + "{}_att_energy".format(prefix), source=["base:" + att_key, att_query], red1=-1, red2=-1, var1="T", var2="T?" 
+ ) + + att_weights = subnet_unit.add_softmax_over_spatial_layer( + "{}_att_weights".format(prefix), att_energy, energy_factor=self.enc_key_per_head_dim**-0.5 + ) + + att_weights_drop = subnet_unit.add_dropout_layer( + "{}_att_weights_drop".format(prefix), att_weights, dropout=self.att_dropout, dropout_noise_shape={"*": None} + ) + + # (B, H, V) + att0 = subnet_unit.add_generic_att_layer( + "{}_att0".format(prefix), weights=att_weights_drop, base="base:" + att_value + ) + + att = subnet_unit.add_merge_dims_layer("{}_att".format(prefix), att0, axes="static") # (B, H*V) except_batch + + # output projection + att_linear = subnet_unit.add_linear_layer( + "{}_att_linear".format(prefix), + att, + with_bias=False, + n_out=self.enc_value_dim, + forward_weights_init=self.mhsa_init_out, + l2=self.l2, + param_dropout=self.ff_weight_drop, + param_dropout_min_ndim=2, + param_variational_noise=self.ff_weight_noise, + ) + + att_drop = subnet_unit.add_dropout_layer("{}_att_drop".format(prefix), att_linear, dropout=self.dropout) + + out = subnet_unit.add_combine_layer( + "{}_att_out".format(prefix), [att_drop, source], kind="add", n_out=self.enc_value_dim + ) + return out + + def _create_ff_module(self, subnet_unit: ReturnnNetwork, prefix, source): + ff_ln = subnet_unit.add_layer_norm_layer("{}_ff_ln".format(prefix), source) + + ff1 = subnet_unit.add_linear_layer( + "{}_ff_conv1".format(prefix), + ff_ln, + activation=self.ff_act, + forward_weights_init=self.ff_init, + n_out=self.ff_dim, + with_bias=True, + l2=self.l2, + param_dropout=self.ff_weight_drop, + param_dropout_min_ndim=2, + param_variational_noise=self.ff_weight_noise, + ) + + ff2 = subnet_unit.add_linear_layer( + "{}_ff_conv2".format(prefix), + ff1, + activation=None, + forward_weights_init=self.ff_init, + n_out=self.enc_value_dim, + dropout=self.dropout, + with_bias=True, + l2=self.l2, + param_dropout=self.ff_weight_drop, + param_dropout_min_ndim=2, + param_variational_noise=self.ff_weight_noise, + ) + + drop = 
subnet_unit.add_dropout_layer("{}_ff_drop".format(prefix), ff2, dropout=self.dropout) + + out = subnet_unit.add_combine_layer( + "{}_ff_out".format(prefix), [drop, source], kind="add", n_out=self.enc_value_dim + ) + return out + + def _create_decoder_block(self, subnet_unit: ReturnnNetwork, source, i): + prefix = "transformer_decoder_%02i" % i + masked_mhsa = self._create_masked_mhsa(subnet_unit, prefix, source) + if self.replace_cross_att_w_masked_self_att: + mhsa = self._create_masked_mhsa(subnet_unit, "ilm_" + prefix, masked_mhsa, **self.ilm_args) + else: + mhsa = self._create_mhsa(subnet_unit, prefix, masked_mhsa) + ff = self._create_ff_module(subnet_unit, prefix, mhsa) + out = subnet_unit.add_copy_layer(prefix, ff) + return out + + def _create_ilm_decoder_block(self, subnet_unit: ReturnnNetwork, source, i): + prefix = "transformer_decoder_%02i" % i + masked_mhsa = self._create_masked_mhsa(subnet_unit, "prior_" + prefix, source) + if self.ilm_type == "mini_lstm": + mhsa = self._create_masked_mhsa(subnet_unit, "mini_ilm_" + prefix, masked_mhsa, **self.ilm_args) + else: + assert self.ilm_type == "zero" + mhsa = subnet_unit.add_eval_layer("zero_att_%02i" % i, masked_mhsa, eval="tf.zeros_like(source(0))") + ff = self._create_ff_module(subnet_unit, "prior_" + prefix, mhsa) + out = subnet_unit.add_copy_layer("prior_" + prefix, ff) + return out + + def _create_decoder(self, subnet_unit: ReturnnNetwork): + self.output_prob = subnet_unit.add_softmax_layer( + "output_prob", + "decoder", + loss="ce", + loss_opts={"label_smoothing": self.label_smoothing}, + target=self.target, + dropout=self.softmax_dropout, + forward_weights_init=self.ff_init, + l2=self.l2, + param_dropout=self.ff_weight_drop, + param_dropout_min_ndim=2, + param_variational_noise=self.ff_weight_noise, + ) + + if self.length_normalization: + output = subnet_unit.add_choice_layer( + "output", self.output_prob, target=self.target, beam_size=self.beam_size, initial_output=0 + ) + else: + output = 
subnet_unit.add_choice_layer( + "output", + self.output_prob, + target=self.target, + beam_size=self.beam_size, + initial_output=0, + length_normalization=self.length_normalization, + ) + + subnet_unit.add_compare_layer("end", output, value=0) + + target_embed_raw = subnet_unit.add_linear_layer( + "target_embed_raw", + "prev:" + output, + with_bias=False, + n_out=self.enc_value_dim, + forward_weights_init=self.ff_init, + l2=self.l2, + param_dropout=self.ff_weight_drop, + param_dropout_min_ndim=2, + param_variational_noise=self.ff_weight_noise, + ) + + if self.embed_weight: + # note that this probably only makes sense when using embed_pos_enc + target_embed_raw = subnet_unit.add_eval_layer( + "target_embed_weighted", target_embed_raw, eval="source(0) * %f" % self.embed_weight + ) + + if self.embed_pos_enc: + target_embed_raw = subnet_unit.add_pos_encoding_layer("target_embed_pos_enc", target_embed_raw) + + target_embed = subnet_unit.add_dropout_layer( + "target_embed", target_embed_raw, dropout=self.embed_dropout, dropout_noise_shape={"*": None} + ) + + x = target_embed + for i in range(1, self.num_layers + 1): + x = self._create_decoder_block(subnet_unit, x, i) + subnet_unit.add_layer_norm_layer("decoder", x) + + if self.create_ilm_decoder: + x = target_embed + for i in range(1, self.num_layers + 1): + x = self._create_ilm_decoder_block(subnet_unit, x, i) + subnet_unit.add_layer_norm_layer("prior_decoder", x) + + subnet_unit.add_softmax_layer( + "prior_output_prob", + "prior_decoder", + loss="ce", + loss_opts={"label_smoothing": self.label_smoothing}, + target=self.target, + dropout=self.softmax_dropout, + forward_weights_init=self.ff_init, + l2=self.l2, + ) + + dec_output = self.network.add_subnet_rec_layer("output", unit=subnet_unit.get_net(), target=self.target) + + return dec_output + + def create_network(self): + dec_output = self._create_decoder(self.subnet_unit) + + # recurrent subnetwork + decision_layer_name = 
self.base_model.network.add_decide_layer("decision", dec_output, target=self.target) + self.decision_layer_name = decision_layer_name + + return dec_output From f93337b3a88e943d22ede1e95120da24ad3383ed Mon Sep 17 00:00:00 2001 From: Mohammad Zeineldeen Date: Wed, 15 May 2024 15:41:45 +0000 Subject: [PATCH 009/227] update --- .../models/asr/decoder/transformer_decoder.py | 50 ++- .../asr/decoder/transformer_decoder_v2.py | 414 ------------------ 2 files changed, 46 insertions(+), 418 deletions(-) delete mode 100644 users/zeineldeen/models/asr/decoder/transformer_decoder_v2.py diff --git a/users/zeineldeen/models/asr/decoder/transformer_decoder.py b/users/zeineldeen/models/asr/decoder/transformer_decoder.py index da4dc7850..30d8624f2 100644 --- a/users/zeineldeen/models/asr/decoder/transformer_decoder.py +++ b/users/zeineldeen/models/asr/decoder/transformer_decoder.py @@ -24,6 +24,7 @@ def __init__( softmax_dropout=0.0, embed_dropout=0.1, l2=0.0, + self_att_l2=0.0, embed_pos_enc=False, apply_embed_weight=False, label_smoothing=0.1, @@ -36,6 +37,10 @@ def __init__( create_ilm_decoder=False, ilm_type=None, ilm_args=None, + ff_weight_noise=None, + mhsa_weight_noise=None, + ff_weight_dropout=None, + mhsa_weight_dropout=None, ): self.base_model = base_model self.enc_value_dim = base_model.enc_value_dim @@ -66,6 +71,7 @@ def __init__( self.label_smoothing = label_smoothing self.l2 = l2 + self.self_att_l2 = self_att_l2 self.embed_dropout = embed_dropout self.embed_pos_enc = embed_pos_enc @@ -81,6 +87,11 @@ def __init__( self.replace_cross_att_w_masked_self_att = replace_cross_att_w_masked_self_att # used to train ILM + self.ff_weight_drop = ff_weight_dropout + self.mhsa_weight_drop = mhsa_weight_dropout + self.ff_weight_noise = ff_weight_noise + self.mhsa_weight_noise = mhsa_weight_noise + # used for recognition with ILM self.create_ilm_decoder = create_ilm_decoder self.ilm_type = ilm_type @@ -121,8 +132,11 @@ def _create_masked_mhsa(self, subnet_unit: ReturnnNetwork, 
prefix, source, **kwa attention_left_only=True, att_dropout=self.att_dropout, forward_weights_init=self.mhsa_init, - l2=self.l2, + l2=self.self_att_l2, key_shift=ln_rel_pos_enc if ln_rel_pos_enc is not None else None, + param_dropout=self.mhsa_weight_drop, + param_dropout_min_ndim=2, + param_variational_noise=self.mhsa_weight_noise, ) linear = subnet_unit.add_linear_layer( @@ -133,6 +147,9 @@ def _create_masked_mhsa(self, subnet_unit: ReturnnNetwork, prefix, source, **kwa n_out=self.enc_value_dim, forward_weights_init=self.mhsa_init_out, l2=self.l2, + param_dropout=self.ff_weight_drop, + param_dropout_min_ndim=2, + param_variational_noise=self.ff_weight_noise, ) drop = subnet_unit.add_dropout_layer("{}_drop".format(prefix), linear, dropout=self.dropout) @@ -152,7 +169,10 @@ def _create_mhsa(self, subnet_unit: ReturnnNetwork, prefix, source): with_bias=False, n_out=self.enc_value_dim, forward_weights_init=self.mhsa_init, - l2=self.l2, + l2=self.self_att_l2, + param_dropout=self.mhsa_weight_drop, + param_dropout_min_ndim=2, + param_variational_noise=self.mhsa_weight_noise, ) # (B, H, D/H) @@ -170,7 +190,10 @@ def _create_mhsa(self, subnet_unit: ReturnnNetwork, prefix, source): with_bias=False, n_out=self.enc_key_dim, forward_weights_init=self.mhsa_init, - l2=self.l2, + l2=self.self_att_l2, + param_dropout=self.mhsa_weight_drop, + param_dropout_min_ndim=2, + param_variational_noise=self.mhsa_weight_noise, ) # (B, enc-T, H, D/H) @@ -184,7 +207,10 @@ def _create_mhsa(self, subnet_unit: ReturnnNetwork, prefix, source): with_bias=False, n_out=self.enc_value_dim, forward_weights_init=self.mhsa_init, - l2=self.l2, + l2=self.self_att_l2, + param_dropout=self.mhsa_weight_drop, + param_dropout_min_ndim=2, + param_variational_noise=self.mhsa_weight_noise, ) # (B, enc-T, H, D'/H) @@ -224,6 +250,9 @@ def _create_mhsa(self, subnet_unit: ReturnnNetwork, prefix, source): n_out=self.enc_value_dim, forward_weights_init=self.mhsa_init_out, l2=self.l2, + 
param_dropout=self.ff_weight_drop, + param_dropout_min_ndim=2, + param_variational_noise=self.ff_weight_noise, ) att_drop = subnet_unit.add_dropout_layer("{}_att_drop".format(prefix), att_linear, dropout=self.dropout) @@ -244,6 +273,9 @@ def _create_ff_module(self, subnet_unit: ReturnnNetwork, prefix, source): n_out=self.ff_dim, with_bias=True, l2=self.l2, + param_dropout=self.ff_weight_drop, + param_dropout_min_ndim=2, + param_variational_noise=self.ff_weight_noise, ) ff2 = subnet_unit.add_linear_layer( @@ -255,6 +287,9 @@ def _create_ff_module(self, subnet_unit: ReturnnNetwork, prefix, source): dropout=self.dropout, with_bias=True, l2=self.l2, + param_dropout=self.ff_weight_drop, + param_dropout_min_ndim=2, + param_variational_noise=self.ff_weight_noise, ) drop = subnet_unit.add_dropout_layer("{}_ff_drop".format(prefix), ff2, dropout=self.dropout) @@ -297,6 +332,9 @@ def _create_decoder(self, subnet_unit: ReturnnNetwork): dropout=self.softmax_dropout, forward_weights_init=self.ff_init, l2=self.l2, + param_dropout=self.ff_weight_drop, + param_dropout_min_ndim=2, + param_variational_noise=self.ff_weight_noise, ) if self.length_normalization: @@ -322,9 +360,13 @@ def _create_decoder(self, subnet_unit: ReturnnNetwork): n_out=self.enc_value_dim, forward_weights_init=self.ff_init, l2=self.l2, + param_dropout=self.ff_weight_drop, + param_dropout_min_ndim=2, + param_variational_noise=self.ff_weight_noise, ) if self.embed_weight: + # note that this probably only makes sense when using embed_pos_enc target_embed_raw = subnet_unit.add_eval_layer( "target_embed_weighted", target_embed_raw, eval="source(0) * %f" % self.embed_weight ) diff --git a/users/zeineldeen/models/asr/decoder/transformer_decoder_v2.py b/users/zeineldeen/models/asr/decoder/transformer_decoder_v2.py deleted file mode 100644 index 30d8624f2..000000000 --- a/users/zeineldeen/models/asr/decoder/transformer_decoder_v2.py +++ /dev/null @@ -1,414 +0,0 @@ -from i6_experiments.users.zeineldeen.modules.network 
import ReturnnNetwork - - -class TransformerDecoder: - """ - Represents standard Transformer decoder - - * Attention Is All You Need - * Ref: https://arxiv.org/abs/1706.03762 - """ - - def __init__( - self, - base_model, - target="bpe", - num_layers=6, - beam_size=12, - ff_init=None, - ff_dim=2048, - ff_act="relu", - att_num_heads=8, - dropout=0.1, - att_dropout=0.0, - softmax_dropout=0.0, - embed_dropout=0.1, - l2=0.0, - self_att_l2=0.0, - embed_pos_enc=False, - apply_embed_weight=False, - label_smoothing=0.1, - mhsa_init=None, - mhsa_out_init=None, - pos_enc=None, - rel_pos_clipping=16, - length_normalization=True, - replace_cross_att_w_masked_self_att=False, - create_ilm_decoder=False, - ilm_type=None, - ilm_args=None, - ff_weight_noise=None, - mhsa_weight_noise=None, - ff_weight_dropout=None, - mhsa_weight_dropout=None, - ): - self.base_model = base_model - self.enc_value_dim = base_model.enc_value_dim - self.enc_key_dim = base_model.enc_key_dim - self.enc_att_num_heads = base_model.att_num_heads - self.enc_key_per_head_dim = base_model.enc_key_per_head_dim - self.enc_val_per_head_dim = base_model.enc_val_per_head_dim - - self.att_num_heads = att_num_heads - - self.target = target - self.num_layers = num_layers - self.beam_size = beam_size - - self.ff_init = ff_init - self.ff_dim = ff_dim - self.ff_act = ff_act - - self.mhsa_init = mhsa_init - self.mhsa_init_out = mhsa_out_init - - self.pos_enc = pos_enc - self.rel_pos_clipping = rel_pos_clipping - - self.dropout = dropout - self.softmax_dropout = softmax_dropout - self.att_dropout = att_dropout - self.label_smoothing = label_smoothing - - self.l2 = l2 - self.self_att_l2 = self_att_l2 - - self.embed_dropout = embed_dropout - self.embed_pos_enc = embed_pos_enc - - self.embed_weight = None - - if apply_embed_weight: - self.embed_weight = self.enc_value_dim**0.5 - - self.decision_layer_name = None - - self.length_normalization = length_normalization - - self.replace_cross_att_w_masked_self_att = 
replace_cross_att_w_masked_self_att # used to train ILM - - self.ff_weight_drop = ff_weight_dropout - self.mhsa_weight_drop = mhsa_weight_dropout - self.ff_weight_noise = ff_weight_noise - self.mhsa_weight_noise = mhsa_weight_noise - - # used for recognition with ILM - self.create_ilm_decoder = create_ilm_decoder - self.ilm_type = ilm_type - self.ilm_args = ilm_args or {} - if self.create_ilm_decoder: - self.replace_cross_att_w_masked_self_att = False # keep original decoder as-is - - self.network = ReturnnNetwork() - self.subnet_unit = ReturnnNetwork() - self.output_prob = None - - def _create_masked_mhsa(self, subnet_unit: ReturnnNetwork, prefix, source, **kwargs): - prefix = "{}_self_att".format(prefix) - - # for tuning mini-self-att ILM - att_num_heads = kwargs.get("att_num_heads", self.att_num_heads) - enc_key_dim = kwargs.get("enc_key_dim", self.enc_key_dim) - enc_key_per_head_dim = enc_key_dim // att_num_heads - - ln = subnet_unit.add_layer_norm_layer("{}_ln".format(prefix), source) - - ln_rel_pos_enc = None - if self.pos_enc == "rel": - ln_rel_pos_enc = self.subnet_unit.add_relative_pos_encoding_layer( - "{}_ln_rel_pos_enc".format(prefix), - ln, - n_out=enc_key_per_head_dim, - forward_weights_init=self.ff_init, - clipping=self.rel_pos_clipping, - ) - - att = subnet_unit.add_self_att_layer( - "{}_att".format(prefix), - ln, - num_heads=att_num_heads, - total_key_dim=enc_key_dim, - n_out=self.enc_value_dim, - attention_left_only=True, - att_dropout=self.att_dropout, - forward_weights_init=self.mhsa_init, - l2=self.self_att_l2, - key_shift=ln_rel_pos_enc if ln_rel_pos_enc is not None else None, - param_dropout=self.mhsa_weight_drop, - param_dropout_min_ndim=2, - param_variational_noise=self.mhsa_weight_noise, - ) - - linear = subnet_unit.add_linear_layer( - "{}_linear".format(prefix), - att, - activation=None, - with_bias=False, - n_out=self.enc_value_dim, - forward_weights_init=self.mhsa_init_out, - l2=self.l2, - param_dropout=self.ff_weight_drop, - 
param_dropout_min_ndim=2, - param_variational_noise=self.ff_weight_noise, - ) - - drop = subnet_unit.add_dropout_layer("{}_drop".format(prefix), linear, dropout=self.dropout) - - out = subnet_unit.add_combine_layer( - "{}_out".format(prefix), [drop, source], kind="add", n_out=self.enc_value_dim - ) - - return out - - def _create_mhsa(self, subnet_unit: ReturnnNetwork, prefix, source): - ln = subnet_unit.add_layer_norm_layer("{}_att_ln".format(prefix), source) - - att_query0 = subnet_unit.add_linear_layer( - "{}_att_query0".format(prefix), - ln, - with_bias=False, - n_out=self.enc_value_dim, - forward_weights_init=self.mhsa_init, - l2=self.self_att_l2, - param_dropout=self.mhsa_weight_drop, - param_dropout_min_ndim=2, - param_variational_noise=self.mhsa_weight_noise, - ) - - # (B, H, D/H) - att_query = subnet_unit.add_split_dim_layer( - "{}_att_query".format(prefix), - att_query0, - axis="F", - dims=(self.enc_att_num_heads, self.enc_key_per_head_dim), - ) - - # --------------- Add to the encoder network --------------- # - att_key0 = self.base_model.network.add_linear_layer( - "{}_att_key0".format(prefix), - "encoder", - with_bias=False, - n_out=self.enc_key_dim, - forward_weights_init=self.mhsa_init, - l2=self.self_att_l2, - param_dropout=self.mhsa_weight_drop, - param_dropout_min_ndim=2, - param_variational_noise=self.mhsa_weight_noise, - ) - - # (B, enc-T, H, D/H) - att_key = self.base_model.network.add_split_dim_layer( - "{}_att_key".format(prefix), att_key0, axis="F", dims=(self.enc_att_num_heads, self.enc_key_per_head_dim) - ) - - att_value0 = self.base_model.network.add_linear_layer( - "{}_att_value0".format(prefix), - "encoder", - with_bias=False, - n_out=self.enc_value_dim, - forward_weights_init=self.mhsa_init, - l2=self.self_att_l2, - param_dropout=self.mhsa_weight_drop, - param_dropout_min_ndim=2, - param_variational_noise=self.mhsa_weight_noise, - ) - - # (B, enc-T, H, D'/H) - att_value = self.base_model.network.add_split_dim_layer( - 
"{}_att_value".format(prefix), - att_value0, - axis="F", - dims=(self.enc_att_num_heads, self.enc_val_per_head_dim), - ) - # ----------------------------------------------------------- # - - # (B, H, enc-T, 1) - att_energy = subnet_unit.add_dot_layer( - "{}_att_energy".format(prefix), source=["base:" + att_key, att_query], red1=-1, red2=-1, var1="T", var2="T?" - ) - - att_weights = subnet_unit.add_softmax_over_spatial_layer( - "{}_att_weights".format(prefix), att_energy, energy_factor=self.enc_key_per_head_dim**-0.5 - ) - - att_weights_drop = subnet_unit.add_dropout_layer( - "{}_att_weights_drop".format(prefix), att_weights, dropout=self.att_dropout, dropout_noise_shape={"*": None} - ) - - # (B, H, V) - att0 = subnet_unit.add_generic_att_layer( - "{}_att0".format(prefix), weights=att_weights_drop, base="base:" + att_value - ) - - att = subnet_unit.add_merge_dims_layer("{}_att".format(prefix), att0, axes="static") # (B, H*V) except_batch - - # output projection - att_linear = subnet_unit.add_linear_layer( - "{}_att_linear".format(prefix), - att, - with_bias=False, - n_out=self.enc_value_dim, - forward_weights_init=self.mhsa_init_out, - l2=self.l2, - param_dropout=self.ff_weight_drop, - param_dropout_min_ndim=2, - param_variational_noise=self.ff_weight_noise, - ) - - att_drop = subnet_unit.add_dropout_layer("{}_att_drop".format(prefix), att_linear, dropout=self.dropout) - - out = subnet_unit.add_combine_layer( - "{}_att_out".format(prefix), [att_drop, source], kind="add", n_out=self.enc_value_dim - ) - return out - - def _create_ff_module(self, subnet_unit: ReturnnNetwork, prefix, source): - ff_ln = subnet_unit.add_layer_norm_layer("{}_ff_ln".format(prefix), source) - - ff1 = subnet_unit.add_linear_layer( - "{}_ff_conv1".format(prefix), - ff_ln, - activation=self.ff_act, - forward_weights_init=self.ff_init, - n_out=self.ff_dim, - with_bias=True, - l2=self.l2, - param_dropout=self.ff_weight_drop, - param_dropout_min_ndim=2, - 
param_variational_noise=self.ff_weight_noise, - ) - - ff2 = subnet_unit.add_linear_layer( - "{}_ff_conv2".format(prefix), - ff1, - activation=None, - forward_weights_init=self.ff_init, - n_out=self.enc_value_dim, - dropout=self.dropout, - with_bias=True, - l2=self.l2, - param_dropout=self.ff_weight_drop, - param_dropout_min_ndim=2, - param_variational_noise=self.ff_weight_noise, - ) - - drop = subnet_unit.add_dropout_layer("{}_ff_drop".format(prefix), ff2, dropout=self.dropout) - - out = subnet_unit.add_combine_layer( - "{}_ff_out".format(prefix), [drop, source], kind="add", n_out=self.enc_value_dim - ) - return out - - def _create_decoder_block(self, subnet_unit: ReturnnNetwork, source, i): - prefix = "transformer_decoder_%02i" % i - masked_mhsa = self._create_masked_mhsa(subnet_unit, prefix, source) - if self.replace_cross_att_w_masked_self_att: - mhsa = self._create_masked_mhsa(subnet_unit, "ilm_" + prefix, masked_mhsa, **self.ilm_args) - else: - mhsa = self._create_mhsa(subnet_unit, prefix, masked_mhsa) - ff = self._create_ff_module(subnet_unit, prefix, mhsa) - out = subnet_unit.add_copy_layer(prefix, ff) - return out - - def _create_ilm_decoder_block(self, subnet_unit: ReturnnNetwork, source, i): - prefix = "transformer_decoder_%02i" % i - masked_mhsa = self._create_masked_mhsa(subnet_unit, "prior_" + prefix, source) - if self.ilm_type == "mini_lstm": - mhsa = self._create_masked_mhsa(subnet_unit, "mini_ilm_" + prefix, masked_mhsa, **self.ilm_args) - else: - assert self.ilm_type == "zero" - mhsa = subnet_unit.add_eval_layer("zero_att_%02i" % i, masked_mhsa, eval="tf.zeros_like(source(0))") - ff = self._create_ff_module(subnet_unit, "prior_" + prefix, mhsa) - out = subnet_unit.add_copy_layer("prior_" + prefix, ff) - return out - - def _create_decoder(self, subnet_unit: ReturnnNetwork): - self.output_prob = subnet_unit.add_softmax_layer( - "output_prob", - "decoder", - loss="ce", - loss_opts={"label_smoothing": self.label_smoothing}, - target=self.target, - 
dropout=self.softmax_dropout, - forward_weights_init=self.ff_init, - l2=self.l2, - param_dropout=self.ff_weight_drop, - param_dropout_min_ndim=2, - param_variational_noise=self.ff_weight_noise, - ) - - if self.length_normalization: - output = subnet_unit.add_choice_layer( - "output", self.output_prob, target=self.target, beam_size=self.beam_size, initial_output=0 - ) - else: - output = subnet_unit.add_choice_layer( - "output", - self.output_prob, - target=self.target, - beam_size=self.beam_size, - initial_output=0, - length_normalization=self.length_normalization, - ) - - subnet_unit.add_compare_layer("end", output, value=0) - - target_embed_raw = subnet_unit.add_linear_layer( - "target_embed_raw", - "prev:" + output, - with_bias=False, - n_out=self.enc_value_dim, - forward_weights_init=self.ff_init, - l2=self.l2, - param_dropout=self.ff_weight_drop, - param_dropout_min_ndim=2, - param_variational_noise=self.ff_weight_noise, - ) - - if self.embed_weight: - # note that this probably only makes sense when using embed_pos_enc - target_embed_raw = subnet_unit.add_eval_layer( - "target_embed_weighted", target_embed_raw, eval="source(0) * %f" % self.embed_weight - ) - - if self.embed_pos_enc: - target_embed_raw = subnet_unit.add_pos_encoding_layer("target_embed_pos_enc", target_embed_raw) - - target_embed = subnet_unit.add_dropout_layer( - "target_embed", target_embed_raw, dropout=self.embed_dropout, dropout_noise_shape={"*": None} - ) - - x = target_embed - for i in range(1, self.num_layers + 1): - x = self._create_decoder_block(subnet_unit, x, i) - subnet_unit.add_layer_norm_layer("decoder", x) - - if self.create_ilm_decoder: - x = target_embed - for i in range(1, self.num_layers + 1): - x = self._create_ilm_decoder_block(subnet_unit, x, i) - subnet_unit.add_layer_norm_layer("prior_decoder", x) - - subnet_unit.add_softmax_layer( - "prior_output_prob", - "prior_decoder", - loss="ce", - loss_opts={"label_smoothing": self.label_smoothing}, - target=self.target, - 
dropout=self.softmax_dropout, - forward_weights_init=self.ff_init, - l2=self.l2, - ) - - dec_output = self.network.add_subnet_rec_layer("output", unit=subnet_unit.get_net(), target=self.target) - - return dec_output - - def create_network(self): - dec_output = self._create_decoder(self.subnet_unit) - - # recurrent subnetwork - decision_layer_name = self.base_model.network.add_decide_layer("decision", dec_output, target=self.target) - self.decision_layer_name = decision_layer_name - - return dec_output From 884f762f393a28a26f02dad54a118568585e6a6c Mon Sep 17 00:00:00 2001 From: Mohammad Zeineldeen Date: Wed, 15 May 2024 15:46:59 +0000 Subject: [PATCH 010/227] add regs to rnn decoder --- .../models/asr/decoder/rnn_decoder.py | 39 +++++++++++++++++-- 1 file changed, 36 insertions(+), 3 deletions(-) diff --git a/users/zeineldeen/models/asr/decoder/rnn_decoder.py b/users/zeineldeen/models/asr/decoder/rnn_decoder.py index ae88168a0..d7f82fdb4 100644 --- a/users/zeineldeen/models/asr/decoder/rnn_decoder.py +++ b/users/zeineldeen/models/asr/decoder/rnn_decoder.py @@ -48,6 +48,8 @@ def __init__( use_monotonic_att_weights_loss_in_recog=False, att_weights_variance_loss_scale=None, include_eos_in_search_output=False, + ff_weight_dropout=None, + ff_weight_noise=None, ): """ :param base_model: base/encoder model instance @@ -137,6 +139,9 @@ def __init__( self.use_zoneout_output = use_zoneout_output + self.ff_weight_drop = ff_weight_dropout + self.ff_weight_noise = ff_weight_noise + self.monotonic_att_weights_loss_opts = monotonic_att_weights_loss_opts self.use_monotonic_att_weights_loss_in_recog = use_monotonic_att_weights_loss_in_recog @@ -161,6 +166,9 @@ def add_decoder_subnetwork(self, subnet_unit: ReturnnNetwork): with_bias=False, l2=self.l2, forward_weights_init=self.embed_weight_init, + param_dropout=self.ff_weight_drop, + param_dropout_min_ndim=2, + param_variational_noise=self.ff_weight_noise, ) subnet_unit.add_dropout_layer( @@ -194,7 +202,14 @@ def 
add_decoder_subnetwork(self, subnet_unit: ReturnnNetwork): ) if self.lstm_lm_dim != self.enc_value_dim: lstm_lm_component_proj = subnet_unit.add_linear_layer( - "lm_like_s_proj", lstm_lm_component, n_out=self.enc_value_dim, l2=self.l2, dropout=self.dropout + "lm_like_s_proj", + lstm_lm_component, + n_out=self.enc_value_dim, + l2=self.l2, + dropout=self.dropout, + param_dropout=self.ff_weight_drop, + param_dropout_min_ndim=2, + param_variational_noise=self.ff_weight_noise, ) else: lstm_lm_component_proj = lstm_lm_component @@ -235,7 +250,14 @@ def add_decoder_subnetwork(self, subnet_unit: ReturnnNetwork): if self.add_lstm_lm: # s_transformed (query) has 1024 dim s_proj = subnet_unit.add_linear_layer( - "s_proj", "s_transformed", n_out=self.enc_value_dim, l2=self.l2, dropout=self.dropout + "s_proj", + "s_transformed", + n_out=self.enc_value_dim, + l2=self.l2, + dropout=self.dropout, + param_dropout=self.ff_weight_drop, + param_dropout_min_ndim=2, + param_variational_noise=self.ff_weight_noise, ) # back-lstm (query) + context readout_in_src = subnet_unit.add_combine_layer( @@ -244,7 +266,15 @@ def add_decoder_subnetwork(self, subnet_unit: ReturnnNetwork): else: readout_in_src = ["s", "prev:target_embed", "att"] - subnet_unit.add_linear_layer("readout_in", readout_in_src, n_out=self.dec_output_num_units, l2=self.l2) + subnet_unit.add_linear_layer( + "readout_in", + readout_in_src, + n_out=self.dec_output_num_units, + l2=self.l2, + param_dropout=self.ff_weight_drop, + param_dropout_min_ndim=2, + param_variational_noise=self.ff_weight_noise, + ) if self.reduceout: subnet_unit.add_reduceout_layer("readout", "readout_in") @@ -263,6 +293,9 @@ def add_decoder_subnetwork(self, subnet_unit: ReturnnNetwork): loss_opts=ce_loss_opts, target=self.target, dropout=self.softmax_dropout, + param_dropout=self.ff_weight_drop, + param_dropout_min_ndim=2, + param_variational_noise=self.ff_weight_noise, ) self.output_prob_with_coverage = None From 3c8b576b2b62de87217e1d8fe598a5606470122a 
Mon Sep 17 00:00:00 2001 From: Mohammad Zeineldeen Date: Wed, 15 May 2024 15:50:12 +0000 Subject: [PATCH 011/227] more --- users/zeineldeen/models/asr/encoder/conformer_encoder_v2.py | 3 +++ users/zeineldeen/modules/network.py | 3 +++ 2 files changed, 6 insertions(+) diff --git a/users/zeineldeen/models/asr/encoder/conformer_encoder_v2.py b/users/zeineldeen/models/asr/encoder/conformer_encoder_v2.py index bef0dad64..89d0d2557 100644 --- a/users/zeineldeen/models/asr/encoder/conformer_encoder_v2.py +++ b/users/zeineldeen/models/asr/encoder/conformer_encoder_v2.py @@ -1254,6 +1254,9 @@ def _create_all_network_parts(self): loss="ctc", dropout=self.ctc_dropout, loss_opts=default_ctc_loss_opts, + param_dropout=self.ff_weight_drop, + param_dropout_min_ndim=2, + param_variational_noise=self.ff_weight_noise, ) if self.ctc_loss_scale or self.ctc_self_align_delay: self.network["ctc"]["loss_scale"] = (self.ctc_loss_scale or 1.0) * ( diff --git a/users/zeineldeen/modules/network.py b/users/zeineldeen/modules/network.py index f763da614..33d87e898 100644 --- a/users/zeineldeen/modules/network.py +++ b/users/zeineldeen/modules/network.py @@ -219,6 +219,7 @@ def add_softmax_layer( loss_scale=None, param_dropout=None, param_dropout_min_ndim=None, + param_variational_noise=None, **kwargs, ): d = {"class": "softmax", "from": source} @@ -238,6 +239,8 @@ def add_softmax_layer( d["param_dropout"] = param_dropout if param_dropout_min_ndim is not None: d["param_dropout_min_ndim"] = param_dropout_min_ndim + if param_variational_noise: + d["param_variational_noise"] = param_variational_noise if loss_scale: d["loss_scale"] = loss_scale d.update(kwargs) From 1c14be1cad3a09789117df7481e4bfd6a2046ceb Mon Sep 17 00:00:00 2001 From: Mohammad Zeineldeen Date: Wed, 15 May 2024 15:57:14 +0000 Subject: [PATCH 012/227] add more regs to rnn dec --- .../models/asr/decoder/rnn_decoder.py | 2 + users/zeineldeen/modules/attention.py | 66 ++++++++++++++++--- 2 files changed, 59 insertions(+), 9 deletions(-) 
diff --git a/users/zeineldeen/models/asr/decoder/rnn_decoder.py b/users/zeineldeen/models/asr/decoder/rnn_decoder.py index d7f82fdb4..b292d2ed8 100644 --- a/users/zeineldeen/models/asr/decoder/rnn_decoder.py +++ b/users/zeineldeen/models/asr/decoder/rnn_decoder.py @@ -183,6 +183,8 @@ def add_decoder_subnetwork(self, subnet_unit: ReturnnNetwork): l2=self.l2, loc_filter_size=self.loc_conv_att_filter_size, loc_num_channels=self.enc_key_dim, + weight_drop=self.ff_weight_drop, + weight_noise=self.ff_weight_noise, ) subnet_unit.update(att.create()) diff --git a/users/zeineldeen/modules/attention.py b/users/zeineldeen/modules/attention.py index e13ebd1c7..d0350a622 100644 --- a/users/zeineldeen/modules/attention.py +++ b/users/zeineldeen/modules/attention.py @@ -7,12 +7,14 @@ class ConvLocAwareness(AbsModule): Attention convolution location awareness """ - def __init__(self, enc_key_dim, filter_size, num_channels, l2): + def __init__(self, enc_key_dim, filter_size, num_channels, l2, weight_drop, weight_noise): super().__init__() self.enc_key_dim = enc_key_dim self.filter_size = filter_size self.num_channels = num_channels self.l2 = l2 + self.weight_drop = weight_drop + self.weight_noise = weight_noise def create(self): out_net = ReturnnNetwork() @@ -35,10 +37,20 @@ def create(self): padding="valid", n_out=self.num_channels, l2=self.l2, + param_dropout=self.weight_drop, + param_dropout_min_ndim=2, + param_variational_noise=self.weight_noise, ) self.name = out_net.add_linear_layer( - "weight_feedback", loc_att_conv, activation=None, with_bias=False, n_out=self.enc_key_dim + "weight_feedback", + loc_att_conv, + activation=None, + with_bias=False, + n_out=self.enc_key_dim, + param_dropout=self.weight_drop, + param_dropout_min_ndim=2, + param_variational_noise=self.weight_noise, ) return out_net.get_net() @@ -49,11 +61,14 @@ class AdditiveLocAwareness(AbsModule): Attention additive location awareness """ - def __init__(self, enc_key_dim, att_num_heads): + def __init__(self, 
enc_key_dim, att_num_heads, weight_drop, weight_noise): super().__init__() self.enc_key_dim = enc_key_dim self.att_num_heads = att_num_heads + self.weight_drop = weight_drop + self.weight_noise = weight_noise + def create(self): out_net = ReturnnNetwork() @@ -65,7 +80,13 @@ def create(self): ) self.name = out_net.add_linear_layer( - "weight_feedback", "prev:accum_att_weights", n_out=self.enc_key_dim, with_bias=False + "weight_feedback", + "prev:accum_att_weights", + n_out=self.enc_key_dim, + with_bias=False, + param_dropout=self.weight_drop, + param_dropout_min_ndim=2, + param_variational_noise=self.weight_noise, ) return out_net.get_net() @@ -76,7 +97,9 @@ class AttentionMechanism(AbsModule): Single-head or Multi-head attention mechanism """ - def __init__(self, enc_key_dim, att_num_heads, att_dropout, l2, loc_filter_size, loc_num_channels): + def __init__( + self, enc_key_dim, att_num_heads, att_dropout, l2, loc_filter_size, loc_num_channels, weight_drop, weight_noise + ): super().__init__() self.enc_key_dim = enc_key_dim self.att_num_heads = att_num_heads @@ -84,15 +107,26 @@ def __init__(self, enc_key_dim, att_num_heads, att_dropout, l2, loc_filter_size, self.att_dropout = att_dropout self.l2 = l2 + self.weight_drop = weight_drop + self.weight_noise = weight_noise + self.loc_filter_size = loc_filter_size self.loc_num_channels = loc_num_channels def create(self): out_net = ReturnnNetwork() + # project query out_net.add_linear_layer( - "s_transformed", "s", n_out=self.enc_key_dim, with_bias=False, l2=self.l2 - ) # project query + "s_transformed", + "s", + n_out=self.enc_key_dim, + with_bias=False, + l2=self.l2, + param_dropout=self.weight_drop, + param_dropout_min_ndim=2, + param_variational_noise=self.weight_noise, + ) if self.loc_filter_size is not None: assert self.loc_filter_size is not None @@ -101,10 +135,17 @@ def create(self): filter_size=self.loc_filter_size, num_channels=self.loc_num_channels, l2=self.l2, + weight_drop=self.weight_drop, + 
weight_noise=self.weight_noise, ) else: # additive - weight_feedback = AdditiveLocAwareness(enc_key_dim=self.enc_key_dim, att_num_heads=self.att_num_heads) + weight_feedback = AdditiveLocAwareness( + enc_key_dim=self.enc_key_dim, + att_num_heads=self.att_num_heads, + weight_drop=self.weight_drop, + weight_noise=self.weight_noise, + ) out_net.update(weight_feedback.create()) # add att weight feedback @@ -115,7 +156,14 @@ def create(self): # compute energies out_net.add_activation_layer("energy_tanh", "energy_in", activation="tanh") energy = out_net.add_linear_layer( - "energy", "energy_tanh", n_out=self.att_num_heads, with_bias=False, l2=self.l2 + "energy", + "energy_tanh", + n_out=self.att_num_heads, + with_bias=False, + l2=self.l2, + param_dropout=self.weight_drop, + param_dropout_min_ndim=2, + param_variational_noise=self.weight_noise, ) if self.att_dropout: From 84686ed232fbb6337b09695e357d0024b5850925 Mon Sep 17 00:00:00 2001 From: Mohammad Zeineldeen Date: Wed, 15 May 2024 15:58:50 +0000 Subject: [PATCH 013/227] black formatting --- .../asr/encoder/ebranchformer_encoder.py | 371 ++++++++++++------ 1 file changed, 255 insertions(+), 116 deletions(-) diff --git a/users/zeineldeen/models/asr/encoder/ebranchformer_encoder.py b/users/zeineldeen/models/asr/encoder/ebranchformer_encoder.py index b0859bc24..3f06572c9 100644 --- a/users/zeineldeen/models/asr/encoder/ebranchformer_encoder.py +++ b/users/zeineldeen/models/asr/encoder/ebranchformer_encoder.py @@ -7,14 +7,47 @@ class EBranchformerEncoder: * Ref: https://arxiv.org/pdf/2210.00077.pdf """ - def __init__(self, input='data', input_layer='conv-6', input_layer_conv_act='relu', num_blocks=16, - conv_kernel_size=32, specaug=True, pos_enc='rel', activation='swish', ff_dim=512, - ff_bias=True, ctc_loss_scale=None, dropout=0.1, att_dropout=0.1, enc_key_dim=256, att_num_heads=4, - target='bpe', l2=0.0, lstm_dropout=0.1, rec_weight_dropout=0., with_ctc=False, native_ctc=False, - ctc_dropout=0., ctc_l2=0., ctc_opts=None, 
subsample=None, start_conv_init=None, conv_module_init=None, - mhsa_init=None, mhsa_out_init=None, ff_init=None, rel_pos_clipping=16, dropout_in=0.1, - batch_norm_opts=None, self_att_l2=0.0, sandwich_conv=False, - add_to_prefix_name=None, output_layer_name='encoder', rezero=False): + def __init__( + self, + input="data", + input_layer="conv-6", + input_layer_conv_act="relu", + num_blocks=16, + conv_kernel_size=32, + specaug=True, + pos_enc="rel", + activation="swish", + ff_dim=512, + ff_bias=True, + ctc_loss_scale=None, + dropout=0.1, + att_dropout=0.1, + enc_key_dim=256, + att_num_heads=4, + target="bpe", + l2=0.0, + lstm_dropout=0.1, + rec_weight_dropout=0.0, + with_ctc=False, + native_ctc=False, + ctc_dropout=0.0, + ctc_l2=0.0, + ctc_opts=None, + subsample=None, + start_conv_init=None, + conv_module_init=None, + mhsa_init=None, + mhsa_out_init=None, + ff_init=None, + rel_pos_clipping=16, + dropout_in=0.1, + batch_norm_opts=None, + self_att_l2=0.0, + sandwich_conv=False, + add_to_prefix_name=None, + output_layer_name="encoder", + rezero=False, + ): """ :param str input: input layer name :param str input_layer: type of input layer which does subsampling @@ -85,15 +118,15 @@ def __init__(self, input='data', input_layer='conv-6', input_layer_conv_act='rel if batch_norm_opts is None: batch_norm_opts = {} - bn_momentum = batch_norm_opts.pop('momentum', 0.1) - bn_eps = batch_norm_opts.pop('epsilon', 1e-3) - bn_update_sample_only_in_train = batch_norm_opts.pop('update_sample_only_in_training', True) - bn_delay_sample_update = batch_norm_opts.pop('delay_sample_update', True) + bn_momentum = batch_norm_opts.pop("momentum", 0.1) + bn_eps = batch_norm_opts.pop("epsilon", 1e-3) + bn_update_sample_only_in_train = batch_norm_opts.pop("update_sample_only_in_training", True) + bn_delay_sample_update = batch_norm_opts.pop("delay_sample_update", True) self.batch_norm_opts = { - 'momentum': bn_momentum, - 'epsilon': bn_eps, - 'update_sample_only_in_training': 
bn_update_sample_only_in_train, - 'delay_sample_update': bn_delay_sample_update, + "momentum": bn_momentum, + "epsilon": bn_eps, + "update_sample_only_in_training": bn_update_sample_only_in_train, + "delay_sample_update": bn_delay_sample_update, } self.batch_norm_opts.update(**batch_norm_opts) @@ -118,7 +151,7 @@ def __init__(self, input='data', input_layer='conv-6', input_layer_conv_act='rel self.subsample = subsample self.subsample_list = [1] * num_blocks if subsample: - for idx, s in enumerate(map(int, subsample.split('_')[:num_blocks])): + for idx, s in enumerate(map(int, subsample.split("_")[:num_blocks])): self.subsample_list[idx] = s self.network = ReturnnNetwork() @@ -128,7 +161,6 @@ def __init__(self, input='data', input_layer='conv-6', input_layer_conv_act='rel self.rezero = rezero - def _create_ff_module(self, prefix_name, i, source, block_scale_var): """ Add Feed Forward Module: @@ -139,126 +171,192 @@ def _create_ff_module(self, prefix_name, i, source, block_scale_var): :return: last layer name of this module :rtype: str """ - prefix_name = prefix_name + '_ffmod_{}'.format(i) + prefix_name = prefix_name + "_ffmod_{}".format(i) - ln = self.network.add_layer_norm_layer('{}_ln'.format(prefix_name), source) + ln = self.network.add_layer_norm_layer("{}_ln".format(prefix_name), source) ff1 = self.network.add_linear_layer( - '{}_ff1'.format(prefix_name), ln, n_out=self.ff_dim, l2=self.l2, forward_weights_init=self.ff_init, - with_bias=self.ff_bias) + "{}_ff1".format(prefix_name), + ln, + n_out=self.ff_dim, + l2=self.l2, + forward_weights_init=self.ff_init, + with_bias=self.ff_bias, + ) - swish_act = self.network.add_activation_layer('{}_swish'.format(prefix_name), ff1, activation='swish') + swish_act = self.network.add_activation_layer("{}_swish".format(prefix_name), ff1, activation="swish") - drop1 = self.network.add_dropout_layer('{}_drop1'.format(prefix_name), swish_act, dropout=self.dropout) + drop1 = 
self.network.add_dropout_layer("{}_drop1".format(prefix_name), swish_act, dropout=self.dropout) ff2 = self.network.add_linear_layer( - '{}_ff2'.format(prefix_name), drop1, n_out=self.enc_key_dim, l2=self.l2, forward_weights_init=self.ff_init, - with_bias=self.ff_bias) + "{}_ff2".format(prefix_name), + drop1, + n_out=self.enc_key_dim, + l2=self.l2, + forward_weights_init=self.ff_init, + with_bias=self.ff_bias, + ) - drop2 = self.network.add_dropout_layer('{}_drop2'.format(prefix_name), ff2, dropout=self.dropout) + drop2 = self.network.add_dropout_layer("{}_drop2".format(prefix_name), ff2, dropout=self.dropout) if self.rezero: - drop2 = self.network.add_eval_layer('{}_scaled_dropout'.format(prefix_name), [block_scale_var, drop2], eval='source(0) * source(1)') + drop2 = self.network.add_eval_layer( + "{}_scaled_dropout".format(prefix_name), [block_scale_var, drop2], eval="source(0) * source(1)" + ) + + half_step_ff = self.network.add_eval_layer("{}_half_step".format(prefix_name), drop2, eval="0.5 * source(0)") - half_step_ff = self.network.add_eval_layer('{}_half_step'.format(prefix_name), drop2, eval='0.5 * source(0)') - ff_module_res = self.network.add_combine_layer( - '{}_res'.format(prefix_name), kind='add', source=[half_step_ff, source], n_out=self.enc_key_dim) + "{}_res".format(prefix_name), kind="add", source=[half_step_ff, source], n_out=self.enc_key_dim + ) return ff_module_res - def _create_global_extractor(self, prefix_name, source): - prefix_name = '{}_global_extractor'.format(prefix_name) + prefix_name = "{}_global_extractor".format(prefix_name) + + ln = self.network.add_layer_norm_layer("{}_ln".format(prefix_name), source) - ln = self.network.add_layer_norm_layer('{}_ln'.format(prefix_name), source) - ln_rel_pos_enc = self.network.add_relative_pos_encoding_layer( - '{}_ln_rel_pos_enc'.format(prefix_name), ln, n_out=self.enc_key_per_head_dim, forward_weights_init=self.ff_init) - + "{}_ln_rel_pos_enc".format(prefix_name), + ln, + 
n_out=self.enc_key_per_head_dim, + forward_weights_init=self.ff_init, + ) + mhsa = self.network.add_self_att_layer( - '{}'.format(prefix_name), ln, n_out=self.enc_value_dim, num_heads=self.att_num_heads, - total_key_dim=self.enc_key_dim, att_dropout=self.att_dropout, forward_weights_init=self.ff_init, - key_shift=ln_rel_pos_enc if ln_rel_pos_enc is not None else None) - + "{}".format(prefix_name), + ln, + n_out=self.enc_value_dim, + num_heads=self.att_num_heads, + total_key_dim=self.enc_key_dim, + att_dropout=self.att_dropout, + forward_weights_init=self.ff_init, + key_shift=ln_rel_pos_enc if ln_rel_pos_enc is not None else None, + ) + mhsa_linear = self.network.add_linear_layer( - '{}_linear'.format(prefix_name), mhsa, n_out=self.enc_key_dim, l2=self.l2, forward_weights_init=self.ff_init, - with_bias=False) - - dropout = self.network.add_dropout_layer('{}_dropout'.format(prefix_name), mhsa_linear, dropout=self.dropout) + "{}_linear".format(prefix_name), + mhsa, + n_out=self.enc_key_dim, + l2=self.l2, + forward_weights_init=self.ff_init, + with_bias=False, + ) + + dropout = self.network.add_dropout_layer("{}_dropout".format(prefix_name), mhsa_linear, dropout=self.dropout) return dropout - def _create_local_extractor(self, prefix_name, source): - prefix_name = '{}_local_extractor'.format(prefix_name) + prefix_name = "{}_local_extractor".format(prefix_name) - ln = self.network.add_layer_norm_layer('{}_ln'.format(prefix_name), source) + ln = self.network.add_layer_norm_layer("{}_ln".format(prefix_name), source) ff1 = self.network.add_linear_layer( - '{}_ff_1'.format(prefix_name), ln, n_out=6*self.enc_key_dim, l2=self.l2, forward_weights_init=self.ff_init, - with_bias=False) - - gelu_act = self.network.add_activation_layer('{}_gelu'.format(prefix_name), ff1, activation='gelu') + "{}_ff_1".format(prefix_name), + ln, + n_out=6 * self.enc_key_dim, + l2=self.l2, + forward_weights_init=self.ff_init, + with_bias=False, + ) + + gelu_act = 
self.network.add_activation_layer("{}_gelu".format(prefix_name), ff1, activation="gelu") - br_part_A = self.network.add_slice_layer('{}_branch_a'.format(prefix_name), gelu_act, 'F', slice_start=0, slice_end =self.enc_key_dim*3) + br_part_A = self.network.add_slice_layer( + "{}_branch_a".format(prefix_name), gelu_act, "F", slice_start=0, slice_end=self.enc_key_dim * 3 + ) - br_part_B = self.network.add_slice_layer('{}_branch_b'.format(prefix_name), gelu_act, 'F', slice_start =self.enc_key_dim*3) + br_part_B = self.network.add_slice_layer( + "{}_branch_b".format(prefix_name), gelu_act, "F", slice_start=self.enc_key_dim * 3 + ) - br_part_B_ln = self.network.add_layer_norm_layer('{}_branch_b_ln'.format(prefix_name), br_part_B) + br_part_B_ln = self.network.add_layer_norm_layer("{}_branch_b_ln".format(prefix_name), br_part_B) br_part_B_dpt_conv = self.network.add_conv_layer( - '{}_branch_b_dpt_conv'.format(prefix_name), br_part_B_ln, n_out=self.enc_key_dim*3, - filter_size=(self.conv_kernel_size,), groups=self.enc_key_dim*3, l2=self.l2) - - br_merge = self.network.add_eval_layer('{}_branch_merge'.format(prefix_name), [br_part_A,br_part_B_dpt_conv], 'source(0)*source(1)') + "{}_branch_b_dpt_conv".format(prefix_name), + br_part_B_ln, + n_out=self.enc_key_dim * 3, + filter_size=(self.conv_kernel_size,), + groups=self.enc_key_dim * 3, + l2=self.l2, + ) + + br_merge = self.network.add_eval_layer( + "{}_branch_merge".format(prefix_name), [br_part_A, br_part_B_dpt_conv], "source(0)*source(1)" + ) - dropout = self.network.add_dropout_layer('{}_dropout'.format(prefix_name), br_merge, dropout=self.dropout) + dropout = self.network.add_dropout_layer("{}_dropout".format(prefix_name), br_merge, dropout=self.dropout) br_merge_ff = self.network.add_linear_layer( - '{}_ff_2'.format(prefix_name), dropout, n_out=self.enc_key_dim, l2=self.l2, forward_weights_init=self.ff_init, - with_bias=False) - + "{}_ff_2".format(prefix_name), + dropout, + n_out=self.enc_key_dim, + l2=self.l2, + 
forward_weights_init=self.ff_init, + with_bias=False, + ) + return br_merge_ff - def _create_merge_mod(self, prefix_name, source, block_scale_var): - prefix_name = '{}_merge_mod'.format(prefix_name) + prefix_name = "{}_merge_mod".format(prefix_name) glb_ext = self._create_global_extractor(prefix_name, source) lcl_ext = self._create_local_extractor(prefix_name, source) - glb_lcl_merge = self.network.add_copy_layer('{}_global_local_merge'.format(prefix_name), [glb_ext, lcl_ext]) + glb_lcl_merge = self.network.add_copy_layer("{}_global_local_merge".format(prefix_name), [glb_ext, lcl_ext]) dpt_conv = self.network.add_conv_layer( - '{}_dpt_conv'.format(prefix_name), glb_lcl_merge, n_out= 2*self.enc_key_dim, - filter_size=(self.conv_kernel_size,), groups= 2*self.enc_key_dim, l2=self.l2) - + "{}_dpt_conv".format(prefix_name), + glb_lcl_merge, + n_out=2 * self.enc_key_dim, + filter_size=(self.conv_kernel_size,), + groups=2 * self.enc_key_dim, + l2=self.l2, + ) + dpt_conv_res = self.network.add_combine_layer( - '{}_dpt_conv_res'.format(prefix_name), kind='add', source=[glb_lcl_merge, dpt_conv], n_out=2*self.enc_key_dim) - + "{}_dpt_conv_res".format(prefix_name), + kind="add", + source=[glb_lcl_merge, dpt_conv], + n_out=2 * self.enc_key_dim, + ) + ff = self.network.add_linear_layer( - '{}_ff'.format(prefix_name), dpt_conv_res, n_out=self.enc_key_dim, l2=self.l2, forward_weights_init=self.ff_init, - with_bias=False) - - dropout = self.network.add_dropout_layer('{}_dropout'.format(prefix_name), ff, dropout=self.dropout) + "{}_ff".format(prefix_name), + dpt_conv_res, + n_out=self.enc_key_dim, + l2=self.l2, + forward_weights_init=self.ff_init, + with_bias=False, + ) + + dropout = self.network.add_dropout_layer("{}_dropout".format(prefix_name), ff, dropout=self.dropout) if self.rezero: - dropout = self.network.add_eval_layer('{}_scaled_dropout'.format(prefix_name), [block_scale_var, dropout], eval='source(0) * source(1)') + dropout = self.network.add_eval_layer( + 
"{}_scaled_dropout".format(prefix_name), [block_scale_var, dropout], eval="source(0) * source(1)" + ) merge_mod_res = self.network.add_combine_layer( - '{}_res'.format(prefix_name), kind='add', source=[source, dropout], n_out=self.enc_key_dim) - - return merge_mod_res + "{}_res".format(prefix_name), kind="add", source=[source, dropout], n_out=self.enc_key_dim + ) + return merge_mod_res def _create_e_branchformer_block(self, i, source): - prefix_name = 'ebranchformer_block_%02i' % i + prefix_name = "ebranchformer_block_%02i" % i if self.rezero: self.network["mod_%02i_var" % i] = { - "class": "variable", "init":1e-8, "trainable":True, "add_batch_axis":True, "shape":(1,) + "class": "variable", + "init": 1e-8, + "trainable": True, + "add_batch_axis": True, + "shape": (1,), } ff_module1 = self._create_ff_module(prefix_name, 1, source, "mod_%02i_var" % i) @@ -267,29 +365,30 @@ def _create_e_branchformer_block(self, i, source): ff_module2 = self._create_ff_module(prefix_name, 2, merge_module, "mod_%02i_var" % i) - block_ln = self.network.add_layer_norm_layer('{}_ln'.format(prefix_name), ff_module2) + block_ln = self.network.add_layer_norm_layer("{}_ln".format(prefix_name), ff_module2) block_ln = self.network.add_copy_layer(prefix_name, block_ln) - - return block_ln + return block_ln def _create_all_network_parts(self): """ - ConvSubsampling/LSTM -> Linear -> Dropout -> [Conformer Blocks] x N + ConvSubsampling/LSTM -> Linear -> Dropout -> [Conformer Blocks] x N """ data = self.input if self.specaug: data = self.network.add_eval_layer( - 'source', data, - eval="self.network.get_config().typed_value('transform')(source(0, as_data=True), network=self.network)") + "source", + data, + eval="self.network.get_config().typed_value('transform')(source(0, as_data=True), network=self.network)", + ) subsampled_input = None if self.input_layer is None: subsampled_input = data - elif 'lstm' in self.input_layer: - sample_factor = int(self.input_layer.split('-')[1]) + elif "lstm" in 
self.input_layer: + sample_factor = int(self.input_layer.split("-")[1]) pool_sizes = None if sample_factor == 2: pool_sizes = [2, 1] @@ -299,9 +398,16 @@ def _create_all_network_parts(self): pool_sizes = [3, 2] # add 2 LSTM layers with max pooling to subsample and encode positional information subsampled_input = self.network.add_lstm_layers( - data, num_layers=2, lstm_dim=self.enc_key_dim, dropout=self.lstm_dropout, bidirectional=True, - rec_weight_dropout=self.rec_weight_dropout, l2=self.l2, pool_sizes=pool_sizes) - elif self.input_layer == 'conv-4': + data, + num_layers=2, + lstm_dim=self.enc_key_dim, + dropout=self.lstm_dropout, + bidirectional=True, + rec_weight_dropout=self.rec_weight_dropout, + l2=self.l2, + pool_sizes=pool_sizes, + ) + elif self.input_layer == "conv-4": # conv-layer-1: 3x3x32 followed by max pool layer on feature axis (1, 2) # conv-layer-2: 3x3x64 with striding (2, 1) on time axis # conv-layer-3: 3x3x64 with striding (2, 1) on time axis @@ -309,31 +415,62 @@ def _create_all_network_parts(self): # TODO: make this more generic conv_input = self.network.add_conv_block( - 'conv_out', data, hwpc_sizes=[((3, 3), (1, 2), 32)], - l2=self.l2, activation=self.input_layer_conv_act, init=self.start_conv_init, merge_out=False) + "conv_out", + data, + hwpc_sizes=[((3, 3), (1, 2), 32)], + l2=self.l2, + activation=self.input_layer_conv_act, + init=self.start_conv_init, + merge_out=False, + ) subsampled_input = self.network.add_conv_block( - 'conv_merged', conv_input, hwpc_sizes=[((3, 3), (2, 1), 64), ((3, 3), (2, 1), 64)], - l2=self.l2, activation=self.input_layer_conv_act, init=self.start_conv_init, use_striding=True, - split_input=False, prefix_name='subsample_') - elif self.input_layer == 'conv-6': + "conv_merged", + conv_input, + hwpc_sizes=[((3, 3), (2, 1), 64), ((3, 3), (2, 1), 64)], + l2=self.l2, + activation=self.input_layer_conv_act, + init=self.start_conv_init, + use_striding=True, + split_input=False, + prefix_name="subsample_", + ) + elif 
self.input_layer == "conv-6": conv_input = self.network.add_conv_block( - 'conv_out', data, hwpc_sizes=[((3, 3), (1, 2), 32)], - l2=self.l2, activation=self.input_layer_conv_act, init=self.start_conv_init, merge_out=False) + "conv_out", + data, + hwpc_sizes=[((3, 3), (1, 2), 32)], + l2=self.l2, + activation=self.input_layer_conv_act, + init=self.start_conv_init, + merge_out=False, + ) subsampled_input = self.network.add_conv_block( - 'conv_merged', conv_input, hwpc_sizes=[((3, 3), (3, 1), 64), ((3, 3), (2, 1), 64)], - l2=self.l2, activation=self.input_layer_conv_act, init=self.start_conv_init, use_striding=True, - split_input=False, prefix_name='subsample_') + "conv_merged", + conv_input, + hwpc_sizes=[((3, 3), (3, 1), 64), ((3, 3), (2, 1), 64)], + l2=self.l2, + activation=self.input_layer_conv_act, + init=self.start_conv_init, + use_striding=True, + split_input=False, + prefix_name="subsample_", + ) assert subsampled_input is not None source_linear = self.network.add_linear_layer( - 'source_linear', subsampled_input, n_out=self.enc_key_dim, l2=self.l2, forward_weights_init=self.ff_init, - with_bias=False) + "source_linear", + subsampled_input, + n_out=self.enc_key_dim, + l2=self.l2, + forward_weights_init=self.ff_init, + with_bias=False, + ) if self.dropout_in: - source_linear = self.network.add_dropout_layer('source_dropout', source_linear, dropout=self.dropout_in) + source_linear = self.network.add_dropout_layer("source_dropout", source_linear, dropout=self.dropout_in) conformer_block_src = source_linear for i in range(1, self.num_blocks + 1): @@ -342,20 +479,26 @@ def _create_all_network_parts(self): encoder = self.network.add_copy_layer(self.output_layer_name, conformer_block_src) if self.with_ctc: - default_ctc_loss_opts = {'beam_width': 1} + default_ctc_loss_opts = {"beam_width": 1} if self.native_ctc: - default_ctc_loss_opts['use_native'] = True + default_ctc_loss_opts["use_native"] = True else: self.ctc_opts.update({"ignore_longer_outputs_than_inputs": 
True}) # always enable if self.ctc_opts: - default_ctc_loss_opts['ctc_opts'] = self.ctc_opts + default_ctc_loss_opts["ctc_opts"] = self.ctc_opts self.network.add_softmax_layer( - 'ctc', encoder, l2=self.ctc_l2, target=self.target, loss='ctc', dropout=self.ctc_dropout, - loss_opts=default_ctc_loss_opts, loss_scale=self.ctc_loss_scale) + "ctc", + encoder, + l2=self.ctc_l2, + target=self.target, + loss="ctc", + dropout=self.ctc_dropout, + loss_opts=default_ctc_loss_opts, + loss_scale=self.ctc_loss_scale, + ) return encoder - def _create_e_branchformer_blocks(self, input): conformer_block_src = input for i in range(1, self.num_blocks + 1): @@ -363,9 +506,5 @@ def _create_e_branchformer_blocks(self, input): encoder = self.network.add_copy_layer(self.output_layer_name, conformer_block_src) return encoder - def create_network(self): return self._create_all_network_parts() - - - From 974b7de55f4d38ca13c5c34753abbb4445c430f4 Mon Sep 17 00:00:00 2001 From: Simon Berger Date: Wed, 15 May 2024 19:50:39 +0200 Subject: [PATCH 014/227] Update users/berger --- users/berger/args/jobs/rasr_init_args.py | 25 ++ users/berger/args/returnn/config.py | 1 + .../20230602_rescale_baselines/__init__.py | 2 + .../config_01_conformer_ctc.py | 1 + .../config_01b_conformer_ctc_logmel.py | 1 + .../config_04a_conformer_transducer_bpe.py | 3 +- ...onfig_04a_conformer_transducer_bpe_rasr.py | 225 +++++++++++++ .../config_04b_conformer_transducer_phon.py | 68 ++-- users/berger/helpers/returnn.py | 25 +- users/berger/pytorch/forward/transducer.py | 42 ++- .../pytorch/forward/transducer_beam_search.py | 4 + users/berger/pytorch/models/conformer_ctc.py | 13 +- .../pytorch/models/conformer_transducer_v2.py | 307 +++++++++++------- .../berger/pytorch/train_steps/transducer.py | 5 +- .../recipe/rasr/label_tree_and_scorer.py | 7 +- users/berger/settings.py | 23 +- users/berger/systems/dataclasses.py | 1 + users/berger/systems/functors/rasr_base.py | 11 +- .../functors/recognition/returnn_search.py | 1 + 
.../functors/recognition/seq2seq_search.py | 2 + users/berger/systems/functors/seq2seq_base.py | 22 +- 21 files changed, 585 insertions(+), 204 deletions(-) create mode 100644 users/berger/configs/tedlium2/20230602_rescale_baselines/config_04a_conformer_transducer_bpe_rasr.py diff --git a/users/berger/args/jobs/rasr_init_args.py b/users/berger/args/jobs/rasr_init_args.py index 40b956324..c5cb4ab2b 100644 --- a/users/berger/args/jobs/rasr_init_args.py +++ b/users/berger/args/jobs/rasr_init_args.py @@ -91,6 +91,7 @@ def get_feature_extraction_args_16kHz( gt_args: Optional[Dict] = None, ) -> Dict: mfcc_filter_width = features.filter_width_from_channels(channels=20, f_max=8000) # = 16000 / 2 + filterbank_filter_width = features.filter_width_from_channels(channels=80, f_max=8000) # = 16000 / 2 if mfcc_cepstrum_options is None: mfcc_cepstrum_options = { @@ -142,6 +143,30 @@ def get_feature_extraction_args_16kHz( "normalization_options": {}, } }, + "filterbank": { + "filterbank_options": { + "warping_function": "mel", + "filter_width": filterbank_filter_width, + "normalize": False, + "normalization_options": {}, + "without_samples": False, + "samples_options": { + "audio_format": "wav", + # "scale_input": 2**-15, + "dc_detection": dc_detection, + }, + "fft_options": { + "preemphasis": 0.97, + "window_type": "hanning", + "window_shift": 0.01, + "window_length": 0.025, + }, + "apply_log": True, + "add_epsilon": True, + "add_features_output": True, + # "warp_differential_unit": False, + }, + }, "energy": { "energy_options": { "without_samples": False, diff --git a/users/berger/args/returnn/config.py b/users/berger/args/returnn/config.py index 694f9844d..3578d45cb 100644 --- a/users/berger/args/returnn/config.py +++ b/users/berger/args/returnn/config.py @@ -21,6 +21,7 @@ def get_base_config(backend: Backend) -> Dict[str, Any]: elif backend == Backend.PYTORCH: result["backend"] = "torch" result["use_lovely_tensors"] = True + # result["torch_amp"] = {"dtype": "bfloat16"} else: 
raise NotImplementedError return result diff --git a/users/berger/configs/tedlium2/20230602_rescale_baselines/__init__.py b/users/berger/configs/tedlium2/20230602_rescale_baselines/__init__.py index 7f4f1e0a3..949f5fa7d 100644 --- a/users/berger/configs/tedlium2/20230602_rescale_baselines/__init__.py +++ b/users/berger/configs/tedlium2/20230602_rescale_baselines/__init__.py @@ -7,6 +7,7 @@ from .config_01_conformer_ctc import py as py_01 from .config_04a_conformer_transducer_bpe import py as py_04a +from .config_04a_conformer_transducer_bpe_rasr import py as py_04a_rasr from .config_04b_conformer_transducer_phon import py as py_04b @@ -88,6 +89,7 @@ def worker_wrapper(job, task_name, call): copy.deepcopy(py_01()), copy.deepcopy(py_01b()), copy.deepcopy(py_04a()), + copy.deepcopy(py_04a_rasr()), copy.deepcopy(py_04b()), ]: subreport.collapse([SummaryKey.CORPUS.value], best_selector_key=SummaryKey.ERR.value) diff --git a/users/berger/configs/tedlium2/20230602_rescale_baselines/config_01_conformer_ctc.py b/users/berger/configs/tedlium2/20230602_rescale_baselines/config_01_conformer_ctc.py index 7b8e38745..f2c222a29 100644 --- a/users/berger/configs/tedlium2/20230602_rescale_baselines/config_01_conformer_ctc.py +++ b/users/berger/configs/tedlium2/20230602_rescale_baselines/config_01_conformer_ctc.py @@ -114,6 +114,7 @@ def run_exp() -> SummaryReport: prior_scales=[0.5], lm_scales=[1.1], feature_type=FeatureType.GAMMATONE_16K, + search_stats=True, ) # ********** System ********** diff --git a/users/berger/configs/tedlium2/20230602_rescale_baselines/config_01b_conformer_ctc_logmel.py b/users/berger/configs/tedlium2/20230602_rescale_baselines/config_01b_conformer_ctc_logmel.py index 5bffbc40d..2e62a39cb 100644 --- a/users/berger/configs/tedlium2/20230602_rescale_baselines/config_01b_conformer_ctc_logmel.py +++ b/users/berger/configs/tedlium2/20230602_rescale_baselines/config_01b_conformer_ctc_logmel.py @@ -42,6 +42,7 @@ def returnn_config_generator( extra_config = { 
"train": train_data_config, "dev": dev_data_config, + "torch_amp": {"dtype": "bfloat16"}, } if variant == ConfigVariant.TRAIN: diff --git a/users/berger/configs/tedlium2/20230602_rescale_baselines/config_04a_conformer_transducer_bpe.py b/users/berger/configs/tedlium2/20230602_rescale_baselines/config_04a_conformer_transducer_bpe.py index 3b09d9869..301506b5c 100644 --- a/users/berger/configs/tedlium2/20230602_rescale_baselines/config_04a_conformer_transducer_bpe.py +++ b/users/berger/configs/tedlium2/20230602_rescale_baselines/config_04a_conformer_transducer_bpe.py @@ -59,6 +59,7 @@ def returnn_config_generator( "train": train_data_config, "dev": dev_data_config, "max_seq_length": {"audio_features": 560000}, + "torch_amp": {"dtype": "bfloat16"}, } serializer = model.get_train_serializer(model_config, **kwargs) @@ -159,7 +160,7 @@ def run_exp() -> SummaryReport: data.train_data_config, data.cv_data_config, data.forward_data_config, - beam_sizes=[1, 2, 4], + beam_sizes=[1, 2, 3], ), ) diff --git a/users/berger/configs/tedlium2/20230602_rescale_baselines/config_04a_conformer_transducer_bpe_rasr.py b/users/berger/configs/tedlium2/20230602_rescale_baselines/config_04a_conformer_transducer_bpe_rasr.py new file mode 100644 index 000000000..ddd8a5a6f --- /dev/null +++ b/users/berger/configs/tedlium2/20230602_rescale_baselines/config_04a_conformer_transducer_bpe_rasr.py @@ -0,0 +1,225 @@ +import copy +import os +from typing import List, Optional +from i6_core.returnn.config import ReturnnConfig + +from sisyphus import gs, tk + +import i6_core.rasr as rasr +from i6_experiments.users.berger.args.experiments import transducer as exp_args +from i6_experiments.users.berger.args.returnn.config import get_returnn_config, Backend +from i6_experiments.users.berger.args.returnn.learning_rates import LearningRateSchedules, Optimizers +from i6_experiments.users.berger.corpus.tedlium2.bpe_transducer_data import get_tedlium2_data_dumped_bpe_labels +from 
i6_experiments.users.berger.pytorch.models import conformer_transducer_v2 as model +from i6_experiments.users.berger.recipe.summary.report import SummaryReport +from i6_experiments.users.berger.systems.dataclasses import ConfigVariant, EncDecConfig, FeatureType, ReturnnConfigs +from i6_experiments.users.berger.systems.returnn_seq2seq_system import ReturnnSeq2SeqSystem +from i6_experiments.users.berger.util import default_tools_v2 +from i6_experiments.users.berger.systems.functors.recognition.returnn_search import LexiconType +from i6_experiments.users.berger.systems.functors.rasr_base import RecognitionScoringType + +# ********** Settings ********** + +rasr.flow.FlowNetwork.default_flags = {"cache_mode": "task_dependent"} + +num_outputs = 1068 +num_subepochs = 500 + +tools = copy.deepcopy(default_tools_v2) +tools.rasr_binary_path = tk.Path("/u/berger/repositories/rasr_versions/gen_seq2seq_dev/arch/linux-x86_64-standard") + + +# ********** Return Config generators ********** + + +def returnn_config_generator( + train_data_config: dict, + dev_data_config: dict, + **kwargs, +) -> ReturnnConfig: + model_config = model.get_default_config_v1(num_outputs=num_outputs) + + extra_config = { + "train": train_data_config, + "dev": dev_data_config, + "max_seq_length": {"audio_features": 560000}, + "torch_amp": {"dtype": "bfloat16"}, + } + serializer = model.get_train_serializer(model_config, **kwargs) + + return get_returnn_config( + num_epochs=num_subepochs, + num_inputs=1, + num_outputs=num_outputs, + target="classes", + extra_python=[serializer], + extern_data_config=True, + backend=Backend.PYTORCH, + grad_noise=0.0, + grad_clip=0.0, + optimizer=Optimizers.AdamW, + schedule=LearningRateSchedules.OCLR, + initial_lr=1e-06, + peak_lr=8e-05, + decayed_lr=1e-05, + final_lr=1e-08, + batch_size=10000 * 160, + use_chunking=False, + extra_config=extra_config, + ) + + +def recog_returnn_configs_generator( + **kwargs, +) -> EncDecConfig[ReturnnConfig]: + model_config = 
model.get_default_config_v1(num_outputs=num_outputs) + + enc_extra_config = { + "extern_data": { + "sources": {"dim": 80, "dtype": "float32"}, + }, + "model_outputs": { + "source_encodings": { + "dim": 384, + "dtype": "float32", + }, + }, + } + dec_extra_config = { + "extern_data": { + "source_encodings": { + "dim": 384, + "time_dim_axis": None, + "dtype": "float32", + }, + "targets": { + "dim": num_outputs, + "time_dim_axis": None, + "sparse": True, + "shape": (1,), + "dtype": "int32", + }, + }, + "model_outputs": { + "log_probs": { + "dim": num_outputs, + "time_dim_axis": None, + "dtype": "float32", + } + }, + } + enc_serializer = model.get_encoder_recog_serializer(model_config, **kwargs) + dec_serializer = model.get_decoder_recog_serializer(model_config, **kwargs) + + return EncDecConfig( + encoder_config=get_returnn_config( + num_inputs=80, + num_outputs=num_outputs, + target=None, + extra_python=[enc_serializer], + extern_data_config=False, + backend=Backend.PYTORCH, + extra_config=enc_extra_config, + ), + decoder_config=get_returnn_config( + num_inputs=1, + num_outputs=num_outputs, + target=None, + # python_prolog=["from returnn.tensor.dim import Dim, batch_dim"], + extra_python=[dec_serializer], + extern_data_config=False, + backend=Backend.PYTORCH, + extra_config=dec_extra_config, + ), + ) + + +def get_returnn_config_collection( + train_data_config: dict, + dev_data_config: dict, + **kwargs, +) -> ReturnnConfigs[ReturnnConfig]: + return ReturnnConfigs( + train_config=returnn_config_generator( + train_data_config=train_data_config, + dev_data_config=dev_data_config, + blank_id=0, + **kwargs, + ), + recog_configs={ + "recog": recog_returnn_configs_generator( + train_data_config=train_data_config, + dev_data_config=dev_data_config, + **kwargs, + ) + }, + ) + + +def run_exp() -> SummaryReport: + assert tools.returnn_root + assert tools.returnn_python_exe + assert tools.rasr_binary_path + data = get_tedlium2_data_dumped_bpe_labels( + num_classes=num_outputs, + 
returnn_root=tools.returnn_root, + returnn_python_exe=tools.returnn_python_exe, + rasr_binary_path=tools.rasr_binary_path, + augmented_lexicon=True, + feature_type=FeatureType.SAMPLES, + ) + + # ********** Step args ********** + + train_args = exp_args.get_transducer_train_step_args(num_epochs=num_subepochs, gpu_mem_rqmt=24) + recog_args = exp_args.get_transducer_recog_step_args( + num_classes=num_outputs, + epochs=[500], + lm_scales=[0.5], + label_scorer_type="onnx-ffnn-transducer", + label_scorer_args={"extra_args": {"start_label_index": 0}}, + reduction_subtrahend=3, + reduction_factor=4, + feature_type=FeatureType.LOGMEL_16K, + ) + + # ********** System ********** + + system = ReturnnSeq2SeqSystem(tools) + + system.init_corpora( + dev_keys=data.dev_keys, + test_keys=data.test_keys, + corpus_data=data.data_inputs, + am_args=exp_args.transducer_recog_am_args, + ) + system.setup_scoring() + + # ********** Returnn Configs ********** + + system.add_experiment_configs( + "Conformer_Transducer", + get_returnn_config_collection( + data.train_data_config, + data.cv_data_config, + ), + ) + + system.run_train_step(**train_args) + system.run_dev_recog_step(**recog_args) + + assert system.summary_report + return system.summary_report + + +def py() -> SummaryReport: + filename_handle = os.path.splitext(os.path.basename(__file__))[0][len("config_") :] + gs.ALIAS_AND_OUTPUT_SUBDIR = f"{filename_handle}/" + + summary_report = SummaryReport() + + summary_report.merge_report(run_exp(), update_structure=True) + + tk.register_report(f"{gs.ALIAS_AND_OUTPUT_SUBDIR}/summary.report", summary_report) + + return summary_report diff --git a/users/berger/configs/tedlium2/20230602_rescale_baselines/config_04b_conformer_transducer_phon.py b/users/berger/configs/tedlium2/20230602_rescale_baselines/config_04b_conformer_transducer_phon.py index 3e220f3bc..b6c026453 100644 --- a/users/berger/configs/tedlium2/20230602_rescale_baselines/config_04b_conformer_transducer_phon.py +++ 
b/users/berger/configs/tedlium2/20230602_rescale_baselines/config_04b_conformer_transducer_phon.py @@ -1,13 +1,10 @@ import copy import os -from typing import List, Optional -from i6_core.returnn.config import ReturnnConfig - -from sisyphus import gs, tk import i6_core.rasr as rasr +from i6_core.returnn.config import CodeWrapper, ReturnnConfig from i6_experiments.users.berger.args.experiments import transducer as exp_args -from i6_experiments.users.berger.args.returnn.config import get_returnn_config, Backend +from i6_experiments.users.berger.args.returnn.config import Backend, get_returnn_config from i6_experiments.users.berger.args.returnn.learning_rates import LearningRateSchedules, Optimizers from i6_experiments.users.berger.corpus.tedlium2.phon_transducer_data import get_tedlium2_data_dumped_labels from i6_experiments.users.berger.pytorch.models import conformer_transducer_v2 as model @@ -20,7 +17,7 @@ ) from i6_experiments.users.berger.systems.returnn_seq2seq_system import ReturnnSeq2SeqSystem from i6_experiments.users.berger.util import default_tools_v2 -from i6_experiments.users.berger.systems.functors.recognition.returnn_search import LexiconType +from sisyphus import gs, tk # ********** Settings ********** @@ -47,6 +44,7 @@ def returnn_config_generator( "train": train_data_config, "dev": dev_data_config, "max_seq_length": {"audio_features": 560000}, + "torch_amp": {"dtype": "bfloat16"}, } serializer = model.get_train_serializer(model_config, **kwargs) @@ -78,16 +76,36 @@ def recog_returnn_configs_generator( model_config = model.get_default_config_v1(num_outputs=num_outputs) enc_extra_config = { + "extern_data": { + "sources": {"dim": 80, "dtype": "float32"}, + }, "model_outputs": { - "encoder": { - "dim": model_config.transcriber_cfg.dim, - } + "source_encodings": { + "dim": 384, + "dtype": "float32", + }, }, } dec_extra_config = { + "extern_data": { + "source_encodings": { + "dim": 384, + "time_dim_axis": None, + "dtype": "float32", + }, + "targets": { + 
"dim": num_outputs, + "time_dim_axis": None, + "sparse": True, + "shape": (1,), + "dtype": "int32", + }, + }, "model_outputs": { "log_probs": { "dim": num_outputs, + "time_dim_axis": None, + "dtype": "float32", } }, } @@ -96,11 +114,11 @@ def recog_returnn_configs_generator( return EncDecConfig( encoder_config=get_returnn_config( - num_inputs=1, + num_inputs=80, num_outputs=num_outputs, target=None, extra_python=[enc_serializer], - extern_data_config=True, + extern_data_config=False, backend=Backend.PYTORCH, extra_config=enc_extra_config, ), @@ -108,8 +126,9 @@ def recog_returnn_configs_generator( num_inputs=1, num_outputs=num_outputs, target=None, + # python_prolog=["from returnn.tensor.dim import Dim, batch_dim"], extra_python=[dec_serializer], - extern_data_config=True, + extern_data_config=False, backend=Backend.PYTORCH, extra_config=dec_extra_config, ), @@ -148,11 +167,15 @@ def run_exp() -> SummaryReport: # ********** Step args ********** train_args = exp_args.get_transducer_train_step_args(num_epochs=num_subepochs, gpu_mem_rqmt=24) - recog_args = { - "epochs": [500], - "prior_scales": [0.0], - "lm_scales": [0.0], - } + recog_args = exp_args.get_transducer_recog_step_args( + num_classes=num_outputs, + epochs=[num_subepochs], + label_scorer_type="onnx-ffnn-transducer", + label_scorer_args={"extra_args": {"start_label_index": 0}}, + reduction_subtrahend=3, + reduction_factor=4, + feature_type=FeatureType.LOGMEL_16K, + ) # ********** System ********** @@ -193,16 +216,7 @@ def run_exp() -> SummaryReport: system.run_train_step(**train_args) - system.run_dev_recog_step( - recog_exp_names={ - exp_name: [ - recog_exp_name for recog_exp_name in system.get_recog_exp_names()[exp_name] if dev_key in recog_exp_name - ] - for dev_key in data.dev_keys - for exp_name in system.get_exp_names() - }, - **recog_args, - ) + system.run_dev_recog_step(**recog_args) assert system.summary_report return system.summary_report diff --git a/users/berger/helpers/returnn.py 
b/users/berger/helpers/returnn.py index 65887972c..86d5d6a38 100644 --- a/users/berger/helpers/returnn.py +++ b/users/berger/helpers/returnn.py @@ -1,4 +1,6 @@ +from typing import Any from i6_core import returnn +from i6_core.returnn.config import CodeWrapper from sisyphus import tk from i6_experiments.users.berger.util import ToolPaths from i6_core.returnn import ReturnnConfig @@ -16,6 +18,23 @@ def get_native_lstm_op(tool_paths: ToolPaths) -> tk.Path: return compile_job.out_op +def _replace_proxies_by_code_wrappers(obj: Any) -> Any: + """ + A ReturnnDimTagsProxy.DimRefProxy can currently not be hashed and sisyphus' extract_paths() also does not work, + because the parent attribute contains a set which again contains the original object which leads to recursion errors. + We could fix this in ReturnnDimTagsProxy.DimRefProxy, but for now just replace them with a CodeWrapper. + """ + from returnn_common.nn.naming import ReturnnDimTagsProxy + + if isinstance(obj, (ReturnnDimTagsProxy.SetProxy, ReturnnDimTagsProxy.DimRefProxy)): + return CodeWrapper(str(obj)) + elif isinstance(obj, dict): + return {k: _replace_proxies_by_code_wrappers(v) for k, v in obj.items()} + elif isinstance(obj, (list, tuple)): + return type(obj)([_replace_proxies_by_code_wrappers(x) for x in obj]) + return obj + + def serialize_dim_tags(config: ReturnnConfig) -> ReturnnConfig: """ Serialize dim tags in a given RETURNN config. 
@@ -26,8 +45,10 @@ def serialize_dim_tags(config: ReturnnConfig) -> ReturnnConfig: dim_tags_proxy = ReturnnDimTagsProxy() config_serialized = dim_tags_proxy.collect_dim_tags_and_transform_config(config.config) if dim_tags_proxy.py_code_str(): - config.config["network"] = _replace_proxies_by_code_wrappers(config_serialized["network"]) - config.config["extern_data"] = _replace_proxies_by_code_wrappers(config_serialized["extern_data"]) + if "network" in config.config: + config.config["network"] = _replace_proxies_by_code_wrappers(config_serialized["network"]) + if "extern_data" in config.config: + config.config["extern_data"] = _replace_proxies_by_code_wrappers(config_serialized["extern_data"]) python_prolog_ext = ( "from returnn.tf.util.data import Dim, batch_dim, single_step_dim, SpatialDim, FeatureDim\n\n" + dim_tags_proxy.py_code_str() diff --git a/users/berger/pytorch/forward/transducer.py b/users/berger/pytorch/forward/transducer.py index d084d4b6c..b9b1caaf4 100644 --- a/users/berger/pytorch/forward/transducer.py +++ b/users/berger/pytorch/forward/transducer.py @@ -14,48 +14,45 @@ def encoder_forward_step(*, model: FFNNTransducerEncoderOnly, extern_data: TensorDict, **_): - audio_features = extern_data["data"].raw_tensor - assert audio_features is not None - audio_features = map_tensor_to_minus1_plus1_interval(audio_features) + sources = extern_data["sources"].raw_tensor + assert sources is not None - audio_feature_lengths = extern_data["data"].dims[1].dyn_size_ext.raw_tensor - assert audio_feature_lengths is not None + source_lengths = extern_data["sources"].dims[1].dyn_size_ext.raw_tensor + assert source_lengths is not None device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - encoder_outputs, encoder_output_lengths = model( - features=audio_features.to(device), - features_size=audio_feature_lengths.to(device), + source_encodings, source_lengths = model( + sources=sources.to(device), + source_lengths=source_lengths.to(device), ) # [B, T, E], 
[B] import returnn.frontend as rf run_ctx = rf.get_run_ctx() if run_ctx.expected_outputs is not None: - run_ctx.expected_outputs["encoder_outputs"].dims[1].dyn_size_ext.raw_tensor = encoder_output_lengths - run_ctx.mark_as_output(encoder_outputs, name="encoder_outputs") + run_ctx.expected_outputs["source_encodings"].dims[1].dyn_size_ext.raw_tensor = source_lengths + run_ctx.mark_as_output(source_encodings, name="source_encodings") def decoder_forward_step(*, model: FFNNTransducerDecoderOnly, extern_data: TensorDict, **_): - encoder = extern_data["encoder"].raw_tensor - assert encoder is not None + source_encodings = extern_data["source_encodings"].raw_tensor + assert source_encodings is not None - history = extern_data["history"].raw_tensor - assert history is not None + targets = extern_data["targets"].raw_tensor + assert targets is not None device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - encoder_outputs, encoder_output_lengths = model( - features=encoder.to(device), - features_size=audio_feature_lengths.to(device), - ) # [B, T, E], [B] + log_probs = model( + source_encodings=source_encodings.to(device), + targets=targets.to(device), + ) # [B, C] import returnn.frontend as rf run_ctx = rf.get_run_ctx() - if run_ctx.expected_outputs is not None: - run_ctx.expected_outputs["encoder_outputs"].dims[1].dyn_size_ext.raw_tensor = encoder_output_lengths - run_ctx.mark_as_output(encoder_outputs, name="encoder_outputs") + run_ctx.mark_as_output(log_probs, name="log_probs") def monotonic_timesync_beam_search_forward_step( @@ -63,7 +60,8 @@ def monotonic_timesync_beam_search_forward_step( ): audio_features = extern_data["data"].raw_tensor assert audio_features is not None - audio_features = map_tensor_to_minus1_plus1_interval(audio_features) + # audio_features = map_tensor_to_minus1_plus1_interval(audio_features) + audio_features = audio_features.float() assert extern_data["data"].dims[1].dyn_size_ext is not None audio_feature_lengths = 
extern_data["data"].dims[1].dyn_size_ext.raw_tensor diff --git a/users/berger/pytorch/forward/transducer_beam_search.py b/users/berger/pytorch/forward/transducer_beam_search.py index 0d09c3fdc..0b8de6cb2 100644 --- a/users/berger/pytorch/forward/transducer_beam_search.py +++ b/users/berger/pytorch/forward/transducer_beam_search.py @@ -41,6 +41,8 @@ def monotonic_timesync_beam_search( # Compute encoder once enc, enc_lengths = model.transcribe(features, feature_lengths) # [1, T, E], [1] + print("encoder outputs:") + print(enc[0, :3, :5]) T = int(enc_lengths[0].cpu().item()) def predict_next( @@ -85,6 +87,7 @@ def predict_next( assert new_pred_history_state is not None + print("Predict with encoder state ", enc_state[0, 0, :5], "token", recent_token) log_probs, _, _ = model.join( # [1, C] (packed) or [1, 1, 1, C] (not packed)) source_encodings=enc_state, source_lengths=torch.tensor([1], device=enc.device), @@ -92,6 +95,7 @@ def predict_next( target_lengths=torch.tensor([1], device=enc.device), ) log_probs = log_probs.squeeze() # [C] + print("Probs: ", torch.exp(log_probs[:5])) # extend hypothesis with all possible next classes for c in range(log_probs.size(0)): diff --git a/users/berger/pytorch/models/conformer_ctc.py b/users/berger/pytorch/models/conformer_ctc.py index 36939c210..370bf4ea1 100644 --- a/users/berger/pytorch/models/conformer_ctc.py +++ b/users/berger/pytorch/models/conformer_ctc.py @@ -9,7 +9,10 @@ from i6_experiments.users.berger.pytorch.serializers.basic import ( get_basic_pt_network_serializer, ) -from i6_models.primitives.feature_extraction import LogMelFeatureExtractionV1, LogMelFeatureExtractionV1Config +from i6_models.primitives.feature_extraction import ( + RasrCompatibleLogMelFeatureExtractionV1, + RasrCompatibleLogMelFeatureExtractionV1Config, +) from i6_models.parts.frontend.generic_frontend import ( GenericFrontendV1, GenericFrontendV1Config, @@ -314,17 +317,13 @@ def get_default_config_v2(num_inputs: int, num_outputs: int) -> 
ConformerCTCConf def get_default_config_v3(num_outputs: int) -> ConformerCTCConfig: feature_extraction = ModuleFactoryV1( - module_class=LogMelFeatureExtractionV1, - cfg=LogMelFeatureExtractionV1Config( + module_class=RasrCompatibleLogMelFeatureExtractionV1, + cfg=RasrCompatibleLogMelFeatureExtractionV1Config( sample_rate=16000, win_size=0.025, hop_size=0.01, - f_min=60, - f_max=7600, min_amp=1e-10, num_filters=80, - center=False, - n_fft=400, ), ) specaugment = ModuleFactoryV1( diff --git a/users/berger/pytorch/models/conformer_transducer_v2.py b/users/berger/pytorch/models/conformer_transducer_v2.py index 1f47d0b9e..32ab9d10f 100644 --- a/users/berger/pytorch/models/conformer_transducer_v2.py +++ b/users/berger/pytorch/models/conformer_transducer_v2.py @@ -1,27 +1,28 @@ from dataclasses import dataclass from enum import Enum, auto -from typing import List, Optional, Union, Callable, Tuple +from typing import Callable, List, Optional, Tuple, Union +import i6_models.assemblies.conformer as conformer_i6 +import i6_models.parts.conformer as conformer_parts_i6 import torch -from torchaudio.models import rnnt - from i6_core.returnn.config import CodeWrapper from i6_experiments.common.setups.returnn_pytorch.serialization import Collection from i6_experiments.common.setups.serialization import Import, PartialImport from i6_experiments.users.berger.pytorch.serializers.basic import ( get_basic_pt_network_serializer, ) -from i6_models.parts.frontend.vgg_act import VGG4LayerActFrontendV1, VGG4LayerActFrontendV1Config -import i6_models.parts.conformer as conformer_parts_i6 -import i6_models.assemblies.conformer as conformer_i6 from i6_models.config import ModelConfiguration, ModuleFactoryV1 -from i6_models.primitives.feature_extraction import LogMelFeatureExtractionV1, LogMelFeatureExtractionV1Config -from .util import lengths_to_padding_mask +from i6_models.parts.frontend.generic_frontend import FrontendLayerType, GenericFrontendV1, GenericFrontendV1Config +from 
i6_models.primitives.feature_extraction import ( + RasrCompatibleLogMelFeatureExtractionV1, + RasrCompatibleLogMelFeatureExtractionV1Config, +) from ..custom_parts.specaugment import ( - SpecaugmentConfigV1, - SpecaugmentModuleV1, + SpecaugmentByLengthConfigV1, + SpecaugmentByLengthModuleV1, ) +from .util import lengths_to_padding_mask @dataclass @@ -29,42 +30,52 @@ class TransducerTranscriberConfig(ModelConfiguration): feature_extraction: ModuleFactoryV1 specaugment: ModuleFactoryV1 encoder: ModuleFactoryV1 - dim: int - target_size: int -class TransducerTranscriber(rnnt._Transcriber, torch.nn.Module): +class TransducerTranscriber(torch.nn.Module): def __init__(self, cfg: TransducerTranscriberConfig, **_) -> None: super().__init__() self.feature_extraction = cfg.feature_extraction() self.specaugment = cfg.specaugment() self.encoder = cfg.encoder() - self.final_linear = torch.nn.Linear(cfg.dim, cfg.target_size) def forward( self, - input: torch.Tensor, # [B, T, F] - lengths: torch.Tensor, # [B] + sources: torch.Tensor, # [B, T, F] + source_lengths: torch.Tensor, # [B] ) -> Tuple[torch.Tensor, torch.Tensor]: # [B, T, C], [B] with torch.no_grad(): - input = input.squeeze(-1) - x, lengths = self.feature_extraction(input, lengths) - sequence_mask = lengths_to_padding_mask(lengths) + sources = sources.squeeze(-1) + x, source_lengths = self.feature_extraction(sources, source_lengths) + print("Features: ", x[0, :3, :5]) + sequence_mask = lengths_to_padding_mask(source_lengths) x = self.specaugment(x) # [B, T, F] x, sequence_mask = self.encoder(x, sequence_mask) # [B, T, E], [B, T] - x = self.final_linear(x) # [B, T, C] return x, torch.sum(sequence_mask, dim=1).to(torch.int32) # [B, T, C], [B] - def infer( + +class TransducerTranscriberNoFeatExtr(torch.nn.Module): + def __init__(self, cfg: TransducerTranscriberConfig, **_) -> None: + super().__init__() + self.specaugment = cfg.specaugment() + self.encoder = cfg.encoder() + + def forward( self, - input: torch.Tensor, # [B, 
T, F], - lengths: torch.Tensor, # [B] - states: Optional[List[List[torch.Tensor]]], - ) -> Tuple[torch.Tensor, torch.Tensor, List[List[torch.Tensor]]]: - raise NotImplementedError + sources: torch.Tensor, # [B, T, F] + source_lengths: torch.Tensor, # [B] + ) -> Tuple[torch.Tensor, torch.Tensor]: # [B, T, C], [B] + with torch.no_grad(): + sequence_mask = lengths_to_padding_mask(source_lengths) + + x = self.specaugment(sources) # [B, T, F] + + x, sequence_mask = self.encoder(x, sequence_mask) # [B, T, E], [B, T] + + return x, torch.sum(sequence_mask, dim=1).to(torch.int32) # [B, T, C], [B] @dataclass @@ -82,10 +93,10 @@ class FFNNTransducerPredictorConfig(ModelConfiguration): class FFNNTransducerPredictor(torch.nn.Module): def __init__(self, cfg: FFNNTransducerPredictorConfig, **_) -> None: super().__init__() - self.embedding = torch.nn.Embedding(num_embeddings=cfg.target_size, embedding_dim=cfg.context_embedding_size) - self.blank_id = cfg.blank_id - + self.embedding = torch.nn.Embedding( + num_embeddings=cfg.target_size, embedding_dim=cfg.context_embedding_size, padding_idx=self.blank_id + ) self.context_history_size = cfg.context_history_size prediction_layers = [] prev_size = self.context_history_size * cfg.context_embedding_size @@ -95,14 +106,12 @@ def __init__(self, cfg: FFNNTransducerPredictorConfig, **_) -> None: prediction_layers.append(cfg.activation) prev_size = cfg.layer_size - self.network = torch.nn.Sequential( - *prediction_layers, torch.nn.Dropout(cfg.dropout), torch.nn.Linear(prev_size, cfg.target_size) - ) + self.network = torch.nn.Sequential(*prediction_layers) def forward( self, - input: torch.Tensor, # [B, S], - lengths: torch.Tensor, # [B], + targets: torch.Tensor, # [B, S], + target_lengths: torch.Tensor, # [B], state: Optional[ List[List[torch.Tensor]] ] = None, # Most recently fed inputs, used for higher order context, shape [[[B, H-1]]]; list of lists for compatibility with torchaudio @@ -114,23 +123,23 @@ def forward( # extend input by 
prepending either the state if it's given or some history consisting of blanks if state is None: prepend = torch.full( - (input.size(0), self.context_history_size - 1), + (targets.size(0), self.context_history_size - 1), fill_value=self.blank_id, - dtype=input.dtype, - device=input.device, + dtype=targets.dtype, + device=targets.device, ) # [B, H-1] # print("Predictor received no state. Use", prepend.deeper()) else: prepend = state[0][0] # [B, H-1] # print("Predictor received state", prepend.deeper()) - extended_input = torch.concat([prepend, input], dim=1) # [B, S+H-1] + extended_input = torch.concat([prepend, targets], dim=1) # [B, S+H-1] # print("extended input", extended_input.deeper()) if self.context_history_size > 1: return_state = extended_input[:, -(self.context_history_size - 1) :] # [B, H-1] else: return_state = torch.empty( - size=(input.size(0), 0), dtype=input.dtype, device=input.device + size=(targets.size(0), 0), dtype=targets.dtype, device=targets.device ) # [B, 0] = [B, H-1] # print("New state is ", return_state.deeper()) @@ -158,7 +167,7 @@ def forward( # "Repeat probabilities:", # torch.gather(torch.nn.functional.softmax(a, dim=-1), dim=-1, index=context[:, :, -2:-1]).deeper(), # ) - return a, lengths, [[return_state]] + return a, target_lengths, [[return_state]] class CombinationMode(Enum): @@ -170,6 +179,7 @@ class CombinationMode(Enum): class TransducerJoinerConfig(ModelConfiguration): layer_size: int act: torch.nn.Module + input_size: int target_size: int combination_mode: CombinationMode @@ -178,7 +188,7 @@ class TransducerJoiner(torch.nn.Module): def __init__(self, cfg: TransducerJoinerConfig, **_) -> None: super().__init__() self.network = torch.nn.Sequential( - torch.nn.Linear(cfg.target_size, cfg.layer_size), + torch.nn.Linear(cfg.input_size, cfg.layer_size), cfg.act, torch.nn.Linear(cfg.layer_size, cfg.target_size), ) @@ -213,7 +223,7 @@ class PackedTransducerJoiner(torch.nn.Module): def __init__(self, cfg: TransducerJoinerConfig, **_) 
-> None: super().__init__() self.network = torch.nn.Sequential( - torch.nn.Linear(cfg.target_size, cfg.layer_size), + torch.nn.Linear(cfg.input_size, cfg.layer_size), cfg.act, torch.nn.Linear(cfg.layer_size, cfg.target_size), ) @@ -260,62 +270,104 @@ class FFNNTransducerConfig(ModelConfiguration): joiner_cfg: TransducerJoinerConfig -class FFNNTransducer(rnnt.RNNT): +class FFNNTransducer(torch.nn.Module): def __init__(self, step: int, cfg: FFNNTransducerConfig, **_): - super().__init__( - transcriber=TransducerTranscriber(cfg.transcriber_cfg), - predictor=FFNNTransducerPredictor(cfg.predictor_cfg), - joiner=PackedTransducerJoiner(cfg.joiner_cfg), + super().__init__() + self.transcriber = TransducerTranscriber(cfg.transcriber_cfg) + self.predictor = FFNNTransducerPredictor(cfg.predictor_cfg) + self.joiner = PackedTransducerJoiner(cfg.joiner_cfg) + + def transcribe(self, sources: torch.Tensor, source_lengths: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + return self.transcriber(sources=sources, source_lengths=source_lengths) + + def predict( + self, targets: torch.Tensor, target_lengths: torch.Tensor, state: Optional[List[List[torch.Tensor]]] + ) -> Tuple[torch.Tensor, torch.Tensor, List[List[torch.Tensor]]]: + return self.predictor(targets=targets, target_lengths=target_lengths, state=state) + + def join( + self, + source_encodings: torch.Tensor, + source_lengths: torch.Tensor, + target_encodings: torch.Tensor, + target_lengths: torch.Tensor, + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + return self.joiner( + source_encodings=source_encodings, + source_lengths=source_lengths, + target_encodings=target_encodings, + target_lengths=target_lengths, + ) + + def forward( + self, + sources: torch.Tensor, + source_lengths: torch.Tensor, + targets: torch.Tensor, + target_lengths: torch.Tensor, + predictor_state: Optional[List[List[torch.Tensor]]] = None, + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, List[List[torch.Tensor]]]: + source_encodings, 
source_lengths = self.transcribe(sources=sources, source_lengths=source_lengths) + target_encodings, target_lengths, state = self.predict( + targets=targets, target_lengths=target_lengths, state=predictor_state ) + output, source_lengths, target_lengths = self.join( + source_encodings=source_encodings, + source_lengths=source_lengths, + target_encodings=target_encodings, + target_lengths=target_lengths, + ) -class FFNNTransducerEncoderOnly(rnnt.RNNT): + return output, source_lengths, target_lengths, state + + +class FFNNTransducerEncoderOnly(torch.nn.Module): def __init__(self, step: int, cfg: FFNNTransducerConfig, **_): - super().__init__( - transcriber=TransducerTranscriber(cfg.transcriber_cfg), - predictor=None, - joiner=None, - ) + super().__init__() + self.transcriber = TransducerTranscriberNoFeatExtr(cfg.transcriber_cfg) def forward( self, - features: torch.Tensor, # [B, T, F] - features_size: torch.Tensor, # [B] + sources: torch.Tensor, # [B, T, F] + source_lengths: torch.Tensor, # [B] ) -> Tuple[torch.Tensor, torch.Tensor]: # [B, T', E] - return self.transcribe(sources=features, source_lengths=features_size) + return self.transcriber(sources=sources, source_lengths=source_lengths) -class FFNNTransducerDecoderOnly(rnnt.RNNT): +class FFNNTransducerDecoderOnly(torch.nn.Module): def __init__(self, step: int, cfg: FFNNTransducerConfig, **_): - super().__init__( - transcriber=None, - predictor=FFNNTransducerPredictor(cfg.predictor_cfg), - joiner=TransducerJoiner(cfg.joiner_cfg), - ) + super().__init__() + self.predictor = FFNNTransducerPredictor(cfg.predictor_cfg) + self.joiner = TransducerJoiner(cfg.joiner_cfg) def forward( self, - encoder: torch.Tensor, # [B, E] - history: torch.Tensor, # [B, H] + source_encodings: torch.Tensor, # [B, E] + targets: torch.Tensor, # [B, H] ) -> torch.Tensor: # [B, C] - dec_history_state = [[history[:, :-1]]] # [[[B, H-1]]] - dec_current_label = history[:, -1:] # [B, 1] - dec_length = torch.tensor([1] * history.size(0), 
device=history.device) # [B] + dec_state = [[targets[:, :-1]]] # [[[B, H-1]]] + dec_current_label = targets[:, -1:] # [B, 1] + dec_length = torch.tensor([1] * targets.size(0), device=targets.device) # [B] - decoder, _, _ = self.predict( - targets=dec_current_label, target_lengths=dec_length, state=dec_history_state + decoder, _, _ = self.predictor( + targets=dec_current_label, target_lengths=dec_length, state=dec_state ) # [B, 1, P] - encoder = encoder.unsqueeze(1) # [B, 1, E] + source_encodings = source_encodings.unsqueeze(1) # [B, 1, E] - joint_output, _, _ = self.join( - source_encodings=encoder, source_lengths=dec_length, target_encodings=decoder, target_lengths=dec_length + joint_output, _, _ = self.joiner( + source_encodings=source_encodings, + source_lengths=dec_length, + target_encodings=decoder, + target_lengths=dec_length, ) # [B, 1, 1, C] - return joint_output.squeeze((1, 2)) # [B, C] + return joint_output.squeeze(2).squeeze(1) # [B, C] def get_train_serializer(model_config: FFNNTransducerConfig, **kwargs) -> Collection: + assert __package__ is not None pytorch_package = __package__.rpartition(".")[0] return get_basic_pt_network_serializer( module_import_path=f"{__name__}.{FFNNTransducer.__name__}", @@ -332,6 +384,7 @@ def get_train_serializer(model_config: FFNNTransducerConfig, **kwargs) -> Collec def get_torchaudio_train_serializer(model_config: FFNNTransducerConfig, **kwargs) -> Collection: + assert __package__ is not None pytorch_package = __package__.rpartition(".")[0] return get_basic_pt_network_serializer( module_import_path=f"{__name__}.{FFNNTransducer.__name__}", @@ -348,6 +401,7 @@ def get_torchaudio_train_serializer(model_config: FFNNTransducerConfig, **kwargs def get_k2_train_serializer(model_config: FFNNTransducerConfig, **kwargs) -> Collection: + assert __package__ is not None pytorch_package = __package__.rpartition(".")[0] return get_basic_pt_network_serializer( module_import_path=f"{__name__}.{FFNNTransducer.__name__}", @@ -365,6 +419,7 
@@ def get_k2_train_serializer(model_config: FFNNTransducerConfig, **kwargs) -> Col def get_pruned_k2_train_serializer(model_config: FFNNTransducerConfig, **kwargs) -> Collection: + assert __package__ is not None pytorch_package = __package__.rpartition(".")[0] return get_basic_pt_network_serializer( module_import_path=f"{__name__}.{FFNNTransducer.__name__}", @@ -382,24 +437,31 @@ def get_pruned_k2_train_serializer(model_config: FFNNTransducerConfig, **kwargs) def get_encoder_recog_serializer(model_config: FFNNTransducerConfig, **_) -> Collection: + assert __package__ is not None pytorch_package = __package__.rpartition(".")[0] return get_basic_pt_network_serializer( module_import_path=f"{__name__}.{FFNNTransducerEncoderOnly.__name__}", model_config=model_config, - additional_serializer_objects=[Import(f"{pytorch_package}.forward.transducer.encoder_forward_step")], + additional_serializer_objects=[ + Import(f"{pytorch_package}.forward.transducer.encoder_forward_step", import_as="forward_step") + ], ) def get_decoder_recog_serializer(model_config: FFNNTransducerConfig, **_) -> Collection: + assert __package__ is not None pytorch_package = __package__.rpartition(".")[0] return get_basic_pt_network_serializer( module_import_path=f"{__name__}.{FFNNTransducerDecoderOnly.__name__}", model_config=model_config, - additional_serializer_objects=[Import(f"{pytorch_package}.forward.transducer.decoder_forward_step")], + additional_serializer_objects=[ + Import(f"{pytorch_package}.forward.transducer.decoder_forward_step", import_as="forward_step") + ], ) def get_beam_search_serializer(model_config: FFNNTransducerConfig, **kwargs) -> Collection: + assert __package__ is not None pytorch_package = __package__.rpartition(".")[0] kwargs.setdefault("lexicon_file", CodeWrapper("lexicon_file")) return get_basic_pt_network_serializer( @@ -420,72 +482,74 @@ def get_beam_search_serializer(model_config: FFNNTransducerConfig, **kwargs) -> def get_default_config_v1(num_outputs: int) -> 
FFNNTransducerConfig: feature_extraction = ModuleFactoryV1( - module_class=LogMelFeatureExtractionV1, - cfg=LogMelFeatureExtractionV1Config( + module_class=RasrCompatibleLogMelFeatureExtractionV1, + cfg=RasrCompatibleLogMelFeatureExtractionV1Config( sample_rate=16000, win_size=0.025, hop_size=0.01, - f_min=60, - f_max=7600, - min_amp=1e-10, + min_amp=1.175494e-38, num_filters=80, - center=False, - n_fft=400, + alpha=0.97, ), ) specaugment = ModuleFactoryV1( - module_class=SpecaugmentModuleV1, - cfg=SpecaugmentConfigV1( - time_min_num_masks=1, - time_max_num_masks=1, - time_mask_max_size=15, - freq_min_num_masks=1, - freq_max_num_masks=8, + module_class=SpecaugmentByLengthModuleV1, + cfg=SpecaugmentByLengthConfigV1( + time_min_num_masks=2, + time_max_mask_per_n_frames=25, + time_mask_max_size=20, + freq_min_num_masks=2, + freq_max_num_masks=16, freq_mask_max_size=5, ), ) - frontend_cfg = VGG4LayerActFrontendV1Config( - in_features=80, - conv1_channels=32, - conv2_channels=32, - conv3_channels=64, - conv4_channels=64, - conv_kernel_size=(3, 3), - conv_padding=None, - pool1_kernel_size=(2, 2), - pool1_stride=None, - pool1_padding=None, - pool2_kernel_size=(2, 1), - pool2_stride=None, - pool2_padding=None, - activation=torch.nn.SiLU(), - out_features=512, + frontend = ModuleFactoryV1( + GenericFrontendV1, + GenericFrontendV1Config( + in_features=80, + layer_ordering=[ + FrontendLayerType.Conv2d, + FrontendLayerType.Conv2d, + FrontendLayerType.Pool2d, + FrontendLayerType.Conv2d, + FrontendLayerType.Conv2d, + FrontendLayerType.Pool2d, + FrontendLayerType.Activation, + ], + conv_kernel_sizes=[(3, 3), (3, 3), (3, 3), (3, 3)], + conv_paddings=None, + conv_out_dims=[32, 64, 64, 32], + conv_strides=[(1, 1), (1, 1), (1, 1), (1, 1)], + pool_kernel_sizes=[(2, 1), (2, 1)], + pool_strides=None, + pool_paddings=None, + activations=[torch.nn.ReLU()], + out_features=384, + ), ) - frontend = ModuleFactoryV1(VGG4LayerActFrontendV1, frontend_cfg) - ff_cfg = 
conformer_parts_i6.ConformerPositionwiseFeedForwardV1Config( - input_dim=512, - hidden_dim=2048, - dropout=0.1, + input_dim=384, + hidden_dim=1536, + dropout=0.2, activation=torch.nn.SiLU(), ) mhsa_cfg = conformer_parts_i6.ConformerMHSAV1Config( - input_dim=512, - num_att_heads=8, - att_weights_dropout=0.1, - dropout=0.1, + input_dim=384, + num_att_heads=6, + att_weights_dropout=0.2, + dropout=0.2, ) conv_cfg = conformer_parts_i6.ConformerConvolutionV1Config( - channels=512, + channels=384, kernel_size=31, - dropout=0.1, + dropout=0.2, activation=torch.nn.SiLU(), - norm=torch.nn.BatchNorm1d(num_features=512, affine=False), + norm=torch.nn.BatchNorm1d(num_features=384, affine=False), ) block_cfg = conformer_i6.ConformerBlockV1Config( @@ -504,26 +568,25 @@ def get_default_config_v1(num_outputs: int) -> FFNNTransducerConfig: feature_extraction=feature_extraction, specaugment=specaugment, encoder=ModuleFactoryV1(module_class=conformer_i6.ConformerEncoderV1, cfg=conformer_cfg), - dim=512, - target_size=num_outputs, ) predictor_cfg = FFNNTransducerPredictorConfig( layers=2, layer_size=640, activation=torch.nn.Tanh(), - dropout=0.1, + dropout=0.2, context_history_size=1, context_embedding_size=256, - target_size=num_outputs, blank_id=0, + target_size=num_outputs, ) joiner_cfg = TransducerJoinerConfig( + input_size=1024, layer_size=1024, act=torch.nn.Tanh(), target_size=num_outputs, - combination_mode=CombinationMode.SUM, + combination_mode=CombinationMode.CONCAT, ) return FFNNTransducerConfig( diff --git a/users/berger/pytorch/train_steps/transducer.py b/users/berger/pytorch/train_steps/transducer.py index a9e1ea4bb..ceaae3809 100644 --- a/users/berger/pytorch/train_steps/transducer.py +++ b/users/berger/pytorch/train_steps/transducer.py @@ -8,7 +8,8 @@ def train_step(*, model: torch.nn.Module, extern_data: TensorDict, blank_idx: in audio_features = extern_data["data"].raw_tensor assert audio_features is not None - audio_features = 
map_tensor_to_minus1_plus1_interval(audio_features) + audio_features = audio_features.float() + # audio_features = map_tensor_to_minus1_plus1_interval(audio_features) assert extern_data["data"].dims[1].dyn_size_ext is not None audio_feature_lengths = extern_data["data"].dims[1].dyn_size_ext.raw_tensor @@ -34,7 +35,7 @@ def train_step(*, model: torch.nn.Module, extern_data: TensorDict, blank_idx: in ) loss = monotonic_rnnt_loss( - acts=model_logits, + acts=model_logits.to(dtype=torch.float32), labels=targets, input_lengths=input_lengths, label_lengths=target_lengths, diff --git a/users/berger/recipe/rasr/label_tree_and_scorer.py b/users/berger/recipe/rasr/label_tree_and_scorer.py index 1b0c89493..cd1451d55 100644 --- a/users/berger/recipe/rasr/label_tree_and_scorer.py +++ b/users/berger/recipe/rasr/label_tree_and_scorer.py @@ -1,11 +1,13 @@ __all__ = ["LabelTree", "LabelScorer"] from typing import Any, Dict, Optional + +from i6_core import rasr from i6_experiments.users.berger import helpers -from sisyphus import * +from sisyphus import tk, setup_path +assert __package__ is not None Path = setup_path(__package__) -from i6_core import rasr class LabelTree: @@ -62,6 +64,7 @@ def apply_config( "tf-attention", "tf-rnn-transducer", "tf-ffnn-transducer", + "onnx-ffnn-transducer", ] diff --git a/users/berger/settings.py b/users/berger/settings.py index ed5e6899f..04308e1af 100644 --- a/users/berger/settings.py +++ b/users/berger/settings.py @@ -1,3 +1,4 @@ +import getpass import sys sys.path.append("/u/beck/dev/cachemanager/") @@ -10,6 +11,13 @@ def file_caching(path, is_output=False): return "`cf %s`" % path +CPU_SLOW_JOBLIST = [ + "ScliteJob", + "Hub5ScoreJob", + "PipelineJob", +] + + def check_engine_limits(current_rqmt, task): """ i6 support for gpu_mem @@ -20,6 +28,10 @@ def check_engine_limits(current_rqmt, task): current_rqmt["sbatch_args"] = ["-p", "gpu_24gb"] else: current_rqmt["sbatch_args"] = ["-p", "gpu_11gb"] + + if task._job.__class__.__name__ in 
CPU_SLOW_JOBLIST: + current_rqmt["sbatch_args"] = ["-p", "cpu_slow"] + return current_rqmt @@ -39,7 +51,7 @@ def engine(): return EngineSelector( engines={ - "short": LocalEngine(cpus=4, mem=8), + "short": LocalEngine(cpus=4, mem=16), "long": SimpleLinuxUtilityForResourceManagementEngine(default_rqmt=default_rqmt), }, default_engine="long", @@ -148,4 +160,11 @@ def worker_wrapper(job, task_name, call): ) TMP_PREFIX = "/var/tmp/" -DEFAULT_ENVIRONMENT_SET["TMPDIR"] = TMP_PREFIX +DEFAULT_ENVIRONMENT_SET.update( + { + "TMPDIR": TMP_PREFIX, + "TMP": TMP_PREFIX, + "NUMBA_CACHE_DIR": f"{TMP_PREFIX}/numba_cache_{getpass.getuser()}", # used for librosa + "PYTORCH_KERNEL_CACHE_PATH": f"{TMP_PREFIX}/pt_kernel_cache_{getpass.getuser()}", # used for cuda pytorch + } +) diff --git a/users/berger/systems/dataclasses.py b/users/berger/systems/dataclasses.py index ba2a4b831..4af5faf99 100644 --- a/users/berger/systems/dataclasses.py +++ b/users/berger/systems/dataclasses.py @@ -98,6 +98,7 @@ class FeatureType(Enum): CONCAT_SEC_GAMMATONE_16K = auto() CONCAT_MIX_GAMMATONE_16K = auto() CONCAT_SEC_MIX_GAMMATONE_16K = auto() + LOGMEL_16K = auto() @dataclass diff --git a/users/berger/systems/functors/rasr_base.py b/users/berger/systems/functors/rasr_base.py index 6f81dc284..b14fac1dd 100644 --- a/users/berger/systems/functors/rasr_base.py +++ b/users/berger/systems/functors/rasr_base.py @@ -117,6 +117,7 @@ def _make_base_feature_flow( feature_type.CONCAT_SEC_MIX_GAMMATONE_16K: partial( self._make_cached_concatenated_gt_feature_flow_16k, use_sec=True, use_mix=True ), + feature_type.LOGMEL_16K: self._make_base_logmel_feature_flow_16k, }[feature_type](corpus_info=corpus_info, **kwargs) def _make_base_sample_feature_flow(self, corpus_info: dataclasses.CorpusInfo, dc_detection: bool = False, **kwargs): @@ -218,11 +219,13 @@ def _make_cached_concatenated_gt_feature_flow_16k( return features.basic_cache_flow(cache_files=cache_files) def _make_base_logmel_feature_flow_16k(self, corpus_info: 
dataclasses.CorpusInfo, dc_detection: bool = False, **_): - gt_options = copy.deepcopy(get_feature_extraction_args_16kHz(dc_detection=dc_detection)["gt"]["gt_options"]) + filterbank_options = copy.deepcopy( + get_feature_extraction_args_16kHz(dc_detection=dc_detection)["filterbank"]["filterbank_options"] + ) audio_format = corpus_info.crp.audio_format - gt_options["samples_options"]["audio_format"] = audio_format - gt_options["add_features_output"] = True - return features.gammatone_flow(**gt_options) + filterbank_options["samples_options"]["audio_format"] = audio_format + filterbank_options["add_features_output"] = True + return features.filterbank_flow(**filterbank_options) @lru_cache_with_signature def _get_checkpoint( diff --git a/users/berger/systems/functors/recognition/returnn_search.py b/users/berger/systems/functors/recognition/returnn_search.py index c9ece62d8..fa2691880 100644 --- a/users/berger/systems/functors/recognition/returnn_search.py +++ b/users/berger/systems/functors/recognition/returnn_search.py @@ -137,6 +137,7 @@ def __call__( returnn_root=self.returnn_root, returnn_python_exe=self.returnn_python_exe, output_files=["search_out.py"], + mem_rqmt=8, ) if isinstance(epoch, str): diff --git a/users/berger/systems/functors/recognition/seq2seq_search.py b/users/berger/systems/functors/recognition/seq2seq_search.py index ba70e8200..7190de29d 100644 --- a/users/berger/systems/functors/recognition/seq2seq_search.py +++ b/users/berger/systems/functors/recognition/seq2seq_search.py @@ -112,6 +112,7 @@ def __call__( label_scorer=label_scorer, base_feature_flow=base_feature_flow, onnx_model=onnx_model, + feature_type=feature_type, **model_flow_args, ) else: @@ -128,6 +129,7 @@ def __call__( base_feature_flow=base_feature_flow, enc_onnx_model=enc_model, dec_onnx_model=dec_model, + feature_type=feature_type, **model_flow_args, ) else: diff --git a/users/berger/systems/functors/seq2seq_base.py b/users/berger/systems/functors/seq2seq_base.py index 
3e63d22ac..ee897cae3 100644 --- a/users/berger/systems/functors/seq2seq_base.py +++ b/users/berger/systems/functors/seq2seq_base.py @@ -34,20 +34,17 @@ def _make_onnx_enc_dec_config_for_label_scorer( label_scorer: custom_rasr.LabelScorer, enc_onnx_model: tk.Path, dec_onnx_model: tk.Path, - enc_features_name: str = "features", - enc_features_size: str = "features:size1", - enc_output_name: str = "encoder", - enc_output_size: str = "encoder:size1", - dec_features_name: str = "encoder", - dec_features_size: str = "encoder:size1", - dec_history_name: str = "history", + enc_features_name: str = "sources", + enc_features_size: str = "sources:size1", + enc_output_name: str = "source_encodings", + dec_features_name: str = "source_encodings", + dec_history_name: str = "targets", dec_output_name: str = "log_probs", ) -> None: encoder_io_map = rasr.RasrConfig() encoder_io_map.features = enc_features_name encoder_io_map.features_size = enc_features_size encoder_io_map.encoder_output = enc_output_name - encoder_io_map.encoder_output_size = enc_output_size encoder_session = rasr.RasrConfig() encoder_session.file = enc_onnx_model @@ -56,7 +53,6 @@ def _make_onnx_enc_dec_config_for_label_scorer( decoder_io_map = rasr.RasrConfig() decoder_io_map.encoder_output = dec_features_name - decoder_io_map.encoder_output_size = dec_features_size decoder_io_map.feedback = dec_history_name decoder_io_map.output = dec_output_name @@ -65,10 +61,10 @@ def _make_onnx_enc_dec_config_for_label_scorer( decoder_session.inter_op_num_threads = 2 decoder_session.intra_op_num_threads = 2 - label_scorer.apply_config("encoder-io-map", encoder_io_map) - label_scorer.apply_config("encoder-session", encoder_session) - label_scorer.apply_config("decoder-io-map", decoder_io_map) - label_scorer.apply_config("decoder-session", decoder_session) + label_scorer.config.encoder_io_map = encoder_io_map + label_scorer.config.encoder_session = encoder_session + label_scorer.config.decoder_io_map = decoder_io_map + 
label_scorer.config.decoder_session = decoder_session def _get_tf_feature_flow_for_label_scorer( self, From 53ac13b5026d5dc65f20b3be65fba262eb292a73 Mon Sep 17 00:00:00 2001 From: schmitt Date: Thu, 16 May 2024 10:19:52 +0200 Subject: [PATCH 015/227] update --- .../returnn/config_builder_rf/base.py | 6 +- .../returnn/network_builder_rf/base.py | 144 +---- .../returnn/network_builder_rf/base_old.py | 203 +++++++ .../network_builder_rf/encoder/__init__.py | 0 .../network_builder_rf/encoder/global_.py | 157 ++++++ .../network_builder_rf/global_/decoder.py | 85 +++ .../network_builder_rf/global_/model.py | 137 ++--- .../global_/model_import.py | 172 ++++++ .../global_/model_old/README | 2 + .../global_/model_old/__init__.py | 0 .../global_/model_old/model.py | 220 ++++++++ .../global_/model_old/model_import.py | 172 ++++++ .../global_/model_old/recog.py | 315 +++++++++++ .../global_/model_old/train.py | 115 ++++ .../network_builder_rf/global_/recog.py | 39 +- .../network_builder_rf/global_/train.py | 12 +- .../network_builder_rf/segmental/model.py | 344 +++--------- .../segmental/model_new/__init__.py | 0 .../model_new/blank_model/__init__.py | 0 .../segmental/model_new/blank_model/model.py | 92 +++ .../segmental/model_new/blank_model/train.py | 55 ++ .../model_new/label_model/__init__.py | 0 .../segmental/model_new/label_model/model.py | 246 ++++++++ .../segmental/model_new/label_model/train.py | 75 +++ .../segmental/model_old/README | 2 + .../segmental/model_old/__init__.py | 0 .../segmental/model_old/model.py | 434 +++++++++++++++ .../segmental/{ => model_old}/model_import.py | 0 .../segmental/model_old/recog.py | 527 ++++++++++++++++++ .../segmental/model_old/train.py | 280 ++++++++++ .../network_builder_rf/segmental/recog.py | 66 +-- .../network_builder_rf/segmental/train.py | 182 ++---- .../network_builder_rf/segmental/utils.py | 43 +- .../center_window_att/baseline_v1/__init__.py | 51 +- .../center_window_att/baseline_v1/baseline.py | 9 +- 
.../center_window_att/train.py | 13 +- .../pipelines/pipeline_ls_conf/checkpoints.py | 4 +- .../global_att/baseline_v1/__init__.py | 15 +- 38 files changed, 3469 insertions(+), 748 deletions(-) create mode 100644 users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/base_old.py create mode 100644 users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/encoder/__init__.py create mode 100644 users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/encoder/global_.py create mode 100644 users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/global_/decoder.py create mode 100644 users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/global_/model_import.py create mode 100644 users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/global_/model_old/README create mode 100644 users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/global_/model_old/__init__.py create mode 100644 users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/global_/model_old/model.py create mode 100644 users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/global_/model_old/model_import.py create mode 100644 users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/global_/model_old/recog.py create mode 100644 users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/global_/model_old/train.py create mode 100644 
users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/model_new/__init__.py create mode 100644 users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/model_new/blank_model/__init__.py create mode 100644 users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/model_new/blank_model/model.py create mode 100644 users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/model_new/blank_model/train.py create mode 100644 users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/model_new/label_model/__init__.py create mode 100644 users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/model_new/label_model/model.py create mode 100644 users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/model_new/label_model/train.py create mode 100644 users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/model_old/README create mode 100644 users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/model_old/__init__.py create mode 100644 users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/model_old/model.py rename users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/{ => model_old}/model_import.py (100%) create mode 100644 
users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/model_old/recog.py create mode 100644 users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/model_old/train.py diff --git a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/config_builder_rf/base.py b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/config_builder_rf/base.py index 264a64b4f..156565816 100644 --- a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/config_builder_rf/base.py +++ b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/config_builder_rf/base.py @@ -467,6 +467,7 @@ class SegmentalAttConfigBuilderRF(LibrispeechConformerConfigBuilderRF): def __init__( self, center_window_size: int, + decoder_version: Optional[int] = None, length_model_opts: Optional[Dict] = None, **kwargs ): @@ -476,8 +477,9 @@ def __init__( center_window_size=center_window_size, )) - if length_model_opts is not None: - self.config_dict["length_model_opts"] = length_model_opts + print(decoder_version) + if decoder_version: + self.config_dict["label_decoder_version"] = decoder_version def get_recog_config(self, opts: Dict): recog_config = super(SegmentalAttConfigBuilderRF, self).get_recog_config(opts) diff --git a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/base.py b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/base.py index 0f6fb5c3b..8f02d397a 100644 --- a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/base.py +++ 
b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/base.py @@ -5,7 +5,6 @@ from returnn.tensor import Tensor, Dim, single_step_dim import returnn.frontend as rf -from returnn.frontend.encoder.conformer import ConformerEncoder, ConformerConvSubsample from i6_experiments.users.schmitt.returnn_frontend.model_interfaces.supports_label_scorer_torch import RFModelWithMakeLabelScorer @@ -13,55 +12,24 @@ _batch_size_factor = 160 -class BaseModel(rf.Module): +class BaseLabelDecoder(rf.Module): def __init__( self, - in_dim: Dim, *, - num_enc_layers: int = 12, + enc_out_dim: Dim, target_dim: Dim, - wb_target_dim: Optional[Dim] = None, blank_idx: int, - enc_aux_logits: Sequence[int] = (), # layers - enc_model_dim: Dim = Dim(name="enc", dimension=512), - enc_ff_dim: Dim = Dim(name="enc-ff", dimension=2048), - enc_att_num_heads: int = 4, - enc_conformer_layer_opts: Optional[Dict[str, Any]] = None, enc_key_total_dim: Dim = Dim(name="enc_key_total_dim", dimension=1024), att_num_heads: Dim = Dim(name="att_num_heads", dimension=1), att_dropout: float = 0.1, - enc_dropout: float = 0.1, - enc_att_dropout: float = 0.1, l2: float = 0.0001, language_model: Optional[RFModelWithMakeLabelScorer] = None, ): - super(BaseModel, self).__init__() - - from returnn.config import get_global_config - - config = get_global_config(return_empty_if_none=True) - - self.in_dim = in_dim - self.encoder = ConformerEncoder( - in_dim, - enc_model_dim, - ff_dim=enc_ff_dim, - input_layer=ConformerConvSubsample( - in_dim, - out_dims=[Dim(32, name="conv1"), Dim(64, name="conv2"), Dim(64, name="conv3")], - filter_sizes=[(3, 3), (3, 3), (3, 3)], - pool_sizes=[(1, 2)], - strides=[(1, 1), (3, 1), (2, 1)], - ), - encoder_layer_opts=enc_conformer_layer_opts, - num_layers=num_enc_layers, - num_heads=enc_att_num_heads, - dropout=enc_dropout, - att_dropout=enc_att_dropout, - ) + super(BaseLabelDecoder, self).__init__() self.target_dim = target_dim 
self.blank_idx = blank_idx + self.enc_out_dim = enc_out_dim self.enc_key_total_dim = enc_key_total_dim self.enc_key_per_head_dim = enc_key_total_dim.div_left(att_num_heads) @@ -69,16 +37,10 @@ def __init__( self.att_dropout = att_dropout self.dropout_broadcast = rf.dropout_broadcast_default() - self.enc_ctx = rf.Linear(self.encoder.out_dim, enc_key_total_dim) - self.enc_ctx_dropout = 0.2 - self.enc_win_dim = Dim(name="enc_win_dim", dimension=5) - - self.inv_fertility = rf.Linear(self.encoder.out_dim, att_num_heads, with_bias=False) - self.target_embed = rf.Embedding(target_dim, Dim(name="target_embed", dimension=640)) self.s = rf.ZoneoutLSTM( - self.target_embed.out_dim + att_num_heads * self.encoder.out_dim, + self.target_embed.out_dim + att_num_heads * enc_out_dim, Dim(name="lstm", dimension=1024), zoneout_factor_cell=0.15, zoneout_factor_output=0.05, @@ -93,7 +55,7 @@ def __init__( self.s_transformed = rf.Linear(self.s.out_dim, enc_key_total_dim, with_bias=False) self.energy = rf.Linear(enc_key_total_dim, att_num_heads, with_bias=False) self.readout_in = rf.Linear( - self.s.out_dim + self.target_embed.out_dim + att_num_heads * self.encoder.out_dim, + self.s.out_dim + self.target_embed.out_dim + att_num_heads * enc_out_dim, Dim(name="readout", dimension=1024), ) self.output_prob = rf.Linear(self.readout_in.out_dim // 2, target_dim) @@ -101,103 +63,9 @@ def __init__( for p in self.parameters(): p.weight_decay = l2 - if enc_aux_logits: - if not wb_target_dim: - wb_target_dim = target_dim + 1 - for i in enc_aux_logits: - setattr(self, f"enc_aux_logits_{i}", rf.Linear(self.encoder.out_dim, wb_target_dim)) - - self._specaugment_opts = { - "steps": config.typed_value("specaugment_steps") or (0, 1000, 2000), - "max_consecutive_spatial_dims": config.typed_value("specaugment_max_consecutive_spatial_dims") or 20, - "max_consecutive_feature_dims": config.typed_value("specaugment_max_consecutive_feature_dims") - or (_log_mel_feature_dim // 5), - "num_spatial_mask_factor": 
config.typed_value("specaugment_num_spatial_mask_factor") or 100, - } - - self._pretrain_opts: Optional[Dict[str, Any]] = config.typed_value("pretrain_opts") - - self._mixup = None - if config.typed_value("mixup", None) is not None: - from i6_experiments.users.schmitt.returnn_frontend.models.rf_mixup import Mixup, MixupOpts - - self._mixup = Mixup(feature_dim=self.in_dim, opts=MixupOpts(**config.typed_value("mixup"))) - # Note: Even though we have this here, it is not used in loop_step or decode_logits. # Instead, it is intended to make a separate label scorer for it. self.language_model = None self.language_model_make_label_scorer = None if language_model: self.language_model, self.language_model_make_label_scorer = language_model - - def encode( - self, - source: Tensor, - *, - in_spatial_dim: Dim, - collected_outputs: Optional[Dict[str, Tensor]] = None, - ) -> Tuple[Dict[str, Tensor], Dim]: - """encode, and extend the encoder output for things we need in the decoder""" - # log mel filterbank features - source, in_spatial_dim = rf.audio.log_mel_filterbank_from_raw( - source, - in_spatial_dim=in_spatial_dim, - out_dim=self.in_dim, - sampling_rate=16_000, - log_base=math.exp(2.3026), # almost 10.0 but not exactly... 
- ) - if self._mixup: - source = self._mixup(source, spatial_dim=in_spatial_dim) - # SpecAugment - source = rf.audio.specaugment( - source, - spatial_dim=in_spatial_dim, - feature_dim=self.in_dim, - **self._specaugment_opts, - ) - # Encoder including convolutional frontend - with _opt_apply_pretrain_to_encoder(self.encoder, collected_outputs, self._pretrain_opts): - enc, enc_spatial_dim = self.encoder( - source, in_spatial_dim=in_spatial_dim, collected_outputs=collected_outputs - ) - enc_ctx = self.enc_ctx(enc) - inv_fertility = rf.sigmoid(self.inv_fertility(enc)) - return dict(enc=enc, enc_ctx=enc_ctx, inv_fertility=inv_fertility), enc_spatial_dim - - -@contextlib.contextmanager -def _opt_apply_pretrain_to_encoder( - encoder: ConformerEncoder, collected_outputs: Optional[Dict[str, Tensor]], pretrain_opts: Optional[Dict[str, Any]] -): - """Function is run within RETURNN.""" - if not pretrain_opts: - yield - return - step = rf.get_run_ctx().step - steps: Union[Sequence[Tuple[int, Dict[str, Any]]], Dict[int, Dict[str, Any]]] = pretrain_opts["steps"] - if isinstance(steps, (list, tuple)): - steps_ = {} - step_bound = 0 - for step_bound_rel, opts in steps: - step_bound += step_bound_rel - steps_[step_bound] = opts - steps = steps_ - assert isinstance(steps, dict) - for step_bound, opts in sorted(steps.items()): - if step < step_bound: - assert isinstance(opts, dict) - opts_ = opts.copy() - # somewhat hacky but that is still the easiest way I can think of, without touching a lot of other code - pretrain_num_layers = opts_.pop("num_layers") - assert not opts_, f"unhandled opts: {opts_} in opts {opts} for step bound {step_bound}" - orig_layers = encoder.layers[:] - del encoder.layers[pretrain_num_layers:] - yield - encoder.layers[:] = orig_layers - if collected_outputs is not None: - assert len(collected_outputs) == pretrain_num_layers - for i in range(pretrain_num_layers, len(orig_layers)): - collected_outputs[str(i)] = collected_outputs[str(pretrain_num_layers - 1)] - 
return - yield - return diff --git a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/base_old.py b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/base_old.py new file mode 100644 index 000000000..0f6fb5c3b --- /dev/null +++ b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/base_old.py @@ -0,0 +1,203 @@ +from typing import Optional, Dict, Any, Sequence, Tuple, List, Union, TYPE_CHECKING +import contextlib +import math +import functools + +from returnn.tensor import Tensor, Dim, single_step_dim +import returnn.frontend as rf +from returnn.frontend.encoder.conformer import ConformerEncoder, ConformerConvSubsample + +from i6_experiments.users.schmitt.returnn_frontend.model_interfaces.supports_label_scorer_torch import RFModelWithMakeLabelScorer + +_log_mel_feature_dim = 80 +_batch_size_factor = 160 + + +class BaseModel(rf.Module): + def __init__( + self, + in_dim: Dim, + *, + num_enc_layers: int = 12, + target_dim: Dim, + wb_target_dim: Optional[Dim] = None, + blank_idx: int, + enc_aux_logits: Sequence[int] = (), # layers + enc_model_dim: Dim = Dim(name="enc", dimension=512), + enc_ff_dim: Dim = Dim(name="enc-ff", dimension=2048), + enc_att_num_heads: int = 4, + enc_conformer_layer_opts: Optional[Dict[str, Any]] = None, + enc_key_total_dim: Dim = Dim(name="enc_key_total_dim", dimension=1024), + att_num_heads: Dim = Dim(name="att_num_heads", dimension=1), + att_dropout: float = 0.1, + enc_dropout: float = 0.1, + enc_att_dropout: float = 0.1, + l2: float = 0.0001, + language_model: Optional[RFModelWithMakeLabelScorer] = None, + ): + super(BaseModel, self).__init__() + + from returnn.config import get_global_config + + config = get_global_config(return_empty_if_none=True) + + self.in_dim = in_dim + self.encoder = ConformerEncoder( + in_dim, + enc_model_dim, + 
ff_dim=enc_ff_dim, + input_layer=ConformerConvSubsample( + in_dim, + out_dims=[Dim(32, name="conv1"), Dim(64, name="conv2"), Dim(64, name="conv3")], + filter_sizes=[(3, 3), (3, 3), (3, 3)], + pool_sizes=[(1, 2)], + strides=[(1, 1), (3, 1), (2, 1)], + ), + encoder_layer_opts=enc_conformer_layer_opts, + num_layers=num_enc_layers, + num_heads=enc_att_num_heads, + dropout=enc_dropout, + att_dropout=enc_att_dropout, + ) + + self.target_dim = target_dim + self.blank_idx = blank_idx + + self.enc_key_total_dim = enc_key_total_dim + self.enc_key_per_head_dim = enc_key_total_dim.div_left(att_num_heads) + self.att_num_heads = att_num_heads + self.att_dropout = att_dropout + self.dropout_broadcast = rf.dropout_broadcast_default() + + self.enc_ctx = rf.Linear(self.encoder.out_dim, enc_key_total_dim) + self.enc_ctx_dropout = 0.2 + self.enc_win_dim = Dim(name="enc_win_dim", dimension=5) + + self.inv_fertility = rf.Linear(self.encoder.out_dim, att_num_heads, with_bias=False) + + self.target_embed = rf.Embedding(target_dim, Dim(name="target_embed", dimension=640)) + + self.s = rf.ZoneoutLSTM( + self.target_embed.out_dim + att_num_heads * self.encoder.out_dim, + Dim(name="lstm", dimension=1024), + zoneout_factor_cell=0.15, + zoneout_factor_output=0.05, + use_zoneout_output=False, # like RETURNN/TF ZoneoutLSTM old default + # parts_order="icfo", # like RETURNN/TF ZoneoutLSTM + # parts_order="ifco", + parts_order="jifo", # NativeLSTM (the code above converts it...) 
+ forget_bias=0.0, # the code above already adds it during conversion + ) + + self.weight_feedback = rf.Linear(att_num_heads, enc_key_total_dim, with_bias=False) + self.s_transformed = rf.Linear(self.s.out_dim, enc_key_total_dim, with_bias=False) + self.energy = rf.Linear(enc_key_total_dim, att_num_heads, with_bias=False) + self.readout_in = rf.Linear( + self.s.out_dim + self.target_embed.out_dim + att_num_heads * self.encoder.out_dim, + Dim(name="readout", dimension=1024), + ) + self.output_prob = rf.Linear(self.readout_in.out_dim // 2, target_dim) + + for p in self.parameters(): + p.weight_decay = l2 + + if enc_aux_logits: + if not wb_target_dim: + wb_target_dim = target_dim + 1 + for i in enc_aux_logits: + setattr(self, f"enc_aux_logits_{i}", rf.Linear(self.encoder.out_dim, wb_target_dim)) + + self._specaugment_opts = { + "steps": config.typed_value("specaugment_steps") or (0, 1000, 2000), + "max_consecutive_spatial_dims": config.typed_value("specaugment_max_consecutive_spatial_dims") or 20, + "max_consecutive_feature_dims": config.typed_value("specaugment_max_consecutive_feature_dims") + or (_log_mel_feature_dim // 5), + "num_spatial_mask_factor": config.typed_value("specaugment_num_spatial_mask_factor") or 100, + } + + self._pretrain_opts: Optional[Dict[str, Any]] = config.typed_value("pretrain_opts") + + self._mixup = None + if config.typed_value("mixup", None) is not None: + from i6_experiments.users.schmitt.returnn_frontend.models.rf_mixup import Mixup, MixupOpts + + self._mixup = Mixup(feature_dim=self.in_dim, opts=MixupOpts(**config.typed_value("mixup"))) + + # Note: Even though we have this here, it is not used in loop_step or decode_logits. + # Instead, it is intended to make a separate label scorer for it. 
+ self.language_model = None + self.language_model_make_label_scorer = None + if language_model: + self.language_model, self.language_model_make_label_scorer = language_model + + def encode( + self, + source: Tensor, + *, + in_spatial_dim: Dim, + collected_outputs: Optional[Dict[str, Tensor]] = None, + ) -> Tuple[Dict[str, Tensor], Dim]: + """encode, and extend the encoder output for things we need in the decoder""" + # log mel filterbank features + source, in_spatial_dim = rf.audio.log_mel_filterbank_from_raw( + source, + in_spatial_dim=in_spatial_dim, + out_dim=self.in_dim, + sampling_rate=16_000, + log_base=math.exp(2.3026), # almost 10.0 but not exactly... + ) + if self._mixup: + source = self._mixup(source, spatial_dim=in_spatial_dim) + # SpecAugment + source = rf.audio.specaugment( + source, + spatial_dim=in_spatial_dim, + feature_dim=self.in_dim, + **self._specaugment_opts, + ) + # Encoder including convolutional frontend + with _opt_apply_pretrain_to_encoder(self.encoder, collected_outputs, self._pretrain_opts): + enc, enc_spatial_dim = self.encoder( + source, in_spatial_dim=in_spatial_dim, collected_outputs=collected_outputs + ) + enc_ctx = self.enc_ctx(enc) + inv_fertility = rf.sigmoid(self.inv_fertility(enc)) + return dict(enc=enc, enc_ctx=enc_ctx, inv_fertility=inv_fertility), enc_spatial_dim + + +@contextlib.contextmanager +def _opt_apply_pretrain_to_encoder( + encoder: ConformerEncoder, collected_outputs: Optional[Dict[str, Tensor]], pretrain_opts: Optional[Dict[str, Any]] +): + """Function is run within RETURNN.""" + if not pretrain_opts: + yield + return + step = rf.get_run_ctx().step + steps: Union[Sequence[Tuple[int, Dict[str, Any]]], Dict[int, Dict[str, Any]]] = pretrain_opts["steps"] + if isinstance(steps, (list, tuple)): + steps_ = {} + step_bound = 0 + for step_bound_rel, opts in steps: + step_bound += step_bound_rel + steps_[step_bound] = opts + steps = steps_ + assert isinstance(steps, dict) + for step_bound, opts in sorted(steps.items()): + 
if step < step_bound: + assert isinstance(opts, dict) + opts_ = opts.copy() + # somewhat hacky but that is still the easiest way I can think of, without touching a lot of other code + pretrain_num_layers = opts_.pop("num_layers") + assert not opts_, f"unhandled opts: {opts_} in opts {opts} for step bound {step_bound}" + orig_layers = encoder.layers[:] + del encoder.layers[pretrain_num_layers:] + yield + encoder.layers[:] = orig_layers + if collected_outputs is not None: + assert len(collected_outputs) == pretrain_num_layers + for i in range(pretrain_num_layers, len(orig_layers)): + collected_outputs[str(i)] = collected_outputs[str(pretrain_num_layers - 1)] + return + yield + return diff --git a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/encoder/__init__.py b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/encoder/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/encoder/global_.py b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/encoder/global_.py new file mode 100644 index 000000000..ea397b0c9 --- /dev/null +++ b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/encoder/global_.py @@ -0,0 +1,157 @@ +from typing import Optional, Dict, Any, Sequence, Tuple, List, Union, TYPE_CHECKING +import contextlib +import math +import functools + +from returnn.tensor import Tensor, Dim, single_step_dim +import returnn.frontend as rf +from returnn.frontend.encoder.conformer import ConformerEncoder, ConformerConvSubsample + +_log_mel_feature_dim = 80 +_batch_size_factor = 160 + + +class GlobalConformerEncoder(ConformerEncoder): + def __init__( + self, + in_dim: Dim, + out_dim: Dim = 
Dim(name="enc", dimension=512), + *, + num_layers: int = 12, + target_dim: Dim, + wb_target_dim: Optional[Dim] = None, + aux_logits: Sequence[int] = (), # layers + ff_dim: Dim = Dim(name="enc-ff", dimension=2048), + num_heads: int = 4, + encoder_layer_opts: Optional[Dict[str, Any]] = None, + enc_key_total_dim: Dim = Dim(name="enc_key_total_dim", dimension=1024), + dec_att_num_heads: Dim = Dim(name="att_num_heads", dimension=1), + dropout: float = 0.1, + att_dropout: float = 0.1, + l2: float = 0.0001, + ): + super(GlobalConformerEncoder, self).__init__( + in_dim, + out_dim, + ff_dim=ff_dim, + input_layer=ConformerConvSubsample( + in_dim, + out_dims=[Dim(32, name="conv1"), Dim(64, name="conv2"), Dim(64, name="conv3")], + filter_sizes=[(3, 3), (3, 3), (3, 3)], + pool_sizes=[(1, 2)], + strides=[(1, 1), (3, 1), (2, 1)], + ), + encoder_layer_opts=encoder_layer_opts, + num_layers=num_layers, + num_heads=num_heads, + dropout=dropout, + att_dropout=att_dropout, + ) + + from returnn.config import get_global_config + + config = get_global_config(return_empty_if_none=True) + + # self.in_dim = in_dim + + self.enc_ctx = rf.Linear(self.out_dim, enc_key_total_dim) + self.enc_ctx_dropout = 0.2 + + self.inv_fertility = rf.Linear(self.out_dim, dec_att_num_heads, with_bias=False) + + for p in self.parameters(): + p.weight_decay = l2 + + if aux_logits: + if not wb_target_dim: + wb_target_dim = target_dim + 1 + for i in aux_logits: + setattr(self, f"enc_aux_logits_{i}", rf.Linear(self.out_dim, wb_target_dim)) + + self._specaugment_opts = { + "steps": config.typed_value("specaugment_steps") or (0, 1000, 2000), + "max_consecutive_spatial_dims": config.typed_value("specaugment_max_consecutive_spatial_dims") or 20, + "max_consecutive_feature_dims": config.typed_value("specaugment_max_consecutive_feature_dims") + or (_log_mel_feature_dim // 5), + "num_spatial_mask_factor": config.typed_value("specaugment_num_spatial_mask_factor") or 100, + } + + self._pretrain_opts: Optional[Dict[str, Any]] 
= config.typed_value("pretrain_opts") + + self._mixup = None + if config.typed_value("mixup", None) is not None: + from i6_experiments.users.schmitt.returnn_frontend.models.rf_mixup import Mixup, MixupOpts + + self._mixup = Mixup(feature_dim=self.in_dim, opts=MixupOpts(**config.typed_value("mixup"))) + + def encode( + self, + source: Tensor, + *, + in_spatial_dim: Dim, + collected_outputs: Optional[Dict[str, Tensor]] = None, + ) -> Tuple[Dict[str, Tensor], Dim]: + """encode, and extend the encoder output for things we need in the decoder""" + # log mel filterbank features + source, in_spatial_dim = rf.audio.log_mel_filterbank_from_raw( + source, + in_spatial_dim=in_spatial_dim, + out_dim=self.in_dim, + sampling_rate=16_000, + log_base=math.exp(2.3026), # almost 10.0 but not exactly... + ) + if self._mixup: + source = self._mixup(source, spatial_dim=in_spatial_dim) + # SpecAugment + source = rf.audio.specaugment( + source, + spatial_dim=in_spatial_dim, + feature_dim=self.in_dim, + **self._specaugment_opts, + ) + # Encoder including convolutional frontend + with _opt_apply_pretrain_to_encoder(self, collected_outputs, self._pretrain_opts): + enc, enc_spatial_dim = self( + source, in_spatial_dim=in_spatial_dim, collected_outputs=collected_outputs + ) + enc_ctx = self.enc_ctx(enc) + inv_fertility = rf.sigmoid(self.inv_fertility(enc)) + return dict(enc=enc, enc_ctx=enc_ctx, inv_fertility=inv_fertility), enc_spatial_dim + + +@contextlib.contextmanager +def _opt_apply_pretrain_to_encoder( + encoder: ConformerEncoder, collected_outputs: Optional[Dict[str, Tensor]], pretrain_opts: Optional[Dict[str, Any]] +): + """Function is run within RETURNN.""" + if not pretrain_opts: + yield + return + step = rf.get_run_ctx().step + steps: Union[Sequence[Tuple[int, Dict[str, Any]]], Dict[int, Dict[str, Any]]] = pretrain_opts["steps"] + if isinstance(steps, (list, tuple)): + steps_ = {} + step_bound = 0 + for step_bound_rel, opts in steps: + step_bound += step_bound_rel + 
steps_[step_bound] = opts + steps = steps_ + assert isinstance(steps, dict) + for step_bound, opts in sorted(steps.items()): + if step < step_bound: + assert isinstance(opts, dict) + opts_ = opts.copy() + # somewhat hacky but that is still the easiest way I can think of, without touching a lot of other code + pretrain_num_layers = opts_.pop("num_layers") + assert not opts_, f"unhandled opts: {opts_} in opts {opts} for step bound {step_bound}" + orig_layers = encoder.layers[:] + del encoder.layers[pretrain_num_layers:] + yield + encoder.layers[:] = orig_layers + if collected_outputs is not None: + assert len(collected_outputs) == pretrain_num_layers + for i in range(pretrain_num_layers, len(orig_layers)): + collected_outputs[str(i)] = collected_outputs[str(pretrain_num_layers - 1)] + return + yield + return diff --git a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/global_/decoder.py b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/global_/decoder.py new file mode 100644 index 000000000..d55b163ae --- /dev/null +++ b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/global_/decoder.py @@ -0,0 +1,85 @@ +from typing import Optional, Dict, Any, Sequence, Tuple, List +import functools + +from returnn.tensor import Tensor, Dim, single_step_dim +import returnn.frontend as rf + +from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.network_builder_rf.base import BaseLabelDecoder + + +class GlobalAttDecoder(BaseLabelDecoder): + def __init__(self, eos_idx: int, **kwargs): + super(GlobalAttDecoder, self).__init__(**kwargs) + + self.eos_idx = eos_idx + self.bos_idx = eos_idx + + def decoder_default_initial_state(self, *, batch_dims: Sequence[Dim], enc_spatial_dim: Dim) -> rf.State: + """Default initial state""" + state = 
rf.State( + s=self.s.default_initial_state(batch_dims=batch_dims), + att=rf.zeros(list(batch_dims) + [self.att_num_heads * self.enc_out_dim]), + accum_att_weights=rf.zeros( + list(batch_dims) + [enc_spatial_dim, self.att_num_heads], feature_dim=self.att_num_heads + ), + ) + state.att.feature_dim_axis = len(state.att.dims) - 1 + return state + + def loop_step_output_templates(self, batch_dims: List[Dim]) -> Dict[str, Tensor]: + """loop step out""" + return { + "s": Tensor( + "s", dims=batch_dims + [self.s.out_dim], dtype=rf.get_default_float_dtype(), feature_dim_axis=-1 + ), + "att": Tensor( + "att", + dims=batch_dims + [self.att_num_heads * self.enc_out_dim], + dtype=rf.get_default_float_dtype(), + feature_dim_axis=-1, + ), + } + + def loop_step( + self, + *, + enc: rf.Tensor, + enc_ctx: rf.Tensor, + inv_fertility: rf.Tensor, + enc_spatial_dim: Dim, + input_embed: rf.Tensor, + state: Optional[rf.State] = None, + ) -> Tuple[Dict[str, rf.Tensor], rf.State]: + """step of the inner loop""" + if state is None: + batch_dims = enc.remaining_dims( + remove=(enc.feature_dim, enc_spatial_dim) if enc_spatial_dim != single_step_dim else (enc.feature_dim,) + ) + state = self.decoder_default_initial_state(batch_dims=batch_dims, enc_spatial_dim=enc_spatial_dim) + state_ = rf.State() + + prev_att = state.att + + s, state_.s = self.s(rf.concat_features(input_embed, prev_att), state=state.s, spatial_dim=single_step_dim) + + weight_feedback = self.weight_feedback(state.accum_att_weights) + s_transformed = self.s_transformed(s) + energy_in = enc_ctx + weight_feedback + s_transformed + energy = self.energy(rf.tanh(energy_in)) + att_weights = rf.softmax(energy, axis=enc_spatial_dim) + + state_.accum_att_weights = state.accum_att_weights + att_weights * inv_fertility * 0.5 + att0 = rf.dot(att_weights, enc, reduce=enc_spatial_dim, use_mask=False) + att0.feature_dim = self.enc_out_dim + att, _ = rf.merge_dims(att0, dims=(self.att_num_heads, self.enc_out_dim)) + state_.att = att + + return 
{"s": s, "att": att}, state_ + + def decode_logits(self, *, s: Tensor, input_embed: Tensor, att: Tensor) -> Tensor: + """logits for the decoder""" + readout_in = self.readout_in(rf.concat_features(s, input_embed, att)) + readout = rf.reduce_out(readout_in, mode="max", num_pieces=2, out_dim=self.output_prob.in_dim) + readout = rf.dropout(readout, drop_prob=0.3, axis=self.dropout_broadcast and readout.feature_dim) + logits = self.output_prob(readout) + return logits diff --git a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/global_/model.py b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/global_/model.py index 5021ff1a9..e9d4165ab 100644 --- a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/global_/model.py +++ b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/global_/model.py @@ -5,85 +5,66 @@ import returnn.frontend as rf from i6_experiments.users.schmitt.returnn_frontend.model_interfaces.model import ModelDef -from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.network_builder_rf.base import BaseModel +from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.network_builder_rf.global_.decoder import GlobalAttDecoder +from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.network_builder_rf.encoder.global_ import GlobalConformerEncoder from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.network_builder_rf.base import _batch_size_factor, _log_mel_feature_dim +from i6_experiments.users.schmitt.returnn_frontend.model_interfaces.supports_label_scorer_torch import 
RFModelWithMakeLabelScorer -class GlobalAttentionModel(BaseModel): - def __init__(self, eos_idx: int, **kwargs): - super(GlobalAttentionModel, self).__init__(**kwargs) - self.eos_idx = eos_idx - self.bos_idx = eos_idx +class GlobalAttentionModel(rf.Module): + def __init__( + self, + *, + target_dim: Dim, + blank_idx: int, + enc_key_total_dim: Dim = Dim(name="enc_key_total_dim", dimension=1024), + att_dropout: float = 0.1, + l2: float = 0.0001, + language_model: Optional[RFModelWithMakeLabelScorer] = None, + enc_in_dim: Dim, + enc_out_dim: Dim = Dim(name="enc", dimension=512), + enc_num_layers: int = 12, + enc_aux_logits: Sequence[int] = (), # layers + enc_ff_dim: Dim = Dim(name="enc-ff", dimension=2048), + enc_num_heads: int = 4, + encoder_layer_opts: Optional[Dict[str, Any]] = None, + dec_att_num_heads: Dim = Dim(name="att_num_heads", dimension=1), + enc_dropout: float = 0.1, + eos_idx: int, + ): + super(GlobalAttentionModel, self).__init__() + + self.encoder = GlobalConformerEncoder( + enc_in_dim, + enc_out_dim, + num_layers=enc_num_layers, + target_dim=target_dim, + wb_target_dim=None, + aux_logits=enc_aux_logits, + ff_dim=enc_ff_dim, + num_heads=enc_num_heads, + encoder_layer_opts=encoder_layer_opts, + enc_key_total_dim=enc_key_total_dim, + dec_att_num_heads=dec_att_num_heads, + dropout=enc_dropout, + att_dropout=att_dropout, + l2=l2, + ) - def decoder_default_initial_state(self, *, batch_dims: Sequence[Dim], enc_spatial_dim: Dim) -> rf.State: - """Default initial state""" - state = rf.State( - s=self.s.default_initial_state(batch_dims=batch_dims), - att=rf.zeros(list(batch_dims) + [self.att_num_heads * self.encoder.out_dim]), - accum_att_weights=rf.zeros( - list(batch_dims) + [enc_spatial_dim, self.att_num_heads], feature_dim=self.att_num_heads - ), + self.label_decoder = GlobalAttDecoder( + enc_out_dim=self.encoder.out_dim, + target_dim=target_dim, + att_num_heads=dec_att_num_heads, + att_dropout=att_dropout, + blank_idx=blank_idx, + 
enc_key_total_dim=enc_key_total_dim, + l2=l2, + language_model=language_model, + eos_idx=eos_idx, ) - state.att.feature_dim_axis = len(state.att.dims) - 1 - return state - - def loop_step_output_templates(self, batch_dims: List[Dim]) -> Dict[str, Tensor]: - """loop step out""" - return { - "s": Tensor( - "s", dims=batch_dims + [self.s.out_dim], dtype=rf.get_default_float_dtype(), feature_dim_axis=-1 - ), - "att": Tensor( - "att", - dims=batch_dims + [self.att_num_heads * self.encoder.out_dim], - dtype=rf.get_default_float_dtype(), - feature_dim_axis=-1, - ), - } - def loop_step( - self, - *, - enc: rf.Tensor, - enc_ctx: rf.Tensor, - inv_fertility: rf.Tensor, - enc_spatial_dim: Dim, - input_embed: rf.Tensor, - state: Optional[rf.State] = None, - ) -> Tuple[Dict[str, rf.Tensor], rf.State]: - """step of the inner loop""" - if state is None: - batch_dims = enc.remaining_dims( - remove=(enc.feature_dim, enc_spatial_dim) if enc_spatial_dim != single_step_dim else (enc.feature_dim,) - ) - state = self.decoder_default_initial_state(batch_dims=batch_dims, enc_spatial_dim=enc_spatial_dim) - state_ = rf.State() - - prev_att = state.att - - s, state_.s = self.s(rf.concat_features(input_embed, prev_att), state=state.s, spatial_dim=single_step_dim) - - weight_feedback = self.weight_feedback(state.accum_att_weights) - s_transformed = self.s_transformed(s) - energy_in = enc_ctx + weight_feedback + s_transformed - energy = self.energy(rf.tanh(energy_in)) - att_weights = rf.softmax(energy, axis=enc_spatial_dim) - - state_.accum_att_weights = state.accum_att_weights + att_weights * inv_fertility * 0.5 - att0 = rf.dot(att_weights, enc, reduce=enc_spatial_dim, use_mask=False) - att0.feature_dim = self.encoder.out_dim - att, _ = rf.merge_dims(att0, dims=(self.att_num_heads, self.encoder.out_dim)) - state_.att = att - - return {"s": s, "att": att}, state_ - - def decode_logits(self, *, s: Tensor, input_embed: Tensor, att: Tensor) -> Tensor: - """logits for the decoder""" - readout_in = 
self.readout_in(rf.concat_features(s, input_embed, att)) - readout = rf.reduce_out(readout_in, mode="max", num_pieces=2, out_dim=self.output_prob.in_dim) - readout = rf.dropout(readout, drop_prob=0.3, axis=self.dropout_broadcast and readout.feature_dim) - logits = self.output_prob(readout) - return logits + self.blank_idx = blank_idx + self.target_dim = target_dim class MakeModel: @@ -132,12 +113,12 @@ def make_model( lm = (lm, functools.partial(trafo_lm.make_label_scorer_torch, model=lm)) return GlobalAttentionModel( - in_dim=in_dim, - num_enc_layers=num_enc_layers, - enc_model_dim=Dim(name="enc", dimension=512, kind=Dim.Types.Feature), + enc_in_dim=in_dim, + enc_num_layers=num_enc_layers, + enc_out_dim=Dim(name="enc", dimension=512, kind=Dim.Types.Feature), enc_ff_dim=Dim(name="enc-ff", dimension=2048, kind=Dim.Types.Feature), - enc_att_num_heads=8, - enc_conformer_layer_opts=dict( + enc_num_heads=8, + encoder_layer_opts=dict( conv_norm_opts=dict(use_mask=True), self_att_opts=dict( # Shawn et al 2018 style, old RETURNN way. 
diff --git a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/global_/model_import.py b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/global_/model_import.py new file mode 100644 index 000000000..041ab883a --- /dev/null +++ b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/global_/model_import.py @@ -0,0 +1,172 @@ +from typing import Dict +import numpy + +import returnn.frontend as rf + + +_ParamMapping = {} # type: Dict[str,str] + + +def _add_params(): + # frontend + for layer_idx in [0, 1, 2]: + orig_name = "conv0" if layer_idx == 0 else f"subsample_conv{layer_idx - 1}" + _ParamMapping.update( + { + f"encoder.input_layer.conv_layers.{layer_idx}.filter": f"{orig_name}/W", + f"encoder.input_layer.conv_layers.{layer_idx}.bias": f"{orig_name}/bias", + } + ) + _ParamMapping.update( + { + "encoder.input_projection.weight": "source_linear/W", + "encoder.enc_ctx.weight": "enc_ctx/W", + "encoder.enc_ctx.bias": "enc_ctx/b", + "encoder.inv_fertility.weight": "inv_fertility/W", + "label_decoder.target_embed.weight": "output/rec/target_embed0/W", + "label_decoder.weight_feedback.weight": "output/rec/weight_feedback/W", + "label_decoder.s_transformed.weight": "output/rec/s_transformed/W", + "label_decoder.energy.weight": "output/rec/energy/W", + "label_decoder.readout_in.weight": "output/rec/readout_in/W", + "label_decoder.readout_in.bias": "output/rec/readout_in/b", + "label_decoder.output_prob.weight": "output/rec/output_prob/W", + "label_decoder.output_prob.bias": "output/rec/output_prob/b", + } + ) + # conformer + for layer_idx in range(12): + # FF + for sub in [1, 2]: + _ParamMapping[ + f"encoder.layers.{layer_idx}.ffn{sub}.linear_ff.weight" + ] = f"conformer_block_{layer_idx + 1:02d}_ffmod_{sub}_ff1/W" + _ParamMapping[ + f"encoder.layers.{layer_idx}.ffn{sub}.linear_ff.bias" + 
] = f"conformer_block_{layer_idx + 1:02d}_ffmod_{sub}_ff1/b" + _ParamMapping[ + f"encoder.layers.{layer_idx}.ffn{sub}.linear_out.weight" + ] = f"conformer_block_{layer_idx + 1:02d}_ffmod_{sub}_ff2/W" + _ParamMapping[ + f"encoder.layers.{layer_idx}.ffn{sub}.linear_out.bias" + ] = f"conformer_block_{layer_idx + 1:02d}_ffmod_{sub}_ff2/b" + _ParamMapping[ + f"encoder.layers.{layer_idx}.ffn{sub}_layer_norm.scale" + ] = f"conformer_block_{layer_idx + 1:02d}_ffmod_{sub}_ln/scale" + _ParamMapping[ + f"encoder.layers.{layer_idx}.ffn{sub}_layer_norm.bias" + ] = f"conformer_block_{layer_idx + 1:02d}_ffmod_{sub}_ln/bias" + # conv + _ParamMapping[ + f"encoder.layers.{layer_idx}.conv_block.positionwise_conv1.weight" + ] = f"conformer_block_{layer_idx + 1:02d}_conv_mod_pointwise_conv1/W" + _ParamMapping[ + f"encoder.layers.{layer_idx}.conv_block.positionwise_conv1.bias" + ] = f"conformer_block_{layer_idx + 1:02d}_conv_mod_pointwise_conv1/b" + _ParamMapping[ + f"encoder.layers.{layer_idx}.conv_block.depthwise_conv.filter" + ] = f"conformer_block_{layer_idx + 1:02d}_conv_mod_depthwise_conv2/W" + _ParamMapping[ + f"encoder.layers.{layer_idx}.conv_block.depthwise_conv.bias" + ] = f"conformer_block_{layer_idx + 1:02d}_conv_mod_depthwise_conv2/bias" + _ParamMapping[ + f"encoder.layers.{layer_idx}.conv_block.positionwise_conv2.weight" + ] = f"conformer_block_{layer_idx + 1:02d}_conv_mod_pointwise_conv2/W" + _ParamMapping[ + f"encoder.layers.{layer_idx}.conv_block.positionwise_conv2.bias" + ] = f"conformer_block_{layer_idx + 1:02d}_conv_mod_pointwise_conv2/b" + _ParamMapping[ + f"encoder.layers.{layer_idx}.conv_layer_norm.scale" + ] = f"conformer_block_{layer_idx + 1:02d}_conv_mod_ln/scale" + _ParamMapping[ + f"encoder.layers.{layer_idx}.conv_layer_norm.bias" + ] = f"conformer_block_{layer_idx + 1:02d}_conv_mod_ln/bias" + # self-att + _ParamMapping[ + f"encoder.layers.{layer_idx}.self_att.qkv.weight" + ] = f"conformer_block_{layer_idx + 1:02d}_self_att/QKV" + _ParamMapping[ + 
f"encoder.layers.{layer_idx}.self_att.proj.weight" + ] = f"conformer_block_{layer_idx + 1:02d}_self_att_linear/W" + _ParamMapping[ + f"encoder.layers.{layer_idx}.self_att_layer_norm.scale" + ] = f"conformer_block_{layer_idx + 1:02d}_self_att_ln/scale" + _ParamMapping[ + f"encoder.layers.{layer_idx}.self_att_layer_norm.bias" + ] = f"conformer_block_{layer_idx + 1:02d}_self_att_ln/bias" + _ParamMapping[ + f"encoder.layers.{layer_idx}.self_att.learned_pos_emb.pos_emb" + ] = f"conformer_block_{layer_idx + 1:02d}_self_att_ln_rel_pos_enc/encoding_matrix" + # final layer norm + _ParamMapping[ + f"encoder.layers.{layer_idx}.final_layer_norm.scale" + ] = f"conformer_block_{layer_idx + 1:02d}_ln/scale" + _ParamMapping[ + f"encoder.layers.{layer_idx}.final_layer_norm.bias" + ] = f"conformer_block_{layer_idx + 1:02d}_ln/bias" + + +_add_params() + + +def map_param_func_v2(reader, name: str, var: rf.Parameter) -> numpy.ndarray: + """map params, TF to RF""" + from tensorflow.python.training.py_checkpoint_reader import CheckpointReader + from i6_experiments.users.zeyer.returnn.convert.params import numpy as convert_params_np + from i6_experiments.users.zeyer.returnn.convert.params import tf_to_rf_np as convert_params_tf_to_rf_np + + assert isinstance(reader, CheckpointReader) + assert isinstance(var, rf.Parameter) + + tf_var_name = name.replace(".", "/") + if reader.has_tensor(tf_var_name): + return reader.get_tensor(tf_var_name) + + if name in _ParamMapping: + var_name = _ParamMapping[name] + assert reader.has_tensor(var_name) + value = reader.get_tensor(var_name) + assert isinstance(value, numpy.ndarray) + if name.endswith(".filter"): + value = convert_params_np.convert_tf_conv_to_pt_conv_filter(value) + assert ( + value.shape == var.batch_shape + ), f"new param {name} {var.batch_shape} vs ckpt param {var_name} {value.shape}" + assert value.dtype.name == var.dtype, f"new param {name} {var.dtype} vs ckpt param {var_name} {value.dtype}" + return value + + if name == 
"label_decoder.s.ff_weight": + value = reader.get_tensor("output/rec/s/rec/lstm_cell/kernel") + value = convert_params_np.convert_tf_lstm_to_native_lstm_ff(value) + assert value.shape == var.batch_shape, name + assert value.dtype.name == var.dtype, name + return value + + if name == "label_decoder.s.rec_weight": + value = reader.get_tensor("output/rec/s/rec/lstm_cell/kernel") + value = convert_params_np.convert_tf_lstm_to_native_lstm_rec(value) + assert value.shape == var.batch_shape, name + assert value.dtype.name == var.dtype, name + return value + + if name == "label_decoder.s.bias": + value = reader.get_tensor("output/rec/s/rec/lstm_cell/bias") + value = convert_params_np.convert_tf_lstm_to_native_lstm_bias(value, forget_gate_bias=1.0) + assert value.shape == var.batch_shape, name + assert value.dtype.name == var.dtype, name + return value + + if ".conv_block.norm." in name: + assert name.startswith("encoder.layers.") + layer_idx = int(name.split(".")[2]) + value = convert_params_tf_to_rf_np.convert_tf_batch_norm_to_rf( + reader=reader, + rf_name=name, + rf_prefix_name=f"encoder.layers.{layer_idx}.conv_block.norm.", + tf_prefix_name=f"conformer_block_{layer_idx + 1:02d}_conv_mod_bn/batch_norm/", + var=var, + ) + assert value.shape == var.batch_shape, name + assert value.dtype.name == var.dtype, name + return value + + raise NotImplementedError(f"cannot map {name!r} {var}") diff --git a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/global_/model_old/README b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/global_/model_old/README new file mode 100644 index 000000000..bab2e8d4e --- /dev/null +++ b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/global_/model_old/README @@ -0,0 +1,2 @@ +This package contains everything needed for doing viterbi training and recognition 
class GlobalAttentionModel(BaseModel):
    """
    Attention encoder-decoder whose decoder attends over all encoder frames at every step.

    Only the EOS/BOS bookkeeping is set up here; every other attribute used below
    (``encoder``, ``s``, ``weight_feedback``, ``readout_in``, ...) is expected to be
    created by ``BaseModel`` from ``**kwargs``.
    """

    def __init__(self, eos_idx: int, **kwargs):
        super().__init__(**kwargs)
        # A single label index serves as both end-of-sequence and begin-of-sequence.
        self.eos_idx = self.bos_idx = eos_idx

    def decoder_default_initial_state(self, *, batch_dims: Sequence[Dim], enc_spatial_dim: Dim) -> rf.State:
        """
        Build the all-zero decoder state used before the first decoding step.

        :param batch_dims: leading dims shared by all state tensors
        :param enc_spatial_dim: encoder time dim, needed for the accumulated attention weights
        :return: state with entries ``s``, ``att`` and ``accum_att_weights``
        """
        lead_dims = list(batch_dims)
        lstm_state = self.s.default_initial_state(batch_dims=batch_dims)
        att_zeros = rf.zeros(lead_dims + [self.att_num_heads * self.encoder.out_dim])
        accum_zeros = rf.zeros(lead_dims + [enc_spatial_dim, self.att_num_heads], feature_dim=self.att_num_heads)
        initial = rf.State(s=lstm_state, att=att_zeros, accum_att_weights=accum_zeros)
        # The attention context keeps its features in the last axis.
        initial.att.feature_dim_axis = len(initial.att.dims) - 1
        return initial

    def loop_step_output_templates(self, batch_dims: List[Dim]) -> Dict[str, Tensor]:
        """
        Describe (without data) the per-step outputs ``s`` and ``att`` of :func:`loop_step`.

        :param batch_dims: leading dims of the step outputs
        :return: output name -> template tensor
        """

        def _template(name: str, feat_dim: Dim) -> Tensor:
            return Tensor(
                name,
                dims=batch_dims + [feat_dim],
                dtype=rf.get_default_float_dtype(),
                feature_dim_axis=-1,
            )

        return {
            "s": _template("s", self.s.out_dim),
            "att": _template("att", self.att_num_heads * self.encoder.out_dim),
        }

    def loop_step(
        self,
        *,
        enc: rf.Tensor,
        enc_ctx: rf.Tensor,
        inv_fertility: rf.Tensor,
        enc_spatial_dim: Dim,
        input_embed: rf.Tensor,
        state: Optional[rf.State] = None,
    ) -> Tuple[Dict[str, rf.Tensor], rf.State]:
        """
        Run one decoder step: update the recurrent state, then attend over the encoder.

        :param enc: encoder output that the attention weights are applied to
        :param enc_ctx: encoder projection that enters the attention energy
        :param inv_fertility: scales the attention weights before they are accumulated
        :param enc_spatial_dim: encoder time dim (softmax and reduction axis)
        :param input_embed: embedding of the previous label
        :param state: previous decoder state; the default initial state is used if None
        :return: step outputs ``{"s", "att"}`` and the new decoder state
        """
        if state is None:
            if enc_spatial_dim != single_step_dim:
                non_batch_dims = (enc.feature_dim, enc_spatial_dim)
            else:
                non_batch_dims = (enc.feature_dim,)
            state = self.decoder_default_initial_state(
                batch_dims=enc.remaining_dims(remove=non_batch_dims), enc_spatial_dim=enc_spatial_dim
            )
        new_state = rf.State()

        # Recurrent update on the previous label embedding and the previous attention context.
        lstm_in = rf.concat_features(input_embed, state.att)
        s, new_state.s = self.s(lstm_in, state=state.s, spatial_dim=single_step_dim)

        # Additive attention energy with feedback of the accumulated attention weights.
        energy_in = enc_ctx + self.weight_feedback(state.accum_att_weights) + self.s_transformed(s)
        att_weights = rf.softmax(self.energy(rf.tanh(energy_in)), axis=enc_spatial_dim)

        new_state.accum_att_weights = state.accum_att_weights + att_weights * inv_fertility * 0.5
        context = rf.dot(att_weights, enc, reduce=enc_spatial_dim, use_mask=False)
        context.feature_dim = self.encoder.out_dim
        att, _ = rf.merge_dims(context, dims=(self.att_num_heads, self.encoder.out_dim))
        new_state.att = att

        return {"s": s, "att": att}, new_state

    def decode_logits(self, *, s: Tensor, input_embed: Tensor, att: Tensor) -> Tensor:
        """
        Map the step outputs to label logits.

        :param s: recurrent decoder output
        :param input_embed: embedding of the previous label
        :param att: attention context
        :return: logits as produced by ``self.output_prob``
        """
        projected = self.readout_in(rf.concat_features(s, input_embed, att))
        # Max over 2 pieces halves the feature dim down to the input dim of the output layer.
        maxout = rf.reduce_out(projected, mode="max", num_pieces=2, out_dim=self.output_prob.in_dim)
        # Same value as `self.dropout_broadcast and maxout.feature_dim`.
        dropout_axis = self.dropout_broadcast
        if dropout_axis:
            dropout_axis = maxout.feature_dim
        maxout = rf.dropout(maxout, drop_prob=0.3, axis=dropout_axis)
        return self.output_prob(maxout)
def _get_bos_idx(target_dim: Dim) -> int:
    """
    Determine the begin-of-sequence index for the non-blank label vocabulary.

    Lookup order: the vocab's explicit BOS id, then its EOS id (reused as BOS),
    then a user-defined ``<sos>`` symbol.

    :param target_dim: label dim; must carry a ``vocab``
    :return: the BOS label index
    :raises Exception: if none of the three sources is available
    """
    vocab = target_dim.vocab
    assert vocab
    if vocab.bos_label_id is not None:
        return vocab.bos_label_id
    if vocab.eos_label_id is not None:
        return vocab.eos_label_id
    # The symbol name had been reduced to an empty string (its "<...>" token was lost),
    # so this branch could never match a start-of-sentence symbol.
    if "<sos>" in vocab.user_defined_symbol_ids:
        return vocab.user_defined_symbol_ids["<sos>"]
    raise Exception(f"cannot determine bos_idx from vocab {target_dim.vocab}")
def _add_params(num_enc_layers: int = 12):
    """
    Register the new-parameter-name -> checkpoint-variable-name pairs in ``_ParamMapping``.

    Covers the three frontend conv layers, the weights that are not tied to a conformer
    block, and for every conformer block the feed-forward, convolution, self-attention
    and final-layer-norm parameters. Blocks are numbered from 0 on the key side and
    from 1 (zero-padded to two digits) on the value side.

    :param num_enc_layers: number of conformer blocks to register. The default of 12
        yields exactly the mapping that used to be hard-coded.
    """
    # frontend
    for layer_idx in range(3):
        tf_conv = "conv0" if layer_idx == 0 else f"subsample_conv{layer_idx - 1}"
        rf_conv = f"encoder.input_layer.conv_layers.{layer_idx}"
        _ParamMapping[f"{rf_conv}.filter"] = f"{tf_conv}/W"
        _ParamMapping[f"{rf_conv}.bias"] = f"{tf_conv}/bias"

    _ParamMapping.update(
        {
            "encoder.input_projection.weight": "source_linear/W",
            "enc_ctx.weight": "enc_ctx/W",
            "enc_ctx.bias": "enc_ctx/b",
            "inv_fertility.weight": "inv_fertility/W",
            "target_embed.weight": "output/rec/target_embed0/W",
            "weight_feedback.weight": "output/rec/weight_feedback/W",
            "s_transformed.weight": "output/rec/s_transformed/W",
            "energy.weight": "output/rec/energy/W",
            "readout_in.weight": "output/rec/readout_in/W",
            "readout_in.bias": "output/rec/readout_in/b",
            "output_prob.weight": "output/rec/output_prob/W",
            "output_prob.bias": "output/rec/output_prob/b",
        }
    )

    # conformer
    for layer_idx in range(num_enc_layers):
        # Format the two per-block prefixes once instead of in every entry.
        rf_layer = f"encoder.layers.{layer_idx}"
        tf_block = f"conformer_block_{layer_idx + 1:02d}"

        # FF
        for sub in (1, 2):
            rf_ff = f"{rf_layer}.ffn{sub}"
            tf_ff = f"{tf_block}_ffmod_{sub}"
            _ParamMapping[f"{rf_ff}.linear_ff.weight"] = f"{tf_ff}_ff1/W"
            _ParamMapping[f"{rf_ff}.linear_ff.bias"] = f"{tf_ff}_ff1/b"
            _ParamMapping[f"{rf_ff}.linear_out.weight"] = f"{tf_ff}_ff2/W"
            _ParamMapping[f"{rf_ff}.linear_out.bias"] = f"{tf_ff}_ff2/b"
            _ParamMapping[f"{rf_ff}_layer_norm.scale"] = f"{tf_ff}_ln/scale"
            _ParamMapping[f"{rf_ff}_layer_norm.bias"] = f"{tf_ff}_ln/bias"

        # conv
        rf_conv = f"{rf_layer}.conv_block"
        tf_conv = f"{tf_block}_conv_mod"
        _ParamMapping[f"{rf_conv}.positionwise_conv1.weight"] = f"{tf_conv}_pointwise_conv1/W"
        _ParamMapping[f"{rf_conv}.positionwise_conv1.bias"] = f"{tf_conv}_pointwise_conv1/b"
        _ParamMapping[f"{rf_conv}.depthwise_conv.filter"] = f"{tf_conv}_depthwise_conv2/W"
        _ParamMapping[f"{rf_conv}.depthwise_conv.bias"] = f"{tf_conv}_depthwise_conv2/bias"
        _ParamMapping[f"{rf_conv}.positionwise_conv2.weight"] = f"{tf_conv}_pointwise_conv2/W"
        _ParamMapping[f"{rf_conv}.positionwise_conv2.bias"] = f"{tf_conv}_pointwise_conv2/b"
        _ParamMapping[f"{rf_layer}.conv_layer_norm.scale"] = f"{tf_conv}_ln/scale"
        _ParamMapping[f"{rf_layer}.conv_layer_norm.bias"] = f"{tf_conv}_ln/bias"

        # self-att
        _ParamMapping[f"{rf_layer}.self_att.qkv.weight"] = f"{tf_block}_self_att/QKV"
        _ParamMapping[f"{rf_layer}.self_att.proj.weight"] = f"{tf_block}_self_att_linear/W"
        _ParamMapping[f"{rf_layer}.self_att_layer_norm.scale"] = f"{tf_block}_self_att_ln/scale"
        _ParamMapping[f"{rf_layer}.self_att_layer_norm.bias"] = f"{tf_block}_self_att_ln/bias"
        _ParamMapping[
            f"{rf_layer}.self_att.learned_pos_emb.pos_emb"
        ] = f"{tf_block}_self_att_ln_rel_pos_enc/encoding_matrix"

        # final layer norm
        _ParamMapping[f"{rf_layer}.final_layer_norm.scale"] = f"{tf_block}_ln/scale"
        _ParamMapping[f"{rf_layer}.final_layer_norm.bias"] = f"{tf_block}_ln/bias"
numpy.ndarray: + """map params, TF to RF""" + from tensorflow.python.training.py_checkpoint_reader import CheckpointReader + from i6_experiments.users.zeyer.returnn.convert.params import numpy as convert_params_np + from i6_experiments.users.zeyer.returnn.convert.params import tf_to_rf_np as convert_params_tf_to_rf_np + + assert isinstance(reader, CheckpointReader) + assert isinstance(var, rf.Parameter) + + tf_var_name = name.replace(".", "/") + if reader.has_tensor(tf_var_name): + return reader.get_tensor(tf_var_name) + + if name in _ParamMapping: + var_name = _ParamMapping[name] + assert reader.has_tensor(var_name) + value = reader.get_tensor(var_name) + assert isinstance(value, numpy.ndarray) + if name.endswith(".filter"): + value = convert_params_np.convert_tf_conv_to_pt_conv_filter(value) + assert ( + value.shape == var.batch_shape + ), f"new param {name} {var.batch_shape} vs ckpt param {var_name} {value.shape}" + assert value.dtype.name == var.dtype, f"new param {name} {var.dtype} vs ckpt param {var_name} {value.dtype}" + return value + + if name == "s.ff_weight": + value = reader.get_tensor("output/rec/s/rec/lstm_cell/kernel") + value = convert_params_np.convert_tf_lstm_to_native_lstm_ff(value) + assert value.shape == var.batch_shape, name + assert value.dtype.name == var.dtype, name + return value + + if name == "s.rec_weight": + value = reader.get_tensor("output/rec/s/rec/lstm_cell/kernel") + value = convert_params_np.convert_tf_lstm_to_native_lstm_rec(value) + assert value.shape == var.batch_shape, name + assert value.dtype.name == var.dtype, name + return value + + if name == "s.bias": + value = reader.get_tensor("output/rec/s/rec/lstm_cell/bias") + value = convert_params_np.convert_tf_lstm_to_native_lstm_bias(value, forget_gate_bias=1.0) + assert value.shape == var.batch_shape, name + assert value.dtype.name == var.dtype, name + return value + + if ".conv_block.norm." 
def model_recog(
    *,
    model: GlobalAttentionModel,
    data: Tensor,
    data_spatial_dim: Dim,
    max_seq_len: Optional[int] = None,
) -> Tuple[Tensor, Tensor, Dim, Dim]:
    """
    Function is run within RETURNN.

    Label-synchronous beam search (beam size 12, length-normalized scores),
    performed directly here in eager mode instead of via a generic beam_search function.
    An external language model is not supported by this search.

    :param model: the attention model; must not have a language model attached
    :param data: input features
    :param data_spatial_dim: time dim of ``data``
    :param max_seq_len: limit on the number of decoding steps;
        the encoder sequence lengths are used if None
    :return:
        recog results including beam {batch, beam, out_spatial},
        log probs {batch, beam},
        out_spatial_dim,
        final beam_dim
    """
    assert not model.language_model  # not implemented here. use the pure PyTorch search instead

    batch_dims = data.remaining_dims((data_spatial_dim, data.feature_dim))
    enc_args, enc_spatial_dim = model.encode(data, in_spatial_dim=data_spatial_dim)
    beam_size = 12
    length_normalization_exponent = 1.0
    if max_seq_len is None:
        max_seq_len = enc_spatial_dim.get_size_tensor()
    else:
        max_seq_len = rf.convert_to_tensor(max_seq_len, dtype="int32")
    print("** max seq len:", max_seq_len.raw_tensor)

    # Eager-mode implementation of beam search.
    # Initial state.
    beam_dim = Dim(1, name="initial-beam")
    batch_dims_ = [beam_dim] + batch_dims
    decoder_state = model.decoder_default_initial_state(batch_dims=batch_dims_, enc_spatial_dim=enc_spatial_dim)
    target = rf.constant(model.bos_idx, dims=batch_dims_, sparse_dim=model.target_dim)
    ended = rf.constant(False, dims=batch_dims_)
    out_seq_len = rf.constant(0, dims=batch_dims_)
    seq_log_prob = rf.constant(0.0, dims=batch_dims_)

    i = 0
    seq_targets = []
    seq_backrefs = []
    while True:
        if i == 0:
            # The very first step is fed an all-zero embedding rather than an embedded label.
            input_embed = rf.zeros(batch_dims_ + [model.target_embed.out_dim], feature_dim=model.target_embed.out_dim)
        else:
            input_embed = model.target_embed(target)
        step_out, decoder_state = model.loop_step(
            **enc_args,
            enc_spatial_dim=enc_spatial_dim,
            input_embed=input_embed,
            state=decoder_state,
        )
        logits = model.decode_logits(input_embed=input_embed, **step_out)
        label_log_prob = rf.log_softmax(logits, axis=model.target_dim)
        # Filter out finished beams: they may only continue with EOS, at no extra cost.
        label_log_prob = rf.where(
            ended,
            rf.sparse_to_dense(model.eos_idx, axis=model.target_dim, label_value=0.0, other_value=-1.0e30),
            label_log_prob,
        )
        seq_log_prob = seq_log_prob + label_log_prob  # Batch, InBeam, Vocab
        seq_log_prob, (backrefs, target), beam_dim = rf.top_k(
            seq_log_prob, k_dim=Dim(beam_size, name=f"dec-step{i}-beam"), axis=[beam_dim, model.target_dim]
        )  # seq_log_prob, backrefs, target: Batch, Beam
        seq_targets.append(target)
        seq_backrefs.append(backrefs)
        # Re-order everything that is indexed by the previous beam.
        decoder_state = tree.map_structure(lambda s: rf.gather(s, indices=backrefs), decoder_state)
        ended = rf.gather(ended, indices=backrefs)
        out_seq_len = rf.gather(out_seq_len, indices=backrefs)
        i += 1

        ended = rf.logical_or(ended, target == model.eos_idx)
        ended = rf.logical_or(ended, rf.copy_to_device(i >= max_seq_len))
        if bool(rf.reduce_all(ended, axis=ended.dims).raw_tensor):
            break
        out_seq_len = out_seq_len + rf.where(ended, 0, 1)

        if i > 1 and length_normalization_exponent != 0:
            # Length-normalized scores, so we evaluate score_t/len.
            # If seq ended, score_i/i == score_{i-1}/(i-1), thus score_i = score_{i-1}*(i/(i-1))
            # Because we count with EOS symbol, shifted by one.
            seq_log_prob *= rf.where(
                ended,
                (i / (i - 1)) ** length_normalization_exponent,
                1.0,
            )

    if i > 0 and length_normalization_exponent != 0:
        seq_log_prob *= (1 / i) ** length_normalization_exponent

    # Backtrack via backrefs, resolve beams.
    # Collected last-step-first and reversed once at the end (instead of insert(0, ...) per step).
    seq_targets_ = []
    indices = rf.range_over_dim(beam_dim)  # FinalBeam -> FinalBeam
    for backrefs, target in zip(reversed(seq_backrefs), reversed(seq_targets)):
        # indices: FinalBeam -> Beam
        # backrefs: Beam -> PrevBeam
        seq_targets_.append(rf.gather(target, indices=indices))
        indices = rf.gather(backrefs, indices=indices)  # FinalBeam -> PrevBeam
    seq_targets_.reverse()

    seq_targets__ = TensorArray(seq_targets_[0])
    for target in seq_targets_:
        seq_targets__ = seq_targets__.push_back(target)
    out_spatial_dim = Dim(out_seq_len, name="out-spatial")
    seq_targets = seq_targets__.stack(axis=out_spatial_dim)

    return seq_targets, seq_log_prob, out_spatial_dim, beam_dim


# RecogDef API
model_recog: RecogDef[GlobalAttentionModel]
model_recog.output_with_beam = True
# output_blank_label=blank is actually wrong for AED, but now we don't change it anymore
# because it would change all recog hashes.
# Also, it does not matter too much -- it will just cause an extra SearchRemoveLabelJob,
# which will not have any effect here.
# NOTE: the label had been reduced to an empty string (its "<...>" token was lost);
# restored to the blank label that the comment above refers to.
model_recog.output_blank_label = "<blank>"
model_recog.batch_size_dependent = False
+ + :return: + recog results including beam {batch, beam, out_spatial}, + log probs {batch, beam}, + recog results info: key -> {batch, beam}, + out_spatial_dim, + final beam_dim + """ + import torch + from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.network_builder_rf.beam_search.label_sync import BeamSearchOpts, label_sync_beam_search + from i6_experiments.users.schmitt.returnn_frontend.model_interfaces.label_scorer import ShallowFusedLabelScorers + from returnn.config import get_global_config + + config = get_global_config() + + torch.cuda.set_sync_debug_mode(1) # debug CUDA sync. does not hurt too much to leave this always in? + + data_concat_zeros = config.float("data_concat_zeros", 0) + if data_concat_zeros: + data_concat_zeros_dim = Dim(int(data_concat_zeros * _batch_size_factor * 100), name="data_concat_zeros") + data, data_spatial_dim = rf.concat( + (data, data_spatial_dim), (rf.zeros([data_concat_zeros_dim]), data_concat_zeros_dim), allow_broadcast=True + ) + + batch_dims = data.remaining_dims((data_spatial_dim, data.feature_dim)) + assert len(batch_dims) == 1, batch_dims # not implemented otherwise, simple to add... 
+ batch_dim = batch_dims[0] + enc, enc_spatial_dim = model.encode(data, in_spatial_dim=data_spatial_dim) + if max_seq_len is None: + max_seq_len = enc_spatial_dim.get_size_tensor() + else: + max_seq_len = rf.convert_to_tensor(max_seq_len, dtype="int32") + + beam_search_opts = (config.typed_value("beam_search_opts", None) or {}).copy() + if beam_search_opts.get("beam_size") is None: + beam_search_opts["beam_size"] = config.int("beam_size", 12) + if beam_search_opts.get("length_normalization_exponent") is None: + beam_search_opts["length_normalization_exponent"] = config.float("length_normalization_exponent", 1.0) + + label_scorer = ShallowFusedLabelScorers() + label_scorer.label_scorers["decoder"] = ( + get_label_scorer_pure_torch(model=model, batch_dim=batch_dim, enc=enc, enc_spatial_dim=enc_spatial_dim), + 1.0, + ) + if model.language_model: + lm_scale = beam_search_opts.pop("lm_scale") # must be defined with LM + label_scorer.label_scorers["lm"] = (model.language_model_make_label_scorer(), lm_scale) + + print("** max seq len:", max_seq_len.raw_tensor) + + # Beam search happening here: + ( + seq_targets, # [Batch,FinalBeam,OutSeqLen] + seq_log_prob, # [Batch,FinalBeam] + out_seq_len, # [Batch,FinalBeam] + ) = label_sync_beam_search( + label_scorer, + batch_size=int(batch_dim.get_dim_value()), + max_seq_len=max_seq_len.copy_compatible_to_dims_raw([batch_dim]), + device=data.raw_tensor.device, + opts=BeamSearchOpts( + **beam_search_opts, + bos_label=model.bos_idx, + eos_label=model.eos_idx, + num_labels=model.target_dim.dimension, + ), + ) + + beam_dim = Dim(seq_log_prob.shape[1], name="beam") + out_spatial_dim = Dim(rf.convert_to_tensor(out_seq_len, dims=[batch_dim, beam_dim], name="out_spatial")) + seq_targets_t = rf.convert_to_tensor( + seq_targets, dims=[batch_dim, beam_dim, out_spatial_dim], sparse_dim=model.target_dim + ) + seq_log_prob_t = rf.convert_to_tensor(seq_log_prob, dims=[batch_dim, beam_dim]) + + return seq_targets_t, seq_log_prob_t, out_spatial_dim, 
def get_label_scorer_pure_torch(
    *,
    model: GlobalAttentionModel,
    batch_dim: Dim,
    enc: Dict[str, Tensor],
    enc_spatial_dim: Dim,
):
    """
    Wrap the decoder of ``model`` as a label scorer that works on raw torch tensors.

    :param model: provides ``target_embed``, ``loop_step`` and ``decode_logits``
    :param batch_dim: batch dim of the encoder output
    :param enc: encoder outputs, passed as keyword arguments to ``model.loop_step``
    :param enc_spatial_dim: encoder time dim
    :return: a ``LabelScorerIntf`` instance bound to this encoder output
    """
    import torch
    from i6_experiments.users.schmitt.returnn_frontend.model_interfaces.label_scorer import (
        LabelScorerIntf,
        StateObjTensorExt,
        StateObjIgnored,
    )

    class LabelScorer(LabelScorerIntf):
        """Scores the next label with the attention decoder, state kept as raw tensors."""

        def get_initial_state(self, *, batch_size: int, device: torch.device) -> Any:
            """Default decoder state for a beam of size 1; ``batch_size``/``device`` are not used."""
            initial_beam = Dim(1, name="initial-beam")
            state = model.decoder_default_initial_state(
                batch_dims=[batch_dim, initial_beam], enc_spatial_dim=enc_spatial_dim
            )
            return tree.map_structure(lambda v: self._map_tensor_to_raw(v, beam_dim=initial_beam), state)

        def score_and_update_state(
            self,
            *,
            prev_state: Any,
            prev_label: torch.Tensor,
            prev_align_label: Optional[torch.Tensor] = None,  # not used
            t: Optional[int] = None,  # not used
        ) -> Tuple[torch.Tensor, Any]:
            """Run one decoder step; return raw label log-probs and the new raw state."""
            # The beam size is read from the second axis of the incoming labels.
            beam_dim = Dim(prev_label.shape[1], name="beam")

            def _map_raw_to_tensor(v):
                if isinstance(v, StateObjIgnored):
                    return v.content
                if not isinstance(v, StateObjTensorExt):
                    raise TypeError(f"_map_raw_to_tensor: unexpected {v} ({type(v).__name__})")
                # Re-attach the raw data to its stored template, with the current beam dim swapped in.
                template: Tensor = v.extra
                restored = template.copy_template_new_dim_tags(
                    (batch_dim, beam_dim) + template.dims[2:], keep_special_axes=True
                )
                restored.raw_tensor = v.tensor
                return restored

            # NOTE(review): model_recog feeds an all-zero embedding at its first step, whereas this
            # always embeds prev_label (presumably BOS on the first call) -- confirm this is intended.
            prev_label_t = rf.convert_to_tensor(prev_label, dims=[batch_dim, beam_dim], sparse_dim=model.target_dim)
            input_embed = model.target_embed(prev_label_t)
            decode_out, decoder_state = model.loop_step(
                **enc,
                enc_spatial_dim=enc_spatial_dim,
                input_embed=input_embed,
                state=tree.map_structure(_map_raw_to_tensor, prev_state),
            )
            logits = model.decode_logits(input_embed=input_embed, **decode_out)
            label_log_prob = rf.log_softmax(logits, axis=model.target_dim)
            assert set(label_log_prob.dims) == {batch_dim, beam_dim, model.target_dim}

            raw_scores = self._map_tensor_to_raw(label_log_prob, beam_dim=beam_dim).tensor
            raw_state = tree.map_structure(lambda v: self._map_tensor_to_raw(v, beam_dim=beam_dim), decoder_state)
            return raw_scores, raw_state

        @staticmethod
        def _map_tensor_to_raw(v, *, beam_dim: Dim):
            """Convert a state entry to its raw form, with (batch, beam) moved to the front."""
            if isinstance(v, Dim):
                return StateObjIgnored(v)
            if not isinstance(v, Tensor):
                raise TypeError(f"_map_tensor_to_raw: unexpected {v} ({type(v).__name__})")
            if beam_dim not in v.dims:
                # Nothing beam-dependent in here, so it is passed through untouched.
                return StateObjIgnored(v)
            front = [batch_dim, beam_dim]
            v = v.copy_transpose(front + [dim for dim in v.dims if dim not in front])
            return StateObjTensorExt(v.raw_tensor, v.copy_template())

    return LabelScorer()
def from_scratch_training(
    *,
    model: GlobalAttentionModel,
    data: rf.Tensor,
    data_spatial_dim: Dim,
    targets: rf.Tensor,
    targets_spatial_dim: Dim
):
    """
    Function is run within RETURNN.

    Marks the training losses on the result tensors (nothing is returned):
    an optional CTC loss ``ctc_<layer>`` per configured auxiliary encoder layer,
    the label-smoothed cross entropy ``ce`` of the decoder, and the frame error ``fer``.

    Config entries read: ``aux_loss_layers``, ``aux_loss_scales``, ``aed_loss_scale``,
    ``use_normalized_loss``.

    :param model: the attention model
    :param data: raw audio; a feature dim of size 1 is squeezed away
    :param data_spatial_dim: time dim of ``data``
    :param targets: target label sequence
    :param targets_spatial_dim: time dim of ``targets``
    """
    from returnn.config import get_global_config

    cfg = get_global_config()  # noqa
    aux_loss_layers = cfg.typed_value("aux_loss_layers")
    if aux_loss_layers:
        default_aux_scales = [1.0] * len(aux_loss_layers)
    else:
        default_aux_scales = None
    aux_loss_scales = cfg.typed_value("aux_loss_scales", default_aux_scales)
    aed_loss_scale = cfg.float("aed_loss_scale", 1.0)
    use_normalized_loss = cfg.bool("use_normalized_loss", True)

    feat_dim = data.feature_dim
    if feat_dim and feat_dim.dimension == 1:
        data = rf.squeeze(data, axis=feat_dim)
    assert not data.feature_dim  # raw audio

    collected_outputs = {}
    enc_args, enc_spatial_dim = model.encode(data, in_spatial_dim=data_spatial_dim, collected_outputs=collected_outputs)

    # Auxiliary CTC losses on intermediate encoder layers (1-based layer numbers).
    for scale_idx, layer_idx in enumerate(aux_loss_layers or ()):
        if layer_idx > len(model.encoder.layers):
            continue
        aux_linear = getattr(model, f"enc_aux_logits_{layer_idx}")
        aux_loss = rf.ctc_loss(
            logits=aux_linear(collected_outputs[str(layer_idx - 1)]),
            targets=targets,
            input_spatial_dim=enc_spatial_dim,
            targets_spatial_dim=targets_spatial_dim,
            blank_index=model.blank_idx,
        )
        aux_loss.mark_as_loss(
            f"ctc_{layer_idx}",
            scale=aux_loss_scales[scale_idx],
            custom_inv_norm_factor=targets_spatial_dim.get_size_tensor(),
            use_normalized_loss=use_normalized_loss,
        )

    batch_dims = data.remaining_dims(data_spatial_dim)
    # Decoder inputs: target embeddings shifted right by one step, zeros at the first position.
    input_embeddings = rf.shift_right(model.target_embed(targets), axis=targets_spatial_dim, pad_value=0.0)

    def _decoder_step(input_embed: Tensor, state: rf.State):
        step_out, next_decoder_state = model.loop_step(
            **enc_args,
            enc_spatial_dim=enc_spatial_dim,
            input_embed=input_embed,
            state=state.decoder,
        )
        return step_out, rf.State(decoder=next_decoder_state)

    initial_state = rf.State(
        decoder=model.decoder_default_initial_state(batch_dims=batch_dims, enc_spatial_dim=enc_spatial_dim),
    )
    loop_out, _, _ = rf.scan(
        spatial_dim=targets_spatial_dim,
        xs=input_embeddings,
        ys=model.loop_step_output_templates(batch_dims=batch_dims),
        initial=initial_state,
        body=_decoder_step,
    )

    logits = model.decode_logits(input_embed=input_embeddings, **loop_out)
    padded_dims = batch_dims + [targets_spatial_dim]
    logits_packed, pack_dim = rf.pack_padded(logits, dims=padded_dims, enforce_sorted=False)
    targets_packed, _ = rf.pack_padded(targets, dims=padded_dims, enforce_sorted=False, out_dim=pack_dim)

    log_prob = rf.log_softmax(logits_packed, axis=model.target_dim)
    log_prob = rf.label_smoothed_log_prob_gradient(log_prob, 0.1, axis=model.target_dim)
    ce_loss = rf.cross_entropy(
        target=targets_packed, estimated=log_prob, estimated_type="log-probs", axis=model.target_dim
    )
    ce_loss.mark_as_loss("ce", scale=aed_loss_scale, use_normalized_loss=use_normalized_loss)

    # Frame error: the arg-max label differs from the target.
    frame_error = rf.reduce_argmax(logits_packed, axis=model.target_dim) != targets_packed
    frame_error.mark_as_loss(name="fer", as_error=True)
a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/global_/recog.py b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/global_/recog.py index 396002fd2..8448a5744 100644 --- a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/global_/recog.py +++ b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/global_/recog.py @@ -30,10 +30,10 @@ def model_recog( out_spatial_dim, final beam_dim """ - assert not model.language_model # not implemented here. use the pure PyTorch search instead + assert not model.label_decoder.language_model # not implemented here. use the pure PyTorch search instead batch_dims = data.remaining_dims((data_spatial_dim, data.feature_dim)) - enc_args, enc_spatial_dim = model.encode(data, in_spatial_dim=data_spatial_dim) + enc_args, enc_spatial_dim = model.encoder.encode(data, in_spatial_dim=data_spatial_dim) beam_size = 12 length_normalization_exponent = 1.0 if max_seq_len is None: @@ -46,8 +46,8 @@ def model_recog( # Initial state. 
beam_dim = Dim(1, name="initial-beam") batch_dims_ = [beam_dim] + batch_dims - decoder_state = model.decoder_default_initial_state(batch_dims=batch_dims_, enc_spatial_dim=enc_spatial_dim) - target = rf.constant(model.bos_idx, dims=batch_dims_, sparse_dim=model.target_dim) + decoder_state = model.label_decoder.decoder_default_initial_state(batch_dims=batch_dims_, enc_spatial_dim=enc_spatial_dim) + target = rf.constant(model.label_decoder.bos_idx, dims=batch_dims_, sparse_dim=model.target_dim) ended = rf.constant(False, dims=batch_dims_) out_seq_len = rf.constant(0, dims=batch_dims_) seq_log_prob = rf.constant(0.0, dims=batch_dims_) @@ -57,21 +57,22 @@ def model_recog( seq_backrefs = [] while True: if i == 0: - input_embed = rf.zeros(batch_dims_ + [model.target_embed.out_dim], feature_dim=model.target_embed.out_dim) + input_embed = rf.zeros( + batch_dims_ + [model.label_decoder.target_embed.out_dim], feature_dim=model.label_decoder.target_embed.out_dim) else: - input_embed = model.target_embed(target) - step_out, decoder_state = model.loop_step( + input_embed = model.label_decoder.target_embed(target) + step_out, decoder_state = model.label_decoder.loop_step( **enc_args, enc_spatial_dim=enc_spatial_dim, input_embed=input_embed, state=decoder_state, ) - logits = model.decode_logits(input_embed=input_embed, **step_out) + logits = model.label_decoder.decode_logits(input_embed=input_embed, **step_out) label_log_prob = rf.log_softmax(logits, axis=model.target_dim) # Filter out finished beams label_log_prob = rf.where( ended, - rf.sparse_to_dense(model.eos_idx, axis=model.target_dim, label_value=0.0, other_value=-1.0e30), + rf.sparse_to_dense(model.label_decoder.eos_idx, axis=model.target_dim, label_value=0.0, other_value=-1.0e30), label_log_prob, ) seq_log_prob = seq_log_prob + label_log_prob # Batch, InBeam, Vocab @@ -85,7 +86,7 @@ def model_recog( out_seq_len = rf.gather(out_seq_len, indices=backrefs) i += 1 - ended = rf.logical_or(ended, target == model.eos_idx) + 
ended = rf.logical_or(ended, rf.convert_to_tensor(target == model.label_decoder.eos_idx)) ended = rf.logical_or(ended, rf.copy_to_device(i >= max_seq_len)) if bool(rf.reduce_all(ended, axis=ended.dims).raw_tensor): break @@ -173,7 +174,7 @@ def model_recog_pure_torch( batch_dims = data.remaining_dims((data_spatial_dim, data.feature_dim)) assert len(batch_dims) == 1, batch_dims # not implemented otherwise, simple to add... batch_dim = batch_dims[0] - enc, enc_spatial_dim = model.encode(data, in_spatial_dim=data_spatial_dim) + enc, enc_spatial_dim = model.encoder.encode(data, in_spatial_dim=data_spatial_dim) if max_seq_len is None: max_seq_len = enc_spatial_dim.get_size_tensor() else: @@ -190,9 +191,9 @@ def model_recog_pure_torch( get_label_scorer_pure_torch(model=model, batch_dim=batch_dim, enc=enc, enc_spatial_dim=enc_spatial_dim), 1.0, ) - if model.language_model: + if model.label_decoder.language_model: lm_scale = beam_search_opts.pop("lm_scale") # must be defined with LM - label_scorer.label_scorers["lm"] = (model.language_model_make_label_scorer(), lm_scale) + label_scorer.label_scorers["lm"] = (model.label_decoder.language_model_make_label_scorer(), lm_scale) print("** max seq len:", max_seq_len.raw_tensor) @@ -208,8 +209,8 @@ def model_recog_pure_torch( device=data.raw_tensor.device, opts=BeamSearchOpts( **beam_search_opts, - bos_label=model.bos_idx, - eos_label=model.eos_idx, + bos_label=model.label_decoder.bos_idx, + eos_label=model.label_decoder.eos_idx, num_labels=model.target_dim.dimension, ), ) @@ -253,7 +254,7 @@ def get_initial_state(self, *, batch_size: int, device: torch.device) -> Any: """Initial state.""" beam_dim = Dim(1, name="initial-beam") batch_dims_ = [batch_dim, beam_dim] - decoder_state = model.decoder_default_initial_state(batch_dims=batch_dims_, enc_spatial_dim=enc_spatial_dim) + decoder_state = model.label_decoder.decoder_default_initial_state(batch_dims=batch_dims_, enc_spatial_dim=enc_spatial_dim) return 
tree.map_structure(functools.partial(self._map_tensor_to_raw, beam_dim=beam_dim), decoder_state) def score_and_update_state( @@ -280,16 +281,16 @@ def _map_raw_to_tensor(v): else: raise TypeError(f"_map_raw_to_tensor: unexpected {v} ({type(v).__name__})") - input_embed = model.target_embed( + input_embed = model.label_decoder.target_embed( rf.convert_to_tensor(prev_label, dims=[batch_dim, beam_dim], sparse_dim=model.target_dim) ) - decode_out, decoder_state = model.loop_step( + decode_out, decoder_state = model.label_decoder.loop_step( **enc, enc_spatial_dim=enc_spatial_dim, input_embed=input_embed, state=tree.map_structure(_map_raw_to_tensor, prev_state), ) - logits = model.decode_logits(input_embed=input_embed, **decode_out) + logits = model.label_decoder.decode_logits(input_embed=input_embed, **decode_out) label_log_prob = rf.log_softmax(logits, axis=model.target_dim) assert set(label_log_prob.dims) == {batch_dim, beam_dim, model.target_dim} diff --git a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/global_/train.py b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/global_/train.py index a8413320c..d81959356 100644 --- a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/global_/train.py +++ b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/global_/train.py @@ -48,7 +48,7 @@ def from_scratch_training( assert not data.feature_dim # raw audio collected_outputs = {} - enc_args, enc_spatial_dim = model.encode(data, in_spatial_dim=data_spatial_dim, collected_outputs=collected_outputs) + enc_args, enc_spatial_dim = model.encoder.encode(data, in_spatial_dim=data_spatial_dim, collected_outputs=collected_outputs) if aux_loss_layers: for i, layer_idx in enumerate(aux_loss_layers): if layer_idx > 
len(model.encoder.layers): @@ -70,12 +70,12 @@ def from_scratch_training( ) batch_dims = data.remaining_dims(data_spatial_dim) - input_embeddings = model.target_embed(targets) + input_embeddings = model.label_decoder.target_embed(targets) input_embeddings = rf.shift_right(input_embeddings, axis=targets_spatial_dim, pad_value=0.0) def _body(input_embed: Tensor, state: rf.State): new_state = rf.State() - loop_out_, new_state.decoder = model.loop_step( + loop_out_, new_state.decoder = model.label_decoder.loop_step( **enc_args, enc_spatial_dim=enc_spatial_dim, input_embed=input_embed, @@ -86,14 +86,14 @@ def _body(input_embed: Tensor, state: rf.State): loop_out, _, _ = rf.scan( spatial_dim=targets_spatial_dim, xs=input_embeddings, - ys=model.loop_step_output_templates(batch_dims=batch_dims), + ys=model.label_decoder.loop_step_output_templates(batch_dims=batch_dims), initial=rf.State( - decoder=model.decoder_default_initial_state(batch_dims=batch_dims, enc_spatial_dim=enc_spatial_dim), + decoder=model.label_decoder.decoder_default_initial_state(batch_dims=batch_dims, enc_spatial_dim=enc_spatial_dim), ), body=_body, ) - logits = model.decode_logits(input_embed=input_embeddings, **loop_out) + logits = model.label_decoder.decode_logits(input_embed=input_embeddings, **loop_out) logits_packed, pack_dim = rf.pack_padded(logits, dims=batch_dims + [targets_spatial_dim], enforce_sorted=False) targets_packed, _ = rf.pack_padded( targets, dims=batch_dims + [targets_spatial_dim], enforce_sorted=False, out_dim=pack_dim diff --git a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/model.py b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/model.py index ca03a65ce..da3bba4c0 100644 --- a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/model.py +++ 
b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/model.py @@ -6,282 +6,88 @@ from i6_experiments.users.schmitt.returnn_frontend.model_interfaces.model import ModelDef from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.network_builder_rf.base import _batch_size_factor, _log_mel_feature_dim -from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.network_builder_rf.base import BaseModel +from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.network_builder_rf.segmental.model_new.blank_model.model import BlankDecoder +from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.network_builder_rf.segmental.model_new.label_model.model import ( + SegmentalAttLabelDecoder, + SegmentalAttLabelDecoderWoCtxInState +) +from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.network_builder_rf.encoder.global_ import GlobalConformerEncoder +from i6_experiments.users.schmitt.returnn_frontend.model_interfaces.supports_label_scorer_torch import RFModelWithMakeLabelScorer -class SegmentalAttentionModel(BaseModel): +class SegmentalAttentionModel(rf.Module): def __init__( self, + *, length_model_state_dim: Dim, length_model_embed_dim: Dim, center_window_size: int, align_target_dim: Dim, - **kwargs + target_dim: Dim, + blank_idx: int, + enc_key_total_dim: Dim = Dim(name="enc_key_total_dim", dimension=1024), + att_dropout: float = 0.1, + l2: float = 0.0001, + language_model: Optional[RFModelWithMakeLabelScorer] = None, + enc_in_dim: Dim, + enc_out_dim: Dim = Dim(name="enc", dimension=512), + enc_num_layers: int = 12, + enc_aux_logits: Sequence[int] = (), # layers + enc_ff_dim: Dim = Dim(name="enc-ff", dimension=2048), 
+ enc_num_heads: int = 4, + encoder_layer_opts: Optional[Dict[str, Any]] = None, + dec_att_num_heads: Dim = Dim(name="att_num_heads", dimension=1), + enc_dropout: float = 0.1, + label_decoder_version: int = 1, ): - super(SegmentalAttentionModel, self).__init__(**kwargs) + super(SegmentalAttentionModel, self).__init__() - self.align_target_dim = align_target_dim - - self.length_model_state_dim = length_model_state_dim - self.length_model_embed_dim = length_model_embed_dim - self.emit_prob_dim = Dim(name="emit_prob", dimension=1) - self.center_window_size = center_window_size - self.accum_att_weights_dim = Dim(name="accum_att_weights", dimension=center_window_size) - - self.target_embed_length_model = rf.Embedding(align_target_dim, self.length_model_embed_dim) - # when using rf.LSTM, something with the parameter import from TF checkpoint was not right - # i.e. the import worked but the LSTM output was different than in TF even though the inputs were the same - # self.s_length_model = rf.LSTM(self.encoder.out_dim + self.length_model_embed_dim, self.length_model_state_dim) - self.s_length_model = rf.ZoneoutLSTM( - self.encoder.out_dim + self.length_model_embed_dim, - self.length_model_state_dim, - parts_order="jifo", - forget_bias=0.0, - ) - self.emit_prob = rf.Linear(self.length_model_state_dim, self.emit_prob_dim) - - def label_decoder_default_initial_state( - self, - *, - batch_dims: Sequence[Dim], - segment_starts_sparse_dim: Optional[Dim] = None, - segment_lens_sparse_dim: Optional[Dim] = None, - ) -> rf.State: - """Default initial state""" - state = rf.State( - s=self.s.default_initial_state(batch_dims=batch_dims), - att=rf.zeros(list(batch_dims) + [self.att_num_heads * self.encoder.out_dim]), - accum_att_weights=rf.zeros( - list(batch_dims) + [self.accum_att_weights_dim, self.att_num_heads], feature_dim=self.att_num_heads - ), - segment_starts=rf.zeros(batch_dims, sparse_dim=segment_starts_sparse_dim, dtype="int32"), - segment_lens=rf.zeros(batch_dims, 
sparse_dim=segment_lens_sparse_dim, dtype="int32"), - ) - state.att.feature_dim_axis = len(state.att.dims) - 1 - return state - - def label_loop_step_output_templates(self, batch_dims: List[Dim]) -> Dict[str, Tensor]: - """loop step out""" - return { - "s": Tensor( - "s", dims=batch_dims + [self.s.out_dim], dtype=rf.get_default_float_dtype(), feature_dim_axis=-1 - ), - "att": Tensor( - "att", - dims=batch_dims + [self.att_num_heads * self.encoder.out_dim], - dtype=rf.get_default_float_dtype(), - feature_dim_axis=-1, - ), - } - - def _get_prev_accum_att_weights_scattered( - self, - prev_accum_att_weights: Tensor, - segment_starts: Tensor, - prev_segment_starts: Tensor, - prev_segment_lens: Tensor, - ) -> Tensor: - - overlap_len = rf.cast(prev_segment_starts + prev_segment_lens - segment_starts, "int32") - overlap_len = rf.where( - rf.logical_or(overlap_len < 0, overlap_len > prev_segment_lens), - rf.convert_to_tensor(0), - overlap_len - ) - overlap_start = prev_segment_lens - overlap_len - - slice_dim = Dim(name="slice", dimension=overlap_len) - gather_positions = rf.range_over_dim(slice_dim) - gather_positions += overlap_start - prev_accum_att_weights_overlap = rf.gather( - prev_accum_att_weights, axis=self.accum_att_weights_dim, indices=gather_positions, clip_to_valid=True - ) - overlap_range = rf.range_over_dim(slice_dim) - - prev_accum_att_weights_scattered = rf.scatter( - prev_accum_att_weights_overlap, - out_dim=self.accum_att_weights_dim, - indices=overlap_range, - indices_dim=slice_dim, - ) - - return prev_accum_att_weights_scattered - - def _get_accum_att_weights( - self, - att_t_dim: Dim, - enc_spatial_dim: Dim, - inv_fertility: Tensor, - att_weights: Tensor, - prev_accum_att_weights_scattered: Tensor, - gather_positions: Tensor, - ) -> Tensor: - att_weights_range = rf.range_over_dim(att_t_dim) - att_weights_scattered = rf.scatter( - att_weights, - out_dim=self.accum_att_weights_dim, - indices=att_weights_range, - indices_dim=att_t_dim, - ) - - 
inv_fertility_sliced = rf.gather(inv_fertility, axis=enc_spatial_dim, indices=gather_positions, clip_to_valid=True) - inv_fertility_scattered = rf.scatter( - inv_fertility_sliced, - out_dim=self.accum_att_weights_dim, - indices=att_weights_range, - indices_dim=att_t_dim, - ) - - accum_att_weights = prev_accum_att_weights_scattered + att_weights_scattered * inv_fertility_scattered * 0.5 - - return accum_att_weights - - def _get_weight_feedback( - self, - prev_accum_att_weights_scattered: Tensor, - att_t_dim: Dim, - ) -> Tensor: - gather_positions = rf.range_over_dim(att_t_dim) - prev_accum_att_weights_sliced = rf.gather( - prev_accum_att_weights_scattered, - axis=self.accum_att_weights_dim, - indices=gather_positions, - clip_to_valid=True - ) - - return self.weight_feedback(prev_accum_att_weights_sliced) - - def label_sync_loop_step( - self, - *, - enc: rf.Tensor, - enc_ctx: rf.Tensor, - inv_fertility: rf.Tensor, - enc_spatial_dim: Dim, - input_embed: rf.Tensor, - segment_starts: rf.Tensor, - segment_lens: rf.Tensor, - state: Optional[rf.State] = None, - ) -> Tuple[Dict[str, rf.Tensor], rf.State]: - """step of the inner loop""" - if state is None: - batch_dims = enc.remaining_dims( - remove=(enc.feature_dim, enc_spatial_dim) if enc_spatial_dim != single_step_dim else (enc.feature_dim,) - ) - state = self.label_decoder_default_initial_state(batch_dims=batch_dims) - state_ = rf.State() - - # during search, these need to be the values from the previous "emit" step (not necessarily the previous time step) - prev_att = state.att - prev_s_state = state.s - prev_accum_att_weights = state.accum_att_weights - prev_segment_starts = state.segment_starts - prev_segment_lens = state.segment_lens - - s, state_.s = self.s(rf.concat_features(input_embed, prev_att), state=prev_s_state, spatial_dim=single_step_dim) - s_transformed = self.s_transformed(s) - - slice_dim = Dim(name="slice", dimension=segment_lens) - gather_positions = rf.range_over_dim(slice_dim) - gather_positions += 
segment_starts - - enc_ctx_sliced = rf.gather(enc_ctx, axis=enc_spatial_dim, indices=gather_positions, clip_to_valid=True) - enc_sliced = rf.gather(enc, axis=enc_spatial_dim, indices=gather_positions, clip_to_valid=True) - - prev_accum_att_weights_scattered = self._get_prev_accum_att_weights_scattered( - prev_accum_att_weights=prev_accum_att_weights, - segment_starts=segment_starts, - prev_segment_starts=prev_segment_starts, - prev_segment_lens=prev_segment_lens, - ) - weight_feedback = self._get_weight_feedback( - prev_accum_att_weights_scattered=prev_accum_att_weights_scattered, - att_t_dim=slice_dim, - ) - - energy_in = enc_ctx_sliced + weight_feedback + s_transformed - energy = self.energy(rf.tanh(energy_in)) - att_weights = rf.softmax(energy, axis=slice_dim) - # we do not need use_mask because the softmax output is already padded with zeros - att0 = rf.dot(att_weights, enc_sliced, reduce=slice_dim, use_mask=False) - att0.feature_dim = self.encoder.out_dim - att, _ = rf.merge_dims(att0, dims=(self.att_num_heads, self.encoder.out_dim)) - state_.att = att - - accum_att_weights = self._get_accum_att_weights( - att_t_dim=slice_dim, - enc_spatial_dim=enc_spatial_dim, - inv_fertility=inv_fertility, - att_weights=att_weights, - prev_accum_att_weights_scattered=prev_accum_att_weights_scattered, - gather_positions=gather_positions, + self.encoder = GlobalConformerEncoder( + enc_in_dim, + enc_out_dim, + num_layers=enc_num_layers, + target_dim=target_dim, + wb_target_dim=align_target_dim, + aux_logits=enc_aux_logits, + ff_dim=enc_ff_dim, + num_heads=enc_num_heads, + encoder_layer_opts=encoder_layer_opts, + enc_key_total_dim=enc_key_total_dim, + dec_att_num_heads=dec_att_num_heads, + dropout=enc_dropout, + att_dropout=att_dropout, + l2=l2, ) - accum_att_weights.feature_dim = self.att_num_heads - state_.accum_att_weights = accum_att_weights - - state_.segment_starts = segment_starts - state_.segment_lens = segment_lens - return {"s": s, "att": att}, state_ + print("using 
label_decoder_version", label_decoder_version) + if label_decoder_version == 1: + label_decoder_class = SegmentalAttLabelDecoder + else: + assert label_decoder_version == 2 + label_decoder_class = SegmentalAttLabelDecoderWoCtxInState - def decode_label_logits(self, *, s: Tensor, input_embed: Tensor, att: Tensor) -> Tensor: - """logits for the decoder""" - readout_in = self.readout_in(rf.concat_features(s, input_embed, att)) - readout = rf.reduce_out(readout_in, mode="max", num_pieces=2, out_dim=self.output_prob.in_dim) - readout = rf.dropout(readout, drop_prob=0.3, axis=self.dropout_broadcast and readout.feature_dim) - logits = self.output_prob(readout) - return logits - - def blank_decoder_default_initial_state(self, *, batch_dims: Sequence[Dim]) -> rf.State: - """Default initial state""" - state = rf.State( - s_length_model=self.s_length_model.default_initial_state(batch_dims=batch_dims), - i=rf.zeros(batch_dims, dtype="int32"), + self.label_decoder = label_decoder_class( + enc_out_dim=self.encoder.out_dim, + target_dim=target_dim, + att_num_heads=dec_att_num_heads, + att_dropout=att_dropout, + blank_idx=blank_idx, + enc_key_total_dim=enc_key_total_dim, + l2=l2, + center_window_size=center_window_size, + language_model=language_model, ) - return state - - def blank_loop_step_output_templates(self, batch_dims: List[Dim]) -> Dict[str, Tensor]: - """loop step out""" - return { - "s_length_model": Tensor( - "s_length_model", - dims=batch_dims + [self.s_length_model.out_dim], - dtype=rf.get_default_float_dtype(), - feature_dim_axis=-1 - ), - } - - def time_sync_loop_step( - self, - *, - enc: rf.Tensor, - enc_spatial_dim: Dim, - input_embed: rf.Tensor, - state: Optional[rf.State] = None, - ) -> Tuple[Dict[str, rf.Tensor], rf.State]: - """step of the inner loop""" - if state is None: - batch_dims = enc.remaining_dims( - remove=(enc.feature_dim, enc_spatial_dim) if enc_spatial_dim != single_step_dim else (enc.feature_dim,) - ) - state = 
self.blank_decoder_default_initial_state(batch_dims=batch_dims) - state_ = rf.State() - - am = rf.gather(enc, axis=enc_spatial_dim, indices=state.i, clip_to_valid=True) - s_length_model, state_.s_length_model = self.s_length_model( - rf.concat_features(am, input_embed), - state=state.s_length_model, - spatial_dim=single_step_dim + self.blank_decoder = BlankDecoder( + length_model_state_dim=length_model_state_dim, + length_model_embed_dim=length_model_embed_dim, + align_target_dim=align_target_dim, + encoder_out_dim=self.encoder.out_dim, ) - state_.i = state.i + 1 - - return {"s_length_model": s_length_model}, state_ - - def decode_blank_logits(self, *, s_length_model: Tensor) -> Tensor: - """logits for the decoder""" - logits = self.emit_prob(s_length_model) - return logits + self.blank_idx = self.label_decoder.blank_idx + self.center_window_size = center_window_size + self.target_dim = self.label_decoder.target_dim + self.align_target_dim = align_target_dim class MakeModel: @@ -318,6 +124,7 @@ def make_model( num_enc_layers: int = 12, pos_emb_dropout: float = 0.0, language_model: Optional[Dict[str, Any]] = None, + label_decoder_version: int, **extra, ) -> SegmentalAttentionModel: """make""" @@ -335,12 +142,12 @@ def make_model( lm = (lm, functools.partial(trafo_lm.make_time_sync_label_scorer_torch, model=lm, align_target_dim=align_target_dim)) return SegmentalAttentionModel( - in_dim=in_dim, - num_enc_layers=num_enc_layers, - enc_model_dim=Dim(name="enc", dimension=512, kind=Dim.Types.Feature), + enc_in_dim=in_dim, + enc_num_layers=num_enc_layers, + enc_out_dim=Dim(name="enc", dimension=512, kind=Dim.Types.Feature), enc_ff_dim=Dim(name="enc-ff", dimension=2048, kind=Dim.Types.Feature), - enc_att_num_heads=8, - enc_conformer_layer_opts=dict( + enc_num_heads=8, + encoder_layer_opts=dict( conv_norm_opts=dict(use_mask=True), self_att_opts=dict( # Shawn et al 2018 style, old RETURNN way. 
@@ -360,6 +167,7 @@ def make_model( length_model_state_dim=Dim(name="length_model_state", dimension=128, kind=Dim.Types.Feature), length_model_embed_dim=Dim(name="length_model_embed", dimension=128, kind=Dim.Types.Feature), center_window_size=center_window_size, + label_decoder_version=label_decoder_version, **extra, ) @@ -379,6 +187,9 @@ def from_scratch_model_def( center_window_size = config.typed_value("center_window_size") if center_window_size is None: raise ValueError("center_window_size is not set!") + + label_decoder_version = config.int("label_decoder_version", 1) + return MakeModel.make_model( in_dim, align_target_dim, @@ -386,7 +197,8 @@ def from_scratch_model_def( center_window_size=center_window_size, enc_aux_logits=enc_aux_logits or (), pos_emb_dropout=pos_emb_dropout, - language_model=lm_opts + language_model=lm_opts, + label_decoder_version=label_decoder_version, ) diff --git a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/model_new/__init__.py b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/model_new/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/model_new/blank_model/__init__.py b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/model_new/blank_model/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/model_new/blank_model/model.py b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/model_new/blank_model/model.py new file mode 100644 index 
000000000..8b57923e8 --- /dev/null +++ b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/model_new/blank_model/model.py @@ -0,0 +1,92 @@ +from typing import Optional, Dict, Any, Sequence, Tuple, List +import functools + +from returnn.tensor import Tensor, Dim, single_step_dim +import returnn.frontend as rf + +from i6_experiments.users.schmitt.returnn_frontend.model_interfaces.model import ModelDef +from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.network_builder_rf.base import _batch_size_factor, _log_mel_feature_dim +from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.network_builder_rf.base import BaseLabelDecoder + + +class BlankDecoder(rf.Module): + def __init__( + self, + length_model_state_dim: Dim, + length_model_embed_dim: Dim, + align_target_dim: Dim, + encoder_out_dim: Dim, + ): + super(BlankDecoder, self).__init__() + self.length_model_state_dim = length_model_state_dim + self.length_model_embed_dim = length_model_embed_dim + self.emit_prob_dim = Dim(name="emit_prob", dimension=1) + + self.target_embed = rf.Embedding(align_target_dim, self.length_model_embed_dim) + self.s = rf.LSTM( + encoder_out_dim + self.length_model_embed_dim, + self.length_model_state_dim, + ) + self.emit_prob = rf.Linear(self.length_model_state_dim, self.emit_prob_dim) + + def default_initial_state(self, *, batch_dims: Sequence[Dim]) -> rf.State: + """Default initial state""" + state = rf.State( + s_blank=self.s.default_initial_state(batch_dims=batch_dims), + i=rf.zeros(batch_dims, dtype="int32"), + ) + return state + + def loop_step_output_templates(self, batch_dims: List[Dim]) -> Dict[str, Tensor]: + """loop step out""" + return { + "s_blank": Tensor( + "s_blank", + dims=batch_dims + [self.s.out_dim], + dtype=rf.get_default_float_dtype(), + feature_dim_axis=-1 + ), + } + + def 
loop_step( + self, + *, + enc: rf.Tensor, + enc_spatial_dim: Dim, + input_embed: rf.Tensor, + state: Optional[rf.State] = None, + spatial_dim=single_step_dim + ) -> Tuple[Dict[str, rf.Tensor], rf.State]: + """step of the inner loop""" + if state is None: + batch_dims = enc.remaining_dims( + remove=(enc.feature_dim, enc_spatial_dim) if enc_spatial_dim != single_step_dim else (enc.feature_dim,) + ) + state = self.default_initial_state(batch_dims=batch_dims) + state_ = rf.State() + + if spatial_dim == single_step_dim: + i = state.i + clip_to_valid = True + else: + i = rf.range_over_dim(spatial_dim) + # do clipping here, because rf.gather complains that i.dims_set is not a superset of enc_spatial_dim.dims_set + seq_lens = rf.copy_to_device(enc_spatial_dim.dyn_size_ext, i.device) + i = rf.where(i < seq_lens, i, seq_lens - 1) + clip_to_valid = False + + am = rf.gather(enc, axis=enc_spatial_dim, indices=i, clip_to_valid=clip_to_valid) + s_blank, state_.s_blank = self.s( + rf.concat_features(am, input_embed), + state=state.s_blank, + spatial_dim=spatial_dim + ) + + state_.i = state.i + 1 + + return {"s_blank": s_blank}, state_ + + def decode_logits(self, *, s_blank: Tensor) -> Tensor: + """logits for the decoder""" + logits = self.emit_prob(s_blank) + return logits diff --git a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/model_new/blank_model/train.py b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/model_new/blank_model/train.py new file mode 100644 index 000000000..13daa8de3 --- /dev/null +++ b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/model_new/blank_model/train.py @@ -0,0 +1,55 @@ +from typing import Optional, Dict, Any, Sequence, Tuple, List + +from 
i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.network_builder_rf.segmental.model_new.blank_model.model import BlankDecoder + +from returnn.tensor import Dim +import returnn.frontend as rf + + +def viterbi_training( + *, + model: BlankDecoder, + enc_args: Dict, + enc_spatial_dim: Dim, + align_targets: rf.Tensor, + align_targets_spatial_dim: Dim, + emit_ground_truth: rf.Tensor, + emit_blank_target_dim: Dim, + batch_dims: List[Dim], +): + align_input_embeddings = model.target_embed(align_targets) + align_input_embeddings = rf.shift_right( + align_input_embeddings, axis=align_targets_spatial_dim, pad_value=0.0) + + blank_loop_out, _ = model.loop_step( + enc=enc_args["enc"], + enc_spatial_dim=enc_spatial_dim, + input_embed=align_input_embeddings, + state=model.default_initial_state(batch_dims=batch_dims,), + spatial_dim=align_targets_spatial_dim, + ) + + blank_logits = model.decode_logits(**blank_loop_out) + blank_logits_packed, pack_dim = rf.pack_padded( + blank_logits, dims=batch_dims + [align_targets_spatial_dim], enforce_sorted=False) + emit_ground_truth_packed, _ = rf.pack_padded( + emit_ground_truth, dims=batch_dims + [align_targets_spatial_dim], enforce_sorted=False, out_dim=pack_dim + ) + + # rf.log_sigmoid not implemented for torch backend + emit_log_prob = rf.log(rf.sigmoid(blank_logits_packed)) + blank_log_prob = rf.log(rf.sigmoid(-blank_logits_packed)) + blank_logit_dim = blank_logits_packed.remaining_dims((pack_dim,))[0] + emit_blank_log_prob, _ = rf.concat( + (blank_log_prob, blank_logit_dim), (emit_log_prob, blank_logit_dim), out_dim=emit_blank_target_dim) + blank_loss = rf.cross_entropy( + target=emit_ground_truth_packed, + estimated=emit_blank_log_prob, + estimated_type="log-probs", + axis=emit_blank_target_dim + ) + blank_loss.mark_as_loss("emit_blank_ce", scale=1.0, use_normalized_loss=True) + + best = rf.reduce_argmax(emit_blank_log_prob, axis=emit_blank_target_dim) + frame_error = best 
!= emit_ground_truth_packed + frame_error.mark_as_loss(name="emit_blank_fer", as_error=True) diff --git a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/model_new/label_model/__init__.py b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/model_new/label_model/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/model_new/label_model/model.py b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/model_new/label_model/model.py new file mode 100644 index 000000000..2313ced51 --- /dev/null +++ b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/model_new/label_model/model.py @@ -0,0 +1,246 @@ +from typing import Optional, Dict, Any, Sequence, Tuple, List +import functools + +from returnn.tensor import Tensor, Dim, single_step_dim +import returnn.frontend as rf + +from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.network_builder_rf.base import BaseLabelDecoder + + +class SegmentalAttLabelDecoder(BaseLabelDecoder): + def __init__(self, center_window_size: int, **kwargs): + super(SegmentalAttLabelDecoder, self).__init__(**kwargs) + + self.center_window_size = center_window_size + self.accum_att_weights_dim = Dim(name="accum_att_weights", dimension=center_window_size) + + def default_initial_state( + self, + *, + batch_dims: Sequence[Dim], + segment_starts_sparse_dim: Optional[Dim] = None, + segment_lens_sparse_dim: Optional[Dim] = None, + ) -> rf.State: + """Default initial state""" + state = rf.State( + s=self._get_lstm().default_initial_state(batch_dims=batch_dims), + 
att=rf.zeros(list(batch_dims) + [self.att_num_heads * self.enc_out_dim]), + accum_att_weights=rf.zeros( + list(batch_dims) + [self.accum_att_weights_dim, self.att_num_heads], feature_dim=self.att_num_heads + ), + segment_starts=rf.zeros(batch_dims, sparse_dim=segment_starts_sparse_dim, dtype="int32"), + segment_lens=rf.zeros(batch_dims, sparse_dim=segment_lens_sparse_dim, dtype="int32"), + ) + state.att.feature_dim_axis = len(state.att.dims) - 1 + return state + + def loop_step_output_templates(self, batch_dims: List[Dim]) -> Dict[str, Tensor]: + """loop step out""" + return { + "s": Tensor( + "s", dims=batch_dims + [self._get_lstm().out_dim], dtype=rf.get_default_float_dtype(), feature_dim_axis=-1 + ), + "att": Tensor( + "att", + dims=batch_dims + [self.att_num_heads * self.enc_out_dim], + dtype=rf.get_default_float_dtype(), + feature_dim_axis=-1, + ), + } + + def _get_prev_accum_att_weights_scattered( + self, + prev_accum_att_weights: Tensor, + segment_starts: Tensor, + prev_segment_starts: Tensor, + prev_segment_lens: Tensor, + ) -> Tensor: + + overlap_len = rf.cast(prev_segment_starts + prev_segment_lens - segment_starts, "int32") + overlap_len = rf.where( + rf.logical_or(overlap_len < 0, overlap_len > prev_segment_lens), + rf.convert_to_tensor(0), + overlap_len + ) + overlap_start = prev_segment_lens - overlap_len + + slice_dim = Dim(name="slice", dimension=overlap_len) + gather_positions = rf.range_over_dim(slice_dim) + gather_positions += overlap_start + prev_accum_att_weights_overlap = rf.gather( + prev_accum_att_weights, axis=self.accum_att_weights_dim, indices=gather_positions, clip_to_valid=True + ) + overlap_range = rf.range_over_dim(slice_dim) + + prev_accum_att_weights_scattered = rf.scatter( + prev_accum_att_weights_overlap, + out_dim=self.accum_att_weights_dim, + indices=overlap_range, + indices_dim=slice_dim, + ) + + return prev_accum_att_weights_scattered + + def _get_accum_att_weights( + self, + att_t_dim: Dim, + enc_spatial_dim: Dim, + 
def loop_step(
    self,
    *,
    enc: rf.Tensor,
    enc_ctx: rf.Tensor,
    inv_fertility: rf.Tensor,
    enc_spatial_dim: Dim,
    input_embed: rf.Tensor,
    segment_starts: rf.Tensor,
    segment_lens: rf.Tensor,
    state: Optional[rf.State] = None,
) -> Tuple[Dict[str, rf.Tensor], rf.State]:
    """Run one label step of the segmental attention decoder.

    The LSTM state is advanced via ``self._update_state``, attention is
    computed over the encoder window ``[segment_starts, segment_starts + segment_lens)``
    and the accumulated attention weights are updated for that window.

    :param enc: encoder output, gathered along ``enc_spatial_dim``
    :param enc_ctx: encoder context, gathered along ``enc_spatial_dim`` and added to the energy input
    :param inv_fertility: forwarded to ``self._get_accum_att_weights``
    :param enc_spatial_dim: spatial dim of the encoder tensors
    :param input_embed: label embedding fed to the LSTM for this step
    :param segment_starts: first encoder position of the attention window
    :param segment_lens: number of encoder positions in the attention window
    :param state: previous decoder state; a default initial state is created if None
    :return: ``({"s": ..., "att": ...}, new_state)`` where ``new_state`` has the entries
        ``s``, ``att``, ``accum_att_weights``, ``segment_starts`` and ``segment_lens``
    """
    if state is None:
        # Batch dims are all dims of `enc` except the feature dim and
        # (unless we run in single-step mode) the encoder spatial dim.
        batch_dims = enc.remaining_dims(
            remove=(enc.feature_dim, enc_spatial_dim) if enc_spatial_dim != single_step_dim else (enc.feature_dim,)
        )
        state = self.default_initial_state(batch_dims=batch_dims)
    state_ = rf.State()

    # during search, these need to be the values from the previous "emit" step (not necessarily the previous time step)
    prev_att = state.att
    prev_s_state = state.s
    prev_accum_att_weights = state.accum_att_weights
    prev_segment_starts = state.segment_starts
    prev_segment_lens = state.segment_lens

    s, state_.s = self._update_state(input_embed, prev_att, prev_s_state)
    s_transformed = self.s_transformed(s)

    # Dynamic dim over the attention window; gather_positions are the
    # corresponding absolute encoder positions (window index + segment start).
    slice_dim = Dim(name="slice", dimension=segment_lens)
    gather_positions = rf.range_over_dim(slice_dim)
    gather_positions += segment_starts

    enc_ctx_sliced = rf.gather(enc_ctx, axis=enc_spatial_dim, indices=gather_positions, clip_to_valid=True)
    enc_sliced = rf.gather(enc, axis=enc_spatial_dim, indices=gather_positions, clip_to_valid=True)

    # Shift the previous accumulated weights so that they line up with the current window.
    prev_accum_att_weights_scattered = self._get_prev_accum_att_weights_scattered(
        prev_accum_att_weights=prev_accum_att_weights,
        segment_starts=segment_starts,
        prev_segment_starts=prev_segment_starts,
        prev_segment_lens=prev_segment_lens,
    )
    weight_feedback = self._get_weight_feedback(
        prev_accum_att_weights_scattered=prev_accum_att_weights_scattered,
        att_t_dim=slice_dim,
    )

    energy_in = enc_ctx_sliced + weight_feedback + s_transformed
    energy = self.energy(rf.tanh(energy_in))
    att_weights = rf.softmax(energy, axis=slice_dim)
    # we do not need use_mask because the softmax output is already padded with zeros
    att0 = rf.dot(att_weights, enc_sliced, reduce=slice_dim, use_mask=False)
    att0.feature_dim = self.enc_out_dim
    # Merge the head dim and the encoder feature dim into a single feature dim.
    att, _ = rf.merge_dims(att0, dims=(self.att_num_heads, self.enc_out_dim))
    state_.att = att

    accum_att_weights = self._get_accum_att_weights(
        att_t_dim=slice_dim,
        enc_spatial_dim=enc_spatial_dim,
        inv_fertility=inv_fertility,
        att_weights=att_weights,
        prev_accum_att_weights_scattered=prev_accum_att_weights_scattered,
        gather_positions=gather_positions,
    )
    accum_att_weights.feature_dim = self.att_num_heads
    state_.accum_att_weights = accum_att_weights

    # Remember the current window so that the next step can compute the overlap.
    state_.segment_starts = segment_starts
    state_.segment_lens = segment_lens

    return {"s": s, "att": att}, state_
def viterbi_training(
    *,
    model: SegmentalAttLabelDecoder,
    enc_args: Dict,
    enc_spatial_dim: Dim,
    non_blank_targets: rf.Tensor,
    non_blank_targets_spatial_dim: Dim,
    segment_starts: rf.Tensor,
    segment_lens: rf.Tensor,
    batch_dims: List[Dim],
):
    """Compute the label-model training losses for given segment boundaries.

    The decoder is unrolled over ``non_blank_targets_spatial_dim`` with
    ``rf.scan``. Two values are registered through ``mark_as_loss``: the
    label-smoothed cross entropy ``"non_blank_ce"`` and the frame error
    ``"non_blank_fer"`` (as an error, not a training loss). Nothing is returned.

    :param model: label decoder providing ``loop_step`` and ``decode_logits``
    :param enc_args: keyword arguments forwarded to ``model.loop_step``
    :param enc_spatial_dim: spatial dim of the encoder output
    :param non_blank_targets: target labels along ``non_blank_targets_spatial_dim``
    :param non_blank_targets_spatial_dim: label axis of the targets
    :param segment_starts: segment start per label
    :param segment_lens: segment length per label
    :param batch_dims: batch dims of the targets
    """
    label_axis = non_blank_targets_spatial_dim
    vocab_axis = model.target_dim

    # Decoder input at step n is the embedding of label n - 1; the first step gets zeros.
    input_embeddings = rf.shift_right(model.target_embed(non_blank_targets), axis=label_axis, pad_value=0.0)

    # ------------------- label loop -------------------

    def _step(xs, state: rf.State):
        step_out, decoder_state = model.loop_step(
            **enc_args,
            enc_spatial_dim=enc_spatial_dim,
            input_embed=xs["input_embed"],
            segment_starts=xs["segment_starts"],
            segment_lens=xs["segment_lens"],
            state=state.decoder,
        )
        return step_out, rf.State(decoder=decoder_state)

    scan_inputs = {
        "input_embed": input_embeddings,
        "segment_starts": segment_starts,
        "segment_lens": segment_lens,
    }
    output_templates = model.loop_step_output_templates(batch_dims=batch_dims)
    initial_decoder_state = model.default_initial_state(
        batch_dims=batch_dims,
        # TODO: do we need these sparse dims? they are automatically added by rf.range_over_dim
        segment_starts_sparse_dim=segment_starts.sparse_dim,
        segment_lens_sparse_dim=segment_lens.sparse_dim,
    )
    decoder_out, _, _ = rf.scan(
        spatial_dim=label_axis,
        xs=scan_inputs,
        ys=output_templates,
        initial=rf.State(decoder=initial_decoder_state),
        body=_step,
    )

    logits = model.decode_logits(input_embed=input_embeddings, **decoder_out)

    # Pack away the padding so that the loss is computed on real labels only.
    logits_packed, packed_dim = rf.pack_padded(logits, dims=batch_dims + [label_axis], enforce_sorted=False)
    targets_packed, _ = rf.pack_padded(
        non_blank_targets, dims=batch_dims + [label_axis], enforce_sorted=False, out_dim=packed_dim
    )

    log_probs = rf.label_smoothed_log_prob_gradient(
        rf.log_softmax(logits_packed, axis=vocab_axis), 0.1, axis=vocab_axis
    )
    ce = rf.cross_entropy(target=targets_packed, estimated=log_probs, estimated_type="log-probs", axis=vocab_axis)
    ce.mark_as_loss("non_blank_ce", scale=1.0, use_normalized_loss=True)

    predicted = rf.reduce_argmax(logits_packed, axis=vocab_axis)
    is_wrong = predicted != targets_packed
    is_wrong.mark_as_loss(name="non_blank_fer", as_error=True)
\ No newline at end of file diff --git a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/model_old/__init__.py b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/model_old/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/model_old/model.py b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/model_old/model.py new file mode 100644 index 000000000..32759b342 --- /dev/null +++ b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/model_old/model.py @@ -0,0 +1,434 @@ +from typing import Optional, Dict, Any, Sequence, Tuple, List +import functools + +from returnn.tensor import Tensor, Dim, single_step_dim +import returnn.frontend as rf + +from i6_experiments.users.schmitt.returnn_frontend.model_interfaces.model import ModelDef +from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.network_builder_rf.base_old import _batch_size_factor, _log_mel_feature_dim +from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.network_builder_rf.base_old import BaseModel + + +class SegmentalAttentionModel(BaseModel): + def __init__( + self, + length_model_state_dim: Dim, + length_model_embed_dim: Dim, + center_window_size: int, + align_target_dim: Dim, + **kwargs + ): + super(SegmentalAttentionModel, self).__init__(**kwargs) + + self.align_target_dim = align_target_dim + + self.length_model_state_dim = length_model_state_dim + self.length_model_embed_dim = length_model_embed_dim + self.emit_prob_dim = Dim(name="emit_prob", 
def label_decoder_default_initial_state(
    self,
    *,
    batch_dims: Sequence[Dim],
    segment_starts_sparse_dim: Optional[Dim] = None,
    segment_lens_sparse_dim: Optional[Dim] = None,
) -> rf.State:
    """Build the all-zero initial state of the label decoder.

    :param batch_dims: dims shared by all state entries
    :param segment_starts_sparse_dim: sparse dim attached to the ``segment_starts`` entry
    :param segment_lens_sparse_dim: sparse dim attached to the ``segment_lens`` entry
    :return: state with the entries ``s``, ``att``, ``accum_att_weights``,
        ``segment_starts`` and ``segment_lens``
    """
    leading_dims = list(batch_dims)

    lstm_state = self.s.default_initial_state(batch_dims=batch_dims)

    att = rf.zeros(leading_dims + [self.att_num_heads * self.encoder.out_dim])
    accum_att_weights = rf.zeros(
        leading_dims + [self.accum_att_weights_dim, self.att_num_heads], feature_dim=self.att_num_heads
    )
    zero_starts = rf.zeros(batch_dims, sparse_dim=segment_starts_sparse_dim, dtype="int32")
    zero_lens = rf.zeros(batch_dims, sparse_dim=segment_lens_sparse_dim, dtype="int32")

    initial = rf.State(
        s=lstm_state,
        att=att,
        accum_att_weights=accum_att_weights,
        segment_starts=zero_starts,
        segment_lens=zero_lens,
    )
    # The merged heads * enc-feature dim is the last axis; mark it as the feature axis.
    initial.att.feature_dim_axis = len(initial.att.dims) - 1
    return initial
def _get_accum_att_weights(
    self,
    att_t_dim: Dim,
    enc_spatial_dim: Dim,
    inv_fertility: Tensor,
    att_weights: Tensor,
    prev_accum_att_weights_scattered: Tensor,
    gather_positions: Tensor,
) -> Tensor:
    """Add the current attention weights to the accumulated weights.

    Both the attention weights and the inverse fertility of the current window
    are scattered from ``att_t_dim`` onto ``self.accum_att_weights_dim``. The
    result is ``prev + att_weights * inv_fertility * 0.5``.

    :param att_t_dim: dim of the current attention window
    :param enc_spatial_dim: spatial dim of ``inv_fertility``
    :param inv_fertility: gathered at ``gather_positions`` along ``enc_spatial_dim``
    :param att_weights: attention weights over ``att_t_dim``
    :param prev_accum_att_weights_scattered: previous accumulated weights, already
        laid out over ``self.accum_att_weights_dim``
    :param gather_positions: encoder positions of the current window
    :return: updated accumulated attention weights
    """
    window_positions = rf.range_over_dim(att_t_dim)

    def _onto_accum_dim(values: Tensor) -> Tensor:
        # Place window entry k at position k of the accumulation dim.
        return rf.scatter(
            values,
            out_dim=self.accum_att_weights_dim,
            indices=window_positions,
            indices_dim=att_t_dim,
        )

    scattered_weights = _onto_accum_dim(att_weights)
    window_inv_fertility = rf.gather(
        inv_fertility, axis=enc_spatial_dim, indices=gather_positions, clip_to_valid=True
    )
    scattered_inv_fertility = _onto_accum_dim(window_inv_fertility)

    return prev_accum_att_weights_scattered + scattered_weights * scattered_inv_fertility * 0.5
def decode_label_logits(self, *, s: Tensor, input_embed: Tensor, att: Tensor) -> Tensor:
    """Compute the label logits from the decoder step outputs.

    The features of ``s``, ``input_embed`` and ``att`` are concatenated, passed
    through ``self.readout_in``, reduced with a 2-piece maxout, regularised with
    dropout (probability 0.3) and projected by ``self.output_prob``.

    :param s: decoder LSTM output
    :param input_embed: label embedding that was fed to the decoder
    :param att: attention context vector
    :return: logits produced by ``self.output_prob``
    """
    joint_features = rf.concat_features(s, input_embed, att)
    pre_maxout = self.readout_in(joint_features)
    pooled = rf.reduce_out(pre_maxout, mode="max", num_pieces=2, out_dim=self.output_prob.in_dim)

    # Either the falsy `dropout_broadcast` value itself or the feature dim of the pooled readout.
    dropout_axis = self.dropout_broadcast and pooled.feature_dim
    pooled = rf.dropout(pooled, drop_prob=0.3, axis=dropout_axis)

    return self.output_prob(pooled)
class MakeModel:
    """Factory for :class:`SegmentalAttentionModel`, used e.g. for checkpoint import.

    An instance stores plain integer sizes and builds the model when called.
    :meth:`make_model` can also be used directly with ready-made ``Dim`` objects.
    """

    def __init__(
        self,
        in_dim: int,
        align_target_dim: int,
        target_dim: int,
        *,
        center_window_size: int,
        eos_label: int = 0,
        num_enc_layers: int = 12,
    ):
        """
        :param in_dim: size of the input feature dim
        :param align_target_dim: size of the alignment label dim
        :param target_dim: size of the non-blank label dim
        :param center_window_size: size of the attention window (also the size of the accumulated-weights dim)
        :param eos_label: EOS index used for the generated vocabulary
        :param num_enc_layers: number of encoder layers
        """
        self.in_dim = in_dim
        self.align_target_dim = align_target_dim
        self.target_dim = target_dim
        self.center_window_size = center_window_size
        self.eos_label = eos_label
        self.num_enc_layers = num_enc_layers

    def __call__(self) -> SegmentalAttentionModel:
        """Create the dims from the stored sizes and build the model."""
        from returnn.datasets.util.vocabulary import Vocabulary

        in_dim = Dim(name="in", dimension=self.in_dim, kind=Dim.Types.Feature)
        align_target_dim = Dim(name="align_target", dimension=self.align_target_dim, kind=Dim.Types.Feature)
        target_dim = Dim(name="non_blank_target", dimension=self.target_dim, kind=Dim.Types.Feature)
        # Dummy vocabulary: the labels are just the stringified indices.
        target_dim.vocab = Vocabulary.create_vocab_from_labels(
            [str(i) for i in range(target_dim.dimension)], eos_label=self.eos_label
        )

        # Forward num_enc_layers as well; previously the value given to __init__
        # was stored but never used, so make_model always fell back to its default.
        return self.make_model(
            in_dim,
            align_target_dim,
            target_dim,
            center_window_size=self.center_window_size,
            num_enc_layers=self.num_enc_layers,
        )

    @classmethod
    def make_model(
        cls,
        in_dim: Dim,
        align_target_dim: Dim,
        target_dim: Dim,
        *,
        center_window_size: int,
        num_enc_layers: int = 12,
        pos_emb_dropout: float = 0.0,
        language_model: Optional[Dict[str, Any]] = None,
        **extra,
    ) -> SegmentalAttentionModel:
        """Build a :class:`SegmentalAttentionModel` with the baseline hyper-parameters.

        :param in_dim: input feature dim
        :param align_target_dim: alignment label dim
        :param target_dim: non-blank label dim; its size is used as the blank index
        :param center_window_size: size of the attention window
        :param num_enc_layers: number of encoder layers
        :param pos_emb_dropout: dropout on the positional embedding of the encoder self-attention
        :param language_model: optional LM options; ``"class"`` must be ``"TransformerDecoder"``,
            a given ``"vocab_dim"`` is ignored and replaced by ``target_dim``
        :param extra: further keyword arguments forwarded to the model constructor
        :return: the model
        """
        lm = None
        if language_model:
            assert isinstance(language_model, dict)
            language_model = language_model.copy()  # do not modify the caller's dict
            cls_name = language_model.pop("class")
            assert cls_name == "TransformerDecoder"
            language_model.pop("vocab_dim", None)  # will just overwrite

            from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.network_builder_rf.lm.trafo import model as trafo_lm

            lm = trafo_lm.MakeModel(vocab_dim=target_dim, **language_model)()
            # Pair the LM with a factory for its time-synchronous label scorer.
            lm = (lm, functools.partial(trafo_lm.make_time_sync_label_scorer_torch, model=lm, align_target_dim=align_target_dim))

        return SegmentalAttentionModel(
            in_dim=in_dim,
            num_enc_layers=num_enc_layers,
            enc_model_dim=Dim(name="enc", dimension=512, kind=Dim.Types.Feature),
            enc_ff_dim=Dim(name="enc-ff", dimension=2048, kind=Dim.Types.Feature),
            enc_att_num_heads=8,
            enc_conformer_layer_opts=dict(
                conv_norm_opts=dict(use_mask=True),
                self_att_opts=dict(
                    # Shawn et al 2018 style, old RETURNN way.
                    with_bias=False,
                    with_linear_pos=False,
                    with_pos_bias=False,
                    learnable_pos_emb=True,
                    separate_pos_emb_per_head=False,
                    pos_emb_dropout=pos_emb_dropout,
                ),
                ff_activation=lambda x: rf.relu(x) ** 2.0,
            ),
            target_dim=target_dim,
            align_target_dim=align_target_dim,
            # The blank label is the index right after the last non-blank label.
            blank_idx=target_dim.dimension,
            language_model=lm,
            length_model_state_dim=Dim(name="length_model_state", dimension=128, kind=Dim.Types.Feature),
            length_model_embed_dim=Dim(name="length_model_embed", dimension=128, kind=Dim.Types.Feature),
            center_window_size=center_window_size,
            **extra,
        )
default_target_key = config.typed_value("target") + extern_data_dict = config.typed_value("extern_data") + non_blank_vocab = config.typed_value("non_blank_vocab") + data = Tensor(name=default_input_key, **extern_data_dict[default_input_key]) + targets = Tensor(name=default_target_key, **extern_data_dict[default_target_key]) + non_blank_targets = Tensor( + name="non_blank_targets", + sparse_dim=Dim(description="non_blank_vocab", dimension=targets.sparse_dim.dimension - 1, kind=Dim.Types.Spatial), + vocab=non_blank_vocab, + ) + + model_def = config.typed_value("_model_def") + model = model_def( + epoch=epoch, in_dim=data.feature_dim, align_target_dim=targets.sparse_dim, target_dim=non_blank_targets.sparse_dim) + return model diff --git a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/model_import.py b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/model_old/model_import.py similarity index 100% rename from users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/model_import.py rename to users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/model_old/model_import.py diff --git a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/model_old/recog.py b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/model_old/recog.py new file mode 100644 index 000000000..bce5d47b6 --- /dev/null +++ b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/model_old/recog.py @@ -0,0 +1,527 @@ +from typing import Optional, Dict, Any, Tuple +import tree + +from returnn.tensor import Tensor, 
def model_recog(
    *,
    model: SegmentalAttentionModel,
    data: Tensor,
    data_spatial_dim: Dim,
    max_seq_len: Optional[int] = None,
) -> Tuple[Tensor, Tensor, Dim, Dim]:
    """
    Function is run within RETURNN.

    Time-synchronous beam search (fixed beam size 12) over the encoder frames.
    In every frame, the label decoder and the blank (length model) decoder are
    evaluated, their scores are combined into one distribution over
    ``model.align_target_dim``, and the best hypotheses are kept via top-k.
    The label decoder state is only advanced for hypotheses which emitted a
    non-blank label. Finally, the blank labels are removed from the result.

    Earlier we used the generic beam_search function,
    but now we just directly perform the search here,
    as this is overall simpler and shorter.

    :param model: the model; must not have a language model attached
    :param data: input features
    :param data_spatial_dim: spatial dim of ``data``
    :param max_seq_len: max number of decoding steps; defaults to the encoder sequence lengths
    :return:
      recog results including beam {batch, beam, out_spatial},
      log probs {batch, beam},
      out_spatial_dim,
      final beam_dim
    """
    assert not model.language_model  # not implemented here. use the pure PyTorch search instead

    batch_dims = data.remaining_dims((data_spatial_dim, data.feature_dim))
    enc_args, enc_spatial_dim = model.encode(data, in_spatial_dim=data_spatial_dim)
    beam_size = 12
    if max_seq_len is None:
        max_seq_len = enc_spatial_dim.get_size_tensor()
    else:
        max_seq_len = rf.convert_to_tensor(max_seq_len, dtype="int32")
    print("** max seq len:", max_seq_len.raw_tensor)
    # Reduce to a scalar: the loop below runs for the longest sequence.
    max_seq_len = rf.reduce_max(max_seq_len, axis=max_seq_len.dims)

    # Eager-mode implementation of beam search.
    # Initial state.
    beam_dim = Dim(1, name="initial-beam")
    batch_dims_ = [beam_dim] + batch_dims
    label_decoder_state = model.label_decoder_default_initial_state(batch_dims=batch_dims_,)

    blank_decoder_state = model.blank_decoder_default_initial_state(batch_dims=batch_dims_)
    bos_idx = 0
    target = rf.constant(bos_idx, dims=batch_dims_, sparse_dim=model.align_target_dim)
    target_non_blank = rf.constant(bos_idx, dims=batch_dims_, sparse_dim=model.target_dim)
    # ended = rf.constant(False, dims=batch_dims_)
    seq_log_prob = rf.constant(0.0, dims=batch_dims_)

    i = 0
    seq_targets = []
    seq_backrefs = []
    while i < max_seq_len.raw_tensor:
        if i == 0:
            # First frame: both decoders get all-zero input embeddings.
            input_embed = rf.zeros(batch_dims_ + [model.target_embed.out_dim], feature_dim=model.target_embed.out_dim, dtype="float32")
            input_embed_length_model = rf.zeros(
                batch_dims_ + [model.target_embed_length_model.out_dim], feature_dim=model.target_embed_length_model.out_dim)
        else:
            # The length model sees the previous alignment label (including blank);
            # `input_embed` for the label decoder is updated at the end of the loop body.
            input_embed_length_model = model.target_embed_length_model(target)

        # ------------------- label step -------------------
        # Attention window centered at the current frame i, clipped to the valid encoder positions.
        center_position = rf.minimum(
            rf.full(dims=[beam_dim] + batch_dims, fill_value=i, dtype="int32"),
            rf.copy_to_device(enc_spatial_dim.get_size_tensor() - 1, data.device)
        )
        segment_starts = rf.maximum(
            rf.convert_to_tensor(0, dtype="int32"), center_position - model.center_window_size // 2)
        segment_ends = rf.minimum(
            rf.copy_to_device(enc_spatial_dim.get_size_tensor() - 1, data.device),
            center_position + model.center_window_size // 2
        )
        segment_lens = segment_ends - segment_starts + 1

        label_step_out, label_decoder_state_updated = model.label_sync_loop_step(
            **enc_args,
            enc_spatial_dim=enc_spatial_dim,
            input_embed=input_embed,
            segment_lens=segment_lens,
            segment_starts=segment_starts,
            state=label_decoder_state,
        )
        label_logits = model.decode_label_logits(input_embed=input_embed, **label_step_out)
        label_log_prob = rf.log_softmax(label_logits, axis=model.target_dim)

        # ------------------- blank step -------------------

        blank_step_out, blank_decoder_state = model.time_sync_loop_step(
            enc=enc_args["enc"],
            enc_spatial_dim=enc_spatial_dim,
            input_embed=input_embed_length_model,
            state=blank_decoder_state,
        )
        blank_logits = model.decode_blank_logits(**blank_step_out)
        # sigmoid(logit) is the emit probability, sigmoid(-logit) the blank probability.
        emit_log_prob = rf.log(rf.sigmoid(blank_logits))
        emit_log_prob = rf.squeeze(emit_log_prob, axis=emit_log_prob.feature_dim)
        blank_log_prob = rf.log(rf.sigmoid(-blank_logits))

        # combine blank and label probs
        label_log_prob += emit_log_prob
        output_log_prob, _ = rf.concat(
            (label_log_prob, model.target_dim), (blank_log_prob, blank_log_prob.feature_dim),
            out_dim=model.align_target_dim
        )

        # top-k
        seq_log_prob = seq_log_prob + output_log_prob  # Batch, InBeam, Vocab
        old_beam_dim = beam_dim.copy()
        seq_log_prob, (backrefs, target), beam_dim = rf.top_k(
            seq_log_prob, k_dim=Dim(beam_size, name=f"dec-step{i}-beam"), axis=[beam_dim, model.align_target_dim]
        )  # seq_log_prob, backrefs, target: Batch, Beam
        seq_targets.append(target)
        seq_backrefs.append(backrefs)

        # True for hypotheses which emitted a non-blank label in this frame.
        update_state_mask = rf.convert_to_tensor(target != model.blank_idx)

        def _get_masked_state(old, new, mask):
            # Select the surviving hypotheses, then take the new value only where a label was emitted.
            old = rf.gather(old, indices=backrefs, axis=old_beam_dim)
            new = rf.gather(new, indices=backrefs, axis=old_beam_dim)
            return rf.where(mask, new, old)

        label_decoder_state = tree.map_structure(
            lambda old_state, new_state: _get_masked_state(old_state, new_state, update_state_mask),
            label_decoder_state, label_decoder_state_updated
        )

        # Keep the last emitted non-blank label (and its embedding) per hypothesis.
        target_non_blank = rf.where(update_state_mask, target, rf.gather(target_non_blank, indices=backrefs))
        target_non_blank.sparse_dim = model.target_embed.in_dim
        input_embed = rf.where(
            update_state_mask,
            model.target_embed(target_non_blank),
            rf.gather(input_embed, indices=backrefs, axis=old_beam_dim)
        )

        # The blank decoder advances in every frame; only reorder it to the new beam.
        blank_decoder_state = tree.map_structure(lambda s: rf.gather(s, indices=backrefs), blank_decoder_state)

        i += 1

    # Backtrack via backrefs, resolve beams.
    seq_targets_ = []
    indices = rf.range_over_dim(beam_dim)  # FinalBeam -> FinalBeam
    for backrefs, target in zip(seq_backrefs[::-1], seq_targets[::-1]):
        # indices: FinalBeam -> Beam
        # backrefs: Beam -> PrevBeam
        seq_targets_.insert(0, rf.gather(target, indices=indices))
        indices = rf.gather(backrefs, indices=indices)  # FinalBeam -> PrevBeam

    # NOTE(review): seq_targets_[0] raises IndexError if the loop above ran zero times -- confirm this cannot happen.
    seq_targets__ = TensorArray(seq_targets_[0])
    for target in seq_targets_:
        seq_targets__ = seq_targets__.push_back(target)
    seq_targets = seq_targets__.stack(axis=enc_spatial_dim)

    # Remove the blank labels to obtain the non-blank label sequence.
    non_blank_targets, non_blank_targets_spatial_dim = get_masked(
        seq_targets,
        get_non_blank_mask(seq_targets, model.blank_idx),
        enc_spatial_dim,
        [beam_dim] + batch_dims,
    )
    non_blank_targets.sparse_dim = model.target_dim

    return non_blank_targets, seq_log_prob, non_blank_targets_spatial_dim, beam_dim
+ + :return: + recog results including beam {batch, beam, out_spatial}, + log probs {batch, beam}, + recog results info: key -> {batch, beam}, + out_spatial_dim, + final beam_dim + """ + import torch + from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.network_builder_rf.beam_search.time_sync import BeamSearchOpts, time_sync_beam_search + from i6_experiments.users.schmitt.returnn_frontend.model_interfaces.label_scorer import ShallowFusedLabelScorers + from returnn.config import get_global_config + + config = get_global_config() + + torch.cuda.set_sync_debug_mode(1) # debug CUDA sync. does not hurt too much to leave this always in? + + data_concat_zeros = config.float("data_concat_zeros", 0) + if data_concat_zeros: + data_concat_zeros_dim = Dim(int(data_concat_zeros * _batch_size_factor * 100), name="data_concat_zeros") + data, data_spatial_dim = rf.concat( + (data, data_spatial_dim), (rf.zeros([data_concat_zeros_dim]), data_concat_zeros_dim), allow_broadcast=True + ) + + batch_dims = data.remaining_dims((data_spatial_dim, data.feature_dim)) + assert len(batch_dims) == 1, batch_dims # not implemented otherwise, simple to add... 
+ batch_dim = batch_dims[0] + enc, enc_spatial_dim = model.encode(data, in_spatial_dim=data_spatial_dim) + if max_seq_len is None: + max_seq_len = enc_spatial_dim.get_size_tensor() + else: + max_seq_len = rf.convert_to_tensor(max_seq_len, dtype="int32") + + beam_search_opts = (config.typed_value("beam_search_opts", None) or {}).copy() + if beam_search_opts.get("beam_size") is None: + beam_search_opts["beam_size"] = config.int("beam_size", 12) + if beam_search_opts.get("length_normalization_exponent") is None: + beam_search_opts["length_normalization_exponent"] = config.float("length_normalization_exponent", 1.0) + + label_scorer = ShallowFusedLabelScorers() + label_scorer.label_scorers["label_sync_decoder"] = ( + get_label_sync_scorer_pure_torch(model=model, batch_dim=batch_dim, enc=enc, enc_spatial_dim=enc_spatial_dim), + 1.0, + ) + label_scorer.label_scorers["time_sync_decoder"] = ( + get_time_sync_scorer_pure_torch(model=model, batch_dim=batch_dim, enc=enc, enc_spatial_dim=enc_spatial_dim), + 1.0, + ) + if model.language_model: + lm_scale = beam_search_opts.pop("lm_scale") # must be defined with LM + label_scorer.label_scorers["lm"] = (model.language_model_make_label_scorer(), lm_scale) + + print("** max seq len:", max_seq_len.raw_tensor) + + # Beam search happening here: + ( + seq_targets, # [Batch,FinalBeam,OutSeqLen] + seq_log_prob, # [Batch,FinalBeam] + ) = time_sync_beam_search( + label_scorer, + label_sync_keys=["label_sync_decoder", "lm"] if model.language_model else ["label_sync_decoder"], + time_sync_keys=["time_sync_decoder"], + batch_size=int(batch_dim.get_dim_value()), + blank_idx=model.blank_idx, + max_seq_len=max_seq_len.copy_compatible_to_dims_raw([batch_dim]), + device=data.raw_tensor.device, + opts=BeamSearchOpts( + **beam_search_opts, + bos_label=0, + eos_label=0, + num_labels=model.target_dim.dimension, + ), + ) + + beam_dim = Dim(seq_log_prob.shape[1], name="beam") + seq_targets_t = rf.convert_to_tensor( + seq_targets, dims=[batch_dim, 
beam_dim, enc_spatial_dim], sparse_dim=model.target_dim + ) + seq_log_prob_t = rf.convert_to_tensor(seq_log_prob, dims=[batch_dim, beam_dim]) + + non_blank_targets, non_blank_targets_spatial_dim = get_masked( + seq_targets_t, + get_non_blank_mask(seq_targets_t, model.blank_idx), + enc_spatial_dim, + [beam_dim] + batch_dims, + ) + non_blank_targets.sparse_dim = model.target_dim + + return non_blank_targets, seq_log_prob_t, non_blank_targets_spatial_dim, beam_dim + + +# RecogDef API +model_recog_pure_torch: RecogDef[SegmentalAttentionModel] +model_recog_pure_torch.output_with_beam = True +model_recog_pure_torch.output_blank_label = None +model_recog_pure_torch.batch_size_dependent = False + + +def get_label_sync_scorer_pure_torch( + *, + model: SegmentalAttentionModel, + batch_dim: Dim, + enc: Dict[str, Tensor], + enc_spatial_dim: Dim, +): + import torch + import functools + from i6_experiments.users.schmitt.returnn_frontend.model_interfaces.label_scorer import ( + LabelScorerIntf, + StateObjTensorExt, + StateObjIgnored, + ) + + class LabelScorer(LabelScorerIntf): + """label scorer""" + + def get_initial_state(self, *, batch_size: int, device: torch.device) -> Any: + """Initial state.""" + beam_dim = Dim(1, name="initial-beam") + batch_dims_ = [batch_dim, beam_dim] + decoder_state = model.label_decoder_default_initial_state(batch_dims=batch_dims_,) + return tree.map_structure(functools.partial(self._map_tensor_to_raw, beam_dim=beam_dim), decoder_state) + + def score_and_update_state( + self, + *, + prev_state: Any, + prev_label: torch.Tensor, + prev_align_label: Optional[torch.Tensor] = None, + t: Optional[int] = None, + ) -> Tuple[torch.Tensor, Any]: + """update state""" + beam_dim = Dim(prev_label.shape[1], name="beam") + assert t is not None + + def _map_raw_to_tensor(v): + if isinstance(v, StateObjTensorExt): + tensor: Tensor = v.extra + tensor = tensor.copy_template_new_dim_tags( + (batch_dim, beam_dim) + tensor.dims[2:], keep_special_axes=True + ) + 
tensor.raw_tensor = v.tensor + return tensor + elif isinstance(v, StateObjIgnored): + return v.content + else: + raise TypeError(f"_map_raw_to_tensor: unexpected {v} ({type(v).__name__})") + + center_position = rf.minimum( + rf.full(dims=[beam_dim, batch_dim], fill_value=t, dtype="int32"), + rf.copy_to_device(enc_spatial_dim.get_size_tensor() - 1, enc["enc"].device) + ) + segment_starts = rf.maximum( + rf.convert_to_tensor(0, dtype="int32"), center_position - model.center_window_size // 2) + segment_ends = rf.minimum( + rf.copy_to_device(enc_spatial_dim.get_size_tensor() - 1, enc["enc"].device), + center_position + model.center_window_size // 2 + ) + segment_lens = segment_ends - segment_starts + 1 + + zeros_embed = rf.zeros( + [batch_dim, beam_dim, model.target_embed.out_dim], + feature_dim=model.target_embed.out_dim, + dtype="float32" + ) + initial_output_mask = rf.convert_to_tensor(prev_label == -1, dims=[batch_dim, beam_dim]) + prev_label = rf.convert_to_tensor(prev_label, dims=[batch_dim, beam_dim], sparse_dim=model.target_dim) + prev_label = rf.where( + initial_output_mask, + rf.zeros_like(prev_label), + prev_label + ) + input_embed = rf.where( + initial_output_mask, + zeros_embed, + model.target_embed(prev_label) + ) + + decode_out, decoder_state = model.label_sync_loop_step( + **enc, + enc_spatial_dim=enc_spatial_dim, + input_embed=input_embed, + segment_lens=segment_lens, + segment_starts=segment_starts, + state=tree.map_structure(_map_raw_to_tensor, prev_state), + ) + logits = model.decode_label_logits(input_embed=input_embed, **decode_out) + label_log_prob = rf.log_softmax(logits, axis=model.target_dim) + + blank_log_prob = rf.zeros( + [Dim(1, name="blank_log_prob_label_scorer")], + dtype="float32" + ) + output_log_prob, _ = rf.concat( + (label_log_prob, model.target_dim), (blank_log_prob, blank_log_prob.dims[0]), + out_dim=model.align_target_dim, + allow_broadcast=True + ) + assert set(output_log_prob.dims) == {batch_dim, beam_dim, 
model.align_target_dim} + + return ( + self._map_tensor_to_raw(output_log_prob, beam_dim=beam_dim).tensor, + tree.map_structure(functools.partial(self._map_tensor_to_raw, beam_dim=beam_dim), decoder_state), + ) + + @staticmethod + def _map_tensor_to_raw(v, *, beam_dim: Dim): + if isinstance(v, Tensor): + if beam_dim not in v.dims: + return StateObjIgnored(v) + batch_dims_ = [batch_dim, beam_dim] + v = v.copy_transpose(batch_dims_ + [dim for dim in v.dims if dim not in batch_dims_]) + raw = v.raw_tensor + return StateObjTensorExt(raw, v.copy_template()) + elif isinstance(v, Dim): + return StateObjIgnored(v) + else: + raise TypeError(f"_map_tensor_to_raw: unexpected {v} ({type(v).__name__})") + + return LabelScorer() + + +def get_time_sync_scorer_pure_torch( + *, + model: SegmentalAttentionModel, + batch_dim: Dim, + enc: Dict[str, Tensor], + enc_spatial_dim: Dim, +): + import torch + import functools + from i6_experiments.users.schmitt.returnn_frontend.model_interfaces.label_scorer import ( + LabelScorerIntf, + StateObjTensorExt, + StateObjIgnored, + ) + + class LabelScorer(LabelScorerIntf): + """label scorer""" + + def get_initial_state(self, *, batch_size: int, device: torch.device) -> Any: + """Initial state.""" + beam_dim = Dim(1, name="initial-beam") + batch_dims_ = [batch_dim, beam_dim] + decoder_state = model.blank_decoder_default_initial_state(batch_dims=batch_dims_,) + return tree.map_structure(functools.partial(self._map_tensor_to_raw, beam_dim=beam_dim), decoder_state) + + def score_and_update_state( + self, + *, + prev_state: Any, + prev_label: torch.Tensor, + prev_align_label: Optional[torch.Tensor] = None, + t: Optional[int] = None, + ) -> Tuple[torch.Tensor, Any]: + """update state""" + beam_dim = Dim(prev_label.shape[1], name="beam") + assert prev_align_label is not None + + def _map_raw_to_tensor(v): + if isinstance(v, StateObjTensorExt): + tensor: Tensor = v.extra + tensor = tensor.copy_template_new_dim_tags( + (batch_dim, beam_dim) + 
tensor.dims[2:], keep_special_axes=True + ) + tensor.raw_tensor = v.tensor + return tensor + elif isinstance(v, StateObjIgnored): + return v.content + else: + raise TypeError(f"_map_raw_to_tensor: unexpected {v} ({type(v).__name__})") + + zeros_embed = rf.zeros( + [batch_dim, beam_dim, model.target_embed_length_model.out_dim], + feature_dim=model.target_embed_length_model.out_dim, + dtype="float32" + ) + initial_output_mask = rf.convert_to_tensor(prev_align_label == -1, dims=[batch_dim, beam_dim]) + prev_align_label = rf.convert_to_tensor(prev_align_label, dims=[batch_dim, beam_dim], sparse_dim=model.align_target_dim) + prev_align_label = rf.where( + initial_output_mask, + rf.zeros_like(prev_align_label), + prev_align_label + ) + input_embed = rf.where( + initial_output_mask, + zeros_embed, + model.target_embed_length_model(prev_align_label) + ) + + decode_out, decoder_state = model.time_sync_loop_step( + enc=enc["enc"], + enc_spatial_dim=enc_spatial_dim, + input_embed=input_embed, + state=tree.map_structure(_map_raw_to_tensor, prev_state), + ) + blank_logits = model.decode_blank_logits(**decode_out) + emit_log_prob = rf.log(rf.sigmoid(blank_logits)) + emit_log_prob = rf.squeeze(emit_log_prob, axis=emit_log_prob.feature_dim) + blank_log_prob = rf.log(rf.sigmoid(-blank_logits)) + + label_log_prob = rf.zeros( + dims=[batch_dim, beam_dim, model.target_dim], + dtype="float32" + ) + label_log_prob += emit_log_prob + output_log_prob, _ = rf.concat( + (label_log_prob, model.target_dim), (blank_log_prob, blank_log_prob.feature_dim), + out_dim=model.align_target_dim + ) + assert set(output_log_prob.dims) == {batch_dim, beam_dim, model.align_target_dim} + + return ( + self._map_tensor_to_raw(output_log_prob, beam_dim=beam_dim).tensor, + tree.map_structure(functools.partial(self._map_tensor_to_raw, beam_dim=beam_dim), decoder_state), + ) + + @staticmethod + def _map_tensor_to_raw(v, *, beam_dim: Dim): + if isinstance(v, Tensor): + if beam_dim not in v.dims: + return 
StateObjIgnored(v) + batch_dims_ = [batch_dim, beam_dim] + v = v.copy_transpose(batch_dims_ + [dim for dim in v.dims if dim not in batch_dims_]) + raw = v.raw_tensor + return StateObjTensorExt(raw, v.copy_template()) + elif isinstance(v, Dim): + return StateObjIgnored(v) + else: + raise TypeError(f"_map_tensor_to_raw: unexpected {v} ({type(v).__name__})") + + return LabelScorer() diff --git a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/model_old/train.py b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/model_old/train.py new file mode 100644 index 000000000..3b62b32c2 --- /dev/null +++ b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/model_old/train.py @@ -0,0 +1,280 @@ +from typing import Dict, List + +from i6_experiments.users.schmitt.returnn_frontend.model_interfaces.training import FramewiseTrainDef + +from returnn.tensor import TensorDict + +from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.network_builder_rf.segmental.utils import get_non_blank_mask, get_masked +from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.network_builder_rf.segmental.model_old.model import SegmentalAttentionModel + +from returnn.tensor import Dim +import returnn.frontend as rf + +from i6_experiments.users.schmitt.returnn_frontend.model_interfaces.training import TrainDef + + +def _returnn_v2_train_step(*, model, extern_data: TensorDict, **_kwargs_unused): + from returnn.config import get_global_config + + config = get_global_config() + default_input_key = config.typed_value("default_input") + default_target_key = config.typed_value("target") + data = extern_data[default_input_key] + data_spatial_dim = data.get_time_dim_tag() + 
targets = extern_data[default_target_key] + targets_spatial_dim = targets.get_time_dim_tag() + train_def: FramewiseTrainDef = config.typed_value("_train_def") + train_def( + model=model, + data=data, + data_spatial_dim=data_spatial_dim, + align_targets=targets, + align_targets_spatial_dim=targets_spatial_dim, + ) + + +def get_blank_loop_out_unoptimized( + model: SegmentalAttentionModel, + enc_args: Dict, + enc_spatial_dim: Dim, + align_targets_spatial_dim: Dim, + align_input_embeddings: rf.Tensor, + batch_dims: List[Dim] +): + def _blank_loop_body(xs, state: rf.State): + new_state = rf.State() + loop_out_, new_state.decoder = model.time_sync_loop_step( + enc=enc_args["enc"], + enc_spatial_dim=enc_spatial_dim, + input_embed=xs["input_embed"], + state=state.decoder, + ) + return loop_out_, new_state + + blank_loop_out, _, _ = rf.scan( + spatial_dim=align_targets_spatial_dim, + xs={ + "input_embed": align_input_embeddings, + }, + ys=model.blank_loop_step_output_templates(batch_dims=batch_dims), + initial=rf.State( + decoder=model.blank_decoder_default_initial_state( + batch_dims=batch_dims, + ), + ), + body=_blank_loop_body, + ) + return blank_loop_out + + +def get_blank_loop_out_optimized( + model: SegmentalAttentionModel, + enc_args: Dict, + enc_spatial_dim: Dim, + align_input_embeddings: rf.Tensor, + batch_dims: List[Dim], + align_targets_spatial_dim: Dim +): + blank_loop_out, _ = model.time_sync_loop_step( + enc=enc_args["enc"], + enc_spatial_dim=enc_spatial_dim, + input_embed=align_input_embeddings, + state=model.blank_decoder_default_initial_state(batch_dims=batch_dims,), + spatial_dim=align_targets_spatial_dim, + ) + return blank_loop_out + + +def from_scratch_training( + *, + model: SegmentalAttentionModel, + data: rf.Tensor, + data_spatial_dim: Dim, + align_targets: rf.Tensor, + align_targets_spatial_dim: Dim +): + """ + Here + """ + from returnn.config import get_global_config + import torch + + config = get_global_config() # noqa + aux_loss_layers = 
config.typed_value("aux_loss_layers") + aux_loss_scales = config.typed_value("aux_loss_scales", ([1.0] * len(aux_loss_layers)) if aux_loss_layers else None) + aed_loss_scale = config.float("aed_loss_scale", 1.0) + use_normalized_loss = config.bool("use_normalized_loss", True) + use_optimized_length_model_loop = config.bool("use_optimized_length_model_loop", True) + + if data.feature_dim and data.feature_dim.dimension == 1: + data = rf.squeeze(data, axis=data.feature_dim) + assert not data.feature_dim # raw audio + + batch_dims = data.remaining_dims(data_spatial_dim) + + def _get_segment_starts_and_lens(out_spatial_dim: Dim): + non_blank_mask = get_non_blank_mask(align_targets, model.blank_idx) + targets_range = rf.range_over_dim(align_targets_spatial_dim, dtype="int32") + targets_range = rf.expand_dim(targets_range, batch_dims[0]) + non_blank_positions, _ = get_masked( + targets_range, non_blank_mask, align_targets_spatial_dim, batch_dims, out_spatial_dim + ) + starts = rf.maximum( + rf.convert_to_tensor(0, dtype="int32"), non_blank_positions - model.center_window_size // 2) + ends = rf.minimum( + rf.copy_to_device(align_targets_spatial_dim.get_size_tensor() - 1, non_blank_positions.device), + non_blank_positions + model.center_window_size // 2 + ) + lens = ends - starts + 1 + + return starts, lens + + def _get_emit_ground_truth(): + non_blank_mask = get_non_blank_mask(align_targets, model.blank_idx) + result = rf.where(non_blank_mask, rf.convert_to_tensor(1), rf.convert_to_tensor(0)) + sparse_dim = Dim(name="emit_ground_truth", dimension=2) + # result = rf.expand_dim(result, sparse_dim) + result.sparse_dim = sparse_dim + torch.set_printoptions(threshold=10000) + + return result, sparse_dim + + non_blank_targets, non_blank_targets_spatial_dim = get_masked( + align_targets, get_non_blank_mask(align_targets, model.blank_idx), align_targets_spatial_dim, batch_dims + ) + non_blank_targets.sparse_dim = model.target_dim + segment_starts, segment_lens = 
_get_segment_starts_and_lens(non_blank_targets_spatial_dim) + + # ------------------- encoder aux loss ------------------- + + collected_outputs = {} + enc_args, enc_spatial_dim = model.encode(data, in_spatial_dim=data_spatial_dim, collected_outputs=collected_outputs) + + if aux_loss_layers: + for i, layer_idx in enumerate(aux_loss_layers): + if layer_idx > len(model.encoder.layers): + continue + linear = getattr(model, f"enc_aux_logits_{layer_idx}") + aux_logits = linear(collected_outputs[str(layer_idx - 1)]) + aux_loss = rf.ctc_loss( + logits=aux_logits, + targets=non_blank_targets, + input_spatial_dim=enc_spatial_dim, + targets_spatial_dim=non_blank_targets_spatial_dim, + blank_index=model.blank_idx, + ) + aux_loss.mark_as_loss( + f"ctc_{layer_idx}", + scale=aux_loss_scales[i], + custom_inv_norm_factor=align_targets_spatial_dim.get_size_tensor(), + use_normalized_loss=use_normalized_loss, + ) + + non_blank_input_embeddings = model.target_embed(non_blank_targets) + non_blank_input_embeddings = rf.shift_right( + non_blank_input_embeddings, axis=non_blank_targets_spatial_dim, pad_value=0.0) + + align_input_embeddings = model.target_embed_length_model(align_targets) + align_input_embeddings = rf.shift_right( + align_input_embeddings, axis=align_targets_spatial_dim, pad_value=0.0) + + # ------------------- label loop ------------------- + + def _label_loop_body(xs, state: rf.State): + new_state = rf.State() + loop_out_, new_state.decoder = model.label_sync_loop_step( + **enc_args, + enc_spatial_dim=enc_spatial_dim, + input_embed=xs["input_embed"], + segment_starts=xs["segment_starts"], + segment_lens=xs["segment_lens"], + state=state.decoder, + ) + return loop_out_, new_state + + label_loop_out, _, _ = rf.scan( + spatial_dim=non_blank_targets_spatial_dim, + xs={ + "input_embed": non_blank_input_embeddings, + "segment_starts": segment_starts, + "segment_lens": segment_lens, + }, + ys=model.label_loop_step_output_templates(batch_dims=batch_dims), + initial=rf.State( + 
decoder=model.label_decoder_default_initial_state( + batch_dims=batch_dims, + # TODO: do we need these sparse dims? they are automatically added by rf.range_over_dim + segment_starts_sparse_dim=segment_starts.sparse_dim, + segment_lens_sparse_dim=segment_lens.sparse_dim, + ), + ), + body=_label_loop_body, + ) + + logits = model.decode_label_logits(input_embed=non_blank_input_embeddings, **label_loop_out) + logits_packed, pack_dim = rf.pack_padded(logits, dims=batch_dims + [non_blank_targets_spatial_dim], enforce_sorted=False) + non_blank_targets_packed, _ = rf.pack_padded( + non_blank_targets, dims=batch_dims + [non_blank_targets_spatial_dim], enforce_sorted=False, out_dim=pack_dim + ) + + log_prob = rf.log_softmax(logits_packed, axis=model.target_dim) + # deactivate to compare to old returnn training (label smoothing is done differently) + # log_prob = rf.label_smoothed_log_prob_gradient(log_prob, 0.1, axis=model.target_dim) + loss = rf.cross_entropy( + target=non_blank_targets_packed, estimated=log_prob, estimated_type="log-probs", axis=model.target_dim + ) + loss.mark_as_loss("non_blank_ce", scale=aed_loss_scale, use_normalized_loss=use_normalized_loss) + + best = rf.reduce_argmax(logits_packed, axis=model.target_dim) + frame_error = best != non_blank_targets_packed + frame_error.mark_as_loss(name="non_blank_fer", as_error=True) + + # ------------------- blank loop ------------------- + + if use_optimized_length_model_loop: + blank_loop_out = get_blank_loop_out_optimized( + model=model, + enc_args=enc_args, + enc_spatial_dim=enc_spatial_dim, + align_targets_spatial_dim=align_targets_spatial_dim, + align_input_embeddings=align_input_embeddings, + batch_dims=batch_dims + ) + else: + blank_loop_out = get_blank_loop_out_unoptimized( + model=model, + enc_args=enc_args, + enc_spatial_dim=enc_spatial_dim, + align_targets_spatial_dim=align_targets_spatial_dim, + align_input_embeddings=align_input_embeddings, + batch_dims=batch_dims + ) + + blank_logits = 
model.decode_blank_logits(**blank_loop_out) + blank_logits_packed, pack_dim = rf.pack_padded(blank_logits, dims=batch_dims + [align_targets_spatial_dim], enforce_sorted=False) + emit_ground_truth, emit_blank_target_dim = _get_emit_ground_truth() + emit_ground_truth_packed, _ = rf.pack_padded( + emit_ground_truth, dims=batch_dims + [align_targets_spatial_dim], enforce_sorted=False, out_dim=pack_dim + ) + + # rf.log_sigmoid not implemented for torch backend + emit_log_prob = rf.log(rf.sigmoid(blank_logits_packed)) + blank_log_prob = rf.log(rf.sigmoid(-blank_logits_packed)) + blank_logit_dim = blank_logits_packed.remaining_dims((pack_dim,))[0] + emit_blank_log_prob, _ = rf.concat( + (blank_log_prob, blank_logit_dim), (emit_log_prob, blank_logit_dim), out_dim=emit_blank_target_dim) + blank_loss = rf.cross_entropy( + target=emit_ground_truth_packed, + estimated=emit_blank_log_prob, + estimated_type="log-probs", + axis=emit_blank_target_dim + ) + blank_loss.mark_as_loss("emit_blank_ce", scale=aed_loss_scale, use_normalized_loss=use_normalized_loss) + + best = rf.reduce_argmax(emit_blank_log_prob, axis=emit_blank_target_dim) + frame_error = best != emit_ground_truth_packed + frame_error.mark_as_loss(name="emit_blank_fer", as_error=True) + + +from_scratch_training: TrainDef[SegmentalAttentionModel] +from_scratch_training.learning_rate_control_error_measure = "dev_score_full_sum" diff --git a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/recog.py b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/recog.py index eb03bebcd..4245362d5 100644 --- a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/recog.py +++ b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/recog.py @@ -31,10 
+31,10 @@ def model_recog( out_spatial_dim, final beam_dim """ - assert not model.language_model # not implemented here. use the pure PyTorch search instead + assert not model.label_decoder.language_model # not implemented here. use the pure PyTorch search instead batch_dims = data.remaining_dims((data_spatial_dim, data.feature_dim)) - enc_args, enc_spatial_dim = model.encode(data, in_spatial_dim=data_spatial_dim) + enc_args, enc_spatial_dim = model.encoder.encode(data, in_spatial_dim=data_spatial_dim) beam_size = 12 if max_seq_len is None: max_seq_len = enc_spatial_dim.get_size_tensor() @@ -47,9 +47,9 @@ def model_recog( # Initial state. beam_dim = Dim(1, name="initial-beam") batch_dims_ = [beam_dim] + batch_dims - label_decoder_state = model.label_decoder_default_initial_state(batch_dims=batch_dims_,) + label_decoder_state = model.label_decoder.default_initial_state(batch_dims=batch_dims_, ) - blank_decoder_state = model.blank_decoder_default_initial_state(batch_dims=batch_dims_) + blank_decoder_state = model.blank_decoder.default_initial_state(batch_dims=batch_dims_) bos_idx = 0 target = rf.constant(bos_idx, dims=batch_dims_, sparse_dim=model.align_target_dim) target_non_blank = rf.constant(bos_idx, dims=batch_dims_, sparse_dim=model.target_dim) @@ -61,11 +61,15 @@ def model_recog( seq_backrefs = [] while i < max_seq_len.raw_tensor: if i == 0: - input_embed = rf.zeros(batch_dims_ + [model.target_embed.out_dim], feature_dim=model.target_embed.out_dim, dtype="float32") + input_embed = rf.zeros( + batch_dims_ + [model.label_decoder.target_embed.out_dim], + feature_dim=model.label_decoder.target_embed.out_dim, + dtype="float32" + ) input_embed_length_model = rf.zeros( - batch_dims_ + [model.target_embed_length_model.out_dim], feature_dim=model.target_embed_length_model.out_dim) + batch_dims_ + [model.blank_decoder.target_embed.out_dim], feature_dim=model.blank_decoder.target_embed.out_dim) else: - input_embed_length_model = model.target_embed_length_model(target) + 
input_embed_length_model = model.blank_decoder.target_embed(target) # ------------------- label step ------------------- center_position = rf.minimum( @@ -80,7 +84,7 @@ def model_recog( ) segment_lens = segment_ends - segment_starts + 1 - label_step_out, label_decoder_state_updated = model.label_sync_loop_step( + label_step_out, label_decoder_state_updated = model.label_decoder.loop_step( **enc_args, enc_spatial_dim=enc_spatial_dim, input_embed=input_embed, @@ -88,18 +92,18 @@ def model_recog( segment_starts=segment_starts, state=label_decoder_state, ) - label_logits = model.decode_label_logits(input_embed=input_embed, **label_step_out) + label_logits = model.label_decoder.decode_logits(input_embed=input_embed, **label_step_out) label_log_prob = rf.log_softmax(label_logits, axis=model.target_dim) # ------------------- blank step ------------------- - blank_step_out, blank_decoder_state = model.time_sync_loop_step( + blank_step_out, blank_decoder_state = model.blank_decoder.loop_step( enc=enc_args["enc"], enc_spatial_dim=enc_spatial_dim, input_embed=input_embed_length_model, state=blank_decoder_state, ) - blank_logits = model.decode_blank_logits(**blank_step_out) + blank_logits = model.blank_decoder.decode_logits(**blank_step_out) emit_log_prob = rf.log(rf.sigmoid(blank_logits)) emit_log_prob = rf.squeeze(emit_log_prob, axis=emit_log_prob.feature_dim) blank_log_prob = rf.log(rf.sigmoid(-blank_logits)) @@ -133,10 +137,10 @@ def _get_masked_state(old, new, mask): ) target_non_blank = rf.where(update_state_mask, target, rf.gather(target_non_blank, indices=backrefs)) - target_non_blank.sparse_dim = model.target_embed.in_dim + target_non_blank.sparse_dim = model.label_decoder.target_embed.in_dim input_embed = rf.where( update_state_mask, - model.target_embed(target_non_blank), + model.label_decoder.target_embed(target_non_blank), rf.gather(input_embed, indices=backrefs, axis=old_beam_dim) ) @@ -172,10 +176,6 @@ def _get_masked_state(old, new, mask): # RecogDef API 
model_recog: RecogDef[SegmentalAttentionModel] model_recog.output_with_beam = True -# output_blank_label=blank is actually wrong for AED, but now we don't change it anymore -# because it would change all recog hashes. -# Also, it does not matter too much -- it will just cause an extra SearchRemoveLabelJob, -# which will not have any effect here. model_recog.output_blank_label = "" model_recog.batch_size_dependent = False @@ -220,7 +220,7 @@ def model_recog_pure_torch( batch_dims = data.remaining_dims((data_spatial_dim, data.feature_dim)) assert len(batch_dims) == 1, batch_dims # not implemented otherwise, simple to add... batch_dim = batch_dims[0] - enc, enc_spatial_dim = model.encode(data, in_spatial_dim=data_spatial_dim) + enc, enc_spatial_dim = model.encoder.encode(data, in_spatial_dim=data_spatial_dim) if max_seq_len is None: max_seq_len = enc_spatial_dim.get_size_tensor() else: @@ -241,9 +241,9 @@ def model_recog_pure_torch( get_time_sync_scorer_pure_torch(model=model, batch_dim=batch_dim, enc=enc, enc_spatial_dim=enc_spatial_dim), 1.0, ) - if model.language_model: + if model.label_decoder.language_model: lm_scale = beam_search_opts.pop("lm_scale") # must be defined with LM - label_scorer.label_scorers["lm"] = (model.language_model_make_label_scorer(), lm_scale) + label_scorer.label_scorers["lm"] = (model.label_decoder.language_model_make_label_scorer(), lm_scale) print("** max seq len:", max_seq_len.raw_tensor) @@ -253,7 +253,7 @@ def model_recog_pure_torch( seq_log_prob, # [Batch,FinalBeam] ) = time_sync_beam_search( label_scorer, - label_sync_keys=["label_sync_decoder", "lm"] if model.language_model else ["label_sync_decoder"], + label_sync_keys=["label_sync_decoder", "lm"] if model.label_decoder.language_model else ["label_sync_decoder"], time_sync_keys=["time_sync_decoder"], batch_size=int(batch_dim.get_dim_value()), blank_idx=model.blank_idx, @@ -313,7 +313,7 @@ def get_initial_state(self, *, batch_size: int, device: torch.device) -> Any: """Initial 
state.""" beam_dim = Dim(1, name="initial-beam") batch_dims_ = [batch_dim, beam_dim] - decoder_state = model.label_decoder_default_initial_state(batch_dims=batch_dims_,) + decoder_state = model.label_decoder.default_initial_state(batch_dims=batch_dims_, ) return tree.map_structure(functools.partial(self._map_tensor_to_raw, beam_dim=beam_dim), decoder_state) def score_and_update_state( @@ -354,8 +354,8 @@ def _map_raw_to_tensor(v): segment_lens = segment_ends - segment_starts + 1 zeros_embed = rf.zeros( - [batch_dim, beam_dim, model.target_embed.out_dim], - feature_dim=model.target_embed.out_dim, + [batch_dim, beam_dim, model.label_decoder.target_embed.out_dim], + feature_dim=model.label_decoder.target_embed.out_dim, dtype="float32" ) initial_output_mask = rf.convert_to_tensor(prev_label == -1, dims=[batch_dim, beam_dim]) @@ -368,10 +368,10 @@ def _map_raw_to_tensor(v): input_embed = rf.where( initial_output_mask, zeros_embed, - model.target_embed(prev_label) + model.label_decoder.target_embed(prev_label) ) - decode_out, decoder_state = model.label_sync_loop_step( + decode_out, decoder_state = model.label_decoder.loop_step( **enc, enc_spatial_dim=enc_spatial_dim, input_embed=input_embed, @@ -379,7 +379,7 @@ def _map_raw_to_tensor(v): segment_starts=segment_starts, state=tree.map_structure(_map_raw_to_tensor, prev_state), ) - logits = model.decode_label_logits(input_embed=input_embed, **decode_out) + logits = model.label_decoder.decode_logits(input_embed=input_embed, **decode_out) label_log_prob = rf.log_softmax(logits, axis=model.target_dim) blank_log_prob = rf.zeros( @@ -437,7 +437,7 @@ def get_initial_state(self, *, batch_size: int, device: torch.device) -> Any: """Initial state.""" beam_dim = Dim(1, name="initial-beam") batch_dims_ = [batch_dim, beam_dim] - decoder_state = model.blank_decoder_default_initial_state(batch_dims=batch_dims_,) + decoder_state = model.blank_decoder.default_initial_state(batch_dims=batch_dims_, ) return 
tree.map_structure(functools.partial(self._map_tensor_to_raw, beam_dim=beam_dim), decoder_state) def score_and_update_state( @@ -466,8 +466,8 @@ def _map_raw_to_tensor(v): raise TypeError(f"_map_raw_to_tensor: unexpected {v} ({type(v).__name__})") zeros_embed = rf.zeros( - [batch_dim, beam_dim, model.target_embed_length_model.out_dim], - feature_dim=model.target_embed_length_model.out_dim, + [batch_dim, beam_dim, model.blank_decoder.target_embed.out_dim], + feature_dim=model.blank_decoder.target_embed.out_dim, dtype="float32" ) initial_output_mask = rf.convert_to_tensor(prev_align_label == -1, dims=[batch_dim, beam_dim]) @@ -480,16 +480,16 @@ def _map_raw_to_tensor(v): input_embed = rf.where( initial_output_mask, zeros_embed, - model.target_embed_length_model(prev_align_label) + model.blank_decoder.target_embed(prev_align_label) ) - decode_out, decoder_state = model.time_sync_loop_step( + decode_out, decoder_state = model.blank_decoder.loop_step( enc=enc["enc"], enc_spatial_dim=enc_spatial_dim, input_embed=input_embed, state=tree.map_structure(_map_raw_to_tensor, prev_state), ) - blank_logits = model.decode_blank_logits(**decode_out) + blank_logits = model.blank_decoder.decode_logits(**decode_out) emit_log_prob = rf.log(rf.sigmoid(blank_logits)) emit_log_prob = rf.squeeze(emit_log_prob, axis=emit_log_prob.feature_dim) blank_log_prob = rf.log(rf.sigmoid(-blank_logits)) diff --git a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/train.py b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/train.py index 592f06a0d..599aa6ef8 100644 --- a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/train.py +++ b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/train.py @@ -2,8 +2,19 
@@ from returnn.tensor import TensorDict -from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.network_builder_rf.segmental.utils import get_non_blank_mask, get_masked +from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.network_builder_rf.segmental.utils import ( + get_non_blank_mask, + get_masked, + get_emit_ground_truth, + get_segment_starts_and_lens +) from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.network_builder_rf.segmental.model import SegmentalAttentionModel +from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.network_builder_rf.segmental.model_new.label_model.train import ( + viterbi_training as label_model_viterbi_training +) +from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.network_builder_rf.segmental.model_new.blank_model.train import ( + viterbi_training as blank_model_viterbi_training +) from returnn.tensor import Dim import returnn.frontend as rf @@ -31,7 +42,7 @@ def _returnn_v2_train_step(*, model, extern_data: TensorDict, **_kwargs_unused): ) -def from_scratch_training( +def viterbi_training( *, model: SegmentalAttentionModel, data: rf.Tensor, @@ -41,13 +52,10 @@ def from_scratch_training( ): """Function is run within RETURNN.""" from returnn.config import get_global_config - import torch config = get_global_config() # noqa aux_loss_layers = config.typed_value("aux_loss_layers") aux_loss_scales = config.typed_value("aux_loss_scales", ([1.0] * len(aux_loss_layers)) if aux_loss_layers else None) - aed_loss_scale = config.float("aed_loss_scale", 1.0) - use_normalized_loss = config.bool("use_normalized_loss", True) if data.feature_dim and data.feature_dim.dimension == 1: data = rf.squeeze(data, axis=data.feature_dim) @@ -55,43 +63,23 
@@ def from_scratch_training( batch_dims = data.remaining_dims(data_spatial_dim) - def _get_segment_starts_and_lens(out_spatial_dim: Dim): - non_blank_mask = get_non_blank_mask(align_targets, model.blank_idx) - targets_range = rf.range_over_dim(align_targets_spatial_dim, dtype="int32") - targets_range = rf.expand_dim(targets_range, batch_dims[0]) - non_blank_positions, _ = get_masked( - targets_range, non_blank_mask, align_targets_spatial_dim, batch_dims, out_spatial_dim - ) - starts = rf.maximum( - rf.convert_to_tensor(0, dtype="int32"), non_blank_positions - model.center_window_size // 2) - ends = rf.minimum( - rf.copy_to_device(align_targets_spatial_dim.get_size_tensor() - 1, non_blank_positions.device), - non_blank_positions + model.center_window_size // 2 - ) - lens = ends - starts + 1 - - return starts, lens - - def _get_emit_ground_truth(): - non_blank_mask = get_non_blank_mask(align_targets, model.blank_idx) - result = rf.where(non_blank_mask, rf.convert_to_tensor(1), rf.convert_to_tensor(0)) - sparse_dim = Dim(name="emit_ground_truth", dimension=2) - # result = rf.expand_dim(result, sparse_dim) - result.sparse_dim = sparse_dim - torch.set_printoptions(threshold=10000) - - return result, sparse_dim - non_blank_targets, non_blank_targets_spatial_dim = get_masked( align_targets, get_non_blank_mask(align_targets, model.blank_idx), align_targets_spatial_dim, batch_dims ) non_blank_targets.sparse_dim = model.target_dim - segment_starts, segment_lens = _get_segment_starts_and_lens(non_blank_targets_spatial_dim) + segment_starts, segment_lens = get_segment_starts_and_lens( + align_targets, + align_targets_spatial_dim, + model, + batch_dims, + non_blank_targets_spatial_dim + ) # ------------------- encoder aux loss ------------------- collected_outputs = {} - enc_args, enc_spatial_dim = model.encode(data, in_spatial_dim=data_spatial_dim, collected_outputs=collected_outputs) + enc_args, enc_spatial_dim = model.encoder.encode( + data, in_spatial_dim=data_spatial_dim, 
collected_outputs=collected_outputs) if aux_loss_layers: for i, layer_idx in enumerate(aux_loss_layers): @@ -110,118 +98,36 @@ def _get_emit_ground_truth(): f"ctc_{layer_idx}", scale=aux_loss_scales[i], custom_inv_norm_factor=align_targets_spatial_dim.get_size_tensor(), - use_normalized_loss=use_normalized_loss, + use_normalized_loss=True, ) - non_blank_input_embeddings = model.target_embed(non_blank_targets) - non_blank_input_embeddings = rf.shift_right( - non_blank_input_embeddings, axis=non_blank_targets_spatial_dim, pad_value=0.0) - - align_input_embeddings = model.target_embed_length_model(align_targets) - align_input_embeddings = rf.shift_right( - align_input_embeddings, axis=align_targets_spatial_dim, pad_value=0.0) - # ------------------- label loop ------------------- - def _label_loop_body(xs, state: rf.State): - new_state = rf.State() - loop_out_, new_state.decoder = model.label_sync_loop_step( - **enc_args, - enc_spatial_dim=enc_spatial_dim, - input_embed=xs["input_embed"], - segment_starts=xs["segment_starts"], - segment_lens=xs["segment_lens"], - state=state.decoder, - ) - return loop_out_, new_state - - label_loop_out, _, _ = rf.scan( - spatial_dim=non_blank_targets_spatial_dim, - xs={ - "input_embed": non_blank_input_embeddings, - "segment_starts": segment_starts, - "segment_lens": segment_lens, - }, - ys=model.label_loop_step_output_templates(batch_dims=batch_dims), - initial=rf.State( - decoder=model.label_decoder_default_initial_state( - batch_dims=batch_dims, - # TODO: do we need these sparse dims? 
they are automatically added by rf.range_over_dim - segment_starts_sparse_dim=segment_starts.sparse_dim, - segment_lens_sparse_dim=segment_lens.sparse_dim, - ), - ), - body=_label_loop_body, - ) - - logits = model.decode_label_logits(input_embed=non_blank_input_embeddings, **label_loop_out) - logits_packed, pack_dim = rf.pack_padded(logits, dims=batch_dims + [non_blank_targets_spatial_dim], enforce_sorted=False) - non_blank_targets_packed, _ = rf.pack_padded( - non_blank_targets, dims=batch_dims + [non_blank_targets_spatial_dim], enforce_sorted=False, out_dim=pack_dim - ) - - log_prob = rf.log_softmax(logits_packed, axis=model.target_dim) - log_prob = rf.label_smoothed_log_prob_gradient(log_prob, 0.1, axis=model.target_dim) - loss = rf.cross_entropy( - target=non_blank_targets_packed, estimated=log_prob, estimated_type="log-probs", axis=model.target_dim + label_model_viterbi_training( + model=model.label_decoder, + enc_args=enc_args, + enc_spatial_dim=enc_spatial_dim, + non_blank_targets=non_blank_targets, + non_blank_targets_spatial_dim=non_blank_targets_spatial_dim, + segment_starts=segment_starts, + segment_lens=segment_lens, + batch_dims=batch_dims, ) - loss.mark_as_loss("non_blank_ce", scale=aed_loss_scale, use_normalized_loss=use_normalized_loss) - - best = rf.reduce_argmax(logits_packed, axis=model.target_dim) - frame_error = best != non_blank_targets_packed - frame_error.mark_as_loss(name="non_blank_fer", as_error=True) # ------------------- blank loop ------------------- - def _blank_loop_body(xs, state: rf.State): - new_state = rf.State() - loop_out_, new_state.decoder = model.time_sync_loop_step( - enc=enc_args["enc"], - enc_spatial_dim=enc_spatial_dim, - input_embed=xs["input_embed"], - state=state.decoder, - ) - return loop_out_, new_state - - label_loop_out, _, _ = rf.scan( - spatial_dim=align_targets_spatial_dim, - xs={ - "input_embed": align_input_embeddings, - }, - ys=model.blank_loop_step_output_templates(batch_dims=batch_dims), - 
initial=rf.State( - decoder=model.blank_decoder_default_initial_state( - batch_dims=batch_dims, - ), - ), - body=_blank_loop_body, - ) - - blank_logits = model.decode_blank_logits(**label_loop_out) - blank_logits_packed, pack_dim = rf.pack_padded(blank_logits, dims=batch_dims + [align_targets_spatial_dim], enforce_sorted=False) - emit_ground_truth, emit_blank_target_dim = _get_emit_ground_truth() - emit_ground_truth_packed, _ = rf.pack_padded( - emit_ground_truth, dims=batch_dims + [align_targets_spatial_dim], enforce_sorted=False, out_dim=pack_dim + emit_ground_truth, emit_blank_target_dim = get_emit_ground_truth(align_targets, model.blank_idx) + blank_model_viterbi_training( + model=model.blank_decoder, + enc_args=enc_args, + enc_spatial_dim=enc_spatial_dim, + align_targets=align_targets, + align_targets_spatial_dim=align_targets_spatial_dim, + emit_ground_truth=emit_ground_truth, + emit_blank_target_dim=emit_blank_target_dim, + batch_dims=batch_dims, ) - # rf.log_sigmoid not implemented for torch backend - emit_log_prob = rf.log(rf.sigmoid(blank_logits_packed)) - blank_log_prob = rf.log(rf.sigmoid(-blank_logits_packed)) - blank_logit_dim = blank_logits_packed.remaining_dims((pack_dim,))[0] - emit_blank_log_prob, _ = rf.concat( - (blank_log_prob, blank_logit_dim), (emit_log_prob, blank_logit_dim), out_dim=emit_blank_target_dim) - blank_loss = rf.cross_entropy( - target=emit_ground_truth_packed, - estimated=emit_blank_log_prob, - estimated_type="log-probs", - axis=emit_blank_target_dim - ) - blank_loss.mark_as_loss("emit_blank_ce", scale=aed_loss_scale, use_normalized_loss=use_normalized_loss) - - best = rf.reduce_argmax(emit_blank_log_prob, axis=emit_blank_target_dim) - frame_error = best != emit_ground_truth_packed - frame_error.mark_as_loss(name="emit_blank_fer", as_error=True) - -from_scratch_training: TrainDef[SegmentalAttentionModel] -from_scratch_training.learning_rate_control_error_measure = "dev_score_full_sum" +viterbi_training: 
TrainDef[SegmentalAttentionModel] +viterbi_training.learning_rate_control_error_measure = "dev_score_full_sum" diff --git a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/utils.py b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/utils.py index e4304d0f8..707be5236 100644 --- a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/utils.py +++ b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/utils.py @@ -1,8 +1,10 @@ from typing import Optional, Sequence, Tuple +import torch from returnn.tensor import Tensor, Dim import returnn.frontend as rf +from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.network_builder_rf.segmental.model import SegmentalAttentionModel def get_non_blank_mask(x: Tensor, blank_idx: int): @@ -13,9 +15,6 @@ def get_non_blank_mask(x: Tensor, blank_idx: int): def get_masked( input: Tensor, mask: Tensor, mask_dim: Dim, batch_dims: Sequence[Dim], result_spatial_dim: Optional[Dim] = None ) -> Tuple[Tensor, Dim]: - - import torch - if not result_spatial_dim: new_lens = rf.reduce_sum(rf.cast(mask, "int32"), axis=mask_dim) result_spatial_dim = Dim(name=f"{mask_dim.name}_masked", dimension=rf.copy_to_device(new_lens, "cpu")) @@ -42,3 +41,41 @@ def get_masked( result = result.copy_transpose(batch_dims + [result_spatial_dim]) return result, result_spatial_dim + + +def get_segment_starts_and_lens( + align_targets: Tensor, + align_targets_spatial_dim: Dim, + model: SegmentalAttentionModel, + batch_dims: Sequence[Dim], + out_spatial_dim: Dim +): + non_blank_mask = get_non_blank_mask(align_targets, model.blank_idx) + targets_range = rf.range_over_dim(align_targets_spatial_dim, dtype="int32") + 
targets_range = rf.expand_dim(targets_range, batch_dims[0]) + non_blank_positions, _ = get_masked( + targets_range, non_blank_mask, align_targets_spatial_dim, batch_dims, out_spatial_dim + ) + starts = rf.maximum( + rf.convert_to_tensor(0, dtype="int32"), non_blank_positions - model.center_window_size // 2) + ends = rf.minimum( + rf.copy_to_device(align_targets_spatial_dim.get_size_tensor() - 1, non_blank_positions.device), + non_blank_positions + model.center_window_size // 2 + ) + lens = ends - starts + 1 + + return starts, lens + + +def get_emit_ground_truth( + align_targets: Tensor, + blank_idx: int +): + non_blank_mask = get_non_blank_mask(align_targets, blank_idx) + result = rf.where(non_blank_mask, rf.convert_to_tensor(1), rf.convert_to_tensor(0)) + sparse_dim = Dim(name="emit_ground_truth", dimension=2) + # result = rf.expand_dim(result, sparse_dim) + result.sparse_dim = sparse_dim + torch.set_printoptions(threshold=10000) + + return result, sparse_dim diff --git a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/center_window_att/baseline_v1/__init__.py b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/center_window_att/baseline_v1/__init__.py index 71065787f..f9266f4f2 100644 --- a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/center_window_att/baseline_v1/__init__.py +++ b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/center_window_att/baseline_v1/__init__.py @@ -4,38 +4,17 @@ from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.pipelines.pipeline_ls_conf.center_window_att import ( train, recog ) - -from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.network_builder_rf.segmental.model import MakeModel -from 
i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.network_builder_rf.segmental.model_import import map_param_func_v2 - -from i6_experiments.users.zeyer.returnn.convert_ckpt_rf import ConvertTfCheckpointToRfPtJob - -from i6_core.returnn.training import PtCheckpoint, Checkpoint - -from sisyphus import Path +from i6_experiments.users.schmitt.custom_load_params import load_missing_params def run_exps(): for model_alias, config_builder in baseline.center_window_att_baseline_rf( win_size_list=(5,), ): - # for train_alias, checkpoint in train.train_center_window_att_from_scratch( - # alias=model_alias, - # config_builder=config_builder, - # n_epochs_list=(1,), - # time_rqmt=4 - # ): - # recog.center_window_returnn_frame_wise_beam_search( - # alias=train_alias, - # config_builder=config_builder, - # checkpoint=checkpoint, - # checkpoint_aliases=("last",) - # ) - for train_alias, checkpoint in train.train_center_window_att_import_global_tf( alias=model_alias, config_builder=config_builder, - n_epochs_list=(1,), + n_epochs_list=(10,), time_rqmt=4, train_opts=dict( aux_loss_layers=None, @@ -57,33 +36,25 @@ def run_exps(): pure_torch=True, ) + for model_alias, config_builder in baseline.center_window_att_baseline_rf( + win_size_list=(5,), decoder_version=2 + ): for train_alias, checkpoint in train.train_center_window_att_import_global_tf( alias=model_alias, config_builder=config_builder, - n_epochs_list=(20,), - time_rqmt=12, + n_epochs_list=(10,), + time_rqmt=4, train_opts=dict( aux_loss_layers=None, accum_grad_multiple_step=2, optimizer={"class": "adam", "epsilon": 1e-8} - ) + ), + custom_missing_load_func=load_missing_params ): recog.center_window_returnn_frame_wise_beam_search( alias=train_alias, config_builder=config_builder, checkpoint=checkpoint, - checkpoint_aliases=("last",) + checkpoint_aliases=("last",), + pure_torch=True, ) - - # for train_alias, checkpoint in 
train.train_center_window_att_import_center_window_tf( - # alias=model_alias, - # config_builder=config_builder, - # n_epochs_list=(9,), - # time_rqmt=4 - # ): - # recog.center_window_returnn_frame_wise_beam_search( - # alias=train_alias, - # config_builder=config_builder, - # checkpoint=checkpoint, - # checkpoint_aliases=("last",) - # ) diff --git a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/center_window_att/baseline_v1/baseline.py b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/center_window_att/baseline_v1/baseline.py index 60dfefc64..b80ee17cc 100644 --- a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/center_window_att/baseline_v1/baseline.py +++ b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/center_window_att/baseline_v1/baseline.py @@ -44,6 +44,7 @@ def get_center_window_att_config_builder_rf( config_builder = SegmentalAttConfigBuilderRF( variant_params=variant_params, center_window_size=win_size, + decoder_version=decoder_version, model_def=from_scratch_model_def, get_model_func=_returnn_v2_get_model, ) @@ -53,13 +54,13 @@ def get_center_window_att_config_builder_rf( def center_window_att_baseline_rf( win_size_list: Tuple[int, ...] 
= (5, 129), + decoder_version: Optional[int] = None, ): for win_size in win_size_list: - alias = f"{base_alias}/baseline_rf/win-size-%d" % ( - win_size - ) + alias = f"{base_alias}/baseline_rf/win-size-{win_size}/decoder-version-{decoder_version if decoder_version else 1}" yield alias, get_center_window_att_config_builder_rf( win_size=win_size, use_weight_feedback=True, - length_model_opts={"use_label_model_state": True, "use_alignment_ctx": False}, + length_model_opts={"use_label_model_state": False, "use_alignment_ctx": False}, + decoder_version=decoder_version, ) diff --git a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/center_window_att/train.py b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/center_window_att/train.py index d74888066..d82fdf89d 100644 --- a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/center_window_att/train.py +++ b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/center_window_att/train.py @@ -1,8 +1,8 @@ -from typing import Tuple, Optional, List, Dict +from typing import Tuple, Optional, List, Dict, Union, Callable from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.config_builder_rf.base import SegmentalAttConfigBuilderRF from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23.train_new import SegmentalTrainExperiment -from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.network_builder_rf.segmental.train import _returnn_v2_train_step, from_scratch_training +from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.network_builder_rf.segmental.train import _returnn_v2_train_step, viterbi_training from 
i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.pipelines.pipeline_ls_conf.checkpoints import ( external_checkpoints, default_import_model_name, @@ -33,7 +33,7 @@ def train_center_window_att_from_scratch( }, "import_model_train_epoch1": None, "lr_opts": {"type": "dyn_lr_lin_warmup_invsqrt_decay"}, - "train_def": from_scratch_training, + "train_def": viterbi_training, "train_step_func": _returnn_v2_train_step, } ) @@ -55,6 +55,7 @@ def train_center_window_att_import_global_tf( n_epochs_list: Tuple[int, ...], time_rqmt: int = 168, train_opts: Optional[Dict] = None, + custom_missing_load_func: Optional[Callable] = None ): _train_opts = { "preload_from_files": { @@ -64,10 +65,12 @@ def train_center_window_att_import_global_tf( "ignore_missing": True, # because of length model params } }, - "train_def": from_scratch_training, + "train_def": viterbi_training, "train_step_func": _returnn_v2_train_step, "batching": "random", } + if custom_missing_load_func: + _train_opts["preload_from_files"]["pretrained_global_att_params"]["custom_missing_load_func"] = custom_missing_load_func if train_opts: _train_opts.update(train_opts) @@ -119,7 +122,7 @@ def train_center_window_att_import_center_window_tf( # } # }, "import_model_train_epoch1": get_center_window_baseline_v1_tf_checkpoint(), - "train_def": from_scratch_training, + "train_def": viterbi_training, "train_step_func": _returnn_v2_train_step, "batching": "random", "aux_loss_layers": None, diff --git a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/checkpoints.py b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/checkpoints.py index 732c9dfa6..2b3dc216b 100644 --- a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/checkpoints.py +++ 
b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/checkpoints.py @@ -1,10 +1,10 @@ from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23.pipelines.pipeline_ls_conf.checkpoints import external_checkpoints as external_checkpoints_tf from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23.pipelines.pipeline_ls_conf.checkpoints import default_import_model_name from i6_experiments.users.schmitt.returnn_frontend.convert.checkpoint import ConvertTfCheckpointToRfPtJob -from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.network_builder_rf.segmental.model_import import map_param_func_v2 as map_param_func_v2_global +from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.network_builder_rf.global_.model_import import map_param_func_v2 as map_param_func_v2_global from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.network_builder_rf.global_.model import MakeModel as MakeModelGlobal from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.network_builder_rf.segmental.model import MakeModel as MakeModelSegmental -from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.network_builder_rf.segmental.model_import import map_param_func_v2 as map_param_func_v2_segmental +from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.network_builder_rf.segmental.model_old.model_import import map_param_func_v2 as map_param_func_v2_segmental from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23.pipelines.pipeline_ls_conf.center_window_att.baseline_v1 import ( baseline, ) diff 
--git a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/global_att/baseline_v1/__init__.py b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/global_att/baseline_v1/__init__.py index 1499832d9..b291006ca 100644 --- a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/global_att/baseline_v1/__init__.py +++ b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/global_att/baseline_v1/__init__.py @@ -29,20 +29,7 @@ def run_exps(): for train_alias, checkpoint in train.train_import_global_tf( alias=model_alias, config_builder=config_builder, - n_epochs_list=(1,), - time_rqmt=4, - ): - recog.global_att_returnn_label_sync_beam_search( - alias=train_alias, - config_builder=config_builder, - checkpoint=checkpoint, - checkpoint_aliases=("last",), - ) - - for train_alias, checkpoint in train.train_import_global_tf( - alias=model_alias, - config_builder=config_builder, - n_epochs_list=(20,), + n_epochs_list=(10,), time_rqmt=4, ): recog.global_att_returnn_label_sync_beam_search( From f59d4a2dbc84bd87d0d7b9b982c47ee5965cf7d5 Mon Sep 17 00:00:00 2001 From: schmitt Date: Thu, 16 May 2024 10:28:31 +0200 Subject: [PATCH 016/227] add readme for RF --- .../dependencies/returnn/network_builder_rf/README | 10 ++++++++++ 1 file changed, 10 insertions(+) create mode 100644 users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/README diff --git a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/README b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/README new file mode 100644 index 000000000..1bb9d3c89 --- /dev/null +++ 
b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/README @@ -0,0 +1,10 @@ +The contents in this package are all tested for commit: + +commit d94322fa3ea82a53426766afe845e62811433df2 (HEAD -> main) +Author: schmitt +Date: Thu May 16 10:19:52 2024 +0200 + + update + +and are consistent with baseline V1 of my old global and segmental model experiments regarding scores, WERs, losses +and speed. From 2d0a5dc827ebf939fe01980a8cd53a8d2fe80ced Mon Sep 17 00:00:00 2001 From: Albert Zeyer Date: Thu, 16 May 2024 10:20:25 +0200 Subject: [PATCH 017/227] more --- users/zeyer/experiments/exp2024_04_23_baselines/ctc.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py b/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py index a56f931ec..04bed1fa8 100644 --- a/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py +++ b/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py @@ -30,7 +30,7 @@ def py(): - train_exp( + train_exp( # 8.23 "v6-bhv20-11gb-f32-bs15k-accgrad1-mgpu4-pavg100-wd1e_2-lrlin1e_5_295k-speedpertV2", config_11gb_v6_f32_accgrad1_mgpu4_pavg100_wd1e_4, config_updates={ @@ -41,7 +41,7 @@ def py(): }, ) - train_exp( + train_exp( # 8.12 "v6-bhv20-11gb-f32-bs15k-accgrad1-mgpu4-pavg100-wd1e_2-lrlin1e_5_295k-speedpertV2-spm10k", config_11gb_v6_f32_accgrad1_mgpu4_pavg100_wd1e_4, config_updates={ From 9998dbf742ced5094f097162a6a5bf6c3686293d Mon Sep 17 00:00:00 2001 From: Albert Zeyer Date: Thu, 16 May 2024 10:20:34 +0200 Subject: [PATCH 018/227] cleanup, generalize, different spm vocab sizes --- users/zeyer/datasets/librispeech.py | 62 +++++++++++++++++------------ 1 file changed, 36 insertions(+), 26 deletions(-) diff --git a/users/zeyer/datasets/librispeech.py b/users/zeyer/datasets/librispeech.py index ac62016f8..c800e6893 100644 --- a/users/zeyer/datasets/librispeech.py +++ b/users/zeyer/datasets/librispeech.py @@ -5,6 +5,8 @@ 
from __future__ import annotations from typing import Optional, Any, Union, Tuple, Dict from copy import deepcopy +import re +from functools import cache from sisyphus import tk from i6_core.corpus.convert import CorpusToTextDictJob @@ -48,28 +50,37 @@ _train_corpus_text_dict = _corpus_text_dicts["train-other-960"] _train_corpus_text = TextDictToTextLinesJob(_train_corpus_text_dict, gzip=True).out_text_lines -# https://github.com/google/sentencepiece/blob/master/doc/options.md -_spm10k_train_job = TrainSentencePieceJob( - training_text=_train_corpus_text, - # Not sure if power-of-two or just multiple-of-64, but 10240 has more 2s in it (2048*5) than 10048. - vocab_size=10_240, - model_type=SentencePieceType.UNIGRAM, - additional_options={ - "split_digits": True, - "unk_id": 2, # default is 0 - "bos_id": 1, # default is 1 - "eos_id": 0, # default is 2 - }, -) -spm10k = SentencePieceModel( - dim=10_240, - model_file=_spm10k_train_job.out_model, - unknown_label="", - bos_idx=1, - eos_idx=0, -) -# common +@cache +def _get_spm_vocab(*, dim: Union[int, str]) -> SentencePieceModel: + if isinstance(dim, str): + # Not sure if power-of-two or just multiple-of-64, but 10240 has more 2s in it (2048*5) than 10048. 
+ dim = {"10k": 10_240, "5k": 5_120, "4k": 4_096, "1k": 1_024}[dim] + assert isinstance(dim, int) and dim >= 10 + + # https://github.com/google/sentencepiece/blob/master/doc/options.md + _spm_train_job = TrainSentencePieceJob( + training_text=_train_corpus_text, + vocab_size=dim, + model_type=SentencePieceType.UNIGRAM, + additional_options={ + "split_digits": True, + "unk_id": 2, # default is 0 + "bos_id": 1, # default is 1 + "eos_id": 0, # default is 2 + }, + ) + spm = SentencePieceModel( + dim=dim, + model_file=_spm_train_job.out_model, + unknown_label="", + bos_idx=1, + eos_idx=0, + ) + return spm + + +# common, this is the BPE10k that many of us use bpe10k = Bpe( dim=10_025, eos_idx=0, @@ -537,7 +548,10 @@ def get_librispeech_task_raw_v2( so it is easier to copy this setup to a new environment. """ if isinstance(vocab, str): - vocab = {"bpe10k": bpe10k, "spm10k": spm10k}[vocab] + if re.match("^spm[0-9]+.*$", vocab): + vocab = _get_spm_vocab(dim=vocab[3:]) + else: + vocab = {"bpe10k": bpe10k}[vocab] cache_key = make_hashable((dataset_cls, vocab, train_vocab_opts, audio_opts, audio_dim, dataset_train_opts)) if cache_key in _librispeech_task_raw_v2_cache: @@ -581,10 +595,6 @@ def get_librispeech_task_raw_v2( return task -def get_librispeech_task_bpe10k_raw_v2(**dataset_train_opts) -> Task: - return get_librispeech_task_raw_v2(vocab=bpe10k, **dataset_train_opts) - - def _bpe_to_words_v1(bpe: RecogOutput) -> RecogOutput: """BPE to words""" from i6_core.returnn.search import SearchBPEtoWordsJob From d5e00decef3176d4fae364e54d9d0484349cb4f8 Mon Sep 17 00:00:00 2001 From: Albert Zeyer Date: Thu, 16 May 2024 10:30:54 +0200 Subject: [PATCH 019/227] more --- users/zeyer/datasets/librispeech.py | 24 ++++++++--- .../exp2024_04_23_baselines/ctc.py | 41 +++++++++---------- 2 files changed, 37 insertions(+), 28 deletions(-) diff --git a/users/zeyer/datasets/librispeech.py b/users/zeyer/datasets/librispeech.py index c800e6893..5fdf35cb7 100644 --- 
a/users/zeyer/datasets/librispeech.py +++ b/users/zeyer/datasets/librispeech.py @@ -52,7 +52,9 @@ @cache -def _get_spm_vocab(*, dim: Union[int, str]) -> SentencePieceModel: +def _get_spm_vocab( + *, dim: Union[int, str], model_type: SentencePieceType = SentencePieceType.UNIGRAM +) -> SentencePieceModel: if isinstance(dim, str): # Not sure if power-of-two or just multiple-of-64, but 10240 has more 2s in it (2048*5) than 10048. dim = {"10k": 10_240, "5k": 5_120, "4k": 4_096, "1k": 1_024}[dim] @@ -62,7 +64,7 @@ def _get_spm_vocab(*, dim: Union[int, str]) -> SentencePieceModel: _spm_train_job = TrainSentencePieceJob( training_text=_train_corpus_text, vocab_size=dim, - model_type=SentencePieceType.UNIGRAM, + model_type=model_type, additional_options={ "split_digits": True, "unk_id": 2, # default is 0 @@ -91,6 +93,19 @@ def _get_spm_vocab(*, dim: Union[int, str]) -> SentencePieceModel: unknown_label=None, ) + +@cache +def _get_vocab_by_str(vocab: str) -> Union[SentencePieceModel, Bpe]: + if re.match("^spm[0-9]+.*$", vocab): + return _get_spm_vocab(dim=vocab[3:], model_type=SentencePieceType.UNIGRAM) + elif re.match("^spm_bpe[0-9]+.*$", vocab): + return _get_spm_vocab(dim=vocab[3:], model_type=SentencePieceType.BPE) + elif vocab == "bpe10k": # predefined + return bpe10k + else: + raise ValueError(f"invalid vocab {vocab!r}") + + # ESPnet uses this SPM. However, it does not use the vocab directly from it. # It has some custom code to generate its own vocab based from this: # https://github.com/espnet/espnet/blob/d0047402e830a3c53e8b590064af4bf70415fb3b/egs2/TEMPLATE/asr1/asr.sh#L878 @@ -548,10 +563,7 @@ def get_librispeech_task_raw_v2( so it is easier to copy this setup to a new environment. 
""" if isinstance(vocab, str): - if re.match("^spm[0-9]+.*$", vocab): - vocab = _get_spm_vocab(dim=vocab[3:]) - else: - vocab = {"bpe10k": bpe10k}[vocab] + vocab = _get_vocab_by_str(vocab) cache_key = make_hashable((dataset_cls, vocab, train_vocab_opts, audio_opts, audio_dim, dataset_train_opts)) if cache_key in _librispeech_task_raw_v2_cache: diff --git a/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py b/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py index 04bed1fa8..828dc82ea 100644 --- a/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py +++ b/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py @@ -30,28 +30,25 @@ def py(): - train_exp( # 8.23 - "v6-bhv20-11gb-f32-bs15k-accgrad1-mgpu4-pavg100-wd1e_2-lrlin1e_5_295k-speedpertV2", - config_11gb_v6_f32_accgrad1_mgpu4_pavg100_wd1e_4, - config_updates={ - **_get_cfg_lrlin_oclr_by_bs_nep(15_000, 500), - "optimizer.weight_decay": 1e-2, - "__train_audio_preprocess": speed_pert_librosa_config, - "speed_pert_discrete_values": [0.7, 0.8, 0.9, 1.0, 1.1], - }, - ) - - train_exp( # 8.12 - "v6-bhv20-11gb-f32-bs15k-accgrad1-mgpu4-pavg100-wd1e_2-lrlin1e_5_295k-speedpertV2-spm10k", - config_11gb_v6_f32_accgrad1_mgpu4_pavg100_wd1e_4, - config_updates={ - **_get_cfg_lrlin_oclr_by_bs_nep(15_000, 500), - "optimizer.weight_decay": 1e-2, - "__train_audio_preprocess": speed_pert_librosa_config, - "speed_pert_discrete_values": [0.7, 0.8, 0.9, 1.0, 1.1], - }, - vocab="spm10k", - ) + for vocab in [ + "bpm10k", # 8.23 + "spm10k", # 8.12 + "spm_bpe10k", + "spm4k", + "spm1k", + "bpe1k", + ]: + train_exp( # 8.23 + f"v6-bhv20-11gb-f32-bs15k-accgrad1-mgpu4-pavg100-wd1e_2-lrlin1e_5_295k-speedpertV2-{vocab}", + config_11gb_v6_f32_accgrad1_mgpu4_pavg100_wd1e_4, + config_updates={ + **_get_cfg_lrlin_oclr_by_bs_nep(15_000, 500), + "optimizer.weight_decay": 1e-2, + "__train_audio_preprocess": speed_pert_librosa_config, + "speed_pert_discrete_values": [0.7, 0.8, 0.9, 1.0, 1.1], + }, + vocab=vocab, + ) train_exp( 
"v6-bhv20-11gb-f32-bs15k-accgrad1-mgpu4-pavg100-wd1e_2-lrlin1e_5_295k-speedpertV2-spm10k-spmSample03", From 9a8d172f03b7662800ca359af60784fd8e83d5a7 Mon Sep 17 00:00:00 2001 From: Albert Zeyer Date: Thu, 16 May 2024 10:31:04 +0200 Subject: [PATCH 020/227] more --- users/zeyer/experiments/exp2024_04_23_baselines/ctc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py b/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py index 828dc82ea..cfb5507f8 100644 --- a/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py +++ b/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py @@ -36,7 +36,7 @@ def py(): "spm_bpe10k", "spm4k", "spm1k", - "bpe1k", + "spm_bpe1k", ]: train_exp( # 8.23 f"v6-bhv20-11gb-f32-bs15k-accgrad1-mgpu4-pavg100-wd1e_2-lrlin1e_5_295k-speedpertV2-{vocab}", From c3d0391906dc26edda847395f9a7a39ec64f9f75 Mon Sep 17 00:00:00 2001 From: Albert Zeyer Date: Thu, 16 May 2024 10:31:43 +0200 Subject: [PATCH 021/227] small fix --- users/zeyer/experiments/exp2024_04_23_baselines/ctc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py b/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py index cfb5507f8..65265722d 100644 --- a/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py +++ b/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py @@ -31,7 +31,7 @@ def py(): for vocab in [ - "bpm10k", # 8.23 + "bpe10k", # 8.23 "spm10k", # 8.12 "spm_bpe10k", "spm4k", From 3220040eed0bf77129b8839200df32f9b73a3868 Mon Sep 17 00:00:00 2001 From: "luca.gaudino" Date: Thu, 16 May 2024 11:12:09 +0200 Subject: [PATCH 022/227] update ls att+ctc+lm --- .../librispeech_960/attention_asr_config.py | 14 +- .../configs/ctc_att_search_w_recombine.py | 208 +++++++++++++++++- 2 files changed, 213 insertions(+), 9 deletions(-) diff --git a/users/gaudino/experiments/conformer_att_2023/librispeech_960/attention_asr_config.py 
b/users/gaudino/experiments/conformer_att_2023/librispeech_960/attention_asr_config.py index c801f5546..3fbe78a96 100644 --- a/users/gaudino/experiments/conformer_att_2023/librispeech_960/attention_asr_config.py +++ b/users/gaudino/experiments/conformer_att_2023/librispeech_960/attention_asr_config.py @@ -505,8 +505,9 @@ class CTCDecoderArgs(DecoderArgs): add_att_dec: bool = False att_scale: float = 0.65 ts_reward: float = 0.0 - blank_prob_scale: float = 0.0 - repeat_prob_scale: float = 0.0 + blank_prob_scale: float = 1.0 + repeat_prob_scale: float = 1.0 + label_prob_scale: float = 1.0 ctc_prior_correction: bool = False prior_scale: float = 1.0 logits: bool = False @@ -520,10 +521,15 @@ class CTCDecoderArgs(DecoderArgs): one_minus_term_mul_scale: float = 1.0 one_minus_term_sub_scale: float = 0.0 length_normalization: bool = False - hash_override_version: Optional[int] = None + length_normalization_scale: float = 1.0 + target_dim: int = 10025 + target_embed_dim: int = 640 + # hash_override_version: Optional[int] = None blank_collapse: bool = False renorm_p_comb: bool = False - recombine:bool = False + renorm_after_remove_blank: bool = True + recombine: bool = False + max_approx: bool = False def create_config( diff --git a/users/gaudino/experiments/conformer_att_2023/librispeech_960/configs/ctc_att_search_w_recombine.py b/users/gaudino/experiments/conformer_att_2023/librispeech_960/configs/ctc_att_search_w_recombine.py index 3f405dcf0..3ed600a57 100644 --- a/users/gaudino/experiments/conformer_att_2023/librispeech_960/configs/ctc_att_search_w_recombine.py +++ b/users/gaudino/experiments/conformer_att_2023/librispeech_960/configs/ctc_att_search_w_recombine.py @@ -503,7 +503,7 @@ def run_exp( # --------------------------- Without LM --------------------------- # # att only - for beam_size in [12]: + for beam_size in []: search_args = copy.deepcopy(oclr_args) search_args["beam_size"] = beam_size run_decoding( @@ -525,7 +525,7 @@ def run_exp( ) # ctc ts - for beam_size, 
prior_scale in product([12, 32], [0, 0.1]): + for beam_size, prior_scale in product([12, 32], []): search_args = copy.deepcopy(oclr_args) search_args["beam_size"] = beam_size search_args["ctc_log_prior_file"] = new_prior_file @@ -550,12 +550,13 @@ def run_exp( use_sclite=True, ) - # optsr att + ctc w prior - for scales, prior_scale, beam_size in product([(0.65, 0.35), (0.7, 0.3)], [0, 0.1], [32]): + # optsr max att + ctc w prior + for scales, prior_scale, beam_size in product([(0.65, 0.35)], [0.0, 0.05, 0.1], [32]): search_args = copy.deepcopy(oclr_args) search_args["beam_size"] = beam_size att_scale, ctc_scale = scales search_args["ctc_log_prior_file"] = new_prior_file + label_scale = 1.0 search_args["decoder_args"] = CTCDecoderArgs( add_att_dec=True, @@ -564,11 +565,20 @@ def run_exp( ctc_prior_correction=prior_scale > 0, prior_scale=prior_scale, recombine=True, + max_approx=True, + # normalization settings + one_minus_term_mul_scale=0.0, + renorm_after_remove_blank=False, + blank_prob_scale=1.0, + repeat_prob_scale=1.0, + label_prob_scale=label_scale, ) + time_rqmt = 2 run_decoding( exp_name=f"bsf{bsf}/" - + f"optsr_att{att_scale}_ctc{ctc_scale}" + + f"optsr_max_att{att_scale}_ctc{ctc_scale}" + (f"_prior_{prior_scale}" if prior_scale > 0 else "") + + f"_vanilla" + f"_beam{beam_size}", train_data=train_data, checkpoint=train_job_avg_ckpt[ @@ -584,7 +594,195 @@ def run_exp( "", }, # blanks are removed in the network use_sclite=True, + time_rqmt=time_rqmt, ) # --------------------------- With Lstm LM --------------------------- # + + # optsr max ctc w prior + lstm lm + for lm_scale, beam_size in product([0.3, 0.4, 0.5, 0.6], [12]): + search_args = copy.deepcopy(oclr_args) + search_args["beam_size"] = beam_size + search_args["ctc_log_prior_file"] = new_prior_file + ctc_scale = 1.0 + label_scale = 1.0 + prior_scale = 0.0 + + ext_lm_opts = lstm_lm_opts_map[BPE_10K] + + if not bsf > 0: + search_args["batch_size"] = ( + 4000 * 160 if beam_size <= 32 else 2000 * 160 + ) 
+ + time_rqmt = 2 + if beam_size > 30: + time_rqmt = 3 + + search_args["decoder_args"] = CTCDecoderArgs( + # add_att_dec=True, + # att_scale=att_scale, + add_ext_lm=True, + lm_type="lstm", + ext_lm_opts=ext_lm_opts, + lm_scale=lm_scale, + ctc_scale=ctc_scale, + ctc_prior_correction=prior_scale > 0, + prior_scale=prior_scale, + recombine=True, + max_approx=True, + # normalization settings + one_minus_term_mul_scale=0.0, + renorm_after_remove_blank=False, + blank_prob_scale=1.0, + repeat_prob_scale=1.0, + label_prob_scale=label_scale, + ) + run_decoding( + exp_name=f"bsf{bsf}/" + + f"optsr_max_ctc{ctc_scale}_lstmlm{lm_scale}" + + (f"_prior_{prior_scale}" if prior_scale > 0 else "") + + f"_vanilla" + + f"_beam{beam_size}", + train_data=train_data, + checkpoint=train_job_avg_ckpt[ + f"base_conf_12l_lstm_1l_conv6_OCLR_sqrdReLU_cyc915_ep2035_peak0.0009_retrain1_const20_linDecay580_{1e-4}" + ], + search_args=search_args, + feature_extraction_net=log10_net_10ms, + bpe_size=BPE_10K, + test_sets=["dev-other"], + # test_sets=["dev-other"], + remove_label={ + "", + "", + }, # blanks are removed in the network + use_sclite=True, + time_rqmt=time_rqmt, + ) + # --------------------------- With Trafo LM --------------------------- # + + # optsr max ctc w prior + trafo lm + for lm_scale, beam_size in product([0.5, 0.6, 0.65, 0.7], [12]): + search_args = copy.deepcopy(oclr_args) + search_args["beam_size"] = beam_size + search_args["ctc_log_prior_file"] = new_prior_file + ctc_scale = 1.0 + label_scale = 1.0 + prior_scale = 0.0 + + ext_lm_opts = trafo_lm_opts_map[BPE_10K] + + if not bsf > 0: + search_args["batch_size"] = ( + 4000 * 160 if beam_size <= 32 else 2000 * 160 + ) + + time_rqmt = 4 + if beam_size > 30: + time_rqmt = 5 + + search_args["decoder_args"] = CTCDecoderArgs( + # add_att_dec=True, + # att_scale=att_scale, + add_ext_lm=True, + lm_type="trafo", + ext_lm_opts=ext_lm_opts, + lm_scale=lm_scale, + ctc_scale=ctc_scale, + ctc_prior_correction=prior_scale > 0, + 
prior_scale=prior_scale, + recombine=True, + max_approx=True, + # normalization settings + one_minus_term_mul_scale=0.0, + renorm_after_remove_blank=False, + blank_prob_scale=1.0, + repeat_prob_scale=1.0, + label_prob_scale=label_scale, + ) + run_decoding( + exp_name=f"bsf{bsf}/" + + f"optsr_max_ctc{ctc_scale}_trafolm{lm_scale}" + + (f"_prior_{prior_scale}" if prior_scale > 0 else "") + + f"_vanilla" + + f"_beam{beam_size}", + train_data=train_data, + checkpoint=train_job_avg_ckpt[ + f"base_conf_12l_lstm_1l_conv6_OCLR_sqrdReLU_cyc915_ep2035_peak0.0009_retrain1_const20_linDecay580_{1e-4}" + ], + search_args=search_args, + feature_extraction_net=log10_net_10ms, + bpe_size=BPE_10K, + test_sets=["dev-other"], + # test_sets=["dev-other"], + remove_label={ + "", + "", + }, # blanks are removed in the network + use_sclite=True, + time_rqmt=time_rqmt, + ) + + # optsr max att + ctc w prior + trafo lm + for scales, lm_scale, beam_size in product([(0.65, 0.35, 0.0)], [0.45, 0.5, 0.55], [12]): + search_args = copy.deepcopy(oclr_args) + search_args["beam_size"] = beam_size + search_args["ctc_log_prior_file"] = new_prior_file + att_scale, ctc_scale, prior_scale = scales + label_scale = 1.0 + + ext_lm_opts = trafo_lm_opts_map[BPE_10K] + + if not bsf > 0: + search_args["batch_size"] = ( + 4000 * 160 if beam_size <= 32 else 2000 * 160 + ) + + time_rqmt = 4 + if beam_size > 30: + time_rqmt = 5 + + search_args["decoder_args"] = CTCDecoderArgs( + add_att_dec=True, + att_scale=att_scale, + add_ext_lm=True, + lm_type="trafo", + ext_lm_opts=ext_lm_opts, + lm_scale=lm_scale, + ctc_scale=ctc_scale, + ctc_prior_correction=prior_scale > 0, + prior_scale=prior_scale, + recombine=True, + max_approx=True, + # normalization settings + one_minus_term_mul_scale=0.0, + renorm_after_remove_blank=False, + blank_prob_scale=1.0, + repeat_prob_scale=1.0, + label_prob_scale=label_scale, + ) + run_decoding( + exp_name=f"bsf{bsf}/" + + f"optsr_max_att{att_scale}_ctc{ctc_scale}_trafolm{lm_scale}" + + 
(f"_prior_{prior_scale}" if prior_scale > 0 else "") + + f"_vanilla" + + f"_beam{beam_size}", + train_data=train_data, + checkpoint=train_job_avg_ckpt[ + f"base_conf_12l_lstm_1l_conv6_OCLR_sqrdReLU_cyc915_ep2035_peak0.0009_retrain1_const20_linDecay580_{1e-4}" + ], + search_args=search_args, + feature_extraction_net=log10_net_10ms, + bpe_size=BPE_10K, + test_sets=["dev-other"], + # test_sets=["dev-other"], + remove_label={ + "", + "", + }, # blanks are removed in the network + use_sclite=True, + time_rqmt=time_rqmt, + ) + From 7b294a2b081199da35c32dbeceb01422169816da Mon Sep 17 00:00:00 2001 From: Mohammad Zeineldeen Date: Thu, 16 May 2024 09:58:36 +0000 Subject: [PATCH 023/227] add args --- .../librispeech_960/attention_asr_config.py | 31 ++++++++++++++----- 1 file changed, 24 insertions(+), 7 deletions(-) diff --git a/users/zeineldeen/experiments/conformer_att_2022/librispeech_960/attention_asr_config.py b/users/zeineldeen/experiments/conformer_att_2022/librispeech_960/attention_asr_config.py index ded12f9e8..3a6f500a1 100644 --- a/users/zeineldeen/experiments/conformer_att_2022/librispeech_960/attention_asr_config.py +++ b/users/zeineldeen/experiments/conformer_att_2022/librispeech_960/attention_asr_config.py @@ -6,6 +6,7 @@ from dataclasses import dataclass, asdict from i6_experiments.users.zeineldeen.models.asr.encoder.conformer_encoder import ConformerEncoder +from i6_experiments.users.zeineldeen.models.asr.encoder.conformer_encoder_v2 import ConformerEncoderV2 from i6_experiments.users.zeineldeen.models.asr.decoder.transformer_decoder import TransformerDecoder from i6_experiments.users.zeineldeen.models.asr.decoder.conformer_decoder import ConformerDecoder from i6_experiments.users.zeineldeen.models.asr.decoder.rnn_decoder import RNNDecoder @@ -349,11 +350,6 @@ def pretrain_layers_and_dims( net_dict = encoder_model.network.get_net() - # if decoder_args["ce_loss_scale"] == 0.0: - # assert encoder_args["with_ctc"], "CTC loss is not enabled." 
- # net_dict["output"] = {"class": "copy", "from": "ctc"} - # net_dict["decision"]["target"] = "bpe_labels_w_blank" - # else: net_dict.update(decoder_model.network.get_net()) net_dict.update(extra_net_dict) @@ -430,6 +426,17 @@ class ConformerEncoderArgs(EncoderArgs): convolution_first: bool = False +class ConformerEncoderV2Args(ConformerEncoderArgs): + # weight noise + ff_weight_noise: Optional[float] = None + mhsa_weight_noise: Optional[float] = None + conv_weight_noise: Optional[float] = None + frontend_conv_weight_noise: Optional[float] = None + + # weight dropout + frontend_conv_weight_dropout: Optional[float] = None + + class DecoderArgs: pass @@ -454,8 +461,14 @@ class TransformerDecoderArgs(DecoderArgs): embed_dropout: float = 0.1 softmax_dropout: float = 0.0 + ff_weight_noise: Optional[float] = None + mhsa_weight_noise: Optional[float] = None + ff_weight_dropout: Optional[float] = None + mhsa_weight_dropout: Optional[float] = None + # other regularization l2: float = 0.0 + self_att_l2: float = 0.0 rel_pos_clipping: int = 16 label_smoothing: float = 0.1 apply_embed_weight: bool = False @@ -732,9 +745,13 @@ def create_config( exp_config["newbob_learning_rate_decay"] = lr_decay # -------------------------- network -------------------------- # - encoder_type = None + if isinstance(encoder_args, ConformerEncoderArgs): encoder_type = ConformerEncoder + elif isinstance(encoder_args, ConformerEncoderV2Args): + encoder_type = ConformerEncoderV2 + else: + raise ValueError("invalid encoder_args type") if isinstance(decoder_args, TransformerDecoderArgs): decoder_type = TransformerDecoder @@ -746,7 +763,7 @@ def create_config( decoder_type = ConformerDecoder dec_type = "conformer" # TODO: check if same as transformer else: - assert False, "invalid decoder_args type" + raise ValueError("invalid decoder_args type") encoder_args = asdict(encoder_args) if feature_extraction_net: From 4a79ffad26f3ada7c86f5c16520e94e486ea05fd Mon Sep 17 00:00:00 2001 From: Mohammad 
Zeineldeen Date: Thu, 16 May 2024 10:11:58 +0000 Subject: [PATCH 024/227] fix pretraining --- .../librispeech_960/attention_asr_config.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/users/zeineldeen/experiments/conformer_att_2022/librispeech_960/attention_asr_config.py b/users/zeineldeen/experiments/conformer_att_2022/librispeech_960/attention_asr_config.py index 3a6f500a1..8e2d62658 100644 --- a/users/zeineldeen/experiments/conformer_att_2022/librispeech_960/attention_asr_config.py +++ b/users/zeineldeen/experiments/conformer_att_2022/librispeech_960/attention_asr_config.py @@ -315,14 +315,18 @@ def pretrain_layers_and_dims( dim_frac_enc = 1 dim_frac_dec = 1 - # TODO: use explicit param names otherwise multiple matches can lead to multiple reductions! - # TODO: WARNING: this does not include weight dropout and weight noise! # do not enable regulizations in the first pretraining step to make it more stable if initial_disabled_regularization_patterns is None: - regs_words = ["dropout", "noise", "l2"] # dropout, weight dropout, l2, weight noise + # dropout, weight dropout, l2, weight noise + regs_words = ["dropout", "weight_noise", "l2"] else: regs_words = initial_disabled_regularization_patterns + + excluded_keys = ["param_dropout_min_ndim", "weight_noise_layers"] + for k in encoder_args_copy.keys(): + if k in excluded_keys: + continue for regs_word in regs_words: if regs_word in k and encoder_args_copy[k] is not None: if not isinstance(encoder_args_copy[k], float): @@ -333,6 +337,8 @@ def pretrain_layers_and_dims( encoder_args_copy[k] *= dim_frac_enc for k in decoder_args_copy.keys(): + if k in excluded_keys: + continue for regs_word in regs_words: if regs_word in k and decoder_args_copy[k] is not None: if not isinstance(decoder_args_copy[k], float): From da315769f79e73c5689df0c08263236f227b01d7 Mon Sep 17 00:00:00 2001 From: Lukas Rilling Date: Tue, 16 Apr 2024 17:35:55 +0200 Subject: [PATCH 025/227] Glow-TTS-ASR: Update with 
fixed invertibility tests --- .../evaluation/forward_comparison.ipynb | 110 +- .../evaluation/phoneme_prediction_eval.ipynb | 618 +++--- users/rilling/evaluation/swer_eval.ipynb | 869 ++++---- users/rilling/evaluation/wer_eval.ipynb | 1782 +++++++++-------- .../common/eval_references/swer.py | 2 +- .../librispeech/common/tts_eval.py | 11 +- .../librispeech_glow_asr/experiments.py | 229 ++- ...former_no_freeze_spec_augment_weak_conf.py | 290 +++ ...2.py => glowASR_conformer_v2_weak_conf.py} | 33 +- ...former_x_vector_v2_bottleneck_weak_conf.py | 445 ++++ .../training_comparison.ipynb | 31 +- ..._test_maxlike_alignment_multi_layer_ffn.py | 1 + .../training_comparison.ipynb | 12 +- .../librispeech_joint_training/experiments.py | 69 + .../librispeech_joint_training/pipeline.py | 6 +- .../glowTTS_ASR_conformer_two_forward_pass.py | 4 +- .../glowTTS_ASR_conformer_x_vector.py | 7 +- .../pytorch_networks/glow_ASR_conformer.py | 8 +- .../glow_ASR_conformer_specaugment_before.py | 8 +- ...SR_conformer_specaugment_before_xvector.py | 7 +- .../shared/eval_invertibility.py | 76 +- .../config.py | 5 +- .../data.py | 2 +- .../exp_joint/experiments.py | 6 +- .../exp_joint_2step/experiments.py | 99 +- .../exp_joint_2step/training_comparison.ipynb | 152 +- .../experiments.py | 56 + .../training_comparison.ipynb | 42 +- .../exp_tts/experiments.py | 192 +- .../exp_tts/training_comparison.ipynb | 589 +++++- .../pipeline.py | 1 + .../frozen_glowtts/ga_glowTTS_ASR_ffn.py | 1 + .../frozen_glowtts/ga_glowTTS_ASR_ffn_mas.py | 658 ++++++ .../ga_glowTTS_ASR_ffn_mas_no_eval.py | 657 ++++++ .../ga_glowTTS_ASR_ffn_x_vector.py | 1 - .../ga_glowTTS_ASR_ffn_x_vector_mas.py | 8 +- ...ga_glowTTS_ASR_ffn_x_vector_mas_no_eval.py | 600 ++++++ .../glowASR_conformer_x_vector.py | 483 +++++ .../pytorch_networks/glowTTS.py | 1 + .../pytorch_networks/glowTTS_x_vector_v2.py | 1 + .../pytorch_networks/shared/configs.py | 41 +- .../shared/eval_invertibility.py | 90 + .../storage.py | 5 +- 43 files changed, 6465 
insertions(+), 1843 deletions(-) create mode 100644 users/rilling/experiments/librispeech/librispeech_glow_asr/pytorch_networks/glowASR_conformer_no_freeze_spec_augment_weak_conf.py rename users/rilling/experiments/librispeech/librispeech_glow_asr/pytorch_networks/{glowASR_conformer_v2.py => glowASR_conformer_v2_weak_conf.py} (96%) create mode 100644 users/rilling/experiments/librispeech/librispeech_glow_asr/pytorch_networks/glowASR_conformer_x_vector_v2_bottleneck_weak_conf.py create mode 100644 users/rilling/experiments/librispeech/librispeech_joint_training_given_alignments/pytorch_networks/frozen_glowtts/ga_glowTTS_ASR_ffn_mas.py create mode 100644 users/rilling/experiments/librispeech/librispeech_joint_training_given_alignments/pytorch_networks/frozen_glowtts/ga_glowTTS_ASR_ffn_mas_no_eval.py create mode 100644 users/rilling/experiments/librispeech/librispeech_joint_training_given_alignments/pytorch_networks/frozen_glowtts/ga_glowTTS_ASR_ffn_x_vector_mas_no_eval.py create mode 100644 users/rilling/experiments/librispeech/librispeech_joint_training_given_alignments/pytorch_networks/frozen_glowtts/glowASR_conformer_x_vector.py create mode 100644 users/rilling/experiments/librispeech/librispeech_joint_training_given_alignments/pytorch_networks/shared/eval_invertibility.py diff --git a/users/rilling/evaluation/forward_comparison.ipynb b/users/rilling/evaluation/forward_comparison.ipynb index 604037054..c22bf1a9e 100644 --- a/users/rilling/evaluation/forward_comparison.ipynb +++ b/users/rilling/evaluation/forward_comparison.ipynb @@ -2,14 +2,14 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "/tmp/ipykernel_3656952/2049839218.py:8: DeprecationWarning: Importing display from IPython.core.display is deprecated since IPython 7.14, please import from IPython display\n", + "/tmp/ipykernel_503671/2049839218.py:8: DeprecationWarning: Importing display 
from IPython.core.display is deprecated since IPython 7.14, please import from IPython display\n", " from IPython.core.display import display\n" ] } @@ -33,35 +33,38 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "{'/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment_ctc_scale_0.1/tts_eval_gl/test-clean/forward/output/audio_files/': '/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment_ctc_scale_0.1/',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment/tts_eval_gl/test-clean/forward/output/audio_files/': '/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment/',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_ctc_scale_0.1/tts_eval_gl/test-clean/forward/output/audio_files/': '/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_ctc_scale_0.1/',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass/tts_eval_gl/test-clean/forward/output/audio_files/': '/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass/',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2/enc192/200ep/dec_drop_0.05/tts_eval_gl/test-clean/forward/output/audio_files/': '/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2/enc192/200ep/dec_drop_0.05/',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2/enc768/200ep/dec_drop_0.05/tts_eval_gl/test-clean/forward/output/audio_files/': '/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2/enc768/200ep/dec_drop_0.05/'}" + "{'/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2/enc768/400ep/dec_drop_0.05/tts_eval_gl/test-clean/forward/output/audio_files/': '/glowTTS_x_vector_v2/enc768/400ep/',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2/enc768/200ep/dec_drop_0.05/tts_eval_gl/test-clean/forward/output/audio_files/': '/glowTTS_x_vector_v2/enc768/200ep/',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2/enc768/100ep/dec_drop_0.05/tts_eval_gl/test-clean/forward/output/audio_files/': '/glowTTS_x_vector_v2/enc768/100ep/',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2/enc768/200ep_long_cooldown/dec_drop_0.05/tts_eval_gl/test-clean/forward/output/audio_files/': '/glowTTS_x_vector_v2/enc768/200ep_long_cooldown/',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc768/100ep/dec_drop_0.05/tts_eval_gl/test-clean/forward/output/audio_files/': '/glowTTS/enc768/100ep/',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc768/200ep/dec_drop_0.05/tts_eval_gl/test-clean/forward/output/audio_files/': '/glowTTS/enc768/200ep/',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc768/400ep/grad_clip_10/dec_drop_0.05/tts_eval_gl/test-clean/forward/output/audio_files/': '/glowTTS/enc768/400ep/grad_clip_10/'}" ] }, - "execution_count": 15, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "globs = [\n", - " # \"/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector/enc768/100ep/dec_drop_0.05/tts_eval_gl/test-clean/forward/output/audio_files/\",\n", + " \"/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2/enc768/*/dec_drop_0.05/tts_eval_gl/test-clean/forward/output/audio_files/\",\n", + " \"/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc768/*/dec_drop_0.05/tts_eval_gl/test-clean/forward/output/audio_files/\",\n", + " \"/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc768/*/*/dec_drop_0.05/tts_eval_gl/test-clean/forward/output/audio_files/\",\n", " # \"/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/conformer_coupling/raw_audio/ddi/glowTTS/tts_eval_gl/test-clean/forward/output/audio_files/\",\n", " # \"/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/conformer_coupling/raw_audio/ddi/glowTTS_x_vector/tts_eval_gl/test-clean/forward/output/audio_files/\",\n", " # \"/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2_conformer_coupling/enc768/200ep/dec_drop_0.05/tts_eval_gl/test-clean/forward/output/audio_files/\",\n", " # 
\"/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS_simple_encoder/*/200ep/dec_drop_*/tts_eval_gl/forward/output/audio_files/\",\n", - " \"/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment*/tts_eval_gl/test-clean/forward/output/audio_files/\",\n", - " \"/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass*/tts_eval_gl/test-clean/forward/output/audio_files/\",\n", - " \"/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2/enc*/200ep/dec_drop_0.05/tts_eval_gl/test-clean/forward/output/audio_files/\",\n", + " # \"/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment*/tts_eval_gl/test-clean/forward/output/audio_files/\",\n", + " # \"/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass*/tts_eval_gl/test-clean/forward/output/audio_files/\",\n", + " # \"/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2/enc*/200ep/dec_drop_0.05/tts_eval_gl/test-clean/forward/output/audio_files/\",\n", "]\n", "lr_files = []\n", "for g in globs:\n", @@ -83,16 +86,16 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - 
"/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment_ctc_scale_0.1/tts_eval_gl/test-clean/forward/output/audio_files/test-clean_8463-287645-0010_8463-287645-0010.ogg\n", - "/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment_ctc_scale_0.1/\n", - "AutoMOS: 3.196073057424081\n", + "/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2/enc768/400ep/dec_drop_0.05/tts_eval_gl/test-clean/forward/output/audio_files/test-clean_121-127105-0010_121-127105-0010.ogg\n", + "/glowTTS_x_vector_v2/enc768/400ep/\n", + "AutoMOS: 3.3250440482638384\n", "\n" ] }, @@ -101,7 +104,7 @@ "text/html": [ "\n", " \n", " " @@ -117,9 +120,9 @@ "name": "stdout", "output_type": "stream", "text": [ - "/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment/tts_eval_gl/test-clean/forward/output/audio_files/test-clean_8463-287645-0010_8463-287645-0010.ogg\n", - "/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment/\n", - "AutoMOS: 2.320053706043645\n", + "/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2/enc768/200ep/dec_drop_0.05/tts_eval_gl/test-clean/forward/output/audio_files/test-clean_121-127105-0010_121-127105-0010.ogg\n", + "/glowTTS_x_vector_v2/enc768/200ep/\n", + "AutoMOS: 3.2574123507855752\n", "\n" ] }, @@ -128,7 +131,7 @@ "text/html": [ "\n", " \n", " " @@ -144,9 +147,9 @@ "name": "stdout", "output_type": "stream", "text": [ - 
"/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_ctc_scale_0.1/tts_eval_gl/test-clean/forward/output/audio_files/test-clean_8463-287645-0010_8463-287645-0010.ogg\n", - "/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_ctc_scale_0.1/\n", - "AutoMOS: 3.1575978984566113\n", + "/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2/enc768/100ep/dec_drop_0.05/tts_eval_gl/test-clean/forward/output/audio_files/test-clean_121-127105-0010_121-127105-0010.ogg\n", + "/glowTTS_x_vector_v2/enc768/100ep/\n", + "AutoMOS: 3.181718836146358\n", "\n" ] }, @@ -155,7 +158,7 @@ "text/html": [ "\n", " \n", " " @@ -171,9 +174,9 @@ "name": "stdout", "output_type": "stream", "text": [ - "/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass/tts_eval_gl/test-clean/forward/output/audio_files/test-clean_8463-287645-0010_8463-287645-0010.ogg\n", - "/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass/\n", - "AutoMOS: 2.614637943229785\n", + "/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2/enc768/200ep_long_cooldown/dec_drop_0.05/tts_eval_gl/test-clean/forward/output/audio_files/test-clean_121-127105-0010_121-127105-0010.ogg\n", + "/glowTTS_x_vector_v2/enc768/200ep_long_cooldown/\n", + "AutoMOS: 3.212764627153152\n", "\n" ] }, @@ -182,7 +185,7 @@ "text/html": [ "\n", " \n", " " @@ -198,9 +201,9 @@ "name": "stdout", "output_type": "stream", "text": [ - 
"/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2/enc192/200ep/dec_drop_0.05/tts_eval_gl/test-clean/forward/output/audio_files/test-clean_8463-287645-0010_8463-287645-0010.ogg\n", - "/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2/enc192/200ep/dec_drop_0.05/\n", - "AutoMOS: 3.2963643568126777\n", + "/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc768/100ep/dec_drop_0.05/tts_eval_gl/test-clean/forward/output/audio_files/test-clean_121-127105-0010_121-127105-0010.ogg\n", + "/glowTTS/enc768/100ep/\n", + "AutoMOS: 3.302589650658008\n", "\n" ] }, @@ -209,7 +212,7 @@ "text/html": [ "\n", " \n", " " @@ -225,9 +228,36 @@ "name": "stdout", "output_type": "stream", "text": [ - "/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2/enc768/200ep/dec_drop_0.05/tts_eval_gl/test-clean/forward/output/audio_files/test-clean_8463-287645-0010_8463-287645-0010.ogg\n", - "/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2/enc768/200ep/dec_drop_0.05/\n", - "AutoMOS: 3.2574123507855752\n", + "/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc768/200ep/dec_drop_0.05/tts_eval_gl/test-clean/forward/output/audio_files/test-clean_121-127105-0010_121-127105-0010.ogg\n", + "/glowTTS/enc768/200ep/\n", + "AutoMOS: 3.3596264232244146\n", + "\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + 
"/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc768/400ep/grad_clip_10/dec_drop_0.05/tts_eval_gl/test-clean/forward/output/audio_files/test-clean_121-127105-0010_121-127105-0010.ogg\n", + "/glowTTS/enc768/400ep/grad_clip_10/\n", + "AutoMOS: 3.2932603021495437\n", "\n" ] }, @@ -236,7 +266,7 @@ "text/html": [ "\n", " \n", " " @@ -250,7 +280,7 @@ } ], "source": [ - "sequence_index = 5\n", + "sequence_index = 10\n", "sequence_name = sequence_names[sequence_index]\n", "\n", "for folder, name in files.items():\n", diff --git a/users/rilling/evaluation/phoneme_prediction_eval.ipynb b/users/rilling/evaluation/phoneme_prediction_eval.ipynb index fd2438cd1..31e863a2d 100644 --- a/users/rilling/evaluation/phoneme_prediction_eval.ipynb +++ b/users/rilling/evaluation/phoneme_prediction_eval.ipynb @@ -44,57 +44,67 @@ { "data": { "text/plain": [ - "['/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector_v2/mean_only/200ep/basic_init/no_specaug/tts_target_size/ce_ls_0.1/',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector_v2/200ep/tts_pretrained/no_specaug/tts_target_size/ce_ls_0.1/',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector_v2/200ep/basic_init/no_specaug/tts_target_size/ce_ls_0.1/',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector_v2/200ep/basic_init/no_specaug/tts_target_size/ce_ls_1.0/',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_simple_encoder_x_vector/200ep/basic_init/no_specaug/tts_target_size/ce_ls_0.1/',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_cnn_x_vector/tts_pretrained/no_specaug/tts_target_size/ce_ls_0.1/',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_cnn_x_vector/basic_init/no_specaug/tts_target_size/ce_ls_0.1/',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector/200ep/basic_init/no_specaug/tts_target_size/ce_ls_0.1/',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector/200ep/tts_pretrained/no_specaug/tts_target_size/ce_ls_0.1/',\n", + "['/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_simple_encoder_x_vector/200ep/basic_init/no_specaug/tts_target_size/ce_ls_0.1/',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector/specaug/ce_ls_0.1/',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector/specaug/ce_ls_1/',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector/no_specaug/ce_ls_0.1/',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector/200ep/basic_init/no_specaug/tts_target_size/ce_ls_0.1/',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector/200ep/tts_pretrained/no_specaug/tts_target_size/ce_ls_0.1/',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector/no_specaug/ce_ls_1/',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_blstm_x_vector/specaug/ce_ls_0.1/',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector/no_specaug/ce_ls_0.1/',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_cnn_x_vector/tts_pretrained/no_specaug/tts_target_size/ce_ls_0.1/',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_cnn_x_vector/basic_init/no_specaug/tts_target_size/ce_ls_0.1/',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector_v2/mean_only/200ep/basic_init/no_specaug/tts_target_size/ce_ls_0.1/',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector_v2/200ep/tts_pretrained/no_specaug/tts_target_size/ce_ls_0.1/',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector_v2/200ep/basic_init/no_specaug/tts_target_size/ce_ls_1.0/',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector_v2/200ep/basic_init/no_specaug/tts_target_size/ce_ls_0.1/',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_blstm_x_vector/specaug/ce_ls_1/',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_blstm_x_vector/no_specaug/ce_ls_0.1/',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_blstm_x_vector/specaug/ce_ls_0.1/',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_blstm_x_vector/no_specaug/ce_ls_1/',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/second_step_tts/ga_glowTTS_ASR_ffn_x_vector_v2_2ndstep_tts/ce_ls_1.0/',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_cnn_x_vector/basic_init/ce_ls_0.1/',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_cnn_x_vector/tts_pretrained/ce_ls_0.1/',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_ffn_x_vector/200ep/tts_pretrained/ce_ls_0.1/',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_ffn_x_vector/200ep/basic_init/ce_ls_0.1/',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_cnn_x_vector_v2/basic_init/ce_ls_0.1/',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_cnn_x_vector_v2/basic_init/ce_ls_0.01/',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_cnn_x_vector_v2/tts_pretrained/ce_ls_0.1/',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_cnn_x_vector_v2/tts_pretrained/ce_ls_0.01/',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/second_step/ga_glowTTS_ASR_ffn_x_vector_v2_2ndstep/ce_ls_1.0/',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_cnn_x_vector/100ep/encoder/decoder_eval/',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_cnn_x_vector/100ep/encoder/encoder_eval/',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_cnn_x_vector/100ep/decoder/encoder_eval/',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_cnn_x_vector/100ep/decoder/decoder_eval/',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_ffn_x_vector/100ep/decoder/decoder_eval/',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_ffn_x_vector/100ep/decoder/encoder_eval/',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_blstm_x_vector/no_specaug/ce_ls_0.1/',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/first_step/ga_glowTTS_ASR_ffn_x_vector_v2/ce_ls_4.0/',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/first_step/ga_glowTTS_ASR_ffn_x_vector_v2/ce_ls_3.0/',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/first_step/ga_glowTTS_ASR_ffn_x_vector_v2/ce_ls_2.0/',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/first_step/ga_glowTTS_ASR_ffn_x_vector_v2/ce_ls_1.0/',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_ffn_mas/100ep/encoder/decoder_eval/',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_ffn_mas/100ep/encoder/encoder_eval/',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_ffn_x_vector/100ep/encoder/decoder_eval/',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_ffn_x_vector/100ep/encoder/encoder_eval/',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_ffn_x_vector/100ep/decoder/encoder_eval/',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_ffn_x_vector/100ep/decoder/decoder_eval/',\n", " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_ffn/100ep/encoder/encoder_eval/',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_ffn/100ep/encoder/decoder_eval/',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_ffn/100ep/decoder/encoder_eval/',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_ffn/100ep/decoder/decoder_eval/',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_ffn/100ep/decoder/encoder_eval/',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_cnn_x_vector/100ep/encoder/encoder_eval/',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_cnn_x_vector/100ep/encoder/decoder_eval/',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_cnn_x_vector/100ep/decoder/encoder_eval/',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_cnn_x_vector/100ep/decoder/decoder_eval/',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_ffn_x_vector_mas/100ep/encoder/decoder_eval/',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_ffn_x_vector_mas/100ep/encoder/encoder_eval/',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_ffn_x_vector_mas/100ep/decoder/decoder_eval/',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_ffn_x_vector_mas/100ep/decoder/encoder_eval/',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_cnn/100ep/decoder/decoder_eval/',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_cnn/100ep/decoder/encoder_eval/',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_cnn/100ep/encoder/decoder_eval/',\n", " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_cnn/100ep/encoder/encoder_eval/',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_cnn/100ep/decoder/encoder_eval/',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_cnn/100ep/decoder/decoder_eval/',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/first_step/ga_glowTTS_ASR_ffn_x_vector_v2/ce_ls_1.0/',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/first_step/ga_glowTTS_ASR_ffn_x_vector_v2/ce_ls_2.0/',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/first_step/ga_glowTTS_ASR_ffn_x_vector_v2/ce_ls_3.0/',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/first_step/ga_glowTTS_ASR_ffn_x_vector_v2/ce_ls_4.0/',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_ffn_x_vector_v2/200ep/tts_pretrained/ce_ls_0.1/',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_ffn_mas_no_eval/100ep/decoder/decoder_eval/',\n", + 
" '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_ffn_mas_no_eval/100ep/decoder/encoder_eval/',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_ffn_mas_no_eval/100ep/encoder/decoder_eval/',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_ffn_mas_no_eval/100ep/encoder/encoder_eval/',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_cnn_x_vector_v2/tts_pretrained/ce_ls_0.1/',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_cnn_x_vector_v2/tts_pretrained/ce_ls_0.01/',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_cnn_x_vector_v2/basic_init/ce_ls_0.1/',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_cnn_x_vector_v2/basic_init/ce_ls_0.01/',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/second_step_tts/ga_glowTTS_ASR_ffn_x_vector_v2_2ndstep_tts/ce_ls_1.0/',\n", " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_ffn_x_vector_v2/200ep/tts_pretrained/ce_ls_1.0/',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_ffn_x_vector_v2/200ep/tts_pretrained/ce_ls_0.1/',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_ffn_x_vector_v2/200ep/basic_init/ce_ls_1.0/',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_ffn_x_vector_v2/200ep/basic_init/ce_ls_0.1/']" + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_ffn_x_vector_v2/200ep/basic_init/ce_ls_0.1/',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_ffn_x_vector/200ep/basic_init/ce_ls_0.1/',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_ffn_x_vector/200ep/tts_pretrained/ce_ls_0.1/',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_cnn_x_vector/tts_pretrained/ce_ls_0.1/',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_cnn_x_vector/basic_init/ce_ls_0.1/',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/second_step/ga_glowTTS_ASR_ffn_x_vector_v2_2ndstep/ce_ls_1.0/']" ] }, "execution_count": 3, @@ -163,24 +173,24 @@ " \n", " \n", " joint_training/given_alignments\n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector_v2/mean_only/200ep/basic_init/no_specaug/tts_target_size/ce_ls_0.1/\n", - " 22.60\n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_simple_encoder_x_vector/200ep/basic_init/no_specaug/tts_target_size/ce_ls_0.1/\n", + " 21.30\n", " \n", " \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector_v2/200ep/tts_pretrained/no_specaug/tts_target_size/ce_ls_0.1/\n", - " 23.40\n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector/specaug/ce_ls_0.1/\n", + " 26.49\n", " \n", " \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector_v2/200ep/basic_init/no_specaug/tts_target_size/ce_ls_0.1/\n", - " 22.76\n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector/specaug/ce_ls_1/\n", + " 27.30\n", " \n", " \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector_v2/200ep/basic_init/no_specaug/tts_target_size/ce_ls_1.0/\n", - " 22.24\n", + " 
/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector/200ep/basic_init/no_specaug/tts_target_size/ce_ls_0.1/\n", + " 30.09\n", " \n", " \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_simple_encoder_x_vector/200ep/basic_init/no_specaug/tts_target_size/ce_ls_0.1/\n", - " 21.30\n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector/200ep/tts_pretrained/no_specaug/tts_target_size/ce_ls_0.1/\n", + " 28.56\n", " \n", " \n", "\n", @@ -189,11 +199,11 @@ "text/plain": [ " Accuracy [%]\n", "Group Experiment \n", - "joint_training/given_alignments /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 22.60\n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 23.40\n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 22.76\n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 22.24\n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 21.30" + "joint_training/given_alignments /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 21.30\n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 26.49\n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 27.30\n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 30.09\n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 
28.56" ] }, "execution_count": 4, @@ -265,63 +275,63 @@ " \n", " \n", " joint_training/given_alignments\n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector_v2/mean_only/200ep/basic_init/no_specaug/tts_target_size/ce_ls_0.1/\n", - " 22.60\n", - " 2.922168\n", - " 2.922654\n", - " -0.803728\n", - " 0.932394\n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_simple_encoder_x_vector/200ep/basic_init/no_specaug/tts_target_size/ce_ls_0.1/\n", + " 21.30\n", + " 2.920585\n", + " 2.941674\n", + " -0.797284\n", + " 0.957548\n", " True\n", " False\n", - " 1.000166\n", + " 1.007221\n", " True\n", " \n", " \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector_v2/200ep/tts_pretrained/no_specaug/tts_target_size/ce_ls_0.1/\n", - " 23.40\n", - " 2.879141\n", - " 2.887437\n", - " -0.805783\n", - " 0.625962\n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector/specaug/ce_ls_0.1/\n", + " 26.49\n", + " 2.868420\n", + " 2.858513\n", + " -0.802836\n", + " 0.434087\n", " True\n", " False\n", - " 1.002882\n", + " 0.996546\n", " True\n", " \n", " \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector_v2/200ep/basic_init/no_specaug/tts_target_size/ce_ls_0.1/\n", - " 22.76\n", - " 2.923517\n", - " 2.933859\n", - " -0.799504\n", - " 0.818331\n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector/specaug/ce_ls_1/\n", + " 27.30\n", + " 2.664396\n", + " 
2.957262\n", + " -0.72108\n", + " 0.747725\n", " True\n", " False\n", - " 1.003538\n", + " 1.109919\n", " True\n", " \n", " \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector_v2/200ep/basic_init/no_specaug/tts_target_size/ce_ls_1.0/\n", - " 22.24\n", - " 2.922999\n", - " 2.924061\n", - " -0.801181\n", - " 0.834689\n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector/200ep/basic_init/no_specaug/tts_target_size/ce_ls_0.1/\n", + " 30.09\n", + " 2.782269\n", + " 2.795637\n", + " -0.785492\n", + " 0.890597\n", " True\n", " False\n", - " 1.000363\n", + " 1.004805\n", " True\n", " \n", " \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_simple_encoder_x_vector/200ep/basic_init/no_specaug/tts_target_size/ce_ls_0.1/\n", - " 21.30\n", - " 2.920585\n", - " 2.941674\n", - " -0.797284\n", - " 0.957548\n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector/200ep/tts_pretrained/no_specaug/tts_target_size/ce_ls_0.1/\n", + " 28.56\n", + " 2.813527\n", + " 2.825326\n", + " -0.803716\n", + " 0.439343\n", " True\n", " False\n", - " 1.007221\n", + " 1.004194\n", " True\n", " \n", " \n", @@ -331,43 +341,43 @@ "text/plain": [ " Accuracy [%] \\\n", "Group Experiment \n", - "joint_training/given_alignments /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 22.60 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 23.40 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 22.76 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 22.24 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 
21.30 \n", + "joint_training/given_alignments /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 21.30 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 26.49 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 27.30 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 30.09 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 28.56 \n", "\n", " CE \\\n", "Group Experiment \n", - "joint_training/given_alignments /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 2.922168 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 2.879141 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 2.923517 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 2.922999 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 2.920585 \n", + "joint_training/given_alignments /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 2.920585 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 2.868420 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 2.664396 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 2.782269 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 2.813527 \n", "\n", " dev CE \\\n", "Group Experiment \n", - "joint_training/given_alignments /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 2.922654 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 2.887437 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 2.933859 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 2.924061 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 2.941674 \n", + "joint_training/given_alignments /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 2.941674 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 2.858513 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 2.957262 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 2.795637 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 
2.825326 \n", "\n", " MLE \\\n", "Group Experiment \n", - "joint_training/given_alignments /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... -0.803728 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... -0.805783 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... -0.799504 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... -0.801181 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... -0.797284 \n", + "joint_training/given_alignments /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... -0.797284 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... -0.802836 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... -0.72108 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... -0.785492 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... -0.803716 \n", "\n", " DP \\\n", "Group Experiment \n", - "joint_training/given_alignments /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 0.932394 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 0.625962 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 0.818331 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 0.834689 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 0.957548 \n", + "joint_training/given_alignments /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 0.957548 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 0.434087 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 0.747725 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 0.890597 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 0.439343 \n", "\n", " Joint \\\n", "Group Experiment \n", @@ -387,11 +397,11 @@ "\n", " overfitting \\\n", "Group Experiment \n", - "joint_training/given_alignments /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 1.000166 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 1.002882 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 
1.003538 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 1.000363 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 1.007221 \n", + "joint_training/given_alignments /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 1.007221 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 0.996546 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 1.109919 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 1.004805 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 1.004194 \n", "\n", " Training data available \n", "Group Experiment \n", @@ -526,33 +536,16 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 13, "metadata": {}, - "outputs": [ - { - "ename": "IndexError", - "evalue": "list index out of range", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mIndexError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[8], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m df_indexed[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mCE loss scale\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[43mdf_indexed\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mapply\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43;01mlambda\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mx\u001b[49m\u001b[43m:\u001b[49m\u001b[43m 
\u001b[49m\u001b[38;5;28;43mfloat\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mx\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mExperiment\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msplit\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43m/ce_ls_\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;241;43m1\u001b[39;49m\u001b[43m]\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msplit\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43m/\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;241;43m0\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43maxis\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m1\u001b[39;49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m/work/tools22/users/lukas.rilling/sis_env/lib/python3.10/site-packages/pandas/core/frame.py:10037\u001b[0m, in \u001b[0;36mDataFrame.apply\u001b[0;34m(self, func, axis, raw, result_type, args, by_row, **kwargs)\u001b[0m\n\u001b[1;32m 10025\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mpandas\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mcore\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mapply\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m frame_apply\n\u001b[1;32m 10027\u001b[0m op \u001b[38;5;241m=\u001b[39m frame_apply(\n\u001b[1;32m 10028\u001b[0m \u001b[38;5;28mself\u001b[39m,\n\u001b[1;32m 10029\u001b[0m func\u001b[38;5;241m=\u001b[39mfunc,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 10035\u001b[0m kwargs\u001b[38;5;241m=\u001b[39mkwargs,\n\u001b[1;32m 10036\u001b[0m )\n\u001b[0;32m> 10037\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m 
\u001b[43mop\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mapply\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241m.\u001b[39m__finalize__(\u001b[38;5;28mself\u001b[39m, method\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mapply\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", - "File \u001b[0;32m/work/tools22/users/lukas.rilling/sis_env/lib/python3.10/site-packages/pandas/core/apply.py:831\u001b[0m, in \u001b[0;36mFrameApply.apply\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 828\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mraw:\n\u001b[1;32m 829\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mapply_raw()\n\u001b[0;32m--> 831\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mapply_standard\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m/work/tools22/users/lukas.rilling/sis_env/lib/python3.10/site-packages/pandas/core/apply.py:957\u001b[0m, in \u001b[0;36mFrameApply.apply_standard\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 956\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mapply_standard\u001b[39m(\u001b[38;5;28mself\u001b[39m):\n\u001b[0;32m--> 957\u001b[0m results, res_index \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mapply_series_generator\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 959\u001b[0m \u001b[38;5;66;03m# wrap results\u001b[39;00m\n\u001b[1;32m 960\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mwrap_results(results, res_index)\n", - "File \u001b[0;32m/work/tools22/users/lukas.rilling/sis_env/lib/python3.10/site-packages/pandas/core/apply.py:973\u001b[0m, in \u001b[0;36mFrameApply.apply_series_generator\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 970\u001b[0m 
\u001b[38;5;28;01mwith\u001b[39;00m option_context(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mmode.chained_assignment\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28;01mNone\u001b[39;00m):\n\u001b[1;32m 971\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m i, v \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28menumerate\u001b[39m(series_gen):\n\u001b[1;32m 972\u001b[0m \u001b[38;5;66;03m# ignore SettingWithCopy here in case the user mutates\u001b[39;00m\n\u001b[0;32m--> 973\u001b[0m results[i] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[43mv\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 974\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(results[i], ABCSeries):\n\u001b[1;32m 975\u001b[0m \u001b[38;5;66;03m# If we have a view on v, we need to make a copy because\u001b[39;00m\n\u001b[1;32m 976\u001b[0m \u001b[38;5;66;03m# series_generator will swap out the underlying data\u001b[39;00m\n\u001b[1;32m 977\u001b[0m results[i] \u001b[38;5;241m=\u001b[39m results[i]\u001b[38;5;241m.\u001b[39mcopy(deep\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m)\n", - "Cell \u001b[0;32mIn[8], line 1\u001b[0m, in \u001b[0;36m\u001b[0;34m(x)\u001b[0m\n\u001b[0;32m----> 1\u001b[0m df_indexed[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mCE loss scale\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m df_indexed\u001b[38;5;241m.\u001b[39mapply(\u001b[38;5;28;01mlambda\u001b[39;00m x: 
\u001b[38;5;28mfloat\u001b[39m(\u001b[43mx\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mExperiment\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msplit\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43m/ce_ls_\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;241;43m1\u001b[39;49m\u001b[43m]\u001b[49m\u001b[38;5;241m.\u001b[39msplit(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m/\u001b[39m\u001b[38;5;124m\"\u001b[39m)[\u001b[38;5;241m0\u001b[39m]), axis\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m1\u001b[39m)\n", - "\u001b[0;31mIndexError\u001b[0m: list index out of range" - ] - } - ], + "outputs": [], "source": [ - "df_indexed[\"CE loss scale\"] = df_indexed.apply(lambda x: float(x[\"Experiment\"].split(\"/ce_ls_\")[1].split(\"/\")[0]), axis=1)" + "df_indexed[\"CE loss scale\"] = df_indexed.apply(lambda x: float(x[\"Experiment\"].split(\"/ce_ls_\")[1].split(\"/\")[0] if \"/ce_ls_\" in x[\"Experiment\"] else np.nan), axis=1)" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 15, "metadata": {}, "outputs": [ { @@ -607,7 +600,7 @@ " False\n", " 1.007\n", " True\n", - " 200\n", + " 200.0\n", " [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05]\n", " unknown\n", " 0.1\n", @@ -616,52 +609,52 @@ " 1\n", " joint_training/given_alignments\n", " /u/lukas.rilling/experiments/glow_tts_asr_v2/o...\n", - " 21.61\n", - " 2.939\n", - " 2.947\n", - " -\n", - " -\n", - " False\n", + " 26.49\n", + " 2.868\n", + " 2.859\n", + " -0.803\n", + " 0.434\n", + " True\n", " False\n", - " 1.003\n", + " 0.997\n", " True\n", - " 100\n", - " [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05]\n", + " 250.0\n", + " [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08]\n", " unknown\n", - " 1.0\n", + " 0.1\n", " \n", " \n", " 2\n", " joint_training/given_alignments\n", " 
/u/lukas.rilling/experiments/glow_tts_asr_v2/o...\n", - " 21.33\n", - " 2.925\n", - " 2.946\n", - " -0.792\n", - " 0.247\n", + " 27.30\n", + " 2.664\n", + " 2.957\n", + " -0.721\n", + " 0.748\n", " True\n", " False\n", - " 1.007\n", + " 1.11\n", " True\n", - " 200\n", - " [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05]\n", + " 250.0\n", + " [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08]\n", " unknown\n", - " 0.1\n", + " 1.0\n", " \n", " \n", " 3\n", " joint_training/given_alignments\n", " /u/lukas.rilling/experiments/glow_tts_asr_v2/o...\n", - " 21.39\n", - " 2.921\n", - " 2.947\n", - " -0.779\n", - " 0.256\n", + " 30.09\n", + " 2.782\n", + " 2.796\n", + " -0.785\n", + " 0.891\n", " True\n", " False\n", - " 1.009\n", + " 1.005\n", " True\n", - " 200\n", + " 200.0\n", " [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05]\n", " unknown\n", " 0.1\n", @@ -671,18 +664,18 @@ " joint_training/given_alignments\n", " /u/lukas.rilling/experiments/glow_tts_asr_v2/o...\n", " 28.56\n", - " 0.003\n", - " 0.003\n", - " -0.702\n", - " 0.075\n", + " 2.814\n", + " 2.825\n", + " -0.804\n", + " 0.439\n", " True\n", " False\n", - " 0.948\n", + " 1.004\n", " True\n", - " 100\n", - " [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05]\n", + " 200.0\n", + " [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05]\n", " unknown\n", - " 4.0\n", + " 0.1\n", " \n", " \n", "\n", @@ -698,41 +691,41 @@ "\n", " Experiment Accuracy [%] CE \\\n", "0 /u/lukas.rilling/experiments/glow_tts_asr_v2/o... 21.30 2.921 \n", - "1 /u/lukas.rilling/experiments/glow_tts_asr_v2/o... 21.61 2.939 \n", - "2 /u/lukas.rilling/experiments/glow_tts_asr_v2/o... 21.33 2.925 \n", - "3 /u/lukas.rilling/experiments/glow_tts_asr_v2/o... 21.39 2.921 \n", - "4 /u/lukas.rilling/experiments/glow_tts_asr_v2/o... 28.56 0.003 \n", + "1 /u/lukas.rilling/experiments/glow_tts_asr_v2/o... 26.49 2.868 \n", + "2 /u/lukas.rilling/experiments/glow_tts_asr_v2/o... 27.30 2.664 \n", + "3 /u/lukas.rilling/experiments/glow_tts_asr_v2/o... 
30.09 2.782 \n", + "4 /u/lukas.rilling/experiments/glow_tts_asr_v2/o... 28.56 2.814 \n", "\n", " dev CE MLE DP Joint Still running overfitting \\\n", "0 2.942 -0.797 0.958 True False 1.007 \n", - "1 2.947 - - False False 1.003 \n", - "2 2.946 -0.792 0.247 True False 1.007 \n", - "3 2.947 -0.779 0.256 True False 1.009 \n", - "4 0.003 -0.702 0.075 True False 0.948 \n", + "1 2.859 -0.803 0.434 True False 0.997 \n", + "2 2.957 -0.721 0.748 True False 1.11 \n", + "3 2.796 -0.785 0.891 True False 1.005 \n", + "4 2.825 -0.804 0.439 True False 1.004 \n", "\n", - " Training data available Num Epochs \\\n", - "0 True 200 \n", - "1 True 100 \n", - "2 True 200 \n", - "3 True 200 \n", - "4 True 100 \n", + " Training data available Num Epochs \\\n", + "0 True 200.0 \n", + "1 True 250.0 \n", + "2 True 250.0 \n", + "3 True 200.0 \n", + "4 True 200.0 \n", "\n", - " LR ASR Model Type \\\n", - "0 [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] unknown \n", - "1 [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] unknown \n", - "2 [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] unknown \n", - "3 [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] unknown \n", - "4 [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] unknown \n", + " LR ASR Model Type \\\n", + "0 [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] unknown \n", + "1 [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] unknown \n", + "2 [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] unknown \n", + "3 [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] unknown \n", + "4 [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] unknown \n", "\n", - " CE loss scale \n", - "0 0.1 \n", - "1 1.0 \n", - "2 0.1 \n", - "3 0.1 \n", - "4 4.0 " + " CE loss scale \n", + "0 0.1 \n", + "1 0.1 \n", + "2 1.0 \n", + "3 0.1 \n", + "4 0.1 " ] }, - "execution_count": 49, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } @@ -744,177 +737,82 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 16, "metadata": {}, "outputs": [ { 
"name": "stdout", "output_type": "stream", "text": [ - "| | Group | Experiment | Accuracy [%] | CE | dev CE | MLE | DP | Joint | Still running | overfitting | Training data available | Num Epochs | LR | ASR Model Type | CE loss scale |\n", - "|---:|:--------------------------------|:----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------------:|:------|:---------|:-------|:------|:--------|:----------------|:--------------|:--------------------------|-------------:|:-------------------------------------------------|:-----------------|----------------:|\n", - "| 0 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_simple_encoder_x_vector/200ep/basic_init/no_specaug/tts_target_size/ce_ls_0.1/ | 21.3 | 2.921 | 2.942 | -0.797 | 0.958 | True | False | 1.007 | True | 200 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | unknown | 0.1 |\n", - "| 1 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/second_step/ga_glowTTS_ASR_ffn_x_vector_v2_2ndstep/ce_ls_1.0/ | 21.61 | 2.939 | 2.947 | - | - | False | False | 1.003 | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | unknown | 1 |\n", - "| 2 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_ffn_x_vector/200ep/tts_pretrained/ce_ls_0.1/ | 21.33 | 2.925 | 2.946 | -0.792 | 0.247 | True | False | 1.007 | True | 200 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | unknown | 0.1 |\n", - "| 3 | 
joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_ffn_x_vector/200ep/basic_init/ce_ls_0.1/ | 21.39 | 2.921 | 2.947 | -0.779 | 0.256 | True | False | 1.009 | True | 200 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | unknown | 0.1 |\n", - "| 4 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/first_step/ga_glowTTS_ASR_ffn_x_vector_v2/ce_ls_4.0/ | 28.56 | 0.003 | 0.003 | -0.702 | 0.075 | True | False | 0.948 | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | unknown | 4 |\n", - "| 5 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/first_step/ga_glowTTS_ASR_ffn_x_vector_v2/ce_ls_1.0/ | 20.45 | 0.012 | 0.012 | -0.716 | 0.075 | True | False | 0.98 | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | unknown | 1 |\n", - "| 6 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/first_step/ga_glowTTS_ASR_ffn_x_vector_v2/ce_ls_2.0/ | 20.38 | 0.006 | 0.006 | -0.708 | 0.075 | True | False | 1.018 | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | unknown | 2 |\n", - "| 7 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/first_step/ga_glowTTS_ASR_ffn_x_vector_v2/ce_ls_3.0/ | 20.47 | 0.004 | 0.004 | -0.703 | 0.075 | True | False | 0.954 | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | unknown | 3 |\n", - "| 8 | 
joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_cnn_x_vector/basic_init/ce_ls_0.1/ | 25.67 | 2.583 | 3.019 | -0.767 | 0.265 | True | False | 1.169 | True | 200 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | unknown | 0.1 |\n", - "| 9 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_cnn_x_vector/tts_pretrained/ce_ls_0.1/ | 27.63 | 2.617 | 2.915 | -0.775 | 0.263 | True | False | 1.114 | True | 200 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | unknown | 0.1 |\n", - "| 10 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_ffn_x_vector_v2/200ep/basic_init/ce_ls_0.1/ | 20.71 | 0.124 | 0.122 | -0.757 | 0.071 | True | False | 0.983 | True | 200 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | unknown | 0.1 |\n", - "| 11 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_ffn_x_vector_v2/200ep/basic_init/ce_ls_1.0/ | 20.48 | 0.012 | 0.011 | -0.73 | 0.07 | True | False | 0.965 | True | 200 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | unknown | 1 |\n", - "| 12 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_ffn_x_vector_v2/200ep/tts_pretrained/ce_ls_1.0/ | 20.42 | 0.014 | 0.014 | -0.729 | 0.076 | True | False | 1.049 | True | 200 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | unknown | 1 |\n", - "| 
13 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_ffn_x_vector_v2/200ep/tts_pretrained/ce_ls_0.1/ | 20.22 | 0.14 | 0.14 | -0.76 | 0.075 | True | False | 0.995 | True | 200 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | unknown | 0.1 |\n", - "| 14 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/second_step_tts/ga_glowTTS_ASR_ffn_x_vector_v2_2ndstep_tts/ce_ls_1.0/ | 20.18 | - | - | - | - | True | True | - | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | unknown | 1 |\n", - "| 15 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_cnn_x_vector_v2/tts_pretrained/ce_ls_0.01/ | 19.98 | 0.372 | 0.338 | -0.797 | 0.063 | True | False | 0.91 | True | 200 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | unknown | 0.01 |\n", - "| 16 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_cnn_x_vector_v2/tts_pretrained/ce_ls_0.1/ | 20.29 | 0.052 | 0.047 | -0.791 | 0.052 | True | False | 0.9 | True | 200 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | unknown | 0.1 |\n", - "| 17 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_cnn_x_vector_v2/basic_init/ce_ls_0.01/ | 19.97 | 0.341 | 0.308 | -0.787 | 0.064 | True | False | 0.903 | True | 200 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | unknown | 0.01 |\n", - "| 
18 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_cnn_x_vector_v2/basic_init/ce_ls_0.1/ | 20.32 | 0.052 | 0.048 | -0.783 | 0.052 | True | False | 0.916 | True | 200 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | unknown | 0.1 |\n", - "| 19 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_cnn_x_vector/basic_init/no_specaug/tts_target_size/ce_ls_0.1/ | 27.3 | 2.573 | 2.942 | -0.776 | 0.432 | True | False | 1.143 | True | 200 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | unknown | 0.1 |\n", - "| 20 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_cnn_x_vector/tts_pretrained/no_specaug/tts_target_size/ce_ls_0.1/ | 28.92 | 2.564 | 2.893 | -0.793 | 0.396 | True | False | 1.128 | True | 200 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | unknown | 0.1 |\n", - "| 21 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector/specaug/ce_ls_0.1/ | 26.49 | 2.868 | 2.859 | -0.803 | 0.434 | True | False | 0.997 | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | unknown | 0.1 |\n", - "| 22 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector/specaug/ce_ls_1/ | 27.3 | 2.664 | 2.957 | -0.721 | 0.748 | True | False | 1.11 | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | unknown | 1 |\n", - "| 23 | joint_training/given_alignments | 
/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector/200ep/basic_init/no_specaug/tts_target_size/ce_ls_0.1/ | 30.09 | 2.782 | 2.796 | -0.785 | 0.891 | True | False | 1.005 | True | 200 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | unknown | 0.1 |\n", - "| 24 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector/200ep/tts_pretrained/no_specaug/tts_target_size/ce_ls_0.1/ | 28.56 | 2.814 | 2.825 | -0.804 | 0.439 | True | False | 1.004 | True | 200 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | unknown | 0.1 |\n", - "| 25 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector/no_specaug/ce_ls_0.1/ | 30.44 | 2.793 | 2.791 | -0.795 | 0.506 | True | False | 1.0 | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | unknown | 0.1 |\n", - "| 26 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector/no_specaug/ce_ls_1/ | 27.72 | 2.532 | 2.992 | -0.691 | 0.76 | True | False | 1.182 | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | unknown | 1 |\n", - "| 27 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector_v2/200ep/basic_init/no_specaug/tts_target_size/ce_ls_1.0/ | 22.24 | 2.923 | 2.924 | -0.801 | 0.835 | True | False | 1.0 | True | 200 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | unknown | 1 |\n", - "| 28 | joint_training/given_alignments | 
/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector_v2/200ep/basic_init/no_specaug/tts_target_size/ce_ls_0.1/ | 22.76 | 2.924 | 2.934 | -0.8 | 0.818 | True | False | 1.004 | True | 200 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | unknown | 0.1 |\n", - "| 29 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector_v2/200ep/tts_pretrained/no_specaug/tts_target_size/ce_ls_0.1/ | 23.4 | 2.879 | 2.887 | -0.806 | 0.626 | True | False | 1.003 | True | 200 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | unknown | 0.1 |\n", - "| 30 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector_v2/mean_only/200ep/basic_init/no_specaug/tts_target_size/ce_ls_0.1/ | 22.6 | 2.922 | 2.923 | -0.804 | 0.932 | True | False | 1.0 | True | 200 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | unknown | 0.1 |\n", - "| 31 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_blstm_x_vector/specaug/ce_ls_1/ | 35.97 | 1.395 | 2.544 | -0.66 | 0.303 | True | False | 1.824 | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | blstm | 1 |\n", - "| 32 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_blstm_x_vector/specaug/ce_ls_0.1/ | 36.7 | 1.672 | 2.312 | -0.774 | 1.02 | True | False | 1.383 | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | blstm | 0.1 |\n", - "| 33 | joint_training/given_alignments | 
/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_blstm_x_vector/no_specaug/ce_ls_1/ | 34.25 | 1.042 | 3.15 | -0.69 | 0.352 | True | False | 3.023 | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | blstm | 1 |\n", - "| 34 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_blstm_x_vector/no_specaug/ce_ls_0.1/ | 33.05 | 1.031 | 3.339 | -0.786 | 0.368 | True | False | 3.239 | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | blstm | 0.1 |\n" + "| | Group | Experiment | Accuracy [%] | CE | dev CE | MLE | DP | Joint | Still running | overfitting | Training data available | Num Epochs | LR | ASR Model Type | CE loss scale |\n", + "|---:|:--------------------------------|:-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------------:|:------|:---------|:-------|:------|:--------|:----------------|:--------------|:--------------------------|:-------------|:-------------------------------------------------|:-----------------|:----------------|\n", + "| 0 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_simple_encoder_x_vector/200ep/basic_init/no_specaug/tts_target_size/ce_ls_0.1/ | 21.3 | 2.921 | 2.942 | -0.797 | 0.958 | True | False | 1.007 | True | 200.0 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | unknown | 0.1 |\n", + "| 1 | joint_training/given_alignments | 
/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector/specaug/ce_ls_0.1/ | 26.49 | 2.868 | 2.859 | -0.803 | 0.434 | True | False | 0.997 | True | 250.0 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | unknown | 0.1 |\n", + "| 2 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector/specaug/ce_ls_1/ | 27.3 | 2.664 | 2.957 | -0.721 | 0.748 | True | False | 1.11 | True | 250.0 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | unknown | 1.0 |\n", + "| 3 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector/200ep/basic_init/no_specaug/tts_target_size/ce_ls_0.1/ | 30.09 | 2.782 | 2.796 | -0.785 | 0.891 | True | False | 1.005 | True | 200.0 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | unknown | 0.1 |\n", + "| 4 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector/200ep/tts_pretrained/no_specaug/tts_target_size/ce_ls_0.1/ | 28.56 | 2.814 | 2.825 | -0.804 | 0.439 | True | False | 1.004 | True | 200.0 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | unknown | 0.1 |\n", + "| 5 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector/no_specaug/ce_ls_1/ | 27.72 | 2.532 | 2.992 | -0.691 | 0.76 | True | False | 1.182 | True | 250.0 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | unknown | 1.0 |\n", + "| 6 | joint_training/given_alignments | 
/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector/no_specaug/ce_ls_0.1/ | 30.44 | 2.793 | 2.791 | -0.795 | 0.506 | True | False | 1.0 | True | 250.0 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | unknown | 0.1 |\n", + "| 7 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_cnn_x_vector/tts_pretrained/no_specaug/tts_target_size/ce_ls_0.1/ | 28.92 | 2.564 | 2.893 | -0.793 | 0.396 | True | False | 1.128 | True | 200.0 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | unknown | 0.1 |\n", + "| 8 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_cnn_x_vector/basic_init/no_specaug/tts_target_size/ce_ls_0.1/ | 27.3 | 2.573 | 2.942 | -0.776 | 0.432 | True | False | 1.143 | True | 200.0 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | unknown | 0.1 |\n", + "| 9 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector_v2/mean_only/200ep/basic_init/no_specaug/tts_target_size/ce_ls_0.1/ | 22.6 | 2.922 | 2.923 | -0.804 | 0.932 | True | False | 1.0 | True | 200.0 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | unknown | 0.1 |\n", + "| 10 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector_v2/200ep/tts_pretrained/no_specaug/tts_target_size/ce_ls_0.1/ | 23.4 | 2.879 | 2.887 | -0.806 | 0.626 | True | False | 1.003 | True | 200.0 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | unknown | 0.1 |\n", + "| 11 | joint_training/given_alignments | 
/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector_v2/200ep/basic_init/no_specaug/tts_target_size/ce_ls_1.0/ | 22.24 | 2.923 | 2.924 | -0.801 | 0.835 | True | False | 1.0 | True | 200.0 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | unknown | 1.0 |\n", + "| 12 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector_v2/200ep/basic_init/no_specaug/tts_target_size/ce_ls_0.1/ | 22.76 | 2.924 | 2.934 | -0.8 | 0.818 | True | False | 1.004 | True | 200.0 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | unknown | 0.1 |\n", + "| 13 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_blstm_x_vector/specaug/ce_ls_1/ | 35.97 | 1.395 | 2.544 | -0.66 | 0.303 | True | False | 1.824 | True | 250.0 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | blstm | 1.0 |\n", + "| 14 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_blstm_x_vector/specaug/ce_ls_0.1/ | 36.7 | 1.672 | 2.312 | -0.774 | 1.02 | True | False | 1.383 | True | 250.0 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | blstm | 0.1 |\n", + "| 15 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_blstm_x_vector/no_specaug/ce_ls_1/ | 34.25 | 1.042 | 3.15 | -0.69 | 0.352 | True | False | 3.023 | True | 250.0 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | blstm | 1.0 |\n", + "| 16 | joint_training/given_alignments | 
/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_blstm_x_vector/no_specaug/ce_ls_0.1/ | 33.05 | 1.031 | 3.339 | -0.786 | 0.368 | True | False | 3.239 | True | 250.0 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | blstm | 0.1 |\n", + "| 17 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/first_step/ga_glowTTS_ASR_ffn_x_vector_v2/ce_ls_4.0/ | 28.56 | 0.003 | 0.003 | -0.702 | 0.075 | True | False | 0.948 | True | 100.0 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | unknown | 4.0 |\n", + "| 18 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/first_step/ga_glowTTS_ASR_ffn_x_vector_v2/ce_ls_3.0/ | 20.47 | 0.004 | 0.004 | -0.703 | 0.075 | True | False | 0.954 | True | 100.0 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | unknown | 3.0 |\n", + "| 19 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/first_step/ga_glowTTS_ASR_ffn_x_vector_v2/ce_ls_2.0/ | 20.38 | 0.006 | 0.006 | -0.708 | 0.075 | True | False | 1.018 | True | 100.0 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | unknown | 2.0 |\n", + "| 20 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/first_step/ga_glowTTS_ASR_ffn_x_vector_v2/ce_ls_1.0/ | 20.45 | 0.012 | 0.012 | -0.716 | 0.075 | True | False | 0.98 | True | 100.0 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | unknown | 1.0 |\n", + "| 21 | joint_training/given_alignments | 
/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_ffn_mas/100ep/encoder/decoder_eval/ | 37.48 | - | - | - | - | True | False | - | True | - | - | unknown | - |\n", + "| 22 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_ffn_mas/100ep/encoder/encoder_eval/ | 37.57 | - | - | - | - | True | False | - | True | - | - | unknown | - |\n", + "| 23 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_ffn_x_vector/100ep/encoder/decoder_eval/ | 12.68 | - | - | - | - | True | False | - | True | - | - | unknown | - |\n", + "| 24 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_ffn_x_vector/100ep/encoder/encoder_eval/ | 42.05 | - | - | - | - | True | False | - | True | - | - | unknown | - |\n", + "| 25 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_ffn_x_vector/100ep/decoder/encoder_eval/ | 19.16 | - | - | - | - | True | False | - | True | - | - | unknown | - |\n", + "| 26 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_ffn_x_vector/100ep/decoder/decoder_eval/ | 19.96 | - | - | - | - | True | False | - | True | - | - | 
unknown | - |\n", + "| 27 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_ffn/100ep/encoder/encoder_eval/ | 44.63 | - | - | - | - | True | False | - | True | - | - | unknown | - |\n", + "| 28 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_ffn/100ep/encoder/decoder_eval/ | 12.24 | - | - | - | - | True | False | - | True | - | - | unknown | - |\n", + "| 29 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_ffn/100ep/decoder/decoder_eval/ | 19.94 | - | - | - | - | True | False | - | True | - | - | unknown | - |\n", + "| 30 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_ffn/100ep/decoder/encoder_eval/ | 18.49 | - | - | - | - | True | False | - | True | - | - | unknown | - |\n", + "| 31 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_cnn_x_vector/100ep/encoder/encoder_eval/ | 70.57 | - | - | - | - | True | False | - | True | - | - | unknown | - |\n", + "| 32 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_cnn_x_vector/100ep/encoder/decoder_eval/ | 11.4 | - | - | - | - | 
True | False | - | True | - | - | unknown | - |\n", + "| 33 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_cnn_x_vector/100ep/decoder/encoder_eval/ | 22.76 | - | - | - | - | True | False | - | True | - | - | unknown | - |\n", + "| 34 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_cnn_x_vector/100ep/decoder/decoder_eval/ | 21.33 | - | - | - | - | True | False | - | True | - | - | unknown | - |\n", + "| 35 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_ffn_x_vector_mas/100ep/encoder/decoder_eval/ | 12.72 | - | - | - | - | True | False | - | True | - | - | unknown | - |\n", + "| 36 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_ffn_x_vector_mas/100ep/encoder/encoder_eval/ | 12.68 | - | - | - | - | True | False | - | True | - | - | unknown | - |\n", + "| 37 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_ffn_x_vector_mas/100ep/decoder/decoder_eval/ | 12.73 | - | - | - | - | True | False | - | True | - | - | unknown | - |\n", + "| 38 | joint_training/given_alignments | 
/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_ffn_x_vector_mas/100ep/decoder/encoder_eval/ | 12.71 | - | - | - | - | True | False | - | True | - | - | unknown | - |\n", + "| 39 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_cnn/100ep/decoder/decoder_eval/ | 21.3 | - | - | - | - | True | False | - | True | - | - | unknown | - |\n", + "| 40 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_cnn/100ep/decoder/encoder_eval/ | 22.09 | - | - | - | - | True | False | - | True | - | - | unknown | - |\n", + "| 41 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_cnn/100ep/encoder/decoder_eval/ | 11.33 | - | - | - | - | True | False | - | True | - | - | unknown | - |\n", + "| 42 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_cnn/100ep/encoder/encoder_eval/ | 72.13 | - | - | - | - | True | False | - | True | - | - | unknown | - |\n", + "| 43 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_ffn_mas_no_eval/100ep/decoder/decoder_eval/ | 37.64 | - | - | - | - | True | False | - | True | - | - | unknown | - |\n", + 
"| 44 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_ffn_mas_no_eval/100ep/decoder/encoder_eval/ | 37.46 | - | - | - | - | True | False | - | True | - | - | unknown | - |\n", + "| 45 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_ffn_mas_no_eval/100ep/encoder/decoder_eval/ | 37.51 | - | - | - | - | True | False | - | True | - | - | unknown | - |\n", + "| 46 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_ffn_mas_no_eval/100ep/encoder/encoder_eval/ | 37.56 | - | - | - | - | True | False | - | True | - | - | unknown | - |\n", + "| 47 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_cnn_x_vector_v2/tts_pretrained/ce_ls_0.1/ | 20.29 | 0.052 | 0.047 | -0.791 | 0.052 | True | False | 0.9 | True | 200.0 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | unknown | 0.1 |\n", + "| 48 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_cnn_x_vector_v2/tts_pretrained/ce_ls_0.01/ | 19.98 | 0.372 | 0.338 | -0.797 | 0.063 | True | False | 0.91 | True | 200.0 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | unknown | 0.01 |\n", + "| 49 | joint_training/given_alignments | 
/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_cnn_x_vector_v2/basic_init/ce_ls_0.1/ | 20.32 | 0.052 | 0.048 | -0.783 | 0.052 | True | False | 0.916 | True | 200.0 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | unknown | 0.1 |\n", + "| 50 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_cnn_x_vector_v2/basic_init/ce_ls_0.01/ | 19.97 | 0.341 | 0.308 | -0.787 | 0.064 | True | False | 0.903 | True | 200.0 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | unknown | 0.01 |\n", + "| 51 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/second_step_tts/ga_glowTTS_ASR_ffn_x_vector_v2_2ndstep_tts/ce_ls_1.0/ | 20.18 | - | - | - | - | True | True | - | True | 100.0 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | unknown | 1.0 |\n", + "| 52 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_ffn_x_vector_v2/200ep/tts_pretrained/ce_ls_1.0/ | 20.42 | 0.014 | 0.014 | -0.729 | 0.076 | True | False | 1.049 | True | 200.0 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | unknown | 1.0 |\n", + "| 53 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_ffn_x_vector_v2/200ep/tts_pretrained/ce_ls_0.1/ | 20.22 | 0.14 | 0.14 | -0.76 | 0.075 | True | False | 0.995 | True | 200.0 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | unknown | 0.1 |\n", + "| 54 | 
joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_ffn_x_vector_v2/200ep/basic_init/ce_ls_1.0/ | 20.48 | 0.012 | 0.011 | -0.73 | 0.07 | True | False | 0.965 | True | 200.0 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | unknown | 1.0 |\n", + "| 55 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_ffn_x_vector_v2/200ep/basic_init/ce_ls_0.1/ | 20.71 | 0.124 | 0.122 | -0.757 | 0.071 | True | False | 0.983 | True | 200.0 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | unknown | 0.1 |\n", + "| 56 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_ffn_x_vector/200ep/basic_init/ce_ls_0.1/ | 21.39 | 2.921 | 2.947 | -0.779 | 0.256 | True | False | 1.009 | True | 200.0 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | unknown | 0.1 |\n", + "| 57 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_ffn_x_vector/200ep/tts_pretrained/ce_ls_0.1/ | 21.33 | 2.925 | 2.946 | -0.792 | 0.247 | True | False | 1.007 | True | 200.0 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | unknown | 0.1 |\n", + "| 58 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_cnn_x_vector/tts_pretrained/ce_ls_0.1/ | 27.63 | 2.617 | 2.915 | -0.775 | 0.263 | True | False | 1.114 | True | 200.0 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | unknown 
| 0.1 |\n", + "| 59 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_cnn_x_vector/basic_init/ce_ls_0.1/ | 25.67 | 2.583 | 3.019 | -0.767 | 0.265 | True | False | 1.169 | True | 200.0 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | unknown | 0.1 |\n", + "| 60 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/second_step/ga_glowTTS_ASR_ffn_x_vector_v2_2ndstep/ce_ls_1.0/ | 21.61 | 2.939 | 2.947 | - | - | False | False | 1.003 | True | 100.0 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | unknown | 1.0 |\n" ] } ], "source": [ "print(df_rounded.to_markdown())" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "ename": "KeyError", - "evalue": "False", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", - "File \u001b[0;32m/work/tools22/users/lukas.rilling/sis_env/lib/python3.10/site-packages/pandas/core/indexes/base.py:3790\u001b[0m, in \u001b[0;36mIndex.get_loc\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 3789\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m-> 3790\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_engine\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_loc\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcasted_key\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 3791\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m err:\n", - "File \u001b[0;32mindex.pyx:152\u001b[0m, in 
\u001b[0;36mpandas._libs.index.IndexEngine.get_loc\u001b[0;34m()\u001b[0m\n", - "File \u001b[0;32mindex.pyx:181\u001b[0m, in \u001b[0;36mpandas._libs.index.IndexEngine.get_loc\u001b[0;34m()\u001b[0m\n", - "File \u001b[0;32mpandas/_libs/hashtable_class_helper.pxi:7080\u001b[0m, in \u001b[0;36mpandas._libs.hashtable.PyObjectHashTable.get_item\u001b[0;34m()\u001b[0m\n", - "File \u001b[0;32mpandas/_libs/hashtable_class_helper.pxi:7088\u001b[0m, in \u001b[0;36mpandas._libs.hashtable.PyObjectHashTable.get_item\u001b[0;34m()\u001b[0m\n", - "\u001b[0;31mKeyError\u001b[0m: False", - "\nThe above exception was the direct cause of the following exception:\n", - "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[28], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[43mdf_tuned\u001b[49m\u001b[43m[\u001b[49m\u001b[43mdf_tuned\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mJoint\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01mis\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m]\u001b[49m\u001b[38;5;241m.\u001b[39mto_markdown())\n", - "File \u001b[0;32m/work/tools22/users/lukas.rilling/sis_env/lib/python3.10/site-packages/pandas/core/frame.py:3896\u001b[0m, in \u001b[0;36mDataFrame.__getitem__\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 3894\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcolumns\u001b[38;5;241m.\u001b[39mnlevels \u001b[38;5;241m>\u001b[39m \u001b[38;5;241m1\u001b[39m:\n\u001b[1;32m 3895\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_getitem_multilevel(key)\n\u001b[0;32m-> 3896\u001b[0m indexer \u001b[38;5;241m=\u001b[39m 
\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcolumns\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_loc\u001b[49m\u001b[43m(\u001b[49m\u001b[43mkey\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 3897\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m is_integer(indexer):\n\u001b[1;32m 3898\u001b[0m indexer \u001b[38;5;241m=\u001b[39m [indexer]\n", - "File \u001b[0;32m/work/tools22/users/lukas.rilling/sis_env/lib/python3.10/site-packages/pandas/core/indexes/base.py:3797\u001b[0m, in \u001b[0;36mIndex.get_loc\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 3792\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(casted_key, \u001b[38;5;28mslice\u001b[39m) \u001b[38;5;129;01mor\u001b[39;00m (\n\u001b[1;32m 3793\u001b[0m \u001b[38;5;28misinstance\u001b[39m(casted_key, abc\u001b[38;5;241m.\u001b[39mIterable)\n\u001b[1;32m 3794\u001b[0m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28many\u001b[39m(\u001b[38;5;28misinstance\u001b[39m(x, \u001b[38;5;28mslice\u001b[39m) \u001b[38;5;28;01mfor\u001b[39;00m x \u001b[38;5;129;01min\u001b[39;00m casted_key)\n\u001b[1;32m 3795\u001b[0m ):\n\u001b[1;32m 3796\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m InvalidIndexError(key)\n\u001b[0;32m-> 3797\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m(key) \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01merr\u001b[39;00m\n\u001b[1;32m 3798\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mTypeError\u001b[39;00m:\n\u001b[1;32m 3799\u001b[0m \u001b[38;5;66;03m# If we have a listlike key, _check_indexing_error will raise\u001b[39;00m\n\u001b[1;32m 3800\u001b[0m \u001b[38;5;66;03m# InvalidIndexError. 
Otherwise we fall through and re-raise\u001b[39;00m\n\u001b[1;32m 3801\u001b[0m \u001b[38;5;66;03m# the TypeError.\u001b[39;00m\n\u001b[1;32m 3802\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_check_indexing_error(key)\n", - "\u001b[0;31mKeyError\u001b[0m: False" - ] - } - ], - "source": [ - "print(df_tuned[df_tuned[\"Joint\"]].to_markdown())" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 29, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "image/png": "", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "df_tuned.plot(kind=\"scatter\", x=\"WER (dev-other)\", y=\"CTC\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 30, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "image/png": "", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "df_tuned.plot(kind=\"scatter\", x=\"WER (dev-other)\", y=\"dev CTC\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 31, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "image/png": "iVBORw0KGgoAAAANSUhEUgAAAjIAAAGwCAYAAACzXI8XAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjcuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/bCgiHAAAACXBIWXMAAA9hAAAPYQGoP6dpAAA5qklEQVR4nO3de3hU1b3/8c+QG7mQgVxIiCQkkHDxViMIxlgQTYvWWhG0mkMPeEql9Ydc1Va0YL1C8VpQBJED0iOgWFHxHLQUIREMlwCpWhQCBEKBhARIhiSQxGT//rBMHUjCZDLJnh3er+fZz0PW3rPnO9nAfJ611l7bZhiGIQAAAAvqYHYBAAAAniLIAAAAyyLIAAAAyyLIAAAAyyLIAAAAyyLIAAAAyyLIAAAAy/I3u4DWVl9fryNHjqhTp06y2WxmlwMAANxgGIZOnTqluLg4dejQeL9Luw8yR44cUXx8vNllAAAADxw6dEjdu3dvdH+7DzKdOnWS9N0vIjw83ORqAACAOxwOh+Lj453f441p90Hm7HBSeHg4QQYAAIu50LQQJvsCAADLIsgAAADLIsgAAADLMj3IHD58WL/4xS8UGRmp4OBgXXHFFcrNzXXuNwxDM2bMULdu3RQcHKyMjAzl5+ebWDEAAPAVpgaZkydPKj09XQEBAVqzZo127dqlF154QV26dHEeM3v2bM2ZM0fz58/Xli1bFBoaqmHDhunMmTMmVg4AAHyBzTAMw6w3f+SRR7Rp0yZ99tlnDe43DENxcXF68MEH9dBDD0mSysvLFRMToyVLluiee+654Hs4HA7Z7XaVl5dz1xIAABbh7ve3qT0yH374oQYMGKC77rpLXbt2VWpqqhYuXOjcX1BQoKKiImVkZDjb7Ha7Bg0apJycnAbPWV1dLYfD4bIBAID2ydQgs3//fr322mtKSUnRJ598ovvvv18TJ07Um2++KUkqKiqSJMXExLi8LiYmxrnvXDNnzpTdbndurOoLAED7ZWqQqa+v19VXX61nn31WqampGjdunO677z7Nnz/f43NOmzZN5eXlzu3QoUNerBgAAPgSU4NMt27ddOmll7q09evXT4WFhZKk2NhYSVJxcbHLMcXFxc595woKCnKu4stqvgAAtG+mBpn09HTt3r3bpW3Pnj3q0aOHJCkpKUmxsbFat26dc7/D4dCWLVuUlpbWprUC7cH+kgqt331MBaWVZpcCAF5h6rOWpkyZouuuu07PPvusfv7zn2vr1q16/fXX9frrr0v67vkKkydP1tNPP62UlBQlJSVp+vTpiouL0/Dhw80sHbCUsqoaTVyep+z8Emfb4JRozc1MlT0kwMTKAKBlTL39WpI++ugjTZs2Tfn5+UpKStLUqVN13333OfcbhqHHH39cr7/+usrKynT99ddr3rx56t27t1vn5/ZrQBq9aKs27S1V3ff+ufvZbEpPjtLSsQNNrAwAGubu97fpQaa1EWRwsdtfUqEbX8hqdP/6h25QUlRoG1YEABdmiXVkALS+gyeqmtx/4DjzZQBYF0EGaOd6RIQ0uT8xkt4YANZFkAHauZ7RYRqcEi0/m82l3c9m0+CU
aIaVAFgaQQa4CMzNTFV6cpRLW3pylOZmpppUEQB4h6m3XwNoG/aQAC0dO1AFpZU6cLxSiZGh9MQAaBcIMsBFJCmKAAOgfWFoCQAAWBZBBgAAWBZBBgAAWBZBBgAAWBZBBgAAWBZBBgAAWBZBBgAAWBZBBgAAWBZBBgAAWBZBBgAAWBZBBgAAWBZBBgAAWBZBBgAAWBZBBgAAWBZBBgAAWBZBBgAAWBZBBgAAWBZBBgAAWBZBBgAAWBZBBgAAWBZBBgAAWBZBBgAAWBZBBgAAWBZBBgAAWBZBBgAAWBZBBgAAWBZBBgAAWBZBBgAAWBZBBgAAWBZBBgAAWBZBBgAAWBZBBgAAWBZBBgAAWBZBBgAAWBZBBgAAWBZBBgAAWBZBBgAAWBZBBgAAWBZBBgAAWBZBBgAAWJapQeYPf/iDbDaby9a3b1/n/jNnzmj8+PGKjIxUWFiYRo4cqeLiYhMrBgAAvsT0HpnLLrtMR48edW4bN2507psyZYpWr16tlStXKisrS0eOHNGIESNMrBYAAPgSf9ML8PdXbGzsee3l5eVatGiRli1bphtvvFGStHjxYvXr10+bN2/Wtdde29alAgAAH2N6j0x+fr7i4uLUs2dPjRo1SoWFhZKk7du3q7a2VhkZGc5j+/btq4SEBOXk5DR6vurqajkcDpcNAAC0T6YGmUGDBmnJkiX6+OOP9dprr6mgoEA//OEPderUKRUVFSkwMFCdO3d2eU1MTIyKiooaPefMmTNlt9udW3x8fCt/CgAAYBZTh5ZuueUW55+vvPJKDRo0SD169NA777yj4OBgj845bdo0TZ061fmzw+EgzAAA0E6ZPrT0fZ07d1bv3r21d+9excbGqqamRmVlZS7HFBcXNzin5qygoCCFh4e7bAAAoH3yqSBTUVGhffv2qVu3burfv78CAgK0bt065/7du3ersLBQaWlpJlYJAAB8halDSw899JBuu+029ejRQ0eOHNHjjz8uPz8/ZWZmym63a+zYsZo6daoiIiIUHh6uCRMmKC0tjTuWAACAJJODzD//+U9lZmbq+PHjio6O1vXXX6/NmzcrOjpakvTSSy+pQ4cOGjlypKqrqzVs2DDNmzfPzJIBAIAPsRmGYZhdRGtyOByy2+0qLy9nvgwAABbh7ve3T82RAQAAaA6CDAAAsCyCDAAAsCyCDAAAsCyCDAAAsCyCDAAAsCyCDAAAsCyCDAAAsCyCDAAAsCyCDAAAsCyCDAAAsCyCDAAAsCyCDAAAsCyCDAAAsCyCDAAAsCyCDAAAsCyCDAAAsCyCDAAAsCyCDAAAsCyCDAAAsCyCDAAAsCyCDAAAsCyCDAAAsCyCDAAAsCyCDAAAsCyCDAAAsCyCDAAAsCyCDAAAsCyCDAAAsCyCDAAAsCyCDAAAsCyCDAAAsCyCDAAAsCyCDAAAsCyCDAAAsCyCDAAAsCyCDAAAsCyCDAAAsCyCDAAAsCyCDAAAsCyCDAAAsCyCDAAAsCyCDAAAsCyCDAAAsCyCDAAAsCyCDAAAsCyCDAAAsCyfCTKzZs2SzWbT5MmTnW1nzpzR+PHjFRkZqbCwMI0cOVLFxcXmFQkAAHyKTwSZbdu2acGCBbryyitd2qdMmaLVq1dr5cqVysrK0pEjRzRixAiTqgQAAL7G9CBTUVGhUaNGaeHCherSpYuzvby8XIsWLdKLL76oG2+8Uf3799fixYv1+eefa/PmzY2er7q6Wg6Hw2UDAADtk+lBZvz48br11luVkZHh0r59+3bV1ta6tPft21cJCQnKyclp9HwzZ86U3W53bvHx8a1WOwAAMJepQWbFihXasWOHZs6ced6+oqIiBQYGqnPnzi7tMTExKioqavSc06ZNU3l5uXM7dOiQt8uGB/aXVGj97mMqKK00uxQAQDvib9YbHzp0SJMmTdLatWvVsWNHr503KChIQUFBXjsfWqasqkYTl+cpO7/E2TY4JVpzM1NlDwkwsTIAQHtgWo/M9u3bdezYMV19
9dXy9/eXv7+/srKyNGfOHPn7+ysmJkY1NTUqKytzeV1xcbFiY2PNKRrNNnF5njbtLXVp27S3VBOW7zSpIgBAe2JakLnpppv05ZdfKi8vz7kNGDBAo0aNcv45ICBA69atc75m9+7dKiwsVFpamllloxn2l1QoO79EdYbh0l5nGMrOL2GYCQDQYqYNLXXq1EmXX365S1toaKgiIyOd7WPHjtXUqVMVERGh8PBwTZgwQWlpabr22mvNKBnNdPBEVZP7DxyvVFJUaBtVAwBoj0wLMu546aWX1KFDB40cOVLV1dUaNmyY5s2bZ3ZZcFOPiJAm9ydGEmIAAC1jM4xz+v3bGYfDIbvdrvLycoWHh5tdzkVn9KKt2rS31GV4yc9mU3pylJaOHWhiZQAAX+bu97fp68igfZubmar05CiXtvTkKM3NTDWpIgBAe+LTQ0uwPntIgJaOHaiC0kodOF6pxMhQ5sUAALyGIIM2kRRFgAEAeB9DSwAAwLLokUGb219SoYMnqhhmAgC0GEEGbYbHFQAAvI2hJbQZHlcAAPA2ggzaBI8rAAC0BoIM2oQ7jysAAKC5CDJoEzyuAADQGggyaBMRoYHq0sCEXj/bdxN+uXsJAOAJggzaxMTleSqvqj2vPTw4gMcVAAA8RpBBqzs70be+gX0nq2p1oqqmzWsCALQPBBm0Oib6AgBaC0EGrY6JvgCA1kKQQavrGR2mwSnR8rPZXNr9bDYm+gIAWoQggzYxNzNV6clRLm3pyVFM9AUAtAjPWkKbsIcEaOnYgSoordSB45U8MBIA4BUEGbSppCgCDADAexhaAgAAluVRj4zD4Wiw3WazKSgoSIGBgS0qCgAAwB0eBZnOnTvLds4dKN/XvXt33XvvvXr88cfVoQOdPgAAoHV4FGSWLFmixx57TPfee68GDhwoSdq6davefPNN/f73v1dJSYmef/55BQUF6dFHH/VqwQAAAGd5FGTefPNNvfDCC/r5z3/ubLvtttt0xRVXaMGCBVq3bp0SEhL0zDPPEGQAAECr8Wjc5/PPP1dq6vnrf6SmpionJ0eSdP3116uwsLBl1QEAADTBoyATHx+vRYsWnde+aNEixcfHS5KOHz+uLl26tKw6AACAJng0tPT888/rrrvu0po1a3TNNddIknJzc/XNN9/o3XfflSRt27ZNd999t/cqBQAAOIfNMAzDkxcWFBRowYIF2rNnjySpT58++vWvf63ExERv1tdiDodDdrtd5eXlCg8PN7scAADgBne/vz0OMlZBkAEAwHrc/f72+BEFZWVl2rp1q44dO6b6+nqXfaNHj/b0tAAAAG7zKMisXr1ao0aNUkVFhcLDw10Wx7PZbAQZAADQJjy6a+nBBx/UL3/5S1VUVKisrEwnT550bidOnPB2jQAAAA3yKMgcPnxYEydOVEhIiLfrAQAAcJtHQWbYsGHKzc31di2AT9tfUqH1u4+poLTS7FIAAP/i0RyZW2+9VQ8//LB27dqlK664QgEBAS77f/azn3mlOMAXlFXVaOLyPGXnlzjbBqdEa25mquwhAU28EgDQ2jy6/bqpJ1rbbDbV1dW1qChv4vZrtNToRVu1aW+p6r73T8XPZlN6cpSWjh1oYmUA0H65+/3t0dBSfX19o5svhRigpfaXVCg7v8QlxEhSnWEoO7+EYSYAMJlHQQa4WBw8UdXk/gPHCTIAYCa358jMmTNH48aNU8eOHTVnzpwmj504cWKLCwN8QY+Ipu/MS4wMbaNKAAANcXuOTFJSknJzcxUZGamkpKTGT2izaf/+/V4rsKWYI4OWYo4MALQ9nrX0LwQZtFR5Va0mLN/JXUsA0IZa9VlLTz75pB566KHzFsQ7ffq0nnvuOc2YMcOT0wI+yR4SoKVjB6qgtFIHjlcqMTJUSVEMKQGAL/CoR8bPz09Hjx5V165dXdqPHz+url27+tSdS/TIAABgPa16+7VhGC4Pijzr73//uyIiIjw5JQAAQLM1a2ipS5custls
stls6t27t0uYqaurU0VFhX7zm994vUgAAICGNCvIvPzyyzIMQ7/85S/1xBNPyG63O/cFBgYqMTFRaWlpXi8SAACgIc0KMmPGjJH03a3Y6enp8vf3aK6w02uvvabXXntNBw4ckCRddtllmjFjhm655RZJ0pkzZ/Tggw9qxYoVqq6u1rBhwzRv3jzFxMS06H0BAED74NEcmRtvvFEnTpw4r/348ePy8/Nz+zzdu3fXrFmztH37duXm5urGG2/U7bffrn/84x+SpClTpmj16tVauXKlsrKydOTIEY0YMcKTkgEAQDvk8UMji4qKzrtr6ciRI+rVq5dOnz7tcUERERF67rnndOeddyo6OlrLli3TnXfeKUn65ptv1K9fP+Xk5Ojaa69t8PXV1dWqrq52/uxwOBQfH89dSwAAWEirrCNz9tEENptNb7zxhsLCwpz76urqlJ2drb59+3pUcF1dnVauXKnKykqlpaVp+/btqq2tVUZGhvOYvn37KiEhockgM3PmTD3xxBMe1QAAAKylWUHmpZdekvTd7dfz5893GUY6O9l3/vz5zSrgyy+/VFpams6cOaOwsDCtWrVKl156qfLy8hQYGKjOnTu7HB8TE6OioqJGzzdt2jRNnTrV+fPZHhkAAND+NCvIFBQUSJKGDh2q9957T126dGlxAX369FFeXp7Ky8v17rvvasyYMcrKyvL4fEFBQQoKCmpxXQAAwPd5dNvR+vXrvVZAYGCgkpOTJUn9+/fXtm3b9Kc//Ul33323ampqVFZW5tIrU1xcrNjYWK+9PwAAsC63g8zUqVP11FNPKTQ01GXopiEvvviixwXV19erurpa/fv3V0BAgNatW6eRI0dKknbv3q3CwkLWqgEAAJKaEWR27typ2tpaSdKOHTsafESBpEbbGzJt2jTdcsstSkhI0KlTp7Rs2TJt2LBBn3zyiex2u8aOHaupU6cqIiJC4eHhmjBhgtLS0hqd6AsAAC4ubgeZP/3pT87bnzZs2OCVNz927JhGjx6to0ePym6368orr9Qnn3yiH/3oR5K+m1zcoUMHjRw50mVBPAAAAKkZ68h8/4nXPXv21LZt2xQZGdna9bUYT78GAMB6vP70686dOzvvWjpw4IDq6+tbXiUAAEALuD20NHLkSA0ZMkTdunWTzWbTgAEDGn0cwf79+71WIAAAQGPcDjKvv/66RowYob1792rixIm677771KlTp9asDQAAoEnNWkfm5ptvliRt375dkyZNIsgAAABTNfvp17W1tfrzn/+sgwcPtkY9AAAAbmt2kAkICFBCQoLq6upaox4AAAC3NTvISNJjjz2mRx99VCdOnPB2PQAAAG7z6FlLr7zyivbu3au4uDj16NFDoaGhLvt37NjhleIAAACa4lGQGT58uJfLAAAAaD63V/a1Klb2BQDAery+su+5ysrK9MYbb2jatGnOuTI7duzQ4cOHPT0lAABAs3g0tPTFF18oIyNDdrtdBw4c0H333aeIiAi99957Kiws1NKlS71dJwAAwHk86pGZOnWq7r33XuXn56tjx47O9p/85CfKzs72WnEAAABN8SjIbNu2Tb/+9a/Pa7/kkktUVFTU4qIAAADc4VGQCQoKksPhOK99z549io6ObnFRAAAA7vAoyPzsZz/Tk08+qdraWkmSzWZTYWGhfve732nkyJFeLRAAAKAxHgWZF154QRUVFeratatOnz6tIUOGKDk5WZ06ddIzzzzj7RoBAAAa5NFdS3a7XWvXrtXGjRv1xRdfqKKiQldffbUyMjK8XR9w0dtfUqGDJ6qUGBmqpKjQC78AAC4iHi2Id+jQIcXHx7dGPV7HgniwqrKqGk1cnqfs/BJn2+CUaM3NTJU9JMDEygCg9bXqgniJiYkaMmSIFi5cqJMnT3pcJIDGTVyep017S13aNu0t1YTlO02qCAB8j0dBJjc3VwMHDtSTTz6pbt26afjw4Xr33XdVXV3t7frgJftLKrR+9zEVlFaaXQrc
sL+kQtn5Jao7p8O0zjCUnV/CdQSAf/FojkxqaqpSU1M1e/ZsbdiwQcuWLdO4ceNUX1+vESNG6L//+7+9XSc8xPCENR08UdXk/gPHK5kvAwBqwbOWpO9uux46dKgWLlyov/3tb0pKStKbb77prdrgBQxPWFOPiJAm9ydGEmIAQGphkPnnP/+p2bNn66qrrtLAgQMVFhamV1991Vu1oYUYnrCuntFhGpwSLT+bzaXdz2bT4JRoemMA4F88CjILFizQkCFD1KNHDy1dulR333239u3bp88++0y/+c1vvF0jPOTO8AR819zMVKUnR7m0pSdHaW5mqkkVAYDv8WiOzNNPP63MzEzNmTNHP/jBD7xdE7yE4Qlrs4cEaOnYgSoordSB45WsIwMADfAoyBQWFqq8vFyLFi3S3LlzJUmXXnqpxo4dK7vd7tUC4bmzwxOb9pa6DC/52WxKT47iS9EikqIIMADQGI+Glnbs2KHk5GS99NJLOnHihE6cOKGXXnpJvXr10o4dO7xdI1qA4QkAQHvm0cq+P/zhD5WcnKyFCxfK3/+7Tp1vv/1Wv/rVr7R//35lZ2d7vVBPsbLvdxieAABYibvf3x4FmeDgYO3cuVN9+/Z1ad+1a5cGDBigqqqmJ5m2JYIMAADW06qPKAgPD1dhYeF57YcOHVKnTp08OSUAAECzeRRk7r77bo0dO1Zvv/22Dh06pEOHDmnFihX61a9+pczMTG/XCAAA0CCP7lp6/vnnZbPZNHr0aH377beSpICAAN1///2aNWuWVwsEAABojEdzZM6qqqrSvn37JEm9evVSSEjT65aYgTkyAABYj7vf3x71yJwVEhKiK664oiWnAAAA8FiLnrUEAABgJoIMAACwLIIMAACwLIIMAACwLIIMAACwLIIMAACwLIIMAACwLIIMAACwLIIMAACwLIIMAACwLIIMAACwLFODzMyZM3XNNdeoU6dO6tq1q4YPH67du3e7HHPmzBmNHz9ekZGRCgsL08iRI1VcXGxSxQAAwJeYGmSysrI0fvx4bd68WWvXrlVtba1+/OMfq7Ky0nnMlClTtHr1aq1cuVJZWVk6cuSIRowYYWLVAADAV9gMwzDMLuKskpISde3aVVlZWRo8eLDKy8sVHR2tZcuW6c4775QkffPNN+rXr59ycnJ07bXXXvCc7j4GHAAA+A53v799ao5MeXm5JCkiIkKStH37dtXW1iojI8N5TN++fZWQkKCcnJwGz1FdXS2Hw+GyAQCA9slngkx9fb0mT56s9PR0XX755ZKkoqIiBQYGqnPnzi7HxsTEqKioqMHzzJw5U3a73bnFx8e3dukAAMAkPhNkxo8fr6+++korVqxo0XmmTZum8vJy53bo0CEvVQgAAHyNv9kFSNIDDzygjz76SNnZ2erevbuzPTY2VjU1NSorK3PplSkuLlZsbGyD5woKClJQUFBrlwwAAHyAqT0yhmHogQce0KpVq/Tpp58qKSnJZX///v0VEBCgdevWOdt2796twsJCpaWltXW5AADAx5jaIzN+/HgtW7ZMH3zwgTp16uSc92K32xUcHCy73a6xY8dq6tSpioiIUHh4uCZMmKC0tDS37lgCAADtm6m3X9tstgbbFy9erHvvvVfSdwviPfjgg1q+fLmqq6s1bNgwzZs3r9GhpXNx+zUAANbj7ve3T60j0xoIMgAAWI8l15EBAABoDoIMAACwLIIMAACwLIIMAACwLIIMAACwLIIMAACwLIIMAACwLIIMAACwLIIMAACwLIIMAACwLIIMAACwLIIMAACwLIIMAACwLH+zCwCasr+kQgdPVCkxMlRJUaFmlwMA8DEEGfiksqoaTVyep+z8Emfb4JRozc1MlT0kwMTKAAC+hKEl+KSJy/O0aW+pS9umvaWasHynSRUBAHwRQQY+Z39JhbLzS1RnGC7tdYah7PwSFZRWmlQZAMDXEGTgcw6eqGpy/4HjBBkAwHcIMvA5PSJCmtyfGMmkXwDAdwgy
8Dk9o8M0OCVafjabS7ufzabBKdHcvQQAcCLIwCfNzUxVenKUS1t6cpTmZqaaVBEAwBdx+zV8kj0kQEvHDlRBaaUOHK9kHRkAQIMIMvBpSVEEGABA4xhaAgAAlkWQAQAAlkWQAQAAlkWQAQAAlkWQAQAAlkWQAQAAlkWQAQAAlkWQAQAAlkWQAQAAlkWQAQAAlkWQAQAAlkWQAQAAlkWQAQAAlkWQAQAAlkWQAQAAlkWQAQAAlkWQAQAAlkWQAQAAlkWQAQAAluVvdgGAlewvqdDBE1VKjAxVUlSo2eUAwEWPIAO4oayqRhOX5yk7v8TZNjglWnMzU2UPCTCxMgC4uDG0BLhh4vI8bdpb6tK2aW+pJizfaVJFAACJIANc0P6SCmXnl6jOMFza6wxD2fklKiitNKkyAICpQSY7O1u33Xab4uLiZLPZ9P7777vsNwxDM2bMULdu3RQcHKyMjAzl5+ebUywuWgdPVDW5/8BxggwAmMXUIFNZWakf/OAHevXVVxvcP3v2bM2ZM0fz58/Xli1bFBoaqmHDhunMmTNtXCkuZj0iQprcnxjJpF8AMIupk31vueUW3XLLLQ3uMwxDL7/8sn7/+9/r9ttvlyQtXbpUMTExev/993XPPfe0Zam4iPWMDtPglGht2lvqMrzkZ7MpPTmKu5cAwEQ+O0emoKBARUVFysjIcLbZ7XYNGjRIOTk5jb6uurpaDofDZQNaam5mqtKTo1za0pOjNDcz1aSKAACSD99+XVRUJEmKiYlxaY+JiXHua8jMmTP1xBNPtGptuPjYQwK0dOxAFZRW6sDxStaRAQAf4bM9Mp6aNm2aysvLnduhQ4fMLgntSFJUqIb26UqIAQAf4bNBJjY2VpJUXFzs0l5cXOzc15CgoCCFh4e7bAAAoH3y2SCTlJSk2NhYrVu3ztnmcDi0ZcsWpaWlmVgZAADwFabOkamoqNDevXudPxcUFCgvL08RERFKSEjQ5MmT9fTTTyslJUVJSUmaPn264uLiNHz4cPOKBgAAPsPUIJObm6uhQ4c6f546daokacyYMVqyZIl++9vfqrKyUuPGjVNZWZmuv/56ffzxx+rYsaNZJQMAAB9iM4xz1l1vZxwOh+x2u8rLy5kvAwCARbj7/e2zc2QAAAAuhCADAAAsiyADAAAsiyADAAAsiyADAAAsiyADAAAsiyADAAAsy2effm01+0sqdPBEFU9FBgCgDRFkWqisqka/ejNXuQdPOtsGp0Rrbmaq7CEBJlYGiYAJAO0dQaYFyqpqNPT5DTpZVevSvjG/RBOW79TSsQNNqgxlVTWauDxP2fklzjYCJgC0P8yRaYH7luaeF2IkqV5Sdn6JCkor274oSJImLs/Tpr2lLm2b9pZqwvKdJlUEAGgNBBkP7S+p0LYDJ5s85sBxgowZ9pdUKDu/RHXnPEaszjAImADQzhBkPLSl4PgFj0mMZE6GGQ6eqGpyPwETANoP5sg0U0NzLxpyTY8uTC41SY+IkCb3EzABoP2gR6aZGpp7ca4uIQF6Y8w1bVQRztUzOkyDU6LlZ7O5tPvZbBqcEk3ABIB2hCDTDI3Nvfi+axK7aMNDQ7kzxmRzM1OVnhzl0paeHKW5makmVQQAaA0MLTXDheZezBxxhTIHJrRRNWiKPSRAS8cOVEFppQ4cr2QdGQBopwgyzXChuRfX9oxso0rgrqQoAgwAtGcMLTUDcy/av/0lFVq/+xi3aAOARdAj00xzM1M1YflOl7uWmHthfawEDADWZDOMJmautgMOh0N2u13l5eUKDw/32nmZe9G+jF60VZv2lrpM5Paz2ZSeHMWjJgDABO5+f9Mj4yHmXlhD1u5jyvtnma5O6KIfpkQ3eMzZu9HO9f2VgLnWAOCbCDJolw4er9TwVze5PAurS0iAPhx/veIjXSdtu7MSMEEGAHwTk33RLp0bYiTpZFWtfvbqxvOOZSVgALAuggzanazdxxp8Krn0XZh55dN8l7uSuBsNAKyLIIN2
J++fZU3uf/6vezT0+Q0avWiryv8VeFgJGACsiTkyaHeu6t7ZreM27S3VhOU7tXTsQFYCBgCLIsj4iP0lFTp4oqrdf4G2xecc0qeruoQENDq8dFZDdyVxNxoAWAtBxmQXy0Js3vyc7oShD8dfr5+9uvGCYUbyjbuSLpYgCwDexoJ4JrtYFmLzxuf0JAx9ll+idV8Xa8nnBxs97/qHbjAtPFwsQRYAmsvd728m+5ro7EJsdedkye8PebQH3vqcE5fnadPeUpe2s/NcGvPDlGj94WeX++xdSZ58JgDAvxFkTOTOQmxtpTUfluiNz9nSMOSLdyWZFWR5MCaA9oQ5MibyhYXY8gpP6vcffKWvDjucbd4e2vDG52zp6ru+eFdSW68o7I1hLHfn8jDnB0BbIciY6OxCbI3NHWnNL4CGvtTOys4v0X1Lt+md31znlffyxuf0VujzpbuS2jrINjWMdaF5Su6GIOb8AGhrDC2ZzKwhj4nL87SxgRBz1tYDJ7069NDSz2mF1XebO2TTlp+ppcNY7s7lYc4PgLZGj4zJzBjyaOxpz+f63y+O6IEbU7zynt74nHMzUzVh+U6X2s2e5yK1rBeirT5TS4ax3H06OE8RB2AGgoyPaMshj6w9Fw4xklRaUeP19zYMQ0XlZ1RcfkaSmvWZfXGei9SyIZu2+kwtGcZyNwTxFHEAZiDItBFfmvz49Ee73Drupn5dvfaeZVU1uv9/dihn/3GX9ut6Req1Uf2bNX/Cl+a5eKsXorU/U0vmKbkbgnxh8jqAtuUL320EmVbm7rBDW/1leHtroercWALRHhygH6ZEe+19Jy7POy/ESNLn+4671XPhq6zUC+HpMJa7IcjMyesA2pYvTewnyLSy+5bmavvBky5t3x92aOgvw4AeXfRf1yXq0kvsXrvF9eyxa78uvmDNXUIC9OH46yVJWbuPKe+fZbo6oYvHweZCc3KsPH/CSr0QLRnGcjcE+eo8JgDe1ZIhdW8jyLSSsqoa/erNXOWeE2Ik12GHxz/4x3l/GXIPnnS+rqW3uDZ1m3VDRl8bryeHX6mDxyuV+uRfXZ5VdDbgxEc2/eV9rgv1Wki+1XPRHFbshfBkGMvdEOSr85gAeI+vTezn9utWMnF5nnY0EGK+b9zS3AZvif2+lt7i2tCxTTlw/LtJuMNf3XTeAxdPVtXqZ69udPtcZ12o10LyrZ6L5vLFVYNbS1JUqIb26XrB/6TcPQ6A9fjSqvQSPTKtwt3bm/ceq7jgMS25xdXdOr4vO79Eb28rbPSp0SeravVZfkmzhpnO9lo0VouvrAPjKXohAFxMfG1InR6ZVuDOUIokNeex42cTbnOSsLt1nKuhSbnft6Ow6Z6mhszNTFVaz8jz2q/rFdluei7ohQBwMfC1BUrpkWkF7gylfF8HSfUXOMaTW1ybW8dZaT0j9f7OI43uvzqhS7PPaQ8J0PJx16qgtFKb9x+XTdKgnpF86QOABfnSxH6CTCtobAJoY/r36KJtjcynacktrk0dGx7sL8fpbxs8x93XJGjWmm8aHF7qEtKy27J9aQ0YAIBnfGlI3RJDS6+++qoSExPVsWNHDRo0SFu3bjW7pAtqaALouc52w628/zqtf+gGvfIfqbom0bW3o7FbXM89d7+4TnpoWG+36khPjtIrmVerX7dOjb7Xh+OvV5dz7oD6/m3ZbaG5zy4CALQtXxhStxmGG10GJnr77bc1evRozZ8/X4MGDdLLL7+slStXavfu3era9cIrzzocDtntdpWXlys8PLwNKnZ1Nq1GhgTq+b/uceuWaXcT7t8PndRjq77SV0ccbp8zIiRAL/w136WOyy8J17N3XKEru3c+7z0+yy/RjsKTLVpHprl8aaElAIA53P3+9vkgM2jQIF1zzTV65ZVXJEn19fWKj4/XhAkT9Mgjj1zw9WYHmXN5sxtu9KKtjQ4xNbYgkSevaWtWqBEA0Lrc/f726aGlmpoabd++XRkZGc62Dh06KCMjQzk5
OQ2+prq6Wg6Hw2XzJd7qhjt7a/W5c3C+fxu2N17T1qxQIwDAd/h0kCktLVVdXZ1iYmJc2mNiYlRUVNTga2bOnCm73e7c4uPj26LUNufJgkS+tohRQ6xQIwDAd/h0kPHEtGnTVF5e7twOHTpkdkmtwpMFiXxtEaOGWKFGAIDv8OkgExUVJT8/PxUXuz7osLi4WLGxsQ2+JigoSOHh4S5be+TJgkS+tohRQ6xQIwDAd/h0kAkMDFT//v21bt06Z1t9fb3WrVuntLQ0EyvzDZ4848cKzwWyQo0AAN/g83ctvf322xozZowWLFiggQMH6uWXX9Y777yjb7755ry5Mw3xtbuWWoMnd0L5wiJGF2KFGgEArcPd72+fX9n37rvvVklJiWbMmKGioiJdddVV+vjjj90KMRcLT1bLtcIKu1aoEQBgLp/vkWmpi6FHBgCA9qZdrCMDAADQFIIMAACwLIIMAACwLIIMAACwLIIMAACwLIIMAACwLIIMAACwLIIMAACwLIIMAACwLJ9/REFLnV242OFwmFwJAABw19nv7Qs9gKDdB5lTp05JkuLj402uBAAANNepU6dkt9sb3d/un7VUX1+vI0eOqFOnTrLZbGaXI4fDofj4eB06dIhnP1kE18x6uGbWxHWznta8ZoZh6NSpU4qLi1OHDo3PhGn3PTIdOnRQ9+7dzS7jPOHh4fxDtRiumfVwzayJ62Y9rXXNmuqJOYvJvgAAwLIIMgAAwLIIMm0sKChIjz/+uIKCgswuBW7imlkP18yauG7W4wvXrN1P9gUAAO0XPTIAAMCyCDIAAMCyCDIAAMCyCDIAAMCyCDKtYObMmbrmmmvUqVMnde3aVcOHD9fu3btdjjlz5ozGjx+vyMhIhYWFaeTIkSouLjapYpxr1qxZstlsmjx5srONa+abDh8+rF/84heKjIxUcHCwrrjiCuXm5jr3G4ahGTNmqFu3bgoODlZGRoby8/NNrPjiVldXp+nTpyspKUnBwcHq1auXnnrqKZfn6XDNzJedna3bbrtNcXFxstlsev/99132u3ONTpw4oVGjRik8PFydO3fW2LFjVVFR4fVaCTKtICsrS+PHj9fmzZu1du1a1dbW6sc//rEqKyudx0yZMkWrV6/WypUrlZWVpSNHjmjEiBEmVo2ztm3bpgULFujKK690aeea+Z6TJ08qPT1dAQEBWrNmjXbt2qUXXnhBXbp0cR4ze/ZszZkzR/Pnz9eWLVsUGhqqYcOG6cyZMyZWfvH64x//qNdee02vvPKKvv76a/3xj3/U7NmzNXfuXOcxXDPzVVZW6gc/+IFeffXVBve7c41GjRqlf/zjH1q7dq0++ugjZWdna9y4cd4v1kCrO3bsmCHJyMrKMgzDMMrKyoyAgABj5cqVzmO+/vprQ5KRk5NjVpkwDOPUqVNGSkqKsXbtWmPIkCHGpEmTDMPgmvmq3/3ud8b111/f6P76+nojNjbWeO6555xtZWVlRlBQkLF8+fK2KBHnuPXWW41f/vKXLm0jRowwRo0aZRgG18wXSTJWrVrl/Nmda7Rr1y5DkrFt2zbnMWvWrDFsNptx+PBhr9ZHj0wbKC8vlyRFRERIkrZv367a2lplZGQ4j+nbt68SEhKUk5NjSo34zvjx43Xrrbe6XBuJa+arPvzwQw0YMEB33XWXunbtqtTUVC1cuNC5v6CgQEVFRS7XzW63a9CgQVw3k1x33XVat26d9uzZI0n6+9//ro0bN+qWW26RxDWzAneuUU5Ojjp37qwBAwY4j8nIyFCHDh20ZcsWr9bT7h8aabb6+npNnjxZ6enpuvzyyyVJRUVFCgwMVOfOnV2OjYmJUVFRkQlVQpJWrFihHTt2aNu2beft45r5pv379+u1117T1KlT9eijj2rbtm2aOHGiAgMDNWbMGOe1iYmJcXkd1808jzzyiBwOh/r27Ss/Pz/V1dXpmWee0ahRoySJa2YB7lyjoqIide3a1WW/v7+/IiIivH4dCTKt
bPz48frqq6+0ceNGs0tBEw4dOqRJkyZp7dq16tixo9nlwE319fUaMGCAnn32WUlSamqqvvrqK82fP19jxowxuTo05J133tFbb72lZcuW6bLLLlNeXp4mT56suLg4rhk8wtBSK3rggQf00Ucfaf369erevbuzPTY2VjU1NSorK3M5vri4WLGxsW1cJaTvho6OHTumq6++Wv7+/vL391dWVpbmzJkjf39/xcTEcM18ULdu3XTppZe6tPXr10+FhYWS5Lw2595dxnUzz8MPP6xHHnlE99xzj6644gr953/+p6ZMmaKZM2dK4ppZgTvXKDY2VseOHXPZ/+233+rEiRNev44EmVZgGIYeeOABrVq1Sp9++qmSkpJc9vfv318BAQFat26ds2337t0qLCxUWlpaW5cLSTfddJO+/PJL5eXlObcBAwZo1KhRzj9zzXxPenr6eUsb7NmzRz169JAkJSUlKTY21uW6ORwObdmyhetmkqqqKnXo4PrV4+fnp/r6eklcMytw5xqlpaWprKxM27dvdx7z6aefqr6+XoMGDfJuQV6dOgzDMAzj/vvvN+x2u7Fhwwbj6NGjzq2qqsp5zG9+8xsjISHB+PTTT43c3FwjLS3NSEtLM7FqnOv7dy0ZBtfMF23dutXw9/c3nnnmGSM/P9946623jJCQEON//ud/nMfMmjXL6Ny5s/HBBx8YX3zxhXH77bcbSUlJxunTp02s/OI1ZswY45JLLjE++ugjo6CgwHjvvfeMqKgo47e//a3zGK6Z+U6dOmXs3LnT2LlzpyHJePHFF42dO3caBw8eNAzDvWt08803G6mpqcaWLVuMjRs3GikpKUZmZqbXayXItAJJDW6LFy92HnP69Gnj//2//2d06dLFCAkJMe644w7j6NGj5hWN85wbZLhmvmn16tXG5ZdfbgQFBRl9+/Y1Xn/9dZf99fX1xvTp042YmBgjKCjIuOmmm4zdu3ebVC0cDocxadIkIyEhwejYsaPRs2dP47HHHjOqq6udx3DNzLd+/foGv8fGjBljGIZ71+j48eNGZmamERYWZoSHhxv/9V//ZZw6dcrrtdoM43vLKQIAAFgIc2QAAIBlEWQAAIBlEWQAAIBlEWQAAIBlEWQAAIBlEWQAAIBlEWQAAIBlEWQAAIBlEWQAmGL69OkaN25ck8fccMMNmjx5ctsU1ExLlixR586dW+XcH3/8sa666irn84cANI4gA7Rj8+fPV6dOnfTtt9862yoqKhQQEKAbbrjB5dgNGzbIZrNp3759kqTExETZbLbztlmzZkmSDhw44NIeERGhIUOG6LPPPrtgXUVFRfrTn/6kxx57zHsfthUlJibq5ZdfbrP3u/nmmxUQEKC33nqrzd4TsCqCDNCODR06VBUVFcrNzXW2ffbZZ4qNjdWWLVt05swZZ/v69euVkJCgXr16OduefPJJHT161GWbMGGCy3v87W9/09GjR5Wdna24uDj99Kc/VXFxcZN1vfHGG7ruuuucT6nGv9XW1kqS7r33Xs2ZM8fkagDfR5AB2rE+ffqoW7du2rBhg7Ntw4YNuv3225WUlKTNmze7tA8dOtTl9Z06dVJsbKzLFhoa6nJMZGSkYmNjdfnll+vRRx+Vw+HQli1bmqxrxYoVuu2221zaKisrNXr0aIWFhalbt2564YUXzntddXW1HnroIV1yySUKDQ3VoEGDnJ/N4XAoODhYa9ascXnNqlWr1KlTJ1VVVTVaz1/+8hdddtllCgoKUmJiost733DDDTp48KCmTJni7H36vk8++UT9+vVTWFiYbr75Zh09etRl/xtvvKF+/fqpY8eO6tu3r+bNm+fcd7ZX6+2339aQIUPUsWNHZy/MbbfdptzcXGcPGYCGEWSAdm7o0KFav3698+f169frhhtu0JAhQ5ztp0+f1pYtW84LMs1x+vRpLV26VJIUGBjY6HEnTpzQrl27NGDAAJf2hx9+WFlZWfrggw/017/+VRs2bNCOHTtcjnng
gQeUk5OjFStW6IsvvtBdd92lm2++Wfn5+QoPD9dPf/pTLVu2zOU1b731loYPH66QkJAG69m+fbt+/vOf65577tGXX36pP/zhD5o+fbqWLFkiSXrvvffUvXt3l96ps6qqqvT888/rz3/+s7Kzs1VYWKiHHnrI5b1nzJihZ555Rl9//bWeffZZTZ8+XW+++aZLDY888ogmTZqkr7/+WsOGDZMkJSQkKCYmxq2hOuCi5vXnaQPwKQsXLjRCQ0ON2tpaw+FwGP7+/saxY8eMZcuWGYMHDzYMwzDWrVtnSDIOHjzofF2PHj2MwMBAIzQ01GXLzs42DMMwCgoKDElGcHCwERoaathsNkOS0b9/f6OmpqbRenbu3GlIMgoLC51tp06dMgIDA4133nnH2Xb8+HEjODjYmDRpkmEYhnHw4EHDz8/POHz4sMv5brrpJmPatGmGYRjGqlWrjLCwMKOystIwDMMoLy83OnbsaKxZs6bRev7jP/7D+NGPfuTS9vDDDxuXXnqpy+/ipZdecjlm8eLFhiRj7969zrZXX33ViImJcf7cq1cvY9myZS6ve+qpp4y0tDTDMP79O3z55ZcbrC01NdX4wx/+0GjtAAyDHhmgnbvhhhtUWVmpbdu26bPPPlPv3r0VHR2tIUOGOOfJbNiwQT179lRCQoLLax9++GHl5eW5bOf2pLz99tvauXOn/vKXvyg5OVlLlixRQEBAo/WcPn1aktSxY0dn2759+1RTU6NBgwY52yIiItSnTx/nz19++aXq6urUu3dvhYWFObesrCzn8MtPfvITBQQE6MMPP5T03ZBReHi4MjIyVFhY6PK6Z599VpL09ddfKz093aXG9PR05efnq66ursnfbUhIiMucom7duunYsWOSvhsq27dvn8aOHevyvk8//fR5w0Xn/k7PCg4ObnJIDIDkb3YBAFpXcnKyunfvrvXr1+vkyZMaMmSIJCkuLk7x8fH6/PPPtX79et14443nvTYqKkrJyclNnj8+Pl4pKSlKSUnRt99+qzvuuENfffWVgoKCGjw+KipKknTy5ElFR0e7/TkqKirk5+en7du3y8/Pz2VfWFiYpO+GtO68804tW7ZM99xzj5YtW6a7775b/v7+iouLU15envM1ERERbr93Y84NbDabTYZhOOuVpIULF7oENEnn1X/uvKOzTpw40azfEXAxokcGuAgMHTpUGzZs0IYNG1xuux48eLDWrFmjrVu3tmh+zFl33nmn/P39XSa0nqtXr14KDw/Xrl27XNoCAgJcJgmfPHlSe/bscf6cmpqquro6HTt2TMnJyS5bbGys87hRo0bp448/1j/+8Q99+umnGjVqlCTJ39/f5TVng0y/fv20adMmlxo3bdqk3r17OwNHYGDgBXtnzhUTE6O4uDjt37//vHqTkpIu+PozZ85o3759Sk1Nbdb7AhcbggxwERg6dKg2btyovLw8Z4+MJA0ZMkQLFixQTU1Ng0Hm1KlTKioqctkcDkej72Oz2TRx4kTNmjWr0SGRDh06KCMjQxs3bnS2hYWFaezYsXr44Yf16aef6quvvtK9996rDh3+/V9U7969NWrUKI0ePVrvvfeeCgoKtHXrVs2cOVP/+7//6zxu8ODBio2N1ahRo5SUlHReb8i5HnzwQa1bt05PPfWU9uzZozfffFOvvPKKy6TdxMREZWdn6/DhwyotLW3yfN/3xBNPaObMmZozZ4727NmjL7/8UosXL9aLL754wddu3rxZQUFBSktLc/v9gIuS2ZN0ALS+s5NK+/bt69J+4MABQ5LRp0+f817To0cPQ9J5269//WuXc+7cudPldZWVlUaXLl2MP/7xj43W83//93/GJZdcYtTV1TnbTp06ZfziF78wQkJCjJiYGGP27NnGkCFDnJN9DcMwampqjBkzZhiJiYlGQECA0a1bN+OOO+4wvvjiC5fz//a3vzUkGTNmzHDr9/Puu+8al156qREQEGAkJCQYzz33nMv+nJwc48or
rzSCgoKMs/9tLl682LDb7S7HrVq1yjj3v9W33nrLuOqqq4zAwECjS5cuxuDBg4333nvPMIzGf4eGYRjjxo1z/q4BNM5mGP8a0AWANmIYhgYNGqQpU6YoMzPT7HJ8Tmlpqfr06aPc3Fy3hqGAixlDSwDanM1m0+uvv+7y6AT824EDBzRv3jxCDOAGemQAAIBl0SMDAAAsiyADAAAsiyADAAAsiyADAAAsiyADAAAsiyADAAAsiyADAAAsiyADAAAsiyADAAAs6/8DZYPxwyzGGz8AAAAASUVORK5CYII=", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "df_tuned[df_tuned[\"overfitting\"] < 100].plot(kind=\"scatter\", x=\"WER (dev-other)\", y=\"overfitting\")" - ] } ], "metadata": { @@ -933,7 +831,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.12" + "version": "3.10.13" } }, "nbformat": 4, diff --git a/users/rilling/evaluation/swer_eval.ipynb b/users/rilling/evaluation/swer_eval.ipynb index 24acd6f0c..9122f2655 100644 --- a/users/rilling/evaluation/swer_eval.ipynb +++ b/users/rilling/evaluation/swer_eval.ipynb @@ -2,18 +2,9 @@ "cells": [ { "cell_type": "code", - "execution_count": 35, + "execution_count": 1, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "The autoreload extension is already loaded. To reload it, use:\n", - " %reload_ext autoreload\n" - ] - } - ], + "outputs": [], "source": [ "import glob\n", "import sys\n", @@ -29,104 +20,106 @@ }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "['/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glowTTS_ASR_conformer_x_vector_v2/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + "['/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glowTTS_100ep_pe1/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glowTTS_ASR_conformer_two_forward_pass/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glowTTS_ASR_conformer_x_vector_v2/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/ddi/glowTTS_100ep_pe1_radam1e-9/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/ddi/glowTTS_100ep_pe1/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/ddi/glowTTS/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/ddi/glowTTS_x_vector/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/ddi/glowTTS_ASR_conformer_two_forward_pass/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/ddi/glowTTS_100ep_pe1/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/ddi/glowTTS_x_vector/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/ddi/glowTTS_ASR_conformer_x_vector/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/ddi/glowTTS_100ep_pe1_radam1e-9/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_spec_augment_ctc_scale_0.1/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_spec_augment/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_ctc_scale_0.1/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2_ctc_scale_0.1/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_ctc_scale_0.1/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_spec_augment/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_ctc_scale_0.1/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_spec_augment_ctc_scale_0.1/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_ctc_scale_0.1/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_ctc_scale_0.1/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment_ctc_scale_0.1/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2_multiscale/enc768/200ep/dec_drop_0.05/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector/enc768/100ep/dec_drop_0.05/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2_logdist_loss_grad_clip_10/ed_scale_0.1/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2_logdist_loss_grad_clip_10/ed_scale_1.0/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS_simple_encoder/12cb/200ep/dec_drop_0.05/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS_simple_encoder/20cb/200ep/dec_drop_0.05/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS_batch_norm/enc768/200ep/dec_drop_0.05/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2_conformer_coupling/enc768/200ep/dec_drop_0.05/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc192/100ep/dec_drop_0.05/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_ctc_scale_0.1/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2/enc768/400ep/dec_drop_0.05/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2/enc768/200ep/dec_drop_0.0/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2/enc768/200ep/dec_drop_0.05/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2/enc768/100ep/dec_drop_0.05/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2/enc768/200ep_long_cooldown/dec_drop_0.0/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2/enc768/200ep_long_cooldown/dec_drop_0.05/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2/enc192/200ep/dec_drop_0.05/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2/enc192/200ep/dec_drop_0.0/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc192/200ep/dec_drop_0.05_epsilon_1e-8/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc192/200ep/dec_drop_0.05/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc192/200ep/dec_drop_0.0/grad_clip_10/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc192/100ep/dec_drop_0.05/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc768/400ep/grad_clip_10/dec_drop_0.05/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc768/200ep/dec_drop_0.0/epsilon_1e-8/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc768/200ep/dec_drop_0.0/grad_clip_10/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc768/200ep/dec_drop_0.05_epsilon_1e-8/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc768/200ep/dec_drop_0.0_epsilon_1e-8/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc768/200ep/dec_drop_0.05/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc768/200ep/dec_drop_0.0/epsilon_1e-8/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc768/400ep/grad_clip_10/dec_drop_0.05/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc768/100ep/dec_drop_0.05/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc768/100ep/dec_drop_0.00/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc768/100ep/dec_drop_0.05/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2_logdist_loss/ed_scale_0.1/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2/enc192/200ep/dec_drop_0.0/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2/enc192/200ep/dec_drop_0.05/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2/enc768/100ep/dec_drop_0.05/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2/enc768/200ep/dec_drop_0.0/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2/enc768/200ep/dec_drop_0.05/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2/enc768/200ep_long_cooldown/dec_drop_0.0/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2/enc768/200ep_long_cooldown/dec_drop_0.05/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2/enc768/400ep/dec_drop_0.05/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector_v2/mean_only/200ep/basic_init/no_specaug/tts_target_size/ce_ls_0.1/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector_v2/200ep/tts_pretrained/no_specaug/tts_target_size/ce_ls_0.1/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector_v2/200ep/basic_init/no_specaug/tts_target_size/ce_ls_0.1/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector_v2/200ep/basic_init/no_specaug/tts_target_size/ce_ls_1.0/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_cnn_x_vector/tts_pretrained/no_specaug/tts_target_size/ce_ls_0.1/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_cnn_x_vector/basic_init/no_specaug/tts_target_size/ce_ls_0.1/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector/200ep/basic_init/no_specaug/tts_target_size/ce_ls_0.1/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector/200ep/tts_pretrained/no_specaug/tts_target_size/ce_ls_0.1/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2_logdist_loss_grad_clip_10/ed_scale_1.0/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2_logdist_loss_grad_clip_10/ed_scale_0.1/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS_simple_encoder/12cb/200ep/dec_drop_0.05/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS_simple_encoder/20cb/200ep/dec_drop_0.05/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2_conformer_coupling/enc768/200ep/dec_drop_0.05/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2_multiscale/enc768/200ep/dec_drop_0.05/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS_batch_norm/enc768/200ep/dec_drop_0.05/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector/enc768/100ep/dec_drop_0.05/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector/specaug/ce_ls_0.1/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector/specaug/ce_ls_1/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector/no_specaug/ce_ls_0.1/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector/200ep/basic_init/no_specaug/tts_target_size/ce_ls_0.1/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector/200ep/tts_pretrained/no_specaug/tts_target_size/ce_ls_0.1/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector/no_specaug/ce_ls_1/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_cnn_x_vector/basic_init/ce_ls_0.1/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_cnn_x_vector/tts_pretrained/ce_ls_0.1/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_ffn_x_vector/200ep/tts_pretrained/ce_ls_0.1/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_ffn_x_vector/200ep/basic_init/ce_ls_0.1/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_cnn_x_vector_v2/basic_init/ce_ls_0.1/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_cnn_x_vector_v2/basic_init/ce_ls_0.01/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector/no_specaug/ce_ls_0.1/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_cnn_x_vector/tts_pretrained/no_specaug/tts_target_size/ce_ls_0.1/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_cnn_x_vector/basic_init/no_specaug/tts_target_size/ce_ls_0.1/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector_v2/mean_only/200ep/basic_init/no_specaug/tts_target_size/ce_ls_0.1/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector_v2/200ep/tts_pretrained/no_specaug/tts_target_size/ce_ls_0.1/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector_v2/200ep/basic_init/no_specaug/tts_target_size/ce_ls_1.0/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector_v2/200ep/basic_init/no_specaug/tts_target_size/ce_ls_0.1/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_cnn_x_vector_v2/tts_pretrained/ce_ls_0.1/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_cnn_x_vector_v2/tts_pretrained/ce_ls_0.01/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_ffn_x_vector_v2/200ep/tts_pretrained/ce_ls_0.1/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_cnn_x_vector_v2/basic_init/ce_ls_0.1/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_cnn_x_vector_v2/basic_init/ce_ls_0.01/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_ffn_x_vector_v2/200ep/tts_pretrained/ce_ls_1.0/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_ffn_x_vector_v2/200ep/tts_pretrained/ce_ls_0.1/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_ffn_x_vector_v2/200ep/basic_init/ce_ls_1.0/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_ffn_x_vector_v2/200ep/basic_init/ce_ls_0.1/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/tts_architecture/glow_tts/raw_audio/glowTTS_x_vector/enc192/100ep/not_silence_preprocessed/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/tts_architecture/glow_tts/raw_audio/glowTTS_x_vector/enc768/100ep/not_silence_preprocessed/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/tts_architecture/glow_tts/raw_audio/glowTTS_nar_taco_encoder/not_silence_preprocessed/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/tts_architecture/glow_tts/raw_audio/glowTTS_nar_taco_encoder/silence_preprocessed/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/tts_architecture/glow_tts/raw_audio/glowTTS_nar_taco_encoder_no_blstm/silence_preprocessed/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/tts_architecture/glow_tts/raw_audio/glowTTS_ddi_actnorm/enc192/100ep/not_silence_preprocessed/LR_scheduled/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/tts_architecture/glow_tts/raw_audio/glowTTS/enc192/200ep/long_cooldown/not_silence_preprocessed/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/tts_architecture/glow_tts/raw_audio/glowTTS/enc192/200ep/long_cooldown/silence_preprocessed/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/tts_architecture/glow_tts/raw_audio/glowTTS/enc192/100ep/not_silence_preprocessed/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/tts_architecture/glow_tts/raw_audio/glowTTS/enc192/100ep/silence_preprocessed/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_ffn_x_vector/200ep/basic_init/ce_ls_0.1/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_ffn_x_vector/200ep/tts_pretrained/ce_ls_0.1/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_cnn_x_vector/tts_pretrained/ce_ls_0.1/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_cnn_x_vector/basic_init/ce_ls_0.1/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/tts_architecture/glow_tts/raw_audio/glowTTS/enc256/not_silence_preprocessed/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/tts_architecture/glow_tts/raw_audio/glowTTS/enc768/with_sigma/not_silence_preprocessed/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/tts_architecture/glow_tts/raw_audio/glowTTS/enc768/with_sigma/silence_preprocessed/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/tts_architecture/glow_tts/raw_audio/glowTTS/enc768/mean_only/not_silence_preprocessed/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/tts_architecture/glow_tts/raw_audio/glowTTS/enc768/mean_only/silence_preprocessed/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/tts_architecture/glow_tts/raw_audio/glowTTS/enc256/not_silence_preprocessed/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/tts_architecture/glow_tts/raw_audio/glowTTS_simple_encoder/silence_preprocessed/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer']" + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/tts_architecture/glow_tts/raw_audio/glowTTS/enc768/mean_only/not_silence_preprocessed/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/tts_architecture/glow_tts/raw_audio/glowTTS/enc192/100ep/not_silence_preprocessed/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/tts_architecture/glow_tts/raw_audio/glowTTS/enc192/100ep/silence_preprocessed/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/tts_architecture/glow_tts/raw_audio/glowTTS/enc192/200ep/long_cooldown/not_silence_preprocessed/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/tts_architecture/glow_tts/raw_audio/glowTTS/enc192/200ep/long_cooldown/silence_preprocessed/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/tts_architecture/glow_tts/raw_audio/glowTTS_simple_encoder/silence_preprocessed/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/tts_architecture/glow_tts/raw_audio/glowTTS_ddi_actnorm/enc192/100ep/not_silence_preprocessed/LR_scheduled/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/tts_architecture/glow_tts/raw_audio/glowTTS_x_vector/enc768/100ep/not_silence_preprocessed/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/tts_architecture/glow_tts/raw_audio/glowTTS_x_vector/enc192/100ep/not_silence_preprocessed/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/tts_architecture/glow_tts/raw_audio/glowTTS_nar_taco_encoder/not_silence_preprocessed/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/tts_architecture/glow_tts/raw_audio/glowTTS_nar_taco_encoder/silence_preprocessed/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/tts_architecture/glow_tts/raw_audio/glowTTS_nar_taco_encoder_no_blstm/silence_preprocessed/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer']" ] }, - "execution_count": 36, + "execution_count": 2, "metadata": {}, "output_type": "execute_result" } @@ -147,16 +140,23 @@ }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 3, "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "WARNING:root:Settings file 'settings.py' does not exist, ignoring it ([Errno 2] No such file or directory: 'settings.py').\n" + ] + }, { "data": { "text/plain": [ - "'250'" + "'100'" ] }, - "execution_count": 37, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } @@ -178,93 +178,16 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "{'frontend_config': {'in_features': 80,\n", - " 'conv1_channels': 16,\n", - " 'conv2_channels': 16,\n", - " 'conv3_channels': 16,\n", - " 'conv4_channels': 16,\n", - " 'conv_kernel_size': (3, 3),\n", - " 'conv_padding': None,\n", - " 'pool1_kernel_size': (2, 1),\n", - " 
'pool1_stride': (2, 1),\n", - " 'pool1_padding': None,\n", - " 'pool2_kernel_size': (2, 1),\n", - " 'pool2_stride': (2, 1),\n", - " 'pool2_padding': None,\n", - " 'activation': None,\n", - " 'out_features': 96,\n", - " 'activation_str': 'ReLU'},\n", - " 'specaug_config': {'repeat_per_n_frames': 100,\n", - " 'max_dim_time': 20,\n", - " 'num_repeat_feat': 5,\n", - " 'max_dim_feat': 8},\n", - " 'decoder_config': {'hidden_channels': 192,\n", - " 'kernel_size': 5,\n", - " 'dilation_rate': 1,\n", - " 'n_blocks': 12,\n", - " 'n_layers': 4,\n", - " 'p_dropout': 0.05,\n", - " 'n_split': 4,\n", - " 'n_sqz': 2,\n", - " 'sigmoid_scale': False,\n", - " 'ddi': True,\n", - " 'n_heads': 2},\n", - " 'text_encoder_config': {'n_vocab': 44,\n", - " 'hidden_channels': 192,\n", - " 'filter_channels': 768,\n", - " 'filter_channels_dp': 256,\n", - " 'n_heads': 2,\n", - " 'n_layers': 6,\n", - " 'kernel_size': 3,\n", - " 'p_dropout': 0.1,\n", - " 'window_size': 4,\n", - " 'block_length': None,\n", - " 'mean_only': False,\n", - " 'prenet': True},\n", - " 'conformer_asr_encoder_config': {'conformer_size': 96,\n", - " 'ff_dim': 384,\n", - " 'ff_dropout': 0.2,\n", - " 'num_heads': 2,\n", - " 'att_weights_dropout': 0.2,\n", - " 'mhsa_dropout': 0.2,\n", - " 'kernel_size': 9,\n", - " 'conv_dropout': 0.2,\n", - " 'num_layers': 8},\n", - " 'out_channels': 80,\n", - " 'gin_channels': 512,\n", - " 'final_dropout': 0.2,\n", - " 'label_target_size': 79,\n", - " 'specaug_start_epoch': 1,\n", - " 'n_speakers': 251}" + "(91, 91, 91, 91, 91, 91, 91, 91)" ] }, - "execution_count": 38, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "eval(returnn_configs[0].value(\"model_kwargs\", \"default\"))[\"model_config\"]" - ] - }, - { - "cell_type": "code", - "execution_count": 39, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(89, 89, 89, 89, 89, 89, 89, 89)" - ] - }, - "execution_count": 39, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } 
@@ -277,6 +200,7 @@ "experiment_group = []\n", "wers = []\n", "nisqas = []\n", + "nisqa_confidence =[]\n", "encoder_channels = []\n", "num_epochs = []\n", "dec_dropout = []\n", @@ -289,12 +213,19 @@ " wers.append(float(ff.readline().replace(\"\\n\", \"\")))\n", "\n", " mos_path = f.replace(wer_subdir, \"tts_eval_gl/test-clean/nisqa_mos/average\")\n", - " breakpoint()\n", " if os.path.exists(mos_path):\n", " with open(mos_path, \"r\") as fmos:\n", " nisqas.append(float(fmos.readline().replace(\"\\n\", \"\")))\n", " else:\n", " nisqas.append(None)\n", + "\n", + " mos_conf_path = f.replace(wer_subdir, \"tts_eval_gl/test-clean/nisqa_mos/confidence_max_interval\")\n", + " if os.path.exists(mos_conf_path):\n", + " with open(mos_conf_path, \"r\") as fmos:\n", + " nisqa_confidence.append(float(fmos.readline().replace(\"\\n\", \"\")))\n", + " else:\n", + " nisqa_confidence.append(None)\n", + "\n", " folders = [\"librispeech_glow_asr\", \"joint_training/default\", \"joint_training/conformer_coupling\", \"joint_training/given_alignments\", \"tts_architecture\"]\n", " found = False\n", " for folder in folders:\n", @@ -373,7 +304,7 @@ }, { "cell_type": "code", - "execution_count": 40, + "execution_count": 16, "metadata": {}, "outputs": [ { @@ -400,6 +331,7 @@ " \n", " sWER\n", " autoMOS\n", + " autoMOS confidence\n", " num_epochs\n", " decoder dropout\n", " mean only\n", @@ -418,25 +350,28 @@ " \n", " \n", " \n", + " \n", " \n", " \n", " \n", " \n", " joint_training/conformer_coupling\n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glowTTS_ASR_conformer_x_vector_v2/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer\n", - " 5.2\n", - " 2.136262\n", - " 250\n", + " 
/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glowTTS_100ep_pe1/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer\n", + " 12.4\n", + " 2.936271\n", + " NaN\n", + " 100\n", " 0.05\n", " False\n", " 768\n", - " [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08]\n", + " [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05]\n", " {'class': 'adam', 'epsilon': 1e-08}\n", " \n", " \n", " /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glowTTS_ASR_conformer_two_forward_pass/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer\n", " 20.9\n", " 2.449569\n", + " NaN\n", " 250\n", " 0.05\n", " False\n", @@ -445,9 +380,10 @@ " {'class': 'adam', 'epsilon': 1e-08}\n", " \n", " \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/ddi/glowTTS/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer\n", - " 97.7\n", - " 1.611414\n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glowTTS_ASR_conformer_x_vector_v2/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer\n", + " 5.2\n", + " 2.136262\n", + " NaN\n", " 250\n", " 0.05\n", " False\n", @@ -456,25 +392,27 @@ " {'class': 'adam', 'epsilon': 1e-08}\n", " \n", " \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/ddi/glowTTS_x_vector/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer\n", - " 99.5\n", - " 1.933423\n", - " 250\n", + " 
/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/ddi/glowTTS_100ep_pe1_radam1e-9/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer\n", + " 14.7\n", + " 2.588368\n", + " NaN\n", + " 100\n", " 0.05\n", " False\n", " 768\n", - " [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08]\n", - " {'class': 'adam', 'epsilon': 1e-08}\n", + " [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05]\n", + " {'class': 'radam', 'epsilon': 1e-09}\n", " \n", " \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/ddi/glowTTS_ASR_conformer_two_forward_pass/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer\n", - " 95.7\n", - " 1.824994\n", - " 250\n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/ddi/glowTTS_100ep_pe1/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer\n", + " 14.4\n", + " 2.718031\n", + " NaN\n", + " 100\n", " 0.05\n", " False\n", " 768\n", - " [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08]\n", + " [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05]\n", " {'class': 'adam', 'epsilon': 1e-08}\n", " \n", " \n", @@ -488,107 +426,127 @@ " ...\n", " ...\n", " ...\n", + " ...\n", " \n", " \n", " tts_architecture\n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/tts_architecture/glow_tts/raw_audio/glowTTS/enc768/with_sigma/silence_preprocessed/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer\n", - " 15.3\n", - " 3.404880\n", - " 200\n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/tts_architecture/glow_tts/raw_audio/glowTTS_x_vector/enc768/100ep/not_silence_preprocessed/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer\n", + " 20.9\n", + " 3.312732\n", + " NaN\n", + " 100\n", " 0.00\n", " 
False\n", " 768\n", - " [0: 5e-05, 49: 0.0005, 50: 0.0005, 200: 1e-06]\n", + " [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05]\n", " {'class': 'adam', 'epsilon': 1e-09}\n", " \n", " \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/tts_architecture/glow_tts/raw_audio/glowTTS/enc768/mean_only/not_silence_preprocessed/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer\n", - " 15.5\n", - " 3.469905\n", - " 200\n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/tts_architecture/glow_tts/raw_audio/glowTTS_x_vector/enc192/100ep/not_silence_preprocessed/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer\n", + " 95.4\n", + " 2.633865\n", + " NaN\n", + " 100\n", " 0.00\n", - " True\n", - " 768\n", - " [0: 5e-05, 49: 0.0005, 50: 0.0005, 200: 1e-06]\n", + " False\n", + " 192\n", + " [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05]\n", " {'class': 'adam', 'epsilon': 1e-09}\n", " \n", " \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/tts_architecture/glow_tts/raw_audio/glowTTS/enc768/mean_only/silence_preprocessed/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer\n", - " 16.9\n", - " 3.515657\n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/tts_architecture/glow_tts/raw_audio/glowTTS_nar_taco_encoder/not_silence_preprocessed/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer\n", + " 25.2\n", + " 3.463099\n", + " NaN\n", " 200\n", " 0.00\n", - " True\n", - " 768\n", - " [0: 5e-05, 49: 0.0005, 50: 0.0005, 200: 1e-06]\n", + " -\n", + " -\n", + " [0: 1e-05, 49: 0.0005, 50: 0.0005, 200: 1e-05]\n", " {'class': 'adam', 'epsilon': 1e-09}\n", " \n", " \n", - " 
/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/tts_architecture/glow_tts/raw_audio/glowTTS/enc256/not_silence_preprocessed/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer\n", - " 97.9\n", - " 1.694847\n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/tts_architecture/glow_tts/raw_audio/glowTTS_nar_taco_encoder/silence_preprocessed/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer\n", + " 25.9\n", + " 3.391651\n", + " NaN\n", " 200\n", " 0.00\n", - " False\n", - " 256\n", - " [0: 5e-05, 49: 0.0005, 50: 0.0005, 200: 1e-06]\n", + " -\n", + " -\n", + " [0: 1e-05, 49: 0.0005, 50: 0.0005, 200: 1e-05]\n", " {'class': 'adam', 'epsilon': 1e-09}\n", " \n", " \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/tts_architecture/glow_tts/raw_audio/glowTTS_simple_encoder/silence_preprocessed/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer\n", - " 56.4\n", - " 3.333580\n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/tts_architecture/glow_tts/raw_audio/glowTTS_nar_taco_encoder_no_blstm/silence_preprocessed/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer\n", + " 105.6\n", + " 3.165388\n", + " NaN\n", " 100\n", " 0.00\n", " -\n", " -\n", - " [0: 0.0001, 49: 0.0005, 50: 0.0005, 100: 1e-06]\n", + " [0: 0.0001, 49: 0.0005, 50: 0.0001, 100: 1e-06]\n", " {'class': 'radam', 'epsilon': 1e-09}\n", " \n", " \n", "\n", - "

89 rows × 8 columns

\n", + "

91 rows × 9 columns

\n", "" ], "text/plain": [ - " sWER \\\n", - "Group Experiment \n", - "joint_training/conformer_coupling /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 5.2 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 20.9 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 97.7 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 99.5 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 95.7 \n", - "... ... \n", - "tts_architecture /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 15.3 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 15.5 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 16.9 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 97.9 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 56.4 \n", + " sWER \\\n", + "Group Experiment \n", + "joint_training/conformer_coupling /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 12.4 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 20.9 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 5.2 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 14.7 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 14.4 \n", + "... ... \n", + "tts_architecture /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 20.9 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 95.4 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 25.2 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 25.9 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 105.6 \n", "\n", " autoMOS \\\n", "Group Experiment \n", - "joint_training/conformer_coupling /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 2.136262 \n", + "joint_training/conformer_coupling /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 2.936271 \n", " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 2.449569 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 1.611414 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 
1.933423 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 1.824994 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 2.136262 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 2.588368 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 2.718031 \n", "... ... \n", - "tts_architecture /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 3.404880 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 3.469905 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 3.515657 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 1.694847 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 3.333580 \n", + "tts_architecture /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 3.312732 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 2.633865 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 3.463099 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 3.391651 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 3.165388 \n", + "\n", + " autoMOS confidence \\\n", + "Group Experiment \n", + "joint_training/conformer_coupling /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... NaN \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... NaN \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... NaN \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... NaN \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... NaN \n", + "... ... \n", + "tts_architecture /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... NaN \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... NaN \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... NaN \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... NaN \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... NaN \n", "\n", " num_epochs \\\n", "Group Experiment \n", - "joint_training/conformer_coupling /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 250 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 
250 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 250 \n", + "joint_training/conformer_coupling /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 100 \n", " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 250 \n", " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 250 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 100 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 100 \n", "... ... \n", - "tts_architecture /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 200 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 200 \n", + "tts_architecture /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 100 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 100 \n", " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 200 \n", " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 200 \n", " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 100 \n", @@ -616,10 +574,10 @@ " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... False \n", "... ... \n", "tts_architecture /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... False \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... True \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... True \n", " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... False \n", " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... - \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... - \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... - \n", "\n", " encoder channels \\\n", "Group Experiment \n", @@ -630,31 +588,31 @@ " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 768 \n", "... ... \n", "tts_architecture /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 768 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 768 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 768 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 256 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 
192 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... - \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... - \n", " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... - \n", "\n", " LR \\\n", "Group Experiment \n", - "joint_training/conformer_coupling /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] \n", + "joint_training/conformer_coupling /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] \n", " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] \n", " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] \n", "... ... \n", - "tts_architecture /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... [0: 5e-05, 49: 0.0005, 50: 0.0005, 200: 1e-06] \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... [0: 5e-05, 49: 0.0005, 50: 0.0005, 200: 1e-06] \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... [0: 5e-05, 49: 0.0005, 50: 0.0005, 200: 1e-06] \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... [0: 5e-05, 49: 0.0005, 50: 0.0005, 200: 1e-06] \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... [0: 0.0001, 49: 0.0005, 50: 0.0005, 100: 1e-06] \n", + "tts_architecture /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 
[0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... [0: 1e-05, 49: 0.0005, 50: 0.0005, 200: 1e-05] \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... [0: 1e-05, 49: 0.0005, 50: 0.0005, 200: 1e-05] \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... [0: 0.0001, 49: 0.0005, 50: 0.0001, 100: 1e-06] \n", "\n", " Optimizer \n", "Group Experiment \n", "joint_training/conformer_coupling /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... {'class': 'adam', 'epsilon': 1e-08} \n", " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... {'class': 'adam', 'epsilon': 1e-08} \n", " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... {'class': 'adam', 'epsilon': 1e-08} \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... {'class': 'adam', 'epsilon': 1e-08} \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... {'class': 'radam', 'epsilon': 1e-09} \n", " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... {'class': 'adam', 'epsilon': 1e-08} \n", "... ... \n", "tts_architecture /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... {'class': 'adam', 'epsilon': 1e-09} \n", @@ -663,10 +621,10 @@ " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... {'class': 'adam', 'epsilon': 1e-09} \n", " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 
{'class': 'radam', 'epsilon': 1e-09} \n", "\n", - "[89 rows x 8 columns]" + "[91 rows x 9 columns]" ] }, - "execution_count": 40, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } @@ -676,14 +634,14 @@ "\n", "index = pd.MultiIndex.from_arrays([experiment_group, wer_files], names=(\"Group\", \"Experiment\"))\n", "\n", - "df = pd.DataFrame({\"sWER\": wers, \"autoMOS\": nisqas, \"num_epochs\": num_epochs, \"decoder dropout\": dec_dropout, \"mean only\": mean_only, \"encoder channels\": encoder_channels, \"LR\": lr, \"Optimizer\": optimizer}, index=index)\n", + "df = pd.DataFrame({\"sWER\": wers, \"autoMOS\": nisqas, \"autoMOS confidence\": nisqa_confidence, \"num_epochs\": num_epochs, \"decoder dropout\": dec_dropout, \"mean only\": mean_only, \"encoder channels\": encoder_channels, \"LR\": lr, \"Optimizer\": optimizer}, index=index)\n", "\n", "df" ] }, { "cell_type": "code", - "execution_count": 41, + "execution_count": 17, "metadata": {}, "outputs": [ { @@ -710,6 +668,7 @@ " \n", " sWER\n", " autoMOS\n", + " autoMOS confidence\n", " num_epochs\n", " decoder dropout\n", " mean only\n", @@ -742,32 +701,35 @@ " \n", " \n", " \n", + " \n", " \n", " \n", " \n", " \n", " joint_training/conformer_coupling\n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glowTTS_ASR_conformer_x_vector_v2/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer\n", - " 5.2\n", - " 2.136262\n", - " 250\n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glowTTS_100ep_pe1/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer\n", + " 12.4\n", + " 2.936271\n", + " NaN\n", + " 100\n", " 0.05\n", " False\n", " 768\n", - " [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08]\n", + " [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05]\n", " {'class': 'adam', 'epsilon': 1e-08}\n", - 
" -0.453089\n", - " -0.401680\n", - " -0.465325\n", - " 0.246206\n", - " 0.344511\n", - " 0.229336\n", - " True\n", + " -0.826781\n", + " -0.846025\n", + " NaN\n", + " 0.382039\n", + " 0.353906\n", + " NaN\n", + " False\n", " \n", " \n", " /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glowTTS_ASR_conformer_two_forward_pass/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer\n", " 20.9\n", " 2.449569\n", + " NaN\n", " 250\n", " 0.05\n", " False\n", @@ -783,58 +745,61 @@ " True\n", " \n", " \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/ddi/glowTTS/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer\n", - " 97.7\n", - " 1.611414\n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glowTTS_ASR_conformer_x_vector_v2/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer\n", + " 5.2\n", + " 2.136262\n", + " NaN\n", " 250\n", " 0.05\n", " False\n", " 768\n", " [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08]\n", " {'class': 'adam', 'epsilon': 1e-08}\n", - " -0.824271\n", - " -0.830326\n", - " NaN\n", - " 0.964433\n", - " 0.725438\n", - " NaN\n", - " False\n", + " -0.453089\n", + " -0.401680\n", + " -0.465325\n", + " 0.246206\n", + " 0.344511\n", + " 0.229336\n", + " True\n", " \n", " \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/ddi/glowTTS_x_vector/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer\n", - " 99.5\n", - " 1.933423\n", - " 250\n", + " 
/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/ddi/glowTTS_100ep_pe1_radam1e-9/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer\n", + " 14.7\n", + " 2.588368\n", + " NaN\n", + " 100\n", " 0.05\n", " False\n", " 768\n", - " [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08]\n", - " {'class': 'adam', 'epsilon': 1e-08}\n", - " -0.809305\n", - " -0.755239\n", - " -0.813211\n", - " 1.001979\n", - " 1.077792\n", - " 1.042415\n", + " [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05]\n", + " {'class': 'radam', 'epsilon': 1e-09}\n", + " -0.825730\n", + " -0.844433\n", + " NaN\n", + " 0.384814\n", + " 0.355806\n", + " NaN\n", " False\n", " \n", " \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/ddi/glowTTS_ASR_conformer_two_forward_pass/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer\n", - " 95.7\n", - " 1.824994\n", - " 250\n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/ddi/glowTTS_100ep_pe1/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer\n", + " 14.4\n", + " 2.718031\n", + " NaN\n", + " 100\n", " 0.05\n", " False\n", " 768\n", - " [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08]\n", + " [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05]\n", " {'class': 'adam', 'epsilon': 1e-08}\n", - " -0.657168\n", + " -0.825686\n", + " -0.844577\n", " NaN\n", - " -0.670671\n", - " 1.004604\n", + " 0.389056\n", + " 0.360854\n", " NaN\n", - " 2.091343\n", - " True\n", + " False\n", " \n", " \n", "\n", @@ -843,27 +808,35 @@ "text/plain": [ " sWER \\\n", "Group Experiment \n", - "joint_training/conformer_coupling /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 5.2 \n", + "joint_training/conformer_coupling /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 
12.4 \n", " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 20.9 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 97.7 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 99.5 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 95.7 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 5.2 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 14.7 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 14.4 \n", "\n", " autoMOS \\\n", "Group Experiment \n", - "joint_training/conformer_coupling /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 2.136262 \n", + "joint_training/conformer_coupling /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 2.936271 \n", " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 2.449569 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 1.611414 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 1.933423 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 1.824994 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 2.136262 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 2.588368 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 2.718031 \n", + "\n", + " autoMOS confidence \\\n", + "Group Experiment \n", + "joint_training/conformer_coupling /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... NaN \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... NaN \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... NaN \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... NaN \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... NaN \n", "\n", " num_epochs \\\n", "Group Experiment \n", - "joint_training/conformer_coupling /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 250 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 250 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 250 \n", + "joint_training/conformer_coupling /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 
100 \n", " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 250 \n", " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 250 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 100 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 100 \n", "\n", " decoder dropout \\\n", "Group Experiment \n", @@ -891,78 +864,78 @@ "\n", " LR \\\n", "Group Experiment \n", - "joint_training/conformer_coupling /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] \n", + "joint_training/conformer_coupling /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] \n", " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] \n", " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] \n", "\n", - " Optimizer \\\n", - "Group Experiment \n", - "joint_training/conformer_coupling /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... {'class': 'adam', 'epsilon': 1e-08} \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... {'class': 'adam', 'epsilon': 1e-08} \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... {'class': 'adam', 'epsilon': 1e-08} \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... {'class': 'adam', 'epsilon': 1e-08} \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 
{'class': 'adam', 'epsilon': 1e-08} \n", + " Optimizer \\\n", + "Group Experiment \n", + "joint_training/conformer_coupling /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... {'class': 'adam', 'epsilon': 1e-08} \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... {'class': 'adam', 'epsilon': 1e-08} \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... {'class': 'adam', 'epsilon': 1e-08} \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... {'class': 'radam', 'epsilon': 1e-09} \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... {'class': 'adam', 'epsilon': 1e-08} \n", "\n", " MLE \\\n", "Group Experiment \n", - "joint_training/conformer_coupling /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... -0.453089 \n", + "joint_training/conformer_coupling /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... -0.826781 \n", " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... -0.672552 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... -0.824271 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... -0.809305 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... -0.657168 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... -0.453089 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... -0.825730 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... -0.825686 \n", "\n", " dev MLE \\\n", "Group Experiment \n", - "joint_training/conformer_coupling /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... -0.401680 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... NaN \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... -0.830326 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... -0.755239 \n", + "joint_training/conformer_coupling /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... -0.846025 \n", " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... NaN \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... -0.401680 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 
-0.844433 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... -0.844577 \n", "\n", " devtrain MLE \\\n", "Group Experiment \n", - "joint_training/conformer_coupling /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... -0.465325 \n", + "joint_training/conformer_coupling /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... NaN \n", " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... -0.692025 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... -0.465325 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... NaN \n", " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... NaN \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... -0.813211 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... -0.670671 \n", "\n", " DP loss \\\n", "Group Experiment \n", - "joint_training/conformer_coupling /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 0.246206 \n", + "joint_training/conformer_coupling /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 0.382039 \n", " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 0.444391 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 0.964433 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 1.001979 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 1.004604 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 0.246206 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 0.384814 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 0.389056 \n", "\n", " DP dev loss \\\n", "Group Experiment \n", - "joint_training/conformer_coupling /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 0.344511 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... NaN \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 0.725438 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 1.077792 \n", + "joint_training/conformer_coupling /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 0.353906 \n", " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 
NaN \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 0.344511 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 0.355806 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 0.360854 \n", "\n", " DP devtrain loss \\\n", "Group Experiment \n", - "joint_training/conformer_coupling /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 0.229336 \n", + "joint_training/conformer_coupling /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... NaN \n", " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 0.412498 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 0.229336 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... NaN \n", " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... NaN \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 1.042415 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 2.091343 \n", "\n", " Joint \n", "Group Experiment \n", - "joint_training/conformer_coupling /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... True \n", + "joint_training/conformer_coupling /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... False \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... True \n", " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... True \n", " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... False \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... False \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... True " + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 
False " ] }, - "execution_count": 41, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } @@ -1023,7 +996,7 @@ }, { "cell_type": "code", - "execution_count": 42, + "execution_count": 18, "metadata": {}, "outputs": [], "source": [ @@ -1035,62 +1008,64 @@ }, { "cell_type": "code", - "execution_count": 43, + "execution_count": 19, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "| | Group | Experiment | sWER | autoMOS | num_epochs | decoder dropout | mean only | encoder channels | LR | Optimizer | MLE | dev MLE | devtrain MLE | DP loss | DP dev loss | DP devtrain loss | Joint |\n", - "|---:|:----------------------------------|:------------------------------------------------------------------------------------------------------------------------|-------:|----------:|-------------:|------------------:|:------------|:-------------------|:-------------------------------------------------|:-------------------------------------|------:|----------:|---------------:|----------:|--------------:|-------------------:|:--------|\n", - "| 2 | joint_training/conformer_coupling | joint_training/conformer_coupling/raw_audio/ddi/glowTTS/ | 97.7 | 1.61 | 250 | 0.05 | False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | -0.82 | -0.83 | nan | 0.96 | 0.73 | nan | False |\n", - "| 3 | joint_training/conformer_coupling | joint_training/conformer_coupling/raw_audio/ddi/glowTTS_x_vector/ | 99.5 | 1.93 | 250 | 0.05 | False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | -0.81 | -0.76 | -0.81 | 1 | 1.08 | 1.04 | False |\n", - "| 5 | joint_training/conformer_coupling | joint_training/conformer_coupling/raw_audio/ddi/glowTTS_100ep_pe1/ | 14.4 | 2.72 | 100 | 0.05 | False | 768 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | {'class': 'adam', 'epsilon': 1e-08} | -0.83 | -0.84 | nan | 0.39 | 0.36 | nan | False |\n", - "| 7 | 
joint_training/conformer_coupling | joint_training/conformer_coupling/raw_audio/ddi/glowTTS_100ep_pe1_radam1e-9/ | 14.7 | 2.59 | 100 | 0.05 | False | 768 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.83 | -0.84 | nan | 0.38 | 0.36 | nan | False |\n", - "| 22 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2_multiscale/enc768/200ep/dec_drop_0.05/ | 100 | 1.25 | 200 | 0.05 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.79 | -0.8 | -0.81 | 0.53 | 0.57 | 0.52 | False |\n", - "| 23 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector/enc768/100ep/dec_drop_0.05/ | 97.9 | 1.51 | 100 | 0.05 | False | 768 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.78 | -0.79 | -0.79 | 0.99 | 1.38 | 1.38 | False |\n", - "| 24 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2_logdist_loss_grad_clip_10/ed_scale_0.1/ | 95.7 | 3.21 | 200 | 0.05 | False | 192 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.8 | -0.79 | -0.79 | 1.08 | 1.81 | 1.83 | False |\n", - "| 25 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2_logdist_loss_grad_clip_10/ed_scale_1.0/ | 98.8 | 2.27 | 200 | 0.05 | False | 192 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.74 | -0.76 | -0.76 | 1.13 | 1.54 | 1.55 | False |\n", - "| 26 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS_simple_encoder/12cb/200ep/dec_drop_0.05/ | 70.3 | 3.47 | 200 | 0.05 | False | - | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.8 | -0.82 | -0.83 | 0.43 | 0.5 | 0.4 | False |\n", - "| 27 
| joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS_simple_encoder/20cb/200ep/dec_drop_0.05/ | 57.3 | 3.46 | 200 | 0.05 | False | - | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.83 | -0.85 | -0.86 | 0.4 | 0.47 | 0.38 | False |\n", - "| 28 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS_batch_norm/enc768/200ep/dec_drop_0.05/ | 15.1 | 3.4 | 200 | 0.05 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.81 | -0.82 | -0.83 | 0.39 | 0.45 | 0.36 | False |\n", - "| 29 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2_conformer_coupling/enc768/200ep/dec_drop_0.05/ | 9.6 | 1.79 | 200 | 0.05 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.84 | -0.85 | -0.86 | 0.37 | 0.42 | 0.34 | False |\n", - "| 30 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc192/100ep/dec_drop_0.05/ | 98.1 | 1.52 | 100 | 0.05 | False | 192 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.8 | -0.8 | -0.81 | 0.99 | 0.78 | 0.78 | False |\n", - "| 31 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc192/200ep/dec_drop_0.05_epsilon_1e-8/ | 96.7 | 2.27 | 200 | 0.05 | False | 192 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-08} | -0.81 | -0.82 | -0.82 | 0.98 | 0.81 | 0.83 | False |\n", - "| 32 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc192/200ep/dec_drop_0.05/ | 98.8 | 1.88 | 200 | 0.05 | False | 192 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.81 | -0.82 | -0.82 | 0.99 | 0.62 | 0.62 | False |\n", - "| 33 | 
joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc768/200ep/dec_drop_0.05_epsilon_1e-8/ | 15.7 | 3.35 | 200 | 0.05 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-08} | -0.81 | -0.83 | -0.83 | 0.39 | 0.44 | 0.35 | False |\n", - "| 34 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc768/200ep/dec_drop_0.0_epsilon_1e-8/ | 95.9 | 3.06 | 200 | 0 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-08} | -0.83 | -0.75 | -0.81 | 0.98 | 0.45 | 0.46 | False |\n", - "| 35 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc768/200ep/dec_drop_0.05/ | 15.1 | 3.36 | 200 | 0.05 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.81 | -0.83 | -0.83 | 0.39 | 0.44 | 0.35 | False |\n", - "| 36 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc768/200ep/dec_drop_0.0/epsilon_1e-8/ | 95.9 | 3.06 | 200 | 0 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-08} | -0.83 | -0.75 | -0.81 | 0.98 | 0.45 | 0.46 | False |\n", - "| 37 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc768/400ep/grad_clip_10/dec_drop_0.05/ | 14.4 | 3.29 | 400 | 0.05 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 400: 1e-06] | {'class': 'radam', 'epsilon': 1e-09} | -0.82 | -0.84 | -0.85 | 0.37 | 0.44 | 0.33 | False |\n", - "| 38 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc768/100ep/dec_drop_0.05/ | 17.2 | 3.3 | 100 | 0.05 | False | 768 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.8 | -0.81 | -0.82 | 0.4 | 0.43 | 0.37 | False |\n", - "| 39 | 
joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc768/100ep/dec_drop_0.00/ | 16.3 | 3.27 | 100 | 0 | False | 768 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.83 | -0.82 | -0.83 | 0.41 | 0.44 | 0.38 | False |\n", - "| 40 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2_logdist_loss/ed_scale_0.1/ | 95.8 | 3.09 | 200 | 0.05 | False | 192 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.78 | -0.78 | -0.78 | 1.08 | 2.07 | 2.08 | False |\n", - "| 41 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2/enc192/200ep/dec_drop_0.0/ | 95.4 | 2.41 | 200 | 0 | False | 192 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.84 | -0.82 | -0.83 | 0.97 | 1.15 | 1.18 | False |\n", - "| 42 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2/enc192/200ep/dec_drop_0.05/ | 13.3 | 3.3 | 200 | 0.05 | False | 192 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.81 | -0.82 | -0.83 | 0.38 | 0.41 | 0.34 | False |\n", - "| 43 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2/enc768/100ep/dec_drop_0.05/ | 16.3 | 3.18 | 100 | 0.05 | False | 768 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.8 | -0.81 | -0.82 | 0.41 | 0.44 | 0.38 | False |\n", - "| 44 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2/enc768/200ep/dec_drop_0.0/ | 96.2 | 2.29 | 200 | 0 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.81 | -0.8 | -0.8 | 0.98 | 0.87 | 0.88 | False |\n", - "| 45 | 
joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2/enc768/200ep/dec_drop_0.05/ | 14.6 | 3.26 | 200 | 0.05 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.81 | -0.82 | -0.83 | 0.39 | 0.44 | 0.36 | False |\n", - "| 46 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2/enc768/200ep_long_cooldown/dec_drop_0.0/ | 16.7 | 3.32 | 200 | 0 | False | 768 | [0: 1e-06, 49: 0.0005, 50: 0.0005, 200: 1e-06] | {'class': 'radam', 'epsilon': 1e-09} | -0.85 | -0.76 | -0.85 | 0.41 | 0.47 | 0.38 | False |\n", - "| 47 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2/enc768/200ep_long_cooldown/dec_drop_0.05/ | 15.6 | 3.21 | 200 | 0.05 | False | 768 | [0: 1e-06, 49: 0.0005, 50: 0.0005, 200: 1e-06] | {'class': 'radam', 'epsilon': 1e-09} | -0.81 | -0.82 | -0.83 | 0.39 | 0.45 | 0.36 | False |\n", - "| 48 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2/enc768/400ep/dec_drop_0.05/ | 12.9 | 3.33 | 400 | 0.05 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 400: 1e-06] | {'class': 'radam', 'epsilon': 1e-09} | -0.82 | -0.83 | -0.84 | 0.38 | 0.46 | 0.34 | False |\n", - "| 73 | tts_architecture | tts_architecture/glow_tts/raw_audio/glowTTS_x_vector/enc192/100ep/not_silence_preprocessed/ | 95.4 | 2.63 | 100 | 0 | False | 192 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | {'class': 'adam', 'epsilon': 1e-09} | -0.81 | -0.79 | nan | 0.97 | 1.27 | nan | False |\n", - "| 74 | tts_architecture | tts_architecture/glow_tts/raw_audio/glowTTS_x_vector/enc768/100ep/not_silence_preprocessed/ | 20.9 | 3.31 | 100 | 0 | False | 768 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | {'class': 'adam', 'epsilon': 1e-09} | -0.81 | -0.8 | nan | 0.46 | 0.47 | nan | False |\n", - "| 75 | tts_architecture | 
tts_architecture/glow_tts/raw_audio/glowTTS_nar_taco_encoder/not_silence_preprocessed/ | 25.2 | 3.46 | 200 | 0 | - | - | [0: 1e-05, 49: 0.0005, 50: 0.0005, 200: 1e-05] | {'class': 'adam', 'epsilon': 1e-09} | -0.84 | -0.71 | nan | 0.41 | 0.46 | nan | False |\n", - "| 76 | tts_architecture | tts_architecture/glow_tts/raw_audio/glowTTS_nar_taco_encoder/silence_preprocessed/ | 25.9 | 3.39 | 200 | 0 | - | - | [0: 1e-05, 49: 0.0005, 50: 0.0005, 200: 1e-05] | {'class': 'adam', 'epsilon': 1e-09} | -0.83 | -0.81 | nan | 0.39 | 0.42 | nan | False |\n", - "| 77 | tts_architecture | tts_architecture/glow_tts/raw_audio/glowTTS_nar_taco_encoder_no_blstm/silence_preprocessed/ | 105.6 | 3.17 | 100 | 0 | - | - | [0: 0.0001, 49: 0.0005, 50: 0.0001, 100: 1e-06] | {'class': 'radam', 'epsilon': 1e-09} | -0.8 | -0.79 | nan | 0.88 | 0.87 | nan | False |\n", - "| 78 | tts_architecture | tts_architecture/glow_tts/raw_audio/glowTTS_ddi_actnorm/enc192/100ep/not_silence_preprocessed/LR_scheduled/ | 100 | 3.12 | 100 | 0 | False | 192 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | {'class': 'adam', 'epsilon': 1e-09} | -0.84 | -0.8 | nan | 0.42 | 0.44 | nan | False |\n", - "| 79 | tts_architecture | tts_architecture/glow_tts/raw_audio/glowTTS/enc192/200ep/long_cooldown/not_silence_preprocessed/ | 14.7 | 3.29 | 200 | 0 | False | 192 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 200: 1e-07] | {'class': 'adam', 'epsilon': 1e-08} | -0.84 | -0.69 | nan | 0.41 | 0.43 | nan | False |\n", - "| 80 | tts_architecture | tts_architecture/glow_tts/raw_audio/glowTTS/enc192/200ep/long_cooldown/silence_preprocessed/ | 13.5 | 3.28 | 200 | 0 | False | 192 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 200: 1e-07] | {'class': 'adam', 'epsilon': 1e-08} | -0.82 | -0.81 | nan | 0.38 | 0.39 | nan | False |\n", - "| 81 | tts_architecture | tts_architecture/glow_tts/raw_audio/glowTTS/enc192/100ep/not_silence_preprocessed/ | 14.3 | 3.31 | 100 | 0 | False | 192 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | {'class': 'adam', 
'epsilon': 1e-09} | -0.84 | -0.77 | nan | 0.41 | 0.42 | nan | False |\n", - "| 82 | tts_architecture | tts_architecture/glow_tts/raw_audio/glowTTS/enc192/100ep/silence_preprocessed/ | 13.5 | 3.3 | 100 | 0 | False | 192 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | {'class': 'adam', 'epsilon': 1e-09} | -0.82 | -0.81 | nan | 0.37 | 0.39 | nan | False |\n", - "| 83 | tts_architecture | tts_architecture/glow_tts/raw_audio/glowTTS/enc768/with_sigma/not_silence_preprocessed/ | 15.7 | 3.35 | 200 | 0 | False | 768 | [0: 5e-05, 49: 0.0005, 50: 0.0005, 200: 1e-06] | {'class': 'adam', 'epsilon': 1e-09} | -0.85 | -0.83 | nan | 0.4 | 0.45 | nan | False |\n", - "| 84 | tts_architecture | tts_architecture/glow_tts/raw_audio/glowTTS/enc768/with_sigma/silence_preprocessed/ | 15.3 | 3.4 | 200 | 0 | False | 768 | [0: 5e-05, 49: 0.0005, 50: 0.0005, 200: 1e-06] | {'class': 'adam', 'epsilon': 1e-09} | -0.74 | -0.46 | nan | 0.37 | 0.41 | nan | False |\n", - "| 85 | tts_architecture | tts_architecture/glow_tts/raw_audio/glowTTS/enc768/mean_only/not_silence_preprocessed/ | 15.5 | 3.47 | 200 | 0 | True | 768 | [0: 5e-05, 49: 0.0005, 50: 0.0005, 200: 1e-06] | {'class': 'adam', 'epsilon': 1e-09} | -0.85 | -0.76 | nan | 0.4 | 0.45 | nan | False |\n", - "| 86 | tts_architecture | tts_architecture/glow_tts/raw_audio/glowTTS/enc768/mean_only/silence_preprocessed/ | 16.9 | 3.52 | 200 | 0 | True | 768 | [0: 5e-05, 49: 0.0005, 50: 0.0005, 200: 1e-06] | {'class': 'adam', 'epsilon': 1e-09} | -0.74 | -0.71 | nan | 0.37 | 0.41 | nan | False |\n", - "| 87 | tts_architecture | tts_architecture/glow_tts/raw_audio/glowTTS/enc256/not_silence_preprocessed/ | 97.9 | 1.69 | 200 | 0 | False | 256 | [0: 5e-05, 49: 0.0005, 50: 0.0005, 200: 1e-06] | {'class': 'adam', 'epsilon': 1e-09} | -0.85 | -0.54 | nan | 0.96 | 0.5 | nan | False |\n", - "| 88 | tts_architecture | tts_architecture/glow_tts/raw_audio/glowTTS_simple_encoder/silence_preprocessed/ | 56.4 | 3.33 | 100 | 0 | - | - | [0: 0.0001, 49: 0.0005, 50: 
0.0005, 100: 1e-06] | {'class': 'radam', 'epsilon': 1e-09} | -0.84 | -0.82 | nan | 0.38 | 0.43 | nan | False |\n" + "| | Group | Experiment | sWER | autoMOS | autoMOS confidence | num_epochs | decoder dropout | mean only | encoder channels | LR | Optimizer | MLE | dev MLE | devtrain MLE | DP loss | DP dev loss | DP devtrain loss | Joint |\n", + "|---:|:----------------------------------|:------------------------------------------------------------------------------------------------------------------------|-------:|----------:|---------------------:|-------------:|------------------:|:------------|:-------------------|:-------------------------------------------------|:-------------------------------------|------:|----------:|---------------:|----------:|--------------:|-------------------:|:--------|\n", + "| 0 | joint_training/conformer_coupling | joint_training/conformer_coupling/raw_audio/no_ddi/glowTTS_100ep_pe1/ | 12.4 | 2.94 | nan | 100 | 0.05 | False | 768 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | {'class': 'adam', 'epsilon': 1e-08} | -0.83 | -0.85 | nan | 0.38 | 0.35 | nan | False |\n", + "| 3 | joint_training/conformer_coupling | joint_training/conformer_coupling/raw_audio/ddi/glowTTS_100ep_pe1_radam1e-9/ | 14.7 | 2.59 | nan | 100 | 0.05 | False | 768 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.83 | -0.84 | nan | 0.38 | 0.36 | nan | False |\n", + "| 4 | joint_training/conformer_coupling | joint_training/conformer_coupling/raw_audio/ddi/glowTTS_100ep_pe1/ | 14.4 | 2.72 | nan | 100 | 0.05 | False | 768 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | {'class': 'adam', 'epsilon': 1e-08} | -0.83 | -0.84 | nan | 0.39 | 0.36 | nan | False |\n", + "| 5 | joint_training/conformer_coupling | joint_training/conformer_coupling/raw_audio/ddi/glowTTS/ | 97.7 | 1.61 | nan | 250 | 0.05 | False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | -0.82 | -0.83 | nan | 
0.96 | 0.73 | nan | False |\n", + "| 7 | joint_training/conformer_coupling | joint_training/conformer_coupling/raw_audio/ddi/glowTTS_x_vector/ | 99.5 | 1.93 | nan | 250 | 0.05 | False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | -0.81 | -0.76 | -0.81 | 1 | 1.08 | 1.04 | False |\n", + "| 23 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2/enc768/400ep/dec_drop_0.05/ | 12.9 | 3.33 | 0.02 | 400 | 0.05 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 400: 1e-06] | {'class': 'radam', 'epsilon': 1e-09} | -0.82 | -0.83 | -0.84 | 0.38 | 0.46 | 0.34 | False |\n", + "| 24 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2/enc768/200ep/dec_drop_0.0/ | 96.2 | 2.29 | nan | 200 | 0 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.81 | -0.8 | -0.8 | 0.98 | 0.87 | 0.88 | False |\n", + "| 25 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2/enc768/200ep/dec_drop_0.05/ | 14.6 | 3.26 | 0.02 | 200 | 0.05 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.81 | -0.82 | -0.83 | 0.39 | 0.44 | 0.36 | False |\n", + "| 26 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2/enc768/100ep/dec_drop_0.05/ | 16.3 | 3.18 | 0.02 | 100 | 0.05 | False | 768 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.8 | -0.81 | -0.82 | 0.41 | 0.44 | 0.38 | False |\n", + "| 27 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2/enc768/200ep_long_cooldown/dec_drop_0.0/ | 16.7 | 3.32 | nan | 200 | 0 | False | 768 | [0: 1e-06, 49: 0.0005, 50: 0.0005, 200: 1e-06] | {'class': 'radam', 'epsilon': 1e-09} | -0.85 | -0.76 | 
-0.85 | 0.41 | 0.47 | 0.38 | False |\n", + "| 28 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2/enc768/200ep_long_cooldown/dec_drop_0.05/ | 15.6 | 3.21 | nan | 200 | 0.05 | False | 768 | [0: 1e-06, 49: 0.0005, 50: 0.0005, 200: 1e-06] | {'class': 'radam', 'epsilon': 1e-09} | -0.81 | -0.82 | -0.83 | 0.39 | 0.45 | 0.36 | False |\n", + "| 29 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2/enc192/200ep/dec_drop_0.05/ | 13.3 | 3.3 | nan | 200 | 0.05 | False | 192 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.81 | -0.82 | -0.83 | 0.38 | 0.41 | 0.34 | False |\n", + "| 30 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2/enc192/200ep/dec_drop_0.0/ | 95.4 | 2.41 | nan | 200 | 0 | False | 192 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.84 | -0.82 | -0.83 | 0.97 | 1.15 | 1.18 | False |\n", + "| 31 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc192/200ep/dec_drop_0.05_epsilon_1e-8/ | 96.7 | 2.27 | nan | 200 | 0.05 | False | 192 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-08} | -0.81 | -0.82 | -0.82 | 0.98 | 0.81 | 0.83 | False |\n", + "| 32 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc192/200ep/dec_drop_0.05/ | 98.8 | 1.88 | nan | 200 | 0.05 | False | 192 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.81 | -0.82 | -0.82 | 0.99 | 0.62 | 0.62 | False |\n", + "| 33 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc192/200ep/dec_drop_0.0/grad_clip_10/ | 12.8 | 3.34 | nan | 200 | 0 | False | 192 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 
'epsilon': 1e-09} | -0.85 | -0.77 | -0.85 | 0.39 | 0.43 | 0.36 | False |\n", + "| 34 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc192/100ep/dec_drop_0.05/ | 98.1 | 1.52 | nan | 100 | 0.05 | False | 192 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.8 | -0.8 | -0.81 | 0.99 | 0.78 | 0.78 | False |\n", + "| 35 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc768/400ep/grad_clip_10/dec_drop_0.05/ | 14.4 | 3.29 | 0.02 | 400 | 0.05 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 400: 1e-06] | {'class': 'radam', 'epsilon': 1e-09} | -0.82 | -0.84 | -0.85 | 0.37 | 0.44 | 0.33 | False |\n", + "| 36 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc768/200ep/dec_drop_0.0/epsilon_1e-8/ | 95.9 | 3.06 | nan | 200 | 0 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-08} | -0.83 | -0.75 | -0.81 | 0.98 | 0.45 | 0.46 | False |\n", + "| 37 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc768/200ep/dec_drop_0.0/grad_clip_10/ | 14.5 | 3.31 | nan | 200 | 0 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.85 | -0.84 | -0.85 | 0.4 | 0.45 | 0.36 | False |\n", + "| 38 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc768/200ep/dec_drop_0.05_epsilon_1e-8/ | 15.7 | 3.35 | nan | 200 | 0.05 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-08} | -0.81 | -0.83 | -0.83 | 0.39 | 0.44 | 0.35 | False |\n", + "| 39 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc768/200ep/dec_drop_0.05/ | 15.1 | 3.36 | 0.02 | 200 | 0.05 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | 
{'class': 'radam', 'epsilon': 1e-09} | -0.81 | -0.83 | -0.83 | 0.39 | 0.44 | 0.35 | False |\n", + "| 40 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc768/100ep/dec_drop_0.00/ | 16.3 | 3.27 | nan | 100 | 0 | False | 768 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.83 | -0.82 | -0.83 | 0.41 | 0.44 | 0.38 | False |\n", + "| 41 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc768/100ep/dec_drop_0.05/ | 17.2 | 3.3 | nan | 100 | 0.05 | False | 768 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.8 | -0.81 | -0.82 | 0.4 | 0.43 | 0.37 | False |\n", + "| 42 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2_logdist_loss/ed_scale_0.1/ | 95.8 | 3.09 | nan | 200 | 0.05 | False | 192 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.78 | -0.78 | -0.78 | 1.08 | 2.07 | 2.08 | False |\n", + "| 43 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2_logdist_loss_grad_clip_10/ed_scale_1.0/ | 98.8 | 2.27 | nan | 200 | 0.05 | False | 192 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.74 | -0.76 | -0.76 | 1.13 | 1.54 | 1.55 | False |\n", + "| 44 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2_logdist_loss_grad_clip_10/ed_scale_0.1/ | 95.7 | 3.21 | nan | 200 | 0.05 | False | 192 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.8 | -0.79 | -0.79 | 1.08 | 1.81 | 1.83 | False |\n", + "| 45 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS_simple_encoder/12cb/200ep/dec_drop_0.05/ | 70.3 | 3.47 | nan | 200 | 0.05 | False | - | [0: 1e-05, 99: 
0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.8 | -0.82 | -0.83 | 0.43 | 0.5 | 0.4 | False |\n", + "| 46 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS_simple_encoder/20cb/200ep/dec_drop_0.05/ | 57.3 | 3.46 | nan | 200 | 0.05 | False | - | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.83 | -0.85 | -0.86 | 0.4 | 0.47 | 0.38 | False |\n", + "| 47 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2_conformer_coupling/enc768/200ep/dec_drop_0.05/ | 9.6 | 1.79 | 0.03 | 200 | 0.05 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.84 | -0.85 | -0.86 | 0.37 | 0.42 | 0.34 | False |\n", + "| 48 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2_multiscale/enc768/200ep/dec_drop_0.05/ | 100 | 1.25 | nan | 200 | 0.05 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.79 | -0.8 | -0.81 | 0.53 | 0.57 | 0.52 | False |\n", + "| 49 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS_batch_norm/enc768/200ep/dec_drop_0.05/ | 15.1 | 3.4 | nan | 200 | 0.05 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.81 | -0.82 | -0.83 | 0.39 | 0.45 | 0.36 | False |\n", + "| 50 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector/enc768/100ep/dec_drop_0.05/ | 97.9 | 1.51 | nan | 100 | 0.05 | False | 768 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.78 | -0.79 | -0.79 | 0.99 | 1.38 | 1.38 | False |\n", + "| 75 | tts_architecture | tts_architecture/glow_tts/raw_audio/glowTTS/enc256/not_silence_preprocessed/ | 97.9 | 1.69 | nan | 200 | 0 | False | 256 | 
[0: 5e-05, 49: 0.0005, 50: 0.0005, 200: 1e-06] | {'class': 'adam', 'epsilon': 1e-09} | -0.85 | -0.54 | nan | 0.96 | 0.5 | nan | False |\n", + "| 76 | tts_architecture | tts_architecture/glow_tts/raw_audio/glowTTS/enc768/with_sigma/not_silence_preprocessed/ | 15.7 | 3.35 | nan | 200 | 0 | False | 768 | [0: 5e-05, 49: 0.0005, 50: 0.0005, 200: 1e-06] | {'class': 'adam', 'epsilon': 1e-09} | -0.85 | -0.83 | nan | 0.4 | 0.45 | nan | False |\n", + "| 77 | tts_architecture | tts_architecture/glow_tts/raw_audio/glowTTS/enc768/with_sigma/silence_preprocessed/ | 15.3 | 3.4 | nan | 200 | 0 | False | 768 | [0: 5e-05, 49: 0.0005, 50: 0.0005, 200: 1e-06] | {'class': 'adam', 'epsilon': 1e-09} | -0.74 | -0.46 | nan | 0.37 | 0.41 | nan | False |\n", + "| 78 | tts_architecture | tts_architecture/glow_tts/raw_audio/glowTTS/enc768/mean_only/silence_preprocessed/ | 16.9 | 3.52 | nan | 200 | 0 | True | 768 | [0: 5e-05, 49: 0.0005, 50: 0.0005, 200: 1e-06] | {'class': 'adam', 'epsilon': 1e-09} | -0.74 | -0.71 | nan | 0.37 | 0.41 | nan | False |\n", + "| 79 | tts_architecture | tts_architecture/glow_tts/raw_audio/glowTTS/enc768/mean_only/not_silence_preprocessed/ | 15.5 | 3.47 | nan | 200 | 0 | True | 768 | [0: 5e-05, 49: 0.0005, 50: 0.0005, 200: 1e-06] | {'class': 'adam', 'epsilon': 1e-09} | -0.85 | -0.76 | nan | 0.4 | 0.45 | nan | False |\n", + "| 80 | tts_architecture | tts_architecture/glow_tts/raw_audio/glowTTS/enc192/100ep/not_silence_preprocessed/ | 14.3 | 3.31 | nan | 100 | 0 | False | 192 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | {'class': 'adam', 'epsilon': 1e-09} | -0.84 | -0.77 | nan | 0.41 | 0.42 | nan | False |\n", + "| 81 | tts_architecture | tts_architecture/glow_tts/raw_audio/glowTTS/enc192/100ep/silence_preprocessed/ | 13.5 | 3.3 | nan | 100 | 0 | False | 192 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | {'class': 'adam', 'epsilon': 1e-09} | -0.82 | -0.81 | nan | 0.37 | 0.39 | nan | False |\n", + "| 82 | tts_architecture | 
tts_architecture/glow_tts/raw_audio/glowTTS/enc192/200ep/long_cooldown/not_silence_preprocessed/ | 14.7 | 3.29 | nan | 200 | 0 | False | 192 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 200: 1e-07] | {'class': 'adam', 'epsilon': 1e-08} | -0.84 | -0.69 | nan | 0.41 | 0.43 | nan | False |\n", + "| 83 | tts_architecture | tts_architecture/glow_tts/raw_audio/glowTTS/enc192/200ep/long_cooldown/silence_preprocessed/ | 13.5 | 3.28 | nan | 200 | 0 | False | 192 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 200: 1e-07] | {'class': 'adam', 'epsilon': 1e-08} | -0.82 | -0.81 | nan | 0.38 | 0.39 | nan | False |\n", + "| 84 | tts_architecture | tts_architecture/glow_tts/raw_audio/glowTTS_simple_encoder/silence_preprocessed/ | 56.4 | 3.33 | nan | 100 | 0 | - | - | [0: 0.0001, 49: 0.0005, 50: 0.0005, 100: 1e-06] | {'class': 'radam', 'epsilon': 1e-09} | -0.84 | -0.82 | nan | 0.38 | 0.43 | nan | False |\n", + "| 85 | tts_architecture | tts_architecture/glow_tts/raw_audio/glowTTS_ddi_actnorm/enc192/100ep/not_silence_preprocessed/LR_scheduled/ | 100 | 3.12 | nan | 100 | 0 | False | 192 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | {'class': 'adam', 'epsilon': 1e-09} | -0.84 | -0.8 | nan | 0.42 | 0.44 | nan | False |\n", + "| 86 | tts_architecture | tts_architecture/glow_tts/raw_audio/glowTTS_x_vector/enc768/100ep/not_silence_preprocessed/ | 20.9 | 3.31 | nan | 100 | 0 | False | 768 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | {'class': 'adam', 'epsilon': 1e-09} | -0.81 | -0.8 | nan | 0.46 | 0.47 | nan | False |\n", + "| 87 | tts_architecture | tts_architecture/glow_tts/raw_audio/glowTTS_x_vector/enc192/100ep/not_silence_preprocessed/ | 95.4 | 2.63 | nan | 100 | 0 | False | 192 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | {'class': 'adam', 'epsilon': 1e-09} | -0.81 | -0.79 | nan | 0.97 | 1.27 | nan | False |\n", + "| 88 | tts_architecture | tts_architecture/glow_tts/raw_audio/glowTTS_nar_taco_encoder/not_silence_preprocessed/ | 25.2 | 3.46 | nan | 200 | 0 | - | - | [0: 1e-05, 49: 
0.0005, 50: 0.0005, 200: 1e-05] | {'class': 'adam', 'epsilon': 1e-09} | -0.84 | -0.71 | nan | 0.41 | 0.46 | nan | False |\n", + "| 89 | tts_architecture | tts_architecture/glow_tts/raw_audio/glowTTS_nar_taco_encoder/silence_preprocessed/ | 25.9 | 3.39 | nan | 200 | 0 | - | - | [0: 1e-05, 49: 0.0005, 50: 0.0005, 200: 1e-05] | {'class': 'adam', 'epsilon': 1e-09} | -0.83 | -0.81 | nan | 0.39 | 0.42 | nan | False |\n", + "| 90 | tts_architecture | tts_architecture/glow_tts/raw_audio/glowTTS_nar_taco_encoder_no_blstm/silence_preprocessed/ | 105.6 | 3.17 | nan | 100 | 0 | - | - | [0: 0.0001, 49: 0.0005, 50: 0.0001, 100: 1e-06] | {'class': 'radam', 'epsilon': 1e-09} | -0.8 | -0.79 | nan | 0.88 | 0.87 | nan | False |\n" ] } ], @@ -1100,7 +1075,67 @@ }, { "cell_type": "code", - "execution_count": 44, + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "| | Group | Experiment | sWER | autoMOS | autoMOS confidence | num_epochs | decoder dropout | mean only | encoder channels | LR | Optimizer | MLE | dev MLE | devtrain MLE | DP loss | DP dev loss | DP devtrain loss | Joint |\n", + "|---:|:----------------------------------|:----------------------------------------------------------------------------------------------------------------------------------------------------|-------:|----------:|---------------------:|-------------:|------------------:|:------------|-------------------:|:-------------------------------------------------|:-------------------------------------|------:|----------:|---------------:|----------:|--------------:|-------------------:|:--------|\n", + "| 1 | joint_training/conformer_coupling | joint_training/conformer_coupling/raw_audio/no_ddi/glowTTS_ASR_conformer_two_forward_pass/ | 20.9 | 2.45 | nan | 250 | 0.05 | False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | -0.67 | nan | -0.69 | 0.44 | nan | 0.41 | True |\n", + "| 2 | 
joint_training/conformer_coupling | joint_training/conformer_coupling/raw_audio/no_ddi/glowTTS_ASR_conformer_x_vector_v2/ | 5.2 | 2.14 | nan | 250 | 0.05 | False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | -0.45 | -0.4 | -0.47 | 0.25 | 0.34 | 0.23 | True |\n", + "| 6 | joint_training/conformer_coupling | joint_training/conformer_coupling/raw_audio/ddi/glowTTS_ASR_conformer_two_forward_pass/ | 95.7 | 1.82 | nan | 250 | 0.05 | False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | -0.66 | nan | -0.67 | 1 | nan | 2.09 | True |\n", + "| 8 | joint_training/conformer_coupling | joint_training/conformer_coupling/raw_audio/ddi/glowTTS_ASR_conformer_x_vector/ | 5.2 | 2.14 | nan | 250 | 0.05 | False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | -0.45 | -0.4 | -0.47 | 0.25 | 0.34 | 0.23 | True |\n", + "| 9 | joint_training/default | joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_spec_augment_ctc_scale_0.1/ | 4.6 | 3.11 | 0.02 | 250 | 0.05 | False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | -0.74 | -0.69 | -0.76 | 0.35 | 0.48 | 0.33 | True |\n", + "| 10 | joint_training/default | joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass/ | 9.5 | 2.75 | 0.02 | 250 | 0.05 | False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | -0.61 | -0.56 | -0.63 | 0.42 | 0.58 | 0.4 | True |\n", + "| 11 | joint_training/default | joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2_ctc_scale_0.1/ | 11.2 | 3.18 | 0.02 | 250 | 0.05 | False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | -0.76 | -0.69 | -0.78 | 0.4 | 0.56 | 0.38 | True |\n", + "| 12 | joint_training/default | 
joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_ctc_scale_0.1/ | 13.3 | 3.12 | 0.02 | 250 | 0.05 | False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | -0.75 | -0.69 | -0.77 | 0.42 | 0.57 | 0.4 | True |\n", + "| 13 | joint_training/default | joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_spec_augment/ | 5.2 | 2.3 | 0.02 | 250 | 0.05 | False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | -0.51 | -0.46 | -0.52 | 0.25 | 0.35 | 0.23 | True |\n", + "| 14 | joint_training/default | joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2/ | 9.9 | 2.64 | 0.02 | 250 | 0.05 | False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | -0.62 | -0.57 | -0.64 | 0.4 | 0.55 | 0.38 | True |\n", + "| 15 | joint_training/default | joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_ctc_scale_0.1/ | 7.9 | 3.19 | 0.02 | 250 | 0.05 | False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | -0.75 | -0.7 | -0.77 | 0.37 | 0.51 | 0.35 | True |\n", + "| 16 | joint_training/default | joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2/ | 6.7 | 2.48 | 0.02 | 250 | 0.05 | False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | -0.58 | -0.53 | -0.59 | 0.35 | 0.46 | 0.33 | True |\n", + "| 17 | joint_training/default | joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_ctc_scale_0.1/ | 7.6 | 3.15 | 0.02 | 250 | 0.05 | False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | -0.75 | -0.7 | -0.77 | 0.38 | 0.5 | 0.36 | True |\n", + "| 18 | joint_training/default | joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment_ctc_scale_0.1/ | 4.4 | 3.2 | 0.02 | 250 | 0.05 | False | 768 | 
[0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | -0.75 | -0.69 | -0.76 | 0.33 | 0.46 | 0.31 | True |\n", + "| 19 | joint_training/default | joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass/ | 14.8 | 2.61 | 0.03 | 250 | 0.05 | False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | -0.68 | nan | -0.7 | 0.4 | nan | 0.37 | True |\n", + "| 20 | joint_training/default | joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment/ | 6 | 2.32 | 0.02 | 250 | 0.05 | False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | -0.52 | -0.47 | -0.54 | 0.23 | 0.35 | 0.2 | True |\n", + "| 21 | joint_training/default | joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_ctc_scale_0.1/ | 15.2 | 3.16 | 0.02 | 250 | 0.05 | False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | -0.78 | nan | -0.81 | 0.38 | nan | 0.36 | True |\n", + "| 22 | joint_training/default | joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector/ | 98 | 1.58 | 0.01 | 250 | 0.05 | False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | -0.55 | -0.5 | -0.55 | 1.02 | 0.82 | 0.7 | True |\n", + "| 51 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector/specaug/ce_ls_0.1/ | 25.2 | 3.11 | nan | 250 | 0.05 | False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | -0.8 | -0.82 | -0.82 | 0.43 | 0.46 | 0.41 | True |\n", + "| 52 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector/specaug/ce_ls_1/ | 88.5 | 1.87 | nan | 250 | 0.05 | False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | -0.72 | -0.73 | -0.74 | 0.75 
| 0.79 | 0.69 | True |\n", + "| 53 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector/200ep/basic_init/no_specaug/tts_target_size/ce_ls_0.1/ | 98.1 | 1.82 | nan | 200 | 0.05 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.79 | -0.79 | -0.8 | 0.89 | 0.57 | 0.58 | True |\n", + "| 54 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector/200ep/tts_pretrained/no_specaug/tts_target_size/ce_ls_0.1/ | 32.9 | 2.77 | nan | 200 | 0.05 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.8 | -0.82 | -0.83 | 0.44 | 0.49 | 0.41 | True |\n", + "| 55 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector/no_specaug/ce_ls_1/ | 83.9 | 1.92 | nan | 250 | 0.05 | False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | -0.69 | -0.7 | -0.71 | 0.76 | 0.94 | 0.72 | True |\n", + "| 56 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector/no_specaug/ce_ls_0.1/ | 41.9 | 2.75 | nan | 250 | 0.05 | False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | -0.79 | -0.81 | -0.82 | 0.51 | 0.54 | 0.49 | True |\n", + "| 57 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_cnn_x_vector/tts_pretrained/no_specaug/tts_target_size/ce_ls_0.1/ | 14.1 | 3.2 | nan | 200 | 0.05 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.79 | -0.81 | -0.81 | 0.4 | 0.45 | 0.36 | True |\n", + "| 58 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_cnn_x_vector/basic_init/no_specaug/tts_target_size/ce_ls_0.1/ | 
23.3 | 3.13 | nan | 200 | 0.05 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.78 | -0.79 | -0.79 | 0.43 | 0.48 | 0.4 | True |\n", + "| 59 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector_v2/mean_only/200ep/basic_init/no_specaug/tts_target_size/ce_ls_0.1/ | 96.4 | 2.37 | nan | 200 | 0.05 | True | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.8 | -0.8 | -0.81 | 0.93 | 0.72 | 0.73 | True |\n", + "| 60 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector_v2/200ep/tts_pretrained/no_specaug/tts_target_size/ce_ls_0.1/ | 91.7 | 2.66 | nan | 200 | 0.05 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.81 | -0.82 | -0.83 | 0.63 | 0.63 | 0.59 | True |\n", + "| 61 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector_v2/200ep/basic_init/no_specaug/tts_target_size/ce_ls_1.0/ | 97.2 | 2.12 | nan | 200 | 0.05 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.8 | -0.8 | -0.81 | 0.83 | 0.61 | 0.62 | True |\n", + "| 62 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector_v2/200ep/basic_init/no_specaug/tts_target_size/ce_ls_0.1/ | 99.9 | 1.56 | nan | 200 | 0.05 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.8 | -0.77 | -0.8 | 0.82 | 0.53 | 0.54 | True |\n", + "| 63 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_cnn_x_vector_v2/tts_pretrained/ce_ls_0.1/ | 99.7 | 1.83 | nan | 200 | 0.05 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 
'epsilon': 1e-09} | -0.79 | -0.81 | -0.81 | 0.05 | 0.05 | 0.04 | True |\n", + "| 64 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_cnn_x_vector_v2/tts_pretrained/ce_ls_0.01/ | 97.2 | 1.97 | nan | 200 | 0.05 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.8 | -0.81 | -0.82 | 0.06 | 0.06 | 0.05 | True |\n", + "| 65 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_cnn_x_vector_v2/basic_init/ce_ls_0.1/ | 99.1 | 1.64 | nan | 200 | 0.05 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.78 | -0.8 | -0.8 | 0.05 | 0.05 | 0.04 | True |\n", + "| 66 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_cnn_x_vector_v2/basic_init/ce_ls_0.01/ | 98.2 | 2.18 | nan | 200 | 0.05 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.79 | -0.8 | -0.81 | 0.06 | 0.06 | 0.05 | True |\n", + "| 67 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_ffn_x_vector_v2/200ep/tts_pretrained/ce_ls_1.0/ | 100 | 1.97 | nan | 200 | 0.05 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.73 | -0.75 | -0.75 | 0.08 | 0.08 | 0.07 | True |\n", + "| 68 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_ffn_x_vector_v2/200ep/tts_pretrained/ce_ls_0.1/ | 100 | 1.98 | nan | 200 | 0.05 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.76 | -0.78 | -0.78 | 0.08 | 0.08 | 0.07 | True |\n", + "| 69 | joint_training/given_alignments | 
joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_ffn_x_vector_v2/200ep/basic_init/ce_ls_1.0/ | 100 | 1.98 | nan | 200 | 0.05 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.73 | -0.74 | -0.75 | 0.07 | 0.08 | 0.06 | True |\n", + "| 70 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_ffn_x_vector_v2/200ep/basic_init/ce_ls_0.1/ | 100 | 1.98 | nan | 200 | 0.05 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.76 | -0.77 | -0.78 | 0.07 | 0.08 | 0.06 | True |\n", + "| 71 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_ffn_x_vector/200ep/basic_init/ce_ls_0.1/ | 97.9 | 1.78 | nan | 200 | 0.05 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.78 | -0.79 | -0.8 | 0.26 | 0.24 | 0.25 | True |\n", + "| 72 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_ffn_x_vector/200ep/tts_pretrained/ce_ls_0.1/ | 95.4 | 2.53 | nan | 200 | 0.05 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.79 | -0.81 | -0.81 | 0.25 | 0.22 | 0.23 | True |\n", + "| 73 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_cnn_x_vector/tts_pretrained/ce_ls_0.1/ | 95.2 | 2.18 | nan | 200 | 0.05 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.78 | -0.79 | -0.79 | 0.26 | 0.24 | 0.25 | True |\n", + "| 74 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_cnn_x_vector/basic_init/ce_ls_0.1/ | 96.1 | 2.35 | 
nan | 200 | 0.05 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.77 | -0.78 | -0.79 | 0.27 | 0.26 | 0.26 | True |\n" + ] + } + ], + "source": [ + "print(df_no_index[df_no_index[\"Joint\"]].to_markdown())\n" + ] + }, + { + "cell_type": "code", + "execution_count": 20, "metadata": {}, "outputs": [ { @@ -1109,13 +1144,13 @@ "" ] }, - "execution_count": 44, + "execution_count": 20, "metadata": {}, "output_type": "execute_result" }, { "data": { - "image/png": "", + "image/png": "", "text/plain": [ "
" ] @@ -1130,7 +1165,7 @@ }, { "cell_type": "code", - "execution_count": 45, + "execution_count": 21, "metadata": {}, "outputs": [ { @@ -1139,13 +1174,13 @@ "" ] }, - "execution_count": 45, + "execution_count": 21, "metadata": {}, "output_type": "execute_result" }, { "data": { - "image/png": "", + "image/png": "", "text/plain": [ "
" ] @@ -1160,7 +1195,7 @@ }, { "cell_type": "code", - "execution_count": 46, + "execution_count": 22, "metadata": {}, "outputs": [ { @@ -1169,13 +1204,13 @@ "" ] }, - "execution_count": 46, + "execution_count": 22, "metadata": {}, "output_type": "execute_result" }, { "data": { - "image/png": "", + "image/png": "", "text/plain": [ "
" ] @@ -1190,7 +1225,7 @@ }, { "cell_type": "code", - "execution_count": 47, + "execution_count": 23, "metadata": {}, "outputs": [ { @@ -1199,13 +1234,13 @@ "" ] }, - "execution_count": 47, + "execution_count": 23, "metadata": {}, "output_type": "execute_result" }, { "data": { - "image/png": "iVBORw0KGgoAAAANSUhEUgAAAkAAAAGwCAYAAABB4NqyAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjcuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/bCgiHAAAACXBIWXMAAA9hAAAPYQGoP6dpAABG1ElEQVR4nO3df1zV9f3///sBFUQDf/C7SDCYLfMH+QMR88ckyfmprLaMr5vmrC41f2TkmrZlv6NaVstcrrLUbaFrpW2tsRzx463ib6lc5jBAzQQBhSNgkPD6/uE46wgcOQicc3jdrpfLudR5vp7nxeP1Unk9zvOnxTAMQwAAACbi5eoAAAAAOhsJEAAAMB0SIAAAYDokQAAAwHRIgAAAgOmQAAEAANMhAQIAAKbTzdUBuKOGhgZ9/fXXuuSSS2SxWFwdDgAAaAXDMHT69GmFh4fLy8txGw8JUDO+/vprRUREuDoMAADQBkePHtVll13msA4JUDMuueQSSeduoL+/v4ujAQAArWG1WhUREWF7jjtCAtSMxm4vf39/EiAAADxMa4avMAgaAACYDgkQAAAwHRIgAABgOiRAAADAdEiAAACA6ZAAAQAA0yEBAgAApkMCBAAATIcECAAAmA4JEAAAMB22wgAAwMMVlFbp8MkaRfbvpajAXq4OxyOQAAEA4KEqauq0MC1POfmltrLxMUFakRyrAL/uLozM/dEFBgCAh1qYlqeth8rsyrYeKtOCtH0uishzkAABAOCBCkqrlJNfqnrDsCuvNwzl5JeqsKzaRZF5BhIgAAA80OGTNQ6PF5WTADlCAgQAgAca0M/P4fHI/gyGdoQECAAADzQwqLfGxwTJ22KxK/e2WDQ+JojZYBdAAgQAgIdakRyrhOhAu7KE6ECtSI51UUSeg2nwAAB4qAC/7lo3d7QKy6pVVF7NOkBOIAECAMDDRQWS+DiLLjAAAGA6JEAAAMB0XJoApaamatSoUbrkkksUHBys6dOn6+DBgxf83DvvvKMrr7xSvr6+GjJkiD788EO744ZhaNmyZQoLC1PPnj2VmJio/Pz8jroMAADgYVyaAGVnZ2vevHnavn27Nm/erG+//VZTpkxRdXXLizdt27ZNycnJmjt3rvbt26fp06dr+vTp2r9/v63Oc889p5dfflmrVq3Sjh071KtXLyUlJembb77pjMsCAABuzmIY562h7UKlpaUKDg5Wdna2xo8f32ydGTNmqLq6Wh988IGtbMyYMRo+fLhWrVolwzAUHh6uBx54QIsXL5YkVVZWKiQkRGvWrNHtt9/e5Jy1tbWqra21vbdarYqIiFBlZaX8/f3b+SoBAEBHsFqtCggIaNXz263GAFVWVkqS+vXr12Kd3NxcJSYm2pUlJSUpNzdXklRYWKji4mK7OgEBAYqLi7PVOV9qaqoCAgJsr4iIiIu9FAAA4MbcJgFqaGjQokWLlJCQoKuvvrrFesXFxQoJCbErCwkJUXFxse14Y1lLdc63dOlSVVZW2l5Hjx69mEsBAABuzm3WAZo3b57279+vLVu2dPrP9vHxkY+PT6f/XAAA4Bpu0QI0f/58ffDBB8rMzNRll13msG5oaKhKSkrsykpKShQaGmo73ljWUh0AAGBuLk2ADMPQ/PnztXHjRn388ceKioq64Gfi4+OVkZFhV7Z582bFx8dLkqKiohQaGmpXx2q1aseOHbY6AADA3F
zaBTZv3jy9/fbbev/993XJJZfYxugEBASoZ8+ekqRZs2bp0ksvVWpqqiTpvvvu04QJE7R8+XJNmzZN69ev1+7du/Xaa69JkiwWixYtWqQnn3xSMTExioqK0sMPP6zw8HBNnz7dJdcJAADci0sToFdffVWSNHHiRLvyt956S3fccYck6ciRI/Ly+l9D1dixY/X222/r17/+tR566CHFxMRo06ZNdgOnH3zwQVVXV+vuu+9WRUWFxo0bp/T0dPn6+nb4NQEAAPfnVusAuQtn1hEAAADuwWPXAQIAAOgMJEAAAMB0SIAAAIDpkAABAADTIQECAACmQwIEAABMhwQIAACYDgkQAAAwHRIgAABgOiRAAADAdEiAAACA6ZAAAQAA0yEBAgAApkMCBAAATIcECAAAmA4JEAAAMB0SIAAAYDokQAAAwHRIgAAAgOl0c3UAAACg7QpKq3T4ZI0i+/dSVGAvV4fjMUiAAADwQBU1dVqYlqec/FJb2fiYIK1IjlWAX3cXRuYZ6AIDAMADLUzL09ZDZXZlWw+VaUHaPhdF5FlIgAAA8DAFpVXKyS9VvWHYldcbhnLyS1VYVu2iyDwHCRAAAB7m8Mkah8eLykmALoQECAAADzOgn5/D45H9GQx9ISRAAAB4mIFBvTU+JkjeFotdubfFovExQcwGawUSIAAAPNCK5FglRAfalSVEB2pFcqyLIvIsTIMHAMADBfh117q5o1VYVq2i8mrWAXISCRAAAB4sKpDEpy3oAgMAAKbj0gQoJydHN9xwg8LDw2WxWLRp0yaH9e+44w5ZLJYmr8GDB9vqPProo02OX3nllR18JQAAwJO4NAGqrq7WsGHDtHLlylbV/+1vf6vjx4/bXkePHlW/fv304x//2K7e4MGD7ept2bKlI8IHAAAeyqVjgKZOnaqpU6e2un5AQIACAgJs7zdt2qRTp05pzpw5dvW6deum0NDQdosTAAB0LR49Bmj16tVKTEzUgAED7Mrz8/MVHh6ugQMHaubMmTpy5IjD89TW1spqtdq9AABA1+WxCdDXX3+tf/zjH7rzzjvtyuPi4rRmzRqlp6fr1VdfVWFhoa699lqdPn26xXOlpqbaWpcCAgIUERHR0eEDAAAXshjGeTupuYjFYtHGjRs1ffr0VtVPTU3V8uXL9fXXX6tHjx4t1quoqNCAAQP0wgsvaO7cuc3Wqa2tVW1tre291WpVRESEKisr5e/v79R1AAAA17BarQoICGjV89sj1wEyDENvvvmmfvrTnzpMfiSpT58++t73vqdDhw61WMfHx0c+Pj7tHSYAAHBTHtkFlp2drUOHDrXYovNdVVVV+vLLLxUWFtYJkQEAAE/g0gSoqqpKeXl5ysvLkyQVFhYqLy/PNmh56dKlmjVrVpPPrV69WnFxcbr66qubHFu8eLGys7NVVFSkbdu26eabb5a3t7eSk5M79FoAAIDncGkX2O7duzVp0iTb+5SUFEnS7NmztWbNGh0/frzJDK7Kykq9++67+u1vf9vsOb/66islJyervLxcQUFBGjdunLZv366goKCOuxAAAOBR3GYQtDtxZhAVAABwD848vz1yDBAAAMDFIAECAACmQwIEAABMhwQIAACYDgkQAAAwHRIgAABgOiRAAADAdEiAAACA6ZAAAQAA0yEBAgAApkMCBAAATIcECAAAmA4JEAAAMB0SIAAAYDokQAAAwHRIgAAAgOmQAAEAANMhAQIAAKZDAgQAAEyHBAgAAJgOCRAAADAdEiAAAGA6JEAAAMB0urk6AAAA0PkKSqt0+GSNIvv3UlRgL1eH0+lIgAAAMJGKmjotTMtTTn6prWx8TJBWJMcqwK+7CyPrXHSBAQBgIgvT8rT1UJld2dZDZVqQts9FEbkGCRAAACZRUFqlnPxS1RuGXXm9YSgnv1SFZdUuiqzzkQABAGASh0/WODxeVE4CBAAAupgB/fwcHo/sb57B0CRAAACYxMCg3hofEyRvi8Wu3Nti0fiYIF
PNBiMBAgDARFYkxyohOtCuLCE6UCuSY10UkWu4NAHKycnRDTfcoPDwcFksFm3atMlh/aysLFksliav4uJiu3orV65UZGSkfH19FRcXp507d3bgVQAA4DkC/Lpr3dzRylw8UW/NGaXMxRO1bu5oU02Bl1ycAFVXV2vYsGFauXKlU587ePCgjh8/bnsFBwfbjm3YsEEpKSl65JFHtHfvXg0bNkxJSUk6ceJEe4cPAIDHigrspUmDgk3V7fVdLl0IcerUqZo6darTnwsODlafPn2aPfbCCy/orrvu0pw5cyRJq1at0t///ne9+eabWrJkSbOfqa2tVW1tre291Wp1OiYAAOA5PHIM0PDhwxUWFqbrrrtOW7dutZXX1dVpz549SkxMtJV5eXkpMTFRubm5LZ4vNTVVAQEBtldERESHxg8AAFzLoxKgsLAwrVq1Su+++67effddRUREaOLEidq7d68kqaysTPX19QoJCbH7XEhISJNxQt+1dOlSVVZW2l5Hjx7t0OsAAACu5VF7gQ0aNEiDBg2yvR87dqy+/PJLvfjii/rDH/7Q5vP6+PjIx8enPUIEAAAewKNagJozevRoHTp0SJIUGBgob29vlZSU2NUpKSlRaGioK8IDAABuyOMToLy8PIWFhUmSevTooREjRigjI8N2vKGhQRkZGYqPj3dViAAAwM24tAusqqrK1nojSYWFhcrLy1O/fv10+eWXa+nSpTp27JjWrVsnSXrppZcUFRWlwYMH65tvvtEbb7yhjz/+WB999JHtHCkpKZo9e7ZGjhyp0aNH66WXXlJ1dbVtVhgAAIBLE6Ddu3dr0qRJtvcpKSmSpNmzZ2vNmjU6fvy4jhw5YjteV1enBx54QMeOHZOfn5+GDh2qf/3rX3bnmDFjhkpLS7Vs2TIVFxdr+PDhSk9PbzIwGgAAmJfFMAzD1UG4G6vVqoCAAFVWVsrf39/V4QAAgFZw5vnt8WOAAAAAnEUCBAAATIcECAAAmA4JEAAAMB0SIAAAYDokQAAAwHRIgAAAgOmQAAEAANPxqN3gAQBA2xSUVunwyRpF9u+lqMBerg7H5UiAAADowipq6rQwLU85+aW2svExQVqRHKsAv+4ujMy16AIDAKALW5iWp62HyuzKth4q04K0fS6KyD2QAAEA0EUVlFYpJ79U9edt+1lvGMrJL1VhWbWLInM9EiAAALqowydrHB4vKicBAgAAXcyAfn4Oj0f2N+9gaBIgAAC6qIFBvTU+JkjeFotdubfFovExQaaeDUYCBABAF7YiOVYJ0YF2ZQnRgVqRHOuiiNwD0+ABAOjCAvy6a93c0Sosq1ZReTXrAP0XCRAAACYQFUji8110gQEAANMhAQIAAKZDAgQAAEyHBAgAAJgOCRAAADAdEiAAAGA6JEAAAMB0SIAAAIDpkAABAADTYSVoAAA8VEFplQ6frGF7izZwOgE6c+aMDMOQn5+fJOnw4cPauHGjrrrqKk2ZMqXdAwQAAPYqauq0MC1POfmltrLxMUFakRyrAL/uLozMczjdBXbTTTdp3bp1kqSKigrFxcVp+fLluummm/Tqq6+2e4AAAMDewrQ8bT1UZle29VCZFqTtc1FEnsfpBGjv3r269tprJUl/+ctfFBISosOHD2vdunV6+eWXnTpXTk6ObrjhBoWHh8tisWjTpk0O67/33nu67rrrFBQUJH9/f8XHx+uf//ynXZ1HH31UFovF7nXllVc6FRcAAO6qoLRKOfmlqjcMu/J6w1BOfqkKy6pdFJlncToBqqmp0SWXXCJJ+uijj3TLLbfIy8tLY8aM0eHDh506V3V1tYYNG6aVK1e2qn5OTo6uu+46ffjhh9qzZ48mTZqkG264Qfv22We8gwcP1vHjx22vLVu2OBUXAADu6vDJGofHi8pJgFrD6TFA0dHR2rRpk26++Wb985//1P333y9JOnHihPz9/Z0619SpUzV16tRW13/ppZfs3j/99NN6//339be//U2xsbG28m
7duik0NNSpWAAA8AQD+vk5PB7Zn8HQreF0C9CyZcu0ePFiRUZGKi4uTvHx8ZLOtQZ9NwnpDA0NDTp9+rT69etnV56fn6/w8HANHDhQM2fO1JEjRxyep7a2Vlar1e4FAIA7GhjUW+NjguRtsdiVe1ssGh8TxGywVnI6AfrRj36kI0eOaPfu3UpPT7eVT548WS+++GK7Bnchzz//vKqqqnTbbbfZyuLi4rRmzRqlp6fr1VdfVWFhoa699lqdPn26xfOkpqYqICDA9oqIiOiM8AEAaJMVybFKiA60K0uIDtSK5M5tiPBkFsM4bxSVk6xWqz7++GMNGjRI3//+99seiMWijRs3avr06a2q//bbb+uuu+7S+++/r8TExBbrVVRUaMCAAXrhhRc0d+7cZuvU1taqtrbW9t5qtSoiIkKVlZVOd+sBANBZCsuqVVRezTpA/2W1WhUQENCq57fTY4Buu+02jR8/XvPnz9eZM2c0cuRIFRUVyTAMrV+/XrfeemubA2+t9evX684779Q777zjMPmRpD59+uh73/ueDh061GIdHx8f+fj4tHeYAAB0qKhAEp+2croLLCcnxzYNfuPGjTIMQxUVFXr55Zf15JNPtnuA50tLS9OcOXOUlpamadOmXbB+VVWVvvzyS4WFhXV4bAAAwDM4nQBVVlbaBh2np6fr1ltvlZ+fn6ZNm6b8/HynzlVVVaW8vDzl5eVJkgoLC5WXl2cbtLx06VLNmjXLVv/tt9/WrFmztHz5csXFxam4uFjFxcWqrKy01Vm8eLGys7NVVFSkbdu26eabb5a3t7eSk5OdvVQAANBFOZ0ARUREKDc3V9XV1UpPT7dtf3Hq1Cn5+vo6da7du3crNjbWNnssJSVFsbGxWrZsmSTp+PHjdjO4XnvtNZ09e1bz5s1TWFiY7XXffffZ6nz11VdKTk7WoEGDdNttt6l///7avn27goKCnL1UAADQRTk9CPp3v/ud7rvvPvXu3VsDBgzQ3r175eXlpRUrVui9995TZmZmR8XaaZwZRAUAANxDhw6C/vnPf67Ro0fr6NGjuu666+Tlda4RaeDAgZ0yBggAAOBiXdQ0+MaPWs5bjMnT0QIEAIDnceb57fQYIElat26dhgwZop49e6pnz54aOnSo/vCHP7QpWAAAgM7mdBfYCy+8oIcffljz589XQkKCJGnLli265557VFZWZtsbDAAAwF053QUWFRWlxx57zG56uiStXbtWjz76qAoLC9s1QFegCwwAAM/ToYOgjx8/rrFjxzYpHzt2rI4fP+7s6QAAQBsVlFbp8MkatsJoA6cToOjoaP35z3/WQw89ZFe+YcMGxcTEtFtgAACgeRU1dVqYlqec/FJb2fiYIK1IjlWAX3cXRuY5nE6AHnvsMc2YMUM5OTm2MUBbt25VRkaG/vznP7d7gAAAwN7CtDxtPVRmV7b1UJkWpO3TurmjXRSVZ3F6Ftitt96qHTt2KDAwUJs2bdKmTZsUGBionTt36uabb+6IGAEAwH8VlFYpJ79U9ecN4a03DOXkl6qwrNpFkXkWp1uAJGnEiBH64x//2N6xAACACzh8ssbh8aLyasYDtUKrEiCr1drqEzJrCgCAjjOgn5/D45H9SX5ao1UJUJ8+fS642rNhGLJYLKqvr2+XwAAAQFMDg3prfEyQth4qs+sG87ZYlBAdSOtPK7UqAeoKG5wCANBVrEiO1YK0fXazwBKiA7UiOdaFUXmWi9oLrKtiIUQAgCcoLKtWUXk16wD9V4cuhAgAANxDVCCJT1u1aTNUAAAAT0YCBAAATIcECAAAmI7TCdCTTz7ZJXZ8BwAA5xSUVinz4AlTrSLt9CywYcOGaf/+/YqLi9NPfvIT3XbbbQoMDOyo+FyCWWAAADPoapuqOvP8droF6JNPPtGnn36qiRMn6vnnn1d4eLimTZumt99+WzU1jpfnBgAA7sPRpqpdXZvGAA0ePFhPP/20CgoKlJmZqcjISC1atEihoaHtHR
8AAOgAZt9U9aIHQffq1Us9e/ZUjx499O2337ZHTAAAoIO1ZlPVrqxNCVBhYaGeeuopDR48WCNHjtS+ffv02GOPqbi4uL3jAwAAHcDsm6o6vRL0mDFjtGvXLg0dOlRz5sxRcnKyLr300o6IDQAAdBCzb6rqdAI0efJkvfnmm7rqqqs6Ih4AANBJzLypaps3Q62rq1NhYaGuuOIKdevWtbYUYxo8AMBMusqmqh06Df7MmTOaO3eu/Pz8NHjwYB05ckSStGDBAj3zzDNtixgAALhMVGAvTRoU7NHJj7OcToCWLFmiTz75RFlZWfL19bWVJyYmasOGDe0aHAAAQEdwuu9q06ZN2rBhg8aMGSOLxWIrHzx4sL788st2DQ4AAKAjON0CVFpaquDg4Cbl1dXVdgkRAACAu3I6ARo5cqT+/ve/2943Jj1vvPGG4uPjnTpXTk6ObrjhBoWHh8tisWjTpk0X/ExWVpauueYa+fj4KDo6WmvWrGlSZ+XKlYqMjJSvr6/i4uK0c+dOp+ICAABdm9NdYE8//bSmTp2qzz//XGfPntVvf/tbff7559q2bZuys7OdOld1dbWGDRumn/3sZ7rlllsuWL+wsFDTpk3TPffcoz/96U/KyMjQnXfeqbCwMCUlJUmSNmzYoJSUFK1atUpxcXF66aWXlJSUpIMHDzbbcgUAAMynTdPgv/zySz3zzDP65JNPVFVVpWuuuUa//OUvNWTIkLYHYrFo48aNmj59eot1fvnLX+rvf/+79u/fbyu7/fbbVVFRofT0dElSXFycRo0apVdeeUWS1NDQoIiICC1YsEBLlixpVSxMgwcAwPM48/xu0wI+V1xxhV5//fU2BXcxcnNzlZiYaFeWlJSkRYsWSTq3NtGePXu0dOlS23EvLy8lJiYqNze3xfPW1taqtrbW9t5qtbZv4AAAwK20KgFyJiHoyBaT4uJihYSE2JWFhITIarXqzJkzOnXqlOrr65ut88UXX7R43tTUVD322GMdEjMAAHA/rUqA+vTp0+oZXvX19RcVkCssXbpUKSkptvdWq1UREREujAgAAHSkViVAmZmZtv8vKirSkiVLdMcdd9hmfeXm5mrt2rVKTU3tmCj/KzQ0VCUlJXZlJSUl8vf3V8+ePeXt7S1vb+9m64SGhrZ4Xh8fH/n4+HRIzAAAwP20KgGaMGGC7f8ff/xxvfDCC0pOTraV3XjjjRoyZIhee+01zZ49u/2j/K/4+Hh9+OGHdmWbN2+2JWI9evTQiBEjlJGRYRtM3dDQoIyMDM2fP7/D4gIAAJ7F6XWAcnNzNXLkyCblI0eOdHq9naqqKuXl5SkvL0/SuWnueXl5tv3Fli5dqlmzZtnq33PPPSooKNCDDz6oL774Qr/73e/05z//Wffff7+tTkpKil5//XWtXbtWBw4c0L333qvq6mrNmTPH2UsFAABdlNMJUERERLMzwN544w2nx83s3r1bsbGxio2NlXQueYmNjdWyZcskScePH7clQ5IUFRWlv//979q8ebOGDRum5cuX64033rCtASRJM2bM0PPPP69ly5Zp+PDhysvLU3p6epOB0QAAcyoorVLmwRMqLKt2dShwIafXAfrwww916623Kjo6WnFxcZKknTt3Kj8/X++++65++MMfdkignYl1gACg66moqdPCtDzl5JfaysbHBGlFcqwC/Lq7MDK0F2ee3063AP3whz9Ufn6+brzxRp08eVInT57UDTfcoP/85z9dIvkBAHRNC9PytPVQmV3Z1kNlWpC2z0URwZXatBDiZZddpqeeeqq9YwEAoEMUlFbZtfw0qjcM5eSXqrCsWlGBvVwQGVzF6RYgAAA8zeGTNQ6PF5UzHshsSIAAAF3egH5+Do9H9qf1x2xIgAAAXd7AoN4aHxMk7/N2NfC2WDQ+JojuLxMiAQIAmMKK5FglRAfalSVEB2pFcqyLIoIrOTUIevv27frb3/6muro6TZ48Wddff31HxQUAQLsK8OuudXNHq7
CsWkXl1Yrs34uWHxNr9TpAf/nLXzRjxgz17NlT3bt3l9Vq1bPPPqvFixd3dIydjnWAAADwPB2yDlBqaqruuusuVVZW6tSpU3ryySf19NNPX3SwAAAAna3VLUC9e/dWXl6eoqOjJUl1dXXq1auXjh07puDg4A4NsrPRAgQAgOfpkBagmpoau5P16NFDvr6+qqqqanukAADAVNxlLzanBkG/8cYb6t27t+392bNntWbNGgUG/m9U/cKFC9svOgAA0GoFpVU6fLLGLQd4u9tebK3uAouMjJTlvPUTmpzMYlFBQUG7BOZKdIEBADyJuyUXzZm1eqe2HipT/XfSDm+LRQnRgVo3d3S7/Axnnt+tbgEqKiq62LgAAEAHcLTRa3slFxfDHfdiYyFEAAA8WGNyUX9eh853kwtXc8e92JxKgBoaGvTmm2/q//2//6err75aQ4YM0Y033qh169aplT1pAACgHbljcnE+d9yLrdUJkGEYuvHGG3XnnXfq2LFjGjJkiAYPHqzDhw/rjjvu0M0339yRcQIAgGa4Y3JxPnfci63VCdCaNWuUk5OjjIwM7du3T2lpaVq/fr0++eQT/etf/9LHH3+sdevWdWSsAADgPO6YXDTH3fZia/UssClTpugHP/iBlixZ0uzxp59+WtnZ2frnP//ZrgG6ArPAAACepLLmWy1I23fBWWDuME2+I/dic+b53eoEKDQ0VOnp6Ro+fHizx/ft26epU6equLjY6YDdDQkQAMATtZRceMI0+fbQIStBnzx5UiEhIS0eDwkJ0alTp1ofJQAAaFdRgb00aVCwDMOwW23Z0TR5s2r1OkD19fXq1q3l6t7e3jp79my7BAUAAJzXXEvPqMi+2lXUtIHClWvwuINWJ0CGYeiOO+6Qj49Ps8dra2vbLSgAAOC85lp69hx23DtTVE4C5NDs2bMvWGfWrFkXFQwAAGibllZbbrjASF93mCbvCq1OgN56662OjAMAAI/kDjOrpAsviOglqeE77xv34TJj64/k5G7wAADgHHebWXWhBRFHDOirXd/pDnPlGjzugAQIAIA2cLcNSBsXRHS043rOf0q17+gpXXN5X10bE9TpMboTEiAAAJzkjrubS+dWWz5/QcSE6EA9OX2wZq3e6TatVe6ABAgAACe1ZgNSVyRAAX7dtW7u6CYLIs5avdOtWqvcAQkQAABOcvcNSKMC/zcg211bq1yt1StBAwCAczxlA1Kpda1VZuQWCdDKlSsVGRkpX19fxcXFaefOnS3WnThxoiwWS5PXtGnTbHXuuOOOJsevv/76zrgUAIBJuNvu5i1x99YqV3F5F9iGDRuUkpKiVatWKS4uTi+99JKSkpJ08OBBBQcHN6n/3nvvqa6uzva+vLxcw4YN049//GO7etdff73d2kUtrWANAEBbtDText1caHaYO8bcGVzeAvTCCy/orrvu0pw5c3TVVVdp1apV8vPz05tvvtls/X79+ik0NNT22rx5s/z8/JokQD4+Pnb1+vbt22IMtbW1slqtdi8AAFqjcQNSd04kPKW1qjO5tAWorq5Oe/bs0dKlS21lXl5eSkxMVG5ubqvOsXr1at1+++3q1cv+L15WVpaCg4PVt29f/eAHP9CTTz6p/v37N3uO1NRUPfbYY22/EAAA3Fh5da3mjIvUXeOjdLbBcNvWqs7k0gSorKxM9fX1CgkJsSsPCQnRF198ccHP79y5U/v379fq1avtyq+//nrdcsstioqK0pdffqmHHnpIU6dOVW5urry9vZucZ+nSpUpJSbG9t1qtioiIaONVAQDgGudvy+FotWqzc/kYoIuxevVqDRkyRKNH269hcPvtt9v+f8iQIRo6dKiuuOIKZWVlafLkyU3O4+PjwxghAIDHainR+ba+QTsLT9rVNfv6P41cOgYoMDBQ3t7eKikpsSsvKSlRaGiow89WV1dr/fr1mjt37gV/zsCBAxUYGKhDhw5dVLwAALij5rbl2HKoVLkF5XYDnyX79X
/MzKUJUI8ePTRixAhlZGTYyhoaGpSRkaH4+HiHn33nnXdUW1urn/zkJxf8OV999ZXKy8sVFhZ20TEDAOBOGhc6PD/RaTBa+MB/mXX9n0YunwWWkpKi119/XWvXrtWBAwd07733qrq6WnPmzJEkzZo1y26QdKPVq1dr+vTpTQY2V1VV6Re/+IW2b9+uoqIiZWRk6KabblJ0dLSSkpI65ZoAAOgsF1rosCVmXf+nkcvHAM2YMUOlpaVatmyZiouLNXz4cKWnp9sGRh85ckReXvZ52sGDB7VlyxZ99NFHTc7n7e2tTz/9VGvXrlVFRYXCw8M1ZcoUPfHEE4zzAQB0ORda6NBLUsN33pt9/Z9GFsMwLtBIZj5Wq1UBAQGqrKyUv7+/q8MBAMChxs1Oz1/oMG5gP3Xz8jLNLvDOPL9d3gIEAAAuzorkWC1I22eX6DQudBjg193tV6t2BVqAmkELEADAE5k90aEFCAAAE4oKNGfi0xYunwUGAADQ2UiAAACA6ZAAAQAA0yEBAgAApkMCBAAATIdZYAAAuEhBaZUOn6wx7bR1VyIBAgCgk1XU1GlhWp5pVmh2R3SBAQDQyRam5WnroTK7sq2HyrQgbZ+LIjIfEiAAADpRQWmVcvJL7fbtkqR6w1BOfqkKy6rb9WdlHjzR7DkdHTMDusAAAOhEh0/WODxeVF590eOBHHWxGTLofhMtQAAAdKoB/fwcHo/sf/GDoR11sdH9dg4JEAAAnWhgUG+NjwmSt8ViV+5tsWh8TNBFt/5cqIuts7rf3B0JEAAAnWxFcqwSogPtyhKiA7UiOfaiz32hLjZHisrNkwAxBqiLYC0JAPAcAX7dtW7uaBWWVauovLpdf3dfqIvNkfbofvMUJEAejrUkAMBzRQW2/5fWxi62rYfK7Lq6vC0WxV7eR998W6/Pv7aq4Tuf8bZYlBAdaKov0HSBeTgGswEAztdcF5t/z27affiU9p+X/Ej23W9mmR5PC5AHaxzodr7vDmZzJpunGw0Auobzu9h+l3lIew9X2NXxskhXhftrRfI1igrspYqaOs1avdM0PQokQB6svdaSoBsNALqmqMBeMgxDu4pONTnWYEj7j1lt7x31KKybO7rDY+1sdIF5MGfWknDUpEk3GgB0Xa35styZq1O7C1qAPFhLA90aPfL+v/Xk9Kv16037W2zdae9uNACAe2nNl+ULTX9vj9Wp3Q0tQB6uuYFujbYeKtNNK7c4bN1pzTcDmINZBj4CZtOahRc7Y3Vqd0MC5OEC/Lrr0RuvavZYvWHoVM23Dps0zfiXHvYaBz7+YHm25ry1S5Oez9Ks1TtVWfOtq0MD0E4utPBiR69O7Y7oAusC2rrqZ1F5tSYNCm5xvQizrQlhVmYb+AiYUWsWXlyRHKu5a3dp9+H/DZhur9Wp3REtQF1AW1f9bGzd6cgl2eHezDjwETCzqMBemjQouEnyU1FTpwVp++ySn1GRfbv0bGBagLoAR6t++vfsJuuZsw5bdzpySXa4t/ZaSgGAZ2uuJXjv4You3RJMC1AX0VIrzl/njWt1605L3wzQdTEGDIBZW4JpAfIQBaVV2lF4UhZJcQP7N0lSHLXi0LqDljhqPWQMGGAOZm0JdosWoJUrVyoyMlK+vr6Ki4vTzp07W6y7Zs0aWSwWu5evr69dHcMwtGzZMoWFhalnz55KTExUfn5+R19Gh6ioqVPya9v1g+XZWvreZ1ry3mea9HyW/r/Xtzc7S6elVhxad9ASxoCZG8sfmEdLf9ZmbQl2eQvQhg0blJKSolWrVikuLk4vvfSSkpKSdPDgQQUHBzf7GX9/fx08eND23nLetL3nnntOL7/8stauXauoqCg9/PDDSkpK0ueff94kWXJ3C9PylFtQ3qR825flXbpvtity173WGANmTmyBYx4X+rM2a0uwxTCaWUK4E8XFxWnUqFF65ZVXJEkNDQ2KiIjQggULtGTJkib116xZo0WLFqmioqLZ8xmGofDwcD3wwA
NavHixJKmyslIhISFas2aNbr/99gvGZLVaFRAQoMrKSvn7+7f94i5SQWmVfrA822GdzMUTu+xfzq6CBw3c0azVO1t84PHFqmtpzZ91Zc23WpC2z+N/Tznz/HZpF1hdXZ327NmjxMREW5mXl5cSExOVm5vb4ueqqqo0YMAARURE6KabbtK///1v27HCwkIVFxfbnTMgIEBxcXEtnrO2tlZWq9Xu1VGcaW5uzfo+Zl+p2ROa79lrDe7GrINezai1f9aNLcGZiyfqrTmjlLl4otbNHe1RyY+zXNoFVlZWpvr6eoWEhNiVh4SE6Isvvmj2M4MGDdKbb76poUOHqrKyUs8//7zGjh2rf//737rssstUXFxsO8f552w8dr7U1FQ99thj7XBFLWtLK0Br1vfpqn2zF+IprSrstea53LXLsj2YddCrGTn7Zx0V2PX+vrfELQZBOyM+Pl6zZs3S8OHDNWHCBL333nsKCgrS73//+zafc+nSpaqsrLS9jh492o4Rn9OWVoDGftmWuNPy5O3ZEtOac3lKqwp7rXkeM2wNYtZBr2bEn3XLXNoCFBgYKG9vb5WUlNiVl5SUKDQ0tFXn6N69u2JjY3Xo0CFJsn2upKREYWFhduccPnx4s+fw8fGRj49PG66gddrSCtD47XPxlO/p2/qGJgOhx17R3y1m6bRnS0xrz+XM/XT1t3h++XgeM2wNYtZBr2bEn3XLXNoC1KNHD40YMUIZGRm2soaGBmVkZCg+Pr5V56ivr9dnn31mS3aioqIUGhpqd06r1aodO3a0+pztzZlWgPO/fd64cqu6e3vpr/MTlHrLED1zyxBlLp6ot+8a025dPRfTetOeLTGtPVdr7qe7fIs34waDnsxMY2NY/qBryT54Qr/N+I/+r5kvh/xZN8/l0+BTUlI0e/ZsjRw5UqNHj9ZLL72k6upqzZkzR5I0a9YsXXrppUpNTZUkPf744xozZoyio6NVUVGh3/zmNzp8+LDuvPNOSeemxC9atEhPPvmkYmJibNPgw8PDNX36dJdcozOtAC0lAZLa/dvnhVpcWmo9ObcoY7nKqmrbbXyLM606rbmf9/5xb5NWs5z8Ut3zxz1Ku3tMq2JqLyuSY5vMruCXj3sy09gYlj/oGg6XV2v6yq069Z0vd339uuuv88Ypov+535X8WTfP5QnQjBkzVFpaqmXLlqm4uFjDhw9Xenq6bRDzkSNH5OX1v4aqU6dO6a677lJxcbH69u2rESNGaNu2bbrqqqtsdR588EFVV1fr7rvvVkVFhcaNG6f09HSXrQE0MKi3+vp1t/sL2qivX3e77prOHDDbUrJ1zx/3qLu3l10sV1/qrzviB2j11iIdOH66Ved35mHhzIPnQk26hmE0u3aSJOUWlHf6wGN++XgOM3ZZmmnQa1d0fvIjSadqvtWNK7do37IpduX8WdtzeQIkSfPnz9f8+fObPZaVlWX3/sUXX9SLL77o8HwWi0WPP/64Hn/88fYK8aIUlFY1m/xI5/6iNj6QO/Pbp6NkK7egXF72PTbaf8yqxX/5zKmf4czDwtkHj6NWlQ/3H3d4ru0F5S75JcAvH/fHeAl4kuyDJxw+W/4vv1TXOphI0xJXj53sLG6RAHV1rU1sOvPb54ViariI5TG9LNJV4c4tIOnsg8dxq4rj4C0Oj8Ls6LKEp8j7qsLh8b1HTjmVAHnK8iLthQSoE7Q2senMb5+tWWOorRqMcy1Gk57PcuofT1sePM21qsRF9Xf4c+IGOj4Oc/rut166LOEJhl/Wx+Hxay7v69T5zDAD8rtIgDqBM4lNZ337bCkmL0kN7fhznPnH015jZQYG9dbYK/pr25dNxwF9P/QSp8+Hrs3Rt14SH7izCYOCHY4vdab1x4yLtrp8LzB31BF7gTm7z0pbkgBn+22bi+nqS/11pq5eX5a273Tfzt6zrLlr+66u3KwL57AnFjzZ0fIa3bhyi8NZYK2RefCE5ry1q8Xjb80ZpUmDmt+g3J048/
wmAWpGR26G2hHN6hfbb/vJ0VP61cb92v91x+2B5qp/PIVl1VqQtleff221G9fEAw7ShTccZrNheIr/yy/V3iOndM3lfds88Lkr/FvwmM1QzSgqsJcmDQrulOnsrV2McPlH+a2e2t5Wrpo+bBiG9h+zNhnU3RUXtoPz2KoEXcW1MUG6b/L32pT8SOZctJUEyMNd7Mq1LX2+vbj6Hw8PODhixnV/gJaYbcVoBkF7uItdO+hCn79Y11zex6X/eHjAwRHW/QH+x2yLttIC5OEu9gHfkdPhJennP4h26UBjMzbrwjlm+9YLXEhHDNVwR7QAebiL/Qbb+PmWZktdLHdoYWFhOzhitm+9AM5hFlgzOnIWWEdwdor9+T45WqGbVm5t8fgztwzRu3u/0t7DFc0mWZI8YhoxDzgA6NqceX7TAtQFXOw32M+PO57+bkh6Y9Yoh60ontDCwl5cAIBGJEBdSNsf8I4bAQ8Wn9aYgf0dJll0IQAAPAkJEC64d9aabUVas61Iowb01RuzR7WY3NDCAgDwFMwCc6GC0iplHjzh8sX4GvfOupBdh09p4vOZqmxm3xkAADwJg6Cb0dGDoFvauuKBKd/TyZo6l3QhXWjvrO8aFdlX79wzthOiAgCg9dgL7CJ1dAJ026pt2ll0ymEdV23WWVhWrb9+ckwvbs53WM9T9oUBAJgHe4G5sYLSqgsmP5Jze3m1p6jAXrphaPgF67GFBADAk5EAdbIPPv26VfVcuVnnwKDeGjmgr8M67rDAIQAAbUUC1MlOVtc5Vd9VLS2rZ49S32a637wtYgsJAIDHIwHqZJMGBTtV31UtLQF+3ZW1eJJGRdq3BCVEB7ndAocAADiLdYA62YRBwQro2U2VZ846rOcOu1EH+HXXO/eMZYFDAECXQwuQC3ww/9om3UvdvOx3K3enrSTMsjMwAMA8aAFygYj+ftq3bIr+L79Ue4+c0jWX99W1MUG0tAAA0ElYB6gZnrYbPAAAYB0gAAAAh0iAAACA6ZAAAQAA02EQNAAAXUBBaZUOn6xhIk0rkQABAODBKmrqtDAtTzn5pbYyV22o7UnoAgMAwIMtTMvT1kNldmWu2lDbk7hFArRy5UpFRkbK19dXcXFx2rlzZ4t1X3/9dV177bXq27ev+vbtq8TExCb177jjDlksFrvX9ddf39GXAQBApyoorVJOfqnqz1vRxpUbansKlydAGzZsUEpKih555BHt3btXw4YNU1JSkk6cONFs/aysLCUnJyszM1O5ubmKiIjQlClTdOzYMbt6119/vY4fP257paWldcblAADQaQ6frHF4/LsbaheUVinz4AmSov9y+UKIcXFxGjVqlF555RVJUkNDgyIiIrRgwQItWbLkgp+vr69X37599corr2jWrFmSzrUAVVRUaNOmTa2Koba2VrW1tbb3VqtVERERLIQIAHBbBaVV2lFYrqXv7W+xTubiierr1900Y4Q8ZiHEuro67dmzR4mJibYyLy8vJSYmKjc3t1XnqKmp0bfffqt+/frZlWdlZSk4OFiDBg3Svffeq/Ly8hbPkZqaqoCAANsrIiKibRcEAEAHq6ip06zVO/WD5dktJj/eFovGxwQpKrAXY4Ra4NIEqKysTPX19QoJCbErDwkJUXFxcavO8ctf/lLh4eF2SdT111+vdevWKSMjQ88++6yys7M1depU1dfXN3uOpUuXqrKy0vY6evRo2y8KAIAO1FxCc77GDbUZI9Qyj54G/8wzz2j9+vXKysqSr6+vrfz222+3/f+QIUM0dOhQXXHFFcrKytLkyZObnMfHx0c+Pj6dEjMAAG3VmNC0JPWWIRozsL9tHaC9R085PF9RebVp1wxyaQtQYGCgvL29VVJSYldeUlKi0NBQh599/vnn9cwzz+ijjz7S0KFDHdYdOHCgAgMDdejQoYuOGQAAV7nQoOdLfLvZJTQD+vk5rB/Z35zJj+TiBKhHjx4aMWKEMjIybGUNDQ3KyMhQfHx8i5977r
nn9MQTTyg9PV0jR4684M/56quvVF5errCwsHaJGwAAV7hQQrN2W5Hd+4FBvTU+JkjeFotd+XfHCJmVy6fBp6Sk6PXXX9fatWt14MAB3XvvvaqurtacOXMkSbNmzdLSpUtt9Z999lk9/PDDevPNNxUZGani4mIVFxerqqpKklRVVaVf/OIX2r59u4qKipSRkaGbbrpJ0dHRSkpKcsk1AgDQHgYG9dbIAX1bPL6r6FSTcT0rkmOVEB1oV9Y4RsjMXD4GaMaMGSotLdWyZctUXFys4cOHKz093TYw+siRI/Ly+l+e9uqrr6qurk4/+tGP7M7zyCOP6NFHH5W3t7c+/fRTrV27VhUVFQoPD9eUKVP0xBNPMM4HAODx5oyN1O7DLY/tOX9cT4Bfd62bO1qFZdUqKq9mr7D/cvk6QO7ImXUEAADoTAWlVfrB8uwWj2cunmjaBMdj1gECAADOYVxP+yABAgDAwzCu5+K5fAwQAABwDuN6Lh4JEAAAHioqkMSnregCAwAApkMCBAAATIcECAAAmA4JEAAAMB0SIAAAYDokQAAAwHRIgAAAgOmQAAEAANMhAQIAAKZDAgQAAEyHBAgAAJgOCRAAADAdNkMFAAA2G3YeUW5huRKuCNSPR0a4OpwOYzEMw3B1EO7GarUqICBAlZWV8vf3d3U4AAB0uM++qtDNv9umsw3/Swu6eVn013kJuurSABdG1nrOPL/pAgMAAE2SH0k622DoxpVbXRRRxyIBAgDA5DbsPNIk+Wl0tsHQO7uPdnJEHY8ECACALqqgtEqZB0+osKzaYdnmAyUOz/PRv4s7LEZXYRA0AABdTEVNnRam5Sknv9RWNvaK/jIMKbeg3FY2PiZIK5JjFdHXz+H5Ivo5Pu6JSIAAAOhiFqblaeuhMruybV+WN6m39VCZ7ly3S7dcc5nD8/00PrI9w3MLJEAAAHQhBaVVdi0/jtQbhnYVndKuolMt1om9LEBRgb3aKzy3QQIEAEAXcvhkTbudq7GLrCtiEDQAAF3IgHYcr/PYTYMV4Ne93c7nTkiAAADoQgYG9db4mCB5WywXfa6i8uoLV/JQJEAAAHQxK5JjlRAdaFc29or+ih/Y36nzRPbvemN/GjEGCACALibAr7vWzR2twrJqFZVXK7J/L9tA5say3318SHuPVKi+mR2xvC0WJUQHdsnBz41IgAAA6KKiAns1SWIay66J6KsFafuanTGWEB3YZQc/N3KLLrCVK1cqMjJSvr6+iouL086dOx3Wf+edd3TllVfK19dXQ4YM0Ycffmh33DAMLVu2TGFhYerZs6cSExOVn5/fkZcAAIBHaWwlylw8UW/NGaU/zB2tt+aMUubiiVo3d3SXHfzcyOUJ0IYNG5SSkqJHHnlEe/fu1bBhw5SUlKQTJ040W3/btm1KTk7W3LlztW/fPk2fPl3Tp0/X/v37bXWee+45vfzyy1q1apV27NihXr16KSkpSd98801nXRYAAB4hKrCXJg0K1rUxQZo0KLhLd3t9l8Uwmun860RxcXEaNWqUXnnlFUlSQ0ODIiIitGDBAi1ZsqRJ/RkzZqi6uloffPCBrWzMmDEaPny4Vq1aJcMwFB4ergceeECLFy+WJFVWViokJERr1qzR7bfffsGYrFarAgICVFlZKX9//3a6UgAA0JGceX67tAWorq5Oe/bsUWJioq3My8tLiYmJys3NbfYzubm5dvUlKSkpyVa/sLBQxcXFdnUCAgIUFxfX4jlra2tltVrtXgAAoOtyaQJUVlam+vp6hYSE2JWHhISouLj5nWeLi4sd1m/8rzPnTE1NVUBAgO0VERHRpusBAACeweVjgNzB0qVLVVlZaXsdPXrU1SEBAIAO5NIEKDAwUN7e3iopKbErLykpUWhoaLOfCQ0NdVi/8b/OnNPHx0f+/v52LwAA0HW5NAHq0aOHRowYoYyMDFtZQ0ODMjIyFB8f3+xn4uPj7epL0ubNm231o6KiFBoaalfHarVqx44dLZ
4TAACYi8sXQkxJSdHs2bM1cuRIjR49Wi+99JKqq6s1Z84cSdKsWbN06aWXKjU1VZJ03333acKECVq+fLmmTZum9evXa/fu3XrttdckSRaLRYsWLdKTTz6pmJgYRUVF6eGHH1Z4eLimT5/uqssEAABuxOUJ0IwZM1RaWqply5apuLhYw4cPV3p6um0Q85EjR+Tl9b+GqrFjx+rtt9/Wr3/9az300EOKiYnRpk2bdPXVV9vqPPjgg6qurtbdd9+tiooKjRs3Tunp6fL19e306wMAAO7H5esAuSPWAQIAwPN4zDpAAAAAruDyLjB31NgoxoKIAAB4jsbndms6t0iAmnH69GlJYkFEAAA80OnTpxUQEOCwDmOAmtHQ0KCvv/5al1xyiU6fPq2IiAgdPXqU8UAdxGq1co87GPe443GPOx73uHN48n02DEOnT59WeHi43QSq5tAC1AwvLy9ddtllks5Nq5fEAomdgHvc8bjHHY973PG4x53DU+/zhVp+GjEIGgAAmA4JEAAAMB0SoAvw8fHRI488Ih8fH1eH0mVxjzse97jjcY87Hve4c5jlPjMIGgAAmA4tQAAAwHRIgAAAgOmQAAEAANMhAQIAAKZDAuTAypUrFRkZKV9fX8XFxWnnzp2uDsljpaamatSoUbrkkksUHBys6dOn6+DBg3Z1vvnmG82bN0/9+/dX7969deutt6qkpMRFEXu+Z555RhaLRYsWLbKVcY8v3rFjx/STn/xE/fv3V8+ePTVkyBDt3r3bdtwwDC1btkxhYWHq2bOnEhMTlZ+f78KIPU99fb0efvhhRUVFqWfPnrriiiv0xBNP2O3vxH12Tk5Ojm644QaFh4fLYrFo06ZNdsdbcz9PnjypmTNnyt/fX3369NHcuXNVVVXViVfRvkiAWrBhwwalpKTokUce0d69ezVs2DAlJSXpxIkTrg7NI2VnZ2vevHnavn27Nm/erG+//VZTpkxRdXW1rc7999+vv/3tb3rnnXeUnZ2tr7/+WrfccosLo/Zcu3bt0u9//3sNHTrUrpx7fHFOnTqlhIQEde/eXf/4xz/0+eefa/ny5erbt6+tznPPPaeXX35Zq1at0o4dO9SrVy8lJSXpm2++cWHknuXZZ5/Vq6++qldeeUUHDhzQs88+q+eee04rVqyw1eE+O6e6ulrDhg3TypUrmz3emvs5c+ZM/fvf/9bmzZv1wQcfKCcnR3fffXdnXUL7M9Cs0aNHG/PmzbO9r6+vN8LDw43U1FQXRtV1nDhxwpBkZGdnG4ZhGBUVFUb37t2Nd955x1bnwIEDhiQjNzfXVWF6pNOnTxsxMTHG5s2bjQkTJhj33XefYRjc4/bwy1/+0hg3blyLxxsaGozQ0FDjN7/5ja2soqLC8PHxMdLS0jojxC5h2rRpxs9+9jO7sltuucWYOXOmYRjc54slydi4caPtfWvu5+eff25IMnbt2mWr849//MOwWCzGsWPHOi329kQLUDPq6uq0Z88eJSYm2sq8vLyUmJio3NxcF0bWdVRWVkqS+vXrJ0nas2ePvv32W7t7fuWVV+ryyy/nnjtp3rx5mjZtmt29lLjH7eGvf/2rRo4cqR//+McKDg5WbGysXn/9ddvxwsJCFRcX293jgIAAxcXFcY+dMHbsWGVkZOg///mPJOmTTz7Rli1bNHXqVEnc5/bWmvuZm5urPn36aOTIkbY6iYmJ8vLy0o4dOzo95vbAZqjNKCsrU319vUJCQuzKQ0JC9MUXX7goqq6joaFBixYtUkJCgq6++mpJUnFxsXr06KE+ffrY1Q0JCVFxcbELovRM69ev1969e7Vr164mx7jHF6+goECvvvqqUlJS9NBDD2nXrl1auHChevToodmzZ9vuY3O/O7jHrbdkyRJZrVZdeeWV8vb2Vn19vZ566inNnDlTkrjP7aw197O4uFjBwcF2x7t166Z+/fp57D0nAUKnmzdvnvbv368tW7a4OpQu5ejRo7rvvvu0efNm+fr6uj
qcLqmhoUEjR47U008/LUmKjY3V/v37tWrVKs2ePdvF0XUdf/7zn/WnP/1Jb7/9tgYPHqy8vDwtWrRI4eHh3Ge0G7rAmhEYGChvb+8ms2NKSkoUGhrqoqi6hvnz5+uDDz5QZmamLrvsMlt5aGio6urqVFFRYVefe956e/bs0YkTJ3TNNdeoW7du6tatm7Kzs/Xyyy+rW7duCgkJ4R5fpLCwMF111VV2Zd///vd15MgRSbLdR353XJxf/OIXWrJkiW6//XYNGTJEP/3pT3X//fcrNTVVEve5vbXmfoaGhjaZBHT27FmdPHnSY+85CVAzevTooREjRigjI8NW1tDQoIyMDMXHx7swMs9lGIbmz5+vjRs36uOPP1ZUVJTd8REjRqh79+529/zgwYM6cuQI97yVJk+erM8++0x5eXm218iRIzVz5kzb/3OPL05CQkKT5Rv+85//aMCAAZKkqKgohYaG2t1jq9WqHTt2cI+dUFNTIy8v+8eTt7e3GhoaJHGf21tr7md8fLwqKiq0Z88eW52PP/5YDQ0NiouL6/SY24WrR2G7q/Xr1xs+Pj7GmjVrjM8//9y4++67jT59+hjFxcWuDs0j3XvvvUZAQICRlZVlHD9+3Paqqamx1bnnnnuMyy+/3Pj444+N3bt3G/Hx8UZ8fLwLo/Z8350FZhjc44u1c+dOo1u3bsZTTz1l5OfnG3/6058MPz8/449//KOtzjPPPGP06dPHeP/9941PP/3UuOmmm4yoqCjjzJkzLozcs8yePdu49NJLjQ8++MAoLCw03nvvPSMwMNB48MEHbXW4z845ffq0sW/fPmPfvn2GJOOFF14w9u3bZxw+fNgwjNbdz+uvv96IjY01duzYYWzZssWIiYkxkpOTXXVJF40EyIEVK1YYl19+udGjRw9j9OjRxvbt210dkseS1OzrrbfestU5c+aM8fOf/9zo27ev4efnZ9x8883G8ePHXRd0F3B+AsQ9vnh/+9vfjKuvvtrw8fExrrzySuO1116zO97Q0GA8/PDDRkhIiOHj42NMnjzZOHjwoIui9UxWq9W47777jMsvv9zw9fU1Bg4caPzqV78yamtrbXW4z87JzMxs9nfw7NmzDcNo3f0sLy83kpOTjd69exv+/v7GnDlzjNOnT7vgatqHxTC+s7QmAACACTAGCAAAmA4JEAAAMB0SIAAAYDokQAAAwHRIgAAAgOmQAAEAANMhAQIAAKZDAgQAAEyHBAgAAJgOCRAAj5aeni6LxaLi4mK78rCwMEVGRtqVFRUVyWKx2DZ9nDhxoiwWS5PXPffcY/vMd8v9/f01atQovf/++x1+XQA6FgkQAI82btw4devWTVlZWbayAwcO6MyZMzp16pSKiops5ZmZmfLx8VFCQoKt7K677tLx48ftXs8995zdz3jrrbd0/Phx7d69WwkJCfrRj36kzz77rKMvDUAHIgEC4DH+8pe/aMiQIerZs6f69++vxMREWSwWjRo1yi4BysrK0rhx45SQkNCkfMyYMfL19bWV+fn5KTQ01O7l7+9v93P79Omj0NBQfe9739MTTzyhs2fPKjMzs6MvF0AHIgEC4BGOHz+u5ORk/exnP9OBAweUlZWlW265RYZhaNKkSXYJSWZmpiZOnKgJEybYlWdlZWnSpEltjuHs2bNavXq1JKlHjx5tvxgALsdu8AA8wt69ezVixAgVFRVpwIABdsf+9a9/6brrrtPXX3+tsLAwhYSE6IMPPtDZs2eVnJysoqIiFRQU6IorrlB2drbGjx8v6dwYoG3btjVJZn7/+99r5syZks6NAfL19ZW3t7fOnDmjhoYGRUZGas+ePerXr1/nXDyAdtfN1QEAQGsMGzZMkydP1pAhQ5SUlKQpU6boRz/6kfr27auxY8eqR48eysrK0rBhw3TmzBldc801amhoUGlpqQoLC5WVlaWePXtqzJgxduedOXOmfvWrX9mVhYSE2L1/8cUXlZiYqIKCAt1///16+eWXSX
4AD0cCBMAjeHt7a/Pmzdq2bZs++ugjrVixQr/61a+0Y8cORUVFafTo0crMzNTJkyc1btw4eXt7y9vbW2PHjlVmZqYyMzOVkJDQpLUnICBA0dHRDn92aGiooqOjFR0drbfeeks//OEP9fnnnys4OLgjLxlAB2IMEACPYbFYlJCQoMcee0z79u1Tjx49tHHjRknSpEmTlJWVpaysLE2cONH2mfHjxysrK0vZ2dkXNf6n0ejRozVixAg99dRTF30uAK5DAgTAI+zYsUNPP/20du/erSNHjui9995TaWmpvv/970s6lwDl5+frn//8pyZMmGD73IQJE7Rp0yYdPXq02QSopqZGxcXFdq9Tp045jGXRokX6/e9/r2PHjrXvRQLoNAyCBuARDhw4oPvvv1979+6V1WrVgAEDtGDBAs2fP1+S9M0336hPnz7q3r27Tp06pW7dzvXw19bWqk+fPurWrZtduXRuEHR2dnaTn5WUlKT09HRJ51qdNm7cqOnTp9uOG4ahq666SpMmTdLvfve7DrxqAB2FBAgAAJgOXWAAAMB0SIAAAIDpkAABAADTIQECAACmQwIEAABMhwQIAACYDgkQAAAwHRIgAABgOiRAAADAdEiAAACA6ZAAAQAA0/n/ASLavNsI33U4AAAAAElFTkSuQmCC", + "image/png": "", "text/plain": [ "
" ] @@ -1220,7 +1255,7 @@ }, { "cell_type": "code", - "execution_count": 48, + "execution_count": 24, "metadata": {}, "outputs": [ { @@ -1229,13 +1264,13 @@ "" ] }, - "execution_count": 48, + "execution_count": 24, "metadata": {}, "output_type": "execute_result" }, { "data": { - "image/png": "", + "image/png": "", "text/plain": [ "
" ] @@ -1250,7 +1285,7 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 25, "metadata": {}, "outputs": [ { @@ -1268,7 +1303,7 @@ "\u001b[0;31mKeyError\u001b[0m: 'Experiment'", "\nThe above exception was the direct cause of the following exception:\n", "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[34], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m df_standard_flow_variations \u001b[38;5;241m=\u001b[39m df[\u001b[38;5;241m~\u001b[39mdf[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mJoint\u001b[39m\u001b[38;5;124m\"\u001b[39m]]\u001b[38;5;241m.\u001b[39mreplace(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m-\u001b[39m\u001b[38;5;124m\"\u001b[39m, np\u001b[38;5;241m.\u001b[39mnan)\n\u001b[0;32m----> 2\u001b[0m df_standard_flow_variations[\u001b[38;5;241m~\u001b[39m\u001b[43mdf_standard_flow_variations\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mExperiment\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\u001b[38;5;241m.\u001b[39mstr\u001b[38;5;241m.\u001b[39mcontains(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124msimple_encoder\u001b[39m\u001b[38;5;124m\"\u001b[39m) \u001b[38;5;241m&\u001b[39m \u001b[38;5;241m~\u001b[39mdf_standard_flow_variations[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mExperiment\u001b[39m\u001b[38;5;124m\"\u001b[39m]\u001b[38;5;241m.\u001b[39mstr\u001b[38;5;241m.\u001b[39mcontains(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mddi_actnorm\u001b[39m\u001b[38;5;124m\"\u001b[39m)]\n", + "Cell \u001b[0;32mIn[25], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m df_standard_flow_variations \u001b[38;5;241m=\u001b[39m df[\u001b[38;5;241m~\u001b[39mdf[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mJoint\u001b[39m\u001b[38;5;124m\"\u001b[39m]]\u001b[38;5;241m.\u001b[39mreplace(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m-\u001b[39m\u001b[38;5;124m\"\u001b[39m, np\u001b[38;5;241m.\u001b[39mnan)\n\u001b[0;32m----> 2\u001b[0m 
df_standard_flow_variations[\u001b[38;5;241m~\u001b[39m\u001b[43mdf_standard_flow_variations\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mExperiment\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\u001b[38;5;241m.\u001b[39mstr\u001b[38;5;241m.\u001b[39mcontains(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124msimple_encoder\u001b[39m\u001b[38;5;124m\"\u001b[39m) \u001b[38;5;241m&\u001b[39m \u001b[38;5;241m~\u001b[39mdf_standard_flow_variations[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mExperiment\u001b[39m\u001b[38;5;124m\"\u001b[39m]\u001b[38;5;241m.\u001b[39mstr\u001b[38;5;241m.\u001b[39mcontains(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mddi_actnorm\u001b[39m\u001b[38;5;124m\"\u001b[39m)]\n", "File \u001b[0;32m/work/tools22/users/lukas.rilling/sis_env/lib/python3.10/site-packages/pandas/core/frame.py:3896\u001b[0m, in \u001b[0;36mDataFrame.__getitem__\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 3894\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcolumns\u001b[38;5;241m.\u001b[39mnlevels \u001b[38;5;241m>\u001b[39m \u001b[38;5;241m1\u001b[39m:\n\u001b[1;32m 3895\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_getitem_multilevel(key)\n\u001b[0;32m-> 3896\u001b[0m indexer \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcolumns\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_loc\u001b[49m\u001b[43m(\u001b[49m\u001b[43mkey\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 3897\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m is_integer(indexer):\n\u001b[1;32m 3898\u001b[0m indexer \u001b[38;5;241m=\u001b[39m [indexer]\n", "File \u001b[0;32m/work/tools22/users/lukas.rilling/sis_env/lib/python3.10/site-packages/pandas/core/indexes/base.py:3797\u001b[0m, in \u001b[0;36mIndex.get_loc\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 3792\u001b[0m 
\u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(casted_key, \u001b[38;5;28mslice\u001b[39m) \u001b[38;5;129;01mor\u001b[39;00m (\n\u001b[1;32m 3793\u001b[0m \u001b[38;5;28misinstance\u001b[39m(casted_key, abc\u001b[38;5;241m.\u001b[39mIterable)\n\u001b[1;32m 3794\u001b[0m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28many\u001b[39m(\u001b[38;5;28misinstance\u001b[39m(x, \u001b[38;5;28mslice\u001b[39m) \u001b[38;5;28;01mfor\u001b[39;00m x \u001b[38;5;129;01min\u001b[39;00m casted_key)\n\u001b[1;32m 3795\u001b[0m ):\n\u001b[1;32m 3796\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m InvalidIndexError(key)\n\u001b[0;32m-> 3797\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m(key) \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01merr\u001b[39;00m\n\u001b[1;32m 3798\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mTypeError\u001b[39;00m:\n\u001b[1;32m 3799\u001b[0m \u001b[38;5;66;03m# If we have a listlike key, _check_indexing_error will raise\u001b[39;00m\n\u001b[1;32m 3800\u001b[0m \u001b[38;5;66;03m# InvalidIndexError. 
Otherwise we fall through and re-raise\u001b[39;00m\n\u001b[1;32m 3801\u001b[0m \u001b[38;5;66;03m# the TypeError.\u001b[39;00m\n\u001b[1;32m 3802\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_check_indexing_error(key)\n", "\u001b[0;31mKeyError\u001b[0m: 'Experiment'" @@ -1282,7 +1317,7 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -1573,7 +1608,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -1603,7 +1638,7 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -1633,7 +1668,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -1663,7 +1698,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": null, "metadata": {}, "outputs": [ { diff --git a/users/rilling/evaluation/wer_eval.ipynb b/users/rilling/evaluation/wer_eval.ipynb index 81fcc25ad..2bf05746d 100644 --- a/users/rilling/evaluation/wer_eval.ipynb +++ b/users/rilling/evaluation/wer_eval.ipynb @@ -28,85 +28,85 @@ { "data": { "text/plain": [ - "['/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glow_ASR_conformer_specaugment_before/tuning/lm_2.5/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glow_ASR_conformer_specaugment_before/tuning/lm_3.5/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glow_ASR_conformer_specaugment_before/tuning/lm_1.5/search/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glow_ASR_conformer_specaugment_before/tuning/lm_4.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glow_ASR_conformer_specaugment_before/tuning/lm_2.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glow_ASR_conformer_specaugment_before/tuning/lm_3.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glow_ASR_conformer_specaugment_before/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glowTTS_ASR_conformer_x_vector_v2/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glowTTS_ASR_conformer_x_vector_v2/tuning/lm_3.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glowTTS_ASR_conformer_x_vector_v2/tuning/lm_2.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glowTTS_ASR_conformer_x_vector_v2/tuning/lm_4.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glowTTS_ASR_conformer_x_vector_v2/tuning/lm_1.5/search/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glowTTS_ASR_conformer_x_vector_v2/tuning/lm_3.5/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glowTTS_ASR_conformer_x_vector_v2/tuning/lm_2.5/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glow_ASR_conformer/search/dev-other/sclite/wer',\n", + "['/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glow_ASR_conformer/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glow_ASR_conformer/tuning/lm_1.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glow_ASR_conformer/tuning/lm_4.0/search/dev-other/sclite/wer',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glow_ASR_conformer/tuning/lm_2.5/search/dev-other/sclite/wer',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glow_ASR_conformer/tuning/lm_3.5/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glow_ASR_conformer/tuning/lm_4.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glow_ASR_conformer/tuning/lm_1.5/search/dev-other/sclite/wer',\n", " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glow_ASR_conformer/tuning/lm_2.0/search/dev-other/sclite/wer',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glow_ASR_conformer/tuning/lm_3.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glowTTS_ASR_conformer_two_forward_pass/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glowTTS_ASR_conformer_two_forward_pass/tuning/lm_3.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glowTTS_ASR_conformer_two_forward_pass/tuning/lm_2.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glowTTS_ASR_conformer_two_forward_pass/tuning/lm_3.5/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glowTTS_ASR_conformer_two_forward_pass/tuning/lm_2.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glow_ASR_conformer_specaugment_before/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glow_ASR_conformer_specaugment_before/tuning/lm_1.5/search/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glow_ASR_conformer_specaugment_before/tuning/lm_4.0/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glow_ASR_conformer_specaugment_before/tuning/lm_2.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glow_ASR_conformer_specaugment_before/tuning/lm_3.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glow_ASR_conformer_specaugment_before/tuning/lm_2.0/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glow_ASR_conformer_specaugment_before/tuning/lm_3.0/search/dev-other/sclite/wer',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glowTTS_ASR_conformer_two_forward_pass/tuning/lm_4.0/search/dev-other/sclite/wer',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glowTTS_ASR_conformer_two_forward_pass/tuning/lm_1.5/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/ddi/glow_ASR_conformer/tuning/lm_1.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glowTTS_ASR_conformer_two_forward_pass/tuning/lm_3.5/search/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glowTTS_ASR_conformer_two_forward_pass/tuning/lm_2.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glowTTS_ASR_conformer_two_forward_pass/tuning/lm_3.0/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glowTTS_ASR_conformer_two_forward_pass/tuning/lm_2.0/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glowTTS_ASR_conformer_two_forward_pass/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glowTTS_ASR_conformer_x_vector_v2/tuning/lm_2.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glowTTS_ASR_conformer_x_vector_v2/tuning/lm_3.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glowTTS_ASR_conformer_x_vector_v2/tuning/lm_1.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glowTTS_ASR_conformer_x_vector_v2/tuning/lm_4.0/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glowTTS_ASR_conformer_x_vector_v2/tuning/lm_2.0/search/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glowTTS_ASR_conformer_x_vector_v2/tuning/lm_3.0/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glowTTS_ASR_conformer_x_vector_v2/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/ddi/glowTTS_ASR_conformer_two_forward_pass/tuning/lm_2.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/ddi/glowTTS_ASR_conformer_two_forward_pass/tuning/lm_3.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/ddi/glowTTS_ASR_conformer_two_forward_pass/tuning/lm_1.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/ddi/glowTTS_ASR_conformer_two_forward_pass/tuning/lm_4.0/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/ddi/glowTTS_ASR_conformer_two_forward_pass/tuning/lm_2.0/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/ddi/glowTTS_ASR_conformer_two_forward_pass/tuning/lm_3.0/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/ddi/glowTTS_ASR_conformer_two_forward_pass/search/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/ddi/glow_ASR_conformer/search/dev-other/sclite/wer',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/ddi/glow_ASR_conformer/tuning/lm_4.0/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/ddi/glow_ASR_conformer/tuning/lm_1.5/search/dev-other/sclite/wer',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/ddi/glow_ASR_conformer/tuning/lm_3.5/search/dev-other/sclite/wer',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/ddi/glow_ASR_conformer/tuning/lm_2.5/search/dev-other/sclite/wer',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/ddi/glow_ASR_conformer/tuning/lm_3.0/search/dev-other/sclite/wer',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/ddi/glow_ASR_conformer/tuning/lm_2.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/ddi/glow_ASR_conformer/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/ddi/glowTTS_ASR_conformer_two_forward_pass/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/ddi/glowTTS_ASR_conformer_two_forward_pass/tuning/lm_2.0/search/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/ddi/glowTTS_ASR_conformer_two_forward_pass/tuning/lm_3.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/ddi/glowTTS_ASR_conformer_two_forward_pass/tuning/lm_2.5/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/ddi/glowTTS_ASR_conformer_two_forward_pass/tuning/lm_3.5/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/ddi/glowTTS_ASR_conformer_two_forward_pass/tuning/lm_1.5/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/ddi/glowTTS_ASR_conformer_two_forward_pass/tuning/lm_4.0/search/dev-other/sclite/wer',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/ddi/glowTTS_ASR_conformer_x_vector/search/dev-other/sclite/wer',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/ddi/glowTTS_ASR_conformer_x_vector/tuning/lm_3.0/search/dev-other/sclite/wer',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/ddi/glowTTS_ASR_conformer_x_vector/tuning/lm_2.0/search/dev-other/sclite/wer',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/ddi/glowTTS_ASR_conformer_x_vector/tuning/lm_3.5/search/dev-other/sclite/wer',\n", " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/ddi/glowTTS_ASR_conformer_x_vector/tuning/lm_2.5/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/ddi/glowTTS_ASR_conformer_x_vector/tuning/lm_1.5/search/dev-other/sclite/wer',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/ddi/glowTTS_ASR_conformer_x_vector/tuning/lm_4.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass/tuning/lm_2.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass/tuning/lm_3.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass/tuning/lm_4.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/ddi/glowTTS_ASR_conformer_x_vector/tuning/lm_1.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_control_radam/search/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer/tuning/lm_4.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer/tuning/lm_2.0/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer/tuning/lm_3.0/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer/tuning/lm_2.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer/tuning/lm_3.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer/tuning/lm_4.0/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_spec_augment_ctc_scale_0.1/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_spec_augment_ctc_scale_0.1/tuning/lm_4.0/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_spec_augment_ctc_scale_0.1/tuning/lm_2.5/search/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_spec_augment_ctc_scale_0.1/tuning/lm_3.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_spec_augment_ctc_scale_0.1/tuning/lm_2.0/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_spec_augment_ctc_scale_0.1/tuning/lm_3.0/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_spec_augment_ctc_scale_0.1/tuning/lm_4.5/search/dev-other/sclite/wer',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass/tuning/lm_4.0/search/dev-other/sclite/wer',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass/tuning/lm_2.5/search/dev-other/sclite/wer',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass/tuning/lm_3.5/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_spec_augment/tuning/lm_2.5/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_spec_augment/tuning/lm_3.5/search/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_spec_augment/tuning/lm_4.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_spec_augment/tuning/lm_4.5/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_spec_augment/tuning/lm_2.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_spec_augment/tuning/lm_3.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_spec_augment/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_ddi_actnorm/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_control_spec_augment/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_ctc_scale_0.1/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_ctc_scale_0.1/tuning/lm_3.5/search/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_ctc_scale_0.1/tuning/lm_2.5/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_ctc_scale_0.1/tuning/lm_4.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_ctc_scale_0.1/tuning/lm_4.5/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_ctc_scale_0.1/tuning/lm_3.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_ctc_scale_0.1/tuning/lm_2.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2/tuning/lm_4.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2/tuning/lm_2.5/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2/tuning/lm_3.5/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2/tuning/lm_2.0/search/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2/tuning/lm_3.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2/tuning/lm_4.5/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass/tuning/lm_2.0/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass/tuning/lm_3.0/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass/tuning/lm_4.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer_spec_augment_tts_train_segments/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer_spec_augment_tts_train_segments/tuning/lm_4.5/search/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer_spec_augment_tts_train_segments/tuning/lm_3.0/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer_spec_augment_tts_train_segments/tuning/lm_2.0/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer_spec_augment_tts_train_segments/tuning/lm_3.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer_spec_augment_tts_train_segments/tuning/lm_2.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer_spec_augment_tts_train_segments/tuning/lm_4.0/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_xvector/search/dev-other/sclite/wer',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_spec_augment_ctc_scale_0.1_tts_segments/search/dev-other/sclite/wer',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_spec_augment_ctc_scale_0.1_tts_segments/tuning/lm_3.5/search/dev-other/sclite/wer',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_spec_augment_ctc_scale_0.1_tts_segments/tuning/lm_2.5/search/dev-other/sclite/wer',\n", @@ -114,597 +114,749 @@ " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_spec_augment_ctc_scale_0.1_tts_segments/tuning/lm_4.5/search/dev-other/sclite/wer',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_spec_augment_ctc_scale_0.1_tts_segments/tuning/lm_3.0/search/dev-other/sclite/wer',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_spec_augment_ctc_scale_0.1_tts_segments/tuning/lm_2.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass/tuning/lm_2.5/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass/tuning/lm_3.5/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass/tuning/lm_4.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass/tuning/lm_4.5/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass/tuning/lm_2.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass/tuning/lm_3.0/search/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_control/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2_ctc_scale_0.1/tuning/lm_2.5/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2_ctc_scale_0.1/tuning/lm_3.5/search/dev-other/sclite/wer',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2_ctc_scale_0.1/tuning/lm_4.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2_ctc_scale_0.1/tuning/lm_4.5/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2_ctc_scale_0.1/tuning/lm_2.0/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2_ctc_scale_0.1/tuning/lm_3.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2_ctc_scale_0.1/tuning/lm_2.5/search/dev-other/sclite/wer',\n", " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2_ctc_scale_0.1/tuning/lm_3.0/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2_ctc_scale_0.1/tuning/lm_2.0/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2_ctc_scale_0.1/tuning/lm_4.5/search/dev-other/sclite/wer',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2_ctc_scale_0.1/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_ctc_scale_0.1/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_ctc_scale_0.1/tuning/lm_4.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_ctc_scale_0.1/tuning/lm_3.5/search/dev-other/sclite/wer',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_ctc_scale_0.1/tuning/lm_2.5/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_ctc_scale_0.1/tuning/lm_3.0/search/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_ctc_scale_0.1/tuning/lm_2.0/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_ctc_scale_0.1/tuning/lm_3.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_ctc_scale_0.1/tuning/lm_4.0/search/dev-other/sclite/wer',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_ctc_scale_0.1/tuning/lm_4.5/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector_v2/tuning/lm_2.5/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector_v2/tuning/lm_3.5/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector_v2/tuning/lm_4.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector_v2/tuning/lm_4.5/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector_v2/tuning/lm_2.0/search/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector_v2/tuning/lm_3.0/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_ctc_scale_0.1/tuning/lm_2.0/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_ctc_scale_0.1/tuning/lm_3.0/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_ctc_scale_0.1/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_spec_augment/tuning/lm_3.0/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_spec_augment/tuning/lm_2.0/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_spec_augment/tuning/lm_4.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_spec_augment/tuning/lm_4.0/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_spec_augment/tuning/lm_3.5/search/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_spec_augment/tuning/lm_2.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_spec_augment/search/dev-other/sclite/wer',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector_v2/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer_spec_augment_tts_train_segments/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer_spec_augment_tts_train_segments/tuning/lm_3.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer_spec_augment_tts_train_segments/tuning/lm_2.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer_spec_augment_tts_train_segments/tuning/lm_4.5/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer_spec_augment_tts_train_segments/tuning/lm_4.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer_spec_augment_tts_train_segments/tuning/lm_3.5/search/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer_spec_augment_tts_train_segments/tuning/lm_2.5/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_no_jit/tuning/lm_1.5/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_no_jit/tuning/lm_2.5/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_no_jit/tuning/lm_3.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_no_jit/tuning/lm_2.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_no_jit/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_ctc_scale_0.1/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_ctc_scale_0.1/tuning/lm_4.5/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_ctc_scale_0.1/tuning/lm_3.0/search/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_ctc_scale_0.1/tuning/lm_2.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_ctc_scale_0.1/tuning/lm_3.5/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_ctc_scale_0.1/tuning/lm_2.5/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_ctc_scale_0.1/tuning/lm_4.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector_control/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_coupling_epsilon/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector/tuning/lm_4.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector/tuning/lm_3.5/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector/tuning/lm_2.5/search/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector/tuning/lm_3.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector/tuning/lm_2.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector/tuning/lm_4.5/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_trainXvector/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before/tuning/lm_4.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before/tuning/lm_3.5/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before/tuning/lm_2.5/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before/tuning/lm_3.0/search/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before/tuning/lm_2.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before/tuning/lm_4.5/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector/tuning/lm_4.5/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector/tuning/lm_2.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector/tuning/lm_3.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector/tuning/lm_2.5/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector/tuning/lm_3.5/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector/tuning/lm_4.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector/search/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_tts_train_segments/tuning/lm_2.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_tts_train_segments/tuning/lm_3.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_tts_train_segments/tuning/lm_4.5/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_tts_train_segments/tuning/lm_4.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_tts_train_segments/tuning/lm_2.5/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_tts_train_segments/tuning/lm_3.5/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_tts_train_segments/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_tts_train_segments/tuned/lm_2.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector_v2/tuning/lm_3.0/search/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector_v2/tuning/lm_2.0/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector_v2/tuning/lm_4.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector_v2/tuning/lm_4.0/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector_v2/tuning/lm_3.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector_v2/tuning/lm_2.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_ddi_actnorm/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer_spec_augment/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer_spec_augment/tuning/lm_4.0/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer_spec_augment/tuning/lm_3.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer_spec_augment/tuning/lm_2.5/search/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer_spec_augment/tuning/lm_3.0/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer_spec_augment/tuning/lm_2.0/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer_spec_augment/tuning/lm_4.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_xvectorgrad_clip_10/search/dev-other/sclite/wer',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_xvectorgrad_clip_10/tuning/lm_4.0/search/dev-other/sclite/wer',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_xvectorgrad_clip_10/tuning/lm_2.5/search/dev-other/sclite/wer',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_xvectorgrad_clip_10/tuning/lm_3.5/search/dev-other/sclite/wer',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_xvectorgrad_clip_10/tuning/lm_2.0/search/dev-other/sclite/wer',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_xvectorgrad_clip_10/tuning/lm_3.0/search/dev-other/sclite/wer',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_xvectorgrad_clip_10/tuning/lm_4.5/search/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_xvectorgrad_clip_10/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment/tuning/lm_3.5/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment/tuning/lm_2.5/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment/tuning/lm_4.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment/tuning/lm_4.5/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment/tuning/lm_3.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment/tuning/lm_2.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_ddi_actnorm/search/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_spec_augment_ctc_scale_0.1/tuning/lm_4.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_spec_augment_ctc_scale_0.1/tuning/lm_2.5/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_spec_augment_ctc_scale_0.1/tuning/lm_3.5/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_spec_augment_ctc_scale_0.1/tuning/lm_2.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_spec_augment_ctc_scale_0.1/tuning/lm_3.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_spec_augment_ctc_scale_0.1/tuning/lm_4.5/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_spec_augment_ctc_scale_0.1/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer/tuning/lm_4.5/search/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer/tuning/lm_3.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer/tuning/lm_2.0/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2/tuning/lm_4.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2/tuning/lm_3.0/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2/tuning/lm_2.0/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2/tuning/lm_3.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2/tuning/lm_2.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2/tuning/lm_4.0/search/dev-other/sclite/wer',\n", " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer/tuning/lm_3.5/search/dev-other/sclite/wer',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer/tuning/lm_2.5/search/dev-other/sclite/wer',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer/tuning/lm_4.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2/tuning/lm_3.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2/tuning/lm_2.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2/tuning/lm_4.5/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2/tuning/lm_4.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2/tuning/lm_3.5/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2/tuning/lm_2.5/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2/search/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_control_radam/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_encoder_sample_ctc_scale_0.1/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer_spec_augment/tuning/lm_4.5/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer_spec_augment/tuning/lm_3.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer_spec_augment/tuning/lm_2.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer_spec_augment/tuning/lm_3.5/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer_spec_augment/tuning/lm_2.5/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer_spec_augment/tuning/lm_4.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer_spec_augment/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_ctc_scale_0.1/search/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_ctc_scale_0.1/tuning/lm_2.5/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_ctc_scale_0.1/tuning/lm_3.5/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_ctc_scale_0.1/tuning/lm_4.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_ctc_scale_0.1/tuning/lm_4.5/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_ctc_scale_0.1/tuning/lm_2.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_ctc_scale_0.1/tuning/lm_3.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector_v3/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer/tuning/lm_4.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer/tuning/lm_3.0/search/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer/tuning/lm_2.0/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_xvector_eval/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_xvector_eval/tuning/lm_2.0/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_xvector_eval/tuning/lm_3.0/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_xvector_eval/tuning/lm_4.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_xvector_eval/tuning/lm_4.0/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_xvector_eval/tuning/lm_2.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_xvector_eval/tuning/lm_3.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector/tuning/lm_2.5/search/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector/tuning/lm_3.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector/tuning/lm_4.0/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector/tuning/lm_4.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector/tuning/lm_2.0/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector/tuning/lm_3.0/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_encoder_sample_ctc_scale_0.1/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_ctc_scale_0.1/tuning/lm_3.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_ctc_scale_0.1/tuning/lm_2.5/search/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_ctc_scale_0.1/tuning/lm_4.0/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_ctc_scale_0.1/tuning/lm_4.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_ctc_scale_0.1/tuning/lm_3.0/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_ctc_scale_0.1/tuning/lm_2.0/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_ctc_scale_0.1/search/dev-other/sclite/wer',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector_v3/tuning/lm_4.5/search/dev-other/sclite/wer',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector_v3/tuning/lm_2.0/search/dev-other/sclite/wer',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector_v3/tuning/lm_3.0/search/dev-other/sclite/wer',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector_v3/tuning/lm_2.5/search/dev-other/sclite/wer',\n", " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector_v3/tuning/lm_3.5/search/dev-other/sclite/wer',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector_v3/tuning/lm_4.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment_ctc_scale_0.1/tuning/lm_4.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment_ctc_scale_0.1/tuning/lm_3.5/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment_ctc_scale_0.1/tuning/lm_2.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector_v3/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2/tuning/lm_4.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2/tuning/lm_3.0/search/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2/tuning/lm_2.0/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2/tuning/lm_3.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2/tuning/lm_2.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2/tuning/lm_4.0/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_tts_train_segments/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_tts_train_segments/tuned/lm_2.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_tts_train_segments/tuning/lm_2.0/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_tts_train_segments/tuning/lm_3.0/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_tts_train_segments/tuning/lm_4.5/search/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_tts_train_segments/tuning/lm_4.0/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_tts_train_segments/tuning/lm_2.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_tts_train_segments/tuning/lm_3.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_ctc_scale_0.1/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_ctc_scale_0.1/tuning/lm_2.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_ctc_scale_0.1/tuning/lm_3.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_ctc_scale_0.1/tuning/lm_4.0/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_ctc_scale_0.1/tuning/lm_4.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_ctc_scale_0.1/tuning/lm_2.0/search/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_ctc_scale_0.1/tuning/lm_3.0/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_coupling_epsilon/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment_ctc_scale_0.1/tuning/lm_4.5/search/dev-other/sclite/wer',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment_ctc_scale_0.1/tuning/lm_3.0/search/dev-other/sclite/wer',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment_ctc_scale_0.1/tuning/lm_2.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment_ctc_scale_0.1/tuning/lm_4.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment_ctc_scale_0.1/tuning/lm_3.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment_ctc_scale_0.1/tuning/lm_2.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment_ctc_scale_0.1/tuning/lm_4.0/search/dev-other/sclite/wer',\n", " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment_ctc_scale_0.1/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_xvector/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_trainXvector/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector_control/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_ddi_actnorm/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_control_spec_augment/search/dev-other/sclite/wer',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_radam/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer/tuning/lm_2.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer/tuning/lm_3.0/search/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer/tuning/lm_4.5/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer/tuning/lm_4.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer/tuning/lm_2.5/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer/tuning/lm_3.5/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/second_step/ga_glowTTS_ASR_ffn_x_vector_v2_blstm_2ndstep_v2/no_specaug/ce_ls_1.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/second_step/ga_glowTTS_ASR_ffn_x_vector_v2_blstm_2ndstep/no_specaug/ce_ls_1.0/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_control/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass/tuning/lm_3.5/search/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass/tuning/lm_2.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass/tuning/lm_4.0/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass/tuning/lm_4.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass/tuning/lm_3.0/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass/tuning/lm_2.0/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before/tuning/lm_4.0/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before/tuning/lm_2.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before/tuning/lm_3.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before/tuning/lm_2.0/search/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before/tuning/lm_3.0/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before/tuning/lm_4.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment/tuning/lm_2.0/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment/tuning/lm_3.0/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment/tuning/lm_4.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment/tuning/lm_4.0/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment/tuning/lm_2.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment/tuning/lm_3.5/search/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_ctc_scale_0.1/tuning/lm_4.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_ctc_scale_0.1/tuning/lm_3.0/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_ctc_scale_0.1/tuning/lm_2.0/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_ctc_scale_0.1/tuning/lm_3.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_ctc_scale_0.1/tuning/lm_2.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_ctc_scale_0.1/tuning/lm_4.0/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_ctc_scale_0.1/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_no_jit/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_no_jit/tuning/lm_2.0/search/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_no_jit/tuning/lm_3.0/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_no_jit/tuning/lm_2.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_no_jit/tuning/lm_1.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector/tuning/lm_4.0/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector/tuning/lm_2.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector/tuning/lm_3.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector/tuning/lm_2.0/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector/tuning/lm_3.0/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector/tuning/lm_4.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector/search/dev-other/sclite/wer',\n", " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/second_step/ga_glowTTS_ASR_ffn_x_vector_v2_blstm_2ndstep/specaug/ce_ls_1.0/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/second_step/ga_glowTTS_ASR_ffn_x_vector_v2_blstm_2ndstep/no_specaug/ce_ls_1.0/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/second_step/ga_glowTTS_ASR_ffn_x_vector_v2_blstm_2ndstep_v2/no_specaug/ce_ls_1.0/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_4x512_d0.2_b300_fs4/asr_dataset/glow_enc768/not_silence_preprocessed/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/default/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_eval_spec_aug/not_silence_preprocessed/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_eval/not_silence_preprocessed/tuned/lm_4.0_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_eval/not_silence_preprocessed/tuned/lm_3.0_ps_0.5/default_250/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_eval/not_silence_preprocessed/tuned/lm_3.5_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_eval/not_silence_preprocessed/tuned/lm_4.0_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_eval/not_silence_preprocessed/tuned/lm_2.5_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_eval/not_silence_preprocessed/tuned/lm_2.5_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_eval/not_silence_preprocessed/tuned/lm_3.5_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_eval/not_silence_preprocessed/tuned/lm_4.0_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_eval/not_silence_preprocessed/tuned/lm_3.0_ps_0.3/default_250/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_eval/not_silence_preprocessed/tuned/lm_3.5_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_eval/not_silence_preprocessed/tuned/lm_2.5_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_eval/not_silence_preprocessed/tuned/lm_3.0_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_eval/not_silence_preprocessed/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_not_eval/not_silence_preprocessed/tuned/lm_3.5_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_not_eval/not_silence_preprocessed/tuned/lm_3.0_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_not_eval/not_silence_preprocessed/tuned/lm_3.5_ps_0/default_250/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_not_eval/not_silence_preprocessed/tuned/lm_4.0_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_not_eval/not_silence_preprocessed/tuned/lm_2.5_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_not_eval/not_silence_preprocessed/tuned/lm_3.0_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_not_eval/not_silence_preprocessed/tuned/lm_3.5_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_not_eval/not_silence_preprocessed/tuned/lm_3.0_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_not_eval/not_silence_preprocessed/tuned/lm_4.0_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_not_eval/not_silence_preprocessed/tuned/lm_2.5_ps_0/default_250/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_not_eval/not_silence_preprocessed/tuned/lm_4.0_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_not_eval/not_silence_preprocessed/tuned/lm_2.5_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_not_eval/not_silence_preprocessed/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_freezed/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm2.5_ps0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm2.0_ps0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm2.0_ps0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm1.5_ps0.5/default_250/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm3.0_ps0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm3.0_ps0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm3.5_ps0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm4.0_ps0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm3.5_ps0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm2.5_ps0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm2.5_ps0/default_250/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm1.5_ps0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm2.0_ps0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm1.5_ps0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm3.0_ps0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm4.0_ps0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm4.0_ps0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm3.5_ps0.5/default_250/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm2.5_ps0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm3.5_ps0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm2.5_ps0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm1.5_ps0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm2.0_ps0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm4.0_ps0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm3.0_ps0.3/default_250/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm1.5_ps0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm4.0_ps0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm3.5_ps0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm2.0_ps0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm2.5_ps0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm3.0_ps0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm2.0_ps0.5/default_250/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm1.5_ps0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm3.0_ps0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm3.5_ps0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm4.0_ps0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_pretrained/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc768_100ep_xvector/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_simple_encoder/silence_preprocessed/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment_before/glow_enc192_200ep_not_freezed/default_250/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment_before/glow_enc192_200ep/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/glow_eval/not_silence_preprocessed/tuning/lm_4.0_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/glow_eval/not_silence_preprocessed/tuning/lm_3.0_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/glow_eval/not_silence_preprocessed/tuning/lm_3.5_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/glow_eval/not_silence_preprocessed/tuning/lm_2.5_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/glow_eval/not_silence_preprocessed/tuning/lm_2.5_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/glow_eval/not_silence_preprocessed/tuning/lm_4.0_ps_0/default_250/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/glow_eval/not_silence_preprocessed/tuning/lm_4.0_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/glow_eval/not_silence_preprocessed/tuning/lm_3.0_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/glow_eval/not_silence_preprocessed/tuning/lm_3.5_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/glow_eval/not_silence_preprocessed/tuning/lm_3.5_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/glow_eval/not_silence_preprocessed/tuning/lm_3.0_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/glow_eval/not_silence_preprocessed/tuning/lm_2.5_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/glow_eval/not_silence_preprocessed/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/not_silence_preprocessed/default_250/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/not_silence_preprocessed/tuning/lm_2.5_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/not_silence_preprocessed/tuning/lm_3.5_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/not_silence_preprocessed/tuning/lm_3.0_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/not_silence_preprocessed/tuning/lm_4.0_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/not_silence_preprocessed/tuning/lm_4.0_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/not_silence_preprocessed/tuning/lm_2.5_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/not_silence_preprocessed/tuning/lm_2.5_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/not_silence_preprocessed/tuning/lm_3.5_ps_0/default_250/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/not_silence_preprocessed/tuning/lm_3.5_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/not_silence_preprocessed/tuning/lm_3.0_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/not_silence_preprocessed/tuning/lm_3.0_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/not_silence_preprocessed/tuning/lm_4.0_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/with_sigma/silence_preprocessing/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/with_sigma/not_silence_preprocessed/glow_eval/tuning/lm_2.5_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/with_sigma/not_silence_preprocessed/glow_eval/tuning/lm_4.0_ps_0.3/default_250/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/with_sigma/not_silence_preprocessed/glow_eval/tuning/lm_4.0_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/with_sigma/not_silence_preprocessed/glow_eval/tuning/lm_3.0_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/with_sigma/not_silence_preprocessed/glow_eval/tuning/lm_3.5_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/with_sigma/not_silence_preprocessed/glow_eval/tuning/lm_2.5_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/with_sigma/not_silence_preprocessed/glow_eval/tuning/lm_2.5_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/with_sigma/not_silence_preprocessed/glow_eval/tuning/lm_3.5_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/with_sigma/not_silence_preprocessed/glow_eval/tuning/lm_4.0_ps_0.5/default_250/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/with_sigma/not_silence_preprocessed/glow_eval/tuning/lm_3.0_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/with_sigma/not_silence_preprocessed/glow_eval/tuning/lm_3.0_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/with_sigma/not_silence_preprocessed/glow_eval/tuning/lm_3.5_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/with_sigma/not_silence_preprocessed/glow_eval/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/with_sigma/not_silence_preprocessed/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/silence_preprocessed/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm3.0_ps0.5/default_250/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm4.0_ps0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm3.5_ps0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm2.5_ps0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm2.0_ps0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm1.5_ps0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm2.0_ps0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm3.0_ps0/default_250/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm1.5_ps0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm3.0_ps0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm4.0_ps0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm3.5_ps0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm4.0_ps0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm3.5_ps0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm2.5_ps0.5/default_250/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm2.0_ps0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm1.5_ps0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm2.5_ps0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_freezed/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm3.0_ps0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm4.0_ps0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm3.5_ps0.5/default_250/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm3.0_ps0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm2.5_ps0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm2.0_ps0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm1.5_ps0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm2.0_ps0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm3.0_ps0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm3.5_ps0.3/default_250/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm4.0_ps0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm4.0_ps0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm1.5_ps0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm2.5_ps0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm2.5_ps0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm2.0_ps0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm1.5_ps0.5/default_250/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm3.5_ps0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_pretrained/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/drop_around_blstm/lm5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/drop_around_blstm/spec_augment/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/tts_dataset/glow_nar_taco_encoder_16blocks/not_silence_preprocessed/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/tts_dataset/glow_enc768/with_sigma/not_silence_preprocessed/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/tts_dataset/glow_enc768/with_sigma/silence_preprocessing/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/tts_dataset/glow_nar_taco_encoder/not_silence_preprocessed/default_250/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/tts_dataset/glow_nar_taco_encoder/silence_preprocessed/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/tts_dataset/glow_with_small_enc/silence_preprocessed/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/tts_dataset/glow_simple_encoder/silence_preprocessed/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/tts_dataset/glow_simple_encoder_epoch84/silence_preprocessed/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/tts_dataset/glow_enc192/100epTTS/silence_preprocessed/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/tts_dataset/glow_enc192/200epsTTS_early_eval_ep100/silence_preprocessed/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/tts_dataset/glow_enc192/200epsTTS_early_eval_ep100/not_silence_preprocessed/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/tts_dataset/glow_enc192/not_silence_preprocessed/default_250/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/tts_dataset/glow_enc192/200epsTTS/silence_preprocessed/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/tts_dataset/glow_enc192/200epsTTS/not_silence_preprocessed/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/linear_1x512_d0.2_b300_fs4/glow_nar_taco_encoder_16blocks/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/linear_1x512_d0.2_b300_fs4/glow_enc768/tts_dataset/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x1024_d0.2_b300_fs4/asr_dataset/glow_enc768/not_silence_preprocessed/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/only_blstm_2x512_d0.2_b300_fs4/tts_dataset/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/only_blstm_2x512_d0.2_b300_fs4/asr_dataset/tuning/lm_4.0_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/only_blstm_2x512_d0.2_b300_fs4/asr_dataset/tuning/lm_2.5_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/only_blstm_2x512_d0.2_b300_fs4/asr_dataset/tuning/lm_2.5_ps_0.5/default_250/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/only_blstm_2x512_d0.2_b300_fs4/asr_dataset/tuning/lm_4.0_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/only_blstm_2x512_d0.2_b300_fs4/asr_dataset/tuning/lm_3.0_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/only_blstm_2x512_d0.2_b300_fs4/asr_dataset/tuning/lm_3.5_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/only_blstm_2x512_d0.2_b300_fs4/asr_dataset/tuning/lm_2.5_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/only_blstm_2x512_d0.2_b300_fs4/asr_dataset/tuning/lm_3.0_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/only_blstm_2x512_d0.2_b300_fs4/asr_dataset/tuning/lm_3.5_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/only_blstm_2x512_d0.2_b300_fs4/asr_dataset/tuning/lm_4.0_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/only_blstm_2x512_d0.2_b300_fs4/asr_dataset/tuning/lm_3.0_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/only_blstm_2x512_d0.2_b300_fs4/asr_dataset/tuning/lm_3.5_ps_0.3/default_250/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/only_blstm_2x512_d0.2_b300_fs4/asr_dataset/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/only_blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/tuning/lm_2.5_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/only_blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/tuning/lm_2.5_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/only_blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/tuning/lm_4.0_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/only_blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/tuning/lm_3.5_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/only_blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/tuning/lm_3.0_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/only_blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/tuning/lm_4.0_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/only_blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/tuning/lm_3.0_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/only_blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/tuning/lm_2.5_ps_0.3/default_250/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/only_blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/tuning/lm_3.5_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/only_blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/tuning/lm_3.0_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/only_blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/tuning/lm_3.5_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/only_blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/tuning/lm_4.0_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/only_blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/default_250/dev-other/sclite/wer',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/tts_dataset/spec_augment/no_glow/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed/search_params/lm_3.5_ps_0.3/default_250/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed/search_params/lm_3.0_ps_0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed/search_params/lm_2.5_ps_0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed/search_params/lm_4.0_ps_0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed/search_params/lm_4.0_ps_0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed/search_params/lm_2.5_ps_0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed/search_params/lm_3.5_ps_0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed/search_params/lm_3.0_ps_0/default_250/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed/search_params/lm_3.0_ps_0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed/search_params/lm_4.0_ps_0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed/search_params/lm_2.5_ps_0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed/search_params/lm_3.5_ps_0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/no_glow/search_params/lm_3.5_ps_0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/no_glow/search_params/lm_2.5_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_1/search_params/lm_3.5_ps_0.3/default_250/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_1/search_params/lm_3.0_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_1/search_params/lm_4.0_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_1/search_params/lm_4.0_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_1/search_params/lm_2.5_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_1/search_params/lm_2.5_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_1/search_params/lm_3.5_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_1/search_params/lm_3.0_ps_0/default_250/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_1/search_params/lm_3.0_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_1/search_params/lm_4.0_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_1/search_params/lm_2.5_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_1/search_params/lm_3.5_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_1/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed/tuned/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed/layer_norm/default_250/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed/batch_norm/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed/search_params/lm_3.0_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed/search_params/lm_3.5_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed/search_params/lm_4.0_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed/search_params/lm_4.0_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed/search_params/lm_2.5_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed/search_params/lm_2.5_ps_0/default_250/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed/search_params/lm_3.0_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed/search_params/lm_3.5_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed/search_params/lm_3.5_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed/search_params/lm_4.0_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed/search_params/lm_2.5_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed/search_params/lm_3.0_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed/default_250/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed/tuning/lm_4.0_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed/tuning/lm_3.0_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed/tuning/lm_3.0_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed/tuning/lm_3.5_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed/tuning/lm_2.5_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed/tuning/lm_3.5_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed/tuning/lm_4.0_ps_0.5/default_250/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed/tuning/lm_2.5_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed/tuning/lm_3.0_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed/tuning/lm_3.5_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed/tuning/lm_4.0_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed/tuning/lm_2.5_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.3/default_250/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.4/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc768_200ep_not_silence_preprocessed/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc768_200ep_not_silence_preprocessed/tuning/lm_4.0_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc768_200ep_not_silence_preprocessed/tuning/lm_3.5_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc768_200ep_not_silence_preprocessed/tuning/lm_3.5_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc768_200ep_not_silence_preprocessed/tuning/lm_3.0_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc768_200ep_not_silence_preprocessed/tuning/lm_3.0_ps_0/default_250/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc768_200ep_not_silence_preprocessed/tuning/lm_2.5_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc768_200ep_not_silence_preprocessed/tuning/lm_4.0_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc768_200ep_not_silence_preprocessed/tuning/lm_3.5_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc768_200ep_not_silence_preprocessed/tuning/lm_3.0_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc768_200ep_not_silence_preprocessed/tuning/lm_4.0_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc768_200ep_not_silence_preprocessed/tuning/lm_2.5_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc768_200ep_not_silence_preprocessed/tuning/lm_2.5_ps_0/default_250/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.1/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.5/search_params/lm_4.0_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.5/search_params/lm_3.5_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.5/search_params/lm_3.0_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.5/search_params/lm_4.0_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.5/search_params/lm_2.5_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.5/search_params/lm_2.5_ps_0.3/default_250/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.5/search_params/lm_4.0_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.5/search_params/lm_3.5_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.5/search_params/lm_3.5_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.5/search_params/lm_3.0_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.5/search_params/lm_3.0_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.5/search_params/lm_2.5_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.5/default_250/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.2/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed/search_params/lm_4.0_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed/search_params/lm_3.5_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed/search_params/lm_3.0_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed/search_params/lm_4.0_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed/search_params/lm_2.5_ps_0/default_250/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed/search_params/lm_2.5_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed/search_params/lm_3.5_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed/search_params/lm_4.0_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed/search_params/lm_3.5_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed/search_params/lm_3.0_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed/search_params/lm_3.0_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed/search_params/lm_2.5_ps_0.5/default_250/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment_before/glow_enc192_200ep_not_silence_preprocessed/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment_before/glow_enc192_not_pretrained/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment_before/glow_enc192_not_pretrained/lm_tuning/lm2.0_ps0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment_before/glow_enc192_not_pretrained/lm_tuning/lm1.5_ps0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment_before/glow_enc192_not_pretrained/lm_tuning/lm2.5_ps0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment_before/glow_enc192_not_pretrained/lm_tuning/lm3.5_ps0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment_before/glow_enc192_not_pretrained/lm_tuning/lm4.0_ps0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment_before/glow_enc192_not_pretrained/lm_tuning/lm3.0_ps0/default_250/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment_before/glow_enc192_not_pretrained/lm_tuning/lm3.0_ps0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment_before/glow_enc192_not_pretrained/lm_tuning/lm2.0_ps0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment_before/glow_enc192_not_pretrained/lm_tuning/lm1.5_ps0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment_before/glow_enc192_not_pretrained/lm_tuning/lm2.0_ps0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment_before/glow_enc192_not_pretrained/lm_tuning/lm2.5_ps0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment_before/glow_enc192_not_pretrained/lm_tuning/lm1.5_ps0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment_before/glow_enc192_not_pretrained/lm_tuning/lm4.0_ps0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment_before/glow_enc192_not_pretrained/lm_tuning/lm4.0_ps0.3/default_250/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment_before/glow_enc192_not_pretrained/lm_tuning/lm3.5_ps0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment_before/glow_enc192_not_pretrained/lm_tuning/lm2.5_ps0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment_before/glow_enc192_not_pretrained/lm_tuning/lm3.0_ps0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment_before/glow_enc192_not_pretrained/lm_tuning/lm3.5_ps0/default_250/dev-other/sclite/wer',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/no_glow/search_params/lm_4.0_ps_0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/no_glow/search_params/lm_3.0_ps_0/default_250/dev-other/sclite/wer',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/no_glow/search_params/lm_3.0_ps_0.3/default_250/dev-other/sclite/wer',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/no_glow/search_params/lm_3.5_ps_0.5/default_250/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/no_glow/search_params/lm_2.5_ps_0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/no_glow/search_params/lm_4.0_ps_0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/no_glow/search_params/lm_2.5_ps_0/default_250/dev-other/sclite/wer',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/no_glow/search_params/lm_4.0_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/no_glow/search_params/lm_2.5_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/no_glow/search_params/lm_2.5_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/no_glow/search_params/lm_4.0_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/no_glow/search_params/lm_3.5_ps_0/default_250/dev-other/sclite/wer',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/no_glow/search_params/lm_3.0_ps_0.5/default_250/dev-other/sclite/wer',\n", " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/no_glow/search_params/lm_3.5_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/no_glow/search_params/lm_2.5_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/no_glow/search_params/lm_3.0_ps_0/default_250/dev-other/sclite/wer',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/no_glow/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm3.0_ps0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm3.5_ps0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm4.0_ps0.5/default_250/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm2.0_ps0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm2.5_ps0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_not_pretrained/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_not_pretrained/lm_tuning/lm3.0_ps0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_not_pretrained/lm_tuning/lm2.5_ps0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_not_pretrained/lm_tuning/lm2.0_ps0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_not_pretrained/lm_tuning/lm2.0_ps0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_not_pretrained/lm_tuning/lm1.5_ps0.3/default_250/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_not_pretrained/lm_tuning/lm3.0_ps0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_not_pretrained/lm_tuning/lm3.5_ps0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_not_pretrained/lm_tuning/lm4.0_ps0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_not_pretrained/lm_tuning/lm2.5_ps0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_not_pretrained/lm_tuning/lm2.5_ps0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_not_pretrained/lm_tuning/lm1.5_ps0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_not_pretrained/lm_tuning/lm3.5_ps0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_not_pretrained/lm_tuning/lm2.0_ps0.5/default_250/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_not_pretrained/lm_tuning/lm3.0_ps0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_not_pretrained/lm_tuning/lm4.0_ps0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_not_pretrained/lm_tuning/lm4.0_ps0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_not_pretrained/lm_tuning/lm1.5_ps0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_not_pretrained/lm_tuning/lm3.5_ps0.3/default_250/dev-other/sclite/wer',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm3.0_ps0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm2.5_ps0.5/default_250/dev-other/sclite/wer',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm2.0_ps0.3/default_250/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm2.0_ps0/default_250/dev-other/sclite/wer',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm1.5_ps0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm4.0_ps0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm3.0_ps0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm1.5_ps0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm4.0_ps0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm3.5_ps0.3/default_250/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm3.0_ps0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm3.5_ps0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm4.0_ps0.5/default_250/dev-other/sclite/wer',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm2.5_ps0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm3.5_ps0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm2.5_ps0/default_250/dev-other/sclite/wer',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm1.5_ps0.5/default_250/dev-other/sclite/wer',\n", " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm2.0_ps0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm2.5_ps0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_not_pretrained/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_not_pretrained/lm_tuning/lm1.5_ps0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_not_pretrained/lm_tuning/lm2.0_ps0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_not_pretrained/lm_tuning/lm3.5_ps0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_not_pretrained/lm_tuning/lm2.5_ps0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_not_pretrained/lm_tuning/lm2.5_ps0/default_250/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_not_pretrained/lm_tuning/lm1.5_ps0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_not_pretrained/lm_tuning/lm4.0_ps0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_not_pretrained/lm_tuning/lm3.5_ps0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_not_pretrained/lm_tuning/lm4.0_ps0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_not_pretrained/lm_tuning/lm3.0_ps0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_not_pretrained/lm_tuning/lm2.0_ps0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_not_pretrained/lm_tuning/lm1.5_ps0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_not_pretrained/lm_tuning/lm2.0_ps0/default_250/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_not_pretrained/lm_tuning/lm2.5_ps0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_not_pretrained/lm_tuning/lm3.0_ps0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_not_pretrained/lm_tuning/lm3.5_ps0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_not_pretrained/lm_tuning/lm4.0_ps0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_not_pretrained/lm_tuning/lm3.0_ps0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed/tuned/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed/search_params/lm_2.5_ps_0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed/search_params/lm_3.0_ps_0/default_250/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed/search_params/lm_3.0_ps_0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed/search_params/lm_3.5_ps_0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed/search_params/lm_3.5_ps_0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed/search_params/lm_4.0_ps_0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed/search_params/lm_2.5_ps_0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed/search_params/lm_4.0_ps_0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed/search_params/lm_2.5_ps_0/default_250/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed/search_params/lm_3.0_ps_0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed/search_params/lm_3.5_ps_0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed/search_params/lm_4.0_ps_0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed/batch_norm/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed/layer_norm/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/lm_tuning/lm4.0_ps0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm3.5_ps0/default_250/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm3.0_ps0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm4.0_ps0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm4.0_ps0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm3.5_ps0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm1.5_ps0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/search_params/lm_4.0_ps_0.3/default_250/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/search_params/lm_3.5_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/search_params/lm_3.5_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/search_params/lm_3.0_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/search_params/lm_3.0_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/search_params/lm_2.5_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/search_params/lm_4.0_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/search_params/lm_3.5_ps_0.3/default_250/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/search_params/lm_3.0_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/search_params/lm_4.0_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/search_params/lm_2.5_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/search_params/lm_2.5_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc768_200ep_dec_0.05_v2/search_params/lm_2.5_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc768_200ep_dec_0.05_v2/search_params/lm_3.5_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc768_200ep_dec_0.05_v2/search_params/lm_3.0_ps_0.5/default_250/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc768_200ep_dec_0.05_v2/search_params/lm_4.0_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc768_200ep_dec_0.05_v2/search_params/lm_4.0_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc768_200ep_dec_0.05_v2/search_params/lm_2.5_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc768_200ep_dec_0.05_v2/search_params/lm_3.5_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc768_200ep_dec_0.05_v2/search_params/lm_3.0_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc768_200ep_dec_0.05_v2/search_params/lm_3.0_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc768_200ep_dec_0.05_v2/search_params/lm_4.0_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc768_200ep_dec_0.05_v2/search_params/lm_2.5_ps_0.5/default_250/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc768_200ep_dec_0.05_v2/search_params/lm_3.5_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc768_200ep_dec_0.05_v2/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/weak_baseline/lm_tuning/lm3.0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/weak_baseline/lm_tuning/lm2.0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/weak_baseline/lm_tuning/lm3.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/weak_baseline/lm_tuning/lm2.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/weak_baseline/lm_tuning/lm1.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/weak_baseline/lm_tuning/lm4.0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/weak_baseline/default_250/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/lm_tuning/lm2.5_ps0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/lm_tuning/lm2.0_ps0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/lm_tuning/lm1.5_ps0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/lm_tuning/lm3.0_ps0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/lm_tuning/lm3.0_ps0.3/default_250/dev-other/sclite/wer',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/lm_tuning/lm2.0_ps0/default_250/dev-other/sclite/wer',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/lm_tuning/lm3.5_ps0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/lm_tuning/lm3.0_ps0.3/default_250/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/lm_tuning/lm3.0_ps0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/lm_tuning/lm1.5_ps0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/lm_tuning/lm2.0_ps0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/lm_tuning/lm2.5_ps0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/lm_tuning/lm3.5_ps0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/lm_tuning/lm3.5_ps0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/lm_tuning/lm4.0_ps0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/lm_tuning/lm2.5_ps0/default_250/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/lm_tuning/lm3.0_ps0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/lm_tuning/lm1.5_ps0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/lm_tuning/lm2.0_ps0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/lm_tuning/lm4.0_ps0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/lm_tuning/lm2.5_ps0.3/default_250/dev-other/sclite/wer',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/lm_tuning/lm4.0_ps0/default_250/dev-other/sclite/wer',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/lm_tuning/lm1.5_ps0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/lm_tuning/lm2.5_ps0.3/default_250/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/lm_tuning/lm2.0_ps0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/lm_tuning/lm1.5_ps0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/lm_tuning/lm3.0_ps0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/lm_tuning/lm2.5_ps0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/lm_tuning/lm4.0_ps0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/lm_tuning/lm3.5_ps0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/lm_tuning/lm3.5_ps0/default_250/dev-other/sclite/wer',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/default_250/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained_control/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained_control/lm_tuning/lm2.0_ps0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained_control/lm_tuning/lm2.5_ps0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained_control/lm_tuning/lm3.0_ps0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained_control/lm_tuning/lm1.5_ps0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained_control/lm_tuning/lm2.0_ps0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained_control/lm_tuning/lm3.0_ps0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained_control/lm_tuning/lm4.0_ps0.3/default_250/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained_control/lm_tuning/lm3.5_ps0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained_control/lm_tuning/lm3.5_ps0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained_control/lm_tuning/lm2.5_ps0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained_control/lm_tuning/lm2.0_ps0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained_control/lm_tuning/lm2.5_ps0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained_control/lm_tuning/lm1.5_ps0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained_control/lm_tuning/lm4.0_ps0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained_control/lm_tuning/lm1.5_ps0/default_250/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained_control/lm_tuning/lm3.0_ps0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained_control/lm_tuning/lm3.5_ps0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained_control/lm_tuning/lm4.0_ps0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm1.5_ps0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm1.5_ps0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm4.0_ps0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_100ep_xvector_v2/search_params/lm_2.5_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_100ep_xvector_v2/search_params/lm_3.0_ps_0.5/default_250/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_100ep_xvector_v2/search_params/lm_3.5_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_100ep_xvector_v2/search_params/lm_4.0_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_100ep_xvector_v2/search_params/lm_4.0_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_100ep_xvector_v2/search_params/lm_2.5_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_100ep_xvector_v2/search_params/lm_3.0_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_100ep_xvector_v2/search_params/lm_3.5_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_100ep_xvector_v2/search_params/lm_3.0_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_100ep_xvector_v2/search_params/lm_4.0_ps_0.3/default_250/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_100ep_xvector_v2/search_params/lm_2.5_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_100ep_xvector_v2/search_params/lm_3.5_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_100ep_xvector_v2/default_250/dev-other/sclite/wer',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm2.0_ps0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm4.0_ps0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm1.5_ps0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm1.5_ps0/default_250/dev-other/sclite/wer',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm2.5_ps0.3/default_250/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm3.5_ps0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm4.0_ps0.3/default_250/dev-other/sclite/wer',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm3.5_ps0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm2.5_ps0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm4.0_ps0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm3.5_ps0/default_250/dev-other/sclite/wer',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm3.0_ps0.5/default_250/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm2.0_ps0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm2.5_ps0/default_250/dev-other/sclite/wer',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm1.5_ps0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm2.0_ps0.3/default_250/dev-other/sclite/wer',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm2.5_ps0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm2.0_ps0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm3.5_ps0.5/default_250/dev-other/sclite/wer',\n", " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm4.0_ps0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm3.0_ps0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm3.5_ps0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm2.0_ps0/default_250/dev-other/sclite/wer',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm3.0_ps0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm3.0_ps0/default_250/dev-other/sclite/wer',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/default_250/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed_v2/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/no_glow/tuned/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/no_glow/search_params/lm_2.5_ps_0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/no_glow/search_params/lm_4.0_ps_0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/no_glow/search_params/lm_2.5_ps_0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/no_glow/search_params/lm_3.0_ps_0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/no_glow/search_params/lm_3.5_ps_0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/no_glow/search_params/lm_4.0_ps_0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/no_glow/search_params/lm_2.5_ps_0.3/default_250/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/no_glow/search_params/lm_3.0_ps_0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/no_glow/search_params/lm_3.0_ps_0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/no_glow/search_params/lm_3.5_ps_0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/no_glow/search_params/lm_4.0_ps_0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/no_glow/search_params/lm_3.5_ps_0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/no_glow/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/no_glow/tuned_no_prior/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.4/default_250/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_200ep_dec_0.05_v2/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed/tuning/lm_3.0_ps_0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed/tuning/lm_3.5_ps_0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed/tuning/lm_4.0_ps_0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed/tuning/lm_2.5_ps_0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed/tuning/lm_2.5_ps_0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed/tuning/lm_4.0_ps_0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed/tuning/lm_3.0_ps_0.5/default_250/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed/tuning/lm_3.5_ps_0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed/tuning/lm_4.0_ps_0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed/tuning/lm_3.5_ps_0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed/tuning/lm_2.5_ps_0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed/tuning/lm_3.0_ps_0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_100ep_xvector_v2/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_100ep_xvector_v2/search_params/lm_4.0_ps_0.3/default_250/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_100ep_xvector_v2/search_params/lm_3.0_ps_0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_100ep_xvector_v2/search_params/lm_3.5_ps_0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_100ep_xvector_v2/search_params/lm_2.5_ps_0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_100ep_xvector_v2/search_params/lm_2.5_ps_0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_100ep_xvector_v2/search_params/lm_4.0_ps_0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_100ep_xvector_v2/search_params/lm_4.0_ps_0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_100ep_xvector_v2/search_params/lm_3.0_ps_0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_100ep_xvector_v2/search_params/lm_3.5_ps_0.3/default_250/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_100ep_xvector_v2/search_params/lm_3.5_ps_0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_100ep_xvector_v2/search_params/lm_3.0_ps_0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_100ep_xvector_v2/search_params/lm_2.5_ps_0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_200ep_not_silence_preprocessed/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_200ep_not_silence_preprocessed/tuning/lm_2.5_ps_0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_200ep_not_silence_preprocessed/tuning/lm_4.0_ps_0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_200ep_not_silence_preprocessed/tuning/lm_2.5_ps_0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_200ep_not_silence_preprocessed/tuning/lm_4.0_ps_0.5/default_250/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_200ep_not_silence_preprocessed/tuning/lm_3.5_ps_0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_200ep_not_silence_preprocessed/tuning/lm_3.0_ps_0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_200ep_not_silence_preprocessed/tuning/lm_3.5_ps_0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_200ep_not_silence_preprocessed/tuning/lm_2.5_ps_0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_200ep_not_silence_preprocessed/tuning/lm_4.0_ps_0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_200ep_not_silence_preprocessed/tuning/lm_3.5_ps_0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_200ep_not_silence_preprocessed/tuning/lm_3.0_ps_0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_200ep_not_silence_preprocessed/tuning/lm_3.0_ps_0.3/default_250/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.1/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.5/search_params/lm_4.0_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.5/search_params/lm_3.0_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.5/search_params/lm_3.5_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.5/search_params/lm_2.5_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.5/search_params/lm_4.0_ps_0/default_250/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.5/search_params/lm_2.5_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.5/search_params/lm_4.0_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.5/search_params/lm_3.5_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.5/search_params/lm_3.0_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.5/search_params/lm_3.5_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.5/search_params/lm_2.5_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.5/search_params/lm_3.0_ps_0/default_250/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained_control/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained_control/lm_tuning/lm1.5_ps0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained_control/lm_tuning/lm2.0_ps0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained_control/lm_tuning/lm2.5_ps0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained_control/lm_tuning/lm2.5_ps0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained_control/lm_tuning/lm3.5_ps0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained_control/lm_tuning/lm4.0_ps0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained_control/lm_tuning/lm3.5_ps0.5/default_250/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained_control/lm_tuning/lm3.0_ps0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained_control/lm_tuning/lm1.5_ps0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained_control/lm_tuning/lm4.0_ps0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained_control/lm_tuning/lm2.0_ps0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained_control/lm_tuning/lm1.5_ps0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained_control/lm_tuning/lm3.0_ps0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained_control/lm_tuning/lm2.5_ps0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained_control/lm_tuning/lm2.0_ps0/default_250/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained_control/lm_tuning/lm3.5_ps0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained_control/lm_tuning/lm4.0_ps0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained_control/lm_tuning/lm3.0_ps0.5/default_250/dev-other/sclite/wer',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_1/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_1/search_params/lm_4.0_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_1/search_params/lm_2.5_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_1/search_params/lm_3.5_ps_0/default_250/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_1/search_params/lm_3.0_ps_0.3/default_250/dev-other/sclite/wer',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_1/search_params/lm_3.5_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_1/search_params/lm_4.0_ps_0.3/default_250/dev-other/sclite/wer',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_1/search_params/lm_3.0_ps_0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_1/search_params/lm_3.0_ps_0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_1/search_params/lm_3.5_ps_0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_1/search_params/lm_2.5_ps_0.5/default_250/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_1/search_params/lm_2.5_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_1/search_params/lm_2.5_ps_0.3/default_250/dev-other/sclite/wer',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_1/search_params/lm_4.0_ps_0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_1/search_params/lm_4.0_ps_0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_1/search_params/lm_3.5_ps_0.3/default_250/dev-other/sclite/wer',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_1/search_params/lm_3.0_ps_0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_1/search_params/lm_2.5_ps_0.3/default_250/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.5/search_params/lm_3.5_ps_0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.5/search_params/lm_3.0_ps_0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.5/search_params/lm_3.0_ps_0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.5/search_params/lm_4.0_ps_0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.5/search_params/lm_2.5_ps_0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.5/search_params/lm_3.5_ps_0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.5/search_params/lm_3.5_ps_0.5/default_250/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.5/search_params/lm_3.0_ps_0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.5/search_params/lm_4.0_ps_0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.5/search_params/lm_4.0_ps_0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.5/search_params/lm_2.5_ps_0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.5/search_params/lm_2.5_ps_0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.2/default_250/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_100ep_xvector/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_100ep_xvector/search_params/lm_2.5_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_1/search_params/lm_2.5_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_1/search_params/lm_3.5_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_1/search_params/lm_4.0_ps_0.5/default_250/dev-other/sclite/wer',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_100ep_xvector/search_params/lm_3.0_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_100ep_xvector/search_params/lm_4.0_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_100ep_xvector/search_params/lm_3.0_ps_0.5/default_250/dev-other/sclite/wer',\n", " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_100ep_xvector/search_params/lm_3.5_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_100ep_xvector/search_params/lm_2.5_ps_0.3/default_250/dev-other/sclite/wer',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_100ep_xvector/search_params/lm_3.5_ps_0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_100ep_xvector/search_params/lm_3.0_ps_0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_100ep_xvector/search_params/lm_4.0_ps_0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_100ep_xvector/search_params/lm_4.0_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_100ep_xvector/search_params/lm_4.0_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_100ep_xvector/search_params/lm_3.0_ps_0.3/default_250/dev-other/sclite/wer',\n", " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_100ep_xvector/search_params/lm_2.5_ps_0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_100ep_xvector/search_params/lm_2.5_ps_0.5/default_250/dev-other/sclite/wer',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_100ep_xvector/search_params/lm_3.5_ps_0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_100ep_xvector/search_params/lm_3.0_ps_0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_100ep_xvector/search_params/lm_4.0_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_100ep_xvector/search_params/lm_4.0_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_100ep_xvector/search_params/lm_2.5_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_100ep_xvector/default_250/dev-other/sclite/wer',\n", " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed_v3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment_before/glow_enc192_200ep_not_silence_preprocessed/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/weak_baseline/lm_tuning/lm3.0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/weak_baseline/lm_tuning/lm2.0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/weak_baseline/lm_tuning/lm1.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/weak_baseline/lm_tuning/lm4.0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/weak_baseline/lm_tuning/lm3.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/weak_baseline/lm_tuning/lm2.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/weak_baseline/default_250/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/test/blstm512/warmup/d0.2_b100/default_250/extra/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_4x512_d0.2_b300_fs4/asr_dataset/glow_enc768/not_silence_preprocessed/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x1024_d0.2_b300_fs4/asr_dataset/glow_enc768/not_silence_preprocessed/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/only_blstm_2x512_d0.2_b300_fs4/tts_dataset/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/only_blstm_2x512_d0.2_b300_fs4/asr_dataset/tuning/lm_4.0_ps_0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/only_blstm_2x512_d0.2_b300_fs4/asr_dataset/tuning/lm_3.0_ps_0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/only_blstm_2x512_d0.2_b300_fs4/asr_dataset/tuning/lm_3.0_ps_0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/only_blstm_2x512_d0.2_b300_fs4/asr_dataset/tuning/lm_3.5_ps_0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/only_blstm_2x512_d0.2_b300_fs4/asr_dataset/tuning/lm_3.5_ps_0/default_250/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/only_blstm_2x512_d0.2_b300_fs4/asr_dataset/tuning/lm_2.5_ps_0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/only_blstm_2x512_d0.2_b300_fs4/asr_dataset/tuning/lm_2.5_ps_0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/only_blstm_2x512_d0.2_b300_fs4/asr_dataset/tuning/lm_4.0_ps_0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/only_blstm_2x512_d0.2_b300_fs4/asr_dataset/tuning/lm_4.0_ps_0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/only_blstm_2x512_d0.2_b300_fs4/asr_dataset/tuning/lm_3.0_ps_0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/only_blstm_2x512_d0.2_b300_fs4/asr_dataset/tuning/lm_3.5_ps_0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/only_blstm_2x512_d0.2_b300_fs4/asr_dataset/tuning/lm_2.5_ps_0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/only_blstm_2x512_d0.2_b300_fs4/asr_dataset/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/with_sigma/not_silence_preprocessed/lm5/default_250/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/with_sigma/not_silence_preprocessed/glow_eval/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/with_sigma/not_silence_preprocessed/glow_eval/tuning/lm_2.5_ps_0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/with_sigma/not_silence_preprocessed/glow_eval/tuning/lm_2.5_ps_0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/with_sigma/not_silence_preprocessed/glow_eval/tuning/lm_4.0_ps_0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/with_sigma/not_silence_preprocessed/glow_eval/tuning/lm_4.0_ps_0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/with_sigma/not_silence_preprocessed/glow_eval/tuning/lm_3.5_ps_0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/with_sigma/not_silence_preprocessed/glow_eval/tuning/lm_3.0_ps_0.5/default_250/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/with_sigma/not_silence_preprocessed/glow_eval/tuning/lm_3.5_ps_0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/with_sigma/not_silence_preprocessed/glow_eval/tuning/lm_2.5_ps_0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/with_sigma/not_silence_preprocessed/glow_eval/tuning/lm_4.0_ps_0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/with_sigma/not_silence_preprocessed/glow_eval/tuning/lm_3.0_ps_0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/with_sigma/not_silence_preprocessed/glow_eval/tuning/lm_3.5_ps_0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/with_sigma/not_silence_preprocessed/glow_eval/tuning/lm_3.0_ps_0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/with_sigma/not_silence_preprocessed/default_250/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/with_sigma/silence_preprocessing/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/glow_eval/not_silence_preprocessed/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/glow_eval/not_silence_preprocessed/tuning/lm_2.5_ps_0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/glow_eval/not_silence_preprocessed/tuning/lm_3.0_ps_0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/glow_eval/not_silence_preprocessed/tuning/lm_3.0_ps_0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/glow_eval/not_silence_preprocessed/tuning/lm_3.5_ps_0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/glow_eval/not_silence_preprocessed/tuning/lm_3.5_ps_0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/glow_eval/not_silence_preprocessed/tuning/lm_4.0_ps_0.5/default_250/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/glow_eval/not_silence_preprocessed/tuning/lm_4.0_ps_0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/glow_eval/not_silence_preprocessed/tuning/lm_2.5_ps_0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/glow_eval/not_silence_preprocessed/tuning/lm_2.5_ps_0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/glow_eval/not_silence_preprocessed/tuning/lm_3.0_ps_0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/glow_eval/not_silence_preprocessed/tuning/lm_3.5_ps_0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/glow_eval/not_silence_preprocessed/tuning/lm_4.0_ps_0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/not_silence_preprocessed/tuning/lm_3.5_ps_0/default_250/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/not_silence_preprocessed/tuning/lm_4.0_ps_0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/not_silence_preprocessed/tuning/lm_3.5_ps_0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/not_silence_preprocessed/tuning/lm_3.0_ps_0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/not_silence_preprocessed/tuning/lm_3.0_ps_0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/not_silence_preprocessed/tuning/lm_2.5_ps_0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/not_silence_preprocessed/tuning/lm_4.0_ps_0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/not_silence_preprocessed/tuning/lm_3.5_ps_0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/not_silence_preprocessed/tuning/lm_3.0_ps_0.3/default_250/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/not_silence_preprocessed/tuning/lm_2.5_ps_0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/not_silence_preprocessed/tuning/lm_4.0_ps_0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/not_silence_preprocessed/tuning/lm_2.5_ps_0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/not_silence_preprocessed/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/silence_preprocessed/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment_before/glow_enc192_200ep_not_freezed/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment_before/glow_enc192_200ep/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_simple_encoder/silence_preprocessed/default_250/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm2.5_ps0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm2.0_ps0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm1.5_ps0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm2.0_ps0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm3.0_ps0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm3.0_ps0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm3.5_ps0.3/default_250/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm4.0_ps0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm4.0_ps0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm2.5_ps0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm1.5_ps0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm1.5_ps0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm2.0_ps0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm3.0_ps0.3/default_250/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm3.5_ps0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm4.0_ps0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm3.5_ps0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm2.5_ps0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_freezed/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm2.0_ps0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm1.5_ps0.5/default_250/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm2.0_ps0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm3.0_ps0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm2.5_ps0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm4.0_ps0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm3.5_ps0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm3.0_ps0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm2.0_ps0.3/default_250/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm1.5_ps0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm3.5_ps0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm2.5_ps0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm2.5_ps0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm3.5_ps0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm4.0_ps0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm4.0_ps0/default_250/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm1.5_ps0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm3.0_ps0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_pretrained/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_freezed/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm4.0_ps0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm3.5_ps0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm3.0_ps0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm1.5_ps0.3/default_250/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm2.0_ps0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm3.0_ps0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm2.5_ps0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm2.0_ps0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm3.5_ps0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm4.0_ps0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm3.0_ps0.5/default_250/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm1.5_ps0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm4.0_ps0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm2.0_ps0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm2.5_ps0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm1.5_ps0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm3.5_ps0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm2.5_ps0.3/default_250/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm3.0_ps0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm4.0_ps0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm3.5_ps0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm3.0_ps0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm2.5_ps0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm2.0_ps0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm1.5_ps0.5/default_250/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm2.0_ps0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm3.0_ps0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm3.5_ps0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm4.0_ps0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm4.0_ps0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm1.5_ps0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm2.5_ps0.5/default_250/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm2.5_ps0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm2.0_ps0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm1.5_ps0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm3.5_ps0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_pretrained/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc768_100ep_xvector/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/drop_around_blstm/lm5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/drop_around_blstm/spec_augment/default_250/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/default/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_not_eval/not_silence_preprocessed/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_not_eval/not_silence_preprocessed/tuned/lm_3.5_ps_0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_not_eval/not_silence_preprocessed/tuned/lm_3.0_ps_0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_not_eval/not_silence_preprocessed/tuned/lm_3.0_ps_0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_not_eval/not_silence_preprocessed/tuned/lm_4.0_ps_0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_not_eval/not_silence_preprocessed/tuned/lm_3.5_ps_0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_not_eval/not_silence_preprocessed/tuned/lm_2.5_ps_0.5/default_250/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_not_eval/not_silence_preprocessed/tuned/lm_3.5_ps_0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_not_eval/not_silence_preprocessed/tuned/lm_2.5_ps_0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_not_eval/not_silence_preprocessed/tuned/lm_4.0_ps_0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_not_eval/not_silence_preprocessed/tuned/lm_3.0_ps_0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_not_eval/not_silence_preprocessed/tuned/lm_4.0_ps_0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_not_eval/not_silence_preprocessed/tuned/lm_2.5_ps_0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_eval/not_silence_preprocessed/tuned/lm_3.0_ps_0.3/default_250/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_eval/not_silence_preprocessed/tuned/lm_2.5_ps_0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_eval/not_silence_preprocessed/tuned/lm_4.0_ps_0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_eval/not_silence_preprocessed/tuned/lm_3.5_ps_0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_eval/not_silence_preprocessed/tuned/lm_4.0_ps_0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_eval/not_silence_preprocessed/tuned/lm_2.5_ps_0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_eval/not_silence_preprocessed/tuned/lm_3.0_ps_0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_eval/not_silence_preprocessed/tuned/lm_3.5_ps_0.3/default_250/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_eval/not_silence_preprocessed/tuned/lm_3.0_ps_0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_eval/not_silence_preprocessed/tuned/lm_4.0_ps_0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_eval/not_silence_preprocessed/tuned/lm_2.5_ps_0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_eval/not_silence_preprocessed/tuned/lm_3.5_ps_0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_eval/not_silence_preprocessed/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_eval_spec_aug/not_silence_preprocessed/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/tts_dataset/glow_nar_taco_encoder_16blocks/not_silence_preprocessed/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/tts_dataset/glow_enc192/200epsTTS_early_eval_ep100/not_silence_preprocessed/default_250/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/tts_dataset/glow_enc192/200epsTTS_early_eval_ep100/silence_preprocessed/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/tts_dataset/glow_enc192/not_silence_preprocessed/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/tts_dataset/glow_enc192/200epsTTS/silence_preprocessed/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/tts_dataset/glow_enc192/200epsTTS/not_silence_preprocessed/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/tts_dataset/glow_enc192/100epTTS/silence_preprocessed/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/tts_dataset/glow_with_small_enc/silence_preprocessed/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/tts_dataset/glow_simple_encoder/silence_preprocessed/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/tts_dataset/glow_enc768/with_sigma/not_silence_preprocessed/default_250/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/tts_dataset/glow_enc768/with_sigma/not_silence_preprocessed/lm5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/tts_dataset/glow_enc768/with_sigma/silence_preprocessing/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/tts_dataset/glow_simple_encoder_epoch84/silence_preprocessed/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/tts_dataset/glow_nar_taco_encoder/not_silence_preprocessed/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/tts_dataset/glow_nar_taco_encoder/silence_preprocessed/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/linear_1x512_d0.2_b300_fs4/glow_enc768/tts_dataset/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/linear_1x512_d0.2_b300_fs4/glow_nar_taco_encoder_16blocks/default_250/dev-other/sclite/wer']" + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/no_glow/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/no_glow/tuned_no_prior/default_250/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/no_glow/tuned/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/no_glow/search_params/lm_2.5_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/no_glow/search_params/lm_4.0_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/no_glow/search_params/lm_2.5_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/no_glow/search_params/lm_3.5_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/no_glow/search_params/lm_3.0_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/no_glow/search_params/lm_4.0_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/no_glow/search_params/lm_2.5_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/no_glow/search_params/lm_3.0_ps_0/default_250/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/no_glow/search_params/lm_3.5_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/no_glow/search_params/lm_3.5_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/no_glow/search_params/lm_3.0_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/no_glow/search_params/lm_4.0_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_200ep_dec_0.05_v2/search_params/lm_3.0_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_200ep_dec_0.05_v2/search_params/lm_4.0_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_200ep_dec_0.05_v2/search_params/lm_3.5_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_200ep_dec_0.05_v2/search_params/lm_3.0_ps_0.3/default_250/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_200ep_dec_0.05_v2/search_params/lm_3.5_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_200ep_dec_0.05_v2/search_params/lm_2.5_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_200ep_dec_0.05_v2/search_params/lm_4.0_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_200ep_dec_0.05_v2/search_params/lm_3.5_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_200ep_dec_0.05_v2/search_params/lm_2.5_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_200ep_dec_0.05_v2/search_params/lm_4.0_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_200ep_dec_0.05_v2/search_params/lm_3.0_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_200ep_dec_0.05_v2/search_params/lm_2.5_ps_0.3/default_250/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_200ep_dec_0.05_v2/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/search_params/lm_2.5_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/search_params/lm_3.5_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/search_params/lm_3.0_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/search_params/lm_3.0_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/search_params/lm_3.5_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/search_params/lm_4.0_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/search_params/lm_2.5_ps_0.3/default_250/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/search_params/lm_3.0_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/search_params/lm_3.5_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/search_params/lm_2.5_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/search_params/lm_4.0_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/search_params/lm_4.0_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed_v2/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_dec_0.05_v2/search_params/lm_3.5_ps_0/default_250/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_dec_0.05_v2/search_params/lm_4.0_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_dec_0.05_v2/search_params/lm_3.5_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_dec_0.05_v2/search_params/lm_3.0_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_dec_0.05_v2/search_params/lm_3.0_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_dec_0.05_v2/search_params/lm_2.5_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_dec_0.05_v2/search_params/lm_4.0_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_dec_0.05_v2/search_params/lm_3.5_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_dec_0.05_v2/search_params/lm_3.0_ps_0.3/default_250/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_dec_0.05_v2/search_params/lm_4.0_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_dec_0.05_v2/search_params/lm_2.5_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_dec_0.05_v2/search_params/lm_2.5_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_dec_0.05_v2/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.3/search_params/lm_4.0_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.3/search_params/lm_3.0_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.3/search_params/lm_3.5_ps_0.3/default_250/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.3/search_params/lm_3.5_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.3/search_params/lm_3.0_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.3/search_params/lm_2.5_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.3/search_params/lm_4.0_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.3/search_params/lm_3.0_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.3/search_params/lm_3.5_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.3/search_params/lm_2.5_ps_0/default_250/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.3/search_params/lm_2.5_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.3/search_params/lm_4.0_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc192_200ep_dec_0.05_v2/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc192_200ep_dec_0.05_v2/search_params/lm_3.5_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc192_200ep_dec_0.05_v2/search_params/lm_2.5_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc192_200ep_dec_0.05_v2/search_params/lm_3.5_ps_0.3/default_250/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc192_200ep_dec_0.05_v2/search_params/lm_3.0_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc192_200ep_dec_0.05_v2/search_params/lm_3.0_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc192_200ep_dec_0.05_v2/search_params/lm_4.0_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc192_200ep_dec_0.05_v2/search_params/lm_2.5_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc192_200ep_dec_0.05_v2/search_params/lm_3.5_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc192_200ep_dec_0.05_v2/search_params/lm_2.5_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc192_200ep_dec_0.05_v2/search_params/lm_3.0_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc192_200ep_dec_0.05_v2/search_params/lm_4.0_ps_0/default_250/dev-other/sclite/wer',\n", + 
" '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc192_200ep_dec_0.05_v2/search_params/lm_4.0_ps_0.3/default_250/dev-other/sclite/wer']" ] }, "execution_count": 2, @@ -929,7 +1081,8 @@ "outputs": [], "source": [ "df_final = df_indexed.round(decimals=3)\n", - "df_final = df_final.fillna(\"-\")" + "df_final = df_final.fillna(\"-\")\n", + "df_final[\"Missing glow.eval\"] = df_final[\"Experiment\"].str.contains(\"glow_not_eval\")" ] }, { @@ -948,66 +1101,62 @@ "name": "stdout", "output_type": "stream", "text": [ - "| | Group | Experiment | WER (dev-other) | Count | Tuned | CTC | dev CTC | overfitting | dev MLE | dev DP | Joint | Still running | Training data available | Num Epochs | LR | ASR Model Type |\n", - "|----:|:--------------------------------|:------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------:|--------:|:--------|:------|:----------|:--------------|:--------------------|:--------------------|:--------|:----------------|:--------------------------|:-------------|:-------------------------------------------------|:-----------------|\n", - "| 9 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_ddi_actnorm/ | 66.7 | 1 | False | 0.282 | 0.583 | 2.068 | -0.696242607904203 | 0.4012426779125676 | True | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer |\n", - "| 11 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_control/ | 38.5 | 1 | False | 0.216 | 0.735 | 3.408 | -0.7795215357433666 | 
1.5493716010541627 | True | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer |\n", - "| 12 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_control_radam/ | 38.3 | 1 | False | 0.214 | 0.731 | 3.414 | -0.7803357613809181 | 1.4600530691219098 | True | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer |\n", - "| 13 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_control_spec_augment/ | 37.9 | 1 | False | 0.501 | 0.588 | 1.173 | -0.7782901200381193 | 1.4496996285337391 | True | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer |\n", - "| 15 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_ddi_actnorm/ | 94.4 | 1 | False | 0.165 | 1.062 | 6.429 | -0.5203935901323954 | 0.21473516580281835 | True | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer |\n", - "| 16 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_encoder_sample_ctc_scale_0.1/ | 100 | 1 | False | 3.481 | 6.325 | 1.817 | -0.7816050729968331 | 1.1355000418243986 | True | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer |\n", - "| 17 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_radam/ | 43.8 | 1 | False | 0.098 | 1.135 | 11.599 | -0.5829270274350138 | 0.34816459318002063 | True | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer |\n", - "| 
21 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_trainXvector/ | 47 | 1 | False | 0.046 | 1.485 | 32.45 | -0.5923372669653459 | 0.586278223855929 | True | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer |\n", - "| 32 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_coupling_epsilon/ | 30.4 | 1 | False | 0.243 | 0.576 | 2.37 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer |\n", - "| 36 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector_control/ | 46.3 | 1 | False | 0.233 | 0.916 | 3.927 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer |\n", - "| 39 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_xvector/ | 43.5 | 1 | False | 0.251 | 0.762 | 3.03 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer |\n", - "| 44 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/second_step/ga_glowTTS_ASR_ffn_x_vector_v2_blstm_2ndstep/no_specaug/ce_ls_1.0/ | 81 | 1 | False | 0.036 | 2.023 | 55.937 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm |\n", - "| 45 | joint_training/given_alignments | 
/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/second_step/ga_glowTTS_ASR_ffn_x_vector_v2_blstm_2ndstep/specaug/ce_ls_1.0/ | 79.5 | 1 | False | 1.182 | 0.749 | 0.634 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm |\n", - "| 46 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/second_step/ga_glowTTS_ASR_ffn_x_vector_v2_blstm_2ndstep_v2/no_specaug/ce_ls_1.0/ | 86.9 | 1 | False | 0.006 | 3.047 | 476.04 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm |\n", - "| 47 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x1024_d0.2_b300_fs4/asr_dataset/glow_enc768/not_silence_preprocessed/ | 66.4 | 1 | False | 0.582 | 1.039 | 1.786 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm |\n", - "| 48 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/default/ | 55.1 | 1 | False | 0.177 | 1.151 | 6.489 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm |\n", - "| 49 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/drop_around_blstm/lm5/ | 66.3 | 1 | False | 0.599 | 0.948 | 1.581 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm |\n", - "| 50 | librispeech_glow_asr | 
/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/drop_around_blstm/spec_augment/ | 94.2 | 1 | False | 2.438 | 1.396 | 0.573 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm |\n", - "| 52 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_eval_spec_aug/not_silence_preprocessed/ | 67.7 | 1 | False | 0.725 | 1.04 | 1.434 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm |\n", - "| 56 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/silence_preprocessed/ | 78.1 | 1 | False | 0.779 | 1.163 | 1.493 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm |\n", - "| 57 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/with_sigma/not_silence_preprocessed/ | 73.4 | 1 | False | 0.667 | 1.099 | 1.648 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm |\n", - "| 59 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/with_sigma/not_silence_preprocessed/lm5/ | 73.4 | 1 | False | 0.667 | 1.099 | 1.648 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm |\n", - "| 60 | librispeech_glow_asr | 
/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/with_sigma/silence_preprocessing/ | 76.2 | 1 | False | 0.71 | 1.15 | 1.62 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm |\n", - "| 61 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_simple_encoder/silence_preprocessed/ | 50.3 | 1 | False | 0.046 | 1.527 | 33.333 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm |\n", - "| 66 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc768_100ep_xvector/ | 62.4 | 1 | False | 0.326 | 1.049 | 3.219 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm |\n", - "| 67 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment_before/glow_enc192_200ep/ | 92.4 | 1 | False | 2.272 | 1.371 | 0.604 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm |\n", - "| 68 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment_before/glow_enc192_200ep_not_freezed/ | 30.9 | 1 | False | 0.172 | 0.56 | 3.248 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm |\n", - "| 69 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/tts_dataset/glow_enc192/100epTTS/silence_preprocessed/ | 
59.7 | 1 | False | 0.356 | 1.057 | 2.966 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm |\n", - "| 70 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/tts_dataset/glow_enc192/200epsTTS/not_silence_preprocessed/ | 61.8 | 1 | False | 0.449 | 1.042 | 2.32 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm |\n", - "| 71 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/tts_dataset/glow_enc192/200epsTTS/silence_preprocessed/ | 61.8 | 1 | False | 0.408 | 1.04 | 2.552 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm |\n", - "| 72 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/tts_dataset/glow_enc192/200epsTTS_early_eval_ep100/not_silence_preprocessed/ | 60.6 | 1 | False | 0.385 | 1.046 | 2.718 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm |\n", - "| 73 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/tts_dataset/glow_enc192/200epsTTS_early_eval_ep100/silence_preprocessed/ | 60.8 | 1 | False | 0.35 | 1.066 | 3.045 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm |\n", - "| 74 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/tts_dataset/glow_enc192/not_silence_preprocessed/ | 61.1 | 1 | False | 0.402 | 1.038 | 2.579 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm |\n", - "| 75 | 
librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/tts_dataset/glow_enc768/with_sigma/not_silence_preprocessed/ | 70.4 | 1 | False | 0.737 | 1.107 | 1.503 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm |\n", - "| 76 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/tts_dataset/glow_enc768/with_sigma/not_silence_preprocessed/lm5/ | 70.4 | 1 | False | 0.737 | 1.107 | 1.503 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm |\n", - "| 77 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/tts_dataset/glow_enc768/with_sigma/silence_preprocessing/ | 71.9 | 1 | False | 0.767 | 1.135 | 1.479 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm |\n", - "| 78 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/tts_dataset/glow_nar_taco_encoder/not_silence_preprocessed/ | 66.2 | 1 | False | 0.668 | 1.026 | 1.535 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm |\n", - "| 79 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/tts_dataset/glow_nar_taco_encoder/silence_preprocessed/ | 67.7 | 1 | False | 0.622 | 1.081 | 1.738 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm |\n", - "| 80 | librispeech_glow_asr | 
/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/tts_dataset/glow_nar_taco_encoder_16blocks/not_silence_preprocessed/ | 55.1 | 1 | False | 0.105 | 1.32 | 12.536 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm |\n", - "| 81 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/tts_dataset/glow_simple_encoder/silence_preprocessed/ | 56.9 | 1 | False | 0.096 | 1.431 | 14.855 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm |\n", - "| 82 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/tts_dataset/glow_simple_encoder_epoch84/silence_preprocessed/ | 57.4 | 1 | False | 0.079 | 1.519 | 19.301 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm |\n", - "| 83 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/tts_dataset/glow_with_small_enc/silence_preprocessed/ | 79.4 | 1 | False | 1.103 | 1.283 | 1.163 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm |\n", - "| 84 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_4x512_d0.2_b300_fs4/asr_dataset/glow_enc768/not_silence_preprocessed/ | 52.4 | 1 | False | 0.378 | 0.812 | 2.15 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm |\n", - "| 90 | librispeech_glow_asr | 
/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.1/ | 25.8 | 1 | False | 0.406 | 0.422 | 1.041 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer |\n", - "| 91 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.2/ | 25.5 | 1 | False | 0.413 | 0.417 | 1.01 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer |\n", - "| 92 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.3/ | 26.2 | 1 | False | 0.44 | 0.415 | 0.943 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer |\n", - "| 93 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.4/ | 25.2 | 1 | False | 0.416 | 0.413 | 0.993 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer |\n", - "| 97 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed/batch_norm/ | 29.4 | 1 | False | 0.392 | 0.504 | 1.285 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer |\n", - "| 98 | librispeech_glow_asr | 
/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed/layer_norm/ | 28.5 | 1 | False | 0.407 | 0.488 | 1.197 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer |\n", - "| 100 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed_v2/ | 26.6 | 1 | False | 0.216 | 0.589 | 2.724 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer |\n", - "| 101 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed_v3/ | 26.3 | 1 | False | 0.211 | 0.591 | 2.799 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer |\n", - "| 106 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_200ep_dec_0.05_v2/ | 41.3 | 1 | False | 0.423 | 0.751 | 1.777 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer |\n", - "| 109 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment_before/glow_enc192_200ep_not_silence_preprocessed/ | 100 | 1 | False | 3.454 | 5.802 | 1.68 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer |\n", - "| 111 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/tts_dataset/spec_augment/no_glow/ | 20.9 
| 1 | False | 0.091 | 0.419 | 4.591 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer |\n", - "| 112 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/linear_1x512_d0.2_b300_fs4/glow_enc768/tts_dataset/ | 99.7 | 1 | False | 2.628 | 2.609 | 0.993 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | unknown |\n", - "| 113 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/linear_1x512_d0.2_b300_fs4/glow_nar_taco_encoder_16blocks/ | 98.8 | 1 | False | 2.23 | 2.39 | 1.072 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | unknown |\n", - "| 115 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/only_blstm_2x512_d0.2_b300_fs4/tts_dataset/ | 59.8 | 1 | False | 0.071 | 1.537 | 21.521 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm |\n", - "| 116 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/test/blstm512/warmup/d0.2_b100/default_250/extra/dev-other/sclite/wer | 127.9 | 1 | False | - | - | - | - | - | False | False | True | - | - | blstm |\n" + "| | Group | Experiment | WER (dev-other) | Count | Tuned | CTC | dev CTC | overfitting | dev MLE | dev DP | Joint | Still running | Training data available | Num Epochs | LR | ASR Model Type | Missing glow.eval |\n", + 
"|----:|:--------------------------------|:------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------:|--------:|:--------|------:|----------:|--------------:|:--------------------|:--------------------|:--------|:----------------|:--------------------------|-------------:|:-------------------------------------------------|:-----------------|:--------------------|\n", + "| 9 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_ddi_actnorm/ | 66.7 | 1 | False | 0.282 | 0.583 | 2.068 | -0.696242607904203 | 0.4012426779125676 | True | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", + "| 11 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_control/ | 38.5 | 1 | False | 0.216 | 0.735 | 3.408 | -0.7795215357433666 | 1.5493716010541627 | True | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", + "| 12 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_control_radam/ | 38.3 | 1 | False | 0.214 | 0.731 | 3.414 | -0.7803357613809181 | 1.4600530691219098 | True | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", + "| 13 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_control_spec_augment/ | 37.9 | 1 | False | 0.501 | 0.588 | 1.173 | -0.7782901200381193 | 1.4496996285337391 | True | 
False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", + "| 15 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_ddi_actnorm/ | 94.4 | 1 | False | 0.165 | 1.062 | 6.429 | -0.5203935901323954 | 0.21473516580281835 | True | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", + "| 16 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_encoder_sample_ctc_scale_0.1/ | 100 | 1 | False | 3.481 | 6.325 | 1.817 | -0.7816050729968331 | 1.1355000418243986 | True | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", + "| 17 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_radam/ | 43.8 | 1 | False | 0.098 | 1.135 | 11.599 | -0.5829270274350138 | 0.34816459318002063 | True | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", + "| 21 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_trainXvector/ | 47 | 1 | False | 0.046 | 1.485 | 32.45 | -0.5923372669653459 | 0.586278223855929 | True | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", + "| 32 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_coupling_epsilon/ | 30.4 | 1 | False | 0.243 | 0.576 | 2.37 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", + "| 36 | 
joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector_control/ | 46.3 | 1 | False | 0.233 | 0.916 | 3.927 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", + "| 39 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_xvector/ | 43.5 | 1 | False | 0.251 | 0.762 | 3.03 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", + "| 45 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/second_step/ga_glowTTS_ASR_ffn_x_vector_v2_blstm_2ndstep/no_specaug/ce_ls_1.0/ | 81 | 1 | False | 0.036 | 2.023 | 55.937 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n", + "| 46 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/second_step/ga_glowTTS_ASR_ffn_x_vector_v2_blstm_2ndstep/specaug/ce_ls_1.0/ | 79.5 | 1 | False | 1.182 | 0.749 | 0.634 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n", + "| 47 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/second_step/ga_glowTTS_ASR_ffn_x_vector_v2_blstm_2ndstep_v2/no_specaug/ce_ls_1.0/ | 86.9 | 1 | False | 0.006 | 3.047 | 476.04 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n", + "| 48 | librispeech_glow_asr | 
/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x1024_d0.2_b300_fs4/asr_dataset/glow_enc768/not_silence_preprocessed/ | 66.4 | 1 | False | 0.582 | 1.039 | 1.786 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n", + "| 49 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/default/ | 55.1 | 1 | False | 0.177 | 1.151 | 6.489 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n", + "| 50 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/drop_around_blstm/lm5/ | 66.3 | 1 | False | 0.599 | 0.948 | 1.581 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n", + "| 51 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/drop_around_blstm/spec_augment/ | 94.2 | 1 | False | 2.438 | 1.396 | 0.573 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n", + "| 53 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_eval_spec_aug/not_silence_preprocessed/ | 67.7 | 1 | False | 0.725 | 1.04 | 1.434 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n", + "| 57 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/silence_preprocessed/ | 78.1 | 1 | False | 0.779 | 
1.163 | 1.493 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n", + "| 58 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/with_sigma/not_silence_preprocessed/ | 73.4 | 1 | False | 0.667 | 1.099 | 1.648 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n", + "| 60 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/with_sigma/silence_preprocessing/ | 76.2 | 1 | False | 0.71 | 1.15 | 1.62 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n", + "| 61 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_simple_encoder/silence_preprocessed/ | 50.3 | 1 | False | 0.046 | 1.527 | 33.333 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n", + "| 66 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc768_100ep_xvector/ | 62.4 | 1 | False | 0.326 | 1.049 | 3.219 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n", + "| 67 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment_before/glow_enc192_200ep/ | 92.4 | 1 | False | 2.272 | 1.371 | 0.604 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n", + "| 68 | 
librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment_before/glow_enc192_200ep_not_freezed/ | 30.9 | 1 | False | 0.172 | 0.56 | 3.248 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n", + "| 69 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/tts_dataset/glow_enc192/100epTTS/silence_preprocessed/ | 59.7 | 1 | False | 0.356 | 1.057 | 2.966 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n", + "| 70 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/tts_dataset/glow_enc192/200epsTTS/not_silence_preprocessed/ | 61.8 | 1 | False | 0.449 | 1.042 | 2.32 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n", + "| 71 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/tts_dataset/glow_enc192/200epsTTS/silence_preprocessed/ | 61.8 | 1 | False | 0.408 | 1.04 | 2.552 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n", + "| 72 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/tts_dataset/glow_enc192/200epsTTS_early_eval_ep100/not_silence_preprocessed/ | 60.6 | 1 | False | 0.385 | 1.046 | 2.718 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n", + "| 73 | librispeech_glow_asr | 
/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/tts_dataset/glow_enc192/200epsTTS_early_eval_ep100/silence_preprocessed/ | 60.8 | 1 | False | 0.35 | 1.066 | 3.045 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n", + "| 74 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/tts_dataset/glow_enc192/not_silence_preprocessed/ | 61.1 | 1 | False | 0.402 | 1.038 | 2.579 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n", + "| 75 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/tts_dataset/glow_enc768/with_sigma/not_silence_preprocessed/ | 70.4 | 1 | False | 0.737 | 1.107 | 1.503 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n", + "| 76 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/tts_dataset/glow_enc768/with_sigma/silence_preprocessing/ | 71.9 | 1 | False | 0.767 | 1.135 | 1.479 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n", + "| 77 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/tts_dataset/glow_nar_taco_encoder/not_silence_preprocessed/ | 66.2 | 1 | False | 0.668 | 1.026 | 1.535 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n", + "| 78 | librispeech_glow_asr | 
/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/tts_dataset/glow_nar_taco_encoder/silence_preprocessed/ | 67.7 | 1 | False | 0.622 | 1.081 | 1.738 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n", + "| 79 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/tts_dataset/glow_nar_taco_encoder_16blocks/not_silence_preprocessed/ | 55.1 | 1 | False | 0.105 | 1.32 | 12.536 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n", + "| 80 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/tts_dataset/glow_simple_encoder/silence_preprocessed/ | 56.9 | 1 | False | 0.096 | 1.431 | 14.855 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n", + "| 81 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/tts_dataset/glow_simple_encoder_epoch84/silence_preprocessed/ | 57.4 | 1 | False | 0.079 | 1.519 | 19.301 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n", + "| 82 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/tts_dataset/glow_with_small_enc/silence_preprocessed/ | 79.4 | 1 | False | 1.103 | 1.283 | 1.163 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n", + "| 83 | librispeech_glow_asr | 
/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_4x512_d0.2_b300_fs4/asr_dataset/glow_enc768/not_silence_preprocessed/ | 52.4 | 1 | False | 0.378 | 0.812 | 2.15 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n", + "| 94 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed_v2/ | 26.6 | 1 | False | 0.216 | 0.589 | 2.724 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", + "| 95 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed_v3/ | 26.3 | 1 | False | 0.211 | 0.591 | 2.799 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", + "| 108 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.1/ | 25.8 | 1 | False | 0.406 | 0.422 | 1.041 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | True |\n", + "| 109 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.2/ | 25.5 | 1 | False | 0.413 | 0.417 | 1.01 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | True |\n", + "| 110 | librispeech_glow_asr | 
/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.3/ | 26.2 | 1 | False | 0.44 | 0.415 | 0.943 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | True |\n", + "| 111 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.4/ | 25.2 | 1 | False | 0.416 | 0.413 | 0.993 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | True |\n", + "| 115 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed/batch_norm/ | 29.4 | 1 | False | 0.392 | 0.504 | 1.285 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | True |\n", + "| 116 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed/layer_norm/ | 28.5 | 1 | False | 0.407 | 0.488 | 1.197 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | True |\n", + "| 118 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment_before/glow_enc192_200ep_not_silence_preprocessed/ | 100 | 1 | False | 3.454 | 5.802 | 1.68 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | True |\n", + "| 119 | librispeech_glow_asr | 
/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/tts_dataset/spec_augment/no_glow/ | 20.9 | 1 | False | 0.091 | 0.419 | 4.591 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", + "| 120 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/linear_1x512_d0.2_b300_fs4/glow_enc768/tts_dataset/ | 99.7 | 1 | False | 2.628 | 2.609 | 0.993 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | unknown | False |\n", + "| 121 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/linear_1x512_d0.2_b300_fs4/glow_nar_taco_encoder_16blocks/ | 98.8 | 1 | False | 2.23 | 2.39 | 1.072 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | unknown | False |\n", + "| 124 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/only_blstm_2x512_d0.2_b300_fs4/tts_dataset/ | 59.8 | 1 | False | 0.071 | 1.537 | 21.521 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n" ] } ], @@ -1033,18 +1182,19 @@ "name": "stdout", "output_type": "stream", "text": [ - "| | Group | Experiment | WER (dev-other) | Count | Tuned | CTC | dev CTC | overfitting | dev MLE | dev DP | Joint | Still running | Training data available | Num Epochs | LR | ASR Model Type |\n", - 
"|----:|:---------------------|:----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------:|--------:|:--------|------:|----------:|--------------:|:----------|:---------|:--------|:----------------|:--------------------------|-------------:|:-----------------------------------------------|:-----------------|\n", - "| 51 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_eval/not_silence_preprocessed/ | 58 | 13 | True | 0.022 | 2.315 | 104.406 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm |\n", - "| 53 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_not_eval/not_silence_preprocessed/ | 44.6 | 13 | True | 0.326 | 1.042 | 3.202 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm |\n", - "| 54 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/glow_eval/not_silence_preprocessed/ | 63.8 | 13 | True | 0.065 | 2.401 | 36.955 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm |\n", - "| 55 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/not_silence_preprocessed/ | 55.4 | 13 | True | 0.882 | 1.139 | 1.292 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm |\n", - "| 58 | librispeech_glow_asr | 
/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/with_sigma/not_silence_preprocessed/glow_eval/ | 63.2 | 13 | True | 0.062 | 2.427 | 39.391 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm |\n", - "| 62 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_freezed/ | 34 | 19 | True | 0.058 | 0.913 | 15.782 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm |\n", - "| 63 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_pretrained/ | 33.8 | 19 | True | 0.003 | 1.209 | 454.454 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm |\n", - "| 64 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_freezed/ | 31.4 | 19 | True | 0.024 | 0.926 | 39.211 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm |\n", - "| 65 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_pretrained/ | 23.9 | 19 | True | 0.187 | 0.566 | 3.019 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm |\n", - "| 114 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/only_blstm_2x512_d0.2_b300_fs4/asr_dataset/ | 
49.4 | 13 | True | 0.028 | 1.672 | 58.743 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm |\n" + "| | Group | Experiment | WER (dev-other) | Count | Tuned | CTC | dev CTC | overfitting | dev MLE | dev DP | Joint | Still running | Training data available | Num Epochs | LR | ASR Model Type | Missing glow.eval |\n", + "|----:|:---------------------|:----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------:|--------:|:--------|------:|----------:|--------------:|:----------|:---------|:--------|:----------------|:--------------------------|-------------:|:-----------------------------------------------|:-----------------|:--------------------|\n", + "| 52 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_eval/not_silence_preprocessed/ | 58 | 13 | True | 0.022 | 2.315 | 104.406 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n", + "| 54 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_not_eval/not_silence_preprocessed/ | 44.6 | 13 | True | 0.326 | 1.042 | 3.202 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | True |\n", + "| 55 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/glow_eval/not_silence_preprocessed/ | 63.8 | 13 | True | 0.065 | 2.401 | 36.955 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n", + "| 56 
| librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/not_silence_preprocessed/ | 55.4 | 13 | True | 0.882 | 1.139 | 1.292 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n", + "| 59 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/with_sigma/not_silence_preprocessed/glow_eval/ | 63.2 | 13 | True | 0.062 | 2.427 | 39.391 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n", + "| 62 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_freezed/ | 34 | 19 | True | 0.058 | 0.913 | 15.782 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n", + "| 63 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_pretrained/ | 33.8 | 19 | True | 0.003 | 1.209 | 454.454 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n", + "| 64 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_freezed/ | 31.4 | 19 | True | 0.024 | 0.926 | 39.211 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n", + "| 65 | librispeech_glow_asr | 
/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_pretrained/ | 23.9 | 19 | True | 0.187 | 0.566 | 3.019 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n", + "| 122 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/only_blstm_2x512_d0.2_b300_fs4/asr_dataset/ | 49.4 | 13 | True | 0.028 | 1.672 | 58.743 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n", + "| 123 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/only_blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/ | 57 | 13 | True | 0.508 | 0.863 | 1.698 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n" ] } ], @@ -1068,52 +1218,50 @@ "name": "stdout", "output_type": "stream", "text": [ - "| | Group | Experiment | WER (dev-other) | Count | Tuned | CTC | dev CTC | overfitting | dev MLE | dev DP | Joint | Still running | Training data available | Num Epochs | LR | ASR Model Type |\n", - "|----:|:--------------------------------|:------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------:|--------:|:--------|:------|:----------|:--------------|:----------|:---------|:--------|:----------------|:--------------------------|:-------------|:-----------------------------------------------|:-----------------|\n", - "| 44 | joint_training/given_alignments | 
/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/second_step/ga_glowTTS_ASR_ffn_x_vector_v2_blstm_2ndstep/no_specaug/ce_ls_1.0/ | 81 | 1 | False | 0.036 | 2.023 | 55.937 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm |\n", - "| 45 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/second_step/ga_glowTTS_ASR_ffn_x_vector_v2_blstm_2ndstep/specaug/ce_ls_1.0/ | 79.5 | 1 | False | 1.182 | 0.749 | 0.634 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm |\n", - "| 46 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/second_step/ga_glowTTS_ASR_ffn_x_vector_v2_blstm_2ndstep_v2/no_specaug/ce_ls_1.0/ | 86.9 | 1 | False | 0.006 | 3.047 | 476.04 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm |\n", - "| 47 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x1024_d0.2_b300_fs4/asr_dataset/glow_enc768/not_silence_preprocessed/ | 66.4 | 1 | False | 0.582 | 1.039 | 1.786 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm |\n", - "| 48 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/default/ | 55.1 | 1 | False | 0.177 | 1.151 | 6.489 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm |\n", - "| 49 | librispeech_glow_asr | 
/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/drop_around_blstm/lm5/ | 66.3 | 1 | False | 0.599 | 0.948 | 1.581 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm |\n", - "| 50 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/drop_around_blstm/spec_augment/ | 94.2 | 1 | False | 2.438 | 1.396 | 0.573 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm |\n", - "| 51 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_eval/not_silence_preprocessed/ | 58 | 13 | True | 0.022 | 2.315 | 104.406 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm |\n", - "| 52 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_eval_spec_aug/not_silence_preprocessed/ | 67.7 | 1 | False | 0.725 | 1.04 | 1.434 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm |\n", - "| 53 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_not_eval/not_silence_preprocessed/ | 44.6 | 13 | True | 0.326 | 1.042 | 3.202 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm |\n", - "| 54 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/glow_eval/not_silence_preprocessed/ | 
63.8 | 13 | True | 0.065 | 2.401 | 36.955 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm |\n", - "| 55 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/not_silence_preprocessed/ | 55.4 | 13 | True | 0.882 | 1.139 | 1.292 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm |\n", - "| 56 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/silence_preprocessed/ | 78.1 | 1 | False | 0.779 | 1.163 | 1.493 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm |\n", - "| 57 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/with_sigma/not_silence_preprocessed/ | 73.4 | 1 | False | 0.667 | 1.099 | 1.648 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm |\n", - "| 58 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/with_sigma/not_silence_preprocessed/glow_eval/ | 63.2 | 13 | True | 0.062 | 2.427 | 39.391 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm |\n", - "| 59 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/with_sigma/not_silence_preprocessed/lm5/ | 73.4 | 1 | False | 0.667 | 1.099 | 1.648 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm |\n", - "| 60 | 
librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/with_sigma/silence_preprocessing/ | 76.2 | 1 | False | 0.71 | 1.15 | 1.62 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm |\n", - "| 61 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_simple_encoder/silence_preprocessed/ | 50.3 | 1 | False | 0.046 | 1.527 | 33.333 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm |\n", - "| 62 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_freezed/ | 34 | 19 | True | 0.058 | 0.913 | 15.782 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm |\n", - "| 63 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_pretrained/ | 33.8 | 19 | True | 0.003 | 1.209 | 454.454 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm |\n", - "| 64 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_freezed/ | 31.4 | 19 | True | 0.024 | 0.926 | 39.211 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm |\n", - "| 65 | librispeech_glow_asr | 
/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_pretrained/ | 23.9 | 19 | True | 0.187 | 0.566 | 3.019 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm |\n", - "| 66 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc768_100ep_xvector/ | 62.4 | 1 | False | 0.326 | 1.049 | 3.219 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm |\n", - "| 67 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment_before/glow_enc192_200ep/ | 92.4 | 1 | False | 2.272 | 1.371 | 0.604 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm |\n", - "| 68 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment_before/glow_enc192_200ep_not_freezed/ | 30.9 | 1 | False | 0.172 | 0.56 | 3.248 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm |\n", - "| 69 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/tts_dataset/glow_enc192/100epTTS/silence_preprocessed/ | 59.7 | 1 | False | 0.356 | 1.057 | 2.966 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm |\n", - "| 70 | librispeech_glow_asr | 
/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/tts_dataset/glow_enc192/200epsTTS/not_silence_preprocessed/ | 61.8 | 1 | False | 0.449 | 1.042 | 2.32 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm |\n", - "| 71 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/tts_dataset/glow_enc192/200epsTTS/silence_preprocessed/ | 61.8 | 1 | False | 0.408 | 1.04 | 2.552 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm |\n", - "| 72 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/tts_dataset/glow_enc192/200epsTTS_early_eval_ep100/not_silence_preprocessed/ | 60.6 | 1 | False | 0.385 | 1.046 | 2.718 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm |\n", - "| 73 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/tts_dataset/glow_enc192/200epsTTS_early_eval_ep100/silence_preprocessed/ | 60.8 | 1 | False | 0.35 | 1.066 | 3.045 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm |\n", - "| 74 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/tts_dataset/glow_enc192/not_silence_preprocessed/ | 61.1 | 1 | False | 0.402 | 1.038 | 2.579 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm |\n", - "| 75 | librispeech_glow_asr | 
/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/tts_dataset/glow_enc768/with_sigma/not_silence_preprocessed/ | 70.4 | 1 | False | 0.737 | 1.107 | 1.503 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm |\n", - "| 76 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/tts_dataset/glow_enc768/with_sigma/not_silence_preprocessed/lm5/ | 70.4 | 1 | False | 0.737 | 1.107 | 1.503 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm |\n", - "| 77 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/tts_dataset/glow_enc768/with_sigma/silence_preprocessing/ | 71.9 | 1 | False | 0.767 | 1.135 | 1.479 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm |\n", - "| 78 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/tts_dataset/glow_nar_taco_encoder/not_silence_preprocessed/ | 66.2 | 1 | False | 0.668 | 1.026 | 1.535 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm |\n", - "| 79 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/tts_dataset/glow_nar_taco_encoder/silence_preprocessed/ | 67.7 | 1 | False | 0.622 | 1.081 | 1.738 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm |\n", - "| 80 | librispeech_glow_asr | 
/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/tts_dataset/glow_nar_taco_encoder_16blocks/not_silence_preprocessed/ | 55.1 | 1 | False | 0.105 | 1.32 | 12.536 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm |\n", - "| 81 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/tts_dataset/glow_simple_encoder/silence_preprocessed/ | 56.9 | 1 | False | 0.096 | 1.431 | 14.855 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm |\n", - "| 82 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/tts_dataset/glow_simple_encoder_epoch84/silence_preprocessed/ | 57.4 | 1 | False | 0.079 | 1.519 | 19.301 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm |\n", - "| 83 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/tts_dataset/glow_with_small_enc/silence_preprocessed/ | 79.4 | 1 | False | 1.103 | 1.283 | 1.163 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm |\n", - "| 84 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_4x512_d0.2_b300_fs4/asr_dataset/glow_enc768/not_silence_preprocessed/ | 52.4 | 1 | False | 0.378 | 0.812 | 2.15 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm |\n", - "| 114 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/only_blstm_2x512_d0.2_b300_fs4/asr_dataset/ | 49.4 | 13 | True | 
0.028 | 1.672 | 58.743 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm |\n", - "| 115 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/only_blstm_2x512_d0.2_b300_fs4/tts_dataset/ | 59.8 | 1 | False | 0.071 | 1.537 | 21.521 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm |\n", - "| 116 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/test/blstm512/warmup/d0.2_b100/default_250/extra/dev-other/sclite/wer | 127.9 | 1 | False | - | - | - | - | - | False | False | True | - | - | blstm |\n" + "| | Group | Experiment | WER (dev-other) | Count | Tuned | CTC | dev CTC | overfitting | dev MLE | dev DP | Joint | Still running | Training data available | Num Epochs | LR | ASR Model Type | Missing glow.eval |\n", + "|----:|:--------------------------------|:------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------:|--------:|:--------|------:|----------:|--------------:|:----------|:---------|:--------|:----------------|:--------------------------|-------------:|:-----------------------------------------------|:-----------------|:--------------------|\n", + "| 45 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/second_step/ga_glowTTS_ASR_ffn_x_vector_v2_blstm_2ndstep/no_specaug/ce_ls_1.0/ | 81 | 1 | False | 0.036 | 2.023 | 55.937 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n", + "| 46 | joint_training/given_alignments | 
/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/second_step/ga_glowTTS_ASR_ffn_x_vector_v2_blstm_2ndstep/specaug/ce_ls_1.0/ | 79.5 | 1 | False | 1.182 | 0.749 | 0.634 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n", + "| 47 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/second_step/ga_glowTTS_ASR_ffn_x_vector_v2_blstm_2ndstep_v2/no_specaug/ce_ls_1.0/ | 86.9 | 1 | False | 0.006 | 3.047 | 476.04 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n", + "| 48 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x1024_d0.2_b300_fs4/asr_dataset/glow_enc768/not_silence_preprocessed/ | 66.4 | 1 | False | 0.582 | 1.039 | 1.786 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n", + "| 49 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/default/ | 55.1 | 1 | False | 0.177 | 1.151 | 6.489 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n", + "| 50 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/drop_around_blstm/lm5/ | 66.3 | 1 | False | 0.599 | 0.948 | 1.581 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n", + "| 51 | librispeech_glow_asr | 
/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/drop_around_blstm/spec_augment/ | 94.2 | 1 | False | 2.438 | 1.396 | 0.573 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n", + "| 52 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_eval/not_silence_preprocessed/ | 58 | 13 | True | 0.022 | 2.315 | 104.406 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n", + "| 53 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_eval_spec_aug/not_silence_preprocessed/ | 67.7 | 1 | False | 0.725 | 1.04 | 1.434 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n", + "| 54 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_not_eval/not_silence_preprocessed/ | 44.6 | 13 | True | 0.326 | 1.042 | 3.202 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | True |\n", + "| 55 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/glow_eval/not_silence_preprocessed/ | 63.8 | 13 | True | 0.065 | 2.401 | 36.955 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n", + "| 56 | librispeech_glow_asr | 
/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/not_silence_preprocessed/ | 55.4 | 13 | True | 0.882 | 1.139 | 1.292 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n", + "| 57 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/silence_preprocessed/ | 78.1 | 1 | False | 0.779 | 1.163 | 1.493 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n", + "| 58 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/with_sigma/not_silence_preprocessed/ | 73.4 | 1 | False | 0.667 | 1.099 | 1.648 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n", + "| 59 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/with_sigma/not_silence_preprocessed/glow_eval/ | 63.2 | 13 | True | 0.062 | 2.427 | 39.391 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n", + "| 60 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/with_sigma/silence_preprocessing/ | 76.2 | 1 | False | 0.71 | 1.15 | 1.62 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n", + "| 61 | librispeech_glow_asr | 
/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_simple_encoder/silence_preprocessed/ | 50.3 | 1 | False | 0.046 | 1.527 | 33.333 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n", + "| 62 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_freezed/ | 34 | 19 | True | 0.058 | 0.913 | 15.782 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n", + "| 63 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_pretrained/ | 33.8 | 19 | True | 0.003 | 1.209 | 454.454 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n", + "| 64 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_freezed/ | 31.4 | 19 | True | 0.024 | 0.926 | 39.211 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n", + "| 65 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_pretrained/ | 23.9 | 19 | True | 0.187 | 0.566 | 3.019 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n", + "| 66 | librispeech_glow_asr | 
/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc768_100ep_xvector/ | 62.4 | 1 | False | 0.326 | 1.049 | 3.219 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n", + "| 67 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment_before/glow_enc192_200ep/ | 92.4 | 1 | False | 2.272 | 1.371 | 0.604 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n", + "| 68 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment_before/glow_enc192_200ep_not_freezed/ | 30.9 | 1 | False | 0.172 | 0.56 | 3.248 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n", + "| 69 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/tts_dataset/glow_enc192/100epTTS/silence_preprocessed/ | 59.7 | 1 | False | 0.356 | 1.057 | 2.966 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n", + "| 70 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/tts_dataset/glow_enc192/200epsTTS/not_silence_preprocessed/ | 61.8 | 1 | False | 0.449 | 1.042 | 2.32 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n", + "| 71 | librispeech_glow_asr | 
/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/tts_dataset/glow_enc192/200epsTTS/silence_preprocessed/ | 61.8 | 1 | False | 0.408 | 1.04 | 2.552 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n", + "| 72 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/tts_dataset/glow_enc192/200epsTTS_early_eval_ep100/not_silence_preprocessed/ | 60.6 | 1 | False | 0.385 | 1.046 | 2.718 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n", + "| 73 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/tts_dataset/glow_enc192/200epsTTS_early_eval_ep100/silence_preprocessed/ | 60.8 | 1 | False | 0.35 | 1.066 | 3.045 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n", + "| 74 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/tts_dataset/glow_enc192/not_silence_preprocessed/ | 61.1 | 1 | False | 0.402 | 1.038 | 2.579 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n", + "| 75 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/tts_dataset/glow_enc768/with_sigma/not_silence_preprocessed/ | 70.4 | 1 | False | 0.737 | 1.107 | 1.503 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n", + "| 76 | librispeech_glow_asr | 
/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/tts_dataset/glow_enc768/with_sigma/silence_preprocessing/ | 71.9 | 1 | False | 0.767 | 1.135 | 1.479 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n", + "| 77 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/tts_dataset/glow_nar_taco_encoder/not_silence_preprocessed/ | 66.2 | 1 | False | 0.668 | 1.026 | 1.535 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n", + "| 78 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/tts_dataset/glow_nar_taco_encoder/silence_preprocessed/ | 67.7 | 1 | False | 0.622 | 1.081 | 1.738 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n", + "| 79 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/tts_dataset/glow_nar_taco_encoder_16blocks/not_silence_preprocessed/ | 55.1 | 1 | False | 0.105 | 1.32 | 12.536 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n", + "| 80 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/tts_dataset/glow_simple_encoder/silence_preprocessed/ | 56.9 | 1 | False | 0.096 | 1.431 | 14.855 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n", + "| 81 | librispeech_glow_asr | 
/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/tts_dataset/glow_simple_encoder_epoch84/silence_preprocessed/ | 57.4 | 1 | False | 0.079 | 1.519 | 19.301 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n", + "| 82 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/tts_dataset/glow_with_small_enc/silence_preprocessed/ | 79.4 | 1 | False | 1.103 | 1.283 | 1.163 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n", + "| 83 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_4x512_d0.2_b300_fs4/asr_dataset/glow_enc768/not_silence_preprocessed/ | 52.4 | 1 | False | 0.378 | 0.812 | 2.15 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n", + "| 122 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/only_blstm_2x512_d0.2_b300_fs4/asr_dataset/ | 49.4 | 13 | True | 0.028 | 1.672 | 58.743 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n", + "| 123 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/only_blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/ | 57 | 13 | True | 0.508 | 0.863 | 1.698 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n", + "| 124 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/only_blstm_2x512_d0.2_b300_fs4/tts_dataset/ | 59.8 | 1 | False | 0.071 | 1.537 | 21.521 | - | - | 
False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n" ] } ], @@ -1141,52 +1289,62 @@ "name": "stdout", "output_type": "stream", "text": [ - "| | Group | Experiment | WER (dev-other) | Count | Tuned | CTC | dev CTC | overfitting | dev MLE | dev DP | Joint | Still running | Training data available | Num Epochs | LR | ASR Model Type |\n", - "|----:|:----------------------------------|:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------:|--------:|:--------|:------|:----------|:--------------|:----------|:---------|:--------|:----------------|:--------------------------|:-------------|:-------------------------------------------------|:-----------------|\n", - "| 2 | joint_training/conformer_coupling | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/ddi/glow_ASR_conformer/ | 29.6 | 7 | True | 0.024 | 0.863 | 36.326 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer |\n", - "| 5 | joint_training/conformer_coupling | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glow_ASR_conformer/ | 29.8 | 7 | True | 0.023 | 0.866 | 37.161 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer |\n", - "| 6 | joint_training/conformer_coupling | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glow_ASR_conformer_specaugment_before/ | 18.7 | 7 | True | 0.133 | 0.444 | 3.33 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer |\n", - "| 30 | joint_training/default | 
/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer/ | 38.3 | 7 | True | 0.037 | 1.206 | 32.173 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer |\n", - "| 31 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before/ | 24.1 | 7 | True | 0.244 | 0.575 | 2.358 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer |\n", - "| 32 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_coupling_epsilon/ | 30.4 | 1 | False | 0.243 | 0.576 | 2.37 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer |\n", - "| 33 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_no_jit/ | 23.8 | 5 | True | - | - | - | - | - | False | False | True | - | - | conformer |\n", - "| 34 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_tts_train_segments/ | 24.6 | 8 | True | 0.242 | 0.587 | 2.424 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer |\n", - "| 35 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector/ | 35.3 | 7 | True | 0.233 | 0.9 | 3.862 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer |\n", - "| 36 | joint_training/default | 
/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector_control/ | 46.3 | 1 | False | 0.233 | 0.916 | 3.927 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer |\n", - "| 37 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector_v2/ | 24.5 | 7 | True | 0.23 | 0.59 | 2.568 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer |\n", - "| 38 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector_v3/ | 24.6 | 7 | True | 0.239 | 0.594 | 2.488 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer |\n", - "| 39 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_xvector/ | 43.5 | 1 | False | 0.251 | 0.762 | 3.03 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer |\n", - "| 40 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_xvectorgrad_clip_10/ | 32.4 | 7 | True | 0.244 | 0.761 | 3.119 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer |\n", - "| 41 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer/ | 29.5 | 7 | True | 0.216 | 0.73 | 3.38 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer |\n", - "| 42 | joint_training/default | 
/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer_spec_augment/ | 25.6 | 7 | True | 0.491 | 0.587 | 1.196 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer |\n", - "| 43 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer_spec_augment_tts_train_segments/ | 25.4 | 7 | True | 0.502 | 0.592 | 1.181 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer |\n", - "| 85 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed/ | 25.5 | 13 | True | 0.03 | 0.732 | 24.376 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer |\n", - "| 86 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/ | 28.1 | 19 | True | 0.001 | 1.248 | 2248.206 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer |\n", - "| 87 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_not_pretrained/ | 28.7 | 19 | True | 0.001 | 1.295 | 1861.864 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer |\n", - "| 88 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/no_glow/ | 27.1 | 13 | True | 0.001 | 1.193 | 975.256 | - | - | False | False | True | 
250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer |\n", - "| 89 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed/ | 21.2 | 13 | True | 0.345 | 0.462 | 1.339 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer |\n", - "| 90 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.1/ | 25.8 | 1 | False | 0.406 | 0.422 | 1.041 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer |\n", - "| 91 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.2/ | 25.5 | 1 | False | 0.413 | 0.417 | 1.01 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer |\n", - "| 92 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.3/ | 26.2 | 1 | False | 0.44 | 0.415 | 0.943 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer |\n", - "| 93 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.4/ | 25.2 | 1 | False | 0.416 | 0.413 | 0.993 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer |\n", - "| 94 | librispeech_glow_asr 
| /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.5/ | 20 | 13 | True | 0.416 | 0.418 | 1.005 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer |\n", - "| 95 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_1/ | 19.4 | 13 | True | 0.364 | 0.417 | 1.144 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer |\n", - "| 96 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed/ | 21.8 | 14 | True | 0.368 | 0.474 | 1.287 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer |\n", - "| 97 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed/batch_norm/ | 29.4 | 1 | False | 0.392 | 0.504 | 1.285 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer |\n", - "| 98 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed/layer_norm/ | 28.5 | 1 | False | 0.407 | 0.488 | 1.197 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer |\n", - "| 99 | librispeech_glow_asr | 
/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/ | 26.1 | 19 | True | 0.003 | 1.046 | 335.647 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer |\n", - "| 100 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed_v2/ | 26.6 | 1 | False | 0.216 | 0.589 | 2.724 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer |\n", - "| 101 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed_v3/ | 26.3 | 1 | False | 0.211 | 0.591 | 2.799 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer |\n", - "| 102 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/ | 17.6 | 19 | True | 0.086 | 0.512 | 5.958 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer |\n", - "| 103 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained_control/ | 18.3 | 19 | True | 0.08 | 0.548 | 6.834 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer |\n", - "| 104 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_100ep_xvector/ | 
23.6 | 13 | True | 0.42 | 0.476 | 1.133 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer |\n", - "| 105 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_100ep_xvector_v2/ | 19.7 | 13 | True | 0.281 | 0.458 | 1.63 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer |\n", - "| 106 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_200ep_dec_0.05_v2/ | 41.3 | 1 | False | 0.423 | 0.751 | 1.777 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer |\n", - "| 107 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_200ep_not_silence_preprocessed/ | 100 | 13 | True | 3.463 | 5.332 | 1.54 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer |\n", - "| 108 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/no_glow/ | 18 | 15 | True | 0.083 | 0.544 | 6.584 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer |\n", - "| 109 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment_before/glow_enc192_200ep_not_silence_preprocessed/ | 100 | 1 | False | 3.454 | 5.802 | 1.68 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer |\n", - "| 110 | librispeech_glow_asr | 
/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/weak_baseline/ | 25.5 | 7 | True | 0.5 | 0.589 | 1.177 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer |\n", - "| 111 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/tts_dataset/spec_augment/no_glow/ | 20.9 | 1 | False | 0.091 | 0.419 | 4.591 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer |\n" + "| | Group | Experiment | WER (dev-other) | Count | Tuned | CTC | dev CTC | overfitting | dev MLE | dev DP | Joint | Still running | Training data available | Num Epochs | LR | ASR Model Type | Missing glow.eval |\n", + "|----:|:----------------------------------|:-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------:|--------:|:--------|:------|:----------|:--------------|:----------|:---------|:--------|:----------------|:--------------------------|:-------------|:-------------------------------------------------|:-----------------|:--------------------|\n", + "| 2 | joint_training/conformer_coupling | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/ddi/glow_ASR_conformer/ | 29.6 | 7 | True | 0.024 | 0.863 | 36.326 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", + "| 5 | joint_training/conformer_coupling | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glow_ASR_conformer/ | 29.8 | 7 | True | 0.023 | 0.866 | 37.161 | - | - | False | False | True | 250 | [0: 7e-06, 
109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", + "| 6 | joint_training/conformer_coupling | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glow_ASR_conformer_specaugment_before/ | 18.7 | 7 | True | 0.133 | 0.444 | 3.33 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", + "| 30 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer/ | 38.3 | 7 | True | 0.037 | 1.206 | 32.173 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", + "| 31 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before/ | 24.1 | 7 | True | 0.244 | 0.575 | 2.358 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", + "| 32 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_coupling_epsilon/ | 30.4 | 1 | False | 0.243 | 0.576 | 2.37 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", + "| 33 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_no_jit/ | 23.8 | 5 | True | - | - | - | - | - | False | False | True | - | - | conformer | False |\n", + "| 34 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_tts_train_segments/ | 24.6 | 8 | True | 0.242 | 0.587 | 2.424 | - | - | False | False 
| True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", + "| 35 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector/ | 35.3 | 7 | True | 0.233 | 0.9 | 3.862 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", + "| 36 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector_control/ | 46.3 | 1 | False | 0.233 | 0.916 | 3.927 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", + "| 37 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector_v2/ | 24.5 | 7 | True | 0.23 | 0.59 | 2.568 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", + "| 38 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector_v3/ | 24.6 | 7 | True | 0.239 | 0.594 | 2.488 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", + "| 39 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_xvector/ | 43.5 | 1 | False | 0.251 | 0.762 | 3.03 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", + "| 40 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_xvector_eval/ | 37 
| 7 | True | 0.167 | 0.931 | 5.565 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", + "| 41 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_xvectorgrad_clip_10/ | 32.4 | 7 | True | 0.244 | 0.761 | 3.119 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", + "| 42 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer/ | 29.5 | 7 | True | 0.216 | 0.73 | 3.38 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", + "| 43 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer_spec_augment/ | 25.6 | 7 | True | 0.491 | 0.587 | 1.196 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", + "| 44 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer_spec_augment_tts_train_segments/ | 25.4 | 7 | True | 0.502 | 0.592 | 1.181 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", + "| 84 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/ | 28.1 | 19 | True | 0.001 | 1.248 | 2248.206 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", + "| 85 | librispeech_glow_asr | 
/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_not_pretrained/ | 28.7 | 19 | True | 0.001 | 1.295 | 1861.864 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", + "| 86 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc768_200ep_dec_0.05_v2/ | 49.1 | 13 | True | 0.006 | 2.364 | 381.61 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", + "| 87 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/ | 36.1 | 13 | True | 0.002 | 1.65 | 869.021 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", + "| 88 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/no_glow/ | 27.1 | 13 | True | 0.001 | 1.193 | 975.256 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", + "| 89 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.3/ | 20 | 13 | True | 0.331 | 0.453 | 1.37 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", + "| 90 | librispeech_glow_asr | 
/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.5/ | 19.7 | 13 | True | 0.335 | 0.448 | 1.338 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", + "| 91 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_1/ | 18.9 | 13 | True | 0.32 | 0.439 | 1.374 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", + "| 92 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_dec_0.05_v2/ | 38 | 13 | True | 0.321 | 0.894 | 2.788 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", + "| 93 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/ | 26.1 | 19 | True | 0.003 | 1.046 | 335.647 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", + "| 94 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed_v2/ | 26.6 | 1 | False | 0.216 | 0.589 | 2.724 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", + "| 95 | librispeech_glow_asr | 
/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed_v3/ | 26.3 | 1 | False | 0.211 | 0.591 | 2.799 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", + "| 96 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/ | 26.6 | 19 | True | 0.003 | 1.081 | 368.105 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", + "| 97 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained_control/ | 18.3 | 19 | True | 0.08 | 0.548 | 6.834 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", + "| 98 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_100ep_xvector/ | 23.6 | 13 | True | 0.42 | 0.476 | 1.133 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", + "| 99 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_100ep_xvector_v2/ | 19.7 | 13 | True | 0.281 | 0.458 | 1.63 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", + "| 100 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_200ep_dec_0.05_v2/ | 38.2 | 
13 | True | 0.285 | 0.975 | 3.417 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", + "| 101 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc192_200ep_dec_0.05_v2/ | 25.3 | 13 | True | 0.33 | 0.543 | 1.644 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", + "| 102 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/ | 22 | 13 | True | 0.279 | 0.488 | 1.75 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", + "| 103 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/no_glow/ | 18 | 15 | True | 0.083 | 0.544 | 6.584 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", + "| 104 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment_before/glow_enc192_not_pretrained/ | 17.6 | 19 | True | 0.086 | 0.512 | 5.958 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", + "| 105 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/weak_baseline/ | 25.5 | 7 | True | 0.5 | 0.589 | 1.177 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", + "| 106 | librispeech_glow_asr | 
/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed/ | 25.5 | 13 | True | 0.03 | 0.732 | 24.376 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | True |\n", + "| 107 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed/ | 21.2 | 13 | True | 0.345 | 0.462 | 1.339 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | True |\n", + "| 108 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.1/ | 25.8 | 1 | False | 0.406 | 0.422 | 1.041 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | True |\n", + "| 109 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.2/ | 25.5 | 1 | False | 0.413 | 0.417 | 1.01 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | True |\n", + "| 110 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.3/ | 26.2 | 1 | False | 0.44 | 0.415 | 0.943 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | True |\n", + "| 111 | librispeech_glow_asr | 
/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.4/ | 25.2 | 1 | False | 0.416 | 0.413 | 0.993 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | True |\n", + "| 112 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.5/ | 20 | 13 | True | 0.416 | 0.418 | 1.005 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | True |\n", + "| 113 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_1/ | 19.4 | 13 | True | 0.364 | 0.417 | 1.144 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | True |\n", + "| 114 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed/ | 21.8 | 14 | True | 0.368 | 0.474 | 1.287 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | True |\n", + "| 115 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed/batch_norm/ | 29.4 | 1 | False | 0.392 | 0.504 | 1.285 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | True |\n", + "| 116 | librispeech_glow_asr | 
/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed/layer_norm/ | 28.5 | 1 | False | 0.407 | 0.488 | 1.197 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | True |\n", + "| 117 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc768_200ep_not_silence_preprocessed/ | 100 | 13 | True | 3.463 | 5.332 | 1.54 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | True |\n", + "| 118 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment_before/glow_enc192_200ep_not_silence_preprocessed/ | 100 | 1 | False | 3.454 | 5.802 | 1.68 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | True |\n", + "| 119 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/tts_dataset/spec_augment/no_glow/ | 20.9 | 1 | False | 0.091 | 0.419 | 4.591 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n" ] } ], @@ -1200,25 +1358,26 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 16, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "| | Group | Experiment | WER (dev-other) | Count | Tuned | CTC | dev CTC | overfitting | dev MLE | dev DP | Joint | Still running | Training data available | Num Epochs | LR | ASR Model Type |\n", - 
"|----:|:---------------------|:-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------:|--------:|:--------|------:|----------:|--------------:|:----------|:---------|:--------|:----------------|:--------------------------|-------------:|:-------------------------------------------------|:-----------------|\n", - "| 62 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_freezed/ | 34 | 19 | True | 0.058 | 0.913 | 15.782 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm |\n", - "| 63 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_pretrained/ | 33.8 | 19 | True | 0.003 | 1.209 | 454.454 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm |\n", - "| 64 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_freezed/ | 31.4 | 19 | True | 0.024 | 0.926 | 39.211 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm |\n", - "| 65 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_pretrained/ | 23.9 | 19 | True | 0.187 | 0.566 | 3.019 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm |\n", - "| 68 | librispeech_glow_asr | 
/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment_before/glow_enc192_200ep_not_freezed/ | 30.9 | 1 | False | 0.172 | 0.56 | 3.248 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm |\n", - "| 86 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/ | 28.1 | 19 | True | 0.001 | 1.248 | 2248.21 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer |\n", - "| 87 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_not_pretrained/ | 28.7 | 19 | True | 0.001 | 1.295 | 1861.86 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer |\n", - "| 99 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/ | 26.1 | 19 | True | 0.003 | 1.046 | 335.647 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer |\n", - "| 102 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/ | 17.6 | 19 | True | 0.086 | 0.512 | 5.958 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer |\n", - "| 103 | librispeech_glow_asr | 
/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained_control/ | 18.3 | 19 | True | 0.08 | 0.548 | 6.834 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer |\n" + "| | Group | Experiment | WER (dev-other) | Count | Tuned | CTC | dev CTC | overfitting | dev MLE | dev DP | Joint | Still running | Training data available | Num Epochs | LR | ASR Model Type | Missing glow.eval |\n", + "|----:|:---------------------|:-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------:|--------:|:--------|------:|----------:|--------------:|:----------|:---------|:--------|:----------------|:--------------------------|-------------:|:-------------------------------------------------|:-----------------|:--------------------|\n", + "| 62 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_freezed/ | 34 | 19 | True | 0.058 | 0.913 | 15.782 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n", + "| 63 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_pretrained/ | 33.8 | 19 | True | 0.003 | 1.209 | 454.454 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n", + "| 64 | librispeech_glow_asr | 
/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_freezed/ | 31.4 | 19 | True | 0.024 | 0.926 | 39.211 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n", + "| 65 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_pretrained/ | 23.9 | 19 | True | 0.187 | 0.566 | 3.019 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n", + "| 68 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment_before/glow_enc192_200ep_not_freezed/ | 30.9 | 1 | False | 0.172 | 0.56 | 3.248 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n", + "| 84 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/ | 28.1 | 19 | True | 0.001 | 1.248 | 2248.21 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", + "| 85 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_not_pretrained/ | 28.7 | 19 | True | 0.001 | 1.295 | 1861.86 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", + "| 93 | librispeech_glow_asr | 
/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/ | 26.1 | 19 | True | 0.003 | 1.046 | 335.647 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", + "| 96 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/ | 26.6 | 19 | True | 0.003 | 1.081 | 368.105 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", + "| 97 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained_control/ | 18.3 | 19 | True | 0.08 | 0.548 | 6.834 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", + "| 104 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment_before/glow_enc192_not_pretrained/ | 17.6 | 19 | True | 0.086 | 0.512 | 5.958 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n" ] } ], @@ -1240,20 +1399,20 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 17, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "/tmp/ipykernel_3659618/226959626.py:22: SettingWithCopyWarning: \n", + "/var/tmp/ipykernel_2362097/226959626.py:22: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in the documentation: 
https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " df_joint[\"auto MOS\"] = mos\n", - "/tmp/ipykernel_3659618/226959626.py:23: SettingWithCopyWarning: \n", + "/var/tmp/ipykernel_2362097/226959626.py:23: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", @@ -1291,42 +1450,42 @@ }, { "cell_type": "code", - "execution_count": 39, + "execution_count": 18, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "| | Group | Experiment | WER (dev-other) | Count | Tuned | CTC | dev CTC | overfitting | dev MLE | dev DP | Joint | Still running | Training data available | Num Epochs | LR | ASR Model Type | auto MOS | sWER |\n", - "|---:|:----------------------------------|:-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------:|--------:|:--------|------:|----------:|--------------:|----------:|---------:|:--------|:----------------|:--------------------------|-------------:|:-------------------------------------------------|:-----------------|-----------:|-------:|\n", - "| 0 | joint_training/conformer_coupling | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/ddi/glowTTS_ASR_conformer_two_forward_pass/ | 23.9 | 7 | True | 0.292 | 0.564 | 1.933 | -0.670671 | 2.09134 | True | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | 1.82 | 95.7 |\n", - "| 1 | joint_training/conformer_coupling | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/ddi/glowTTS_ASR_conformer_x_vector/ | 34.6 | 7 | True | 0.204 | 0.881 | 4.312 | -0.465325 | 0.229336 | True | False | True | 
250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | 2.14 | 5.2 |\n", - "| 3 | joint_training/conformer_coupling | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glowTTS_ASR_conformer_two_forward_pass/ | 24 | 7 | True | 0.289 | 0.569 | 1.967 | -0.692025 | 0.412498 | True | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | 2.45 | 20.9 |\n", - "| 4 | joint_training/conformer_coupling | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glowTTS_ASR_conformer_x_vector_v2/ | 34.6 | 7 | True | 0.204 | 0.881 | 4.312 | -0.465325 | 0.229336 | True | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | 2.14 | 5.2 |\n", - "| 7 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass/ | 23.8 | 7 | True | 0.3 | 0.582 | 1.94 | -0.699266 | 0.370426 | True | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | 2.61 | 14.8 |\n", - "| 8 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_ctc_scale_0.1/ | 24 | 7 | True | 0.431 | 0.558 | 1.294 | -0.805268 | 0.355401 | True | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | 3.16 | 15.2 |\n", - "| 9 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_ddi_actnorm/ | 66.7 | 1 | False | 0.282 | 0.583 | 2.068 | -0.696243 | 0.401243 | True | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | nan | nan |\n", - "| 10 | joint_training/default | 
/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector/ | 38.4 | 7 | True | 0.103 | 1.099 | 10.685 | -0.548408 | 0.701494 | True | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | 1.58 | 98 |\n", - "| 11 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_control/ | 38.5 | 1 | False | 0.216 | 0.735 | 3.408 | -0.779522 | 1.54937 | True | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | nan | nan |\n", - "| 12 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_control_radam/ | 38.3 | 1 | False | 0.214 | 0.731 | 3.414 | -0.780336 | 1.46005 | True | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | nan | nan |\n", - "| 13 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_control_spec_augment/ | 37.9 | 1 | False | 0.501 | 0.588 | 1.173 | -0.77829 | 1.4497 | True | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | nan | nan |\n", - "| 14 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_ctc_scale_0.1/ | 34.7 | 7 | True | 0.142 | 0.927 | 6.549 | -0.769183 | 0.36247 | True | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | 3.15 | 7.6 |\n", - "| 15 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_ddi_actnorm/ | 94.4 | 1 | False | 0.165 | 1.062 | 
6.429 | -0.520394 | 0.214735 | True | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | nan | nan |\n", - "| 16 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_encoder_sample_ctc_scale_0.1/ | 100 | 1 | False | 3.481 | 6.325 | 1.817 | -0.781605 | 1.1355 | True | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | nan | nan |\n", - "| 17 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_radam/ | 43.8 | 1 | False | 0.098 | 1.135 | 11.599 | -0.582927 | 0.348165 | True | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | nan | nan |\n", - "| 18 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_spec_augment/ | 37 | 7 | True | 0.197 | 0.946 | 4.803 | -0.521236 | 0.225934 | True | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | 2.3 | 5.2 |\n", - "| 19 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_spec_augment_ctc_scale_0.1/ | 29.4 | 7 | True | 0.409 | 0.689 | 1.685 | -0.759844 | 0.329289 | True | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | 3.11 | 4.6 |\n", - "| 20 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_spec_augment_ctc_scale_0.1_tts_segments/ | 29.6 | 7 | True | 0.409 | 0.696 | 1.703 | -0.75744 | 0.330754 | True | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | nan | nan 
|\n", - "| 21 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_trainXvector/ | 47 | 1 | False | 0.046 | 1.485 | 32.45 | -0.592337 | 0.586278 | True | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | nan | nan |\n", - "| 22 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass/ | 39.7 | 7 | True | 0.432 | 0.863 | 1.998 | -0.629746 | 0.403638 | True | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | 2.75 | 9.5 |\n", - "| 23 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_ctc_scale_0.1/ | 52.4 | 7 | True | 0.455 | 1.104 | 2.429 | -0.771774 | 0.399888 | True | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | 3.12 | 13.3 |\n", - "| 24 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2/ | 47.7 | 7 | True | 0.386 | 1.015 | 2.63 | -0.641523 | 0.379653 | True | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | 2.64 | 9.9 |\n", - "| 25 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2_ctc_scale_0.1/ | 50.7 | 7 | True | 0.441 | 1.084 | 2.461 | -0.778751 | 0.379635 | True | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | 3.18 | 11.2 |\n", - "| 26 | joint_training/default | 
/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2/ | 44.1 | 7 | True | 0.058 | 1.44 | 25.027 | -0.591349 | 0.326762 | True | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | 2.48 | 6.7 |\n", - "| 27 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_ctc_scale_0.1/ | 36.1 | 7 | True | 0.128 | 1.01 | 7.915 | -0.772961 | 0.346386 | True | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | 3.19 | 7.9 |\n", - "| 28 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment/ | 42.6 | 7 | True | 0.136 | 1.228 | 9.014 | -0.535346 | 0.203592 | True | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | 2.32 | 6 |\n", - "| 29 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment_ctc_scale_0.1/ | 31.9 | 7 | True | 0.378 | 0.76 | 2.009 | -0.764738 | 0.306534 | True | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | 3.2 | 4.4 |\n" + "| | Group | Experiment | WER (dev-other) | Count | Tuned | CTC | dev CTC | overfitting | dev MLE | dev DP | Joint | Still running | Training data available | Num Epochs | LR | ASR Model Type | Missing glow.eval | auto MOS | sWER |\n", + 
"|---:|:----------------------------------|:-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------:|--------:|:--------|------:|----------:|--------------:|----------:|---------:|:--------|:----------------|:--------------------------|-------------:|:-------------------------------------------------|:-----------------|:--------------------|-----------:|-------:|\n", + "| 0 | joint_training/conformer_coupling | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/ddi/glowTTS_ASR_conformer_two_forward_pass/ | 23.9 | 7 | True | 0.292 | 0.564 | 1.933 | -0.670671 | 2.09134 | True | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False | 1.82 | 95.7 |\n", + "| 1 | joint_training/conformer_coupling | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/ddi/glowTTS_ASR_conformer_x_vector/ | 34.6 | 7 | True | 0.204 | 0.881 | 4.312 | -0.465325 | 0.229336 | True | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False | 2.14 | 5.2 |\n", + "| 3 | joint_training/conformer_coupling | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glowTTS_ASR_conformer_two_forward_pass/ | 24 | 7 | True | 0.289 | 0.569 | 1.967 | -0.692025 | 0.412498 | True | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False | 2.45 | 20.9 |\n", + "| 4 | joint_training/conformer_coupling | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glowTTS_ASR_conformer_x_vector_v2/ | 34.6 | 7 | True | 0.204 | 0.881 | 4.312 | -0.465325 | 0.229336 | True | False | True | 250 | [0: 7e-06, 
109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False | 2.14 | 5.2 |\n", + "| 7 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass/ | 23.8 | 7 | True | 0.3 | 0.582 | 1.94 | -0.699266 | 0.370426 | True | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False | 2.61 | 14.8 |\n", + "| 8 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_ctc_scale_0.1/ | 24 | 7 | True | 0.431 | 0.558 | 1.294 | -0.805268 | 0.355401 | True | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False | 3.16 | 15.2 |\n", + "| 9 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_ddi_actnorm/ | 66.7 | 1 | False | 0.282 | 0.583 | 2.068 | -0.696243 | 0.401243 | True | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False | nan | nan |\n", + "| 10 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector/ | 38.4 | 7 | True | 0.103 | 1.099 | 10.685 | -0.548408 | 0.701494 | True | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False | 1.58 | 98 |\n", + "| 11 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_control/ | 38.5 | 1 | False | 0.216 | 0.735 | 3.408 | -0.779522 | 1.54937 | True | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False | nan | nan |\n", + "| 12 | joint_training/default | 
/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_control_radam/ | 38.3 | 1 | False | 0.214 | 0.731 | 3.414 | -0.780336 | 1.46005 | True | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False | nan | nan |\n", + "| 13 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_control_spec_augment/ | 37.9 | 1 | False | 0.501 | 0.588 | 1.173 | -0.77829 | 1.4497 | True | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False | nan | nan |\n", + "| 14 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_ctc_scale_0.1/ | 34.7 | 7 | True | 0.142 | 0.927 | 6.549 | -0.769183 | 0.36247 | True | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False | 3.15 | 7.6 |\n", + "| 15 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_ddi_actnorm/ | 94.4 | 1 | False | 0.165 | 1.062 | 6.429 | -0.520394 | 0.214735 | True | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False | nan | nan |\n", + "| 16 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_encoder_sample_ctc_scale_0.1/ | 100 | 1 | False | 3.481 | 6.325 | 1.817 | -0.781605 | 1.1355 | True | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False | nan | nan |\n", + "| 17 | joint_training/default | 
/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_radam/ | 43.8 | 1 | False | 0.098 | 1.135 | 11.599 | -0.582927 | 0.348165 | True | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False | nan | nan |\n", + "| 18 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_spec_augment/ | 37 | 7 | True | 0.197 | 0.946 | 4.803 | -0.521236 | 0.225934 | True | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False | 2.3 | 5.2 |\n", + "| 19 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_spec_augment_ctc_scale_0.1/ | 29.4 | 7 | True | 0.409 | 0.689 | 1.685 | -0.759844 | 0.329289 | True | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False | 3.11 | 4.6 |\n", + "| 20 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_spec_augment_ctc_scale_0.1_tts_segments/ | 29.6 | 7 | True | 0.409 | 0.696 | 1.703 | -0.75744 | 0.330754 | True | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False | nan | nan |\n", + "| 21 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_trainXvector/ | 47 | 1 | False | 0.046 | 1.485 | 32.45 | -0.592337 | 0.586278 | True | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False | nan | nan |\n", + "| 22 | joint_training/default | 
/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass/ | 39.7 | 7 | True | 0.432 | 0.863 | 1.998 | -0.629746 | 0.403638 | True | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False | 2.75 | 9.5 |\n", + "| 23 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_ctc_scale_0.1/ | 52.4 | 7 | True | 0.455 | 1.104 | 2.429 | -0.771774 | 0.399888 | True | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False | 3.12 | 13.3 |\n", + "| 24 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2/ | 47.7 | 7 | True | 0.386 | 1.015 | 2.63 | -0.641523 | 0.379653 | True | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False | 2.64 | 9.9 |\n", + "| 25 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2_ctc_scale_0.1/ | 50.7 | 7 | True | 0.441 | 1.084 | 2.461 | -0.778751 | 0.379635 | True | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False | 3.18 | 11.2 |\n", + "| 26 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2/ | 44.1 | 7 | True | 0.058 | 1.44 | 25.027 | -0.591349 | 0.326762 | True | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False | 2.48 | 6.7 |\n", + "| 27 | joint_training/default | 
/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_ctc_scale_0.1/ | 36.1 | 7 | True | 0.128 | 1.01 | 7.915 | -0.772961 | 0.346386 | True | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False | 3.19 | 7.9 |\n", + "| 28 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment/ | 42.6 | 7 | True | 0.136 | 1.228 | 9.014 | -0.535346 | 0.203592 | True | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False | 2.32 | 6 |\n", + "| 29 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment_ctc_scale_0.1/ | 31.9 | 7 | True | 0.378 | 0.76 | 2.009 | -0.764738 | 0.306534 | True | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False | 3.2 | 4.4 |\n" ] } ], @@ -1343,23 +1502,24 @@ }, { "cell_type": "code", - "execution_count": 41, + "execution_count": 19, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "| | Group | Experiment | WER (dev-other) | Count | Tuned | CTC | dev CTC | overfitting | dev MLE | dev DP | Joint | Still running | Training data available | Num Epochs | LR | ASR Model Type |\n", - "|----:|:-----------------------|:-------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------:|--------:|:--------|------:|----------:|--------------:|:----------|:---------|:--------|:----------------|:--------------------------|-------------:|:-------------------------------------------------|:-----------------|\n", - "| 41 | joint_training/default | 
/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer/ | 29.5 | 7 | True | 0.216 | 0.73 | 3.38 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer |\n", - "| 42 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer_spec_augment/ | 25.6 | 7 | True | 0.491 | 0.587 | 1.196 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer |\n", - "| 43 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer_spec_augment_tts_train_segments/ | 25.4 | 7 | True | 0.502 | 0.592 | 1.181 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer |\n", - "| 88 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/no_glow/ | 27.1 | 13 | True | 0.001 | 1.193 | 975.256 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer |\n", - "| 108 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/no_glow/ | 18 | 15 | True | 0.083 | 0.544 | 6.584 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer |\n", - "| 111 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/tts_dataset/spec_augment/no_glow/ | 20.9 | 1 | False | 0.091 | 0.419 | 4.591 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer |\n", - "| 114 | librispeech_glow_asr | 
/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/only_blstm_2x512_d0.2_b300_fs4/asr_dataset/ | 49.4 | 13 | True | 0.028 | 1.672 | 58.743 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm |\n", - "| 115 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/only_blstm_2x512_d0.2_b300_fs4/tts_dataset/ | 59.8 | 1 | False | 0.071 | 1.537 | 21.521 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm |\n" + "| | Group | Experiment | WER (dev-other) | Count | Tuned | CTC | dev CTC | overfitting | dev MLE | dev DP | Joint | Still running | Training data available | Num Epochs | LR | ASR Model Type | Missing glow.eval |\n", + "|----:|:-----------------------|:------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------:|--------:|:--------|------:|----------:|--------------:|:----------|:---------|:--------|:----------------|:--------------------------|-------------:|:-------------------------------------------------|:-----------------|:--------------------|\n", + "| 42 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer/ | 29.5 | 7 | True | 0.216 | 0.73 | 3.38 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", + "| 43 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer_spec_augment/ | 25.6 | 7 | True | 0.491 | 0.587 | 1.196 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", + "| 44 | joint_training/default | 
/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer_spec_augment_tts_train_segments/ | 25.4 | 7 | True | 0.502 | 0.592 | 1.181 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", + "| 88 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/no_glow/ | 27.1 | 13 | True | 0.001 | 1.193 | 975.256 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", + "| 103 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/no_glow/ | 18 | 15 | True | 0.083 | 0.544 | 6.584 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", + "| 119 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/tts_dataset/spec_augment/no_glow/ | 20.9 | 1 | False | 0.091 | 0.419 | 4.591 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", + "| 122 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/only_blstm_2x512_d0.2_b300_fs4/asr_dataset/ | 49.4 | 13 | True | 0.028 | 1.672 | 58.743 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n", + "| 123 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/only_blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/ | 57 | 13 | True | 0.508 | 0.863 | 1.698 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | 
blstm | False |\n", + "| 124 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/only_blstm_2x512_d0.2_b300_fs4/tts_dataset/ | 59.8 | 1 | False | 0.071 | 1.537 | 21.521 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n" ] } ], @@ -1370,7 +1530,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 20, "metadata": {}, "outputs": [], "source": [ @@ -1379,7 +1539,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 21, "metadata": {}, "outputs": [ { @@ -1389,7 +1549,7 @@ "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[18], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mdf_training_avail\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mplot\u001b[49m\u001b[43m(\u001b[49m\u001b[43mkind\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mscatter\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mx\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mWER (dev-other)\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43my\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mCTC\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n", + "Cell \u001b[0;32mIn[21], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m 
\u001b[43mdf_training_avail\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mplot\u001b[49m\u001b[43m(\u001b[49m\u001b[43mkind\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mscatter\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mx\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mWER (dev-other)\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43my\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mCTC\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n", "File \u001b[0;32m/work/tools22/users/lukas.rilling/sis_env/lib/python3.10/site-packages/pandas/plotting/_core.py:976\u001b[0m, in \u001b[0;36mPlotAccessor.__call__\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 974\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m kind \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_dataframe_kinds:\n\u001b[1;32m 975\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(data, ABCDataFrame):\n\u001b[0;32m--> 976\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mplot_backend\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mplot\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdata\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mx\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mx\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43my\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43my\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mkind\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mkind\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 977\u001b[0m 
\u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 978\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mplot kind \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mkind\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m can only be used for data frames\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", "File \u001b[0;32m/work/tools22/users/lukas.rilling/sis_env/lib/python3.10/site-packages/pandas/plotting/_matplotlib/__init__.py:71\u001b[0m, in \u001b[0;36mplot\u001b[0;34m(data, kind, **kwargs)\u001b[0m\n\u001b[1;32m 69\u001b[0m kwargs[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124max\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mgetattr\u001b[39m(ax, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mleft_ax\u001b[39m\u001b[38;5;124m\"\u001b[39m, ax)\n\u001b[1;32m 70\u001b[0m plot_obj \u001b[38;5;241m=\u001b[39m PLOT_CLASSES[kind](data, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[0;32m---> 71\u001b[0m \u001b[43mplot_obj\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mgenerate\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 72\u001b[0m plot_obj\u001b[38;5;241m.\u001b[39mdraw()\n\u001b[1;32m 73\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m plot_obj\u001b[38;5;241m.\u001b[39mresult\n", "File \u001b[0;32m/work/tools22/users/lukas.rilling/sis_env/lib/python3.10/site-packages/pandas/plotting/_matplotlib/core.py:453\u001b[0m, in \u001b[0;36mMPLPlot.generate\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 451\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compute_plot_data()\n\u001b[1;32m 452\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_setup_subplots()\n\u001b[0;32m--> 453\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_make_plot\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 454\u001b[0m 
\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_add_table()\n\u001b[1;32m 455\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_make_legend()\n", @@ -1491,7 +1651,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.12" + "version": "3.10.13" } }, "nbformat": 4, diff --git a/users/rilling/experiments/librispeech/common/eval_references/swer.py b/users/rilling/experiments/librispeech/common/eval_references/swer.py index dc61a166c..8276ac479 100644 --- a/users/rilling/experiments/librispeech/common/eval_references/swer.py +++ b/users/rilling/experiments/librispeech/common/eval_references/swer.py @@ -12,7 +12,7 @@ def get_swer_evaluation_reference(): from i6_core.corpus.filter import FilterCorpusRemoveUnknownWordSegmentsJob bliss = FilterCorpusRemoveUnknownWordSegmentsJob( - bliss_corpus=get_bliss_corpus_dict()["test-clean"], bliss_lexicon=get_tts_lexicon(), all_unknown=False + bliss_corpus=get_bliss_corpus_dict(audio_format="ogg")["test-clean"], bliss_lexicon=get_tts_lexicon(), all_unknown=False ).out_corpus system = asr_recognizer_systems[asr_system] diff --git a/users/rilling/experiments/librispeech/common/tts_eval.py b/users/rilling/experiments/librispeech/common/tts_eval.py index a60a27157..b3ca0a1e1 100644 --- a/users/rilling/experiments/librispeech/common/tts_eval.py +++ b/users/rilling/experiments/librispeech/common/tts_eval.py @@ -79,6 +79,7 @@ def tts_eval( nisqa_eval=False, swer_eval=False, swer_eval_corpus_key="train-clean", + nisqa_confidence=False ): """ Run search for a specific test dataset @@ -106,13 +107,13 @@ def tts_eval( name = prefix_name + f"/tts_eval_{vocoder}/{swer_eval_corpus_key}" forward_job.add_alias(name + "/forward") if nisqa_eval: - evaluate_nisqa(name, forward_job.out_files["out_corpus.xml.gz"], vocoder=vocoder) + evaluate_nisqa(name, forward_job.out_files["out_corpus.xml.gz"], vocoder=vocoder, with_bootstrap=nisqa_confidence) if swer_eval: evaluate_swer(name, 
forward_job, returnn_exe=returnn_exe_asr, returnn_root=returnn_root, corpus_key=swer_eval_corpus_key) return forward_job -def evaluate_nisqa(prefix_name: str, bliss_corpus: tk.Path, vocoder: str = "univnet"): +def evaluate_nisqa(prefix_name: str, bliss_corpus: tk.Path, vocoder: str = "univnet", with_bootstrap=False): predict_mos_job = NISQAMosPredictionJob(bliss_corpus, nisqa_repo=NISQA_REPO) predict_mos_job.add_alias(prefix_name + f"/nisqa_mos") tk.register_output(os.path.join(prefix_name, "nisqa_mos/average"), predict_mos_job.out_mos_average) @@ -120,6 +121,12 @@ def evaluate_nisqa(prefix_name: str, bliss_corpus: tk.Path, vocoder: str = "univ tk.register_output(os.path.join(prefix_name, "nisqa_mos/max"), predict_mos_job.out_mos_max) tk.register_output(os.path.join(prefix_name, "nisqa_mos/std_dev"), predict_mos_job.out_mos_std_dev) + if with_bootstrap: + from i6_experiments.users.rossenbach.tts.evaluation.nisqa import NISQAConfidenceJob + nisqa_confidence_job = NISQAConfidenceJob(predict_mos_job.output_dir, bliss_corpus) + nisqa_confidence_job.add_alias(prefix_name + "/nisqa_mos_confidence") + tk.register_output(os.path.join(prefix_name, "nisqa_mos/confidence_max_interval"), nisqa_confidence_job.out_max_interval_bound) + def evaluate_swer( name: str, diff --git a/users/rilling/experiments/librispeech/librispeech_glow_asr/experiments.py b/users/rilling/experiments/librispeech/librispeech_glow_asr/experiments.py index 9e033c1fc..4bc50602a 100644 --- a/users/rilling/experiments/librispeech/librispeech_glow_asr/experiments.py +++ b/users/rilling/experiments/librispeech/librispeech_glow_asr/experiments.py @@ -908,7 +908,7 @@ def run_exp( ) train_args_conformer["net_args"]["n_vocab"] = vocab_size_without_blank run_exp( - prefix_name + "conformer/tts_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed", + prefix_name + "conformer/glow_not_eval/tts_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed", datasets=train_data, 
train_args=train_args_conformer, search_args=default_search_args_tts, @@ -919,7 +919,7 @@ def run_exp( train_args_conformer_asr_data = copy.deepcopy(train_args_conformer) train_args_conformer_asr_data["net_args"]["n_vocab"] = vocab_size_without_blank_normal_ctc run_exp( - prefix_name + "conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed", + prefix_name + "conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed", datasets=train_data_normal_ctc, train_args=train_args_conformer_asr_data, search_args=default_search_args_asr, @@ -927,6 +927,18 @@ def run_exp( num_epochs=250, ) + train_args_conformer_asr_data_spec_aug_before = copy.deepcopy(train_args_conformer_asr_data) + train_args_conformer_asr_data_spec_aug_before["network_module"] = "glowASR_conformer_specaug_before" + train_args_conformer_asr_data_spec_aug_before["net_args"]["spec_augment"] = True + run_exp( + prefix_name + "conformer/glow_not_eval/asr_dataset/spec_augment_before/glow_enc192_200ep_not_silence_preprocessed", + datasets=train_data_normal_ctc, + train_args=train_args_conformer_asr_data_spec_aug_before, + search_args=default_search_args_asr, + large_gpu_training=False, + num_epochs=250, + ) + train_args_glow100ep_conformer_asr_data = copy.deepcopy(train_args_conformer_asr_data) tts_exp_name = "glowTTS/enc192/100ep/not_silence_preprocessed" tts_train_job = TTS_experiments[tts_exp_name]["train_job"] @@ -935,7 +947,7 @@ def run_exp( ) run_exp( - prefix_name + "conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed", + prefix_name + "conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed", datasets=train_data_normal_ctc, train_args=train_args_glow100ep_conformer_asr_data, search_args=default_search_args_asr, @@ -950,7 +962,7 @@ def run_exp( tts_train_job.out_checkpoints[tts_train_job.returnn_config.get("num_epochs", 200)] ) run_exp( - prefix_name + 
"conformer/asr_dataset/spec_augment/glow_enc768_200ep_not_silence_preprocessed", + prefix_name + "conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc768_200ep_not_silence_preprocessed", datasets=train_data_normal_ctc, train_args=train_args_glow_enc768_200ep_conformer_asr_data, search_args=default_search_args_asr, @@ -978,7 +990,7 @@ def run_exp( train_args_conformer_asr_data_layer_norm = copy.deepcopy(train_args_conformer_asr_data) train_args_conformer_asr_data_layer_norm["net_args"]["layer_norm"] = True run_exp( - prefix_name + "conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed/layer_norm", + prefix_name + "conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed/layer_norm", datasets=train_data_normal_ctc, train_args=train_args_conformer_asr_data_layer_norm, search_args=default_search_args_asr, @@ -989,7 +1001,7 @@ def run_exp( train_args_conformer_asr_data_batch_norm = copy.deepcopy(train_args_conformer_asr_data) train_args_conformer_asr_data_batch_norm["net_args"]["batch_norm"] = True run_exp( - prefix_name + "conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed/batch_norm", + prefix_name + "conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed/batch_norm", datasets=train_data_normal_ctc, train_args=train_args_conformer_asr_data_batch_norm, search_args=default_search_args_asr, @@ -1000,7 +1012,7 @@ def run_exp( train_args_conformer_no_spec_augment = copy.deepcopy(train_args_conformer) train_args_conformer_no_spec_augment["net_args"]["spec_augment"] = False run_exp( - prefix_name + "conformer/tts_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed", + prefix_name + "conformer/glow_not_eval/tts_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed", datasets=train_data, train_args=train_args_conformer_no_spec_augment, search_args=default_search_args_tts, @@ -1012,7 +1024,7 @@ def run_exp( 
train_args_conformer_no_spec_augment_asr_data["net_args"]["n_vocab"] = vocab_size_without_blank_normal_ctc run_exp( - prefix_name + "conformer/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed", + prefix_name + "conformer/glow_not_eval/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed", datasets=train_data_normal_ctc, train_args=train_args_conformer_no_spec_augment_asr_data, search_args=default_search_args_asr, @@ -1082,7 +1094,7 @@ def run_exp( additional_search_args = {"lm_weight": lm_w, "prior_scale": ps} run_exp( prefix_name - + f"conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed/search_params/lm_{lm_w}_ps_{ps}", + + f"conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed/search_params/lm_{lm_w}_ps_{ps}", datasets=train_data_normal_ctc, train_args=train_args_conformer_asr_data, search_args={**default_search_args_asr, **additional_search_args}, @@ -1094,7 +1106,7 @@ def run_exp( run_exp( prefix_name - + f"conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed/tuning/lm_{lm_w}_ps_{ps}", + + f"conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed/tuning/lm_{lm_w}_ps_{ps}", datasets=train_data_normal_ctc, train_args=train_args_glow100ep_conformer_asr_data, search_args={**default_search_args_asr, **additional_search_args}, @@ -1106,7 +1118,7 @@ def run_exp( run_exp( prefix_name - + f"conformer/asr_dataset/spec_augment/glow_enc768_200ep_not_silence_preprocessed/tuning/lm_{lm_w}_ps_{ps}", + + f"conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc768_200ep_not_silence_preprocessed/tuning/lm_{lm_w}_ps_{ps}", datasets=train_data_normal_ctc, train_args=train_args_glow_enc768_200ep_conformer_asr_data, search_args={**default_search_args_asr, **additional_search_args}, @@ -1159,7 +1171,7 @@ def run_exp( ) run_exp( prefix_name - + 
f"conformer/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed/search_params/lm_{lm_w}_ps_{ps}", + + f"conformer/glow_not_eval/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed/search_params/lm_{lm_w}_ps_{ps}", datasets=train_data_normal_ctc, train_args=train_args_conformer_no_spec_augment_asr_data, search_args={**default_search_args_asr, **additional_search_args}, @@ -1200,7 +1212,7 @@ def run_exp( optimized_search_args = {"lm_weight": 3.0, "prior_scale": 0.5} run_exp( - prefix_name + f"conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed/tuned", + prefix_name + f"conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed/tuned", datasets=train_data_normal_ctc, train_args=train_args_conformer_asr_data, search_args={**default_search_args_asr, **optimized_search_args}, @@ -1210,7 +1222,7 @@ def run_exp( test_datasets=test_dataset_normal_ctc_tuples, ) run_exp( - prefix_name + f"conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed/tuned", + prefix_name + f"conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed/tuned", datasets=train_data_normal_ctc, train_args=train_args_conformer_asr_data, search_args={**default_search_args_asr, **optimized_search_args}, @@ -1233,7 +1245,7 @@ def run_exp( run_exp( prefix_name - + f"conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_{p}", + + f"conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_{p}", datasets=train_data_normal_ctc, train_args=train_args_conformer_speaker_drop_asr_data, search_args=default_search_args_asr, @@ -1254,7 +1266,7 @@ def run_exp( additional_search_args = {"lm_weight": lm_w, "prior_scale": ps} run_exp( prefix_name - + f"conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.5/search_params/lm_{lm_w}_ps_{ps}", + + 
f"conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.5/search_params/lm_{lm_w}_ps_{ps}", datasets=train_data_normal_ctc, train_args=train_args_conformer_speaker_drop_asr_data, search_args={**default_search_args_asr, **additional_search_args}, @@ -1275,7 +1287,7 @@ def run_exp( additional_search_args = {"lm_weight": lm_w, "prior_scale": ps} run_exp( prefix_name - + f"conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_1/search_params/lm_{lm_w}_ps_{ps}", + + f"conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_1/search_params/lm_{lm_w}_ps_{ps}", datasets=train_data_normal_ctc, train_args=train_args_conformer_speaker_drop_asr_data, search_args={**default_search_args_asr, **additional_search_args}, @@ -1283,6 +1295,49 @@ def run_exp( num_epochs=250, with_prior=True, ) + + # -------------------Speaker Drop Fix ----------------------- + train_args_conformer_speaker_drop_asr_data = copy.deepcopy(train_args_conformer_asr_data_v2) + + for p in [0.3, 0.5, 1]: + TTS_exp_name = f"glowTTS/enc192/100ep/speaker_drop/p_speaker_drop_{p}_not_silence_preprocessed" + assert TTS_exp_name in TTS_experiments, "Experiment reference not found!" 
+ + TTS_exp_train_job = TTS_experiments[TTS_exp_name]["train_job"] + train_args_conformer_speaker_drop_asr_data["config"]["preload_from_files"]["existing-model"]["filename"] = ( + TTS_exp_train_job.out_checkpoints[TTS_exp_train_job.returnn_config.get("num_epochs", 100)] + ) + + run_exp( + prefix_name + + f"conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_{p}", + datasets=train_data_normal_ctc, + train_args=train_args_conformer_speaker_drop_asr_data, + search_args=default_search_args_asr, + large_gpu_training=True, + num_epochs=250, + ) + + for p in [0.3, 0.5, 1]: + TTS_exp_name = f"glowTTS/enc192/100ep/speaker_drop/p_speaker_drop_{p}_not_silence_preprocessed" + assert TTS_exp_name in TTS_experiments, "Experiment reference not found!" + TTS_exp_train_job = TTS_experiments[TTS_exp_name]["train_job"] + train_args_conformer_speaker_drop_asr_data["config"]["preload_from_files"]["existing-model"]["filename"] = ( + TTS_exp_train_job.out_checkpoints[TTS_exp_train_job.returnn_config.get("num_epochs", 100)] + ) + for lm_w in [2.5, 3.0, 3.5, 4.0]: + for ps in [0, 0.3, 0.5]: + additional_search_args = {"lm_weight": lm_w, "prior_scale": ps} + run_exp( + prefix_name + + f"conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_{p}/search_params/lm_{lm_w}_ps_{ps}", + datasets=train_data_normal_ctc, + train_args=train_args_conformer_speaker_drop_asr_data, + search_args={**default_search_args_asr, **additional_search_args}, + large_gpu_training=True, + num_epochs=250, + with_prior=True, + ) # ------------------------------ No Freezing Experiments ------------------------------ train_args_conformer_no_freeze_asr_data = copy.deepcopy(train_args_conformer_asr_data) @@ -1313,18 +1368,6 @@ def run_exp( num_epochs=250, ) - train_args_conformer_asr_data_spec_aug_before = copy.deepcopy(train_args_conformer_asr_data) - train_args_conformer_asr_data_spec_aug_before["network_module"] = 
"glowASR_conformer_specaug_before" - train_args_conformer_asr_data_spec_aug_before["net_args"]["spec_augment"] = True - run_exp( - prefix_name + "conformer/asr_dataset/spec_augment_before/glow_enc192_200ep_not_silence_preprocessed", - datasets=train_data_normal_ctc, - train_args=train_args_conformer_asr_data_spec_aug_before, - search_args=default_search_args_asr, - large_gpu_training=False, - num_epochs=250, - ) - train_args_conformer_no_freeze_asr_data_no_spec_augment = copy.deepcopy(train_args_conformer_no_freeze_asr_data) train_args_conformer_no_freeze_asr_data_no_spec_augment["net_args"]["spec_augment"] = False train_args_conformer_no_freeze_asr_data_no_spec_augment["network_module"] = "glowASR_conformer_no_freeze" @@ -1379,6 +1422,21 @@ def run_exp( num_epochs=250, ) + train_args_weak_conformer_no_pretrained_asr_data_spec_augment_after_fe = copy.deepcopy( + train_args_conformer_no_pretrained_asr_data_spec_augment_after_fe + ) + train_args_weak_conformer_no_pretrained_asr_data_spec_augment_after_fe["network_module"] = ( + "glowASR_conformer_no_freeze_spec_augment_weak_conf" + ) + run_exp( + prefix_name + "weak_conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained", + datasets=train_data_normal_ctc, + train_args=train_args_weak_conformer_no_pretrained_asr_data_spec_augment_after_fe, + search_args=default_search_args_asr, + large_gpu_training=False, + num_epochs=250, + ) + train_args_conformer_no_pretrained_control = copy.deepcopy(train_args_conformer_no_pretrained_asr_data) train_args_conformer_no_pretrained_control["network_module"] = "glowASR_conformer" run_exp( @@ -1612,7 +1670,7 @@ def run_exp( with_prior=True, ) - train_args_conformer_asr_data_v3_768 = copy.deepcopy(train_args_conformer_asr_data) + train_args_conformer_asr_data_v3_768 = copy.deepcopy(train_args_conformer_asr_data_v2) train_args_conformer_asr_data_v3_768["config"]["preload_from_files"]["existing-model"]["filename"] = tts_models[ "glowTTS/enc768/200ep/dec_drop_0.05" ].checkpoint @@ 
-1626,6 +1684,53 @@ def run_exp( num_epochs=250, ) + train_args_weak_conformer_asr_data_v3_768 = copy.deepcopy(train_args_conformer_asr_data_v3_768) + train_args_weak_conformer_asr_data_v3_768["network_module"] = "glowASR_conformer_x_vector_v2_weak_conf" + run_exp( + prefix_name + "weak_conformer/asr_dataset/spec_augment/glow_enc768_200ep_dec_0.05_v2", + datasets=train_data_normal_ctc, + train_args=train_args_weak_conformer_asr_data_v3_768, + search_args=default_search_args_asr, + large_gpu_training=True, + num_epochs=250, + ) + + train_args_conformer_asr_data_v3_768_no_specaug = copy.deepcopy(train_args_conformer_asr_data_v3_768) + train_args_conformer_asr_data_v3_768_no_specaug["net_args"]["spec_augment"] = False + run_exp( + prefix_name + "conformer/asr_dataset/no_spec_augment/glow_enc768_200ep_dec_0.05_v2", + datasets=train_data_normal_ctc, + train_args=train_args_conformer_asr_data_v3_768_no_specaug, + search_args=default_search_args_asr, + large_gpu_training=True, + num_epochs=250, + ) + + train_args_conformer_asr_data_v3_768_400 = copy.deepcopy(train_args_conformer_asr_data_v3_768) + train_args_conformer_asr_data_v3_768_400["config"]["preload_from_files"]["existing-model"]["filename"] = tts_models[ + "glowTTS/enc768/400ep/grad_clip_10/dec_drop_0.05" + ].checkpoint + + run_exp( + prefix_name + "conformer/asr_dataset/spec_augment/glow_enc768_400ep_dec_0.05_v2", + datasets=train_data_normal_ctc, + train_args=train_args_conformer_asr_data_v3_768_400, + search_args=default_search_args_asr, + large_gpu_training=True, + num_epochs=250, + ) + + train_args_conformer_asr_data_v3_768_400_no_specaug = copy.deepcopy(train_args_conformer_asr_data_v3_768_400) + train_args_conformer_asr_data_v3_768_400_no_specaug["net_args"]["spec_augment"] = False + run_exp( + prefix_name + "conformer/asr_dataset/no_spec_augment/glow_enc768_400ep_dec_0.05_v2", + datasets=train_data_normal_ctc, + train_args=train_args_conformer_asr_data_v3_768_400_no_specaug, + 
search_args=default_search_args_asr, + large_gpu_training=True, + num_epochs=250, + ) + train_args_conformer_asr_data_v3_192 = copy.deepcopy(train_args_conformer_asr_data_v3_768) train_args_conformer_asr_data_v3_192["config"]["preload_from_files"]["existing-model"]["filename"] = tts_models[ "glowTTS/enc192/200ep/dec_drop_0.05" @@ -1654,6 +1759,52 @@ def run_exp( num_epochs=250, ) + train_args_weak_conformer_asr_data_xvector_v3 = copy.deepcopy(train_args_conformer_asr_data_xvector_v3) + train_args_weak_conformer_asr_data_xvector_v3["network_module"] = "glowASR_conformer_x_vector_v2_bottleneck_weak_conf" + run_exp( + prefix_name + "weak_conformer/asr_dataset/spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2", + datasets=train_data_normal_ctc, + train_args=train_args_weak_conformer_asr_data_xvector_v3, + search_args=default_search_args_asr, + large_gpu_training=False, + num_epochs=250, + ) + + train_args_conformer_asr_data_xvector_v3_no_specaug = copy.deepcopy(train_args_conformer_asr_data_xvector_v3) + train_args_conformer_asr_data_xvector_v3_no_specaug["net_args"]["spec_augment"] = False + run_exp( + prefix_name + "conformer/asr_dataset/no_spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2", + datasets=train_data_normal_ctc, + train_args=train_args_conformer_asr_data_xvector_v3_no_specaug, + search_args=default_search_args_asr, + large_gpu_training=True, + num_epochs=250, + ) + + train_args_conformer_asr_data_xvector_v3_400 = copy.deepcopy(train_args_conformer_asr_data_xvector_v3) + train_args_conformer_asr_data_xvector_v3_400["config"]["preload_from_files"]["existing-model"]["filename"] = tts_models[ + "glowTTS_x_vector_v2/enc768/400ep/dec_drop_0.05" + ].checkpoint + run_exp( + prefix_name + "conformer/asr_dataset/spec_augment/glow_x_vector_enc768_400ep_dec_0.05_v2", + datasets=train_data_normal_ctc, + train_args=train_args_conformer_asr_data_xvector_v3_400, + search_args=default_search_args_asr, + large_gpu_training=False, + num_epochs=250, + ) + + 
train_args_conformer_asr_data_xvector_v3_400_no_specaug = copy.deepcopy(train_args_conformer_asr_data_xvector_v3_400) + train_args_conformer_asr_data_xvector_v3_400_no_specaug["net_args"]["spec_augment"] = False + run_exp( + prefix_name + "conformer/asr_dataset/no_spec_augment/glow_x_vector_enc768_400ep_dec_0.05_v2", + datasets=train_data_normal_ctc, + train_args=train_args_conformer_asr_data_xvector_v3_400_no_specaug, + search_args=default_search_args_asr, + large_gpu_training=True, + num_epochs=250, + ) + train_args_conformer_asr_data_xvector_v3_192 = copy.deepcopy(train_args_conformer_asr_data_xvector_v3) train_args_conformer_asr_data_xvector_v3_192["config"]["preload_from_files"]["existing-model"]["filename"] = ( tts_models["glowTTS_x_vector_v2/enc192/200ep/dec_drop_0.05"].checkpoint @@ -1680,6 +1831,15 @@ def run_exp( num_epochs=250, ) + run_exp( + prefix_name + f"conformer/asr_dataset/no_spec_augment/glow_enc768_200ep_dec_0.05_v2/search_params/lm_{lm_w}_ps_{ps}", + datasets=train_data_normal_ctc, + train_args=train_args_conformer_asr_data_v3_768_no_specaug, + search_args={**default_search_args_asr, **additional_search_args}, + large_gpu_training=True, + num_epochs=250, + ) + run_exp( prefix_name + f"conformer/asr_dataset/spec_augment/glow_enc192_200ep_dec_0.05_v2/search_params/lm_{lm_w}_ps_{ps}", @@ -1700,6 +1860,15 @@ def run_exp( num_epochs=250, ) + run_exp( + prefix_name + f"conformer/asr_dataset/no_spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/search_params/lm_{lm_w}_ps_{ps}", + datasets=train_data_normal_ctc, + train_args=train_args_conformer_asr_data_xvector_v3_no_specaug, + search_args={**default_search_args_asr, **additional_search_args}, + large_gpu_training=True, + num_epochs=250, + ) + run_exp( prefix_name + f"conformer/asr_dataset/spec_augment/glow_x_vector_enc192_200ep_dec_0.05_v2/search_params/lm_{lm_w}_ps_{ps}", diff --git 
a/users/rilling/experiments/librispeech/librispeech_glow_asr/pytorch_networks/glowASR_conformer_no_freeze_spec_augment_weak_conf.py b/users/rilling/experiments/librispeech/librispeech_glow_asr/pytorch_networks/glowASR_conformer_no_freeze_spec_augment_weak_conf.py new file mode 100644 index 000000000..233ed37ad --- /dev/null +++ b/users/rilling/experiments/librispeech/librispeech_glow_asr/pytorch_networks/glowASR_conformer_no_freeze_spec_augment_weak_conf.py @@ -0,0 +1,290 @@ +""" +Trying to make the aligner more AppTek-Like + +Extended weight init code +""" + +from dataclasses import dataclass +import torch +import numpy as np +from torch import nn +import multiprocessing +from librosa import filters +import sys +import time +from typing import Any, Dict, Optional, Tuple, Union +import math + +from torchaudio.functional import mask_along_axis + +from i6_models.parts.blstm import BlstmEncoderV1, BlstmEncoderV1Config + + +from i6_models.parts.conformer.norm import LayerNormNC +from i6_models.assemblies.conformer.conformer_v1 import ConformerEncoderV1Config +from i6_models.assemblies.conformer.conformer_v1 import ConformerBlockV1Config, ConformerEncoderV1 +from i6_models.config import ModuleFactoryV1 +from i6_models.parts.frontend.vgg_act import VGG4LayerActFrontendV1, VGG4LayerActFrontendV1Config + +from i6_models.parts.conformer.convolution import ConformerConvolutionV1Config +from i6_models.parts.conformer.feedforward import ConformerPositionwiseFeedForwardV1Config +from i6_models.parts.conformer.mhsa import ConformerMHSAV1Config +from i6_models.primitives.specaugment import specaugment_v1_by_length +from i6_models.primitives.feature_extraction import LogMelFeatureExtractionV1, LogMelFeatureExtractionV1Config +from ..i6modelsV1_VGG4LayerActFrontendV1_v4_cfg import ModelConfig + +from ..i6modelsV1_VGG4LayerActFrontendV1_v4_cfg import \ + SpecaugConfig, VGG4LayerActFrontendV1Config_mod, ModelConfig + + +from returnn.torch.context import get_run_ctx + +from 
.shared.configs import DbMelFeatureExtractionConfig +from .shared.feature_extraction import DbMelFeatureExtraction +from .shared.spec_augment import apply_spec_aug +from .shared.mask import mask_tensor + +from .shared import modules +from .shared import commons +from .shared import attentions +from .monotonic_align import maximum_path + +from .shared.forward import forward_init_hook, forward_step, forward_finish_hook, prior_init_hook, prior_finish_hook, prior_step +from .shared.train import train_step + +from IPython import embed + +class Model(nn.Module): + """ + Flow-based ASR model based on GlowTTS Structure using a pre-trained flow-based decoder + trained to generate spectrograms from given statistics coming from an encoder + + Model was pretrained using the architecture in + users/rilling/experiments/librispeech/librispeech_glowtts/pytorch_networks/glowTTS.py + """ + + def __init__( + self, + n_vocab: int, + hidden_channels: int = 192, + out_channels: int = 80, + n_blocks_dec: int = 12, + kernel_size_dec: int = 5, + dilation_rate: int = 1, + n_block_layers: int = 4, + p_dropout: float = 0.1, + p_dropout_flow: float = 0.05, + gin_channels: int = 0, + n_split: int = 4, + n_sqz: int = 2, + sigmoid_scale: bool = False, + window_size: int = 4, + block_length: int = None, + hidden_channels_dec: int = None, + label_target_size=None, + spec_augment = False, + layer_norm = False, + batch_norm = False, + **kwargs, + ): + """_summary_ + + Args: + n_vocab (int): vocabulary size + hidden_channels (int): Number of hidden channels in encoder + out_channels (int): Number of channels in the output + n_blocks_dec (int, optional): Number of coupling blocks in the decoder. Defaults to 12. + kernel_size_dec (int, optional): Kernel size in the decoder. Defaults to 5. + dilation_rate (int, optional): Dilation rate for CNNs of coupling blocks in decoder. Defaults to 5. + n_block_layers (int, optional): Number of layers in the CNN of the coupling blocks in decoder. Defaults to 4. 
+ p_dropout_dec (_type_, optional): Dropout probability in the decoder. Defaults to 0.. + n_speakers (int, optional): Number of speakers. Defaults to 0. + gin_channels (int, optional): Number of speaker embedding channels. Defaults to 0. + n_split (int, optional): Number of splits for the 1x1 convolution for flows in the decoder. Defaults to 4. + n_sqz (int, optional): Squeeze. Defaults to 1. + sigmoid_scale (bool, optional): Boolean to define if log probs in coupling layers should be rescaled using sigmoid. Defaults to False. + window_size (int, optional): Window size in Multi-Head Self-Attention for encoder. Defaults to None. + block_length (_type_, optional): Block length for optional block masking in Multi-Head Attention for encoder. Defaults to None. + hidden_channels_dec (_type_, optional): Number of hidden channels in decodder. Defaults to hidden_channels. + final_hidden_channels: Number of hidden channels in the final network + final_n_layers: Number of layers in the final network + label_target_size: Target size of target vocabulary, target size for final network + """ + super().__init__() + self.n_vocab = n_vocab + self.hidden_channels = hidden_channels + self.out_channels = out_channels + self.n_blocks_dec = n_blocks_dec + self.kernel_size_dec = kernel_size_dec + self.dilation_rate = dilation_rate + self.n_block_layers = n_block_layers + self.p_dropout = p_dropout + self.p_dropout_flow = p_dropout_flow + self.n_split = n_split + self.n_sqz = n_sqz + self.sigmoid_scale = sigmoid_scale + self.window_size = window_size + self.block_length = block_length + self.hidden_channels_dec = hidden_channels_dec + self.spec_augment = spec_augment + self.layer_norm = layer_norm + self.batch_norm = batch_norm + + self.net_kwargs = { + "repeat_per_num_frames": 100, + "max_dim_feat": 8, + "num_repeat_feat": 5, + "max_dim_time": 20, + } + + fe_config = DbMelFeatureExtractionConfig.from_dict(kwargs["fe_config"]) + self.feature_extraction = 
DbMelFeatureExtraction(config=fe_config) + + if label_target_size is None: + if n_vocab is None: + run_ctx = get_run_ctx() + dataset = run_ctx.engine.train_dataset or run_ctx.engine.forward_dataset + self.label_target_size = len(dataset.datasets["zip_dataset"].targets.labels) + else: + self.label_target_size = n_vocab + else: + self.label_target_size = label_target_size + + self.decoder = modules.Flow( + out_channels, + hidden_channels_dec or hidden_channels, + kernel_size_dec, + dilation_rate, + n_blocks_dec, + n_block_layers, + p_dropout=p_dropout_flow, + n_split=n_split, + n_sqz=n_sqz, + sigmoid_scale=sigmoid_scale, + gin_channels=gin_channels, + ) + + frontend_config = VGG4LayerActFrontendV1Config( + in_features=80, + conv1_channels=16, + conv2_channels=16, + conv3_channels=16, + conv4_channels=16, + conv_kernel_size=(3, 3), + conv_padding=None, + pool1_kernel_size=(2, 1), + pool1_stride=(2, 1), + pool1_padding=None, + pool2_kernel_size=(2, 1), + pool2_stride=(2, 1), + pool2_padding=None, + out_features=96, + activation=nn.ReLU(), + ) + + model_config = ModelConfig( + frontend_config=frontend_config, + specaug_config=None, + label_target_size=self.n_vocab, + conformer_size=96, + num_layers=8, + num_heads=2, + ff_dim=384, + att_weights_dropout=0.2, + conv_dropout=0.2, + ff_dropout=0.2, + mhsa_dropout=0.2, + conv_kernel_size=9, + final_dropout=0.2, + specauc_start_epoch=1, + ) + self.cfg = model_config + frontend_config = self.cfg.frontend_config + conformer_size = self.cfg.conformer_size + conformer_config = ConformerEncoderV1Config( + num_layers=self.cfg.num_layers, + frontend=ModuleFactoryV1(module_class=VGG4LayerActFrontendV1, cfg=frontend_config), + block_cfg=ConformerBlockV1Config( + ff_cfg=ConformerPositionwiseFeedForwardV1Config( + input_dim=conformer_size, + hidden_dim=self.cfg.ff_dim, + dropout=self.cfg.ff_dropout, + activation=nn.functional.silu, + ), + mhsa_cfg=ConformerMHSAV1Config( + input_dim=conformer_size, + num_att_heads=self.cfg.num_heads, + 
att_weights_dropout=self.cfg.att_weights_dropout, + dropout=self.cfg.mhsa_dropout, + ), + conv_cfg=ConformerConvolutionV1Config( + channels=conformer_size, kernel_size=self.cfg.conv_kernel_size, dropout=self.cfg.conv_dropout, activation=nn.functional.silu, + norm=LayerNormNC(conformer_size) + ), + ), + ) + + if self.layer_norm: + print("Using Layer Norm after Flow...") + + if self.batch_norm: + print("Using Batch Norm after Flow...") + self.bn = nn.BatchNorm1d(self.out_channels) + else: + self.bn = None + + self.conformer = ConformerEncoderV1(cfg=conformer_config) + self.final_linear = nn.Linear(conformer_size, self.cfg.label_target_size + 1) # + CTC blank + self.final_dropout = nn.Dropout(p=self.cfg.final_dropout) + self.specaug_start_epoch = self.cfg.specauc_start_epoch + + def forward(self, raw_audio, raw_audio_len): + with torch.no_grad(): + squeezed_audio = torch.squeeze(raw_audio) + log_mel_features, log_mel_features_len = self.feature_extraction(squeezed_audio, raw_audio_len) # [B, T, F] + + audio_max_length = log_mel_features.size(1) + flow_in = log_mel_features.transpose(1,2) # [B, F, T] + flow_in, flow_in_length, flow_in_max_length = self.preprocess(flow_in, log_mel_features_len, audio_max_length) + mask = torch.unsqueeze(commons.sequence_mask(log_mel_features_len, flow_in.size(2)), 1).to(flow_in.dtype) + flow_out, _ = self.decoder(flow_in, mask, reverse=False) # [B, F, T] + + spec_augment_in = flow_out.transpose(1,2) # [B, T, F] + mask = mask_tensor(spec_augment_in, flow_in_length) + + if self.training and self.spec_augment: + audio_features_masked_2 = apply_spec_aug( + spec_augment_in, + num_repeat_time=torch.max(log_mel_features_len).detach().cpu().numpy() + // self.net_kwargs["repeat_per_num_frames"], + max_dim_time=self.net_kwargs["max_dim_time"], + num_repeat_feat=self.net_kwargs["num_repeat_feat"], + max_dim_feat=self.net_kwargs["max_dim_feat"], + ) + else: + audio_features_masked_2 = spec_augment_in + + conformer_in = audio_features_masked_2 + + 
if self.layer_norm: + conformer_in = torch.nn.functional.layer_norm(conformer_in, (conformer_in.size(-1),)) + elif self.bn is not None: + conformer_in = self.bn(conformer_in.transpose(1,2)).transpose(1,2) + + conformer_out, out_mask = self.conformer(conformer_in, mask) + conformer_out = self.final_dropout(conformer_out) + logits = self.final_linear(conformer_out) + + log_probs = torch.log_softmax(logits, dim=2) + + return log_probs, torch.sum(out_mask, dim=1) + + def preprocess(self, y, y_lengths, y_max_length): + if y_max_length is not None: + y_max_length = (y_max_length // self.n_sqz) * self.n_sqz + y = y[:, :, :y_max_length] + y_lengths = (y_lengths // self.n_sqz) * self.n_sqz + return y, y_lengths, y_max_length + + def store_inverse(self): + self.decoder.store_inverse() diff --git a/users/rilling/experiments/librispeech/librispeech_glow_asr/pytorch_networks/glowASR_conformer_v2.py b/users/rilling/experiments/librispeech/librispeech_glow_asr/pytorch_networks/glowASR_conformer_v2_weak_conf.py similarity index 96% rename from users/rilling/experiments/librispeech/librispeech_glow_asr/pytorch_networks/glowASR_conformer_v2.py rename to users/rilling/experiments/librispeech/librispeech_glow_asr/pytorch_networks/glowASR_conformer_v2_weak_conf.py index 9756a0827..de2448b48 100644 --- a/users/rilling/experiments/librispeech/librispeech_glow_asr/pytorch_networks/glowASR_conformer_v2.py +++ b/users/rilling/experiments/librispeech/librispeech_glow_asr/pytorch_networks/glowASR_conformer_v2_weak_conf.py @@ -172,10 +172,10 @@ def __init__( ) frontend_config = VGG4LayerActFrontendV1Config( in_features=80, - conv1_channels=32, - conv2_channels=64, - conv3_channels=64, - conv4_channels=32, + conv1_channels=16, + conv2_channels=16, + conv3_channels=16, + conv4_channels=16, conv_kernel_size=(3, 3), conv_padding=None, pool1_kernel_size=(2, 1), @@ -184,26 +184,27 @@ def __init__( pool2_kernel_size=(2, 1), pool2_stride=(2, 1), pool2_padding=None, - out_features=384, + 
out_features=96, activation=nn.ReLU(), ) + conformer_model_config = ModelConfig( frontend_config=frontend_config, - specaug_config=specaug_config, + specaug_config=None, label_target_size=self.n_vocab, - conformer_size=384, - num_layers=12, - num_heads=4, - ff_dim=1536, + conformer_size=96, + num_layers=8, + num_heads=2, + ff_dim=384, att_weights_dropout=0.2, conv_dropout=0.2, ff_dropout=0.2, mhsa_dropout=0.2, - conv_kernel_size=31, + conv_kernel_size=9, final_dropout=0.2, - specauc_start_epoch=1 + specauc_start_epoch=1, ) - + self.cfg = conformer_model_config frontend_config = self.cfg.frontend_config conformer_size = self.cfg.conformer_size @@ -244,7 +245,6 @@ def __init__( self.final_dropout = nn.Dropout(p=self.cfg.final_dropout) self.specaug_start_epoch = self.cfg.specauc_start_epoch - def forward(self, raw_audio, raw_audio_len): with torch.no_grad(): self.decoder.eval() @@ -274,12 +274,12 @@ def forward(self, raw_audio, raw_audio_len): audio_features_masked_2 = spec_augment_in conformer_in = audio_features_masked_2 - + if self.layer_norm: conformer_in = torch.nn.functional.layer_norm(conformer_in, (conformer_in.size(-1),)) elif self.bn is not None: conformer_in = self.bn(conformer_in.transpose(1,2)).transpose(1,2) - + conformer_out, out_mask = self.conformer(conformer_in, mask) conformer_out = self.final_dropout(conformer_out) logits = self.final_linear(conformer_out) @@ -297,4 +297,3 @@ def preprocess(self, y, y_lengths, y_max_length): def store_inverse(self): self.decoder.store_inverse() - diff --git a/users/rilling/experiments/librispeech/librispeech_glow_asr/pytorch_networks/glowASR_conformer_x_vector_v2_bottleneck_weak_conf.py b/users/rilling/experiments/librispeech/librispeech_glow_asr/pytorch_networks/glowASR_conformer_x_vector_v2_bottleneck_weak_conf.py new file mode 100644 index 000000000..84a33497a --- /dev/null +++ b/users/rilling/experiments/librispeech/librispeech_glow_asr/pytorch_networks/glowASR_conformer_x_vector_v2_bottleneck_weak_conf.py @@ 
-0,0 +1,445 @@ +""" +Trying to make the aligner more AppTek-Like + +Extended weight init code +""" + +from dataclasses import dataclass +import torch +import numpy as np +from torch import nn +import multiprocessing +from librosa import filters +import sys +import time +from typing import Any, Dict, Optional, Tuple, Union +import math + +from torchaudio.functional import mask_along_axis + +from i6_models.parts.blstm import BlstmEncoderV1, BlstmEncoderV1Config + + +from i6_models.parts.conformer.norm import LayerNormNC +from i6_models.assemblies.conformer.conformer_v1 import ConformerEncoderV1Config +from i6_models.assemblies.conformer.conformer_v1 import ConformerBlockV1Config, ConformerEncoderV1 +from i6_models.config import ModuleFactoryV1 +from i6_models.parts.frontend.vgg_act import VGG4LayerActFrontendV1, VGG4LayerActFrontendV1Config + +from i6_models.parts.conformer.convolution import ConformerConvolutionV1Config +from i6_models.parts.conformer.feedforward import ConformerPositionwiseFeedForwardV1Config +from i6_models.parts.conformer.mhsa import ConformerMHSAV1Config +from i6_models.primitives.specaugment import specaugment_v1_by_length +from i6_models.primitives.feature_extraction import LogMelFeatureExtractionV1, LogMelFeatureExtractionV1Config +from ..i6modelsV1_VGG4LayerActFrontendV1_v4_cfg import ModelConfig + +from ..i6modelsV1_VGG4LayerActFrontendV1_v4_cfg import \ + SpecaugConfig, VGG4LayerActFrontendV1Config_mod, ModelConfig + + +from returnn.torch.context import get_run_ctx + +from .shared.configs import DbMelFeatureExtractionConfig +from .shared.feature_extraction import DbMelFeatureExtraction +from .shared.spec_augment import apply_spec_aug +from .shared.mask import mask_tensor + +from .shared import modules +from .shared import commons +from .shared import attentions +from .monotonic_align import maximum_path + +from .shared.forward import forward_init_hook, forward_step, forward_finish_hook, prior_init_hook, prior_finish_hook, prior_step +from 
.shared.train import train_step + +from IPython import embed + +class XVector(nn.Module): + def __init__(self, input_dim=40, num_classes=8, **kwargs): + super(XVector, self).__init__() + self.tdnn1 = modules.TDNN( + input_dim=input_dim, + output_dim=512, + context_size=5, + dilation=1, + dropout_p=0.5, + batch_norm=True + ) + self.tdnn2 = modules.TDNN( + input_dim=512, output_dim=512, context_size=3, dilation=2, dropout_p=0.5, batch_norm=True + ) + self.tdnn3 = modules.TDNN( + input_dim=512, output_dim=512, context_size=2, dilation=3, dropout_p=0.5, batch_norm=True + ) + self.tdnn4 = modules.TDNN( + input_dim=512, output_dim=512, context_size=1, dilation=1, dropout_p=0.5, batch_norm=True + ) + self.tdnn5 = modules.TDNN( + input_dim=512, output_dim=512, context_size=1, dilation=1, dropout_p=0.5, batch_norm=True + ) + #### Frame levelPooling + self.segment6 = nn.Linear(1024, 512) + self.segment7 = nn.Linear(512, 512) + self.output = nn.Linear(512, num_classes) + self.softmax = nn.Softmax(dim=1) + + # fe_config = DbMelFeatureExtractionConfig.from_dict(kwargs["fe_config"]) + # self.feature_extraction = DbMelFeatureExtraction(config=fe_config) + + def forward(self, x, x_lengths): + # with torch.no_grad(): + # squeezed_audio = torch.squeeze(raw_audio) + # x, x_lengths = self.feature_extraction(squeezed_audio, raw_audio_lengths) # [B, T, F] + + # x = x.transpose(1, 2) + tdnn1_out = self.tdnn1(x) + # return tdnn1_out + tdnn2_out = self.tdnn2(tdnn1_out) + tdnn3_out = self.tdnn3(tdnn2_out) + tdnn4_out = self.tdnn4(tdnn3_out) + tdnn5_out = self.tdnn5(tdnn4_out) + ### Stat Pool + mean = torch.mean(tdnn5_out, 2) + std = torch.std(tdnn5_out, 2) + stat_pooling = torch.cat((mean, std), 1) + segment6_out = self.segment6(stat_pooling) + x_vec = self.segment7(segment6_out) + output = self.output(x_vec) + predictions = self.softmax(output) + return output, predictions, x_vec + +class Flow(nn.Module): + def __init__( + self, + in_channels, + hidden_channels, + kernel_size, + 
dilation_rate, + n_blocks, + n_layers, + p_dropout=0.0, + n_split=4, + n_sqz=2, + sigmoid_scale=False, + gin_channels=0, + ): + """Flow-based decoder model + + Args: + in_channels (int): Number of incoming channels + hidden_channels (int): Number of hidden channels + kernel_size (int): Kernel Size for convolutions in coupling blocks + dilation_rate (float): Dilation Rate to define dilation in convolutions of coupling block + n_blocks (int): Number of coupling blocks + n_layers (int): Number of layers in CNN of the coupling blocks + p_dropout (float, optional): Dropout probability for CNN in coupling blocks. Defaults to 0.. + n_split (int, optional): Number of splits for the 1x1 convolution for flows in the decoder. Defaults to 4. + n_sqz (int, optional): Squeeze. Defaults to 1. + sigmoid_scale (bool, optional): Boolean to define if log probs in coupling layers should be rescaled using sigmoid. Defaults to False. + gin_channels (int, optional): Number of speaker embedding channels. Defaults to 0. 
+ """ + super().__init__() + + self.in_channels = in_channels + self.hidden_channels = hidden_channels + self.kernel_size = kernel_size + self.dilation_rate = dilation_rate + self.n_blocks = n_blocks + self.n_layers = n_layers + self.p_dropout = p_dropout + self.n_split = n_split + self.n_sqz = n_sqz + self.sigmoid_scale = sigmoid_scale + self.gin_channels = gin_channels + + self.flows = nn.ModuleList() + + for b in range(n_blocks): + self.flows.append(modules.ActNorm(channels=in_channels * n_sqz)) + self.flows.append(modules.InvConvNear(channels=in_channels * n_sqz, n_split=n_split)) + self.flows.append( + attentions.CouplingBlock( + in_channels * n_sqz, + hidden_channels, + kernel_size=kernel_size, + dilation_rate=dilation_rate, + n_layers=n_layers, + gin_channels=gin_channels, + p_dropout=p_dropout, + sigmoid_scale=sigmoid_scale, + ) + ) + + def forward(self, x, x_mask, g=None, reverse=False): + if not reverse: + flows = self.flows + logdet_tot = 0 + else: + flows = reversed(self.flows) + logdet_tot = None + if g is not None: + g = g.unsqueeze(-1) + + if self.n_sqz > 1: + x, x_mask = commons.channel_squeeze(x, x_mask, self.n_sqz) + for f in flows: + if not reverse: + x, logdet = f(x, x_mask, g=g, reverse=reverse) + logdet_tot += logdet + else: + x, logdet = f(x, x_mask, g=g, reverse=reverse) + if self.n_sqz > 1: + x, x_mask = commons.channel_unsqueeze(x, x_mask, self.n_sqz) + return x, logdet_tot + + def store_inverse(self): + for f in self.flows: + f.store_inverse() + +class Model(nn.Module): + """ + Flow-based ASR model based on GlowTTS Structure using a pre-trained flow-based decoder + trained to generate spectrograms from given statistics coming from an encoder + + Model was pretrained using the architecture in + users/rilling/experiments/librispeech/librispeech_glowtts/pytorch_networks/glowTTS.py + """ + + def __init__( + self, + n_vocab: int, + hidden_channels: int = 192, + out_channels: int = 80, + n_blocks_dec: int = 12, + kernel_size_dec: int = 5, + 
dilation_rate: int = 1, + n_block_layers: int = 4, + p_dropout: float = 0.1, + p_dropout_flow: float = 0.05, + gin_channels: int = 0, + n_split: int = 4, + n_sqz: int = 2, + sigmoid_scale: bool = False, + window_size: int = 4, + block_length: int = None, + hidden_channels_dec: int = None, + label_target_size=None, + spec_augment = False, + layer_norm = False, + batch_norm = False, + n_speakers = 1, + **kwargs, + ): + """_summary_ + + Args: + n_vocab (int): vocabulary size + hidden_channels (int): Number of hidden channels in encoder + out_channels (int): Number of channels in the output + n_blocks_dec (int, optional): Number of coupling blocks in the decoder. Defaults to 12. + kernel_size_dec (int, optional): Kernel size in the decoder. Defaults to 5. + dilation_rate (int, optional): Dilation rate for CNNs of coupling blocks in decoder. Defaults to 5. + n_block_layers (int, optional): Number of layers in the CNN of the coupling blocks in decoder. Defaults to 4. + p_dropout_dec (_type_, optional): Dropout probability in the decoder. Defaults to 0.. + n_speakers (int, optional): Number of speakers. Defaults to 0. + gin_channels (int, optional): Number of speaker embedding channels. Defaults to 0. + n_split (int, optional): Number of splits for the 1x1 convolution for flows in the decoder. Defaults to 4. + n_sqz (int, optional): Squeeze. Defaults to 1. + sigmoid_scale (bool, optional): Boolean to define if log probs in coupling layers should be rescaled using sigmoid. Defaults to False. + window_size (int, optional): Window size in Multi-Head Self-Attention for encoder. Defaults to None. + block_length (_type_, optional): Block length for optional block masking in Multi-Head Attention for encoder. Defaults to None. + hidden_channels_dec (_type_, optional): Number of hidden channels in decodder. Defaults to hidden_channels. 
+ final_hidden_channels: Number of hidden channels in the final network + final_n_layers: Number of layers in the final network + label_target_size: Target size of target vocabulary, target size for final network + """ + super().__init__() + self.n_vocab = n_vocab + self.hidden_channels = hidden_channels + self.out_channels = out_channels + self.n_blocks_dec = n_blocks_dec + self.kernel_size_dec = kernel_size_dec + self.dilation_rate = dilation_rate + self.n_block_layers = n_block_layers + self.p_dropout = p_dropout + self.p_dropout_flow = p_dropout_flow + self.n_split = n_split + self.n_sqz = n_sqz + self.sigmoid_scale = sigmoid_scale + self.window_size = window_size + self.block_length = block_length + self.hidden_channels_dec = hidden_channels_dec + self.spec_augment = spec_augment + self.layer_norm = layer_norm + self.batch_norm = batch_norm + + self.net_kwargs = { + "repeat_per_num_frames": 100, + "max_dim_feat": 8, + "num_repeat_feat": 5, + "max_dim_time": 20, + } + + fe_config = DbMelFeatureExtractionConfig.from_dict(kwargs["fe_config"]) + self.feature_extraction = DbMelFeatureExtraction(config=fe_config) + + if label_target_size is None: + if n_vocab is None: + run_ctx = get_run_ctx() + dataset = run_ctx.engine.train_dataset or run_ctx.engine.forward_dataset + self.label_target_size = len(dataset.datasets["zip_dataset"].targets.labels) + else: + self.label_target_size = n_vocab + else: + self.label_target_size = label_target_size + + if n_speakers > 1: + self.x_vector = XVector(out_channels, n_speakers) + self.x_vector_bottleneck = nn.Sequential(nn.Linear(512, gin_channels), nn.ReLU()) + + self.decoder = Flow( + out_channels, + hidden_channels_dec or hidden_channels, + kernel_size_dec, + dilation_rate, + n_blocks_dec, + n_block_layers, + p_dropout=p_dropout_flow, + n_split=n_split, + n_sqz=n_sqz, + sigmoid_scale=sigmoid_scale, + gin_channels=gin_channels, + ) + + frontend_config = VGG4LayerActFrontendV1Config( + in_features=80, + conv1_channels=16, + 
conv2_channels=16, + conv3_channels=16, + conv4_channels=16, + conv_kernel_size=(3, 3), + conv_padding=None, + pool1_kernel_size=(2, 1), + pool1_stride=(2, 1), + pool1_padding=None, + pool2_kernel_size=(2, 1), + pool2_stride=(2, 1), + pool2_padding=None, + out_features=96, + activation=nn.ReLU(), + ) + + model_config = ModelConfig( + frontend_config=frontend_config, + specaug_config=None, + label_target_size=self.n_vocab, + conformer_size=96, + num_layers=8, + num_heads=2, + ff_dim=384, + att_weights_dropout=0.2, + conv_dropout=0.2, + ff_dropout=0.2, + mhsa_dropout=0.2, + conv_kernel_size=9, + final_dropout=0.2, + specauc_start_epoch=1, + ) + self.cfg = model_config + frontend_config = self.cfg.frontend_config + conformer_size = self.cfg.conformer_size + conformer_config = ConformerEncoderV1Config( + num_layers=self.cfg.num_layers, + frontend=ModuleFactoryV1(module_class=VGG4LayerActFrontendV1, cfg=frontend_config), + block_cfg=ConformerBlockV1Config( + ff_cfg=ConformerPositionwiseFeedForwardV1Config( + input_dim=conformer_size, + hidden_dim=self.cfg.ff_dim, + dropout=self.cfg.ff_dropout, + activation=nn.functional.silu, + ), + mhsa_cfg=ConformerMHSAV1Config( + input_dim=conformer_size, + num_att_heads=self.cfg.num_heads, + att_weights_dropout=self.cfg.att_weights_dropout, + dropout=self.cfg.mhsa_dropout, + ), + conv_cfg=ConformerConvolutionV1Config( + channels=conformer_size, kernel_size=self.cfg.conv_kernel_size, dropout=self.cfg.conv_dropout, activation=nn.functional.silu, + norm=LayerNormNC(conformer_size) + ), + ), + ) + + if self.layer_norm: + print("Using Layer Norm after Flow...") + + if self.batch_norm: + print("Using Batch Norm after Flow...") + self.bn = nn.BatchNorm1d(self.out_channels) + else: + self.bn = None + + self.conformer = ConformerEncoderV1(cfg=conformer_config) + self.final_linear = nn.Linear(conformer_size, self.cfg.label_target_size + 1) # + CTC blank + self.final_dropout = nn.Dropout(p=self.cfg.final_dropout) + self.specaug_start_epoch = 
self.cfg.specauc_start_epoch + + def forward(self, raw_audio, raw_audio_len): + with torch.no_grad(): + self.x_vector.eval() + self.x_vector_bottleneck.eval() + self.decoder.eval() + squeezed_audio = torch.squeeze(raw_audio) + log_mel_features, log_mel_features_len = self.feature_extraction(squeezed_audio, raw_audio_len) # [B, T, F] + + audio_max_length = log_mel_features.size(1) + + flow_in = log_mel_features.transpose(1,2) # [B, F, T] + flow_in, flow_in_length, flow_in_max_length = self.preprocess(flow_in, log_mel_features_len, audio_max_length) + mask = torch.unsqueeze(commons.sequence_mask(log_mel_features_len, flow_in.size(2)), 1).to(flow_in.dtype) + + _, _, g = self.x_vector(log_mel_features.transpose(1,2), log_mel_features_len) + g = self.x_vector_bottleneck(g) + + flow_out, _ = self.decoder(flow_in, mask, g=g, reverse=False) # [B, F, T] + + spec_augment_in = flow_out.transpose(1,2) # [B, T, F] + mask = mask_tensor(spec_augment_in, flow_in_length) + + if self.training and self.spec_augment: + audio_features_masked_2 = apply_spec_aug( + spec_augment_in, + num_repeat_time=torch.max(log_mel_features_len).detach().cpu().numpy() + // self.net_kwargs["repeat_per_num_frames"], + max_dim_time=self.net_kwargs["max_dim_time"], + num_repeat_feat=self.net_kwargs["num_repeat_feat"], + max_dim_feat=self.net_kwargs["max_dim_feat"], + ) + else: + audio_features_masked_2 = spec_augment_in + + conformer_in = audio_features_masked_2 + + if self.layer_norm: + conformer_in = torch.nn.functional.layer_norm(conformer_in, (conformer_in.size(-1),)) + elif self.bn is not None: + conformer_in = self.bn(conformer_in.transpose(1,2)).transpose(1,2) + + conformer_out, out_mask = self.conformer(conformer_in, mask) + conformer_out = self.final_dropout(conformer_out) + logits = self.final_linear(conformer_out) + + log_probs = torch.log_softmax(logits, dim=2) + + return log_probs, torch.sum(out_mask, dim=1) + + def preprocess(self, y, y_lengths, y_max_length): + if y_max_length is not None: + 
y_max_length = (y_max_length // self.n_sqz) * self.n_sqz + y = y[:, :, :y_max_length] + y_lengths = (y_lengths // self.n_sqz) * self.n_sqz + return y, y_lengths, y_max_length + + def store_inverse(self): + self.decoder.store_inverse() diff --git a/users/rilling/experiments/librispeech/librispeech_glow_asr/training_comparison.ipynb b/users/rilling/experiments/librispeech/librispeech_glow_asr/training_comparison.ipynb index 8be3ac0cd..326039076 100644 --- a/users/rilling/experiments/librispeech/librispeech_glow_asr/training_comparison.ipynb +++ b/users/rilling/experiments/librispeech/librispeech_glow_asr/training_comparison.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 34, + "execution_count": 11, "metadata": {}, "outputs": [ { @@ -32,24 +32,25 @@ }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 17, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "['/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_200ep_not_silence_preprocessed/training']\n" + "['/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_dec_0.05_v2/training', '/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed/training']\n" ] }, { "data": { "text/plain": [ - "({'/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_200ep_not_silence_preprocessed/training': ''},\n", - " 1)" + "({'/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_dec_0.05_v2/training': 
'/asr_dataset/spec_augment/glow_enc192_200ep_dec_0.05_v2/',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed/training': '/glow_not_eval/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed/'},\n", + " 2)" ] }, - "execution_count": 35, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } @@ -60,7 +61,8 @@ " # \"/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed*/training\",\n", " # \"/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_*eval/not_silence_preprocessed/training\"\n", " # \"/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed*/training\"\n", - " \"/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_200ep_not_silence_preprocessed/training\"\n", + " \"/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_dec_0.05_v2/training\",\n", + " \"/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed/training\",\n", " # \"/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/librispeech_glow_asr/pytorch/encoding_test/enc768/mean_only/*/training\",\n", " # 
\"/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/librispeech_glow_asr/pytorch/encoding_test/enc768/with_sigma/*/training\",\n", " # \"/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/librispeech_glow_asr/pytorch/encoding_test/enc192/*/training\",\n", @@ -90,7 +92,7 @@ }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 18, "metadata": {}, "outputs": [], "source": [ @@ -118,7 +120,7 @@ }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 19, "metadata": {}, "outputs": [], "source": [ @@ -127,14 +129,15 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 20, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_200ep_not_silence_preprocessed/training: 3\n", + "/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_dec_0.05_v2/training: 3\n", + "/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed/training: 3\n", "/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/no_glow/training: 3\n", "Large Font: False\n", "Setup Interactive Legend\n", @@ -144,18 +147,18 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "d0ff87fb45264d3db97bab3bb479940b", + "model_id": "d2310e7bf4524ee3b98f4cd89b041032", "version_major": 2, "version_minor": 0 }, - "image/png": "", + "image/png": "", "text/html": [ "\n", "
\n", "
\n", " Figure\n", "
\n", - " \n", + " \n", "
\n", " " ], diff --git a/users/rilling/experiments/librispeech/librispeech_glowtts/pytorch_networks/glowTTS_encoder_sample_test_maxlike_alignment_multi_layer_ffn.py b/users/rilling/experiments/librispeech/librispeech_glowtts/pytorch_networks/glowTTS_encoder_sample_test_maxlike_alignment_multi_layer_ffn.py index a693950ae..d46e6a0e9 100644 --- a/users/rilling/experiments/librispeech/librispeech_glowtts/pytorch_networks/glowTTS_encoder_sample_test_maxlike_alignment_multi_layer_ffn.py +++ b/users/rilling/experiments/librispeech/librispeech_glowtts/pytorch_networks/glowTTS_encoder_sample_test_maxlike_alignment_multi_layer_ffn.py @@ -524,6 +524,7 @@ def forward_step(*, model: Model, data, run_ctx, **kwargs): # print(f"audio_feature shape: {audio_features.shape}") # print(f"audio_feature length: {audio_features_len}") logits, attn, y_lengths = model(phonemes, phonemes_len, audio_features, audio_features_len, speaker_labels) + breakpoint() # embed() upsampled_phonemes = torch.matmul(attn.squeeze(1).transpose(1, 2), phonemes.float().unsqueeze(-1)).squeeze(-1) diff --git a/users/rilling/experiments/librispeech/librispeech_glowtts/training_comparison.ipynb b/users/rilling/experiments/librispeech/librispeech_glowtts/training_comparison.ipynb index 9b26a3326..6113ebc5e 100644 --- a/users/rilling/experiments/librispeech/librispeech_glowtts/training_comparison.ipynb +++ b/users/rilling/experiments/librispeech/librispeech_glowtts/training_comparison.ipynb @@ -29,8 +29,8 @@ { "data": { "text/plain": [ - "{'/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/tts_architecture/glow_tts/raw_audio/decoder_test/simple_enc/glowTTS_simple_encoder_test_maxlike_alignment_multi_layer_ffn_v2/training': '/simple_enc/glowTTS_simple_encoder_test_maxlike_alignment_multi_layer_ffn_v2/',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/tts_architecture/glow_tts/raw_audio/decoder_test/simple_enc/glowTTS_simple_encoder_test_maxlike_alignment_multi_layer_ffn/training': '/simple_enc/glowTTS_simple_encoder_test_maxlike_alignment_multi_layer_ffn/',\n", + "{'/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/tts_architecture/glow_tts/raw_audio/decoder_test/simple_enc/glowTTS_simple_encoder_test_maxlike_alignment_multi_layer_ffn/training': '/simple_enc/glowTTS_simple_encoder_test_maxlike_alignment_multi_layer_ffn/',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/tts_architecture/glow_tts/raw_audio/decoder_test/simple_enc/glowTTS_simple_encoder_test_maxlike_alignment_multi_layer_ffn_v2/training': '/simple_enc/glowTTS_simple_encoder_test_maxlike_alignment_multi_layer_ffn_v2/',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/tts_architecture/glow_tts/raw_audio/decoder_test/enc192/glowTTS_encoder_sample_test_maxlike_alignment_multi_layer_ffn/training': '/enc192/glowTTS_encoder_sample_test_maxlike_alignment_multi_layer_ffn/'}" ] }, @@ -49,7 +49,7 @@ " # \"/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/tts_architecture/glow_tts/raw_audio/encoding_test/glowTTS_encoding_test_simple_linear/training\",\n", " # \"/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/tts_architecture/glow_tts/raw_audio/glowTTS/enc768/*/not_silence_preprocessed/training/\",\n", " # \"/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/vocoder/simple_gl/vocoder.simple_gl.blstm_gl_predictor_v1/training\"\n", - " \"/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/tts_architecture/glow_tts/raw_audio/decoder_test/*/*_maxlike_alignment_multi_layer_ffn*/training\",\n", + " # 
\"/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/tts_architecture/glow_tts/raw_audio/decoder_test/*/*_maxlike_alignment_multi_layer_ffn*/training\",\n", "]\n", "lr_files = []\n", "for g in globs:\n", @@ -102,18 +102,18 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "c77a4179d2734831920e79543b557541", + "model_id": "c18ea0d7037b458783fc20c28536cdc4", "version_major": 2, "version_minor": 0 }, - "image/png": "", + "image/png": "", "text/html": [ "\n", "
\n", "
\n", " Figure\n", "
\n", - " \n", + " \n", "
\n", " " ], diff --git a/users/rilling/experiments/librispeech/librispeech_joint_training/experiments.py b/users/rilling/experiments/librispeech/librispeech_joint_training/experiments.py index 9b9b7b572..a4d523046 100644 --- a/users/rilling/experiments/librispeech/librispeech_joint_training/experiments.py +++ b/users/rilling/experiments/librispeech/librispeech_joint_training/experiments.py @@ -3,6 +3,7 @@ import numpy as np from sisyphus import tk from dataclasses import asdict +import torch from .data import ( @@ -62,6 +63,7 @@ def run_exp( eval_tts=False, tts_eval_datasets=None, eval_invertibility=False, + large_gpu_training=False, ): exp = {} @@ -82,6 +84,7 @@ def run_exp( returnn_root=MINI_RETURNN_ROOT, prefix=prefix + name, num_epochs=num_epochs, + large_gpu=large_gpu_training ) else: train_job = given_train_job_for_forward @@ -168,6 +171,7 @@ def run_exp( nisqa_eval=True, swer_eval=True, swer_eval_corpus_key=ds_k, + nisqa_confidence=True, ) # forward_job_gl = tts_eval( @@ -383,6 +387,52 @@ def run_exp( n_speakers=speaker_datastream.vocab_size, ) + strong_specaug_config = SpecaugConfig( + repeat_per_n_frames=25, + max_dim_time=20, + max_dim_feat=16, + num_repeat_feat=5, + ) + strong_frontend_config = VGG4LayerActFrontendV1Config_mod( + in_features=80, + conv1_channels=32, + conv2_channels=64, + conv3_channels=64, + conv4_channels=32, + conv_kernel_size=(3, 3), + conv_padding=None, + pool1_kernel_size=(2, 1), + pool1_stride=(2, 1), + pool1_padding=None, + pool2_kernel_size=(2, 1), + pool2_stride=(2, 1), + pool2_padding=None, + out_features=384, + activation_str="ReLU", + activation=None + ) + model_config_strong_conformer = ModelConfig( + frontend_config=strong_frontend_config, + specaug_config=strong_specaug_config, + text_encoder_config=text_encoder_config, + decoder_config=flow_decoder_config, + label_target_size=vocab_size_without_blank_asr, + conformer_size=384, + num_layers=12, + num_heads=4, + ff_dim=1536, + att_weights_dropout=0.2, + conv_dropout=0.2, + 
ff_dropout=0.2, + mhsa_dropout=0.2, + conv_kernel_size=31, + final_dropout=0.2, + specauc_start_epoch=1, + out_channels=80, + gin_channels=256, + n_speakers=speaker_datastream.vocab_size, + ) + net_module = "glowTTS_ASR_conformer_x_vector" train_args = { @@ -972,6 +1022,7 @@ def run_exp( search_args=default_search_args, eval_tts=True, tts_eval_datasets=tts_forward_datasets, + eval_invertibility=True, ) exp_dict = run_exp( net_module + "_ctc_scale_0.1", @@ -984,6 +1035,7 @@ def run_exp( search_args=default_search_args, eval_tts=True, tts_eval_datasets=tts_forward_datasets, + eval_invertibility=True, ) for lm in [2.0, 2.5, 3.0, 3.5, 4.0, 4.5]: @@ -1007,6 +1059,23 @@ def run_exp( search_args={**default_search_args, **{"lm_weight": lm}}, ) + train_args_two_forward_no_xvector_strong_conformer = copy.deepcopy(train_args_two_forward_no_xvector) + train_args_two_forward_no_xvector_strong_conformer["net_args"]["model_config"] = asdict(model_config_strong_conformer) + + exp_dict = run_exp( + net_module + "_strong_conformer_ctc_scale_0.1", + train_args_two_forward_no_xvector_strong_conformer, + training_datasets, + asr_test_datasets, + 250, + training_args={"ctc_scale": 0.1}, + forward_args=forward_args, + search_args=default_search_args, + eval_tts=True, + tts_eval_datasets=tts_forward_datasets, + large_gpu_training=True + ) + train_args_conformer_only = copy.deepcopy(train_args) train_args_conformer_only["net_args"]["model_config"] = asdict(model_config) net_module = "only_conformer" diff --git a/users/rilling/experiments/librispeech/librispeech_joint_training/pipeline.py b/users/rilling/experiments/librispeech/librispeech_joint_training/pipeline.py index 4c5d7c014..1f85c074e 100644 --- a/users/rilling/experiments/librispeech/librispeech_joint_training/pipeline.py +++ b/users/rilling/experiments/librispeech/librispeech_joint_training/pipeline.py @@ -13,7 +13,7 @@ from .default_tools import SCTK_BINARY_PATH, NISQA_REPO -def training(config, returnn_exe, returnn_root, prefix, 
num_epochs=65): +def training(config, returnn_exe, returnn_root, prefix, num_epochs=65, large_gpu=False): train_job = ReturnnTrainingJob( config, log_verbosity=5, @@ -24,6 +24,10 @@ def training(config, returnn_exe, returnn_root, prefix, num_epochs=65): returnn_python_exe=returnn_exe, returnn_root=returnn_root, ) + + if (large_gpu): + train_job.rqmt["gpu_mem"] = 24 + train_job.add_alias(prefix + "/training") tk.register_output(prefix + "/training.models", train_job.out_model_dir) diff --git a/users/rilling/experiments/librispeech/librispeech_joint_training/pytorch_networks/glowTTS_ASR_conformer_two_forward_pass.py b/users/rilling/experiments/librispeech/librispeech_joint_training/pytorch_networks/glowTTS_ASR_conformer_two_forward_pass.py index ef03ac686..256267ce4 100644 --- a/users/rilling/experiments/librispeech/librispeech_joint_training/pytorch_networks/glowTTS_ASR_conformer_two_forward_pass.py +++ b/users/rilling/experiments/librispeech/librispeech_joint_training/pytorch_networks/glowTTS_ASR_conformer_two_forward_pass.py @@ -57,6 +57,7 @@ from .shared.forward import search_init_hook, search_finish_hook from .shared.eval_forward import * +from .shared.eval_invertibility import * from IPython import embed @@ -435,8 +436,7 @@ def forward( return log_probs, torch.sum(out_mask, dim=1) else: z, logdet = self.decoder(y, z_mask, g=g, reverse=False) # [B, F, T] - from IPython import embed - embed() + with torch.no_grad(): x_s_sq_r = torch.exp(-2 * x_logs) logp1 = torch.sum(-0.5 * math.log(2 * math.pi) - x_logs, [1]).unsqueeze(-1) # [b, t, 1] diff --git a/users/rilling/experiments/librispeech/librispeech_joint_training/pytorch_networks/glowTTS_ASR_conformer_x_vector.py b/users/rilling/experiments/librispeech/librispeech_joint_training/pytorch_networks/glowTTS_ASR_conformer_x_vector.py index 9d2430fd2..ecb2211c3 100644 --- a/users/rilling/experiments/librispeech/librispeech_joint_training/pytorch_networks/glowTTS_ASR_conformer_x_vector.py +++ 
b/users/rilling/experiments/librispeech/librispeech_joint_training/pytorch_networks/glowTTS_ASR_conformer_x_vector.py @@ -414,7 +414,7 @@ def __init__( self.specaug_start_epoch = self.cfg.specauc_start_epoch def forward( - self, x=None, x_lengths=None, raw_audio=None, raw_audio_lengths=None, g=None, gen=False, recognition=False, noise_scale=1.0, length_scale=1.0, invertibility_check=False + self, x=None, x_lengths=None, raw_audio=None, raw_audio_lengths=None, g=None, gen=False, recognition=False, noise_scale=1.0, length_scale=1.0 ): if not gen: with torch.no_grad(): @@ -439,11 +439,6 @@ def forward( y, y_lengths, y_max_length = self.preprocess(y, y_lengths, y_max_length) z_mask = torch.unsqueeze(commons.sequence_mask(y_lengths, y_max_length), 1).to(torch.int32) - if invertibility_check: - z, _ = self.decoder(y, z_mask, g=g, reverse=False) - y_hat, _ = self.decoder(z, z_mask, g=g, reverse=True) - return y_hat, y - if not recognition: attn_mask = torch.unsqueeze(x_mask, -1) * torch.unsqueeze(z_mask, 2) diff --git a/users/rilling/experiments/librispeech/librispeech_joint_training/pytorch_networks/glow_ASR_conformer.py b/users/rilling/experiments/librispeech/librispeech_joint_training/pytorch_networks/glow_ASR_conformer.py index b4ac61197..181e53bd6 100644 --- a/users/rilling/experiments/librispeech/librispeech_joint_training/pytorch_networks/glow_ASR_conformer.py +++ b/users/rilling/experiments/librispeech/librispeech_joint_training/pytorch_networks/glow_ASR_conformer.py @@ -384,7 +384,7 @@ def __init__( self.specaug_start_epoch = self.cfg.specauc_start_epoch def forward( - self, x=None, x_lengths=None, raw_audio=None, raw_audio_lengths=None, g=None, gen=False, recognition=False, noise_scale=1.0, length_scale=1.0, invertibility_check=False + self, x=None, x_lengths=None, raw_audio=None, raw_audio_lengths=None, g=None, gen=False, recognition=False, noise_scale=1.0, length_scale=1.0 ): if not gen: with torch.no_grad(): @@ -398,12 +398,6 @@ def forward( y, y_lengths, 
y_max_length = self.preprocess(y, y_lengths, y_max_length) z_mask = torch.unsqueeze(commons.sequence_mask(y_lengths, y_max_length), 1).to(torch.int32) - - if invertibility_check: - g = None # speaker is set in generic forward_invertibility_step, but should not be used, since it was not used during training - z, _ = self.decoder(y, z_mask, g=g, reverse=False) - y_hat, _ = self.decoder(z, z_mask, g=g, reverse=True) - return y_hat, y z, logdet = self.decoder(y, z_mask, g=g, reverse=False) diff --git a/users/rilling/experiments/librispeech/librispeech_joint_training/pytorch_networks/glow_ASR_conformer_specaugment_before.py b/users/rilling/experiments/librispeech/librispeech_joint_training/pytorch_networks/glow_ASR_conformer_specaugment_before.py index 7ac8b696a..99c78cbfd 100644 --- a/users/rilling/experiments/librispeech/librispeech_joint_training/pytorch_networks/glow_ASR_conformer_specaugment_before.py +++ b/users/rilling/experiments/librispeech/librispeech_joint_training/pytorch_networks/glow_ASR_conformer_specaugment_before.py @@ -424,7 +424,7 @@ def __init__( self.specaug_start_epoch = self.cfg.specauc_start_epoch def forward( - self, x=None, x_lengths=None, raw_audio=None, raw_audio_lengths=None, g=None, gen=False, recognition=False, noise_scale=1.0, length_scale=1.0, invertibility_check=False + self, x=None, x_lengths=None, raw_audio=None, raw_audio_lengths=None, g=None, gen=False, recognition=False, noise_scale=1.0, length_scale=1.0 ): if not gen: with torch.no_grad(): @@ -452,12 +452,6 @@ def forward( y, y_lengths, y_max_length = self.preprocess(y, y_lengths, y_max_length) z_mask = torch.unsqueeze(commons.sequence_mask(y_lengths, y_max_length), 1).to(torch.int32) - if invertibility_check: - g = None # Remove speaker id, which is set in forward_invertibility_step - z, _ = self.decoder(y, z_mask, g=g, reverse=False) - y_hat, _ = self.decoder(z, z_mask, g=g, reverse=True) - return y_hat, y - z, logdet = self.decoder(y, z_mask, g=g, reverse=False) # from IPython 
import embed # embed() diff --git a/users/rilling/experiments/librispeech/librispeech_joint_training/pytorch_networks/glow_ASR_conformer_specaugment_before_xvector.py b/users/rilling/experiments/librispeech/librispeech_joint_training/pytorch_networks/glow_ASR_conformer_specaugment_before_xvector.py index 74ba049af..3565e6b00 100644 --- a/users/rilling/experiments/librispeech/librispeech_joint_training/pytorch_networks/glow_ASR_conformer_specaugment_before_xvector.py +++ b/users/rilling/experiments/librispeech/librispeech_joint_training/pytorch_networks/glow_ASR_conformer_specaugment_before_xvector.py @@ -385,7 +385,7 @@ def __init__( self.specaug_start_epoch = self.cfg.specauc_start_epoch def forward( - self, x=None, x_lengths=None, raw_audio=None, raw_audio_lengths=None, g=None, gen=False, recognition=False, noise_scale=1.0, length_scale=1.0, invertibility_check=False + self, x=None, x_lengths=None, raw_audio=None, raw_audio_lengths=None, g=None, gen=False, recognition=False, noise_scale=1.0, length_scale=1.0 ): with torch.no_grad(): squeezed_audio = torch.squeeze(raw_audio) @@ -413,11 +413,6 @@ def forward( z, logdet = self.decoder(y, z_mask, g=g, reverse=False) - if invertibility_check: - z, _ = self.decoder(y, z_mask, g=g, reverse=False) - y_hat, _ = self.decoder(z, z_mask, g=g, reverse=True) - return y_hat, y - conformer_in = z.transpose(1,2) mask = mask_tensor(spec_augment_in, y_lengths) conformer_out, out_mask = self.conformer(conformer_in, mask) diff --git a/users/rilling/experiments/librispeech/librispeech_joint_training/pytorch_networks/shared/eval_invertibility.py b/users/rilling/experiments/librispeech/librispeech_joint_training/pytorch_networks/shared/eval_invertibility.py index b6eb1eb57..3cdc198d0 100644 --- a/users/rilling/experiments/librispeech/librispeech_joint_training/pytorch_networks/shared/eval_invertibility.py +++ b/users/rilling/experiments/librispeech/librispeech_joint_training/pytorch_networks/shared/eval_invertibility.py @@ -2,15 +2,24 
@@ import torch import numpy as np from returnn.datasets.hdf import SimpleHDFWriter +from . import commons + def forward_init_hook_invertibility(run_ctx, **kwargs): - run_ctx.batch_mae = [] + run_ctx.total_mae = 0 + run_ctx.total_ae_var = 0 + run_ctx.total_ae_max = torch.tensor(-np.inf) + run_ctx.total_ae_min = torch.tensor(np.inf) + run_ctx.num_of_obs = 0 def forward_finish_hook_invertibility(run_ctx, **kwargs): - all_batch_mse = torch.Tensor(run_ctx.batch_mae).mean() with open("output.hdf", "w+") as f: - f.write(str(all_batch_mse)) + f.write("total, mean, var, max, min \n") + f.write( + f"{run_ctx.num_of_obs}, {str(float(run_ctx.total_mae))}, {str(float(run_ctx.total_ae_var))}, {str(float(run_ctx.total_ae_max))}, {str(float(run_ctx.total_ae_min))}" + ) + def forward_step_invertibility(*, model, data, run_ctx, **kwargs): raw_audio = data["audio_features"] # [B, N] (sparse) @@ -25,16 +34,57 @@ def forward_step_invertibility(*, model, data, run_ctx, **kwargs): else: raise Exception("Missing speaker embedding!") - tags = data["seq_tag"] + squeezed_audio = torch.squeeze(raw_audio) + y, y_lengths = model.feature_extraction(squeezed_audio, raw_audio_len) # [B, T, F] + y = y.transpose(1, 2) # [B, F, T] + + if hasattr(model, "x_vector"): + _, _, g = model.x_vector(y, y_lengths) + + if hasattr(model, "x_vector_bottleneck"): + g = model.x_vector_bottleneck(g) + elif hasattr(model, "emb_g"): + g = torch.nn.functional.normalize(model.emb_g(g.squeeze(-1))).unsqueeze(-1) + else: + g = None + + y_max_length = y.size(2) + + y, y_lengths, y_max_length = model.preprocess(y, y_lengths, y_max_length) + z_mask = torch.unsqueeze(commons.sequence_mask(y_lengths, y_max_length), 1).to(torch.int32) + + z, _ = model.decoder(y, z_mask, g=g, reverse=False) + y_hat, _ = model.decoder(z, z_mask, g=g, reverse=True) + + mae = torch.nn.functional.l1_loss(y_hat * z_mask, y * z_mask, reduction="none") # [B, F, T] - y_hat, y = model( - x=phonemes, - x_lengths=phonemes_len, - raw_audio=raw_audio, - 
raw_audio_lengths=raw_audio_len, - g=g, - invertibility_check=True, + current_num_of_obs = y_hat.shape[1] * y_lengths.sum() # F * total_number_of_frames_in_batch + + old_mae = run_ctx.total_mae + + current_mae = ( + mae.sum() / current_num_of_obs + ) # This considers the masking by only using the mean over all unmasked elements + + current_var = (mae - current_mae).sum() / ( + current_num_of_obs - 1 + ) # Variance over unmasked elements with bias correction 1 + + run_ctx.total_mae = ((run_ctx.num_of_obs / (run_ctx.num_of_obs + current_num_of_obs)) * old_mae) + ( + (current_num_of_obs / (run_ctx.num_of_obs + current_num_of_obs)) * current_mae ) - mse = torch.nn.functional.l1_loss(y_hat, y) - run_ctx.batch_mae.append(mse) + run_ctx.total_ae_var = ( + (run_ctx.num_of_obs / (run_ctx.num_of_obs + current_num_of_obs)) * run_ctx.total_ae_var + + ((current_num_of_obs / (run_ctx.num_of_obs + current_num_of_obs)) * current_var) + + ((run_ctx.num_of_obs * current_num_of_obs) / (run_ctx.num_of_obs + current_num_of_obs) ** 2) + * (old_mae - current_mae) ** 2 + ) + + run_ctx.total_ae_max = torch.max(run_ctx.total_ae_max, mae.max()) + + run_ctx.total_ae_min = torch.min( + run_ctx.total_ae_min, (mae + (-1 * z_mask + 1) * torch.tensor(float("inf")).nan_to_num(0.0)).min() + ) # Masked Min operation + + run_ctx.num_of_obs += current_num_of_obs diff --git a/users/rilling/experiments/librispeech/librispeech_joint_training_given_alignments/config.py b/users/rilling/experiments/librispeech/librispeech_joint_training_given_alignments/config.py index 85f0139fc..669e1954b 100644 --- a/users/rilling/experiments/librispeech/librispeech_joint_training_given_alignments/config.py +++ b/users/rilling/experiments/librispeech/librispeech_joint_training_given_alignments/config.py @@ -33,11 +33,14 @@ def get_training_config( # changing these does not change the hash post_config = { "cleanup_old_models": True if keep_epochs is None else {"keep": keep_epochs}, - "stop_on_nonfinite_train_score": True, # 
this might break now with True + # "stop_on_nonfinite_train_score": True, # this might break now with True "allow_missing_optimizer_checkpoint": True, "backend": "torch" } + if "stop_on_nonfinite_train_score" not in config: + post_config["stop_on_nonfinite_train_score"] = True + base_config = { ############# "train": training_datasets.train.as_returnn_opts(), diff --git a/users/rilling/experiments/librispeech/librispeech_joint_training_given_alignments/data.py b/users/rilling/experiments/librispeech/librispeech_joint_training_given_alignments/data.py index a127a39c5..8e1229403 100644 --- a/users/rilling/experiments/librispeech/librispeech_joint_training_given_alignments/data.py +++ b/users/rilling/experiments/librispeech/librispeech_joint_training_given_alignments/data.py @@ -431,7 +431,7 @@ def build_training_dataset( if use_tts_train_segments: train_segments, cv_segments = get_librispeech_tts_segments(ls_corpus_key=librispeech_key) else: - train_segments = None + train_segments, cv_segments = (None, None) train_bliss, train_ogg = get_train_bliss_and_zip("train-clean-100", silence_preprocessed=silence_preprocessing) dev_clean_bliss_tts, dev_clean_ogg = get_train_bliss_and_zip( diff --git a/users/rilling/experiments/librispeech/librispeech_joint_training_given_alignments/exp_joint/experiments.py b/users/rilling/experiments/librispeech/librispeech_joint_training_given_alignments/exp_joint/experiments.py index 302e4d7ef..adf39a1bc 100644 --- a/users/rilling/experiments/librispeech/librispeech_joint_training_given_alignments/exp_joint/experiments.py +++ b/users/rilling/experiments/librispeech/librispeech_joint_training_given_alignments/exp_joint/experiments.py @@ -33,7 +33,7 @@ PhonemePredictionConfigCNN ) -from ..storage import tts_models +from ..storage import tts_models, add_tts_model, TTSModel def get_glow_joint(x_vector_exp, joint_exps, tts_exps, gl_checkpoint): @@ -768,6 +768,8 @@ def run_exp( tts_eval_datasets=tts_forward_datasets_xvectors ) + 
add_tts_model(net_module + "/basic_init/no_specaug/tts_target_size/ce_ls_0.1", TTSModel(model_config_cnn, exp_dict["train_job"].out_checkpoints[200])) + train_args_cnn_pretrained = copy.deepcopy(train_args_cnn) train_args_cnn_pretrained["config"]["preload_from_files"] = { "glowTTS_xvector": { @@ -789,3 +791,5 @@ def run_exp( asr_search=False, tts_eval_datasets=tts_forward_datasets_xvectors ) + + add_tts_model(net_module + "/tts_pretrained/no_specaug/tts_target_size/ce_ls_0.1", TTSModel(model_config_cnn, exp_dict["train_job"].out_checkpoints[200])) diff --git a/users/rilling/experiments/librispeech/librispeech_joint_training_given_alignments/exp_joint_2step/experiments.py b/users/rilling/experiments/librispeech/librispeech_joint_training_given_alignments/exp_joint_2step/experiments.py index efb1349bd..277fae3fd 100644 --- a/users/rilling/experiments/librispeech/librispeech_joint_training_given_alignments/exp_joint_2step/experiments.py +++ b/users/rilling/experiments/librispeech/librispeech_joint_training_given_alignments/exp_joint_2step/experiments.py @@ -21,6 +21,8 @@ from i6_experiments.users.rilling.experiments.librispeech.common.tts_eval import tts_eval from ..default_tools import RETURNN_COMMON, RETURNN_PYTORCH_EXE, RETURNN_PYTORCH_ASR_SEARCH_EXE, MINI_RETURNN_ROOT +from i6_models.parts.frontend.vgg_act import VGG4LayerActFrontendV1, VGG4LayerActFrontendV1Config + from ..pytorch_networks.shared.configs import ( SpecaugConfig, ModelConfigV1, @@ -31,7 +33,8 @@ FlowDecoderConfig, PhonemePredictionConfig, PhonemePredictionConfigCNN, - PhonemePredictionConfigBLSTM + PhonemePredictionConfigBLSTM, + ConformerASRConfig ) from ..storage import tts_models, add_tts_model, TTSModel @@ -69,6 +72,8 @@ def run_exp( ): exp = {} + assert num_epochs == len(args["config"]["learning_rates"]), "Number of Epochs and Number of LR steps differs!" 
+ if given_train_job_for_forward is None: training_config = get_training_config( training_datasets=dataset, @@ -175,6 +180,14 @@ def run_exp( xvectors_file=x_vector_extractions["x_vector_cnn/1e-3_not_silence_preprocessed"]["hdf"], ) + train_settings = TrainingDatasetSettings( + custom_processing_function=None, partition_epoch=3, epoch_wise_filters=[], seq_ordering="laplace:.1000" + ) + + training_datasets_pe3 = build_training_dataset( + settings=train_settings, librispeech_key="train-clean-100", silence_preprocessing=False + ) + from typing import cast from i6_experiments.users.rossenbach.common_setups.returnn.datastreams.vocabulary import LabelDatastream @@ -491,3 +504,87 @@ def run_exp( search_args=default_search_args, asr_search=False, ) + + + # ==================== CNN pretrained 2nd step Conformer ====================== + net_module = "frozen_glowtts.glowASR_conformer_x_vector" + first_step_cnn = tts_models["glowTTS_ASR_cnn_x_vector/tts_pretrained/no_specaug/tts_target_size/ce_ls_0.1"] + model_config = first_step_cnn.config + + specaug_config_conf = SpecaugConfig( + repeat_per_n_frames=100, + max_dim_feat=8, + num_repeat_feat=5, + max_dim_time=20, + ) + + frontend_config = VGG4LayerActFrontendV1Config_mod( + in_features=80, + conv1_channels=32, + conv2_channels=64, + conv3_channels=64, + conv4_channels=32, + conv_kernel_size=(3, 3), + conv_padding=None, + pool1_kernel_size=(2, 1), + pool1_stride=(2, 1), + pool1_padding=None, + pool2_kernel_size=(2, 1), + pool2_stride=(2, 1), + pool2_padding=None, + out_features=384, + activation_str="ReLU", + ) + + model_config.phoneme_prediction_config = ConformerASRConfig( + frontend_config=frontend_config, + label_target_size=vocab_size_without_blank_asr, + conformer_size=384, + num_layers=12, + num_heads=4, + ff_dim=1536, + att_weights_dropout=0.2, + conv_dropout=0.2, + ff_dropout=0.2, + mhsa_dropout=0.2, + conv_kernel_size=31, + final_dropout=0.2, + ) + + model_config.specaug_config = specaug_config_conf + 
model_config.specauc_start_epoch = 1 + + train_args = { + "net_args": {"fe_config": asdict(fe_config), "model_config": asdict(model_config)}, + "network_module": net_module, + "debug": True, + "config": { + "optimizer": {"class": "adam", "epsilon": 1e-8}, + "learning_rates": list(np.linspace(7e-6, 7e-4, 110)) + list(np.linspace(7e-4, 7e-5, 110)) + list(np.linspace(7e-5, 1e-8, 30)), + "batch_size": 360 * 16000, + # "gradient_clip_norm": 1.0, + "stop_on_nonfinite_train_score": False, + # "max_seq_length": {"audio_features": 25 * 16000}, + # "max_seqs": 60, + "preload_from_files": { + "glowTTS": { + "filename": first_step_cnn.checkpoint, + "init_for_train": True, + "ignore_missing": True, + "ignore_params_prefixes": ["encoder", "phoneme_pred_cnn", "phoneme_pred_output"], + } + } + }, + } + + exp_dict = run_exp( + "second_step_asr/" + net_module.replace(".", "/"), + train_args, + training_datasets_pe3, + asr_test_datasets, + 250, + forward_args=forward_args, + search_args=default_search_args, + asr_search=True, + asr_cv_set=True, + ) \ No newline at end of file diff --git a/users/rilling/experiments/librispeech/librispeech_joint_training_given_alignments/exp_joint_2step/training_comparison.ipynb b/users/rilling/experiments/librispeech/librispeech_joint_training_given_alignments/exp_joint_2step/training_comparison.ipynb index fe4388e41..51ba86d4c 100644 --- a/users/rilling/experiments/librispeech/librispeech_joint_training_given_alignments/exp_joint_2step/training_comparison.ipynb +++ b/users/rilling/experiments/librispeech/librispeech_joint_training_given_alignments/exp_joint_2step/training_comparison.ipynb @@ -23,16 +23,16 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - 
"{'/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/second_step_tts/ga_glowTTS_ASR_ffn_x_vector_v2_2ndstep_tts/ce_ls_1.0/training': ''}" + "{'/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/second_step_asr/frozen_glowtts/glowASR_conformer_x_vector/training/': '/'}" ] }, - "execution_count": 7, + "execution_count": 2, "metadata": {}, "output_type": "execute_result" } @@ -44,7 +44,8 @@ " # \"/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/first_step/ga_glowTTS_ASR_ffn_x_vector_v2/ce_ls_*/training\",\n", " # \"/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/first_step/ga_glowTTS_ASR_ffn_x_vector_v2*/ce_ls_*/training\",\n", " # \"/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/second_step/ga_glowTTS_ASR_ffn_x_vector_v2*/*/ce_ls_*/training\",\n", - " \"/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/second_step_tts/ga_glowTTS_ASR_ffn_x_vector_v2*/ce_ls_*/training\",\n", + " # \"/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/second_step_tts/ga_glowTTS_ASR_ffn_x_vector_v2*/ce_ls_*/training\",\n", + " \"/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/second_step_asr/frozen_glowtts/glowASR_conformer_x_vector/training/\",\n", " # 
\"/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_cnn_x_vector/*/no_specaug/*/ce_ls_0.1/training\",\n", "]\n", "lr_files = []\n", @@ -64,38 +65,54 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ + "# lr_files.append(\n", + "# \"/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/tts_dataset/glow_enc768/with_sigma/not_silence_preprocessed/lm5/training/\"\n", + "# )\n", + "# files[lr_files[-1]] = \"Baseline BLSTM on Glow Enc 768\"\n", + "\n", "lr_files.append(\n", - " \"/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/tts_dataset/glow_enc768/with_sigma/not_silence_preprocessed/lm5/training/\"\n", + " \"/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/no_glow/training\"\n", ")\n", - "files[lr_files[-1]] = \"Baseline BLSTM on Glow Enc 768\"" + "files[lr_files[-1]] = \"Former Baseline\"\n", + "\n", + "lr_files.append(\n", + " \"/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/training/\"\n", + ")\n", + "files[lr_files[-1]] = \"Enc768 0.05 x-vector\"\n" ] }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "{'/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector/specaug/ce_ls_0.1/training': 3,\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector/specaug/ce_ls_1/training': 3,\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector/no_specaug/ce_ls_1/training': 3,\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector/no_specaug/ce_ls_0.1/training': 3}" + "{'/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector/no_specaug/ce_ls_1/training/': 3,\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector/no_specaug/ce_ls_0.1/training/': 3,\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector/specaug/ce_ls_1/training/': 3,\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector/specaug/ce_ls_0.1/training/': 3,\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/second_step_asr/frozen_glowtts/glowASR_conformer_x_vector/training/': 3,\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/no_glow/training': 3,\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/training/': 3}" ] }, - 
"execution_count": 9, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "partition_epochs = {\n", - " \"/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector/*/ce_ls_*/training\": 3\n", + " \"/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector/*/ce_ls_*/training/\": 3, \n", + " \"/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/second_step_asr/frozen_glowtts/glowASR_conformer_x_vector/training/\": 3,\n", + " \"/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/no_glow/training\": 3,\n", + " \"/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/training/\": 3,\n", "}\n", "\n", "partition_epochs_2 = {}\n", @@ -109,15 +126,16 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/second_step_tts/ga_glowTTS_ASR_ffn_x_vector_v2_2ndstep_tts/ce_ls_1.0/training: 1\n", - "/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/tts_dataset/glow_enc768/with_sigma/not_silence_preprocessed/lm5/training/: 1\n", + 
"/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/second_step_asr/frozen_glowtts/glowASR_conformer_x_vector/training/: 3\n", + "/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/no_glow/training: 3\n", + "/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/training/: 3\n", "Large Font: False\n", "Setup Interactive Legend\n", "Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous view', 'arrow-left', 'back'), ('Forward', 'Forward to next view', 'arrow-right', 'forward'), ('Pan', 'Left button pans, Right button zooms\\nx/y fixes axis, CTRL fixes aspect', 'arrows', 'pan'), ('Zoom', 'Zoom to rectangle\\nx/y fixes axis', 'square-o', 'zoom'), ('Download', 'Download plot', 'floppy-o', 'save_figure')]))\n" @@ -126,18 +144,18 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "3b8565f06a5947a091b20af742cbf857", + "model_id": "8879ad67b2a94651b6ff26731340937c", "version_major": 2, "version_minor": 0 }, - "image/png": "", + "image/png": "", "text/html": [ "\n", "
\n", "
\n", " Figure\n", "
\n", - " \n", + " \n", "
\n", " " ], @@ -153,7 +171,6 @@ "error_data = {}\n", "labels = list(files.values())\n", "for i, lr_file in enumerate(lr_files):\n", - " breakpoint()\n", " data = get_epoch_data(lr_file, epoch=None)\n", " if data is None:\n", " continue\n", @@ -170,6 +187,7 @@ " # error_data[(labels[i], int(ep//p_e))].update(ep_data_error)\n", " error_data_tmp = []\n", "df = pd.DataFrame.from_dict(error_data, orient=\"index\")\n", + "\n", "plot_df(\n", " df,\n", " plot_lr=False,\n", @@ -177,7 +195,7 @@ " shrink_axes=0.6,\n", " # ylim_max=4,\n", " # ylim_min=-0.9,\n", - " keys_exclude=\"ctc\",\n", + " # keys_exclude=\"ctc\",\n", " # color_map=\"Set1\",\n", " draggable=True\n", ")" @@ -185,7 +203,93 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ctcdev_loss_ctcdevtrain_loss_ctc
/13.8545573.8639753.834595
23.7420743.5673983.467777
32.4797512.1673831.838922
42.0517741.8115391.417094
51.8355451.5860351.171362
\n", + "
" + ], + "text/plain": [ + " ctc dev_loss_ctc devtrain_loss_ctc\n", + "/ 1 3.854557 3.863975 3.834595\n", + " 2 3.742074 3.567398 3.467777\n", + " 3 2.479751 2.167383 1.838922\n", + " 4 2.051774 1.811539 1.417094\n", + " 5 1.835545 1.586035 1.171362" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.head(5)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ @@ -229,7 +333,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.12" + "version": "3.10.13" } }, "nbformat": 4, diff --git a/users/rilling/experiments/librispeech/librispeech_joint_training_given_alignments/exp_joint_flow_ga_frozen_glowtts/experiments.py b/users/rilling/experiments/librispeech/librispeech_joint_training_given_alignments/exp_joint_flow_ga_frozen_glowtts/experiments.py index 7e6b8fd3d..a460aa1ed 100644 --- a/users/rilling/experiments/librispeech/librispeech_joint_training_given_alignments/exp_joint_flow_ga_frozen_glowtts/experiments.py +++ b/users/rilling/experiments/librispeech/librispeech_joint_training_given_alignments/exp_joint_flow_ga_frozen_glowtts/experiments.py @@ -543,3 +543,59 @@ def run_exp( search_args=default_search_args, phoneme_pred=True, ) + + net_module = "frozen_glowtts.ga_glowTTS_ASR_ffn_mas" + train_args_no_xvectors_mas = copy.deepcopy(train_args_no_xvectors) + train_args_no_xvectors_mas["network_module"] = net_module + + exp_dict = run_exp( + net_module.replace(".", "/") + "/100ep/encoder", + train_args_no_xvectors_mas, + training_datasets_pe1_tts_segments, + dev_dataset_tuples_with_phon, + 100, + forward_args=forward_args, + training_args={"recognition_input": "encoder"}, + search_args=default_search_args, + phoneme_pred=True, + ) + + exp_dict = run_exp( + net_module.replace(".", "/") + "/100ep/decoder", + train_args_no_xvectors_mas, + training_datasets_pe1_tts_segments, + dev_dataset_tuples_with_phon, + 
100, + forward_args=forward_args, + training_args={"recognition_input": "decoder"}, + search_args=default_search_args, + phoneme_pred=True, + ) + + net_module = "frozen_glowtts.ga_glowTTS_ASR_ffn_mas_no_eval" + train_args_no_xvectors_mas_no_eval = copy.deepcopy(train_args_no_xvectors_mas) + train_args_no_xvectors_mas_no_eval["network_module"] = net_module + + exp_dict = run_exp( + net_module.replace(".", "/") + "/100ep/encoder", + train_args_no_xvectors_mas_no_eval, + training_datasets_pe1_tts_segments, + dev_dataset_tuples_with_phon, + 100, + forward_args=forward_args, + training_args={"recognition_input": "encoder"}, + search_args=default_search_args, + phoneme_pred=True, + ) + + exp_dict = run_exp( + net_module.replace(".", "/") + "/100ep/decoder", + train_args_no_xvectors_mas_no_eval, + training_datasets_pe1_tts_segments, + dev_dataset_tuples_with_phon, + 100, + forward_args=forward_args, + training_args={"recognition_input": "decoder"}, + search_args=default_search_args, + phoneme_pred=True, + ) diff --git a/users/rilling/experiments/librispeech/librispeech_joint_training_given_alignments/exp_joint_flow_ga_frozen_glowtts/training_comparison.ipynb b/users/rilling/experiments/librispeech/librispeech_joint_training_given_alignments/exp_joint_flow_ga_frozen_glowtts/training_comparison.ipynb index 87b422387..d4a6b1d03 100644 --- a/users/rilling/experiments/librispeech/librispeech_joint_training_given_alignments/exp_joint_flow_ga_frozen_glowtts/training_comparison.ipynb +++ b/users/rilling/experiments/librispeech/librispeech_joint_training_given_alignments/exp_joint_flow_ga_frozen_glowtts/training_comparison.ipynb @@ -29,14 +29,20 @@ { "data": { "text/plain": [ - "{'/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_ffn/100ep/decoder/training': '/ga_glowTTS_ASR_ffn/100ep/decoder/',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_ffn/100ep/encoder/training': '/ga_glowTTS_ASR_ffn/100ep/encoder/',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_cnn_x_vector/100ep/decoder/training': '/ga_glowTTS_ASR_cnn_x_vector/100ep/decoder/',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_cnn_x_vector/100ep/encoder/training': '/ga_glowTTS_ASR_cnn_x_vector/100ep/encoder/',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_ffn_x_vector/100ep/encoder/training': '/ga_glowTTS_ASR_ffn_x_vector/100ep/encoder/',\n", + "{'/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_ffn_x_vector/100ep/encoder/training': '/ga_glowTTS_ASR_ffn_x_vector/100ep/encoder/',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_ffn_x_vector/100ep/decoder/training': '/ga_glowTTS_ASR_ffn_x_vector/100ep/decoder/',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_cnn_x_vector/100ep/encoder/training': '/ga_glowTTS_ASR_cnn_x_vector/100ep/encoder/',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_cnn_x_vector/100ep/decoder/training': '/ga_glowTTS_ASR_cnn_x_vector/100ep/decoder/',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_cnn/100ep/decoder/training': '/ga_glowTTS_ASR_cnn/100ep/decoder/',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_cnn/100ep/encoder/training': '/ga_glowTTS_ASR_cnn/100ep/encoder/'}" + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_cnn/100ep/encoder/training': '/ga_glowTTS_ASR_cnn/100ep/encoder/',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_ffn_x_vector_mas/100ep/decoder/training': '/ga_glowTTS_ASR_ffn_x_vector_mas/100ep/decoder/',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_ffn_x_vector_mas/100ep/encoder/training': '/ga_glowTTS_ASR_ffn_x_vector_mas/100ep/encoder/',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_ffn_mas/100ep/decoder/training': '/ga_glowTTS_ASR_ffn_mas/100ep/decoder/',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_ffn_mas/100ep/encoder/training': '/ga_glowTTS_ASR_ffn_mas/100ep/encoder/',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_ffn/100ep/encoder/training': '/ga_glowTTS_ASR_ffn/100ep/encoder/',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_ffn/100ep/decoder/training': '/ga_glowTTS_ASR_ffn/100ep/decoder/',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_ffn_mas_no_eval/100ep/encoder/training': '/ga_glowTTS_ASR_ffn_mas_no_eval/100ep/encoder/',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_ffn_mas_no_eval/100ep/decoder/training': '/ga_glowTTS_ASR_ffn_mas_no_eval/100ep/decoder/'}" ] }, "execution_count": 2, @@ -88,8 +94,8 @@ { "data": { "text/plain": [ - "{'/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector/no_specaug/ce_ls_0.1/training': 3,\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector/no_specaug/ce_ls_1/training': 3,\n", + "{'/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector/no_specaug/ce_ls_1/training': 3,\n", 
+ " '/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector/no_specaug/ce_ls_0.1/training': 3,\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector/specaug/ce_ls_1/training': 3,\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector/specaug/ce_ls_0.1/training': 3}" ] @@ -122,14 +128,20 @@ "name": "stdout", "output_type": "stream", "text": [ - "/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_ffn/100ep/decoder/training: 1\n", - "/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_ffn/100ep/encoder/training: 1\n", - "/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_cnn_x_vector/100ep/decoder/training: 1\n", - "/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_cnn_x_vector/100ep/encoder/training: 1\n", "/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_ffn_x_vector/100ep/encoder/training: 1\n", "/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_ffn_x_vector/100ep/decoder/training: 1\n", 
+ "/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_cnn_x_vector/100ep/encoder/training: 1\n", + "/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_cnn_x_vector/100ep/decoder/training: 1\n", "/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_cnn/100ep/decoder/training: 1\n", "/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_cnn/100ep/encoder/training: 1\n", + "/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_ffn_x_vector_mas/100ep/decoder/training: 1\n", + "/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_ffn_x_vector_mas/100ep/encoder/training: 1\n", + "/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_ffn_mas/100ep/decoder/training: 1\n", + "/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_ffn_mas/100ep/encoder/training: 1\n", + 
"/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_ffn/100ep/encoder/training: 1\n", + "/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_ffn/100ep/decoder/training: 1\n", + "/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_ffn_mas_no_eval/100ep/encoder/training: 1\n", + "/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_ffn_mas_no_eval/100ep/decoder/training: 1\n", "/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/tts_architecture/glow_tts/raw_audio/decoder_test/enc768/with_sigma/glowTTS_encoder_sample_test_multi_layer_ffn/training: 1\n", "/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/tts_architecture/glow_tts/raw_audio/decoder_test/enc768/with_sigma/glowTTS_decoder_test_multi_layer_ffn/training: 1\n", "Large Font: False\n", @@ -140,18 +152,18 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "6453f1c5e2b6422d80da690ad730b709", + "model_id": "b72cb5e1fa034185b3d62f3b9035307a", "version_major": 2, "version_minor": 0 }, - "image/png": "", + "image/png": "", "text/html": [ "\n", "
\n", "
\n", " Figure\n", "
\n", - " \n", + " \n", "
\n", " " ], diff --git a/users/rilling/experiments/librispeech/librispeech_joint_training_given_alignments/exp_tts/experiments.py b/users/rilling/experiments/librispeech/librispeech_joint_training_given_alignments/exp_tts/experiments.py index 131f42a28..4e74cd979 100644 --- a/users/rilling/experiments/librispeech/librispeech_joint_training_given_alignments/exp_tts/experiments.py +++ b/users/rilling/experiments/librispeech/librispeech_joint_training_given_alignments/exp_tts/experiments.py @@ -63,7 +63,9 @@ def run_exp( given_train_job_for_forward=None, nisqa_evaluation=True, swer_evaluation=True, - tts_eval_datasets=None + tts_eval_datasets=None, + eval_invertibility=False, + nisqa_confidence=False, ): exp = {} assert len(args["config"]["learning_rates"]) == num_epochs, "Length of LR schedule and number of epochs differ." @@ -111,7 +113,8 @@ def run_exp( vocoder="gl", nisqa_eval=nisqa_evaluation, swer_eval=swer_evaluation, - swer_eval_corpus_key=ds_k + swer_eval_corpus_key=ds_k, + nisqa_confidence=nisqa_confidence, ) if extract_x_vector: @@ -127,6 +130,21 @@ def run_exp( target="xvector", ) exp["forward_xvector_job"] = forward_xvector_job + + if eval_invertibility: + forward_x_vector_config = get_forward_config( + forward_dataset=dataset, **args, forward_args=forward_args, target="invertibility" + ) + forward_xvector_job = forward( + checkpoint=train_job.out_checkpoints[num_epochs], + config=forward_x_vector_config, + returnn_exe=RETURNN_PYTORCH_EXE, + returnn_root=MINI_RETURNN_ROOT, + prefix=prefix + name, + target="invertibility", + ) + exp["forward_invertibility_job"] = forward_xvector_job + return exp train_settings = TrainingDatasetSettings( @@ -395,6 +413,7 @@ def run_exp( 100, forward_args=forward_args, tts_eval_datasets=tts_forward_datasets_xvectors, + nisqa_confidence=True, ) add_tts_model( net_module, @@ -414,10 +433,16 @@ def run_exp( 200, forward_args=forward_args, tts_eval_datasets=tts_forward_datasets_xvectors, - swer_evaluation=True + 
swer_evaluation=True, + eval_invertibility=True, + nisqa_confidence=True, ) add_tts_model( - net_module + "/enc768/200ep/dec_drop_0.05", TTSModel(ModelConfigV1.from_dict(train_args_TTS_xvector_200ep["net_args"]["model_config"]), exp_dict["train_job"].out_checkpoints[200]) + net_module + "/enc768/200ep/dec_drop_0.05", + TTSModel( + ModelConfigV1.from_dict(train_args_TTS_xvector_200ep["net_args"]["model_config"]), + exp_dict["train_job"].out_checkpoints[200], + ), ) train_args_TTS_xvector_200ep["net_args"]["model_config"]["text_encoder_config"]["filter_channels"] = 192 @@ -429,7 +454,7 @@ def run_exp( 200, forward_args=forward_args, tts_eval_datasets=tts_forward_datasets_xvectors, - swer_evaluation=True + swer_evaluation=True, ) add_tts_model( @@ -455,7 +480,7 @@ def run_exp( 200, forward_args=forward_args, tts_eval_datasets=tts_forward_datasets_xvectors, - swer_evaluation=True + swer_evaluation=True, ) train_args_TTS_xvector_200ep_no_dec_dropout["net_args"]["model_config"]["text_encoder_config"][ @@ -469,7 +494,7 @@ def run_exp( 200, forward_args=forward_args, tts_eval_datasets=tts_forward_datasets_xvectors, - swer_evaluation=True + swer_evaluation=True, ) train_args_xvector_altLR = copy.deepcopy(train_args_TTS_xvector_200ep) @@ -486,11 +511,13 @@ def run_exp( 200, forward_args=forward_args, tts_eval_datasets=tts_forward_datasets_xvectors, - swer_evaluation=True + swer_evaluation=True, ) train_args_xvector_altLR_no_dec_drop = copy.deepcopy(train_args_xvector_altLR) - train_args_xvector_altLR_no_dec_drop["net_args"]["model_config"]["decoder_config"] = asdict(flow_decoder_config_no_dropout) + train_args_xvector_altLR_no_dec_drop["net_args"]["model_config"]["decoder_config"] = asdict( + flow_decoder_config_no_dropout + ) exp_dict = run_exp( net_module + "/enc768/200ep_long_cooldown/dec_drop_0.0", train_args_xvector_altLR_no_dec_drop, @@ -499,7 +526,7 @@ def run_exp( 200, forward_args=forward_args, tts_eval_datasets=tts_forward_datasets_xvectors, - swer_evaluation=True 
+ swer_evaluation=True, ) net_module = "glowTTS" @@ -518,7 +545,9 @@ def run_exp( ) train_args_TTS_100ep_no_dec_dropout = copy.deepcopy(train_args_TTS) - train_args_TTS_100ep_no_dec_dropout["net_args"]["model_config"]["decoder_config"] = asdict(flow_decoder_config_no_dropout) + train_args_TTS_100ep_no_dec_dropout["net_args"]["model_config"]["decoder_config"] = asdict( + flow_decoder_config_no_dropout + ) train_args_TTS_100ep_no_dec_dropout["config"]["gradient_clip_norm"] = 10.0 exp_dict = run_exp( net_module + "/enc768/100ep/dec_drop_0.00", @@ -529,7 +558,13 @@ def run_exp( forward_args=forward_args, tts_eval_datasets=tts_forward_datasets, ) - add_tts_model(net_module + "/enc768/100ep/dec_drop_0.00", TTSModel(ModelConfigV1.from_dict(train_args_TTS_100ep_no_dec_dropout["net_args"]["model_config"]), exp_dict["train_job"].out_checkpoints[100])) + add_tts_model( + net_module + "/enc768/100ep/dec_drop_0.00", + TTSModel( + ModelConfigV1.from_dict(train_args_TTS_100ep_no_dec_dropout["net_args"]["model_config"]), + exp_dict["train_job"].out_checkpoints[100], + ), + ) train_args_TTS_200ep = copy.deepcopy(train_args_TTS) train_args_TTS_200ep["config"]["learning_rates"] = lr_schedule_200ep @@ -545,8 +580,16 @@ def run_exp( forward_args=forward_args, swer_evaluation=True, tts_eval_datasets=tts_forward_datasets, + eval_invertibility=True, + nisqa_confidence=True, + ) + add_tts_model( + net_module + "/enc768/200ep/dec_drop_0.05", + TTSModel( + ModelConfigV1.from_dict(train_args_TTS_200ep["net_args"]["model_config"]), + exp_dict["train_job"].out_checkpoints[200], + ), ) - add_tts_model(net_module + "/enc768/200ep/dec_drop_0.05", TTSModel(ModelConfigV1.from_dict(train_args_TTS_200ep["net_args"]["model_config"]), exp_dict["train_job"].out_checkpoints[200])) exp_dict = run_exp( net_module + "/enc768/200ep/dec_drop_0.05_epsilon_1e-8", @@ -732,10 +775,12 @@ def run_exp( p_dropout=model_config.decoder_config.p_dropout, n_split=model_config.decoder_config.n_split, 
n_sqz=model_config.decoder_config.n_sqz, - sigmoid_scale=model_config.decoder_config.sigmoid_scale + sigmoid_scale=model_config.decoder_config.sigmoid_scale, ) - train_args_TTS_xvector_200ep_conformer_coupling["net_args"]["model_config"] = asdict(model_config_conformer_coupling) + train_args_TTS_xvector_200ep_conformer_coupling["net_args"]["model_config"] = asdict( + model_config_conformer_coupling + ) train_args_TTS_xvector_200ep_conformer_coupling["config"]["batch_size"] = 75 * 16000 train_args_TTS_xvector_200ep_conformer_coupling["config"]["accum_grad_multiple_step"] = 4 @@ -747,7 +792,8 @@ def run_exp( 200, forward_args=forward_args, tts_eval_datasets=tts_forward_datasets_xvectors, - swer_evaluation=True + swer_evaluation=True, + nisqa_confidence=True, ) # ===================== Multi-Scale ======================= @@ -778,7 +824,7 @@ def run_exp( 200, forward_args=forward_args, tts_eval_datasets=tts_forward_datasets_xvectors, - swer_evaluation=True + swer_evaluation=True, ) # ============= Encoding Distance Loss ======================== @@ -854,12 +900,12 @@ def run_exp( ffn_channels=None, specauc_start_epoch=None, out_channels=80, - gin_channels=512, + gin_channels=256, n_speakers=speaker_datastream.vocab_size, ) net_module = "glowTTS" train_args_400 = { - "net_args": {"fe_config": asdict(fe_config), "model_config": asdict(model_config_tts_only)}, + "net_args": {"fe_config": asdict(fe_config), "model_config": asdict(model_config_400ep)}, "network_module": net_module, "debug": True, "config": { @@ -871,6 +917,44 @@ def run_exp( }, } + train_args_400_gin512 = copy.deepcopy(train_args_400) + train_args_400_gin512["net_args"]["model_config"] = asdict(model_config_tts_only) + exp_dict = run_exp( + net_module + "/enc768/400ep/gin512/dec_drop_0.05", + train_args_400_gin512, + training_datasets_pe1_tts_segments, + asr_test_datasets, + 400, + forward_args=forward_args, + swer_evaluation=True, + nisqa_evaluation=True, + tts_eval_datasets=tts_forward_datasets, + 
nisqa_confidence=True, + ) + + train_args_400_gin512_grad_norm = copy.deepcopy(train_args_400_gin512) + train_args_400_gin512_grad_norm["config"]["gradient_clip_norm"] = 10 + exp_dict = run_exp( + net_module + "/enc768/400ep/gin512/grad_clip_10/dec_drop_0.05", + train_args_400_gin512_grad_norm, + training_datasets_pe1_tts_segments, + asr_test_datasets, + 400, + forward_args=forward_args, + swer_evaluation=True, + nisqa_evaluation=True, + tts_eval_datasets=tts_forward_datasets, + nisqa_confidence=True, + ) + + add_tts_model( + net_module + "/enc768/400ep/gin512/grad_clip_10/dec_drop_0.05", + TTSModel( + ModelConfigV1.from_dict(train_args_400_gin512_grad_norm["net_args"]["model_config"]), + exp_dict["train_job"].out_checkpoints[400], + ), + ) + exp_dict = run_exp( net_module + "/enc768/400ep/dec_drop_0.05", train_args_400, @@ -881,6 +965,7 @@ def run_exp( swer_evaluation=True, nisqa_evaluation=True, tts_eval_datasets=tts_forward_datasets, + nisqa_confidence=True, ) train_args_400_grad_norm = copy.deepcopy(train_args_400) @@ -895,6 +980,28 @@ def run_exp( swer_evaluation=True, nisqa_evaluation=True, tts_eval_datasets=tts_forward_datasets, + nisqa_confidence=True, + ) + + add_tts_model( + net_module + "/enc768/400ep/grad_clip_10/dec_drop_0.05", + TTSModel( + ModelConfigV1.from_dict(train_args_400_grad_norm["net_args"]["model_config"]), + exp_dict["train_job"].out_checkpoints[400], + ), + ) + + train_args_400_grad_norm["net_args"]["model_config"]["text_encoder_config"]["filter_channels"] = 192 + exp_dict = run_exp( + net_module + "/enc192/400ep/grad_clip_10/dec_drop_0.05", + train_args_400_grad_norm, + training_datasets_pe1_tts_segments, + asr_test_datasets, + 400, + forward_args=forward_args, + swer_evaluation=True, + nisqa_evaluation=True, + tts_eval_datasets=tts_forward_datasets, ) net_module = "glowTTS_x_vector_v2" @@ -909,6 +1016,30 @@ def run_exp( } } + train_args_400_xvector_gin512 = copy.deepcopy(train_args_400_xvector) + 
train_args_400_xvector_gin512["net_args"]["model_config"] = asdict(model_config_tts_only) + + exp_dict = run_exp( + net_module + "/enc768/400ep/gin512/dec_drop_0.05", + train_args_400_xvector_gin512, + training_datasets_pe1_tts_segments, + asr_test_datasets, + 400, + forward_args=forward_args, + swer_evaluation=True, + nisqa_evaluation=True, + tts_eval_datasets=tts_forward_datasets_xvectors, + nisqa_confidence=True, + ) + + add_tts_model( + net_module + "/enc768/400ep/gin512/dec_drop_0.05", + TTSModel( + ModelConfigV1.from_dict(train_args_400_xvector_gin512["net_args"]["model_config"]), + exp_dict["train_job"].out_checkpoints[400], + ), + ) + exp_dict = run_exp( net_module + "/enc768/400ep/dec_drop_0.05", train_args_400_xvector, @@ -919,4 +1050,27 @@ def run_exp( swer_evaluation=True, nisqa_evaluation=True, tts_eval_datasets=tts_forward_datasets_xvectors, + nisqa_confidence=True, + ) + + add_tts_model( + net_module + "/enc768/400ep/dec_drop_0.05", + TTSModel( + ModelConfigV1.from_dict(train_args_400_xvector["net_args"]["model_config"]), + exp_dict["train_job"].out_checkpoints[400], + ), + ) + + train_args_400_xvector["net_args"]["model_config"]["text_encoder_config"]["filter_channels"] = 192 + + exp_dict = run_exp( + net_module + "/enc192/400ep/dec_drop_0.05", + train_args_400_xvector, + training_datasets_pe1_tts_segments, + asr_test_datasets, + 400, + forward_args=forward_args, + swer_evaluation=True, + nisqa_evaluation=True, + tts_eval_datasets=tts_forward_datasets_xvectors, ) diff --git a/users/rilling/experiments/librispeech/librispeech_joint_training_given_alignments/exp_tts/training_comparison.ipynb b/users/rilling/experiments/librispeech/librispeech_joint_training_given_alignments/exp_tts/training_comparison.ipynb index 7c75f0100..72d769622 100644 --- a/users/rilling/experiments/librispeech/librispeech_joint_training_given_alignments/exp_tts/training_comparison.ipynb +++ 
b/users/rilling/experiments/librispeech/librispeech_joint_training_given_alignments/exp_tts/training_comparison.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 3, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -23,21 +23,21 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "{'/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc192/200ep/dec_drop_0.0_epsilon_1e-8/training': '/enc192/200ep/dec_drop_0.0_epsilon_1e-8/',\n", + "{'/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc192/200ep/dec_drop_0.0/grad_clip_10/training': '/enc192/200ep/dec_drop_0.0/grad_clip_10/',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc768/200ep/dec_drop_0.0/grad_clip_10/training': '/enc768/200ep/dec_drop_0.0/grad_clip_10/',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc192/200ep/dec_drop_0.05_epsilon_1e-8/training': '/enc192/200ep/dec_drop_0.05_epsilon_1e-8/',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc192/200ep/dec_drop_0.05/training': '/enc192/200ep/dec_drop_0.05/',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc768/200ep/dec_drop_0.0_epsilon_1e-8/training': '/enc768/200ep/dec_drop_0.0_epsilon_1e-8/',\n", " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc768/200ep/dec_drop_0.05_epsilon_1e-8/training': '/enc768/200ep/dec_drop_0.05_epsilon_1e-8/',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc768/200ep/dec_drop_0.05/training': '/enc768/200ep/dec_drop_0.05/'}" ] }, - "execution_count": 5, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } @@ -46,7 +46,9 @@ "globs = [\n", " # \"/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS*/enc768/400ep/dec_drop_0.05/training\",\n", " # \"/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc768/400ep/grad_clip_10/dec_drop_0.05/training\",\n", - " \"/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc*/200ep/dec_drop_0.0*/training\"\n", + " # \"/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc192/200ep/dec_drop_0.0*/training\",\n", + " \"/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc*/200ep/dec_drop_0.0*/grad_clip_10/training\",\n", + " \"/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc*/200ep/dec_drop_0.0*/training\",\n", "]\n", "breakpoint()\n", "lr_files = []\n", @@ -66,7 +68,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -78,17 +80,17 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 5, 
"metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc192/200ep/dec_drop_0.0_epsilon_1e-8/training: 1\n", + "/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc192/200ep/dec_drop_0.0/grad_clip_10/training: 1\n", + "/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc768/200ep/dec_drop_0.0/grad_clip_10/training: 1\n", "/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc192/200ep/dec_drop_0.05_epsilon_1e-8/training: 1\n", "/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc192/200ep/dec_drop_0.05/training: 1\n", - "/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc768/200ep/dec_drop_0.0_epsilon_1e-8/training: 1\n", "/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc768/200ep/dec_drop_0.05_epsilon_1e-8/training: 1\n", "/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc768/200ep/dec_drop_0.05/training: 1\n", "/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc768/200ep/dec_drop_0.05/training/: 1\n", @@ -100,18 +102,18 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "08febe3432d0417d8d3a1c7840fe31d0", + "model_id": "2084e635587446d2aab96b81f3fa6006", "version_major": 2, 
"version_minor": 0 }, - "image/png": "", + "image/png": "", "text/html": [ "\n", "
\n", "
\n", " Figure\n", "
\n", - " \n", + " \n", "
\n", " " ], @@ -158,7 +160,564 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "df2 = df.reset_index()" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
dev_loss_dpdev_loss_mle...dpmle
level_0/enc192/200ep/dec_drop_0.0/grad_clip_10//enc192/200ep/dec_drop_0.05//enc192/200ep/dec_drop_0.05_epsilon_1e-8//enc768/200ep/dec_drop_0.0/grad_clip_10//enc768/200ep/dec_drop_0.05//enc768/200ep/dec_drop_0.05_epsilon_1e-8/Baseline Glow-TTS 768/enc192/200ep/dec_drop_0.0/grad_clip_10//enc192/200ep/dec_drop_0.05//enc192/200ep/dec_drop_0.05_epsilon_1e-8/.../enc768/200ep/dec_drop_0.05//enc768/200ep/dec_drop_0.05_epsilon_1e-8/Baseline Glow-TTS 768/enc192/200ep/dec_drop_0.0/grad_clip_10//enc192/200ep/dec_drop_0.05//enc192/200ep/dec_drop_0.05_epsilon_1e-8//enc768/200ep/dec_drop_0.0/grad_clip_10//enc768/200ep/dec_drop_0.05//enc768/200ep/dec_drop_0.05_epsilon_1e-8/Baseline Glow-TTS 768
level_1
11.2388031.2300681.2299081.0768351.2381041.2352751.2381040.0464500.0733960.073425...1.1451111.1450461.1451110.5372520.5681190.5695990.5026200.5308900.5325450.530890
21.2048271.2410251.2441681.1124671.3533281.3581041.353328-0.219648-0.152898-0.153953...1.1662361.1662611.166236-0.101299-0.082530-0.082966-0.083080-0.066146-0.066379-0.066146
31.1806951.2466911.2458881.0896631.7226231.7481281.722623-0.341818-0.295465-0.295729...1.1489091.1495771.148909-0.294475-0.281159-0.281597-0.293310-0.281814-0.282104-0.281814
41.1653151.2490061.2577491.0540371.5680671.5431481.568067-0.392180-0.361857-0.362248...1.1260331.1261831.126033-0.371938-0.375815-0.375824-0.381286-0.383792-0.383782-0.383792
51.1595921.2477901.2482641.0558131.4635251.4849411.463525-0.419574-0.413436-0.414346...1.1043971.1045111.104397-0.410438-0.422834-0.422658-0.422354-0.433206-0.433054-0.433206
..................................................................
1960.4307960.6787800.8247170.4451260.4423680.4382850.442368-0.837367-0.813777-0.816068...0.3888700.3870810.388870-0.850404-0.808053-0.806879-0.848083-0.811209-0.810240-0.811209
1970.4298700.7454000.8247250.4442180.4407210.4373380.440721-0.837755-0.810379-0.815329...0.3882160.3865260.388216-0.850987-0.808369-0.807225-0.848792-0.811539-0.810735-0.811539
1980.4320650.6821560.8126070.4450280.4417050.4371700.441705-0.830906-0.814995-0.817098...0.3882450.3865450.388245-0.851446-0.808689-0.807542-0.849257-0.810812-0.811058-0.810812
1990.4303980.6005700.8086160.4462320.4422300.4377040.442230-0.835626-0.814634-0.816613...0.3879940.3862900.387994-0.852212-0.809059-0.807872-0.849940-0.811891-0.811443-0.811891
2000.4319750.6150260.8141730.4458010.4412180.4382480.441218-0.773976-0.815064-0.817505...0.3873770.3860670.387377-0.852834-0.809477-0.808277-0.850649-0.812429-0.811807-0.812429
\n", + "

200 rows × 28 columns

\n", + "
" + ], + "text/plain": [ + " dev_loss_dp \\\n", + "level_0 /enc192/200ep/dec_drop_0.0/grad_clip_10/ /enc192/200ep/dec_drop_0.05/ \n", + "level_1 \n", + "1 1.238803 1.230068 \n", + "2 1.204827 1.241025 \n", + "3 1.180695 1.246691 \n", + "4 1.165315 1.249006 \n", + "5 1.159592 1.247790 \n", + "... ... ... \n", + "196 0.430796 0.678780 \n", + "197 0.429870 0.745400 \n", + "198 0.432065 0.682156 \n", + "199 0.430398 0.600570 \n", + "200 0.431975 0.615026 \n", + "\n", + " \\\n", + "level_0 /enc192/200ep/dec_drop_0.05_epsilon_1e-8/ \n", + "level_1 \n", + "1 1.229908 \n", + "2 1.244168 \n", + "3 1.245888 \n", + "4 1.257749 \n", + "5 1.248264 \n", + "... ... \n", + "196 0.824717 \n", + "197 0.824725 \n", + "198 0.812607 \n", + "199 0.808616 \n", + "200 0.814173 \n", + "\n", + " \\\n", + "level_0 /enc768/200ep/dec_drop_0.0/grad_clip_10/ /enc768/200ep/dec_drop_0.05/ \n", + "level_1 \n", + "1 1.076835 1.238104 \n", + "2 1.112467 1.353328 \n", + "3 1.089663 1.722623 \n", + "4 1.054037 1.568067 \n", + "5 1.055813 1.463525 \n", + "... ... ... \n", + "196 0.445126 0.442368 \n", + "197 0.444218 0.440721 \n", + "198 0.445028 0.441705 \n", + "199 0.446232 0.442230 \n", + "200 0.445801 0.441218 \n", + "\n", + " \\\n", + "level_0 /enc768/200ep/dec_drop_0.05_epsilon_1e-8/ Baseline Glow-TTS 768 \n", + "level_1 \n", + "1 1.235275 1.238104 \n", + "2 1.358104 1.353328 \n", + "3 1.748128 1.722623 \n", + "4 1.543148 1.568067 \n", + "5 1.484941 1.463525 \n", + "... ... ... \n", + "196 0.438285 0.442368 \n", + "197 0.437338 0.440721 \n", + "198 0.437170 0.441705 \n", + "199 0.437704 0.442230 \n", + "200 0.438248 0.441218 \n", + "\n", + " dev_loss_mle \\\n", + "level_0 /enc192/200ep/dec_drop_0.0/grad_clip_10/ /enc192/200ep/dec_drop_0.05/ \n", + "level_1 \n", + "1 0.046450 0.073396 \n", + "2 -0.219648 -0.152898 \n", + "3 -0.341818 -0.295465 \n", + "4 -0.392180 -0.361857 \n", + "5 -0.419574 -0.413436 \n", + "... ... ... 
\n", + "196 -0.837367 -0.813777 \n", + "197 -0.837755 -0.810379 \n", + "198 -0.830906 -0.814995 \n", + "199 -0.835626 -0.814634 \n", + "200 -0.773976 -0.815064 \n", + "\n", + " ... \\\n", + "level_0 /enc192/200ep/dec_drop_0.05_epsilon_1e-8/ ... \n", + "level_1 ... \n", + "1 0.073425 ... \n", + "2 -0.153953 ... \n", + "3 -0.295729 ... \n", + "4 -0.362248 ... \n", + "5 -0.414346 ... \n", + "... ... ... \n", + "196 -0.816068 ... \n", + "197 -0.815329 ... \n", + "198 -0.817098 ... \n", + "199 -0.816613 ... \n", + "200 -0.817505 ... \n", + "\n", + " dp \\\n", + "level_0 /enc768/200ep/dec_drop_0.05/ \n", + "level_1 \n", + "1 1.145111 \n", + "2 1.166236 \n", + "3 1.148909 \n", + "4 1.126033 \n", + "5 1.104397 \n", + "... ... \n", + "196 0.388870 \n", + "197 0.388216 \n", + "198 0.388245 \n", + "199 0.387994 \n", + "200 0.387377 \n", + "\n", + " \\\n", + "level_0 /enc768/200ep/dec_drop_0.05_epsilon_1e-8/ Baseline Glow-TTS 768 \n", + "level_1 \n", + "1 1.145046 1.145111 \n", + "2 1.166261 1.166236 \n", + "3 1.149577 1.148909 \n", + "4 1.126183 1.126033 \n", + "5 1.104511 1.104397 \n", + "... ... ... \n", + "196 0.387081 0.388870 \n", + "197 0.386526 0.388216 \n", + "198 0.386545 0.388245 \n", + "199 0.386290 0.387994 \n", + "200 0.386067 0.387377 \n", + "\n", + " mle \\\n", + "level_0 /enc192/200ep/dec_drop_0.0/grad_clip_10/ /enc192/200ep/dec_drop_0.05/ \n", + "level_1 \n", + "1 0.537252 0.568119 \n", + "2 -0.101299 -0.082530 \n", + "3 -0.294475 -0.281159 \n", + "4 -0.371938 -0.375815 \n", + "5 -0.410438 -0.422834 \n", + "... ... ... \n", + "196 -0.850404 -0.808053 \n", + "197 -0.850987 -0.808369 \n", + "198 -0.851446 -0.808689 \n", + "199 -0.852212 -0.809059 \n", + "200 -0.852834 -0.809477 \n", + "\n", + " \\\n", + "level_0 /enc192/200ep/dec_drop_0.05_epsilon_1e-8/ \n", + "level_1 \n", + "1 0.569599 \n", + "2 -0.082966 \n", + "3 -0.281597 \n", + "4 -0.375824 \n", + "5 -0.422658 \n", + "... ... 
\n", + "196 -0.806879 \n", + "197 -0.807225 \n", + "198 -0.807542 \n", + "199 -0.807872 \n", + "200 -0.808277 \n", + "\n", + " \\\n", + "level_0 /enc768/200ep/dec_drop_0.0/grad_clip_10/ /enc768/200ep/dec_drop_0.05/ \n", + "level_1 \n", + "1 0.502620 0.530890 \n", + "2 -0.083080 -0.066146 \n", + "3 -0.293310 -0.281814 \n", + "4 -0.381286 -0.383792 \n", + "5 -0.422354 -0.433206 \n", + "... ... ... \n", + "196 -0.848083 -0.811209 \n", + "197 -0.848792 -0.811539 \n", + "198 -0.849257 -0.810812 \n", + "199 -0.849940 -0.811891 \n", + "200 -0.850649 -0.812429 \n", + "\n", + " \n", + "level_0 /enc768/200ep/dec_drop_0.05_epsilon_1e-8/ Baseline Glow-TTS 768 \n", + "level_1 \n", + "1 0.532545 0.530890 \n", + "2 -0.066379 -0.066146 \n", + "3 -0.282104 -0.281814 \n", + "4 -0.383782 -0.383792 \n", + "5 -0.433054 -0.433206 \n", + "... ... ... \n", + "196 -0.810240 -0.811209 \n", + "197 -0.810735 -0.811539 \n", + "198 -0.811058 -0.810812 \n", + "199 -0.811443 -0.811891 \n", + "200 -0.811807 -0.812429 \n", + "\n", + "[200 rows x 28 columns]" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df2.pivot_table(index=\"level_1\", columns=\"level_0\", values=list(df.columns))" + ] + }, + { + "cell_type": "code", + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ @@ -196,7 +755,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ diff --git a/users/rilling/experiments/librispeech/librispeech_joint_training_given_alignments/pipeline.py b/users/rilling/experiments/librispeech/librispeech_joint_training_given_alignments/pipeline.py index 75e982e3f..ed5402169 100644 --- a/users/rilling/experiments/librispeech/librispeech_joint_training_given_alignments/pipeline.py +++ b/users/rilling/experiments/librispeech/librispeech_joint_training_given_alignments/pipeline.py @@ -58,6 +58,7 @@ def forward( returnn_python_exe=returnn_exe, 
returnn_root=returnn_root, mem_rqmt=20, + device="cpu" ) # last_forward_job.rqmt["gpu_mem"] = 24 diff --git a/users/rilling/experiments/librispeech/librispeech_joint_training_given_alignments/pytorch_networks/frozen_glowtts/ga_glowTTS_ASR_ffn.py b/users/rilling/experiments/librispeech/librispeech_joint_training_given_alignments/pytorch_networks/frozen_glowtts/ga_glowTTS_ASR_ffn.py index df8895e3c..bd712e14b 100644 --- a/users/rilling/experiments/librispeech/librispeech_joint_training_given_alignments/pytorch_networks/frozen_glowtts/ga_glowTTS_ASR_ffn.py +++ b/users/rilling/experiments/librispeech/librispeech_joint_training_given_alignments/pytorch_networks/frozen_glowtts/ga_glowTTS_ASR_ffn.py @@ -508,6 +508,7 @@ def phoneme_prediction_step(*, model: Model, data, run_ctx, **kwargs): given_attn=given_attn, recognition_input=run_ctx.recognition_input, ) + breakpoint() x_mask = torch.unsqueeze(commons.sequence_mask(phonemes_len, phonemes.size(1)), 1).to(phonemes.dtype) attn_mask = torch.unsqueeze(x_mask, -1) * torch.unsqueeze(z_mask, 2) diff --git a/users/rilling/experiments/librispeech/librispeech_joint_training_given_alignments/pytorch_networks/frozen_glowtts/ga_glowTTS_ASR_ffn_mas.py b/users/rilling/experiments/librispeech/librispeech_joint_training_given_alignments/pytorch_networks/frozen_glowtts/ga_glowTTS_ASR_ffn_mas.py new file mode 100644 index 000000000..a0cd90ee6 --- /dev/null +++ b/users/rilling/experiments/librispeech/librispeech_joint_training_given_alignments/pytorch_networks/frozen_glowtts/ga_glowTTS_ASR_ffn_mas.py @@ -0,0 +1,658 @@ +""" +Trying to make the aligner more AppTek-Like + +Extended weight init code +""" + +from dataclasses import dataclass +import torch +import numpy as np +from torch import nn +import multiprocessing +from librosa import filters +import sys +import time +from typing import Any, Dict, Optional, Tuple, Union +import math +import os +import soundfile + +from ..shared.configs import ( + SpecaugConfig, + 
VGG4LayerActFrontendV1Config_mod, + FlowDecoderConfig, + TextEncoderConfig, + DbMelFeatureExtractionConfig, + ModelConfigV2 +) + +from returnn.datasets.hdf import SimpleHDFWriter + +from ..shared.feature_extraction import DbMelFeatureExtraction +from ..shared.spec_augment import apply_spec_aug +from ..shared.mask import mask_tensor + +from ..shared import modules +from ..shared import commons +from ..shared import attentions +from ..monotonic_align import maximum_path + +from ..shared.forward import search_init_hook, search_finish_hook +from ..shared.eval_forward import * + +from IPython import embed + +class XVector(nn.Module): + def __init__(self, input_dim=40, num_classes=8, **kwargs): + super(XVector, self).__init__() + self.tdnn1 = modules.TDNN( + input_dim=input_dim, output_dim=512, context_size=5, dilation=1, dropout_p=0.5, batch_norm=True + ) + self.tdnn2 = modules.TDNN( + input_dim=512, output_dim=512, context_size=3, dilation=2, dropout_p=0.5, batch_norm=True + ) + self.tdnn3 = modules.TDNN( + input_dim=512, output_dim=512, context_size=2, dilation=3, dropout_p=0.5, batch_norm=True + ) + self.tdnn4 = modules.TDNN( + input_dim=512, output_dim=512, context_size=1, dilation=1, dropout_p=0.5, batch_norm=True + ) + self.tdnn5 = modules.TDNN( + input_dim=512, output_dim=512, context_size=1, dilation=1, dropout_p=0.5, batch_norm=True + ) + #### Frame levelPooling + self.segment6 = nn.Linear(1024, 512) + self.segment7 = nn.Linear(512, 512) + self.output = nn.Linear(512, num_classes) + self.softmax = nn.Softmax(dim=1) + + # fe_config = DbMelFeatureExtractionConfig.from_dict(kwargs["fe_config"]) + # self.feature_extraction = DbMelFeatureExtraction(config=fe_config) + + def forward(self, x, x_lengths): + # with torch.no_grad(): + # squeezed_audio = torch.squeeze(raw_audio) + # x, x_lengths = self.feature_extraction(squeezed_audio, raw_audio_lengths) # [B, T, F] + + # x = x.transpose(1, 2) + tdnn1_out = self.tdnn1(x) + # return tdnn1_out + tdnn2_out = 
self.tdnn2(tdnn1_out) + tdnn3_out = self.tdnn3(tdnn2_out) + tdnn4_out = self.tdnn4(tdnn3_out) + tdnn5_out = self.tdnn5(tdnn4_out) + ### Stat Pool + mean = torch.mean(tdnn5_out, 2) + std = torch.std(tdnn5_out, 2) + stat_pooling = torch.cat((mean, std), 1) + segment6_out = self.segment6(stat_pooling) + x_vec = self.segment7(segment6_out) + output = self.output(x_vec) + predictions = self.softmax(output) + return output, predictions, x_vec + + +class DurationPredictor(nn.Module): + """ + Duration Predictor module, trained using calculated durations coming from monotonic alignment search + """ + + def __init__(self, in_channels, filter_channels, filter_size, p_dropout): + super().__init__() + + self.in_channels = in_channels + self.filter_channels = filter_channels + self.filter_size = filter_size + self.p_dropout = p_dropout + + self.convs = nn.Sequential( + modules.Conv1DBlock( + in_size=self.in_channels, + out_size=self.filter_channels, + filter_size=self.filter_size, + p_dropout=p_dropout, + ), + modules.Conv1DBlock( + in_size=self.filter_channels, + out_size=self.filter_channels, + filter_size=self.filter_size, + p_dropout=p_dropout, + ), + ) + self.proj = nn.Conv1d(in_channels=self.filter_channels, out_channels=1, kernel_size=1) + + def forward(self, x, x_mask): + x_with_mask = (x, x_mask) + (x, x_mask) = self.convs(x_with_mask) + x = self.proj(x * x_mask) + return x + + +class FlowDecoder(nn.Module): + def __init__(self, cfg: FlowDecoderConfig, in_channels, gin_channels): + """Flow-based decoder model + + Args: + in_channels (int): Number of incoming channels + hidden_channels (int): Number of hidden channels + kernel_size (int): Kernel Size for convolutions in coupling blocks + dilation_rate (float): Dilation Rate to define dilation in convolutions of coupling block + n_blocks (int): Number of coupling blocks + n_layers (int): Number of layers in CNN of the coupling blocks + p_dropout (float, optional): Dropout probability for CNN in coupling blocks. 
Defaults to 0.. + n_split (int, optional): Number of splits for the 1x1 convolution for flows in the decoder. Defaults to 4. + n_sqz (int, optional): Squeeze. Defaults to 1. + sigmoid_scale (bool, optional): Boolean to define if log probs in coupling layers should be rescaled using sigmoid. Defaults to False. + gin_channels (int, optional): Number of speaker embedding channels. Defaults to 0. + """ + super().__init__() + self.cfg = cfg + + self.flows = nn.ModuleList() + + for _ in range(self.cfg.n_blocks): + self.flows.append(modules.ActNorm(channels=in_channels * self.cfg.n_sqz)) + self.flows.append(modules.InvConvNear(channels=in_channels * self.cfg.n_sqz, n_split=self.cfg.n_split)) + self.flows.append( + attentions.CouplingBlock( + in_channels * self.cfg.n_sqz, + self.cfg.hidden_channels, + kernel_size=self.cfg.kernel_size, + dilation_rate=self.cfg.dilation_rate, + n_layers=self.cfg.n_layers, + gin_channels=gin_channels, + p_dropout=self.cfg.p_dropout, + sigmoid_scale=self.cfg.sigmoid_scale, + ) + ) + + def forward(self, x, x_mask, g=None, reverse=False): + if not reverse: + flows = self.flows + logdet_tot = 0 + else: + flows = reversed(self.flows) + logdet_tot = None + + if self.cfg.n_sqz > 1: + x, x_mask = commons.channel_squeeze(x, x_mask, self.cfg.n_sqz) + for f in flows: + if not reverse: + x, logdet = f(x, x_mask, g=g, reverse=reverse) + logdet_tot += logdet + else: + x, logdet = f(x, x_mask, g=g, reverse=reverse) + if self.cfg.n_sqz > 1: + x, x_mask = commons.channel_unsqueeze(x, x_mask, self.cfg.n_sqz) + return x, logdet_tot + + def store_inverse(self): + for f in self.flows: + f.store_inverse() + +class TextEncoder(nn.Module): + """ + Text Encoder model + """ + + def __init__(self, cfg: TextEncoderConfig, out_channels, gin_channels): + """Text Encoder Model based on Multi-Head Self-Attention combined with FF-CCNs + + Args: + n_vocab (int): Size of vocabulary for embeddings + out_channels (int): Number of output channels + hidden_channels (int): Number 
of hidden channels + filter_channels (int): Number of filter channels + filter_channels_dp (int): Number of filter channels for duration predictor + n_heads (int): Number of heads in encoder's Multi-Head Attention + n_layers (int): Number of layers consisting of Multi-Head Attention and CNNs in encoder + kernel_size (int): Kernel Size for CNNs in encoder layers + p_dropout (float): Dropout probability for both encoder and duration predictor + window_size (int, optional): Window size in Multi-Head Self-Attention for encoder. Defaults to None. + block_length (_type_, optional): Block length for optional block masking in Multi-Head Attention for encoder. Defaults to None. + mean_only (bool, optional): Boolean to only project text encodings to mean values instead of mean and std. Defaults to False. + prenet (bool, optional): Boolean to add ConvReluNorm prenet before encoder . Defaults to False. + gin_channels (int, optional): Number of channels for speaker condition. Defaults to 0. + """ + super().__init__() + self.cfg = cfg + + self.emb = nn.Embedding(self.cfg.n_vocab, self.cfg.hidden_channels) + nn.init.normal_(self.emb.weight, 0.0, self.cfg.hidden_channels**-0.5) + + if self.cfg.prenet: + self.pre = modules.ConvReluNorm( + self.cfg.hidden_channels, + self.cfg.hidden_channels, + self.cfg.hidden_channels, + kernel_size=5, + n_layers=3, + p_dropout=0.5, + ) + self.encoder = attentions.Encoder( + self.cfg.hidden_channels, + self.cfg.filter_channels, + self.cfg.n_heads, + self.cfg.n_layers, + self.cfg.kernel_size, + self.cfg.p_dropout, + window_size=self.cfg.window_size, + block_length=self.cfg.block_length, + ) + + self.proj_m = nn.Conv1d(self.cfg.hidden_channels, out_channels, 1) + if not self.cfg.mean_only: + self.proj_s = nn.Conv1d(self.cfg.hidden_channels, out_channels, 1) + self.proj_w = DurationPredictor( + self.cfg.hidden_channels + gin_channels, + self.cfg.filter_channels_dp, + self.cfg.kernel_size, + self.cfg.p_dropout, + ) + + def forward(self, x, x_lengths, 
g=None): + x = self.emb(x) * math.sqrt(self.cfg.hidden_channels) # [b, t, h] + x = torch.transpose(x, 1, -1) # [b, h, t] + x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype) + + if self.cfg.prenet: + x = self.pre(x, x_mask) + x = self.encoder(x, x_mask) + + if g is not None: + g_exp = g.expand(-1, -1, x.size(-1)) + # print(f"Dimension of input in Text Encoder: x.shape: {x.shape}; g: {g.shape}, g_exp: {g_exp.shape}") + x_dp = torch.cat([torch.detach(x), g_exp], 1) + else: + x_dp = torch.detach(x) + + x_m = self.proj_m(x) * x_mask + if not self.cfg.mean_only: + x_logs = self.proj_s(x) * x_mask + else: + x_logs = torch.zeros_like(x_m) + + # print(f"Dimension of input in Text Encoder before DP: {x_dp.shape}") + + logw = self.proj_w(x_dp, x_mask) + return x_m, x_logs, logw, x_mask + +class Model(nn.Module): + """ + Flow-based ASR model based on GlowTTS Structure using a pre-trained flow-based decoder + trained to generate spectrograms from given statistics coming from an encoder + + Model was pretrained using the architecture in + users/rilling/experiments/librispeech/librispeech_glowtts/pytorch_networks/glowTTS.py + """ + + def __init__( + self, + model_config: dict, + **kwargs, + ): + """_summary_ + + Args: + n_vocab (int): vocabulary size + hidden_channels (int): Number of hidden channels in encoder + out_channels (int): Number of channels in the output + n_blocks_dec (int, optional): Number of coupling blocks in the decoder. Defaults to 12. + kernel_size_dec (int, optional): Kernel size in the decoder. Defaults to 5. + dilation_rate (int, optional): Dilation rate for CNNs of coupling blocks in decoder. Defaults to 5. + n_block_layers (int, optional): Number of layers in the CNN of the coupling blocks in decoder. Defaults to 4. + p_dropout_dec (_type_, optional): Dropout probability in the decoder. Defaults to 0.. + n_speakers (int, optional): Number of speakers. Defaults to 0. 
+ gin_channels (int, optional): Number of speaker embedding channels. Defaults to 0. + n_split (int, optional): Number of splits for the 1x1 convolution for flows in the decoder. Defaults to 4. + n_sqz (int, optional): Squeeze. Defaults to 1. + sigmoid_scale (bool, optional): Boolean to define if log probs in coupling layers should be rescaled using sigmoid. Defaults to False. + window_size (int, optional): Window size in Multi-Head Self-Attention for encoder. Defaults to None. + block_length (_type_, optional): Block length for optional block masking in Multi-Head Attention for encoder. Defaults to None. + hidden_channels_dec (_type_, optional): Number of hidden channels in decodder. Defaults to hidden_channels. + final_hidden_channels: Number of hidden channels in the final network + final_n_layers: Number of layers in the final network + label_target_size: Target size of target vocabulary, target size for final network + """ + super().__init__() + + self.net_kwargs = { + "repeat_per_num_frames": 100, + "max_dim_feat": 8, + "num_repeat_feat": 5, + "max_dim_time": 20, + } + + fe_config = DbMelFeatureExtractionConfig.from_dict(kwargs["fe_config"]) + self.feature_extraction = DbMelFeatureExtraction(config=fe_config) + + # if label_target_size is None: + # if n_vocab is None: + # run_ctx = get_run_ctx() + # dataset = run_ctx.engine.train_dataset or run_ctx.engine.forward_dataset + # self.label_target_size = len(dataset.datasets["zip_dataset"].targets.labels) + # else: + # self.label_target_size = n_vocab + # else: + # self.label_target_size = label_target_size + + self.cfg = ModelConfigV2.from_dict(model_config) + text_encoder_config = self.cfg.text_encoder_config + decoder_config = self.cfg.decoder_config + + if self.cfg.n_speakers > 1: + self.emb_g = nn.Embedding(self.cfg.n_speakers, self.cfg.gin_channels) + nn.init.uniform_(self.emb_g.weight, -0.1, 0.1) + + self.encoder = TextEncoder( + text_encoder_config, out_channels=self.cfg.out_channels, 
gin_channels=self.cfg.gin_channels + ) + + self.decoder = FlowDecoder( + decoder_config, in_channels=self.cfg.out_channels, gin_channels=self.cfg.gin_channels + ) + + self.asr_output = nn.Sequential() + + for i in range(self.cfg.phoneme_prediction_config.n_layers): + if i == 0: + in_channels = self.cfg.out_channels + else: + in_channels = self.cfg.phoneme_prediction_config.n_channels + if i < self.cfg.phoneme_prediction_config.n_layers - 1: + out_channels = self.cfg.phoneme_prediction_config.n_channels + else: + out_channels = self.cfg.label_target_size + 1 + + self.asr_output.append(nn.Linear(in_channels, out_channels)) + if i < self.cfg.phoneme_prediction_config.n_layers - 1: + self.asr_output.append(nn.ReLU()) + self.asr_output.append(nn.Dropout(self.cfg.phoneme_prediction_config.p_dropout)) + + self.specaug_start_epoch = self.cfg.specauc_start_epoch + + def forward( + self, x=None, x_lengths=None, raw_audio=None, raw_audio_lengths=None, g=None, recognition_input=None, noise_scale=1.0, length_scale=1.0 + ): + assert recognition_input in ["encoder", "decoder"] + with torch.no_grad(): + squeezed_audio = torch.squeeze(raw_audio) + y, y_lengths = self.feature_extraction(squeezed_audio, raw_audio_lengths) # [B, T, F] + y = y.transpose(1, 2) # [B, F, T] + self.encoder.eval() + self.decoder.eval() + assert g is not None, "Missing speaker embeddings!" 
+ g = nn.functional.normalize(self.emb_g(g.squeeze(-1))).unsqueeze(-1) + + y_max_length = y.size(2) + + y, y_lengths, y_max_length = self.preprocess(y, y_lengths, y_max_length) + z_mask = torch.unsqueeze(commons.sequence_mask(y_lengths, y_max_length), 1).to(torch.int32) + + x_m, x_logs, logw, x_mask = self.encoder(x, x_lengths, g=g) # mean, std logs, duration logs, mask + attn_mask = torch.unsqueeze(x_mask, -1) * torch.unsqueeze(z_mask, 2) + z, logdet = self.decoder(y, z_mask, g=g, reverse=False) + + x_s_sq_r = torch.exp(-2 * x_logs) + logp1 = torch.sum(-0.5 * math.log(2 * math.pi) - x_logs, [1]).unsqueeze(-1) # [b, t, 1] + logp2 = torch.matmul(x_s_sq_r.transpose(1, 2), -0.5 * (z**2)) # [b, t, d] x [b, d, t'] = [b, t, t'] + logp3 = torch.matmul((x_m * x_s_sq_r).transpose(1, 2), z) # [b, t, d] x [b, d, t'] = [b, t, t'] + logp4 = torch.sum(-0.5 * (x_m**2) * x_s_sq_r, [1]).unsqueeze(-1) # [b, t, 1] + logp = logp1 + logp2 + logp3 + logp4 # [b, t, t'] + + attn = maximum_path(logp, attn_mask.squeeze(1)).unsqueeze(1).detach() + # embed() + + z_m = torch.matmul(attn.squeeze(1).transpose(1, 2), x_m.transpose(1, 2)).transpose( + 1, 2 + ) # [b, t', t], [b, t, d] -> [b, d, t'] + z_logs = torch.matmul(attn.squeeze(1).transpose(1, 2), x_logs.transpose(1, 2)).transpose( + 1, 2 + ) # [b, t', t], [b, t, d] -> [b, d, t'] + + if recognition_input == "encoder": # TODO: This is wrong! 
+ asr_in = z.transpose(1, 2) + else: + asr_in = ((z_m + torch.exp(z_logs) * torch.randn_like(z_m) * noise_scale) * z_mask).transpose(1,2) + + logits = self.asr_output(asr_in) + + return logits, y_lengths, z_mask, attn + + def preprocess(self, y, y_lengths, y_max_length): + if y_max_length is not None: + y_max_length = (y_max_length // self.cfg.decoder_config.n_sqz) * self.cfg.decoder_config.n_sqz + y = y[:, :, :y_max_length] + y_lengths = (y_lengths // self.cfg.decoder_config.n_sqz) * self.cfg.decoder_config.n_sqz + return y, y_lengths, y_max_length + + def store_inverse(self): + self.decoder.store_inverse() + + +def train_step(*, model: Model, data, run_ctx, **kwargs): + tags = data["seq_tag"] + audio_features = data["audio_features"] # [B, T, F] + # audio_features = audio_features.transpose(1, 2) # [B, F, T] necessary because glowTTS expects the channels to be in the 2nd dimension + audio_features_len = data["audio_features:size1"] # [B] + + # perform local length sorting for more efficient packing + audio_features_len, indices = torch.sort(audio_features_len, descending=True) + + audio_features = audio_features[indices, :, :] + phonemes = data["phonemes"][indices, :] # [B, T] (sparse) + phonemes_len = data["phonemes:size1"][indices] # [B, T] + phonemes_eow = data["phonemes_eow"][indices, :] # [B, T] + phonemes_eow_len = data["phonemes_eow:size1"][indices] + durations = data["durations"][indices] + speaker_labels = data["speaker_labels"][indices, :] # [B, 1] (sparse) + tags = list(np.array(tags)[indices.detach().cpu().numpy()]) + + recognition_input = kwargs["recognition_input"] + logits, y_lengths, z_mask, attn = model( + phonemes, + phonemes_len, + audio_features, + audio_features_len, + g=speaker_labels, + recognition_input=recognition_input, + ) + + upsampled_phonemes = torch.matmul(attn.squeeze(1).transpose(1, 2), phonemes.float().unsqueeze(-1)).squeeze(-1) + + mask = commons.sequence_mask(y_lengths) + ce_losses = 
nn.functional.cross_entropy(logits.transpose(1, 2), upsampled_phonemes.long(), reduction="none") + ce_loss = (ce_losses * mask.float()).sum() / mask.float().sum() + + run_ctx.mark_as_loss(name="ce", loss=ce_loss) + +def phoneme_prediction_init_hook(run_ctx, **kwargs): + run_ctx.hdf_writer = SimpleHDFWriter("output.hdf", dim=1, ndim=1) + run_ctx.pool = multiprocessing.Pool(8) + run_ctx.recognition_input = kwargs["recognition_input"] + + +def phoneme_prediction_finish_hook(run_ctx, **kwargs): + run_ctx.hdf_writer.close() + + +def phoneme_prediction_step(*, model: Model, data, run_ctx, **kwargs): + """ + :param Model model: _description_ + :param _type_ data: _description_ + :param _type_ run_ctx: _description_ + """ + tags = data["seq_tag"] + audio_features = data["audio_features"] # [B, T, F] + audio_features_len = data["audio_features:size1"] # [B] + + # perform local length sorting for more efficient packing + audio_features_len, indices = torch.sort(audio_features_len, descending=True) + audio_features = audio_features[indices, :, :] + phonemes = data["phonemes"][indices, :] # [B, T] (sparse) + phonemes_len = data["phonemes:size1"][indices] # [B, T] + durations = data["durations"][indices] + + speaker_labels = data["speaker_labels"][indices, :] # [B, 1] (sparse) + + tags = list(np.array(tags)[indices.detach().cpu().numpy()]) + + logits, y_lengths, z_mask, attn = model( + x=phonemes, + x_lengths=phonemes_len, + raw_audio=audio_features, + raw_audio_lengths=audio_features_len, + g=speaker_labels, + recognition_input=run_ctx.recognition_input, + ) + + upsampled_phonemes = torch.matmul(attn.int().squeeze(1).transpose(1, 2), phonemes.unsqueeze(-1)).squeeze(-1) + + mask = commons.sequence_mask(y_lengths) + pred = torch.softmax(logits, dim=2).argmax(dim=2) + + accuracies = ( + (((pred == upsampled_phonemes) * mask).sum(dim=1) / y_lengths).unsqueeze(-1).unsqueeze(-1).detach().cpu() + ) + + for tag, acc in zip(tags, accuracies): + 
run_ctx.hdf_writer.insert_batch(np.array(acc), [1], [tag]) + + +def encoder_phoneme_prediction_init_hook(run_ctx, **kwargs): + run_ctx.hdf_writer = SimpleHDFWriter("output.hdf", dim=1, ndim=1) + run_ctx.pool = multiprocessing.Pool(8) + + +def encoder_phoneme_prediction_finish_hook(run_ctx, **kwargs): + run_ctx.hdf_writer.close() + + +def encoder_phoneme_prediction_step(*, model: Model, data, run_ctx, **kwargs): + """ + :param Model model: _description_ + :param _type_ data: _description_ + :param _type_ run_ctx: _description_ + """ + tags = data["seq_tag"] + audio_features = data["audio_features"] # [B, T, F] + # audio_features = audio_features.transpose(1, 2) # [B, F, T] necessary because glowTTS expects the channels to be in the 2nd dimension + audio_features_len = data["audio_features:size1"] # [B] + + # perform local length sorting for more efficient packing + audio_features_len, indices = torch.sort(audio_features_len, descending=True) + + audio_features = audio_features[indices, :, :] + phonemes = data["phonemes"][indices, :] # [B, T] (sparse) + phonemes_len = data["phonemes:size1"][indices] # [B, T] + speaker_labels = data["speaker_labels"][indices, :] # [B, 1] (sparse) + durations = data["durations"][indices] + + tags = list(np.array(tags)[indices.detach().cpu().numpy()]) + + x_mask = torch.unsqueeze(commons.sequence_mask(phonemes_len, phonemes.size(1)), 1).to(phonemes.dtype) + y_lengths = torch.ceil(audio_features.size(1) / model.feature_extraction.hop_length).to(torch.int32) + y_lengths = (y_lengths // model.cfg.decoder_config.n_sqz) * model.cfg.decoder_config.n_sqz + z_mask = torch.unsqueeze(commons.sequence_mask(audio_features_len, y_lengths), 1) + attn_mask = torch.unsqueeze(x_mask, -1) * torch.unsqueeze(z_mask, 2) + given_attn = commons.generate_path(durations.squeeze(1), attn_mask.squeeze(1)).unsqueeze(1).to(torch.float32) + + (z, z_m, z_logs, logdet, z_mask), (x_m, x_logs, x_mask), y_lengths, (given_attn, logw, logw_), (logits, _) = model( + 
x=phonemes, + x_lengths=phonemes_len, + raw_audio=audio_features, + raw_audio_lengths=audio_features_len, + g=speaker_labels, + given_attn=given_attn, + recognition=False, + ) + x_mask = torch.unsqueeze(commons.sequence_mask(phonemes_len, phonemes.size(1)), 1).to(phonemes.dtype) + + attn_mask = torch.unsqueeze(x_mask, -1) * torch.unsqueeze(z_mask, 2) + given_attn = commons.generate_path(durations.squeeze(1), attn_mask.squeeze(1)).unsqueeze(1) + + upsampled_phonemes = torch.matmul(given_attn.squeeze(1).transpose(1, 2), phonemes.unsqueeze(-1)).squeeze(-1) + + mask = commons.sequence_mask(y_lengths) + pred = torch.softmax(logits, dim=2).argmax(dim=2) + + accuracies = ( + (((pred == upsampled_phonemes) * mask).sum(dim=1) / y_lengths).unsqueeze(-1).unsqueeze(-1).detach().cpu() + ) + + for tag, acc in zip(tags, accuracies): + run_ctx.hdf_writer.insert_batch(np.array(acc), [1], [tag]) + + +# def search_init_hook(run_ctx, **kwargs): +# # we are storing durations, but call it output.hdf to match +# # the default output of the ReturnnForwardJob +# from torchaudio.models.decoder import ctc_decoder +# run_ctx.recognition_file = open("search_out.py", "wt") +# run_ctx.recognition_file.write("{\n") +# import subprocess +# if kwargs["arpa_lm"] is not None: +# lm = subprocess.check_output(["cf", kwargs["arpa_lm"]]).decode().strip() +# else: +# lm = None +# from returnn.datasets.util.vocabulary import Vocabulary +# vocab = Vocabulary.create_vocab( +# vocab_file=kwargs["returnn_vocab"], unknown_label=None) +# labels = vocab.labels + +# run_ctx.ctc_decoder = ctc_decoder( +# lexicon=kwargs["lexicon"], +# lm=lm, +# lm_weight=kwargs["lm_weight"], +# tokens=labels + ["[blank]", "[SILENCE]", "[UNK]"], +# # "[SILENCE]" and "[UNK]" are not actually part of the vocab, +# # but the decoder is happy as long they are defined in the token list +# # even if they do not exist as label index in the softmax output, +# blank_token="[blank]", +# sil_token="[SILENCE]", +# unk_word="[unknown]", +# 
nbest=1, +# beam_size=kwargs["beam_size"], +# beam_size_token=kwargs.get("beam_size_token", None), +# beam_threshold=kwargs["beam_threshold"], +# sil_score=kwargs.get("sil_score", 0.0), +# word_score=kwargs.get("word_score", 0.0), +# ) +# run_ctx.labels = labels +# run_ctx.blank_log_penalty = kwargs.get("blank_log_penalty", None) + +# if kwargs.get("prior_file", None): +# run_ctx.prior = np.loadtxt(kwargs["prior_file"], dtype="float32") +# run_ctx.prior_scale = kwargs["prior_scale"] +# else: +# run_ctx.prior = None + +# def search_finish_hook(run_ctx, **kwargs): +# run_ctx.recognition_file.write("}\n") +# run_ctx.recognition_file.close() + +# def search_step(*, model, data, run_ctx, **kwargs): +# raw_audio = data["raw_audio"] # [B, T', F] +# raw_audio_len = data["raw_audio:size1"] # [B] + +# logprobs, audio_features_len = model( +# raw_audio=raw_audio, +# raw_audio_lengths=raw_audio_len, +# recognition=True +# ) + +# tags = data["seq_tag"] + +# logprobs_cpu = logprobs.cpu() +# if run_ctx.blank_log_penalty is not None: +# # assumes blank is last +# logprobs_cpu[:, :, -1] -= run_ctx.blank_log_penalty +# if run_ctx.prior is not None: +# logprobs_cpu -= run_ctx.prior_scale * run_ctx.prior +# hypothesis = run_ctx.ctc_decoder(logprobs_cpu, audio_features_len.cpu()) + +# for hyp, tag in zip(hypothesis, tags): +# words = hyp[0].words +# sequence = " ".join([word for word in words if not word.startswith("[")]) +# print(sequence) +# run_ctx.recognition_file.write("%s: %s,\n" % (repr(tag), repr(sequence))) diff --git a/users/rilling/experiments/librispeech/librispeech_joint_training_given_alignments/pytorch_networks/frozen_glowtts/ga_glowTTS_ASR_ffn_mas_no_eval.py b/users/rilling/experiments/librispeech/librispeech_joint_training_given_alignments/pytorch_networks/frozen_glowtts/ga_glowTTS_ASR_ffn_mas_no_eval.py new file mode 100644 index 000000000..564e5ba13 --- /dev/null +++ 
# (collapsed-diff header residue, kept verbatim) b/users/rilling/experiments/librispeech/librispeech_joint_training_given_alignments/pytorch_networks/frozen_glowtts/ga_glowTTS_ASR_ffn_mas_no_eval.py @@ -0,0 +1,657 @@
"""
Trying to make the aligner more AppTek-Like

Extended weight init code
"""

from dataclasses import dataclass
import torch
import numpy as np
from torch import nn
import multiprocessing
from librosa import filters
import sys
import time
from typing import Any, Dict, Optional, Tuple, Union
import math
import os
import soundfile

from ..shared.configs import (
    SpecaugConfig,
    VGG4LayerActFrontendV1Config_mod,
    FlowDecoderConfig,
    TextEncoderConfig,
    DbMelFeatureExtractionConfig,
    ModelConfigV2
)

from returnn.datasets.hdf import SimpleHDFWriter

from ..shared.feature_extraction import DbMelFeatureExtraction
from ..shared.spec_augment import apply_spec_aug
from ..shared.mask import mask_tensor

from ..shared import modules
from ..shared import commons
from ..shared import attentions
from ..monotonic_align import maximum_path

from ..shared.forward import search_init_hook, search_finish_hook
from ..shared.eval_forward import *

from IPython import embed


class XVector(nn.Module):
    """Stack of five TDNN layers, statistics pooling and two linear "segment" layers with a classifier head."""

    def __init__(self, input_dim=40, num_classes=8, **kwargs):
        """
        :param input_dim: feature dimension fed to the first TDNN layer
        :param num_classes: output size of the final linear classifier
        :param kwargs: accepted and ignored
        """
        super().__init__()
        self.tdnn1 = modules.TDNN(
            input_dim=input_dim, output_dim=512, context_size=5, dilation=1, dropout_p=0.5, batch_norm=True
        )
        self.tdnn2 = modules.TDNN(
            input_dim=512, output_dim=512, context_size=3, dilation=2, dropout_p=0.5, batch_norm=True
        )
        self.tdnn3 = modules.TDNN(
            input_dim=512, output_dim=512, context_size=2, dilation=3, dropout_p=0.5, batch_norm=True
        )
        self.tdnn4 = modules.TDNN(
            input_dim=512, output_dim=512, context_size=1, dilation=1, dropout_p=0.5, batch_norm=True
        )
        self.tdnn5 = modules.TDNN(
            input_dim=512, output_dim=512, context_size=1, dilation=1, dropout_p=0.5, batch_norm=True
        )
        #### Frame levelPooling
        # 1024 = concatenation of the 512-dim mean and the 512-dim std computed in forward()
        self.segment6 = nn.Linear(1024, 512)
        self.segment7 = nn.Linear(512, 512)
        self.output = nn.Linear(512, num_classes)
        self.softmax = nn.Softmax(dim=1)

        # fe_config = DbMelFeatureExtractionConfig.from_dict(kwargs["fe_config"])
        # self.feature_extraction = DbMelFeatureExtraction(config=fe_config)

    def forward(self, x, x_lengths):
        """
        :param x: input features passed straight to the first TDNN layer
        :param x_lengths: unused
        :return: (classifier logits, softmax over the logits, output of ``segment7``)
        """
        # with torch.no_grad():
        #     squeezed_audio = torch.squeeze(raw_audio)
        #     x, x_lengths = self.feature_extraction(squeezed_audio, raw_audio_lengths)  # [B, T, F]

        # x = x.transpose(1, 2)
        tdnn1_out = self.tdnn1(x)
        # return tdnn1_out
        tdnn2_out = self.tdnn2(tdnn1_out)
        tdnn3_out = self.tdnn3(tdnn2_out)
        tdnn4_out = self.tdnn4(tdnn3_out)
        tdnn5_out = self.tdnn5(tdnn4_out)
        ### Stat Pool
        mean = torch.mean(tdnn5_out, 2)
        std = torch.std(tdnn5_out, 2)
        stat_pooling = torch.cat((mean, std), 1)
        segment6_out = self.segment6(stat_pooling)
        x_vec = self.segment7(segment6_out)
        output = self.output(x_vec)
        predictions = self.softmax(output)
        return output, predictions, x_vec


class DurationPredictor(nn.Module):
    """
    Duration Predictor module, trained using calculated durations coming from monotonic alignment search
    """

    def __init__(self, in_channels, filter_channels, filter_size, p_dropout):
        """
        :param in_channels: channels of the incoming tensor
        :param filter_channels: channels of both Conv1DBlocks
        :param filter_size: filter size of both Conv1DBlocks
        :param p_dropout: dropout probability handed to both Conv1DBlocks
        """
        super().__init__()

        self.in_channels = in_channels
        self.filter_channels = filter_channels
        self.filter_size = filter_size
        self.p_dropout = p_dropout

        self.convs = nn.Sequential(
            modules.Conv1DBlock(
                in_size=self.in_channels,
                out_size=self.filter_channels,
                filter_size=self.filter_size,
                p_dropout=p_dropout,
            ),
            modules.Conv1DBlock(
                in_size=self.filter_channels,
                out_size=self.filter_channels,
                filter_size=self.filter_size,
                p_dropout=p_dropout,
            ),
        )
        self.proj = nn.Conv1d(in_channels=self.filter_channels, out_channels=1, kernel_size=1)

    def forward(self, x, x_mask):
        """Run both conv blocks on ``(x, x_mask)`` and project the masked result to a single channel."""
        # the Conv1DBlocks are chained through nn.Sequential, so they take and return an (x, x_mask) tuple
        x_with_mask = (x, x_mask)
        (x, x_mask) = self.convs(x_with_mask)
        x = self.proj(x * x_mask)
        return x


class FlowDecoder(nn.Module):
    def __init__(self, cfg: FlowDecoderConfig, in_channels, gin_channels):
        """Flow-based decoder model

        Args:
            cfg (FlowDecoderConfig): fields read here: ``n_blocks``, ``n_sqz``, ``n_split``, ``hidden_channels``,
                ``kernel_size``, ``dilation_rate``, ``n_layers``, ``p_dropout``, ``sigmoid_scale``
            in_channels (int): Number of incoming channels (multiplied by ``cfg.n_sqz`` inside the flows)
            gin_channels (int): Number of speaker embedding channels handed to the coupling blocks
        """
        super().__init__()
        self.cfg = cfg

        self.flows = nn.ModuleList()

        # each of the n_blocks blocks is ActNorm -> InvConvNear -> CouplingBlock
        for _ in range(self.cfg.n_blocks):
            self.flows.append(modules.ActNorm(channels=in_channels * self.cfg.n_sqz))
            self.flows.append(modules.InvConvNear(channels=in_channels * self.cfg.n_sqz, n_split=self.cfg.n_split))
            self.flows.append(
                attentions.CouplingBlock(
                    in_channels * self.cfg.n_sqz,
                    self.cfg.hidden_channels,
                    kernel_size=self.cfg.kernel_size,
                    dilation_rate=self.cfg.dilation_rate,
                    n_layers=self.cfg.n_layers,
                    gin_channels=gin_channels,
                    p_dropout=self.cfg.p_dropout,
                    sigmoid_scale=self.cfg.sigmoid_scale,
                )
            )

    def forward(self, x, x_mask, g=None, reverse=False):
        """
        Apply all flows in order, or in reversed order when ``reverse`` is set.

        :return: ``(x, logdet_tot)``; ``logdet_tot`` is the sum of the per-flow log-determinants in the
            forward direction and ``None`` in the reverse direction
        """
        if not reverse:
            flows = self.flows
            logdet_tot = 0
        else:
            flows = reversed(self.flows)
            logdet_tot = None

        if self.cfg.n_sqz > 1:
            x, x_mask = commons.channel_squeeze(x, x_mask, self.cfg.n_sqz)
        for f in flows:
            # the flow call is the same in both directions; only the forward pass accumulates the logdet
            x, logdet = f(x, x_mask, g=g, reverse=reverse)
            if not reverse:
                logdet_tot += logdet
        if self.cfg.n_sqz > 1:
            x, x_mask = commons.channel_unsqueeze(x, x_mask, self.cfg.n_sqz)
        return x, logdet_tot

    def store_inverse(self):
        """Call ``store_inverse()`` on every flow."""
        for f in self.flows:
            f.store_inverse()


class TextEncoder(nn.Module):
    """
    Text Encoder model
    """

    def __init__(self, cfg: TextEncoderConfig, out_channels, gin_channels):
        """Text Encoder Model based on Multi-Head Self-Attention combined with FF-CCNs

        Args:
            cfg (TextEncoderConfig): fields read here: ``n_vocab``, ``hidden_channels``, ``prenet``,
                ``filter_channels``, ``n_heads``, ``n_layers``, ``kernel_size``, ``p_dropout``, ``window_size``,
                ``block_length``, ``mean_only``, ``filter_channels_dp``
            out_channels (int): Number of output channels of the mean / log-std projections
            gin_channels (int): Number of channels for speaker condition (added to the duration predictor input)
        """
        super().__init__()
        self.cfg = cfg

        self.emb = nn.Embedding(self.cfg.n_vocab, self.cfg.hidden_channels)
        nn.init.normal_(self.emb.weight, 0.0, self.cfg.hidden_channels**-0.5)

        if self.cfg.prenet:
            self.pre = modules.ConvReluNorm(
                self.cfg.hidden_channels,
                self.cfg.hidden_channels,
                self.cfg.hidden_channels,
                kernel_size=5,
                n_layers=3,
                p_dropout=0.5,
            )
        self.encoder = attentions.Encoder(
            self.cfg.hidden_channels,
            self.cfg.filter_channels,
            self.cfg.n_heads,
            self.cfg.n_layers,
            self.cfg.kernel_size,
            self.cfg.p_dropout,
            window_size=self.cfg.window_size,
            block_length=self.cfg.block_length,
        )

        self.proj_m = nn.Conv1d(self.cfg.hidden_channels, out_channels, 1)
        if not self.cfg.mean_only:
            self.proj_s = nn.Conv1d(self.cfg.hidden_channels, out_channels, 1)
        self.proj_w = DurationPredictor(
            self.cfg.hidden_channels + gin_channels,
            self.cfg.filter_channels_dp,
            self.cfg.kernel_size,
            self.cfg.p_dropout,
        )

    def forward(self, x, x_lengths, g=None):
        """
        :param x: label indices fed to the embedding
        :param x_lengths: sequence lengths used to build the mask
        :param g: optional speaker embedding, expanded over time and concatenated to the duration predictor input
        :return: ``(x_m, x_logs, logw, x_mask)``: masked mean, masked log-std (zeros when ``cfg.mean_only``),
            duration predictor output and the mask
        """
        x = self.emb(x) * math.sqrt(self.cfg.hidden_channels)  # [b, t, h]
        x = torch.transpose(x, 1, -1)  # [b, h, t]
        x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype)

        if self.cfg.prenet:
            x = self.pre(x, x_mask)
        x = self.encoder(x, x_mask)

        # the duration predictor works on a detached copy, so its loss does not reach the encoder through x
        if g is not None:
            g_exp = g.expand(-1, -1, x.size(-1))
            # print(f"Dimension of input in Text Encoder: x.shape: {x.shape}; g: {g.shape}, g_exp: {g_exp.shape}")
            x_dp = torch.cat([torch.detach(x), g_exp], 1)
        else:
            x_dp = torch.detach(x)

        x_m = self.proj_m(x) * x_mask
        if not self.cfg.mean_only:
            x_logs = self.proj_s(x) * x_mask
        else:
            x_logs = torch.zeros_like(x_m)

        # print(f"Dimension of input in Text Encoder before DP: {x_dp.shape}")

        logw = self.proj_w(x_dp, x_mask)
        return x_m, x_logs, logw, x_mask


class Model(nn.Module):
    """
    Flow-based ASR model based on GlowTTS Structure using a pre-trained flow-based decoder
    trained to generate spectrograms from given statistics coming from an encoder

    Model was pretrained using the architecture in
    users/rilling/experiments/librispeech/librispeech_glowtts/pytorch_networks/glowTTS.py
    """

    def __init__(
        self,
        model_config: dict,
        **kwargs,
    ):
        """
        Args:
            model_config (dict): converted with ``ModelConfigV2.from_dict``. Fields read by this class:
                ``text_encoder_config``, ``decoder_config``, ``n_speakers``, ``gin_channels``, ``out_channels``,
                ``phoneme_prediction_config`` (``n_layers``, ``n_channels``, ``p_dropout``),
                ``label_target_size`` and ``specauc_start_epoch``.
            **kwargs: must contain ``"fe_config"``, a dict converted with
                ``DbMelFeatureExtractionConfig.from_dict``.
        """
        super().__init__()

        self.net_kwargs = {
            "repeat_per_num_frames": 100,
            "max_dim_feat": 8,
            "num_repeat_feat": 5,
            "max_dim_time": 20,
        }

        fe_config = DbMelFeatureExtractionConfig.from_dict(kwargs["fe_config"])
        self.feature_extraction = DbMelFeatureExtraction(config=fe_config)

        # if label_target_size is None:
        #     if n_vocab is None:
        #         run_ctx = get_run_ctx()
        #         dataset = run_ctx.engine.train_dataset or run_ctx.engine.forward_dataset
        #         self.label_target_size = len(dataset.datasets["zip_dataset"].targets.labels)
        #     else:
        #         self.label_target_size = n_vocab
        # else:
        #     self.label_target_size = label_target_size

        self.cfg = ModelConfigV2.from_dict(model_config)
        text_encoder_config = self.cfg.text_encoder_config
        decoder_config = self.cfg.decoder_config

        # NOTE(review): forward() uses self.emb_g unconditionally, so a config with n_speakers <= 1
        # fails there with an AttributeError - confirm that such configs are never used with this model.
        if self.cfg.n_speakers > 1:
            self.emb_g = nn.Embedding(self.cfg.n_speakers, self.cfg.gin_channels)
            nn.init.uniform_(self.emb_g.weight, -0.1, 0.1)

        self.encoder = TextEncoder(
            text_encoder_config, out_channels=self.cfg.out_channels, gin_channels=self.cfg.gin_channels
        )

        self.decoder = FlowDecoder(
            decoder_config, in_channels=self.cfg.out_channels, gin_channels=self.cfg.gin_channels
        )

        # feed-forward prediction head: Linear (+ ReLU + Dropout on all but the last layer)
        self.asr_output = nn.Sequential()

        n_layers = self.cfg.phoneme_prediction_config.n_layers
        for i in range(n_layers):
            is_last = i == n_layers - 1
            in_channels = self.cfg.out_channels if i == 0 else self.cfg.phoneme_prediction_config.n_channels
            if is_last:
                out_channels = self.cfg.label_target_size + 1
            else:
                out_channels = self.cfg.phoneme_prediction_config.n_channels

            self.asr_output.append(nn.Linear(in_channels, out_channels))
            if not is_last:
                # NOTE(review): nesting reconstructed from whitespace-collapsed text; Dropout is assumed to
                # belong to the hidden layers only, together with the ReLU - confirm.
                self.asr_output.append(nn.ReLU())
                self.asr_output.append(nn.Dropout(self.cfg.phoneme_prediction_config.p_dropout))

        self.specaug_start_epoch = self.cfg.specauc_start_epoch

    def forward(
        self, x=None, x_lengths=None, raw_audio=None, raw_audio_lengths=None, g=None, recognition_input=None, noise_scale=1.0, length_scale=1.0
    ):
        """
        Align text and audio with monotonic alignment search and predict a label per audio frame.

        :param x: label indices for the text encoder
        :param x_lengths: lengths of ``x``
        :param raw_audio: audio passed to the feature extraction after removing the trailing axis
        :param raw_audio_lengths: lengths of ``raw_audio``
        :param g: speaker indices, required
        :param recognition_input: ``"encoder"`` or ``"decoder"``, selects the input of the prediction head
        :param noise_scale: scale of the Gaussian noise used in the non-``"encoder"`` branch
        :param length_scale: unused
        :return: ``(logits, y_lengths, z_mask, attn)``
        """
        assert recognition_input in ["encoder", "decoder"]
        # NOTE(review): nesting reconstructed from whitespace-collapsed text; only the feature extraction is
        # assumed to run under no_grad - confirm.
        with torch.no_grad():
            # Squeeze only the trailing axis: a bare torch.squeeze() would also drop the batch axis for B == 1.
            # assumes raw_audio is [B, T, 1] - TODO confirm
            squeezed_audio = torch.squeeze(raw_audio, dim=-1)
            y, y_lengths = self.feature_extraction(squeezed_audio, raw_audio_lengths)  # [B, T, F]
            y = y.transpose(1, 2)  # [B, F, T]

        assert g is not None, "Missing speaker embeddings!"
        g = nn.functional.normalize(self.emb_g(g.squeeze(-1))).unsqueeze(-1)

        y_max_length = y.size(2)

        # trim time axis and lengths to a multiple of n_sqz
        y, y_lengths, y_max_length = self.preprocess(y, y_lengths, y_max_length)
        z_mask = torch.unsqueeze(commons.sequence_mask(y_lengths, y_max_length), 1).to(torch.int32)

        x_m, x_logs, logw, x_mask = self.encoder(x, x_lengths, g=g)  # mean, std logs, duration logs, mask
        attn_mask = torch.unsqueeze(x_mask, -1) * torch.unsqueeze(z_mask, 2)
        z, logdet = self.decoder(y, z_mask, g=g, reverse=False)

        # Gaussian log-likelihood of every z frame under every text position, split into four terms
        x_s_sq_r = torch.exp(-2 * x_logs)
        logp1 = torch.sum(-0.5 * math.log(2 * math.pi) - x_logs, [1]).unsqueeze(-1)  # [b, t, 1]
        logp2 = torch.matmul(x_s_sq_r.transpose(1, 2), -0.5 * (z**2))  # [b, t, d] x [b, d, t'] = [b, t, t']
        logp3 = torch.matmul((x_m * x_s_sq_r).transpose(1, 2), z)  # [b, t, d] x [b, d, t'] = [b, t, t']
        logp4 = torch.sum(-0.5 * (x_m**2) * x_s_sq_r, [1]).unsqueeze(-1)  # [b, t, 1]
        logp = logp1 + logp2 + logp3 + logp4  # [b, t, t']

        attn = maximum_path(logp, attn_mask.squeeze(1)).unsqueeze(1).detach()
        # embed()

        z_m = torch.matmul(attn.squeeze(1).transpose(1, 2), x_m.transpose(1, 2)).transpose(
            1, 2
        )  # [b, t', t], [b, t, d] -> [b, d, t']
        z_logs = torch.matmul(attn.squeeze(1).transpose(1, 2), x_logs.transpose(1, 2)).transpose(
            1, 2
        )  # [b, t', t], [b, t, d] -> [b, d, t']

        # NOTE(review): the "encoder" branch feeds z, which self.decoder computes from the audio features,
        # while the other branch samples from the aligned text-encoder statistics. The naming looks
        # inverted - confirm the intended semantics.
        if recognition_input == "encoder":
            asr_in = z.transpose(1, 2)
        else:
            asr_in = ((z_m + torch.exp(z_logs) * torch.randn_like(z_m) * noise_scale) * z_mask).transpose(1, 2)

        logits = self.asr_output(asr_in)

        return logits, y_lengths, z_mask, attn

    def preprocess(self, y, y_lengths, y_max_length):
        """Round ``y_max_length`` and ``y_lengths`` down to a multiple of ``n_sqz`` and trim ``y`` accordingly."""
        n_sqz = self.cfg.decoder_config.n_sqz
        if y_max_length is not None:
            y_max_length = (y_max_length // n_sqz) * n_sqz
            y = y[:, :, :y_max_length]
        y_lengths = (y_lengths // n_sqz) * n_sqz
        return y, y_lengths, y_max_length

    def store_inverse(self):
        """Forward ``store_inverse()`` to the flow decoder."""
        self.decoder.store_inverse()


def train_step(*, model: Model, data, run_ctx, **kwargs):
    """
    Compute the frame-wise cross-entropy between the model logits and the aligned phoneme labels.

    Reads ``audio_features``, ``phonemes``, ``speaker_labels`` (and their ``:size1`` lengths) from ``data``
    and ``recognition_input`` from ``kwargs``; marks the masked mean CE as loss ``"ce"`` on ``run_ctx``.
    """
    audio_features = data["audio_features"]  # [B, T, F]
    # audio_features = audio_features.transpose(1, 2) # [B, F, T] necessary because glowTTS expects the channels to be in the 2nd dimension
    audio_features_len = data["audio_features:size1"]  # [B]

    # perform local length sorting for more efficient packing
    audio_features_len, indices = torch.sort(audio_features_len, descending=True)

    audio_features = audio_features[indices, :, :]
    phonemes = data["phonemes"][indices, :]  # [B, T] (sparse)
    phonemes_len = data["phonemes:size1"][indices]  # [B, T]
    speaker_labels = data["speaker_labels"][indices, :]  # [B, 1] (sparse)

    recognition_input = kwargs["recognition_input"]
    logits, y_lengths, z_mask, attn = model(
        phonemes,
        phonemes_len,
        audio_features,
        audio_features_len,
        g=speaker_labels,
        recognition_input=recognition_input,
    )

    # spread the phoneme labels over the audio frames along the alignment path
    upsampled_phonemes = torch.matmul(attn.squeeze(1).transpose(1, 2), phonemes.float().unsqueeze(-1)).squeeze(-1)

    mask = commons.sequence_mask(y_lengths)
    ce_losses = nn.functional.cross_entropy(logits.transpose(1, 2), upsampled_phonemes.long(), reduction="none")
    # average only over the frames inside each sequence's length
    ce_loss = (ce_losses * mask.float()).sum() / mask.float().sum()

    run_ctx.mark_as_loss(name="ce", loss=ce_loss)


def _shutdown_pool(run_ctx):
    """Close and join the worker pool stored on ``run_ctx`` by the init hooks, if there is one."""
    pool = getattr(run_ctx, "pool", None)
    if pool is not None:
        pool.close()
        pool.join()
        run_ctx.pool = None


def phoneme_prediction_init_hook(run_ctx, **kwargs):
    """Open the HDF writer and worker pool and store ``kwargs["recognition_input"]`` on ``run_ctx``."""
    run_ctx.hdf_writer = SimpleHDFWriter("output.hdf", dim=1, ndim=1)
    run_ctx.pool = multiprocessing.Pool(8)
    run_ctx.recognition_input = kwargs["recognition_input"]


def phoneme_prediction_finish_hook(run_ctx, **kwargs):
    """Close the HDF writer and release the worker pool created by the init hook."""
    run_ctx.hdf_writer.close()
    _shutdown_pool(run_ctx)


def phoneme_prediction_step(*, model: Model, data, run_ctx, **kwargs):
    """
    Write the per-sequence frame accuracy of the phoneme prediction head to ``run_ctx.hdf_writer``.

    :param Model model: model whose forward returns ``(logits, y_lengths, z_mask, attn)``
    :param data: batch dict with ``seq_tag``, ``audio_features``, ``phonemes``, ``speaker_labels`` and lengths
    :param run_ctx: provides ``recognition_input`` and ``hdf_writer`` (set in the init hook)
    """
    tags = data["seq_tag"]
    audio_features = data["audio_features"]  # [B, T, F]
    audio_features_len = data["audio_features:size1"]  # [B]

    # perform local length sorting for more efficient packing
    audio_features_len, indices = torch.sort(audio_features_len, descending=True)
    audio_features = audio_features[indices, :, :]
    phonemes = data["phonemes"][indices, :]  # [B, T] (sparse)
    phonemes_len = data["phonemes:size1"][indices]  # [B, T]

    speaker_labels = data["speaker_labels"][indices, :]  # [B, 1] (sparse)

    # keep the tags aligned with the re-ordered batch
    tags = list(np.array(tags)[indices.detach().cpu().numpy()])

    logits, y_lengths, z_mask, attn = model(
        x=phonemes,
        x_lengths=phonemes_len,
        raw_audio=audio_features,
        raw_audio_lengths=audio_features_len,
        g=speaker_labels,
        recognition_input=run_ctx.recognition_input,
    )

    upsampled_phonemes = torch.matmul(attn.int().squeeze(1).transpose(1, 2), phonemes.unsqueeze(-1)).squeeze(-1)

    mask = commons.sequence_mask(y_lengths)
    # softmax is monotonic, so the argmax can be taken on the logits directly
    pred = logits.argmax(dim=2)

    accuracies = (
        (((pred == upsampled_phonemes) * mask).sum(dim=1) / y_lengths).unsqueeze(-1).unsqueeze(-1).detach().cpu()
    )

    for tag, acc in zip(tags, accuracies):
        run_ctx.hdf_writer.insert_batch(np.array(acc), [1], [tag])


def encoder_phoneme_prediction_init_hook(run_ctx, **kwargs):
    """Open the HDF writer and worker pool used by ``encoder_phoneme_prediction_step``."""
    run_ctx.hdf_writer = SimpleHDFWriter("output.hdf", dim=1, ndim=1)
    run_ctx.pool = multiprocessing.Pool(8)


def encoder_phoneme_prediction_finish_hook(run_ctx, **kwargs):
    """Close the HDF writer and release the worker pool created by the init hook."""
    run_ctx.hdf_writer.close()
    _shutdown_pool(run_ctx)


def encoder_phoneme_prediction_step(*, model: Model, data, run_ctx, **kwargs):
    """
    Frame accuracy against alignments generated from the given ``durations``.

    NOTE(review): this step does not match ``Model.forward`` as defined in this module. It passes
    ``given_attn`` and ``recognition``, which forward() does not accept, and unpacks five values where
    forward() returns four. ``torch.ceil`` is also applied to a plain Python number. The code is left
    unchanged because the intended behaviour cannot be derived from this file - confirm whether the step
    is still used, and against which model.

    :param Model model: see the note above
    :param data: batch dict with ``seq_tag``, ``audio_features``, ``phonemes``, ``speaker_labels``,
        ``durations`` and lengths
    :param run_ctx: provides ``hdf_writer`` (set in the init hook)
    """
    tags = data["seq_tag"]
    audio_features = data["audio_features"]  # [B, T, F]
    # audio_features = audio_features.transpose(1, 2) # [B, F, T] necessary because glowTTS expects the channels to be in the 2nd dimension
    audio_features_len = data["audio_features:size1"]  # [B]

    # perform local length sorting for more efficient packing
    audio_features_len, indices = torch.sort(audio_features_len, descending=True)

    audio_features = audio_features[indices, :, :]
    phonemes = data["phonemes"][indices, :]  # [B, T] (sparse)
    phonemes_len = data["phonemes:size1"][indices]  # [B, T]
    speaker_labels = data["speaker_labels"][indices, :]  # [B, 1] (sparse)
    durations = data["durations"][indices]

    tags = list(np.array(tags)[indices.detach().cpu().numpy()])

    x_mask = torch.unsqueeze(commons.sequence_mask(phonemes_len, phonemes.size(1)), 1).to(phonemes.dtype)
    y_lengths = torch.ceil(audio_features.size(1) / model.feature_extraction.hop_length).to(torch.int32)
    y_lengths = (y_lengths // model.cfg.decoder_config.n_sqz) * model.cfg.decoder_config.n_sqz
    z_mask = torch.unsqueeze(commons.sequence_mask(audio_features_len, y_lengths), 1)
    attn_mask = torch.unsqueeze(x_mask, -1) * torch.unsqueeze(z_mask, 2)
    given_attn = commons.generate_path(durations.squeeze(1), attn_mask.squeeze(1)).unsqueeze(1).to(torch.float32)

    (z, z_m, z_logs, logdet, z_mask), (x_m, x_logs, x_mask), y_lengths, (given_attn, logw, logw_), (logits, _) = model(
        x=phonemes,
        x_lengths=phonemes_len,
        raw_audio=audio_features,
        raw_audio_lengths=audio_features_len,
        g=speaker_labels,
        given_attn=given_attn,
        recognition=False,
    )
    x_mask = torch.unsqueeze(commons.sequence_mask(phonemes_len, phonemes.size(1)), 1).to(phonemes.dtype)

    attn_mask = torch.unsqueeze(x_mask, -1) * torch.unsqueeze(z_mask, 2)
    given_attn = commons.generate_path(durations.squeeze(1), attn_mask.squeeze(1)).unsqueeze(1)

    upsampled_phonemes = torch.matmul(given_attn.squeeze(1).transpose(1, 2), phonemes.unsqueeze(-1)).squeeze(-1)

    mask = commons.sequence_mask(y_lengths)
    pred = torch.softmax(logits, dim=2).argmax(dim=2)

    accuracies = (
        (((pred == upsampled_phonemes) * mask).sum(dim=1) / y_lengths).unsqueeze(-1).unsqueeze(-1).detach().cpu()
    )

    for tag, acc in zip(tags, accuracies):
        run_ctx.hdf_writer.insert_batch(np.array(acc), [1], [tag])


# def search_init_hook(run_ctx, **kwargs):
#     # we are storing durations, but call it output.hdf to match
#     # the default output of the ReturnnForwardJob
#     from torchaudio.models.decoder import ctc_decoder
#     run_ctx.recognition_file = open("search_out.py", "wt")
#     run_ctx.recognition_file.write("{\n")
#     import subprocess
#     if kwargs["arpa_lm"] is not None:
#         lm = subprocess.check_output(["cf", kwargs["arpa_lm"]]).decode().strip()
#     else:
#         lm = None
#     from returnn.datasets.util.vocabulary import Vocabulary
#     vocab = Vocabulary.create_vocab(
#         vocab_file=kwargs["returnn_vocab"], unknown_label=None)
#     labels = vocab.labels

#     run_ctx.ctc_decoder = ctc_decoder(
#         lexicon=kwargs["lexicon"],
#         lm=lm,
#         lm_weight=kwargs["lm_weight"],
#         tokens=labels + ["[blank]", "[SILENCE]", "[UNK]"],
#         # "[SILENCE]" and "[UNK]" are not actually part of the vocab,
#         # but the decoder is happy as long they are defined in the token list
#         # even if they do not exist as label index in the softmax output,
#         blank_token="[blank]",
#         sil_token="[SILENCE]",
#         unk_word="[unknown]",
#         nbest=1,
#         beam_size=kwargs["beam_size"],
#
beam_size_token=kwargs.get("beam_size_token", None), +# beam_threshold=kwargs["beam_threshold"], +# sil_score=kwargs.get("sil_score", 0.0), +# word_score=kwargs.get("word_score", 0.0), +# ) +# run_ctx.labels = labels +# run_ctx.blank_log_penalty = kwargs.get("blank_log_penalty", None) + +# if kwargs.get("prior_file", None): +# run_ctx.prior = np.loadtxt(kwargs["prior_file"], dtype="float32") +# run_ctx.prior_scale = kwargs["prior_scale"] +# else: +# run_ctx.prior = None + +# def search_finish_hook(run_ctx, **kwargs): +# run_ctx.recognition_file.write("}\n") +# run_ctx.recognition_file.close() + +# def search_step(*, model, data, run_ctx, **kwargs): +# raw_audio = data["raw_audio"] # [B, T', F] +# raw_audio_len = data["raw_audio:size1"] # [B] + +# logprobs, audio_features_len = model( +# raw_audio=raw_audio, +# raw_audio_lengths=raw_audio_len, +# recognition=True +# ) + +# tags = data["seq_tag"] + +# logprobs_cpu = logprobs.cpu() +# if run_ctx.blank_log_penalty is not None: +# # assumes blank is last +# logprobs_cpu[:, :, -1] -= run_ctx.blank_log_penalty +# if run_ctx.prior is not None: +# logprobs_cpu -= run_ctx.prior_scale * run_ctx.prior +# hypothesis = run_ctx.ctc_decoder(logprobs_cpu, audio_features_len.cpu()) + +# for hyp, tag in zip(hypothesis, tags): +# words = hyp[0].words +# sequence = " ".join([word for word in words if not word.startswith("[")]) +# print(sequence) +# run_ctx.recognition_file.write("%s: %s,\n" % (repr(tag), repr(sequence))) diff --git a/users/rilling/experiments/librispeech/librispeech_joint_training_given_alignments/pytorch_networks/frozen_glowtts/ga_glowTTS_ASR_ffn_x_vector.py b/users/rilling/experiments/librispeech/librispeech_joint_training_given_alignments/pytorch_networks/frozen_glowtts/ga_glowTTS_ASR_ffn_x_vector.py index 096a27828..bca2eed86 100644 --- a/users/rilling/experiments/librispeech/librispeech_joint_training_given_alignments/pytorch_networks/frozen_glowtts/ga_glowTTS_ASR_ffn_x_vector.py +++ 
b/users/rilling/experiments/librispeech/librispeech_joint_training_given_alignments/pytorch_networks/frozen_glowtts/ga_glowTTS_ASR_ffn_x_vector.py @@ -501,7 +501,6 @@ def phoneme_prediction_step(*, model: Model, data, run_ctx, **kwargs): z_mask = torch.unsqueeze(commons.sequence_mask(audio_features_len, y_lengths), 1) attn_mask = torch.unsqueeze(x_mask, -1) * torch.unsqueeze(z_mask, 2) given_attn = commons.generate_path(durations.squeeze(1), attn_mask.squeeze(1)).unsqueeze(1).to(torch.float32) - logits, y_lengths, z_mask = model( x=phonemes, x_lengths=phonemes_len, diff --git a/users/rilling/experiments/librispeech/librispeech_joint_training_given_alignments/pytorch_networks/frozen_glowtts/ga_glowTTS_ASR_ffn_x_vector_mas.py b/users/rilling/experiments/librispeech/librispeech_joint_training_given_alignments/pytorch_networks/frozen_glowtts/ga_glowTTS_ASR_ffn_x_vector_mas.py index 0ec257369..a23d9e795 100644 --- a/users/rilling/experiments/librispeech/librispeech_joint_training_given_alignments/pytorch_networks/frozen_glowtts/ga_glowTTS_ASR_ffn_x_vector_mas.py +++ b/users/rilling/experiments/librispeech/librispeech_joint_training_given_alignments/pytorch_networks/frozen_glowtts/ga_glowTTS_ASR_ffn_x_vector_mas.py @@ -414,7 +414,7 @@ def forward( 1, 2 ) # [b, t', t], [b, t, d] -> [b, d, t'] - if recognition_input == "encoder": + if recognition_input == "encoder": #TODO: This is wrong! 
asr_in = z.transpose(1,2) else: asr_in = ((z_m + torch.exp(z_logs) * torch.randn_like(z_m) * noise_scale) * z_mask).transpose(1,2) @@ -508,12 +508,8 @@ def phoneme_prediction_step(*, model: Model, data, run_ctx, **kwargs): g=speaker_labels, recognition_input=run_ctx.recognition_input, ) - x_mask = torch.unsqueeze(commons.sequence_mask(phonemes_len, phonemes.size(1)), 1).to(phonemes.dtype) - attn_mask = torch.unsqueeze(x_mask, -1) * torch.unsqueeze(z_mask, 2) - given_attn = commons.generate_path(durations.squeeze(1), attn_mask.squeeze(1)).unsqueeze(1) - - upsampled_phonemes = torch.matmul(given_attn.squeeze(1).transpose(1, 2), phonemes.unsqueeze(-1)).squeeze(-1) + upsampled_phonemes = torch.matmul(attn.squeeze(1).transpose(1, 2), phonemes.unsqueeze(-1)).squeeze(-1) mask = commons.sequence_mask(y_lengths) pred = torch.softmax(logits, dim=2).argmax(dim=2) diff --git a/users/rilling/experiments/librispeech/librispeech_joint_training_given_alignments/pytorch_networks/frozen_glowtts/ga_glowTTS_ASR_ffn_x_vector_mas_no_eval.py b/users/rilling/experiments/librispeech/librispeech_joint_training_given_alignments/pytorch_networks/frozen_glowtts/ga_glowTTS_ASR_ffn_x_vector_mas_no_eval.py new file mode 100644 index 000000000..198ff888b --- /dev/null +++ b/users/rilling/experiments/librispeech/librispeech_joint_training_given_alignments/pytorch_networks/frozen_glowtts/ga_glowTTS_ASR_ffn_x_vector_mas_no_eval.py @@ -0,0 +1,600 @@ +""" +Trying to make the aligner more AppTek-Like + +Extended weight init code +""" + +from dataclasses import dataclass +import torch +import numpy as np +from torch import nn +import multiprocessing +from librosa import filters +import sys +import time +from typing import Any, Dict, Optional, Tuple, Union +import math +import os +import soundfile + +from ..shared.configs import ( + SpecaugConfig, + VGG4LayerActFrontendV1Config_mod, + FlowDecoderConfig, + TextEncoderConfig, + DbMelFeatureExtractionConfig, + ModelConfigV2 +) + +from returnn.datasets.hdf 
import SimpleHDFWriter + +from ..shared.feature_extraction import DbMelFeatureExtraction +from ..shared.spec_augment import apply_spec_aug +from ..shared.mask import mask_tensor + +from ..shared import modules +from ..shared import commons +from ..shared import attentions +from ..monotonic_align import maximum_path + +from ..shared.forward import search_init_hook, search_finish_hook +from ..shared.eval_forward import * + +from IPython import embed + +class XVector(nn.Module): + def __init__(self, input_dim=40, num_classes=8, **kwargs): + super(XVector, self).__init__() + self.tdnn1 = modules.TDNN( + input_dim=input_dim, output_dim=512, context_size=5, dilation=1, dropout_p=0.5, batch_norm=True + ) + self.tdnn2 = modules.TDNN( + input_dim=512, output_dim=512, context_size=3, dilation=2, dropout_p=0.5, batch_norm=True + ) + self.tdnn3 = modules.TDNN( + input_dim=512, output_dim=512, context_size=2, dilation=3, dropout_p=0.5, batch_norm=True + ) + self.tdnn4 = modules.TDNN( + input_dim=512, output_dim=512, context_size=1, dilation=1, dropout_p=0.5, batch_norm=True + ) + self.tdnn5 = modules.TDNN( + input_dim=512, output_dim=512, context_size=1, dilation=1, dropout_p=0.5, batch_norm=True + ) + #### Frame levelPooling + self.segment6 = nn.Linear(1024, 512) + self.segment7 = nn.Linear(512, 512) + self.output = nn.Linear(512, num_classes) + self.softmax = nn.Softmax(dim=1) + + # fe_config = DbMelFeatureExtractionConfig.from_dict(kwargs["fe_config"]) + # self.feature_extraction = DbMelFeatureExtraction(config=fe_config) + + def forward(self, x, x_lengths): + # with torch.no_grad(): + # squeezed_audio = torch.squeeze(raw_audio) + # x, x_lengths = self.feature_extraction(squeezed_audio, raw_audio_lengths) # [B, T, F] + + # x = x.transpose(1, 2) + tdnn1_out = self.tdnn1(x) + # return tdnn1_out + tdnn2_out = self.tdnn2(tdnn1_out) + tdnn3_out = self.tdnn3(tdnn2_out) + tdnn4_out = self.tdnn4(tdnn3_out) + tdnn5_out = self.tdnn5(tdnn4_out) + ### Stat Pool + mean = 
torch.mean(tdnn5_out, 2) + std = torch.std(tdnn5_out, 2) + stat_pooling = torch.cat((mean, std), 1) + segment6_out = self.segment6(stat_pooling) + x_vec = self.segment7(segment6_out) + output = self.output(x_vec) + predictions = self.softmax(output) + return output, predictions, x_vec + + +class DurationPredictor(nn.Module): + """ + Duration Predictor module, trained using calculated durations coming from monotonic alignment search + """ + + def __init__(self, in_channels, filter_channels, filter_size, p_dropout): + super().__init__() + + self.in_channels = in_channels + self.filter_channels = filter_channels + self.filter_size = filter_size + self.p_dropout = p_dropout + + self.convs = nn.Sequential( + modules.Conv1DBlock( + in_size=self.in_channels, + out_size=self.filter_channels, + filter_size=self.filter_size, + p_dropout=p_dropout, + ), + modules.Conv1DBlock( + in_size=self.filter_channels, + out_size=self.filter_channels, + filter_size=self.filter_size, + p_dropout=p_dropout, + ), + ) + self.proj = nn.Conv1d(in_channels=self.filter_channels, out_channels=1, kernel_size=1) + + def forward(self, x, x_mask): + x_with_mask = (x, x_mask) + (x, x_mask) = self.convs(x_with_mask) + x = self.proj(x * x_mask) + return x + + +class FlowDecoder(nn.Module): + def __init__(self, cfg: FlowDecoderConfig, in_channels, gin_channels): + """Flow-based decoder model + + Args: + in_channels (int): Number of incoming channels + hidden_channels (int): Number of hidden channels + kernel_size (int): Kernel Size for convolutions in coupling blocks + dilation_rate (float): Dilation Rate to define dilation in convolutions of coupling block + n_blocks (int): Number of coupling blocks + n_layers (int): Number of layers in CNN of the coupling blocks + p_dropout (float, optional): Dropout probability for CNN in coupling blocks. Defaults to 0.. + n_split (int, optional): Number of splits for the 1x1 convolution for flows in the decoder. Defaults to 4. + n_sqz (int, optional): Squeeze. 
Defaults to 1. + sigmoid_scale (bool, optional): Boolean to define if log probs in coupling layers should be rescaled using sigmoid. Defaults to False. + gin_channels (int, optional): Number of speaker embedding channels. Defaults to 0. + """ + super().__init__() + self.cfg = cfg + + self.flows = nn.ModuleList() + + for _ in range(self.cfg.n_blocks): + self.flows.append(modules.ActNorm(channels=in_channels * self.cfg.n_sqz)) + self.flows.append(modules.InvConvNear(channels=in_channels * self.cfg.n_sqz, n_split=self.cfg.n_split)) + self.flows.append( + attentions.CouplingBlock( + in_channels * self.cfg.n_sqz, + self.cfg.hidden_channels, + kernel_size=self.cfg.kernel_size, + dilation_rate=self.cfg.dilation_rate, + n_layers=self.cfg.n_layers, + gin_channels=gin_channels, + p_dropout=self.cfg.p_dropout, + sigmoid_scale=self.cfg.sigmoid_scale, + ) + ) + + def forward(self, x, x_mask, g=None, reverse=False): + if not reverse: + flows = self.flows + logdet_tot = 0 + else: + flows = reversed(self.flows) + logdet_tot = None + + if g is not None: + g = g.unsqueeze(-1) + + if self.cfg.n_sqz > 1: + x, x_mask = commons.channel_squeeze(x, x_mask, self.cfg.n_sqz) + for f in flows: + if not reverse: + x, logdet = f(x, x_mask, g=g, reverse=reverse) + logdet_tot += logdet + else: + x, logdet = f(x, x_mask, g=g, reverse=reverse) + if self.cfg.n_sqz > 1: + x, x_mask = commons.channel_unsqueeze(x, x_mask, self.cfg.n_sqz) + return x, logdet_tot + + def store_inverse(self): + for f in self.flows: + f.store_inverse() + +class TextEncoder(nn.Module): + """ + Text Encoder model + """ + + def __init__(self, cfg: TextEncoderConfig, out_channels, gin_channels): + """Text Encoder Model based on Multi-Head Self-Attention combined with FF-CCNs + + Args: + n_vocab (int): Size of vocabulary for embeddings + out_channels (int): Number of output channels + hidden_channels (int): Number of hidden channels + filter_channels (int): Number of filter channels + filter_channels_dp (int): Number of filter 
channels for duration predictor + n_heads (int): Number of heads in encoder's Multi-Head Attention + n_layers (int): Number of layers consisting of Multi-Head Attention and CNNs in encoder + kernel_size (int): Kernel Size for CNNs in encoder layers + p_dropout (float): Dropout probability for both encoder and duration predictor + window_size (int, optional): Window size in Multi-Head Self-Attention for encoder. Defaults to None. + block_length (_type_, optional): Block length for optional block masking in Multi-Head Attention for encoder. Defaults to None. + mean_only (bool, optional): Boolean to only project text encodings to mean values instead of mean and std. Defaults to False. + prenet (bool, optional): Boolean to add ConvReluNorm prenet before encoder . Defaults to False. + gin_channels (int, optional): Number of channels for speaker condition. Defaults to 0. + """ + super().__init__() + self.cfg = cfg + + self.emb = nn.Embedding(self.cfg.n_vocab, self.cfg.hidden_channels) + nn.init.normal_(self.emb.weight, 0.0, self.cfg.hidden_channels**-0.5) + + if self.cfg.prenet: + self.pre = modules.ConvReluNorm( + self.cfg.hidden_channels, + self.cfg.hidden_channels, + self.cfg.hidden_channels, + kernel_size=5, + n_layers=3, + p_dropout=0.5, + ) + self.encoder = attentions.Encoder( + self.cfg.hidden_channels, + self.cfg.filter_channels, + self.cfg.n_heads, + self.cfg.n_layers, + self.cfg.kernel_size, + self.cfg.p_dropout, + window_size=self.cfg.window_size, + block_length=self.cfg.block_length, + ) + + self.proj_m = nn.Conv1d(self.cfg.hidden_channels, out_channels, 1) + if not self.cfg.mean_only: + self.proj_s = nn.Conv1d(self.cfg.hidden_channels, out_channels, 1) + self.proj_w = DurationPredictor( + self.cfg.hidden_channels + gin_channels, + self.cfg.filter_channels_dp, + self.cfg.kernel_size, + self.cfg.p_dropout, + ) + + def forward(self, x, x_lengths, g=None): + x = self.emb(x) * math.sqrt(self.cfg.hidden_channels) # [b, t, h] + x = torch.transpose(x, 1, -1) # [b, 
h, t] + x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype) + + if self.cfg.prenet: + x = self.pre(x, x_mask) + x = self.encoder(x, x_mask) + + if g is not None: + g_exp = g.unsqueeze(-1).expand(-1, -1, x.size(-1)) + # print(f"Dimension of input in Text Encoder: x.shape: {x.shape}; g: {g.shape}, g_exp: {g_exp.shape}") + x_dp = torch.cat([torch.detach(x), g_exp], 1) + else: + x_dp = torch.detach(x) + + x_m = self.proj_m(x) * x_mask + if not self.cfg.mean_only: + x_logs = self.proj_s(x) * x_mask + else: + x_logs = torch.zeros_like(x_m) + + # print(f"Dimension of input in Text Encoder before DP: {x_dp.shape}") + + logw = self.proj_w(x_dp, x_mask) + return x_m, x_logs, logw, x_mask + +class Model(nn.Module): + """ + Flow-based ASR model based on GlowTTS Structure using a pre-trained flow-based decoder + trained to generate spectrograms from given statistics coming from an encoder + + Model was pretrained using the architecture in + users/rilling/experiments/librispeech/librispeech_glowtts/pytorch_networks/glowTTS.py + """ + + def __init__( + self, + model_config: dict, + **kwargs, + ): + """_summary_ + + Args: + n_vocab (int): vocabulary size + hidden_channels (int): Number of hidden channels in encoder + out_channels (int): Number of channels in the output + n_blocks_dec (int, optional): Number of coupling blocks in the decoder. Defaults to 12. + kernel_size_dec (int, optional): Kernel size in the decoder. Defaults to 5. + dilation_rate (int, optional): Dilation rate for CNNs of coupling blocks in decoder. Defaults to 5. + n_block_layers (int, optional): Number of layers in the CNN of the coupling blocks in decoder. Defaults to 4. + p_dropout_dec (_type_, optional): Dropout probability in the decoder. Defaults to 0.. + n_speakers (int, optional): Number of speakers. Defaults to 0. + gin_channels (int, optional): Number of speaker embedding channels. Defaults to 0. 
+ n_split (int, optional): Number of splits for the 1x1 convolution for flows in the decoder. Defaults to 4. + n_sqz (int, optional): Squeeze. Defaults to 1. + sigmoid_scale (bool, optional): Boolean to define if log probs in coupling layers should be rescaled using sigmoid. Defaults to False. + window_size (int, optional): Window size in Multi-Head Self-Attention for encoder. Defaults to None. + block_length (_type_, optional): Block length for optional block masking in Multi-Head Attention for encoder. Defaults to None. + hidden_channels_dec (_type_, optional): Number of hidden channels in decodder. Defaults to hidden_channels. + final_hidden_channels: Number of hidden channels in the final network + final_n_layers: Number of layers in the final network + label_target_size: Target size of target vocabulary, target size for final network + """ + super().__init__() + + self.net_kwargs = { + "repeat_per_num_frames": 100, + "max_dim_feat": 8, + "num_repeat_feat": 5, + "max_dim_time": 20, + } + + fe_config = DbMelFeatureExtractionConfig.from_dict(kwargs["fe_config"]) + self.feature_extraction = DbMelFeatureExtraction(config=fe_config) + + # if label_target_size is None: + # if n_vocab is None: + # run_ctx = get_run_ctx() + # dataset = run_ctx.engine.train_dataset or run_ctx.engine.forward_dataset + # self.label_target_size = len(dataset.datasets["zip_dataset"].targets.labels) + # else: + # self.label_target_size = n_vocab + # else: + # self.label_target_size = label_target_size + + self.cfg = ModelConfigV2.from_dict(model_config) + text_encoder_config = self.cfg.text_encoder_config + decoder_config = self.cfg.decoder_config + + if self.cfg.n_speakers > 1: + self.x_vector = XVector(self.cfg.out_channels, self.cfg.n_speakers) + self.x_vector_bottleneck = nn.Sequential(nn.Linear(512, self.cfg.gin_channels), nn.ReLU()) + + self.encoder = TextEncoder( + text_encoder_config, out_channels=self.cfg.out_channels, gin_channels=self.cfg.gin_channels + ) + + self.decoder = 
FlowDecoder( + decoder_config, in_channels=self.cfg.out_channels, gin_channels=self.cfg.gin_channels + ) + + self.asr_output = nn.Sequential() + + for i in range(self.cfg.phoneme_prediction_config.n_layers): + if i == 0: + in_channels = self.cfg.out_channels + else: + in_channels = self.cfg.phoneme_prediction_config.n_channels + if i < self.cfg.phoneme_prediction_config.n_layers - 1: + out_channels = self.cfg.phoneme_prediction_config.n_channels + else: + out_channels = self.cfg.label_target_size + 1 + + self.asr_output.append(nn.Linear(in_channels, out_channels)) + if i < self.cfg.phoneme_prediction_config.n_layers - 1: + self.asr_output.append(nn.ReLU()) + self.asr_output.append(nn.Dropout(self.cfg.phoneme_prediction_config.p_dropout)) + + self.specaug_start_epoch = self.cfg.specauc_start_epoch + + def forward( + self, x=None, x_lengths=None, raw_audio=None, raw_audio_lengths=None, g=None, recognition_input=None, noise_scale=1.0, length_scale=1.0 + ): + with torch.no_grad(): + squeezed_audio = torch.squeeze(raw_audio) + y, y_lengths = self.feature_extraction(squeezed_audio, raw_audio_lengths) # [B, T, F] + y = y.transpose(1, 2) # [B, F, T] + self.encoder.eval() + self.decoder.eval() + self.x_vector_bottleneck.eval() + self.x_vector.eval() + _, _, g = self.x_vector(y, y_lengths) + g = self.x_vector_bottleneck(g) + + y_max_length = y.size(2) + + y, y_lengths, y_max_length = self.preprocess(y, y_lengths, y_max_length) + z_mask = torch.unsqueeze(commons.sequence_mask(y_lengths, y_max_length), 1).to(torch.int32) + + x_m, x_logs, logw, x_mask = self.encoder(x, x_lengths, g=g) # mean, std logs, duration logs, mask + attn_mask = torch.unsqueeze(x_mask, -1) * torch.unsqueeze(z_mask, 2) + z, logdet = self.decoder(y, z_mask, g=g, reverse=False) + + x_s_sq_r = torch.exp(-2 * x_logs) + logp1 = torch.sum(-0.5 * math.log(2 * math.pi) - x_logs, [1]).unsqueeze(-1) # [b, t, 1] + logp2 = torch.matmul(x_s_sq_r.transpose(1, 2), -0.5 * (z**2)) # [b, t, d] x [b, d, t'] = [b, t, t'] + 
logp3 = torch.matmul((x_m * x_s_sq_r).transpose(1, 2), z) # [b, t, d] x [b, d, t'] = [b, t, t'] + logp4 = torch.sum(-0.5 * (x_m**2) * x_s_sq_r, [1]).unsqueeze(-1) # [b, t, 1] + logp = logp1 + logp2 + logp3 + logp4 # [b, t, t'] + + attn = maximum_path(logp, attn_mask.squeeze(1)).unsqueeze(1).detach() + # embed() + + z_m = torch.matmul(attn.squeeze(1).transpose(1, 2), x_m.transpose(1, 2)).transpose( + 1, 2 + ) # [b, t', t], [b, t, d] -> [b, d, t'] + z_logs = torch.matmul(attn.squeeze(1).transpose(1, 2), x_logs.transpose(1, 2)).transpose( + 1, 2 + ) # [b, t', t], [b, t, d] -> [b, d, t'] + + if recognition_input == "encoder": # TODO: This is wrong! + asr_in = z.transpose(1, 2) + else: + asr_in = ((z_m + torch.exp(z_logs) * torch.randn_like(z_m) * noise_scale) * z_mask).transpose(1,2) + + logits = self.asr_output(asr_in) + + return logits, y_lengths, z_mask, attn + + def preprocess(self, y, y_lengths, y_max_length): + if y_max_length is not None: + y_max_length = (y_max_length // self.cfg.decoder_config.n_sqz) * self.cfg.decoder_config.n_sqz + y = y[:, :, :y_max_length] + y_lengths = (y_lengths // self.cfg.decoder_config.n_sqz) * self.cfg.decoder_config.n_sqz + return y, y_lengths, y_max_length + + def store_inverse(self): + self.decoder.store_inverse() + + +def train_step(*, model: Model, data, run_ctx, **kwargs): + tags = data["seq_tag"] + audio_features = data["audio_features"] # [B, T, F] + # audio_features = audio_features.transpose(1, 2) # [B, F, T] necessary because glowTTS expects the channels to be in the 2nd dimension + audio_features_len = data["audio_features:size1"] # [B] + + # perform local length sorting for more efficient packing + audio_features_len, indices = torch.sort(audio_features_len, descending=True) + + audio_features = audio_features[indices, :, :] + phonemes = data["phonemes"][indices, :] # [B, T] (sparse) + phonemes_len = data["phonemes:size1"][indices] # [B, T] + phonemes_eow = data["phonemes_eow"][indices, :] # [B, T] + phonemes_eow_len = 
data["phonemes_eow:size1"][indices] + durations = data["durations"][indices] + tags = list(np.array(tags)[indices.detach().cpu().numpy()]) + + recognition_input = kwargs["recognition_input"] + logits, y_lengths, z_mask, attn = model( + phonemes, + phonemes_len, + audio_features, + audio_features_len, + recognition_input=recognition_input, + ) + + upsampled_phonemes = torch.matmul(attn.squeeze(1).transpose(1, 2), phonemes.float().unsqueeze(-1)).squeeze(-1) + + mask = commons.sequence_mask(y_lengths) + ce_losses = nn.functional.cross_entropy(logits.transpose(1, 2), upsampled_phonemes.long(), reduction="none") + ce_loss = (ce_losses * mask.float()).sum() / mask.float().sum() + + run_ctx.mark_as_loss(name="ce", loss=ce_loss) + + +def phoneme_prediction_init_hook(run_ctx, **kwargs): + run_ctx.hdf_writer = SimpleHDFWriter("output.hdf", dim=1, ndim=1) + run_ctx.pool = multiprocessing.Pool(8) + run_ctx.recognition_input = kwargs["recognition_input"] + + +def phoneme_prediction_finish_hook(run_ctx, **kwargs): + run_ctx.hdf_writer.close() + + +def phoneme_prediction_step(*, model: Model, data, run_ctx, **kwargs): + """ + :param Model model: _description_ + :param _type_ data: _description_ + :param _type_ run_ctx: _description_ + """ + tags = data["seq_tag"] + audio_features = data["audio_features"] # [B, T, F] + audio_features_len = data["audio_features:size1"] # [B] + + # perform local length sorting for more efficient packing + audio_features_len, indices = torch.sort(audio_features_len, descending=True) + audio_features = audio_features[indices, :, :] + phonemes = data["phonemes"][indices, :] # [B, T] (sparse) + phonemes_len = data["phonemes:size1"][indices] # [B, T] + durations = data["durations"][indices] + + speaker_labels = None # it is x-vector and during prediction the audio is available therefore g will be predicted + + tags = list(np.array(tags)[indices.detach().cpu().numpy()]) + + logits, y_lengths, z_mask, attn = model( + x=phonemes, + x_lengths=phonemes_len, + 
raw_audio=audio_features, + raw_audio_lengths=audio_features_len, + g=speaker_labels, + recognition_input=run_ctx.recognition_input, + ) + x_mask = torch.unsqueeze(commons.sequence_mask(phonemes_len, phonemes.size(1)), 1).to(phonemes.dtype) + + attn_mask = torch.unsqueeze(x_mask, -1) * torch.unsqueeze(z_mask, 2) + given_attn = commons.generate_path(durations.squeeze(1), attn_mask.squeeze(1)).unsqueeze(1) + + upsampled_phonemes = torch.matmul(given_attn.squeeze(1).transpose(1, 2), phonemes.unsqueeze(-1)).squeeze(-1) + + mask = commons.sequence_mask(y_lengths) + pred = torch.softmax(logits, dim=2).argmax(dim=2) + + accuracies = ( + (((pred == upsampled_phonemes) * mask).sum(dim=1) / y_lengths).unsqueeze(-1).unsqueeze(-1).detach().cpu() + ) + + for tag, acc in zip(tags, accuracies): + run_ctx.hdf_writer.insert_batch(np.array(acc), [1], [tag]) + + +# def search_init_hook(run_ctx, **kwargs): +# # we are storing durations, but call it output.hdf to match +# # the default output of the ReturnnForwardJob +# from torchaudio.models.decoder import ctc_decoder +# run_ctx.recognition_file = open("search_out.py", "wt") +# run_ctx.recognition_file.write("{\n") +# import subprocess +# if kwargs["arpa_lm"] is not None: +# lm = subprocess.check_output(["cf", kwargs["arpa_lm"]]).decode().strip() +# else: +# lm = None +# from returnn.datasets.util.vocabulary import Vocabulary +# vocab = Vocabulary.create_vocab( +# vocab_file=kwargs["returnn_vocab"], unknown_label=None) +# labels = vocab.labels + +# run_ctx.ctc_decoder = ctc_decoder( +# lexicon=kwargs["lexicon"], +# lm=lm, +# lm_weight=kwargs["lm_weight"], +# tokens=labels + ["[blank]", "[SILENCE]", "[UNK]"], +# # "[SILENCE]" and "[UNK]" are not actually part of the vocab, +# # but the decoder is happy as long they are defined in the token list +# # even if they do not exist as label index in the softmax output, +# blank_token="[blank]", +# sil_token="[SILENCE]", +# unk_word="[unknown]", +# nbest=1, +# beam_size=kwargs["beam_size"], +# 
beam_size_token=kwargs.get("beam_size_token", None), +# beam_threshold=kwargs["beam_threshold"], +# sil_score=kwargs.get("sil_score", 0.0), +# word_score=kwargs.get("word_score", 0.0), +# ) +# run_ctx.labels = labels +# run_ctx.blank_log_penalty = kwargs.get("blank_log_penalty", None) + +# if kwargs.get("prior_file", None): +# run_ctx.prior = np.loadtxt(kwargs["prior_file"], dtype="float32") +# run_ctx.prior_scale = kwargs["prior_scale"] +# else: +# run_ctx.prior = None + +# def search_finish_hook(run_ctx, **kwargs): +# run_ctx.recognition_file.write("}\n") +# run_ctx.recognition_file.close() + +# def search_step(*, model, data, run_ctx, **kwargs): +# raw_audio = data["raw_audio"] # [B, T', F] +# raw_audio_len = data["raw_audio:size1"] # [B] + +# logprobs, audio_features_len = model( +# raw_audio=raw_audio, +# raw_audio_lengths=raw_audio_len, +# recognition=True +# ) + +# tags = data["seq_tag"] + +# logprobs_cpu = logprobs.cpu() +# if run_ctx.blank_log_penalty is not None: +# # assumes blank is last +# logprobs_cpu[:, :, -1] -= run_ctx.blank_log_penalty +# if run_ctx.prior is not None: +# logprobs_cpu -= run_ctx.prior_scale * run_ctx.prior +# hypothesis = run_ctx.ctc_decoder(logprobs_cpu, audio_features_len.cpu()) + +# for hyp, tag in zip(hypothesis, tags): +# words = hyp[0].words +# sequence = " ".join([word for word in words if not word.startswith("[")]) +# print(sequence) +# run_ctx.recognition_file.write("%s: %s,\n" % (repr(tag), repr(sequence))) diff --git a/users/rilling/experiments/librispeech/librispeech_joint_training_given_alignments/pytorch_networks/frozen_glowtts/glowASR_conformer_x_vector.py b/users/rilling/experiments/librispeech/librispeech_joint_training_given_alignments/pytorch_networks/frozen_glowtts/glowASR_conformer_x_vector.py new file mode 100644 index 000000000..0d34a2d53 --- /dev/null +++ b/users/rilling/experiments/librispeech/librispeech_joint_training_given_alignments/pytorch_networks/frozen_glowtts/glowASR_conformer_x_vector.py @@ -0,0 
+1,483 @@ +""" +Trying to make the aligner more AppTek-Like + +Extended weight init code +""" + +from dataclasses import dataclass +import torch +import numpy as np +from torch import nn +import multiprocessing +from librosa import filters +import sys +import time +from typing import Any, Dict, Optional, Tuple, Union +import math + +from torchaudio.functional import mask_along_axis + +from i6_models.parts.blstm import BlstmEncoderV1, BlstmEncoderV1Config + + +from i6_models.parts.conformer.norm import LayerNormNC +from i6_models.assemblies.conformer.conformer_v1 import ConformerEncoderV1Config +from i6_models.assemblies.conformer.conformer_v1 import ConformerBlockV1Config, ConformerEncoderV1 +from i6_models.config import ModuleFactoryV1 +from i6_models.parts.frontend.vgg_act import VGG4LayerActFrontendV1, VGG4LayerActFrontendV1Config + +from i6_models.parts.conformer.convolution import ConformerConvolutionV1Config +from i6_models.parts.conformer.feedforward import ConformerPositionwiseFeedForwardV1Config +from i6_models.parts.conformer.mhsa import ConformerMHSAV1Config +from i6_models.primitives.specaugment import specaugment_v1_by_length +from i6_models.primitives.feature_extraction import LogMelFeatureExtractionV1, LogMelFeatureExtractionV1Config + +from ..shared.configs import SpecaugConfig, ModelConfigV2, FlowDecoderConfig, ConformerASRConfig + + +from returnn.torch.context import get_run_ctx + +from ..shared.configs import DbMelFeatureExtractionConfig +from ..shared.feature_extraction import DbMelFeatureExtraction +from ..shared.spec_augment import apply_spec_aug +from ..shared.mask import mask_tensor + +from ..shared import modules +from ..shared import commons +from ..shared import attentions +from ..monotonic_align import maximum_path + +class XVector(nn.Module): + def __init__(self, input_dim=40, num_classes=8, **kwargs): + super(XVector, self).__init__() + self.tdnn1 = modules.TDNN( + input_dim=input_dim, + output_dim=512, + context_size=5, + dilation=1, 
+ dropout_p=0.5, + batch_norm=True + ) + self.tdnn2 = modules.TDNN( + input_dim=512, output_dim=512, context_size=3, dilation=2, dropout_p=0.5, batch_norm=True + ) + self.tdnn3 = modules.TDNN( + input_dim=512, output_dim=512, context_size=2, dilation=3, dropout_p=0.5, batch_norm=True + ) + self.tdnn4 = modules.TDNN( + input_dim=512, output_dim=512, context_size=1, dilation=1, dropout_p=0.5, batch_norm=True + ) + self.tdnn5 = modules.TDNN( + input_dim=512, output_dim=512, context_size=1, dilation=1, dropout_p=0.5, batch_norm=True + ) + #### Frame levelPooling + self.segment6 = nn.Linear(1024, 512) + self.segment7 = nn.Linear(512, 512) + self.output = nn.Linear(512, num_classes) + self.softmax = nn.Softmax(dim=1) + + # fe_config = DbMelFeatureExtractionConfig.from_dict(kwargs["fe_config"]) + # self.feature_extraction = DbMelFeatureExtraction(config=fe_config) + + def forward(self, x, x_lengths): + # with torch.no_grad(): + # squeezed_audio = torch.squeeze(raw_audio) + # x, x_lengths = self.feature_extraction(squeezed_audio, raw_audio_lengths) # [B, T, F] + + # x = x.transpose(1, 2) + tdnn1_out = self.tdnn1(x) + # return tdnn1_out + tdnn2_out = self.tdnn2(tdnn1_out) + tdnn3_out = self.tdnn3(tdnn2_out) + tdnn4_out = self.tdnn4(tdnn3_out) + tdnn5_out = self.tdnn5(tdnn4_out) + ### Stat Pool + mean = torch.mean(tdnn5_out, 2) + std = torch.std(tdnn5_out, 2) + stat_pooling = torch.cat((mean, std), 1) + segment6_out = self.segment6(stat_pooling) + x_vec = self.segment7(segment6_out) + output = self.output(x_vec) + predictions = self.softmax(output) + return output, predictions, x_vec + +class FlowDecoder(nn.Module): + def __init__(self, cfg: FlowDecoderConfig, in_channels, gin_channels): + """Flow-based decoder model + + Args: + in_channels (int): Number of incoming channels + hidden_channels (int): Number of hidden channels + kernel_size (int): Kernel Size for convolutions in coupling blocks + dilation_rate (float): Dilation Rate to define dilation in convolutions of 
coupling block + n_blocks (int): Number of coupling blocks + n_layers (int): Number of layers in CNN of the coupling blocks + p_dropout (float, optional): Dropout probability for CNN in coupling blocks. Defaults to 0.. + n_split (int, optional): Number of splits for the 1x1 convolution for flows in the decoder. Defaults to 4. + n_sqz (int, optional): Squeeze. Defaults to 1. + sigmoid_scale (bool, optional): Boolean to define if log probs in coupling layers should be rescaled using sigmoid. Defaults to False. + gin_channels (int, optional): Number of speaker embedding channels. Defaults to 0. + """ + super().__init__() + self.cfg = cfg + + self.flows = nn.ModuleList() + + for _ in range(self.cfg.n_blocks): + self.flows.append(modules.ActNorm(channels=in_channels * self.cfg.n_sqz)) + self.flows.append(modules.InvConvNear(channels=in_channels * self.cfg.n_sqz, n_split=self.cfg.n_split)) + self.flows.append( + attentions.CouplingBlock( + in_channels * self.cfg.n_sqz, + self.cfg.hidden_channels, + kernel_size=self.cfg.kernel_size, + dilation_rate=self.cfg.dilation_rate, + n_layers=self.cfg.n_layers, + gin_channels=gin_channels, + p_dropout=self.cfg.p_dropout, + sigmoid_scale=self.cfg.sigmoid_scale, + ) + ) + + def forward(self, x, x_mask, g=None, reverse=False): + if not reverse: + flows = self.flows + logdet_tot = 0 + else: + flows = reversed(self.flows) + logdet_tot = None + + if g is not None: + g = g.unsqueeze(-1) + + if self.cfg.n_sqz > 1: + x, x_mask = commons.channel_squeeze(x, x_mask, self.cfg.n_sqz) + for f in flows: + if not reverse: + x, logdet = f(x, x_mask, g=g, reverse=reverse) + logdet_tot += logdet + else: + x, logdet = f(x, x_mask, g=g, reverse=reverse) + if self.cfg.n_sqz > 1: + x, x_mask = commons.channel_unsqueeze(x, x_mask, self.cfg.n_sqz) + return x, logdet_tot + + def store_inverse(self): + for f in self.flows: + f.store_inverse() + +class Model(nn.Module): + """ + Flow-based ASR model based on GlowTTS Structure using a pre-trained flow-based 
decoder + trained to generate spectrograms from given statistics coming from an encoder + + Model was pretrained using the architecture in + users/rilling/experiments/librispeech/librispeech_glowtts/pytorch_networks/glowTTS.py + """ + + def __init__( + self, + model_config: ModelConfigV2, + # n_vocab: int, + # hidden_channels: int = 192, + # out_channels: int = 80, + # n_blocks_dec: int = 12, + # kernel_size_dec: int = 5, + # dilation_rate: int = 1, + # n_block_layers: int = 4, + # p_dropout: float = 0.1, + # p_dropout_flow: float = 0.05, + # gin_channels: int = 0, + # n_split: int = 4, + # n_sqz: int = 2, + # sigmoid_scale: bool = False, + # window_size: int = 4, + # block_length: int = None, + # hidden_channels_dec: int = None, + # label_target_size=None, + # spec_augment = False, + # layer_norm = False, + # batch_norm = False, + # n_speakers = 1, + **kwargs, + ): + """_summary_ + + Args: + n_vocab (int): vocabulary size + hidden_channels (int): Number of hidden channels in encoder + out_channels (int): Number of channels in the output + n_blocks_dec (int, optional): Number of coupling blocks in the decoder. Defaults to 12. + kernel_size_dec (int, optional): Kernel size in the decoder. Defaults to 5. + dilation_rate (int, optional): Dilation rate for CNNs of coupling blocks in decoder. Defaults to 5. + n_block_layers (int, optional): Number of layers in the CNN of the coupling blocks in decoder. Defaults to 4. + p_dropout_dec (_type_, optional): Dropout probability in the decoder. Defaults to 0.. + n_speakers (int, optional): Number of speakers. Defaults to 0. + gin_channels (int, optional): Number of speaker embedding channels. Defaults to 0. + n_split (int, optional): Number of splits for the 1x1 convolution for flows in the decoder. Defaults to 4. + n_sqz (int, optional): Squeeze. Defaults to 1. + sigmoid_scale (bool, optional): Boolean to define if log probs in coupling layers should be rescaled using sigmoid. Defaults to False. 
+ window_size (int, optional): Window size in Multi-Head Self-Attention for encoder. Defaults to None. + block_length (_type_, optional): Block length for optional block masking in Multi-Head Attention for encoder. Defaults to None. + hidden_channels_dec (_type_, optional): Number of hidden channels in decodder. Defaults to hidden_channels. + final_hidden_channels: Number of hidden channels in the final network + final_n_layers: Number of layers in the final network + label_target_size: Target size of target vocabulary, target size for final network + """ + super().__init__() + # self.n_vocab = n_vocab + # self.hidden_channels = hidden_channels + # self.out_channels = out_channels + # self.n_blocks_dec = n_blocks_dec + # self.kernel_size_dec = kernel_size_dec + # self.dilation_rate = dilation_rate + # self.n_block_layers = n_block_layers + # self.p_dropout = p_dropout + # self.p_dropout_flow = p_dropout_flow + # self.n_split = n_split + # self.n_sqz = n_sqz + # self.sigmoid_scale = sigmoid_scale + # self.window_size = window_size + # self.block_length = block_length + # self.hidden_channels_dec = hidden_channels_dec + # self.spec_augment = spec_augment + # self.layer_norm = layer_norm + # self.batch_norm = batch_norm + + fe_config = DbMelFeatureExtractionConfig.from_dict(kwargs["fe_config"]) + self.feature_extraction = DbMelFeatureExtraction(config=fe_config) + + self.cfg = ModelConfigV2.from_dict(model_config) + text_encoder_config = self.cfg.text_encoder_config + decoder_config = self.cfg.decoder_config + + if self.cfg.n_speakers > 1: + self.x_vector = XVector(self.cfg.out_channels, self.cfg.n_speakers) + self.x_vector_bottleneck = nn.Sequential(nn.Linear(512, self.cfg.gin_channels), nn.ReLU()) + + # self.encoder = TextEncoder( + # text_encoder_config, out_channels=self.cfg.out_channels, gin_channels=self.cfg.gin_channels + # ) + + self.decoder = FlowDecoder( + decoder_config, in_channels=self.cfg.out_channels, gin_channels=self.cfg.gin_channels + ) + + if 
self.cfg.n_speakers > 1: + self.x_vector = XVector(self.cfg.out_channels, self.cfg.n_speakers) + self.x_vector_bottleneck = nn.Sequential(nn.Linear(512, self.cfg.gin_channels), nn.ReLU()) + + + # specaug_config = SpecaugConfig( + # repeat_per_n_frames=25, + # max_dim_time=20, + # max_dim_feat=16, + # num_repeat_feat=5, + # ) + + self.conf_cfg = self.cfg.phoneme_prediction_config + frontend_config = self.conf_cfg.frontend_config + conformer_size = self.conf_cfg.conformer_size + conformer_config = ConformerEncoderV1Config( + num_layers=self.conf_cfg.num_layers, + frontend=ModuleFactoryV1(module_class=VGG4LayerActFrontendV1, cfg=frontend_config), + block_cfg=ConformerBlockV1Config( + ff_cfg=ConformerPositionwiseFeedForwardV1Config( + input_dim=conformer_size, + hidden_dim=self.conf_cfg.ff_dim, + dropout=self.conf_cfg.ff_dropout, + activation=nn.functional.silu, + ), + mhsa_cfg=ConformerMHSAV1Config( + input_dim=conformer_size, + num_att_heads=self.conf_cfg.num_heads, + att_weights_dropout=self.conf_cfg.att_weights_dropout, + dropout=self.conf_cfg.mhsa_dropout, + ), + conv_cfg=ConformerConvolutionV1Config( + channels=conformer_size, kernel_size=self.conf_cfg.conv_kernel_size, dropout=self.conf_cfg.conv_dropout, activation=nn.functional.silu, + norm=LayerNormNC(conformer_size) + ), + ), + ) + + self.conformer = ConformerEncoderV1(cfg=conformer_config) + self.final_linear = nn.Linear(conformer_size, self.conf_cfg.label_target_size + 1) # + CTC blank + self.final_dropout = nn.Dropout(p=self.conf_cfg.final_dropout) + self.specaug_start_epoch = self.cfg.specauc_start_epoch + self.specaug_cfg = self.cfg.specaug_config + + def forward(self, raw_audio, raw_audio_len): + with torch.no_grad(): + self.x_vector.eval() + self.x_vector_bottleneck.eval() + self.decoder.eval() + squeezed_audio = torch.squeeze(raw_audio) + log_mel_features, log_mel_features_len = self.feature_extraction(squeezed_audio, raw_audio_len) # [B, T, F] + + audio_max_length = log_mel_features.size(1) + + 
flow_in = log_mel_features.transpose(1,2) # [B, F, T] + flow_in, flow_in_length, flow_in_max_length = self.preprocess(flow_in, log_mel_features_len, audio_max_length) + mask = torch.unsqueeze(commons.sequence_mask(log_mel_features_len, flow_in.size(2)), 1).to(flow_in.dtype) + + _, _, g = self.x_vector(log_mel_features.transpose(1,2), log_mel_features_len) + g = self.x_vector_bottleneck(g) + + flow_out, _ = self.decoder(flow_in, mask, g=g, reverse=False) # [B, F, T] + + spec_augment_in = flow_out.transpose(1,2) # [B, T, F] + mask = mask_tensor(spec_augment_in, flow_in_length) + + run_ctx = get_run_ctx() + if self.training and self.specaug_start_epoch is not None and run_ctx.epoch > self.specaug_start_epoch: + audio_features_masked_2 = apply_spec_aug( + spec_augment_in, + num_repeat_time=torch.max(log_mel_features_len).detach().cpu().numpy() + // self.cfg.specaug_config.repeat_per_n_frames, + max_dim_time=self.cfg.specaug_config.max_dim_time, + num_repeat_feat=self.cfg.specaug_config.num_repeat_feat, + max_dim_feat=self.cfg.specaug_config.max_dim_feat, + ) + else: + audio_features_masked_2 = spec_augment_in + + conformer_in = audio_features_masked_2 + + conformer_out, out_mask = self.conformer(conformer_in, mask) + conformer_out = self.final_dropout(conformer_out) + logits = self.final_linear(conformer_out) + + log_probs = torch.log_softmax(logits, dim=2) + + return log_probs, torch.sum(out_mask, dim=1) + + def preprocess(self, y, y_lengths, y_max_length): + if y_max_length is not None: + y_max_length = (y_max_length // self.cfg.decoder_config.n_sqz) * self.cfg.decoder_config.n_sqz + y = y[:, :, :y_max_length] + y_lengths = (y_lengths // self.cfg.decoder_config.n_sqz) * self.cfg.decoder_config.n_sqz + return y, y_lengths, y_max_length + + def store_inverse(self): + self.decoder.store_inverse() + + +def train_step(*, model: nn.Module, data, run_ctx, **kwargs): + raw_audio = data["audio_features"] # [B, T', F] + raw_audio_len = data["audio_features:size1"] # [B] + + 
phon_labels = data["phonemes_eow"] # [B, N] (sparse) + phon_labels_len = data["phonemes_eow:size1"] # [B, N] + + logprobs, audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + transposed_logprobs = torch.permute(logprobs, (1, 0, 2)) + ctc_loss = nn.functional.ctc_loss( + transposed_logprobs, + phon_labels, + input_lengths=audio_features_len, + target_lengths=phon_labels_len, + blank=model.cfg.label_target_size, + reduction="sum", + zero_infinity=True + ) + num_phonemes = torch.sum(phon_labels_len) + run_ctx.mark_as_loss(name="ctc", loss=ctc_loss, inv_norm_factor=num_phonemes) + + +def forward_init_hook(run_ctx, **kwargs): + # we are storing durations, but call it output.hdf to match + # the default output of the ReturnnForwardJob + from torchaudio.models.decoder import ctc_decoder + run_ctx.recognition_file = open("search_out.py", "wt") + run_ctx.recognition_file.write("{\n") + import subprocess + if kwargs["arpa_lm"] is not None: + lm = subprocess.check_output(["cf", kwargs["arpa_lm"]]).decode().strip() + else: + lm = None + from returnn.datasets.util.vocabulary import Vocabulary + vocab = Vocabulary.create_vocab( + vocab_file=kwargs["returnn_vocab"], unknown_label=None) + labels = vocab.labels + print(f"labels from vocab:{labels}") + if "asr_data" in kwargs.keys() and kwargs["asr_data"]: + print(f"Using ctc_decoder for ASR data...") + run_ctx.ctc_decoder = ctc_decoder( + lexicon=kwargs["lexicon"], + lm=lm, + lm_weight=kwargs["lm_weight"], + tokens=labels + ["[blank]", "[SILENCE]", "[UNK]"], + # "[SILENCE]" and "[UNK]" are not actually part of the vocab, + # but the decoder is happy as long they are defined in the token list + # even if they do not exist as label index in the softmax output, + blank_token="[blank]", + sil_token="[SILENCE]", + unk_word="[unknown]", + nbest=1, + beam_size=kwargs["beam_size"], + beam_size_token=kwargs.get("beam_size_token", None), + beam_threshold=kwargs["beam_threshold"], + 
sil_score=kwargs.get("sil_score", 0.0), + word_score=kwargs.get("word_score", 0.0), + ) + else: + print(f"Using ctc_decoder for TTS data...") + + run_ctx.ctc_decoder = ctc_decoder( + lexicon=kwargs["lexicon"], + lm=lm, + lm_weight=kwargs["lm_weight"], + tokens=labels, + blank_token="[blank]", + sil_token="[space]", # [space] is our actual silence + unk_word="[UNKNOWN]", + nbest=1, + beam_size=kwargs["beam_size"], + beam_threshold=kwargs["beam_threshold"], + sil_score=kwargs.get("sil_score", 0.0), + word_score=kwargs.get("word_score", 0.0), + ) + run_ctx.labels = labels + run_ctx.blank_log_penalty = kwargs.get("blank_log_penalty", None) + + if kwargs.get("prior_file", None): + run_ctx.prior = np.loadtxt(kwargs["prior_file"], dtype="float32") + run_ctx.prior_scale = kwargs["prior_scale"] + else: + run_ctx.prior = None + + +def forward_finish_hook(run_ctx, **kwargs): + run_ctx.recognition_file.write("}\n") + run_ctx.recognition_file.close() + + +def forward_step(*, model, data, run_ctx, **kwargs): + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"] # [B] + logprobs, audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + + tags = data["seq_tag"] + + logprobs_cpu = logprobs.cpu() + if run_ctx.blank_log_penalty is not None: + # assumes blank is last + logprobs_cpu[:, :, -1] -= run_ctx.blank_log_penalty + if run_ctx.prior is not None: + logprobs_cpu -= run_ctx.prior_scale * run_ctx.prior + hypothesis = run_ctx.ctc_decoder(logprobs_cpu, audio_features_len.cpu()) + + for hyp, tag in zip(hypothesis, tags): + words = hyp[0].words + sequence = " ".join([word for word in words if not word.startswith("[")]) + print(sequence) + run_ctx.recognition_file.write("%s: %s,\n" % (repr(tag), repr(sequence))) + + diff --git a/users/rilling/experiments/librispeech/librispeech_joint_training_given_alignments/pytorch_networks/glowTTS.py 
b/users/rilling/experiments/librispeech/librispeech_joint_training_given_alignments/pytorch_networks/glowTTS.py index 0d095d95e..4ebb7b991 100644 --- a/users/rilling/experiments/librispeech/librispeech_joint_training_given_alignments/pytorch_networks/glowTTS.py +++ b/users/rilling/experiments/librispeech/librispeech_joint_training_given_alignments/pytorch_networks/glowTTS.py @@ -18,6 +18,7 @@ from .shared.eval_forward import * +from .shared.eval_invertibility import * class DurationPredictor(nn.Module): """ diff --git a/users/rilling/experiments/librispeech/librispeech_joint_training_given_alignments/pytorch_networks/glowTTS_x_vector_v2.py b/users/rilling/experiments/librispeech/librispeech_joint_training_given_alignments/pytorch_networks/glowTTS_x_vector_v2.py index 1b88aede8..19577801f 100644 --- a/users/rilling/experiments/librispeech/librispeech_joint_training_given_alignments/pytorch_networks/glowTTS_x_vector_v2.py +++ b/users/rilling/experiments/librispeech/librispeech_joint_training_given_alignments/pytorch_networks/glowTTS_x_vector_v2.py @@ -19,6 +19,7 @@ from .shared.configs import DbMelFeatureExtractionConfig, ModelConfigV1 from .shared.eval_forward import * +from .shared.eval_invertibility import * class XVector(nn.Module): diff --git a/users/rilling/experiments/librispeech/librispeech_joint_training_given_alignments/pytorch_networks/shared/configs.py b/users/rilling/experiments/librispeech/librispeech_joint_training_given_alignments/pytorch_networks/shared/configs.py index 9d4a920c6..1a6a1b009 100644 --- a/users/rilling/experiments/librispeech/librispeech_joint_training_given_alignments/pytorch_networks/shared/configs.py +++ b/users/rilling/experiments/librispeech/librispeech_joint_training_given_alignments/pytorch_networks/shared/configs.py @@ -58,21 +58,6 @@ def from_dict(cls, d): return VGG4LayerActFrontendV1Config(**d) -@dataclass -class ConformerEncoderV1Config(ModelConfiguration): - """ - Attributes: - num_layers: Number of conformer layers in the 
conformer encoder - frontend: A pair of ConformerFrontend and corresponding config - block_cfg: Configuration for ConformerBlockV1 - """ - - num_layers: int - - # nested configurations - frontend: ModuleFactoryV1 - block_cfg: ConformerBlockV1Config - @dataclass class SpecaugConfig(ModelConfiguration): @@ -215,7 +200,27 @@ class PhonemePredictionConfigBLSTM(ModelConfiguration): def from_dict(cls, d): d = d.copy() return PhonemePredictionConfigBLSTM(**d) - + +@dataclass +class ConformerASRConfig(): + frontend_config: VGG4LayerActFrontendV1Config + label_target_size: int + conformer_size: int + num_layers: int + num_heads: int + ff_dim: int + att_weights_dropout: float + conv_dropout: float + ff_dropout: float + mhsa_dropout: float + conv_kernel_size: int + final_dropout: float + + @classmethod + def from_dict(cls, d): + d = d.copy() + d["frontend_config"] = VGG4LayerActFrontendV1Config_mod.from_dict(d["frontend_config"]) + return ConformerASRConfig(**d) @dataclass class ModelConfigV1: @@ -250,7 +255,7 @@ class ModelConfigV2: gin_channels: int n_speakers: Union[tk.Variable, int] specauc_start_epoch: Optional[int] = None - phoneme_prediction_config: Optional[Union[PhonemePredictionConfig, PhonemePredictionConfigCNN, PhonemePredictionConfigBLSTM]] = None + phoneme_prediction_config: Optional[Union[PhonemePredictionConfig, PhonemePredictionConfigCNN, PhonemePredictionConfigBLSTM, ConformerASRConfig]] = None specaug_config: Optional[SpecaugConfig] = None @classmethod @@ -272,6 +277,8 @@ def from_dict(cls, d): d["phoneme_prediction_config"] = PhonemePredictionConfigCNN.from_dict(d["phoneme_prediction_config"]) elif "subsampling_factor" in d["phoneme_prediction_config"].keys(): d["phoneme_prediction_config"] = PhonemePredictionConfigBLSTM.from_dict(d["phoneme_prediction_config"]) + elif "conformer_size" in d["phoneme_prediction_config"].keys(): + d["phoneme_prediction_config"] = ConformerASRConfig.from_dict(d["phoneme_prediction_config"]) else: 
d["phoneme_prediction_config"] = PhonemePredictionConfig.from_dict(d["phoneme_prediction_config"]) return ModelConfigV2(**d) diff --git a/users/rilling/experiments/librispeech/librispeech_joint_training_given_alignments/pytorch_networks/shared/eval_invertibility.py b/users/rilling/experiments/librispeech/librispeech_joint_training_given_alignments/pytorch_networks/shared/eval_invertibility.py new file mode 100644 index 000000000..3cdc198d0 --- /dev/null +++ b/users/rilling/experiments/librispeech/librispeech_joint_training_given_alignments/pytorch_networks/shared/eval_invertibility.py @@ -0,0 +1,90 @@ +import multiprocessing +import torch +import numpy as np +from returnn.datasets.hdf import SimpleHDFWriter +from . import commons + + +def forward_init_hook_invertibility(run_ctx, **kwargs): + run_ctx.total_mae = 0 + run_ctx.total_ae_var = 0 + run_ctx.total_ae_max = torch.tensor(-np.inf) + run_ctx.total_ae_min = torch.tensor(np.inf) + run_ctx.num_of_obs = 0 + + +def forward_finish_hook_invertibility(run_ctx, **kwargs): + with open("output.hdf", "w+") as f: + f.write("total, mean, var, max, min \n") + f.write( + f"{run_ctx.num_of_obs}, {str(float(run_ctx.total_mae))}, {str(float(run_ctx.total_ae_var))}, {str(float(run_ctx.total_ae_max))}, {str(float(run_ctx.total_ae_min))}" + ) + + +def forward_step_invertibility(*, model, data, run_ctx, **kwargs): + raw_audio = data["audio_features"] # [B, N] (sparse) + raw_audio_len = data["audio_features:size1"] # [B] + phonemes = data["phonemes"] + phonemes_len = data["phonemes:size1"] + + if "xvectors" in data: + g = data["xvectors"] + elif "speaker_labels" in data: + g = data["speaker_labels"] + else: + raise Exception("Missing speaker embedding!") + + squeezed_audio = torch.squeeze(raw_audio) + y, y_lengths = model.feature_extraction(squeezed_audio, raw_audio_len) # [B, T, F] + y = y.transpose(1, 2) # [B, F, T] + + if hasattr(model, "x_vector"): + _, _, g = model.x_vector(y, y_lengths) + + if hasattr(model, 
"x_vector_bottleneck"): + g = model.x_vector_bottleneck(g) + elif hasattr(model, "emb_g"): + g = torch.nn.functional.normalize(model.emb_g(g.squeeze(-1))).unsqueeze(-1) + else: + g = None + + y_max_length = y.size(2) + + y, y_lengths, y_max_length = model.preprocess(y, y_lengths, y_max_length) + z_mask = torch.unsqueeze(commons.sequence_mask(y_lengths, y_max_length), 1).to(torch.int32) + + z, _ = model.decoder(y, z_mask, g=g, reverse=False) + y_hat, _ = model.decoder(z, z_mask, g=g, reverse=True) + + mae = torch.nn.functional.l1_loss(y_hat * z_mask, y * z_mask, reduction="none") # [B, F, T] + + current_num_of_obs = y_hat.shape[1] * y_lengths.sum() # F * total_number_of_frames_in_batch + + old_mae = run_ctx.total_mae + + current_mae = ( + mae.sum() / current_num_of_obs + ) # This considers the masking by only using the mean over all unmasked elements + + current_var = (mae - current_mae).sum() / ( + current_num_of_obs - 1 + ) # Variance over unmasked elements with bias correction 1 + + run_ctx.total_mae = ((run_ctx.num_of_obs / (run_ctx.num_of_obs + current_num_of_obs)) * old_mae) + ( + (current_num_of_obs / (run_ctx.num_of_obs + current_num_of_obs)) * current_mae + ) + + run_ctx.total_ae_var = ( + (run_ctx.num_of_obs / (run_ctx.num_of_obs + current_num_of_obs)) * run_ctx.total_ae_var + + ((current_num_of_obs / (run_ctx.num_of_obs + current_num_of_obs)) * current_var) + + ((run_ctx.num_of_obs * current_num_of_obs) / (run_ctx.num_of_obs + current_num_of_obs) ** 2) + * (old_mae - current_mae) ** 2 + ) + + run_ctx.total_ae_max = torch.max(run_ctx.total_ae_max, mae.max()) + + run_ctx.total_ae_min = torch.min( + run_ctx.total_ae_min, (mae + (-1 * z_mask + 1) * torch.tensor(float("inf")).nan_to_num(0.0)).min() + ) # Masked Min operation + + run_ctx.num_of_obs += current_num_of_obs diff --git a/users/rilling/experiments/librispeech/librispeech_joint_training_given_alignments/storage.py 
b/users/rilling/experiments/librispeech/librispeech_joint_training_given_alignments/storage.py index 98f41b9b9..4d1e613e2 100644 --- a/users/rilling/experiments/librispeech/librispeech_joint_training_given_alignments/storage.py +++ b/users/rilling/experiments/librispeech/librispeech_joint_training_given_alignments/storage.py @@ -1,7 +1,8 @@ from typing import Dict from dataclasses import dataclass from sisyphus import tk -from .pytorch_networks.shared.configs import ModelConfigV1 +from typing import Union +from .pytorch_networks.shared.configs import ModelConfigV1, ModelConfigV2 synthetic_ogg_zip_data = {} @@ -17,7 +18,7 @@ def add_duration(name: str, duration_hdf: tk.Path): @dataclass class TTSModel: - config: ModelConfigV1 + config: Union[ModelConfigV1, ModelConfigV2] checkpoint: tk.Path tts_models: Dict[str, TTSModel] = {} From 0818c70e69db643d1dbcb0f2cb44f6996e2e50da Mon Sep 17 00:00:00 2001 From: Lukas Rilling Date: Thu, 16 May 2024 12:26:04 +0200 Subject: [PATCH 026/227] Glow-TTS-ASR update --- .../evaluation/invertibility_eval.ipynb | 865 +++++ .../evaluation/phoneme_prediction_eval.ipynb | 456 +-- users/rilling/evaluation/swer_eval.ipynb | 353 +- users/rilling/evaluation/wer_eval.ipynb | 3275 ++++++++++++----- .../experiments.py | 66 +- .../glowTTS_ASR_conformer_two_forward_pass.py | 25 +- .../glowTTS_ASR_conformer_x_vector_v2.py | 26 +- .../training_comparison.ipynb | 72 +- .../librispeech_glow_asr/config.py | 44 + .../librispeech_glow_asr/experiments.py | 210 +- .../librispeech_glow_asr/pipeline.py | 30 + .../glowASR_blstm_frame_stack.py | 2 +- .../glowASR_blstm_frame_stack_v2.py | 2 +- .../glowASR_blstm_frame_stack_x_vector.py | 2 +- .../glowASR_blstm_frame_stack_x_vector_v2.py | 2 +- .../pytorch_networks/glowASR_conformer.py | 8 +- .../glowASR_conformer_no_freeze.py | 1 + ...lowASR_conformer_no_freeze_spec_augment.py | 1 + ...no_freeze_spec_augment_before_weak_conf.py | 298 ++ .../glowASR_conformer_no_pretrained.py | 2 +- 
.../pytorch_networks/glowASR_conformer_v2.py | 298 ++ .../only_blstm_frame_stack.py | 2 +- .../shared/eval_invertibility.py | 83 + .../librispeech_glow_asr/serializer.py | 22 + .../training_comparison.ipynb | 30 +- .../librispeech_joint_training/config.py | 47 + .../librispeech_joint_training/data.py | 13 +- .../librispeech_joint_training/experiments.py | 617 +++- .../librispeech_joint_training/pipeline.py | 34 + .../glowTTS_ASR_blstm_two_forward_pass.py | 632 ++++ .../glowTTS_ASR_blstm_x_vector.py | 679 ++++ .../glowTTS_ASR_conformer_two_forward_pass.py | 3 +- ...ASR_conformer_x_vector_two_forward_pass.py | 2 +- ..._conformer_x_vector_two_forward_pass_v2.py | 2 +- .../glowTTS_ASR_conformer_x_vector_v2.py | 3 +- .../pytorch_networks/glow_ASR_conformer.py | 2 +- .../glow_ASR_conformer_specaugment_before.py | 2 +- ...SR_conformer_specaugment_before_xvector.py | 2 +- ...conformer_specaugment_before_xvector_v2.py | 2 +- .../glow_ASR_conformer_xvector.py | 2 +- .../glow_ASR_conformer_xvector_eval.py | 2 +- .../pytorch_networks/only_conformer.py | 38 +- .../shared/eval_invertibility.py | 86 +- .../pytorch_networks/shared/forward.py | 7 +- ...6modelsV1_VGG4LayerActFrontendV1_v4_cfg.py | 23 + .../librispeech_joint_training/serializer.py | 19 + .../training_comparison.ipynb | 81 +- .../config.py | 49 + .../data.py | 13 +- .../exp_joint_2step/experiments.py | 38 +- .../exp_joint_2step/training_comparison.ipynb | 48 +- .../exp_tts/experiments.py | 157 +- .../exp_tts/training_comparison.ipynb | 996 ++--- .../pipeline.py | 39 +- .../glowASR_conformer_x_vector.py | 100 +- .../pytorch_networks/glowTTS.py | 2 + .../shared/eval_invertibility.py | 2 +- .../shared/feature_statistics.py | 65 + .../text_hdf/text_hdf_from_bliss.py | 4 +- users/rilling/speakers/pooling.py | 26 +- 60 files changed, 7642 insertions(+), 2370 deletions(-) create mode 100644 users/rilling/evaluation/invertibility_eval.ipynb create mode 100644 
users/rilling/experiments/librispeech/librispeech_glow_asr/pytorch_networks/glowASR_conformer_no_freeze_spec_augment_before_weak_conf.py create mode 100644 users/rilling/experiments/librispeech/librispeech_glow_asr/pytorch_networks/glowASR_conformer_v2.py create mode 100644 users/rilling/experiments/librispeech/librispeech_glow_asr/pytorch_networks/shared/eval_invertibility.py create mode 100644 users/rilling/experiments/librispeech/librispeech_joint_training/pytorch_networks/glowTTS_ASR_blstm_two_forward_pass.py create mode 100644 users/rilling/experiments/librispeech/librispeech_joint_training/pytorch_networks/glowTTS_ASR_blstm_x_vector.py create mode 100644 users/rilling/experiments/librispeech/librispeech_joint_training_given_alignments/pytorch_networks/shared/feature_statistics.py diff --git a/users/rilling/evaluation/invertibility_eval.ipynb b/users/rilling/evaluation/invertibility_eval.ipynb new file mode 100644 index 000000000..95ea667d4 --- /dev/null +++ b/users/rilling/evaluation/invertibility_eval.ipynb @@ -0,0 +1,865 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import glob\n", + "import sys\n", + "import subprocess\n", + "import numpy as np\n", + "import pandas as pd\n", + "import os\n", + "\n", + "sys.path.append(\"/u/lukas.rilling/dev/\")\n", + "\n", + "from returnn_training_progress import get_epoch_data\n", + "from returnn_training_plot_nb import plot_df\n", + "%load_ext autoreload\n", + "%autoreload 2" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "base_dir = \"/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/\"\n", + "accuracy_files = str(\n", + " subprocess.check_output(\n", + " f\"find {base_dir} -type l -name 'forward_invertibility' -not -path '*/lm*'\",\n", + " shell=True,\n", + " ),\n", + " \"utf-8\",\n", + ").split(\"\\n\")[:-1]" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + 
"metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
totalmeanvarmaxminGroupPath
02443905600.6181952.617023e+052.852016e+060.0librispeech_glow_asr/u/lukas.rilling/experiments/glow_tts_asr_v2/a...
02443905600.0000012.096195e-122.357960e-040.0librispeech_glow_asr/u/lukas.rilling/experiments/glow_tts_asr_v2/a...
02443905600.0399575.573043e-021.344167e+030.0librispeech_glow_asr/u/lukas.rilling/experiments/glow_tts_asr_v2/a...
02443905600.0000232.390446e-093.259611e-020.0librispeech_glow_asr/u/lukas.rilling/experiments/glow_tts_asr_v2/a...
02443905600.0420551.417796e-025.968618e+020.0joint_training/default/u/lukas.rilling/experiments/glow_tts_asr_v2/a...
\n", + "
" + ], + "text/plain": [ + " total mean var max min \\\n", + "0 244390560 0.618195 2.617023e+05 2.852016e+06 0.0 \n", + "0 244390560 0.000001 2.096195e-12 2.357960e-04 0.0 \n", + "0 244390560 0.039957 5.573043e-02 1.344167e+03 0.0 \n", + "0 244390560 0.000023 2.390446e-09 3.259611e-02 0.0 \n", + "0 244390560 0.042055 1.417796e-02 5.968618e+02 0.0 \n", + "\n", + " Group Path \n", + "0 librispeech_glow_asr /u/lukas.rilling/experiments/glow_tts_asr_v2/a... \n", + "0 librispeech_glow_asr /u/lukas.rilling/experiments/glow_tts_asr_v2/a... \n", + "0 librispeech_glow_asr /u/lukas.rilling/experiments/glow_tts_asr_v2/a... \n", + "0 librispeech_glow_asr /u/lukas.rilling/experiments/glow_tts_asr_v2/a... \n", + "0 joint_training/default /u/lukas.rilling/experiments/glow_tts_asr_v2/a... " + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "experiment_group = []\n", + "paths = []\n", + "df = None\n", + "for i,f in enumerate(accuracy_files):\n", + " filename = f + \"/output/output.hdf\"\n", + " if os.path.exists(filename):\n", + " if df is None:\n", + " df = pd.read_csv(filename)\n", + " else:\n", + " df2 = pd.read_csv(filename)\n", + " df = pd.concat([df, df2])\n", + " \n", + " folders = [\"librispeech_glow_asr\", \"joint_training/default\", \"joint_training/conformer_coupling\", \"joint_training/given_alignments\"]\n", + " found = False\n", + " for folder in folders:\n", + " if folder in f:\n", + " experiment_group.append(folder)\n", + " found = True\n", + " break\n", + " paths.append(f.replace(\"forward_invertibility/invertibility\", \"\").replace(\"/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech\", \"\"))\n", + " assert len(paths) == len(experiment_group), f\"Can't find this experiment group: {f}\"\n", + "\n", + "assert len(paths) == len(experiment_group), f\"Well wer has length: {len(accuracies)} and experiment_group has length {len(experiment_group)}\"\n", + "df[\"Group\"] = 
experiment_group\n", + "df[\"Path\"] = paths\n", + "df.head(5)\n", + "\n", + "# accuracy_files" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "df[\"Name\"] = df[\"Path\"].str.removeprefix(\"/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech\")\n", + "df.rename(columns=lambda x: x.strip(), inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "| | Group | Name | mean | var | min | max |\n", + "|---:|:--------------------------------|:---------------------------------------------------------------------------------------------------------------------------|------------:|-----------------:|------:|---------------:|\n", + "| 0 | librispeech_glow_asr | /librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment_before/glow_not_pretrained/forward_invertibility | 0.618195 | 261702 | 0 | 2.85202e+06 |\n", + "| 0 | librispeech_glow_asr | /librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment_before/glow_enc192_not_pretrained/forward_invertibility | 1.04137e-06 | 2.09619e-12 | 0 | 0.000235796 |\n", + "| 0 | librispeech_glow_asr | /librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/forward_invertibility | 0.0399574 | 0.0557304 | 0 | 1344.17 |\n", + "| 0 | librispeech_glow_asr | /librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_not_pretrained/forward_invertibility | 2.25829e-05 | 2.39045e-09 | 0 | 0.0325961 |\n", + "| 0 | joint_training/default | /joint_training/default/raw_audio/glow_ASR_conformer/forward_invertibility | 0.0420548 | 0.014178 | 0 | 596.862 |\n", + "| 0 | joint_training/default | /joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_ctc_scale_0.1/forward_invertibility | 1.92626e-07 | 3.00763e-14 | 0 | 9.05991e-06 |\n", + "| 0 | joint_training/default 
| /joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before/forward_invertibility | 0.278556 | 5.73934 | 0 | 7504.17 |\n", + "| 0 | joint_training/default | /joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2/forward_invertibility | 2.07554e-07 | 3.50488e-14 | 0 | 5.96046e-06 |\n", + "| 0 | joint_training/default | /joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_ctc_scale_0.1/forward_invertibility | 2.37935e-07 | 4.92569e-14 | 0 | 5.48363e-06 |\n", + "| 0 | joint_training/default | /joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass/forward_invertibility | 2.78031e-07 | 5.60585e-14 | 0 | 1.40667e-05 |\n", + "| 0 | joint_training/default | /joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_ctc_scale_0.1/forward_invertibility | 1.8688e-07 | 2.94753e-14 | 0 | 6.91414e-06 |\n", + "| 0 | joint_training/given_alignments | /joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc768/200ep/dec_drop_0.05/forward_invertibility | 2.50928e-07 | 5.20072e-14 | 0 | 8.10623e-06 |\n", + "| 0 | joint_training/given_alignments | /joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2/enc768/200ep/dec_drop_0.05/forward_invertibility | 2.27961e-07 | 4.0735e-14 | 0 | 8.34465e-06 |\n" + ] + } + ], + "source": [ + "print(df[[\"Group\", \"Name\", \"mean\", \"var\", \"min\", \"max\"]].to_markdown())" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "ename": "NameError", + "evalue": "name 'accuracies' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[6], line 3\u001b[0m\n\u001b[1;32m 1\u001b[0m index \u001b[38;5;241m=\u001b[39m pd\u001b[38;5;241m.\u001b[39mMultiIndex\u001b[38;5;241m.\u001b[39mfrom_arrays([experiment_group, 
accuracy_files], names\u001b[38;5;241m=\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mGroup\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mExperiment\u001b[39m\u001b[38;5;124m\"\u001b[39m))\n\u001b[0;32m----> 3\u001b[0m df \u001b[38;5;241m=\u001b[39m pd\u001b[38;5;241m.\u001b[39mDataFrame({\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mAccuracy\u001b[39m\u001b[38;5;124m\"\u001b[39m: \u001b[43maccuracies\u001b[49m}, index\u001b[38;5;241m=\u001b[39mindex)\n\u001b[1;32m 4\u001b[0m df[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mAccuracy [\u001b[39m\u001b[38;5;124m%\u001b[39m\u001b[38;5;124m]\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m df[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mAccuracy\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m*\u001b[39m \u001b[38;5;241m100\u001b[39m\n\u001b[1;32m 5\u001b[0m df[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mAccuracy [\u001b[39m\u001b[38;5;124m%\u001b[39m\u001b[38;5;124m]\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m df[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mAccuracy [\u001b[39m\u001b[38;5;124m%\u001b[39m\u001b[38;5;124m]\u001b[39m\u001b[38;5;124m\"\u001b[39m]\u001b[38;5;241m.\u001b[39mround(decimals\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m2\u001b[39m)\n", + "\u001b[0;31mNameError\u001b[0m: name 'accuracies' is not defined" + ] + } + ], + "source": [ + "index = pd.MultiIndex.from_arrays([experiment_group, accuracy_files], names=(\"Group\", \"Experiment\"))\n", + "\n", + "df = pd.DataFrame({\"Accuracy\": accuracies}, index=index)\n", + "df[\"Accuracy [%]\"] = df[\"Accuracy\"] * 100\n", + "df[\"Accuracy [%]\"] = df[\"Accuracy [%]\"].round(decimals=2)\n", + "df = df.drop(\"Accuracy\", axis=1)\n", + "df.head(5)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Accuracy [%]CEdev CEMLEDPJointStill runningoverfittingTraining data available
GroupExperiment
joint_training/given_alignments/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_simple_encoder_x_vector/200ep/basic_init/no_specaug/tts_target_size/ce_ls_0.1/21.302.9205852.941674-0.7972840.957548TrueFalse1.007221True
/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector/specaug/ce_ls_0.1/26.492.8684202.858513-0.8028360.434087TrueFalse0.996546True
/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector/specaug/ce_ls_1/27.302.6643962.957262-0.721080.747725TrueFalse1.109919True
/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector/200ep/basic_init/no_specaug/tts_target_size/ce_ls_0.1/30.092.7822692.795637-0.7854920.890597TrueFalse1.004805True
/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector/200ep/tts_pretrained/no_specaug/tts_target_size/ce_ls_0.1/28.562.8135272.825326-0.8037160.439343TrueFalse1.004194True
\n", + "
" + ], + "text/plain": [ + " Accuracy [%] \\\n", + "Group Experiment \n", + "joint_training/given_alignments /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 21.30 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 26.49 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 27.30 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 30.09 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 28.56 \n", + "\n", + " CE \\\n", + "Group Experiment \n", + "joint_training/given_alignments /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 2.920585 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 2.868420 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 2.664396 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 2.782269 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 2.813527 \n", + "\n", + " dev CE \\\n", + "Group Experiment \n", + "joint_training/given_alignments /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 2.941674 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 2.858513 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 2.957262 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 2.795637 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 2.825326 \n", + "\n", + " MLE \\\n", + "Group Experiment \n", + "joint_training/given_alignments /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... -0.797284 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... -0.802836 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... -0.72108 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... -0.785492 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... -0.803716 \n", + "\n", + " DP \\\n", + "Group Experiment \n", + "joint_training/given_alignments /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 0.957548 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 0.434087 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 
0.747725 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 0.890597 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 0.439343 \n", + "\n", + " Joint \\\n", + "Group Experiment \n", + "joint_training/given_alignments /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... True \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... True \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... True \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... True \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... True \n", + "\n", + " Still running \\\n", + "Group Experiment \n", + "joint_training/given_alignments /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... False \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... False \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... False \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... False \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... False \n", + "\n", + " overfitting \\\n", + "Group Experiment \n", + "joint_training/given_alignments /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 1.007221 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 0.996546 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 1.109919 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 1.004805 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 1.004194 \n", + "\n", + " Training data available \n", + "Group Experiment \n", + "joint_training/given_alignments /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... True \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... True \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... True \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... True \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 
True " + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ce_scores = []\n", + "ce_dev_scores = []\n", + "overfitting = []\n", + "mle_scores = []\n", + "dp_scores = []\n", + "lr = []\n", + "finished = []\n", + "joint = []\n", + "for index, series in df.iterrows():\n", + " data = get_epoch_data((f\"{index[1]}training/work/learning_rates\").replace(\"output\", \"alias\"), None)\n", + " breakpoint()\n", + " if data is None or \"ce\" not in data[list(data.keys())[-1]][\"error\"]:\n", + " if data is None:\n", + " finished.append(True)\n", + " else:\n", + " finished.append(False)\n", + " ce_scores.append(np.nan)\n", + " ce_dev_scores.append(np.nan)\n", + " mle_scores.append(np.nan)\n", + " dp_scores.append(np.nan)\n", + " overfitting.append(np.nan)\n", + " else:\n", + " last_epoch_data = data[list(data.keys())[-1]]\n", + " finished.append(True)\n", + " ce_scores.append(last_epoch_data[\"error\"][\"ce\"])\n", + " ce_dev_scores.append(last_epoch_data[\"error\"][\"dev_loss_ce\"])\n", + " overfitting.append(ce_dev_scores[-1] / ce_scores[-1])\n", + "\n", + " if \"mle\" in last_epoch_data[\"error\"]:\n", + " mle_scores.append(last_epoch_data[\"error\"][\"mle\"])\n", + " dp_scores.append(last_epoch_data[\"error\"][\"dp\"])\n", + " else:\n", + " mle_scores.append(\"-\")\n", + " dp_scores.append(\"-\")\n", + "\n", + "df[\"CE\"] = ce_scores\n", + "df[\"dev CE\"] = ce_dev_scores\n", + "df[\"MLE\"] = mle_scores\n", + "df[\"DP\"] = dp_scores\n", + "df[\"Joint\"] = [True if x != \"-\" and x != False else False for x in mle_scores]\n", + "df[\"Still running\"] = [not x for x in finished]\n", + "df[\"overfitting\"] = overfitting\n", + "df[\"Training data available\"] = df[\"CE\"] != np.nan\n", + "\n", + "df.head(5)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "WARNING:root:Settings file 'settings.py' does not 
exist, ignoring it ([Errno 2] No such file or directory: 'settings.py').\n" + ] + } + ], + "source": [ + "from returnn.config import Config as ReturnnConfig\n", + "\n", + "df_indexed = df.reset_index()\n", + "\n", + "returnn_configs = []\n", + "for i in df_indexed.itertuples():\n", + " experiment_path = i[2]\n", + " returnn_config_path = experiment_path.replace(\"output\", \"alias\") + \"training/output/returnn.config\"\n", + " returnn_configs.append(ReturnnConfig())\n", + " try:\n", + " returnn_configs[-1].load_file(returnn_config_path)\n", + " except AssertionError:\n", + " returnn_configs[-1] = False" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "lr = []\n", + "num_epochs = []\n", + "asr_model_type = []\n", + "\n", + "for i in df_indexed.itertuples():\n", + " if returnn_configs[i[0]]:\n", + " if returnn_configs[i[0]].has(\"learning_rates\"):\n", + " learning_rates = np.array(eval(returnn_configs[i[0]].value(\"learning_rates\", \"default\")))\n", + " lr_argmax = learning_rates.argmax()\n", + " lr.append(\n", + " f\"[0: {learning_rates[0]}, {lr_argmax}: {learning_rates[lr_argmax]}, {lr_argmax + 1}: {learning_rates[lr_argmax + 1]}, {returnn_configs[i[0]].value('num_epochs', '-')}: {learning_rates[-1]}]\"\n", + " )\n", + " elif returnn_configs[i[0]].has(\"learning_rate\"):\n", + " lr.append(returnn_configs[i[0]].value(\"learning_rate\", \"-\"))\n", + " else:\n", + " lr.append(\"-\")\n", + "\n", + " num_epochs.append(int(returnn_configs[i[0]].value(\"num_epochs\", \"-\")))\n", + " else:\n", + " lr.append(\"-\")\n", + " num_epochs.append(\"-\")\n", + "\n", + " if \"conformer\" in i[2]:\n", + " asr_model_type.append(\"conformer\")\n", + " elif \"blstm\" in i[2]:\n", + " asr_model_type.append(\"blstm\")\n", + " else:\n", + " asr_model_type.append(\"unknown\")\n", + "\n", + "df_indexed[\"Num Epochs\"] = num_epochs\n", + "df_indexed[\"LR\"] = lr\n", + "df_indexed[\"ASR Model Type\"] = asr_model_type" + ] 
+ }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df_indexed[\"CE loss scale\"] = df_indexed.apply(lambda x: float(x[\"Experiment\"].split(\"/ce_ls_\")[1].split(\"/\")[0] if \"/ce_ls_\" in x[\"Experiment\"] else np.nan), axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
GroupExperimentAccuracy [%]CEdev CEMLEDPJointStill runningoverfittingTraining data availableNum EpochsLRASR Model TypeCE loss scale
0joint_training/given_alignments/u/lukas.rilling/experiments/glow_tts_asr_v2/o...21.302.9212.942-0.7970.958TrueFalse1.007True200.0[0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05]unknown0.1
1joint_training/given_alignments/u/lukas.rilling/experiments/glow_tts_asr_v2/o...26.492.8682.859-0.8030.434TrueFalse0.997True250.0[0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08]unknown0.1
2joint_training/given_alignments/u/lukas.rilling/experiments/glow_tts_asr_v2/o...27.302.6642.957-0.7210.748TrueFalse1.11True250.0[0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08]unknown1.0
3joint_training/given_alignments/u/lukas.rilling/experiments/glow_tts_asr_v2/o...30.092.7822.796-0.7850.891TrueFalse1.005True200.0[0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05]unknown0.1
4joint_training/given_alignments/u/lukas.rilling/experiments/glow_tts_asr_v2/o...28.562.8142.825-0.8040.439TrueFalse1.004True200.0[0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05]unknown0.1
\n", + "
" + ], + "text/plain": [ + " Group \\\n", + "0 joint_training/given_alignments \n", + "1 joint_training/given_alignments \n", + "2 joint_training/given_alignments \n", + "3 joint_training/given_alignments \n", + "4 joint_training/given_alignments \n", + "\n", + " Experiment Accuracy [%] CE \\\n", + "0 /u/lukas.rilling/experiments/glow_tts_asr_v2/o... 21.30 2.921 \n", + "1 /u/lukas.rilling/experiments/glow_tts_asr_v2/o... 26.49 2.868 \n", + "2 /u/lukas.rilling/experiments/glow_tts_asr_v2/o... 27.30 2.664 \n", + "3 /u/lukas.rilling/experiments/glow_tts_asr_v2/o... 30.09 2.782 \n", + "4 /u/lukas.rilling/experiments/glow_tts_asr_v2/o... 28.56 2.814 \n", + "\n", + " dev CE MLE DP Joint Still running overfitting \\\n", + "0 2.942 -0.797 0.958 True False 1.007 \n", + "1 2.859 -0.803 0.434 True False 0.997 \n", + "2 2.957 -0.721 0.748 True False 1.11 \n", + "3 2.796 -0.785 0.891 True False 1.005 \n", + "4 2.825 -0.804 0.439 True False 1.004 \n", + "\n", + " Training data available Num Epochs \\\n", + "0 True 200.0 \n", + "1 True 250.0 \n", + "2 True 250.0 \n", + "3 True 200.0 \n", + "4 True 200.0 \n", + "\n", + " LR ASR Model Type \\\n", + "0 [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] unknown \n", + "1 [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] unknown \n", + "2 [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] unknown \n", + "3 [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] unknown \n", + "4 [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] unknown \n", + "\n", + " CE loss scale \n", + "0 0.1 \n", + "1 0.1 \n", + "2 1.0 \n", + "3 0.1 \n", + "4 0.1 " + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_rounded = df_indexed.replace(\"-\", np.nan).round(decimals=3).fillna(\"-\")\n", + "df_rounded.head(5)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "| | Group | Experiment | Accuracy [%] | 
CE | dev CE | MLE | DP | Joint | Still running | overfitting | Training data available | Num Epochs | LR | ASR Model Type | CE loss scale |\n", + "|---:|:--------------------------------|:-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------------:|:------|:---------|:-------|:------|:--------|:----------------|:--------------|:--------------------------|:-------------|:-------------------------------------------------|:-----------------|:----------------|\n", + "| 0 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_simple_encoder_x_vector/200ep/basic_init/no_specaug/tts_target_size/ce_ls_0.1/ | 21.3 | 2.921 | 2.942 | -0.797 | 0.958 | True | False | 1.007 | True | 200.0 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | unknown | 0.1 |\n", + "| 1 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector/specaug/ce_ls_0.1/ | 26.49 | 2.868 | 2.859 | -0.803 | 0.434 | True | False | 0.997 | True | 250.0 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | unknown | 0.1 |\n", + "| 2 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector/specaug/ce_ls_1/ | 27.3 | 2.664 | 2.957 | -0.721 | 0.748 | True | False | 1.11 | True | 250.0 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | unknown | 1.0 |\n", + "| 3 | joint_training/given_alignments | 
/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector/200ep/basic_init/no_specaug/tts_target_size/ce_ls_0.1/ | 30.09 | 2.782 | 2.796 | -0.785 | 0.891 | True | False | 1.005 | True | 200.0 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | unknown | 0.1 |\n", + "| 4 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector/200ep/tts_pretrained/no_specaug/tts_target_size/ce_ls_0.1/ | 28.56 | 2.814 | 2.825 | -0.804 | 0.439 | True | False | 1.004 | True | 200.0 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | unknown | 0.1 |\n", + "| 5 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector/no_specaug/ce_ls_1/ | 27.72 | 2.532 | 2.992 | -0.691 | 0.76 | True | False | 1.182 | True | 250.0 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | unknown | 1.0 |\n", + "| 6 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector/no_specaug/ce_ls_0.1/ | 30.44 | 2.793 | 2.791 | -0.795 | 0.506 | True | False | 1.0 | True | 250.0 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | unknown | 0.1 |\n", + "| 7 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_cnn_x_vector/tts_pretrained/no_specaug/tts_target_size/ce_ls_0.1/ | 28.92 | 2.564 | 2.893 | -0.793 | 0.396 | True | False | 1.128 | True | 200.0 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | unknown | 0.1 |\n", + "| 8 | joint_training/given_alignments | 
/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_cnn_x_vector/basic_init/no_specaug/tts_target_size/ce_ls_0.1/ | 27.3 | 2.573 | 2.942 | -0.776 | 0.432 | True | False | 1.143 | True | 200.0 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | unknown | 0.1 |\n", + "| 9 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector_v2/mean_only/200ep/basic_init/no_specaug/tts_target_size/ce_ls_0.1/ | 22.6 | 2.922 | 2.923 | -0.804 | 0.932 | True | False | 1.0 | True | 200.0 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | unknown | 0.1 |\n", + "| 10 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector_v2/200ep/tts_pretrained/no_specaug/tts_target_size/ce_ls_0.1/ | 23.4 | 2.879 | 2.887 | -0.806 | 0.626 | True | False | 1.003 | True | 200.0 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | unknown | 0.1 |\n", + "| 11 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector_v2/200ep/basic_init/no_specaug/tts_target_size/ce_ls_1.0/ | 22.24 | 2.923 | 2.924 | -0.801 | 0.835 | True | False | 1.0 | True | 200.0 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | unknown | 1.0 |\n", + "| 12 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector_v2/200ep/basic_init/no_specaug/tts_target_size/ce_ls_0.1/ | 22.76 | 2.924 | 2.934 | -0.8 | 0.818 | True | False | 1.004 | True | 200.0 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | unknown | 0.1 |\n", + 
"| 13 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_blstm_x_vector/specaug/ce_ls_1/ | 35.97 | 1.395 | 2.544 | -0.66 | 0.303 | True | False | 1.824 | True | 250.0 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | blstm | 1.0 |\n", + "| 14 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_blstm_x_vector/specaug/ce_ls_0.1/ | 36.7 | 1.672 | 2.312 | -0.774 | 1.02 | True | False | 1.383 | True | 250.0 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | blstm | 0.1 |\n", + "| 15 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_blstm_x_vector/no_specaug/ce_ls_1/ | 34.25 | 1.042 | 3.15 | -0.69 | 0.352 | True | False | 3.023 | True | 250.0 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | blstm | 1.0 |\n", + "| 16 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_blstm_x_vector/no_specaug/ce_ls_0.1/ | 33.05 | 1.031 | 3.339 | -0.786 | 0.368 | True | False | 3.239 | True | 250.0 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | blstm | 0.1 |\n", + "| 17 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/first_step/ga_glowTTS_ASR_ffn_x_vector_v2/ce_ls_4.0/ | 28.56 | 0.003 | 0.003 | -0.702 | 0.075 | True | False | 0.948 | True | 100.0 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | unknown | 4.0 |\n", + "| 18 | joint_training/given_alignments | 
/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/first_step/ga_glowTTS_ASR_ffn_x_vector_v2/ce_ls_3.0/ | 20.47 | 0.004 | 0.004 | -0.703 | 0.075 | True | False | 0.954 | True | 100.0 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | unknown | 3.0 |\n", + "| 19 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/first_step/ga_glowTTS_ASR_ffn_x_vector_v2/ce_ls_2.0/ | 20.38 | 0.006 | 0.006 | -0.708 | 0.075 | True | False | 1.018 | True | 100.0 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | unknown | 2.0 |\n", + "| 20 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/first_step/ga_glowTTS_ASR_ffn_x_vector_v2/ce_ls_1.0/ | 20.45 | 0.012 | 0.012 | -0.716 | 0.075 | True | False | 0.98 | True | 100.0 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | unknown | 1.0 |\n", + "| 21 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_ffn_mas/100ep/encoder/decoder_eval/ | 37.48 | - | - | - | - | True | False | - | True | - | - | unknown | - |\n", + "| 22 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_ffn_mas/100ep/encoder/encoder_eval/ | 37.57 | - | - | - | - | True | False | - | True | - | - | unknown | - |\n", + "| 23 | joint_training/given_alignments | 
/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_ffn_x_vector/100ep/encoder/decoder_eval/ | 12.68 | - | - | - | - | True | False | - | True | - | - | unknown | - |\n", + "| 24 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_ffn_x_vector/100ep/encoder/encoder_eval/ | 42.05 | - | - | - | - | True | False | - | True | - | - | unknown | - |\n", + "| 25 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_ffn_x_vector/100ep/decoder/encoder_eval/ | 19.16 | - | - | - | - | True | False | - | True | - | - | unknown | - |\n", + "| 26 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_ffn_x_vector/100ep/decoder/decoder_eval/ | 19.96 | - | - | - | - | True | False | - | True | - | - | unknown | - |\n", + "| 27 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_ffn/100ep/encoder/encoder_eval/ | 44.63 | - | - | - | - | True | False | - | True | - | - | unknown | - |\n", + "| 28 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_ffn/100ep/encoder/decoder_eval/ | 12.24 | - | - | - | - | True | False | - | True | - | - | unknown | 
- |\n", + "| 29 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_ffn/100ep/decoder/decoder_eval/ | 19.94 | - | - | - | - | True | False | - | True | - | - | unknown | - |\n", + "| 30 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_ffn/100ep/decoder/encoder_eval/ | 18.49 | - | - | - | - | True | False | - | True | - | - | unknown | - |\n", + "| 31 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_cnn_x_vector/100ep/encoder/encoder_eval/ | 70.57 | - | - | - | - | True | False | - | True | - | - | unknown | - |\n", + "| 32 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_cnn_x_vector/100ep/encoder/decoder_eval/ | 11.4 | - | - | - | - | True | False | - | True | - | - | unknown | - |\n", + "| 33 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_cnn_x_vector/100ep/decoder/encoder_eval/ | 22.76 | - | - | - | - | True | False | - | True | - | - | unknown | - |\n", + "| 34 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_cnn_x_vector/100ep/decoder/decoder_eval/ | 21.33 | - | - | 
- | - | True | False | - | True | - | - | unknown | - |\n", + "| 35 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_ffn_x_vector_mas/100ep/encoder/decoder_eval/ | 12.72 | - | - | - | - | True | False | - | True | - | - | unknown | - |\n", + "| 36 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_ffn_x_vector_mas/100ep/encoder/encoder_eval/ | 12.68 | - | - | - | - | True | False | - | True | - | - | unknown | - |\n", + "| 37 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_ffn_x_vector_mas/100ep/decoder/decoder_eval/ | 12.73 | - | - | - | - | True | False | - | True | - | - | unknown | - |\n", + "| 38 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_ffn_x_vector_mas/100ep/decoder/encoder_eval/ | 12.71 | - | - | - | - | True | False | - | True | - | - | unknown | - |\n", + "| 39 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_cnn/100ep/decoder/decoder_eval/ | 21.3 | - | - | - | - | True | False | - | True | - | - | unknown | - |\n", + "| 40 | joint_training/given_alignments | 
/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_cnn/100ep/decoder/encoder_eval/ | 22.09 | - | - | - | - | True | False | - | True | - | - | unknown | - |\n", + "| 41 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_cnn/100ep/encoder/decoder_eval/ | 11.33 | - | - | - | - | True | False | - | True | - | - | unknown | - |\n", + "| 42 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_cnn/100ep/encoder/encoder_eval/ | 72.13 | - | - | - | - | True | False | - | True | - | - | unknown | - |\n", + "| 43 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_ffn_mas_no_eval/100ep/decoder/decoder_eval/ | 37.64 | - | - | - | - | True | False | - | True | - | - | unknown | - |\n", + "| 44 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_ffn_mas_no_eval/100ep/decoder/encoder_eval/ | 37.46 | - | - | - | - | True | False | - | True | - | - | unknown | - |\n", + "| 45 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_ffn_mas_no_eval/100ep/encoder/decoder_eval/ | 37.51 | - | - | - | - | True | False | - | True | - | - | unknown | 
- |\n", + "| 46 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_ffn_mas_no_eval/100ep/encoder/encoder_eval/ | 37.56 | - | - | - | - | True | False | - | True | - | - | unknown | - |\n", + "| 47 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_cnn_x_vector_v2/tts_pretrained/ce_ls_0.1/ | 20.29 | 0.052 | 0.047 | -0.791 | 0.052 | True | False | 0.9 | True | 200.0 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | unknown | 0.1 |\n", + "| 48 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_cnn_x_vector_v2/tts_pretrained/ce_ls_0.01/ | 19.98 | 0.372 | 0.338 | -0.797 | 0.063 | True | False | 0.91 | True | 200.0 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | unknown | 0.01 |\n", + "| 49 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_cnn_x_vector_v2/basic_init/ce_ls_0.1/ | 20.32 | 0.052 | 0.048 | -0.783 | 0.052 | True | False | 0.916 | True | 200.0 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | unknown | 0.1 |\n", + "| 50 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_cnn_x_vector_v2/basic_init/ce_ls_0.01/ | 19.97 | 0.341 | 0.308 | -0.787 | 0.064 | True | False | 0.903 | True | 200.0 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | unknown | 0.01 |\n", + "| 51 | 
joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/second_step_tts/ga_glowTTS_ASR_ffn_x_vector_v2_2ndstep_tts/ce_ls_1.0/ | 20.18 | - | - | - | - | True | True | - | True | 100.0 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | unknown | 1.0 |\n", + "| 52 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_ffn_x_vector_v2/200ep/tts_pretrained/ce_ls_1.0/ | 20.42 | 0.014 | 0.014 | -0.729 | 0.076 | True | False | 1.049 | True | 200.0 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | unknown | 1.0 |\n", + "| 53 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_ffn_x_vector_v2/200ep/tts_pretrained/ce_ls_0.1/ | 20.22 | 0.14 | 0.14 | -0.76 | 0.075 | True | False | 0.995 | True | 200.0 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | unknown | 0.1 |\n", + "| 54 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_ffn_x_vector_v2/200ep/basic_init/ce_ls_1.0/ | 20.48 | 0.012 | 0.011 | -0.73 | 0.07 | True | False | 0.965 | True | 200.0 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | unknown | 1.0 |\n", + "| 55 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_ffn_x_vector_v2/200ep/basic_init/ce_ls_0.1/ | 20.71 | 0.124 | 0.122 | -0.757 | 0.071 | True | False | 0.983 | True | 200.0 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | unknown 
| 0.1 |\n", + "| 56 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_ffn_x_vector/200ep/basic_init/ce_ls_0.1/ | 21.39 | 2.921 | 2.947 | -0.779 | 0.256 | True | False | 1.009 | True | 200.0 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | unknown | 0.1 |\n", + "| 57 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_ffn_x_vector/200ep/tts_pretrained/ce_ls_0.1/ | 21.33 | 2.925 | 2.946 | -0.792 | 0.247 | True | False | 1.007 | True | 200.0 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | unknown | 0.1 |\n", + "| 58 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_cnn_x_vector/tts_pretrained/ce_ls_0.1/ | 27.63 | 2.617 | 2.915 | -0.775 | 0.263 | True | False | 1.114 | True | 200.0 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | unknown | 0.1 |\n", + "| 59 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_cnn_x_vector/basic_init/ce_ls_0.1/ | 25.67 | 2.583 | 3.019 | -0.767 | 0.265 | True | False | 1.169 | True | 200.0 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | unknown | 0.1 |\n", + "| 60 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/second_step/ga_glowTTS_ASR_ffn_x_vector_v2_2ndstep/ce_ls_1.0/ | 21.61 | 2.939 | 2.947 | - | - | False | False | 1.003 | True | 100.0 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | 
unknown | 1.0 |\n" + ] + } + ], + "source": [ + "print(df_rounded.to_markdown())" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "sis_env", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/users/rilling/evaluation/phoneme_prediction_eval.ipynb b/users/rilling/evaluation/phoneme_prediction_eval.ipynb index 31e863a2d..f6539769a 100644 --- a/users/rilling/evaluation/phoneme_prediction_eval.ipynb +++ b/users/rilling/evaluation/phoneme_prediction_eval.ipynb @@ -45,66 +45,68 @@ "data": { "text/plain": [ "['/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_simple_encoder_x_vector/200ep/basic_init/no_specaug/tts_target_size/ce_ls_0.1/',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector/specaug/ce_ls_0.1/',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector/specaug/ce_ls_1/',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector/200ep/basic_init/no_specaug/tts_target_size/ce_ls_0.1/',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector/200ep/tts_pretrained/no_specaug/tts_target_size/ce_ls_0.1/',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector/no_specaug/ce_ls_1/',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector/no_specaug/ce_ls_0.1/',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_ffn_x_vector_v2/200ep/tts_pretrained/ce_ls_0.1/',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_ffn_x_vector_v2/200ep/tts_pretrained/ce_ls_1.0/',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_ffn_x_vector_v2/200ep/basic_init/ce_ls_0.1/',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_ffn_x_vector_v2/200ep/basic_init/ce_ls_1.0/',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_cnn_x_vector_v2/tts_pretrained/ce_ls_0.01/',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_cnn_x_vector_v2/tts_pretrained/ce_ls_0.1/',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_cnn_x_vector_v2/basic_init/ce_ls_0.01/',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_cnn_x_vector_v2/basic_init/ce_ls_0.1/',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_ffn_x_vector/200ep/tts_pretrained/ce_ls_0.1/',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_ffn_x_vector/200ep/basic_init/ce_ls_0.1/',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/second_step_tts/ga_glowTTS_ASR_ffn_x_vector_v2_2ndstep_tts/ce_ls_1.0/',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_cnn_x_vector/tts_pretrained/ce_ls_0.1/',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_cnn_x_vector/basic_init/ce_ls_0.1/',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/second_step/ga_glowTTS_ASR_ffn_x_vector_v2_2ndstep/ce_ls_1.0/',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/first_step/ga_glowTTS_ASR_ffn_x_vector_v2/ce_ls_1.0/',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/first_step/ga_glowTTS_ASR_ffn_x_vector_v2/ce_ls_3.0/',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/first_step/ga_glowTTS_ASR_ffn_x_vector_v2/ce_ls_2.0/',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/first_step/ga_glowTTS_ASR_ffn_x_vector_v2/ce_ls_4.0/',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_cnn/100ep/encoder/decoder_eval/',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_cnn/100ep/encoder/encoder_eval/',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_cnn/100ep/decoder/decoder_eval/',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_cnn/100ep/decoder/encoder_eval/',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_ffn_x_vector_mas/100ep/encoder/decoder_eval/',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_ffn_x_vector_mas/100ep/encoder/encoder_eval/',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_ffn_x_vector_mas/100ep/decoder/decoder_eval/',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_ffn_x_vector_mas/100ep/decoder/encoder_eval/',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_ffn_mas_no_eval/100ep/encoder/encoder_eval/',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_ffn_mas_no_eval/100ep/encoder/decoder_eval/',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_ffn_mas_no_eval/100ep/decoder/encoder_eval/',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_ffn_mas_no_eval/100ep/decoder/decoder_eval/',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_ffn/100ep/decoder/decoder_eval/',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_ffn/100ep/decoder/encoder_eval/',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_ffn/100ep/encoder/decoder_eval/',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_ffn/100ep/encoder/encoder_eval/',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_ffn_mas/100ep/encoder/decoder_eval/',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_ffn_mas/100ep/encoder/encoder_eval/',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_ffn_mas/100ep/decoder/decoder_eval/',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_ffn_mas/100ep/decoder/encoder_eval/',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_cnn_x_vector/100ep/encoder/decoder_eval/',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_cnn_x_vector/100ep/encoder/encoder_eval/',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_cnn_x_vector/100ep/decoder/decoder_eval/',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_cnn_x_vector/100ep/decoder/encoder_eval/',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_ffn_x_vector/100ep/decoder/decoder_eval/',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_ffn_x_vector/100ep/decoder/encoder_eval/',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_ffn_x_vector/100ep/encoder/decoder_eval/',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_ffn_x_vector/100ep/encoder/encoder_eval/',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_cnn_x_vector/tts_pretrained/no_specaug/tts_target_size/ce_ls_0.1/',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_cnn_x_vector/basic_init/no_specaug/tts_target_size/ce_ls_0.1/',\n", " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector_v2/mean_only/200ep/basic_init/no_specaug/tts_target_size/ce_ls_0.1/',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector_v2/200ep/tts_pretrained/no_specaug/tts_target_size/ce_ls_0.1/',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector_v2/200ep/basic_init/no_specaug/tts_target_size/ce_ls_1.0/',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector_v2/200ep/basic_init/no_specaug/tts_target_size/ce_ls_0.1/',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector_v2/200ep/basic_init/no_specaug/tts_target_size/ce_ls_1.0/',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_blstm_x_vector/specaug/ce_ls_1/',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_blstm_x_vector/specaug/ce_ls_0.1/',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_blstm_x_vector/no_specaug/ce_ls_1/',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_blstm_x_vector/no_specaug/ce_ls_0.1/',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/first_step/ga_glowTTS_ASR_ffn_x_vector_v2/ce_ls_4.0/',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/first_step/ga_glowTTS_ASR_ffn_x_vector_v2/ce_ls_3.0/',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/first_step/ga_glowTTS_ASR_ffn_x_vector_v2/ce_ls_2.0/',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/first_step/ga_glowTTS_ASR_ffn_x_vector_v2/ce_ls_1.0/',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_ffn_mas/100ep/encoder/decoder_eval/',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_ffn_mas/100ep/encoder/encoder_eval/',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_ffn_x_vector/100ep/encoder/decoder_eval/',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_ffn_x_vector/100ep/encoder/encoder_eval/',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_ffn_x_vector/100ep/decoder/encoder_eval/',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_ffn_x_vector/100ep/decoder/decoder_eval/',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_ffn/100ep/encoder/encoder_eval/',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_ffn/100ep/encoder/decoder_eval/',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_ffn/100ep/decoder/decoder_eval/',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_ffn/100ep/decoder/encoder_eval/',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_cnn_x_vector/100ep/encoder/encoder_eval/',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_cnn_x_vector/100ep/encoder/decoder_eval/',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_cnn_x_vector/100ep/decoder/encoder_eval/',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_cnn_x_vector/100ep/decoder/decoder_eval/',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_ffn_x_vector_mas/100ep/encoder/decoder_eval/',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_ffn_x_vector_mas/100ep/encoder/encoder_eval/',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_ffn_x_vector_mas/100ep/decoder/decoder_eval/',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_ffn_x_vector_mas/100ep/decoder/encoder_eval/',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_cnn/100ep/decoder/decoder_eval/',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_cnn/100ep/decoder/encoder_eval/',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_cnn/100ep/encoder/decoder_eval/',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_cnn/100ep/encoder/encoder_eval/',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_ffn_mas_no_eval/100ep/decoder/decoder_eval/',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_ffn_mas_no_eval/100ep/decoder/encoder_eval/',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_ffn_mas_no_eval/100ep/encoder/decoder_eval/',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_ffn_mas_no_eval/100ep/encoder/encoder_eval/',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_cnn_x_vector_v2/tts_pretrained/ce_ls_0.1/',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_cnn_x_vector_v2/tts_pretrained/ce_ls_0.01/',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_cnn_x_vector_v2/basic_init/ce_ls_0.1/',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_cnn_x_vector_v2/basic_init/ce_ls_0.01/',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/second_step_tts/ga_glowTTS_ASR_ffn_x_vector_v2_2ndstep_tts/ce_ls_1.0/',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_ffn_x_vector_v2/200ep/tts_pretrained/ce_ls_1.0/',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_ffn_x_vector_v2/200ep/tts_pretrained/ce_ls_0.1/',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_ffn_x_vector_v2/200ep/basic_init/ce_ls_1.0/',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_ffn_x_vector_v2/200ep/basic_init/ce_ls_0.1/',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_ffn_x_vector/200ep/basic_init/ce_ls_0.1/',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_ffn_x_vector/200ep/tts_pretrained/ce_ls_0.1/',\n", 
- " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_cnn_x_vector/tts_pretrained/ce_ls_0.1/',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_cnn_x_vector/basic_init/ce_ls_0.1/',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/second_step/ga_glowTTS_ASR_ffn_x_vector_v2_2ndstep/ce_ls_1.0/']" + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector/200ep/basic_init/no_specaug/tts_target_size/ce_ls_0.1/',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector/200ep/tts_pretrained/no_specaug/tts_target_size/ce_ls_0.1/',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector/no_specaug/ce_ls_1/',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector/no_specaug/ce_ls_0.1/',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector/specaug/ce_ls_1/',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector/specaug/ce_ls_0.1/']" ] }, "execution_count": 3, @@ -177,20 +179,20 @@ " 21.30\n", " \n", " \n", - " 
/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector/specaug/ce_ls_0.1/\n", - " 26.49\n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_ffn_x_vector_v2/200ep/tts_pretrained/ce_ls_0.1/\n", + " 20.22\n", " \n", " \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector/specaug/ce_ls_1/\n", - " 27.30\n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_ffn_x_vector_v2/200ep/tts_pretrained/ce_ls_1.0/\n", + " 20.42\n", " \n", " \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector/200ep/basic_init/no_specaug/tts_target_size/ce_ls_0.1/\n", - " 30.09\n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_ffn_x_vector_v2/200ep/basic_init/ce_ls_0.1/\n", + " 20.71\n", " \n", " \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector/200ep/tts_pretrained/no_specaug/tts_target_size/ce_ls_0.1/\n", - " 28.56\n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_ffn_x_vector_v2/200ep/basic_init/ce_ls_1.0/\n", + " 20.48\n", " \n", " \n", "\n", @@ -200,10 +202,10 @@ " Accuracy [%]\n", "Group Experiment \n", "joint_training/given_alignments 
/u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 21.30\n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 26.49\n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 27.30\n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 30.09\n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 28.56" + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 20.22\n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 20.42\n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 20.71\n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 20.48" ] }, "execution_count": 4, @@ -287,51 +289,51 @@ " True\n", " \n", " \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector/specaug/ce_ls_0.1/\n", - " 26.49\n", - " 2.868420\n", - " 2.858513\n", - " -0.802836\n", - " 0.434087\n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_ffn_x_vector_v2/200ep/tts_pretrained/ce_ls_0.1/\n", + " 20.22\n", + " 0.140343\n", + " 0.139632\n", + " -0.760167\n", + " 0.075069\n", " True\n", " False\n", - " 0.996546\n", + " 0.994934\n", " True\n", " \n", " \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector/specaug/ce_ls_1/\n", - " 27.30\n", - " 2.664396\n", - " 2.957262\n", - " -0.72108\n", - " 0.747725\n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_ffn_x_vector_v2/200ep/tts_pretrained/ce_ls_1.0/\n", + " 20.42\n", + " 0.013599\n", + " 0.014265\n", + " -0.72889\n", + " 0.075977\n", " True\n", " False\n", - " 1.109919\n", + " 1.048934\n", " True\n", " \n", " \n", - " 
/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector/200ep/basic_init/no_specaug/tts_target_size/ce_ls_0.1/\n", - " 30.09\n", - " 2.782269\n", - " 2.795637\n", - " -0.785492\n", - " 0.890597\n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_ffn_x_vector_v2/200ep/basic_init/ce_ls_0.1/\n", + " 20.71\n", + " 0.124269\n", + " 0.122107\n", + " -0.757141\n", + " 0.071102\n", " True\n", " False\n", - " 1.004805\n", + " 0.982599\n", " True\n", " \n", " \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector/200ep/tts_pretrained/no_specaug/tts_target_size/ce_ls_0.1/\n", - " 28.56\n", - " 2.813527\n", - " 2.825326\n", - " -0.803716\n", - " 0.439343\n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_ffn_x_vector_v2/200ep/basic_init/ce_ls_1.0/\n", + " 20.48\n", + " 0.011644\n", + " 0.011235\n", + " -0.729906\n", + " 0.069833\n", " True\n", " False\n", - " 1.004194\n", + " 0.964920\n", " True\n", " \n", " \n", @@ -342,42 +344,42 @@ " Accuracy [%] \\\n", "Group Experiment \n", "joint_training/given_alignments /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 21.30 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 26.49 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 27.30 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 30.09 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 28.56 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 20.22 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 20.42 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 
20.71 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 20.48 \n", "\n", " CE \\\n", "Group Experiment \n", "joint_training/given_alignments /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 2.920585 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 2.868420 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 2.664396 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 2.782269 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 2.813527 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 0.140343 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 0.013599 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 0.124269 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 0.011644 \n", "\n", " dev CE \\\n", "Group Experiment \n", "joint_training/given_alignments /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 2.941674 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 2.858513 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 2.957262 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 2.795637 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 2.825326 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 0.139632 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 0.014265 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 0.122107 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 0.011235 \n", "\n", " MLE \\\n", "Group Experiment \n", "joint_training/given_alignments /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... -0.797284 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... -0.802836 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... -0.72108 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... -0.785492 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... -0.803716 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 
-0.760167 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... -0.72889 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... -0.757141 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... -0.729906 \n", "\n", " DP \\\n", "Group Experiment \n", "joint_training/given_alignments /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 0.957548 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 0.434087 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 0.747725 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 0.890597 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 0.439343 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 0.075069 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 0.075977 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 0.071102 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 0.069833 \n", "\n", " Joint \\\n", "Group Experiment \n", @@ -398,10 +400,10 @@ " overfitting \\\n", "Group Experiment \n", "joint_training/given_alignments /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 1.007221 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 0.996546 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 1.109919 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 1.004805 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 1.004194 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 0.994934 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 1.048934 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 0.982599 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 
0.964920 \n", "\n", " Training data available \n", "Group Experiment \n", @@ -536,7 +538,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ @@ -545,7 +547,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 9, "metadata": {}, "outputs": [ { @@ -609,17 +611,17 @@ " 1\n", " joint_training/given_alignments\n", " /u/lukas.rilling/experiments/glow_tts_asr_v2/o...\n", - " 26.49\n", - " 2.868\n", - " 2.859\n", - " -0.803\n", - " 0.434\n", + " 20.22\n", + " 0.14\n", + " 0.14\n", + " -0.76\n", + " 0.075\n", " True\n", " False\n", - " 0.997\n", + " 0.995\n", " True\n", - " 250.0\n", - " [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08]\n", + " 200.0\n", + " [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05]\n", " unknown\n", " 0.1\n", " \n", @@ -627,17 +629,17 @@ " 2\n", " joint_training/given_alignments\n", " /u/lukas.rilling/experiments/glow_tts_asr_v2/o...\n", - " 27.30\n", - " 2.664\n", - " 2.957\n", - " -0.721\n", - " 0.748\n", + " 20.42\n", + " 0.014\n", + " 0.014\n", + " -0.729\n", + " 0.076\n", " True\n", " False\n", - " 1.11\n", + " 1.049\n", " True\n", - " 250.0\n", - " [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08]\n", + " 200.0\n", + " [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05]\n", " unknown\n", " 1.0\n", " \n", @@ -645,14 +647,14 @@ " 3\n", " joint_training/given_alignments\n", " /u/lukas.rilling/experiments/glow_tts_asr_v2/o...\n", - " 30.09\n", - " 2.782\n", - " 2.796\n", - " -0.785\n", - " 0.891\n", + " 20.71\n", + " 0.124\n", + " 0.122\n", + " -0.757\n", + " 0.071\n", " True\n", " False\n", - " 1.005\n", + " 0.983\n", " True\n", " 200.0\n", " [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05]\n", @@ -663,19 +665,19 @@ " 4\n", " joint_training/given_alignments\n", " /u/lukas.rilling/experiments/glow_tts_asr_v2/o...\n", - " 28.56\n", - " 2.814\n", - " 2.825\n", - " -0.804\n", - " 0.439\n", + " 20.48\n", + " 0.012\n", + " 0.011\n", + " -0.73\n", + " 0.07\n", " 
True\n", " False\n", - " 1.004\n", + " 0.965\n", " True\n", " 200.0\n", " [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05]\n", " unknown\n", - " 0.1\n", + " 1.0\n", " \n", " \n", "\n", @@ -691,41 +693,41 @@ "\n", " Experiment Accuracy [%] CE \\\n", "0 /u/lukas.rilling/experiments/glow_tts_asr_v2/o... 21.30 2.921 \n", - "1 /u/lukas.rilling/experiments/glow_tts_asr_v2/o... 26.49 2.868 \n", - "2 /u/lukas.rilling/experiments/glow_tts_asr_v2/o... 27.30 2.664 \n", - "3 /u/lukas.rilling/experiments/glow_tts_asr_v2/o... 30.09 2.782 \n", - "4 /u/lukas.rilling/experiments/glow_tts_asr_v2/o... 28.56 2.814 \n", + "1 /u/lukas.rilling/experiments/glow_tts_asr_v2/o... 20.22 0.14 \n", + "2 /u/lukas.rilling/experiments/glow_tts_asr_v2/o... 20.42 0.014 \n", + "3 /u/lukas.rilling/experiments/glow_tts_asr_v2/o... 20.71 0.124 \n", + "4 /u/lukas.rilling/experiments/glow_tts_asr_v2/o... 20.48 0.012 \n", "\n", " dev CE MLE DP Joint Still running overfitting \\\n", "0 2.942 -0.797 0.958 True False 1.007 \n", - "1 2.859 -0.803 0.434 True False 0.997 \n", - "2 2.957 -0.721 0.748 True False 1.11 \n", - "3 2.796 -0.785 0.891 True False 1.005 \n", - "4 2.825 -0.804 0.439 True False 1.004 \n", + "1 0.14 -0.76 0.075 True False 0.995 \n", + "2 0.014 -0.729 0.076 True False 1.049 \n", + "3 0.122 -0.757 0.071 True False 0.983 \n", + "4 0.011 -0.73 0.07 True False 0.965 \n", "\n", " Training data available Num Epochs \\\n", "0 True 200.0 \n", - "1 True 250.0 \n", - "2 True 250.0 \n", + "1 True 200.0 \n", + "2 True 200.0 \n", "3 True 200.0 \n", "4 True 200.0 \n", "\n", - " LR ASR Model Type \\\n", - "0 [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] unknown \n", - "1 [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] unknown \n", - "2 [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] unknown \n", - "3 [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] unknown \n", - "4 [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] unknown \n", + " LR ASR Model Type \\\n", + "0 [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 
1e-05] unknown \n", + "1 [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] unknown \n", + "2 [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] unknown \n", + "3 [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] unknown \n", + "4 [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] unknown \n", "\n", " CE loss scale \n", "0 0.1 \n", "1 0.1 \n", "2 1.0 \n", "3 0.1 \n", - "4 0.1 " + "4 1.0 " ] }, - "execution_count": 15, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -737,7 +739,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 10, "metadata": {}, "outputs": [ { @@ -747,66 +749,68 @@ "| | Group | Experiment | Accuracy [%] | CE | dev CE | MLE | DP | Joint | Still running | overfitting | Training data available | Num Epochs | LR | ASR Model Type | CE loss scale |\n", "|---:|:--------------------------------|:-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------------:|:------|:---------|:-------|:------|:--------|:----------------|:--------------|:--------------------------|:-------------|:-------------------------------------------------|:-----------------|:----------------|\n", "| 0 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_simple_encoder_x_vector/200ep/basic_init/no_specaug/tts_target_size/ce_ls_0.1/ | 21.3 | 2.921 | 2.942 | -0.797 | 0.958 | True | False | 1.007 | True | 200.0 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | unknown | 0.1 |\n", - "| 1 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector/specaug/ce_ls_0.1/ | 26.49 | 2.868 | 2.859 | -0.803 | 0.434 | 
True | False | 0.997 | True | 250.0 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | unknown | 0.1 |\n", - "| 2 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector/specaug/ce_ls_1/ | 27.3 | 2.664 | 2.957 | -0.721 | 0.748 | True | False | 1.11 | True | 250.0 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | unknown | 1.0 |\n", - "| 3 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector/200ep/basic_init/no_specaug/tts_target_size/ce_ls_0.1/ | 30.09 | 2.782 | 2.796 | -0.785 | 0.891 | True | False | 1.005 | True | 200.0 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | unknown | 0.1 |\n", - "| 4 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector/200ep/tts_pretrained/no_specaug/tts_target_size/ce_ls_0.1/ | 28.56 | 2.814 | 2.825 | -0.804 | 0.439 | True | False | 1.004 | True | 200.0 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | unknown | 0.1 |\n", - "| 5 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector/no_specaug/ce_ls_1/ | 27.72 | 2.532 | 2.992 | -0.691 | 0.76 | True | False | 1.182 | True | 250.0 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | unknown | 1.0 |\n", - "| 6 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector/no_specaug/ce_ls_0.1/ | 30.44 | 2.793 | 2.791 | -0.795 | 0.506 | True | False | 1.0 | True | 250.0 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 
250: 1e-08] | unknown | 0.1 |\n", - "| 7 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_cnn_x_vector/tts_pretrained/no_specaug/tts_target_size/ce_ls_0.1/ | 28.92 | 2.564 | 2.893 | -0.793 | 0.396 | True | False | 1.128 | True | 200.0 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | unknown | 0.1 |\n", - "| 8 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_cnn_x_vector/basic_init/no_specaug/tts_target_size/ce_ls_0.1/ | 27.3 | 2.573 | 2.942 | -0.776 | 0.432 | True | False | 1.143 | True | 200.0 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | unknown | 0.1 |\n", - "| 9 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector_v2/mean_only/200ep/basic_init/no_specaug/tts_target_size/ce_ls_0.1/ | 22.6 | 2.922 | 2.923 | -0.804 | 0.932 | True | False | 1.0 | True | 200.0 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | unknown | 0.1 |\n", - "| 10 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector_v2/200ep/tts_pretrained/no_specaug/tts_target_size/ce_ls_0.1/ | 23.4 | 2.879 | 2.887 | -0.806 | 0.626 | True | False | 1.003 | True | 200.0 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | unknown | 0.1 |\n", - "| 11 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector_v2/200ep/basic_init/no_specaug/tts_target_size/ce_ls_1.0/ | 22.24 | 2.923 | 2.924 | -0.801 | 0.835 | True | False | 1.0 | True | 200.0 | 
[0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | unknown | 1.0 |\n", - "| 12 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector_v2/200ep/basic_init/no_specaug/tts_target_size/ce_ls_0.1/ | 22.76 | 2.924 | 2.934 | -0.8 | 0.818 | True | False | 1.004 | True | 200.0 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | unknown | 0.1 |\n", - "| 13 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_blstm_x_vector/specaug/ce_ls_1/ | 35.97 | 1.395 | 2.544 | -0.66 | 0.303 | True | False | 1.824 | True | 250.0 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | blstm | 1.0 |\n", - "| 14 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_blstm_x_vector/specaug/ce_ls_0.1/ | 36.7 | 1.672 | 2.312 | -0.774 | 1.02 | True | False | 1.383 | True | 250.0 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | blstm | 0.1 |\n", - "| 15 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_blstm_x_vector/no_specaug/ce_ls_1/ | 34.25 | 1.042 | 3.15 | -0.69 | 0.352 | True | False | 3.023 | True | 250.0 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | blstm | 1.0 |\n", - "| 16 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_blstm_x_vector/no_specaug/ce_ls_0.1/ | 33.05 | 1.031 | 3.339 | -0.786 | 0.368 | True | False | 3.239 | True | 250.0 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | blstm | 0.1 |\n", - "| 17 | joint_training/given_alignments 
| /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/first_step/ga_glowTTS_ASR_ffn_x_vector_v2/ce_ls_4.0/ | 28.56 | 0.003 | 0.003 | -0.702 | 0.075 | True | False | 0.948 | True | 100.0 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | unknown | 4.0 |\n", - "| 18 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/first_step/ga_glowTTS_ASR_ffn_x_vector_v2/ce_ls_3.0/ | 20.47 | 0.004 | 0.004 | -0.703 | 0.075 | True | False | 0.954 | True | 100.0 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | unknown | 3.0 |\n", - "| 19 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/first_step/ga_glowTTS_ASR_ffn_x_vector_v2/ce_ls_2.0/ | 20.38 | 0.006 | 0.006 | -0.708 | 0.075 | True | False | 1.018 | True | 100.0 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | unknown | 2.0 |\n", - "| 20 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/first_step/ga_glowTTS_ASR_ffn_x_vector_v2/ce_ls_1.0/ | 20.45 | 0.012 | 0.012 | -0.716 | 0.075 | True | False | 0.98 | True | 100.0 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | unknown | 1.0 |\n", - "| 21 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_ffn_mas/100ep/encoder/decoder_eval/ | 37.48 | - | - | - | - | True | False | - | True | - | - | unknown | - |\n", - "| 22 | joint_training/given_alignments | 
/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_ffn_mas/100ep/encoder/encoder_eval/ | 37.57 | - | - | - | - | True | False | - | True | - | - | unknown | - |\n", - "| 23 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_ffn_x_vector/100ep/encoder/decoder_eval/ | 12.68 | - | - | - | - | True | False | - | True | - | - | unknown | - |\n", - "| 24 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_ffn_x_vector/100ep/encoder/encoder_eval/ | 42.05 | - | - | - | - | True | False | - | True | - | - | unknown | - |\n", - "| 25 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_ffn_x_vector/100ep/decoder/encoder_eval/ | 19.16 | - | - | - | - | True | False | - | True | - | - | unknown | - |\n", - "| 26 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_ffn_x_vector/100ep/decoder/decoder_eval/ | 19.96 | - | - | - | - | True | False | - | True | - | - | unknown | - |\n", - "| 27 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_ffn/100ep/encoder/encoder_eval/ | 44.63 | - | - | - | - | True | False | - | True | - | - | 
unknown | - |\n", - "| 28 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_ffn/100ep/encoder/decoder_eval/ | 12.24 | - | - | - | - | True | False | - | True | - | - | unknown | - |\n", - "| 29 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_ffn/100ep/decoder/decoder_eval/ | 19.94 | - | - | - | - | True | False | - | True | - | - | unknown | - |\n", - "| 30 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_ffn/100ep/decoder/encoder_eval/ | 18.49 | - | - | - | - | True | False | - | True | - | - | unknown | - |\n", - "| 31 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_cnn_x_vector/100ep/encoder/encoder_eval/ | 70.57 | - | - | - | - | True | False | - | True | - | - | unknown | - |\n", - "| 32 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_cnn_x_vector/100ep/encoder/decoder_eval/ | 11.4 | - | - | - | - | True | False | - | True | - | - | unknown | - |\n", - "| 33 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_cnn_x_vector/100ep/decoder/encoder_eval/ | 22.76 | - | - | 
- | - | True | False | - | True | - | - | unknown | - |\n", - "| 34 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_cnn_x_vector/100ep/decoder/decoder_eval/ | 21.33 | - | - | - | - | True | False | - | True | - | - | unknown | - |\n", - "| 35 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_ffn_x_vector_mas/100ep/encoder/decoder_eval/ | 12.72 | - | - | - | - | True | False | - | True | - | - | unknown | - |\n", - "| 36 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_ffn_x_vector_mas/100ep/encoder/encoder_eval/ | 12.68 | - | - | - | - | True | False | - | True | - | - | unknown | - |\n", - "| 37 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_ffn_x_vector_mas/100ep/decoder/decoder_eval/ | 12.73 | - | - | - | - | True | False | - | True | - | - | unknown | - |\n", - "| 38 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_ffn_x_vector_mas/100ep/decoder/encoder_eval/ | 12.71 | - | - | - | - | True | False | - | True | - | - | unknown | - |\n", - "| 39 | joint_training/given_alignments | 
/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_cnn/100ep/decoder/decoder_eval/ | 21.3 | - | - | - | - | True | False | - | True | - | - | unknown | - |\n", - "| 40 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_cnn/100ep/decoder/encoder_eval/ | 22.09 | - | - | - | - | True | False | - | True | - | - | unknown | - |\n", - "| 41 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_cnn/100ep/encoder/decoder_eval/ | 11.33 | - | - | - | - | True | False | - | True | - | - | unknown | - |\n", - "| 42 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_cnn/100ep/encoder/encoder_eval/ | 72.13 | - | - | - | - | True | False | - | True | - | - | unknown | - |\n", - "| 43 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_ffn_mas_no_eval/100ep/decoder/decoder_eval/ | 37.64 | - | - | - | - | True | False | - | True | - | - | unknown | - |\n", - "| 44 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_ffn_mas_no_eval/100ep/decoder/encoder_eval/ | 37.46 | - | - | - | - | True | False | - | True | - | - | unknown | - |\n", - "| 
45 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_ffn_mas_no_eval/100ep/encoder/decoder_eval/ | 37.51 | - | - | - | - | True | False | - | True | - | - | unknown | - |\n", - "| 46 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_ffn_mas_no_eval/100ep/encoder/encoder_eval/ | 37.56 | - | - | - | - | True | False | - | True | - | - | unknown | - |\n", - "| 47 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_cnn_x_vector_v2/tts_pretrained/ce_ls_0.1/ | 20.29 | 0.052 | 0.047 | -0.791 | 0.052 | True | False | 0.9 | True | 200.0 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | unknown | 0.1 |\n", - "| 48 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_cnn_x_vector_v2/tts_pretrained/ce_ls_0.01/ | 19.98 | 0.372 | 0.338 | -0.797 | 0.063 | True | False | 0.91 | True | 200.0 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | unknown | 0.01 |\n", - "| 49 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_cnn_x_vector_v2/basic_init/ce_ls_0.1/ | 20.32 | 0.052 | 0.048 | -0.783 | 0.052 | True | False | 0.916 | True | 200.0 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | unknown | 0.1 |\n", - "| 50 | joint_training/given_alignments | 
/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_cnn_x_vector_v2/basic_init/ce_ls_0.01/ | 19.97 | 0.341 | 0.308 | -0.787 | 0.064 | True | False | 0.903 | True | 200.0 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | unknown | 0.01 |\n", - "| 51 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/second_step_tts/ga_glowTTS_ASR_ffn_x_vector_v2_2ndstep_tts/ce_ls_1.0/ | 20.18 | - | - | - | - | True | True | - | True | 100.0 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | unknown | 1.0 |\n", - "| 52 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_ffn_x_vector_v2/200ep/tts_pretrained/ce_ls_1.0/ | 20.42 | 0.014 | 0.014 | -0.729 | 0.076 | True | False | 1.049 | True | 200.0 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | unknown | 1.0 |\n", - "| 53 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_ffn_x_vector_v2/200ep/tts_pretrained/ce_ls_0.1/ | 20.22 | 0.14 | 0.14 | -0.76 | 0.075 | True | False | 0.995 | True | 200.0 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | unknown | 0.1 |\n", - "| 54 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_ffn_x_vector_v2/200ep/basic_init/ce_ls_1.0/ | 20.48 | 0.012 | 0.011 | -0.73 | 0.07 | True | False | 0.965 | True | 200.0 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | unknown | 1.0 |\n", - "| 55 | 
joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_ffn_x_vector_v2/200ep/basic_init/ce_ls_0.1/ | 20.71 | 0.124 | 0.122 | -0.757 | 0.071 | True | False | 0.983 | True | 200.0 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | unknown | 0.1 |\n", - "| 56 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_ffn_x_vector/200ep/basic_init/ce_ls_0.1/ | 21.39 | 2.921 | 2.947 | -0.779 | 0.256 | True | False | 1.009 | True | 200.0 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | unknown | 0.1 |\n", - "| 57 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_ffn_x_vector/200ep/tts_pretrained/ce_ls_0.1/ | 21.33 | 2.925 | 2.946 | -0.792 | 0.247 | True | False | 1.007 | True | 200.0 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | unknown | 0.1 |\n", - "| 58 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_cnn_x_vector/tts_pretrained/ce_ls_0.1/ | 27.63 | 2.617 | 2.915 | -0.775 | 0.263 | True | False | 1.114 | True | 200.0 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | unknown | 0.1 |\n", - "| 59 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_cnn_x_vector/basic_init/ce_ls_0.1/ | 25.67 | 2.583 | 3.019 | -0.767 | 0.265 | True | False | 1.169 | True | 200.0 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | unknown | 0.1 
|\n", - "| 60 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/second_step/ga_glowTTS_ASR_ffn_x_vector_v2_2ndstep/ce_ls_1.0/ | 21.61 | 2.939 | 2.947 | - | - | False | False | 1.003 | True | 100.0 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | unknown | 1.0 |\n" + "| 1 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_ffn_x_vector_v2/200ep/tts_pretrained/ce_ls_0.1/ | 20.22 | 0.14 | 0.14 | -0.76 | 0.075 | True | False | 0.995 | True | 200.0 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | unknown | 0.1 |\n", + "| 2 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_ffn_x_vector_v2/200ep/tts_pretrained/ce_ls_1.0/ | 20.42 | 0.014 | 0.014 | -0.729 | 0.076 | True | False | 1.049 | True | 200.0 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | unknown | 1.0 |\n", + "| 3 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_ffn_x_vector_v2/200ep/basic_init/ce_ls_0.1/ | 20.71 | 0.124 | 0.122 | -0.757 | 0.071 | True | False | 0.983 | True | 200.0 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | unknown | 0.1 |\n", + "| 4 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_ffn_x_vector_v2/200ep/basic_init/ce_ls_1.0/ | 20.48 | 0.012 | 0.011 | -0.73 | 0.07 | True | False | 0.965 | True | 200.0 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 
1e-05] | unknown | 1.0 |\n", + "| 5 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_cnn_x_vector_v2/tts_pretrained/ce_ls_0.01/ | 19.98 | 0.372 | 0.338 | -0.797 | 0.063 | True | False | 0.91 | True | 200.0 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | unknown | 0.01 |\n", + "| 6 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_cnn_x_vector_v2/tts_pretrained/ce_ls_0.1/ | 20.29 | 0.052 | 0.047 | -0.791 | 0.052 | True | False | 0.9 | True | 200.0 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | unknown | 0.1 |\n", + "| 7 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_cnn_x_vector_v2/basic_init/ce_ls_0.01/ | 19.97 | 0.341 | 0.308 | -0.787 | 0.064 | True | False | 0.903 | True | 200.0 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | unknown | 0.01 |\n", + "| 8 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_cnn_x_vector_v2/basic_init/ce_ls_0.1/ | 20.32 | 0.052 | 0.048 | -0.783 | 0.052 | True | False | 0.916 | True | 200.0 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | unknown | 0.1 |\n", + "| 9 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_ffn_x_vector/200ep/tts_pretrained/ce_ls_0.1/ | 21.33 | 2.925 | 2.946 | -0.792 | 0.247 | True | False | 1.007 | True | 200.0 | [0: 1e-05, 99: 0.0005, 100: 
0.0005, 200: 1e-05] | unknown | 0.1 |\n", + "| 10 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_ffn_x_vector/200ep/basic_init/ce_ls_0.1/ | 21.39 | 2.921 | 2.947 | -0.779 | 0.256 | True | False | 1.009 | True | 200.0 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | unknown | 0.1 |\n", + "| 11 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/second_step_tts/ga_glowTTS_ASR_ffn_x_vector_v2_2ndstep_tts/ce_ls_1.0/ | 20.18 | - | - | - | - | True | True | - | True | 100.0 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | unknown | 1.0 |\n", + "| 12 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_cnn_x_vector/tts_pretrained/ce_ls_0.1/ | 27.63 | 2.617 | 2.915 | -0.775 | 0.263 | True | False | 1.114 | True | 200.0 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | unknown | 0.1 |\n", + "| 13 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_cnn_x_vector/basic_init/ce_ls_0.1/ | 25.67 | 2.583 | 3.019 | -0.767 | 0.265 | True | False | 1.169 | True | 200.0 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | unknown | 0.1 |\n", + "| 14 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/second_step/ga_glowTTS_ASR_ffn_x_vector_v2_2ndstep/ce_ls_1.0/ | 21.61 | 2.939 | 2.947 | - | - | False | False | 1.003 | True | 100.0 | [0: 1e-05, 49: 0.0005, 50: 
0.0005, 100: 1e-05] | unknown | 1.0 |\n", + "| 15 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/first_step/ga_glowTTS_ASR_ffn_x_vector_v2/ce_ls_1.0/ | 20.45 | 0.012 | 0.012 | -0.716 | 0.075 | True | False | 0.98 | True | 100.0 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | unknown | 1.0 |\n", + "| 16 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/first_step/ga_glowTTS_ASR_ffn_x_vector_v2/ce_ls_3.0/ | 20.47 | 0.004 | 0.004 | -0.703 | 0.075 | True | False | 0.954 | True | 100.0 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | unknown | 3.0 |\n", + "| 17 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/first_step/ga_glowTTS_ASR_ffn_x_vector_v2/ce_ls_2.0/ | 20.38 | 0.006 | 0.006 | -0.708 | 0.075 | True | False | 1.018 | True | 100.0 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | unknown | 2.0 |\n", + "| 18 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/first_step/ga_glowTTS_ASR_ffn_x_vector_v2/ce_ls_4.0/ | 28.56 | 0.003 | 0.003 | -0.702 | 0.075 | True | False | 0.948 | True | 100.0 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | unknown | 4.0 |\n", + "| 19 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_cnn/100ep/encoder/decoder_eval/ | 11.33 | - | - | - | - | True | False | - | True | - | - | unknown | - |\n", + "| 20 | 
joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_cnn/100ep/encoder/encoder_eval/ | 72.13 | - | - | - | - | True | False | - | True | - | - | unknown | - |\n", + "| 21 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_cnn/100ep/decoder/decoder_eval/ | 21.3 | - | - | - | - | True | False | - | True | - | - | unknown | - |\n", + "| 22 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_cnn/100ep/decoder/encoder_eval/ | 22.09 | - | - | - | - | True | False | - | True | - | - | unknown | - |\n", + "| 23 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_ffn_x_vector_mas/100ep/encoder/decoder_eval/ | 12.72 | - | - | - | - | True | False | - | True | - | - | unknown | - |\n", + "| 24 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_ffn_x_vector_mas/100ep/encoder/encoder_eval/ | 12.68 | - | - | - | - | True | False | - | True | - | - | unknown | - |\n", + "| 25 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_ffn_x_vector_mas/100ep/decoder/decoder_eval/ | 12.73 | - | - | - | - | True | 
False | - | True | - | - | unknown | - |\n", + "| 26 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_ffn_x_vector_mas/100ep/decoder/encoder_eval/ | 12.71 | - | - | - | - | True | False | - | True | - | - | unknown | - |\n", + "| 27 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_ffn_mas_no_eval/100ep/encoder/encoder_eval/ | 37.56 | - | - | - | - | True | False | - | True | - | - | unknown | - |\n", + "| 28 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_ffn_mas_no_eval/100ep/encoder/decoder_eval/ | 37.51 | - | - | - | - | True | False | - | True | - | - | unknown | - |\n", + "| 29 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_ffn_mas_no_eval/100ep/decoder/encoder_eval/ | 37.46 | - | - | - | - | True | False | - | True | - | - | unknown | - |\n", + "| 30 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_ffn_mas_no_eval/100ep/decoder/decoder_eval/ | 37.64 | - | - | - | - | True | False | - | True | - | - | unknown | - |\n", + "| 31 | joint_training/given_alignments | 
/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_ffn/100ep/decoder/decoder_eval/ | 19.94 | - | - | - | - | True | False | - | True | - | - | unknown | - |\n", + "| 32 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_ffn/100ep/decoder/encoder_eval/ | 18.49 | - | - | - | - | True | False | - | True | - | - | unknown | - |\n", + "| 33 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_ffn/100ep/encoder/decoder_eval/ | 12.24 | - | - | - | - | True | False | - | True | - | - | unknown | - |\n", + "| 34 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_ffn/100ep/encoder/encoder_eval/ | 44.63 | - | - | - | - | True | False | - | True | - | - | unknown | - |\n", + "| 35 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_ffn_mas/100ep/encoder/decoder_eval/ | 37.48 | - | - | - | - | True | False | - | True | - | - | unknown | - |\n", + "| 36 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_ffn_mas/100ep/encoder/encoder_eval/ | 37.57 | - | - | - | - | True | False | - | True | - | - | unknown | - |\n", + "| 37 | 
joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_ffn_mas/100ep/decoder/decoder_eval/ | 37.68 | - | - | - | - | True | False | - | True | - | - | unknown | - |\n", + "| 38 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_ffn_mas/100ep/decoder/encoder_eval/ | 37.45 | - | - | - | - | True | False | - | True | - | - | unknown | - |\n", + "| 39 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_cnn_x_vector/100ep/encoder/decoder_eval/ | 11.4 | - | - | - | - | True | False | - | True | - | - | unknown | - |\n", + "| 40 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_cnn_x_vector/100ep/encoder/encoder_eval/ | 70.57 | - | - | - | - | True | False | - | True | - | - | unknown | - |\n", + "| 41 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_cnn_x_vector/100ep/decoder/decoder_eval/ | 21.33 | - | - | - | - | True | False | - | True | - | - | unknown | - |\n", + "| 42 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_cnn_x_vector/100ep/decoder/encoder_eval/ | 22.76 | - | - | - | - | 
True | False | - | True | - | - | unknown | - |\n", + "| 43 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_ffn_x_vector/100ep/decoder/decoder_eval/ | 19.96 | - | - | - | - | True | False | - | True | - | - | unknown | - |\n", + "| 44 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_ffn_x_vector/100ep/decoder/encoder_eval/ | 19.16 | - | - | - | - | True | False | - | True | - | - | unknown | - |\n", + "| 45 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_ffn_x_vector/100ep/encoder/decoder_eval/ | 12.68 | - | - | - | - | True | False | - | True | - | - | unknown | - |\n", + "| 46 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/frozen_glowtts/ga_glowTTS_ASR_ffn_x_vector/100ep/encoder/encoder_eval/ | 42.05 | - | - | - | - | True | False | - | True | - | - | unknown | - |\n", + "| 47 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_cnn_x_vector/tts_pretrained/no_specaug/tts_target_size/ce_ls_0.1/ | 28.92 | 2.564 | 2.893 | -0.793 | 0.396 | True | False | 1.128 | True | 200.0 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | unknown | 0.1 |\n", + "| 48 | joint_training/given_alignments | 
/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_cnn_x_vector/basic_init/no_specaug/tts_target_size/ce_ls_0.1/ | 27.3 | 2.573 | 2.942 | -0.776 | 0.432 | True | False | 1.143 | True | 200.0 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | unknown | 0.1 |\n", + "| 49 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector_v2/mean_only/200ep/basic_init/no_specaug/tts_target_size/ce_ls_0.1/ | 22.6 | 2.922 | 2.923 | -0.804 | 0.932 | True | False | 1.0 | True | 200.0 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | unknown | 0.1 |\n", + "| 50 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector_v2/200ep/tts_pretrained/no_specaug/tts_target_size/ce_ls_0.1/ | 23.4 | 2.879 | 2.887 | -0.806 | 0.626 | True | False | 1.003 | True | 200.0 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | unknown | 0.1 |\n", + "| 51 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector_v2/200ep/basic_init/no_specaug/tts_target_size/ce_ls_0.1/ | 22.76 | 2.924 | 2.934 | -0.8 | 0.818 | True | False | 1.004 | True | 200.0 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | unknown | 0.1 |\n", + "| 52 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector_v2/200ep/basic_init/no_specaug/tts_target_size/ce_ls_1.0/ | 22.24 | 2.923 | 2.924 | -0.801 | 0.835 | True | False | 1.0 | True | 200.0 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | unknown | 1.0 |\n", + 
"| 53 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_blstm_x_vector/specaug/ce_ls_1/ | 35.97 | 1.395 | 2.544 | -0.66 | 0.303 | True | False | 1.824 | True | 250.0 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | blstm | 1.0 |\n", + "| 54 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_blstm_x_vector/specaug/ce_ls_0.1/ | 36.7 | 1.672 | 2.312 | -0.774 | 1.02 | True | False | 1.383 | True | 250.0 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | blstm | 0.1 |\n", + "| 55 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_blstm_x_vector/no_specaug/ce_ls_1/ | 34.25 | 1.042 | 3.15 | -0.69 | 0.352 | True | False | 3.023 | True | 250.0 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | blstm | 1.0 |\n", + "| 56 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_blstm_x_vector/no_specaug/ce_ls_0.1/ | 33.05 | 1.031 | 3.339 | -0.786 | 0.368 | True | False | 3.239 | True | 250.0 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | blstm | 0.1 |\n", + "| 57 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector/200ep/basic_init/no_specaug/tts_target_size/ce_ls_0.1/ | 30.09 | 2.782 | 2.796 | -0.785 | 0.891 | True | False | 1.005 | True | 200.0 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | unknown | 0.1 |\n", + "| 58 | joint_training/given_alignments | 
/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector/200ep/tts_pretrained/no_specaug/tts_target_size/ce_ls_0.1/ | 28.56 | 2.814 | 2.825 | -0.804 | 0.439 | True | False | 1.004 | True | 200.0 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | unknown | 0.1 |\n", + "| 59 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector/no_specaug/ce_ls_1/ | 27.72 | 2.532 | 2.992 | -0.691 | 0.76 | True | False | 1.182 | True | 250.0 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | unknown | 1.0 |\n", + "| 60 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector/no_specaug/ce_ls_0.1/ | 30.44 | 2.793 | 2.791 | -0.795 | 0.506 | True | False | 1.0 | True | 250.0 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | unknown | 0.1 |\n", + "| 61 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector/specaug/ce_ls_1/ | 27.3 | 2.664 | 2.957 | -0.721 | 0.748 | True | False | 1.11 | True | 250.0 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | unknown | 1.0 |\n", + "| 62 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector/specaug/ce_ls_0.1/ | 26.49 | 2.868 | 2.859 | -0.803 | 0.434 | True | False | 0.997 | True | 250.0 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | unknown | 0.1 |\n" ] } ], @@ -831,7 +835,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.13" + "version": "3.10.12" 
} }, "nbformat": 4, diff --git a/users/rilling/evaluation/swer_eval.ipynb b/users/rilling/evaluation/swer_eval.ipynb index 9122f2655..1e207a807 100644 --- a/users/rilling/evaluation/swer_eval.ipynb +++ b/users/rilling/evaluation/swer_eval.ipynb @@ -49,6 +49,7 @@ " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_ctc_scale_0.1/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2/enc768/400ep/gin512/dec_drop_0.05/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2/enc768/400ep/dec_drop_0.05/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2/enc768/200ep/dec_drop_0.0/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2/enc768/200ep/dec_drop_0.05/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", @@ -62,6 +63,7 @@ " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc192/200ep/dec_drop_0.0/grad_clip_10/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc192/100ep/dec_drop_0.05/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc768/400ep/grad_clip_10/dec_drop_0.05/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc768/400ep/gin512/grad_clip_10/dec_drop_0.05/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc768/200ep/dec_drop_0.0/epsilon_1e-8/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc768/200ep/dec_drop_0.0/grad_clip_10/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc768/200ep/dec_drop_0.05_epsilon_1e-8/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", @@ -184,7 +186,7 @@ { "data": { "text/plain": [ - "(91, 91, 91, 91, 91, 91, 91, 91)" + "(93, 93, 93, 93, 93, 93, 93, 93)" ] }, "execution_count": 4, @@ -304,7 +306,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -491,7 +493,7 @@ " \n", " \n", "\n", - "

91 rows × 9 columns

\n", + "

93 rows × 9 columns

\n", "" ], "text/plain": [ @@ -621,10 +623,10 @@ " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... {'class': 'adam', 'epsilon': 1e-09} \n", " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... {'class': 'radam', 'epsilon': 1e-09} \n", "\n", - "[91 rows x 9 columns]" + "[93 rows x 9 columns]" ] }, - "execution_count": 16, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -641,7 +643,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 13, "metadata": {}, "outputs": [ { @@ -719,10 +721,10 @@ " {'class': 'adam', 'epsilon': 1e-08}\n", " -0.826781\n", " -0.846025\n", - " NaN\n", + " None\n", " 0.382039\n", " 0.353906\n", - " NaN\n", + " None\n", " False\n", " \n", " \n", @@ -736,13 +738,13 @@ " 768\n", " [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08]\n", " {'class': 'adam', 'epsilon': 1e-08}\n", - " -0.672552\n", - " NaN\n", - " -0.692025\n", - " 0.444391\n", - " NaN\n", - " 0.412498\n", - " True\n", + " False\n", + " False\n", + " False\n", + " False\n", + " False\n", + " False\n", + " False\n", " \n", " \n", " /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glowTTS_ASR_conformer_x_vector_v2/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer\n", @@ -755,13 +757,13 @@ " 768\n", " [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08]\n", " {'class': 'adam', 'epsilon': 1e-08}\n", - " -0.453089\n", - " -0.401680\n", - " -0.465325\n", - " 0.246206\n", - " 0.344511\n", - " 0.229336\n", - " True\n", + " False\n", + " False\n", + " False\n", + " False\n", + " False\n", + " False\n", + " False\n", " \n", " \n", " /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/ddi/glowTTS_100ep_pe1_radam1e-9/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer\n", @@ -774,12 +776,12 @@ " 768\n", " [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05]\n", " 
{'class': 'radam', 'epsilon': 1e-09}\n", - " -0.825730\n", + " -0.82573\n", " -0.844433\n", - " NaN\n", + " None\n", " 0.384814\n", " 0.355806\n", - " NaN\n", + " None\n", " False\n", " \n", " \n", @@ -795,10 +797,10 @@ " {'class': 'adam', 'epsilon': 1e-08}\n", " -0.825686\n", " -0.844577\n", - " NaN\n", + " None\n", " 0.389056\n", " 0.360854\n", - " NaN\n", + " None\n", " False\n", " \n", " \n", @@ -881,61 +883,61 @@ " MLE \\\n", "Group Experiment \n", "joint_training/conformer_coupling /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... -0.826781 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... -0.672552 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... -0.453089 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... -0.825730 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... False \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... False \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... -0.82573 \n", " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... -0.825686 \n", "\n", " dev MLE \\\n", "Group Experiment \n", "joint_training/conformer_coupling /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... -0.846025 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... NaN \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... -0.401680 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... False \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... False \n", " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... -0.844433 \n", " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... -0.844577 \n", "\n", - " devtrain MLE \\\n", - "Group Experiment \n", - "joint_training/conformer_coupling /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... NaN \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... -0.692025 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... -0.465325 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... NaN \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 
NaN \n", + " devtrain MLE \\\n", + "Group Experiment \n", + "joint_training/conformer_coupling /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... None \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... False \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... False \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... None \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... None \n", "\n", " DP loss \\\n", "Group Experiment \n", "joint_training/conformer_coupling /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 0.382039 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 0.444391 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 0.246206 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... False \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... False \n", " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 0.384814 \n", " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 0.389056 \n", "\n", - " DP dev loss \\\n", - "Group Experiment \n", - "joint_training/conformer_coupling /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 0.353906 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... NaN \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 0.344511 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 0.355806 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 0.360854 \n", + " DP dev loss \\\n", + "Group Experiment \n", + "joint_training/conformer_coupling /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 0.353906 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... False \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... False \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 0.355806 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 0.360854 \n", "\n", - " DP devtrain loss \\\n", - "Group Experiment \n", - "joint_training/conformer_coupling /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 
NaN \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 0.412498 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 0.229336 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... NaN \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... NaN \n", + " DP devtrain loss \\\n", + "Group Experiment \n", + "joint_training/conformer_coupling /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... None \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... False \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... False \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... None \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... None \n", "\n", " Joint \n", "Group Experiment \n", "joint_training/conformer_coupling /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... False \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... True \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... True \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... False \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... False \n", " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... False \n", " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 
False " ] }, - "execution_count": 17, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } @@ -961,6 +963,9 @@ " mle_dev_scores.append(False)\n", " dp_scores.append(False)\n", " dp_dev_scores.append(False)\n", + " mle_devtrain_scores.append(False)\n", + " dp_devtrain_scores.append(False)\n", + " joint.append(False)\n", " else:\n", " last_epoch_data = data[list(data.keys())[-1]]\n", " finished.append(True)\n", @@ -996,7 +1001,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 14, "metadata": {}, "outputs": [], "source": [ @@ -1008,64 +1013,69 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 15, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "| | Group | Experiment | sWER | autoMOS | autoMOS confidence | num_epochs | decoder dropout | mean only | encoder channels | LR | Optimizer | MLE | dev MLE | devtrain MLE | DP loss | DP dev loss | DP devtrain loss | Joint |\n", - "|---:|:----------------------------------|:------------------------------------------------------------------------------------------------------------------------|-------:|----------:|---------------------:|-------------:|------------------:|:------------|:-------------------|:-------------------------------------------------|:-------------------------------------|------:|----------:|---------------:|----------:|--------------:|-------------------:|:--------|\n", - "| 0 | joint_training/conformer_coupling | joint_training/conformer_coupling/raw_audio/no_ddi/glowTTS_100ep_pe1/ | 12.4 | 2.94 | nan | 100 | 0.05 | False | 768 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | {'class': 'adam', 'epsilon': 1e-08} | -0.83 | -0.85 | nan | 0.38 | 0.35 | nan | False |\n", - "| 3 | joint_training/conformer_coupling | joint_training/conformer_coupling/raw_audio/ddi/glowTTS_100ep_pe1_radam1e-9/ | 14.7 | 2.59 | nan | 100 | 0.05 | False | 768 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | {'class': 
'radam', 'epsilon': 1e-09} | -0.83 | -0.84 | nan | 0.38 | 0.36 | nan | False |\n", - "| 4 | joint_training/conformer_coupling | joint_training/conformer_coupling/raw_audio/ddi/glowTTS_100ep_pe1/ | 14.4 | 2.72 | nan | 100 | 0.05 | False | 768 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | {'class': 'adam', 'epsilon': 1e-08} | -0.83 | -0.84 | nan | 0.39 | 0.36 | nan | False |\n", - "| 5 | joint_training/conformer_coupling | joint_training/conformer_coupling/raw_audio/ddi/glowTTS/ | 97.7 | 1.61 | nan | 250 | 0.05 | False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | -0.82 | -0.83 | nan | 0.96 | 0.73 | nan | False |\n", - "| 7 | joint_training/conformer_coupling | joint_training/conformer_coupling/raw_audio/ddi/glowTTS_x_vector/ | 99.5 | 1.93 | nan | 250 | 0.05 | False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | -0.81 | -0.76 | -0.81 | 1 | 1.08 | 1.04 | False |\n", - "| 23 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2/enc768/400ep/dec_drop_0.05/ | 12.9 | 3.33 | 0.02 | 400 | 0.05 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 400: 1e-06] | {'class': 'radam', 'epsilon': 1e-09} | -0.82 | -0.83 | -0.84 | 0.38 | 0.46 | 0.34 | False |\n", - "| 24 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2/enc768/200ep/dec_drop_0.0/ | 96.2 | 2.29 | nan | 200 | 0 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.81 | -0.8 | -0.8 | 0.98 | 0.87 | 0.88 | False |\n", - "| 25 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2/enc768/200ep/dec_drop_0.05/ | 14.6 | 3.26 | 0.02 | 200 | 0.05 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.81 | -0.82 | -0.83 | 0.39 | 0.44 | 0.36 | False |\n", - 
"| 26 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2/enc768/100ep/dec_drop_0.05/ | 16.3 | 3.18 | 0.02 | 100 | 0.05 | False | 768 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.8 | -0.81 | -0.82 | 0.41 | 0.44 | 0.38 | False |\n", - "| 27 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2/enc768/200ep_long_cooldown/dec_drop_0.0/ | 16.7 | 3.32 | nan | 200 | 0 | False | 768 | [0: 1e-06, 49: 0.0005, 50: 0.0005, 200: 1e-06] | {'class': 'radam', 'epsilon': 1e-09} | -0.85 | -0.76 | -0.85 | 0.41 | 0.47 | 0.38 | False |\n", - "| 28 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2/enc768/200ep_long_cooldown/dec_drop_0.05/ | 15.6 | 3.21 | nan | 200 | 0.05 | False | 768 | [0: 1e-06, 49: 0.0005, 50: 0.0005, 200: 1e-06] | {'class': 'radam', 'epsilon': 1e-09} | -0.81 | -0.82 | -0.83 | 0.39 | 0.45 | 0.36 | False |\n", - "| 29 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2/enc192/200ep/dec_drop_0.05/ | 13.3 | 3.3 | nan | 200 | 0.05 | False | 192 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.81 | -0.82 | -0.83 | 0.38 | 0.41 | 0.34 | False |\n", - "| 30 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2/enc192/200ep/dec_drop_0.0/ | 95.4 | 2.41 | nan | 200 | 0 | False | 192 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.84 | -0.82 | -0.83 | 0.97 | 1.15 | 1.18 | False |\n", - "| 31 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc192/200ep/dec_drop_0.05_epsilon_1e-8/ | 96.7 | 2.27 | nan | 200 | 0.05 | False | 192 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-08} | 
-0.81 | -0.82 | -0.82 | 0.98 | 0.81 | 0.83 | False |\n", - "| 32 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc192/200ep/dec_drop_0.05/ | 98.8 | 1.88 | nan | 200 | 0.05 | False | 192 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.81 | -0.82 | -0.82 | 0.99 | 0.62 | 0.62 | False |\n", - "| 33 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc192/200ep/dec_drop_0.0/grad_clip_10/ | 12.8 | 3.34 | nan | 200 | 0 | False | 192 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.85 | -0.77 | -0.85 | 0.39 | 0.43 | 0.36 | False |\n", - "| 34 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc192/100ep/dec_drop_0.05/ | 98.1 | 1.52 | nan | 100 | 0.05 | False | 192 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.8 | -0.8 | -0.81 | 0.99 | 0.78 | 0.78 | False |\n", - "| 35 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc768/400ep/grad_clip_10/dec_drop_0.05/ | 14.4 | 3.29 | 0.02 | 400 | 0.05 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 400: 1e-06] | {'class': 'radam', 'epsilon': 1e-09} | -0.82 | -0.84 | -0.85 | 0.37 | 0.44 | 0.33 | False |\n", - "| 36 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc768/200ep/dec_drop_0.0/epsilon_1e-8/ | 95.9 | 3.06 | nan | 200 | 0 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-08} | -0.83 | -0.75 | -0.81 | 0.98 | 0.45 | 0.46 | False |\n", - "| 37 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc768/200ep/dec_drop_0.0/grad_clip_10/ | 14.5 | 3.31 | nan | 200 | 0 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 
1e-09} | -0.85 | -0.84 | -0.85 | 0.4 | 0.45 | 0.36 | False |\n", - "| 38 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc768/200ep/dec_drop_0.05_epsilon_1e-8/ | 15.7 | 3.35 | nan | 200 | 0.05 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-08} | -0.81 | -0.83 | -0.83 | 0.39 | 0.44 | 0.35 | False |\n", - "| 39 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc768/200ep/dec_drop_0.05/ | 15.1 | 3.36 | 0.02 | 200 | 0.05 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.81 | -0.83 | -0.83 | 0.39 | 0.44 | 0.35 | False |\n", - "| 40 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc768/100ep/dec_drop_0.00/ | 16.3 | 3.27 | nan | 100 | 0 | False | 768 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.83 | -0.82 | -0.83 | 0.41 | 0.44 | 0.38 | False |\n", - "| 41 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc768/100ep/dec_drop_0.05/ | 17.2 | 3.3 | nan | 100 | 0.05 | False | 768 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.8 | -0.81 | -0.82 | 0.4 | 0.43 | 0.37 | False |\n", - "| 42 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2_logdist_loss/ed_scale_0.1/ | 95.8 | 3.09 | nan | 200 | 0.05 | False | 192 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.78 | -0.78 | -0.78 | 1.08 | 2.07 | 2.08 | False |\n", - "| 43 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2_logdist_loss_grad_clip_10/ed_scale_1.0/ | 98.8 | 2.27 | nan | 200 | 0.05 | False | 192 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 
'epsilon': 1e-09} | -0.74 | -0.76 | -0.76 | 1.13 | 1.54 | 1.55 | False |\n", - "| 44 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2_logdist_loss_grad_clip_10/ed_scale_0.1/ | 95.7 | 3.21 | nan | 200 | 0.05 | False | 192 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.8 | -0.79 | -0.79 | 1.08 | 1.81 | 1.83 | False |\n", - "| 45 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS_simple_encoder/12cb/200ep/dec_drop_0.05/ | 70.3 | 3.47 | nan | 200 | 0.05 | False | - | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.8 | -0.82 | -0.83 | 0.43 | 0.5 | 0.4 | False |\n", - "| 46 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS_simple_encoder/20cb/200ep/dec_drop_0.05/ | 57.3 | 3.46 | nan | 200 | 0.05 | False | - | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.83 | -0.85 | -0.86 | 0.4 | 0.47 | 0.38 | False |\n", - "| 47 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2_conformer_coupling/enc768/200ep/dec_drop_0.05/ | 9.6 | 1.79 | 0.03 | 200 | 0.05 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.84 | -0.85 | -0.86 | 0.37 | 0.42 | 0.34 | False |\n", - "| 48 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2_multiscale/enc768/200ep/dec_drop_0.05/ | 100 | 1.25 | nan | 200 | 0.05 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.79 | -0.8 | -0.81 | 0.53 | 0.57 | 0.52 | False |\n", - "| 49 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS_batch_norm/enc768/200ep/dec_drop_0.05/ | 15.1 | 3.4 | nan | 200 | 0.05 | False | 768 
| [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.81 | -0.82 | -0.83 | 0.39 | 0.45 | 0.36 | False |\n", - "| 50 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector/enc768/100ep/dec_drop_0.05/ | 97.9 | 1.51 | nan | 100 | 0.05 | False | 768 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.78 | -0.79 | -0.79 | 0.99 | 1.38 | 1.38 | False |\n", - "| 75 | tts_architecture | tts_architecture/glow_tts/raw_audio/glowTTS/enc256/not_silence_preprocessed/ | 97.9 | 1.69 | nan | 200 | 0 | False | 256 | [0: 5e-05, 49: 0.0005, 50: 0.0005, 200: 1e-06] | {'class': 'adam', 'epsilon': 1e-09} | -0.85 | -0.54 | nan | 0.96 | 0.5 | nan | False |\n", - "| 76 | tts_architecture | tts_architecture/glow_tts/raw_audio/glowTTS/enc768/with_sigma/not_silence_preprocessed/ | 15.7 | 3.35 | nan | 200 | 0 | False | 768 | [0: 5e-05, 49: 0.0005, 50: 0.0005, 200: 1e-06] | {'class': 'adam', 'epsilon': 1e-09} | -0.85 | -0.83 | nan | 0.4 | 0.45 | nan | False |\n", - "| 77 | tts_architecture | tts_architecture/glow_tts/raw_audio/glowTTS/enc768/with_sigma/silence_preprocessed/ | 15.3 | 3.4 | nan | 200 | 0 | False | 768 | [0: 5e-05, 49: 0.0005, 50: 0.0005, 200: 1e-06] | {'class': 'adam', 'epsilon': 1e-09} | -0.74 | -0.46 | nan | 0.37 | 0.41 | nan | False |\n", - "| 78 | tts_architecture | tts_architecture/glow_tts/raw_audio/glowTTS/enc768/mean_only/silence_preprocessed/ | 16.9 | 3.52 | nan | 200 | 0 | True | 768 | [0: 5e-05, 49: 0.0005, 50: 0.0005, 200: 1e-06] | {'class': 'adam', 'epsilon': 1e-09} | -0.74 | -0.71 | nan | 0.37 | 0.41 | nan | False |\n", - "| 79 | tts_architecture | tts_architecture/glow_tts/raw_audio/glowTTS/enc768/mean_only/not_silence_preprocessed/ | 15.5 | 3.47 | nan | 200 | 0 | True | 768 | [0: 5e-05, 49: 0.0005, 50: 0.0005, 200: 1e-06] | {'class': 'adam', 'epsilon': 1e-09} | -0.85 | -0.76 | nan | 0.4 | 0.45 | nan | False |\n", - "| 80 | 
tts_architecture | tts_architecture/glow_tts/raw_audio/glowTTS/enc192/100ep/not_silence_preprocessed/ | 14.3 | 3.31 | nan | 100 | 0 | False | 192 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | {'class': 'adam', 'epsilon': 1e-09} | -0.84 | -0.77 | nan | 0.41 | 0.42 | nan | False |\n", - "| 81 | tts_architecture | tts_architecture/glow_tts/raw_audio/glowTTS/enc192/100ep/silence_preprocessed/ | 13.5 | 3.3 | nan | 100 | 0 | False | 192 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | {'class': 'adam', 'epsilon': 1e-09} | -0.82 | -0.81 | nan | 0.37 | 0.39 | nan | False |\n", - "| 82 | tts_architecture | tts_architecture/glow_tts/raw_audio/glowTTS/enc192/200ep/long_cooldown/not_silence_preprocessed/ | 14.7 | 3.29 | nan | 200 | 0 | False | 192 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 200: 1e-07] | {'class': 'adam', 'epsilon': 1e-08} | -0.84 | -0.69 | nan | 0.41 | 0.43 | nan | False |\n", - "| 83 | tts_architecture | tts_architecture/glow_tts/raw_audio/glowTTS/enc192/200ep/long_cooldown/silence_preprocessed/ | 13.5 | 3.28 | nan | 200 | 0 | False | 192 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 200: 1e-07] | {'class': 'adam', 'epsilon': 1e-08} | -0.82 | -0.81 | nan | 0.38 | 0.39 | nan | False |\n", - "| 84 | tts_architecture | tts_architecture/glow_tts/raw_audio/glowTTS_simple_encoder/silence_preprocessed/ | 56.4 | 3.33 | nan | 100 | 0 | - | - | [0: 0.0001, 49: 0.0005, 50: 0.0005, 100: 1e-06] | {'class': 'radam', 'epsilon': 1e-09} | -0.84 | -0.82 | nan | 0.38 | 0.43 | nan | False |\n", - "| 85 | tts_architecture | tts_architecture/glow_tts/raw_audio/glowTTS_ddi_actnorm/enc192/100ep/not_silence_preprocessed/LR_scheduled/ | 100 | 3.12 | nan | 100 | 0 | False | 192 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | {'class': 'adam', 'epsilon': 1e-09} | -0.84 | -0.8 | nan | 0.42 | 0.44 | nan | False |\n", - "| 86 | tts_architecture | tts_architecture/glow_tts/raw_audio/glowTTS_x_vector/enc768/100ep/not_silence_preprocessed/ | 20.9 | 3.31 | nan | 100 | 0 | False | 768 | [0: 1e-05, 
49: 0.0005, 50: 0.0005, 100: 1e-05] | {'class': 'adam', 'epsilon': 1e-09} | -0.81 | -0.8 | nan | 0.46 | 0.47 | nan | False |\n", - "| 87 | tts_architecture | tts_architecture/glow_tts/raw_audio/glowTTS_x_vector/enc192/100ep/not_silence_preprocessed/ | 95.4 | 2.63 | nan | 100 | 0 | False | 192 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | {'class': 'adam', 'epsilon': 1e-09} | -0.81 | -0.79 | nan | 0.97 | 1.27 | nan | False |\n", - "| 88 | tts_architecture | tts_architecture/glow_tts/raw_audio/glowTTS_nar_taco_encoder/not_silence_preprocessed/ | 25.2 | 3.46 | nan | 200 | 0 | - | - | [0: 1e-05, 49: 0.0005, 50: 0.0005, 200: 1e-05] | {'class': 'adam', 'epsilon': 1e-09} | -0.84 | -0.71 | nan | 0.41 | 0.46 | nan | False |\n", - "| 89 | tts_architecture | tts_architecture/glow_tts/raw_audio/glowTTS_nar_taco_encoder/silence_preprocessed/ | 25.9 | 3.39 | nan | 200 | 0 | - | - | [0: 1e-05, 49: 0.0005, 50: 0.0005, 200: 1e-05] | {'class': 'adam', 'epsilon': 1e-09} | -0.83 | -0.81 | nan | 0.39 | 0.42 | nan | False |\n", - "| 90 | tts_architecture | tts_architecture/glow_tts/raw_audio/glowTTS_nar_taco_encoder_no_blstm/silence_preprocessed/ | 105.6 | 3.17 | nan | 100 | 0 | - | - | [0: 0.0001, 49: 0.0005, 50: 0.0001, 100: 1e-06] | {'class': 'radam', 'epsilon': 1e-09} | -0.8 | -0.79 | nan | 0.88 | 0.87 | nan | False |\n" + "| | Group | Experiment | sWER | autoMOS | autoMOS confidence | num_epochs | decoder dropout | mean only | encoder channels | LR | Optimizer | MLE | dev MLE | devtrain MLE | DP loss | DP dev loss | DP devtrain loss | Joint |\n", + 
"|---:|:----------------------------------|:------------------------------------------------------------------------------------------------------------------------|-------:|----------:|---------------------:|-------------:|------------------:|:------------|:-------------------|:-------------------------------------------------|:-------------------------------------|----------:|----------:|---------------:|----------:|--------------:|-------------------:|:--------|\n", + "| 0 | joint_training/conformer_coupling | joint_training/conformer_coupling/raw_audio/no_ddi/glowTTS_100ep_pe1/ | 12.4 | 2.94 | nan | 100 | 0.05 | False | 768 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | {'class': 'adam', 'epsilon': 1e-08} | -0.826781 | -0.846025 | | 0.382039 | 0.353906 | | False |\n", + "| 1 | joint_training/conformer_coupling | joint_training/conformer_coupling/raw_audio/no_ddi/glowTTS_ASR_conformer_two_forward_pass/ | 20.9 | 2.45 | nan | 250 | 0.05 | False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | 0 | 0 | 0 | 0 | 0 | 0 | False |\n", + "| 2 | joint_training/conformer_coupling | joint_training/conformer_coupling/raw_audio/no_ddi/glowTTS_ASR_conformer_x_vector_v2/ | 5.2 | 2.14 | nan | 250 | 0.05 | False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | 0 | 0 | 0 | 0 | 0 | 0 | False |\n", + "| 3 | joint_training/conformer_coupling | joint_training/conformer_coupling/raw_audio/ddi/glowTTS_100ep_pe1_radam1e-9/ | 14.7 | 2.59 | nan | 100 | 0.05 | False | 768 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.82573 | -0.844433 | | 0.384814 | 0.355806 | | False |\n", + "| 4 | joint_training/conformer_coupling | joint_training/conformer_coupling/raw_audio/ddi/glowTTS_100ep_pe1/ | 14.4 | 2.72 | nan | 100 | 0.05 | False | 768 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | {'class': 'adam', 'epsilon': 1e-08} | -0.825686 | -0.844577 | | 
0.389056 | 0.360854 | | False |\n", + "| 5 | joint_training/conformer_coupling | joint_training/conformer_coupling/raw_audio/ddi/glowTTS/ | 97.7 | 1.61 | nan | 250 | 0.05 | False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | -0.824271 | -0.830326 | | 0.964433 | 0.725438 | | False |\n", + "| 7 | joint_training/conformer_coupling | joint_training/conformer_coupling/raw_audio/ddi/glowTTS_x_vector/ | 99.5 | 1.93 | nan | 250 | 0.05 | False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | -0.809305 | -0.755239 | -0.813211 | 1.00198 | 1.07779 | 1.04241 | False |\n", + "| 8 | joint_training/conformer_coupling | joint_training/conformer_coupling/raw_audio/ddi/glowTTS_ASR_conformer_x_vector/ | 5.2 | 2.14 | nan | 250 | 0.05 | False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | 0 | 0 | 0 | 0 | 0 | 0 | False |\n", + "| 23 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2/enc768/400ep/gin512/dec_drop_0.05/ | 12.9 | 3.33 | 0.02 | 400 | 0.05 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 400: 1e-06] | {'class': 'radam', 'epsilon': 1e-09} | -0.822881 | -0.834023 | -0.844881 | 0.376549 | 0.455059 | 0.340289 | False |\n", + "| 24 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2/enc768/400ep/dec_drop_0.05/ | 12.9 | 3.33 | 0.02 | 400 | 0.05 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 400: 1e-06] | {'class': 'radam', 'epsilon': 1e-09} | 0 | 0 | 0 | 0 | 0 | 0 | False |\n", + "| 25 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2/enc768/200ep/dec_drop_0.0/ | 96.2 | 2.29 | nan | 200 | 0 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.813627 | -0.797961 | -0.803228 | 0.975631 | 0.866819 | 
0.882062 | False |\n", + "| 26 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2/enc768/200ep/dec_drop_0.05/ | 14.6 | 3.26 | 0.02 | 200 | 0.05 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.809709 | -0.82178 | -0.830623 | 0.390625 | 0.4432 | 0.357604 | False |\n", + "| 27 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2/enc768/100ep/dec_drop_0.05/ | 16.3 | 3.18 | 0.02 | 100 | 0.05 | False | 768 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.795013 | -0.809868 | -0.816514 | 0.408781 | 0.440684 | 0.381301 | False |\n", + "| 28 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2/enc768/200ep_long_cooldown/dec_drop_0.0/ | 16.7 | 3.32 | nan | 200 | 0 | False | 768 | [0: 1e-06, 49: 0.0005, 50: 0.0005, 200: 1e-06] | {'class': 'radam', 'epsilon': 1e-09} | -0.846667 | -0.76151 | -0.845555 | 0.409992 | 0.468342 | 0.378645 | False |\n", + "| 29 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2/enc768/200ep_long_cooldown/dec_drop_0.05/ | 15.6 | 3.21 | nan | 200 | 0.05 | False | 768 | [0: 1e-06, 49: 0.0005, 50: 0.0005, 200: 1e-06] | {'class': 'radam', 'epsilon': 1e-09} | -0.80793 | -0.820998 | -0.829611 | 0.394024 | 0.448099 | 0.360531 | False |\n", + "| 30 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2/enc192/200ep/dec_drop_0.05/ | 13.3 | 3.3 | nan | 200 | 0.05 | False | 192 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.811426 | -0.824218 | -0.832125 | 0.375146 | 0.412309 | 0.342878 | False |\n", + "| 31 | joint_training/given_alignments | 
joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2/enc192/200ep/dec_drop_0.0/ | 95.4 | 2.41 | nan | 200 | 0 | False | 192 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.841694 | -0.819818 | -0.83016 | 0.965626 | 1.15174 | 1.18063 | False |\n", + "| 32 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc192/200ep/dec_drop_0.05_epsilon_1e-8/ | 96.7 | 2.27 | nan | 200 | 0.05 | False | 192 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-08} | -0.808277 | -0.817505 | -0.821914 | 0.982278 | 0.814173 | 0.826286 | False |\n", + "| 33 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc192/200ep/dec_drop_0.05/ | 98.8 | 1.88 | nan | 200 | 0.05 | False | 192 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.809477 | -0.815064 | -0.819234 | 0.989373 | 0.615026 | 0.622701 | False |\n", + "| 34 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc192/200ep/dec_drop_0.0/grad_clip_10/ | 12.8 | 3.34 | nan | 200 | 0 | False | 192 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.852834 | -0.773976 | -0.851532 | 0.392373 | 0.431975 | 0.362269 | False |\n", + "| 35 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc192/100ep/dec_drop_0.05/ | 98.1 | 1.52 | nan | 100 | 0.05 | False | 192 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.797353 | -0.804897 | -0.808365 | 0.989832 | 0.777391 | 0.784409 | False |\n", + "| 36 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc768/400ep/grad_clip_10/dec_drop_0.05/ | 14.4 | 3.29 | 0.02 | 400 | 0.05 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 400: 1e-06] | {'class': 
'radam', 'epsilon': 1e-09} | 0 | 0 | 0 | 0 | 0 | 0 | False |\n", + "| 37 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc768/400ep/gin512/grad_clip_10/dec_drop_0.05/ | 14.4 | 3.29 | 0.02 | 400 | 0.05 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 400: 1e-06] | {'class': 'radam', 'epsilon': 1e-09} | -0.824197 | -0.835761 | -0.846551 | 0.372315 | 0.440626 | 0.33371 | False |\n", + "| 38 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc768/200ep/dec_drop_0.0/epsilon_1e-8/ | 95.9 | 3.06 | nan | 200 | 0 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-08} | -0.82712 | -0.750288 | -0.813842 | 0.979225 | 0.454561 | 0.462357 | False |\n", + "| 39 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc768/200ep/dec_drop_0.0/grad_clip_10/ | 14.5 | 3.31 | nan | 200 | 0 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.850649 | -0.838267 | -0.849902 | 0.396277 | 0.445801 | 0.364157 | False |\n", + "| 40 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc768/200ep/dec_drop_0.05_epsilon_1e-8/ | 15.7 | 3.35 | nan | 200 | 0.05 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-08} | -0.811807 | -0.826657 | -0.834532 | 0.386067 | 0.438248 | 0.351594 | False |\n", + "| 41 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc768/200ep/dec_drop_0.05/ | 15.1 | 3.36 | 0.02 | 200 | 0.05 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.812429 | -0.827038 | -0.834987 | 0.387377 | 0.441218 | 0.352695 | False |\n", + "| 42 | joint_training/given_alignments | 
joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc768/100ep/dec_drop_0.00/ | 16.3 | 3.27 | nan | 100 | 0 | False | 768 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.831177 | -0.821651 | -0.830556 | 0.409946 | 0.437995 | 0.384226 | False |\n", + "| 43 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc768/100ep/dec_drop_0.05/ | 17.2 | 3.3 | nan | 100 | 0.05 | False | 768 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.798251 | -0.814911 | -0.821276 | 0.397457 | 0.432689 | 0.367326 | False |\n", + "| 44 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2_logdist_loss/ed_scale_0.1/ | 95.8 | 3.09 | nan | 200 | 0.05 | False | 192 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.782615 | -0.779794 | -0.782747 | 1.08489 | 2.0684 | 2.07562 | False |\n", + "| 45 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2_logdist_loss_grad_clip_10/ed_scale_1.0/ | 98.8 | 2.27 | nan | 200 | 0.05 | False | 192 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.743438 | -0.756143 | -0.757724 | 1.13249 | 1.54302 | 1.5535 | False |\n", + "| 46 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2_logdist_loss_grad_clip_10/ed_scale_0.1/ | 95.7 | 3.21 | nan | 200 | 0.05 | False | 192 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.798058 | -0.786642 | -0.791474 | 1.07707 | 1.81449 | 1.83075 | False |\n", + "| 47 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS_simple_encoder/12cb/200ep/dec_drop_0.05/ | 70.3 | 3.47 | nan | 200 | 0.05 | False | - | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] 
| {'class': 'radam', 'epsilon': 1e-09} | -0.80368 | -0.821293 | -0.825658 | 0.428016 | 0.498157 | 0.402439 | False |\n", + "| 48 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS_simple_encoder/20cb/200ep/dec_drop_0.05/ | 57.3 | 3.46 | nan | 200 | 0.05 | False | - | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.834191 | -0.847676 | -0.856235 | 0.402502 | 0.472223 | 0.376431 | False |\n", + "| 49 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2_conformer_coupling/enc768/200ep/dec_drop_0.05/ | 9.6 | 1.79 | 0.03 | 200 | 0.05 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.842848 | -0.852489 | -0.860671 | 0.370438 | 0.41724 | 0.344025 | False |\n", + "| 50 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2_multiscale/enc768/200ep/dec_drop_0.05/ | 100 | 1.25 | nan | 200 | 0.05 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.785697 | -0.800581 | -0.807986 | 0.533892 | 0.569833 | 0.522571 | False |\n", + "| 51 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS_batch_norm/enc768/200ep/dec_drop_0.05/ | 15.1 | 3.4 | nan | 200 | 0.05 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.80533 | -0.817184 | -0.826384 | 0.389733 | 0.44642 | 0.357169 | False |\n", + "| 52 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector/enc768/100ep/dec_drop_0.05/ | 97.9 | 1.51 | nan | 100 | 0.05 | False | 768 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.781911 | -0.787168 | -0.790923 | 0.991812 | 1.38362 | 1.38305 | False |\n", + "| 77 | tts_architecture | 
tts_architecture/glow_tts/raw_audio/glowTTS/enc256/not_silence_preprocessed/ | 97.9 | 1.69 | nan | 200 | 0 | False | 256 | [0: 5e-05, 49: 0.0005, 50: 0.0005, 200: 1e-06] | {'class': 'adam', 'epsilon': 1e-09} | -0.851128 | -0.54391 | | 0.962189 | 0.501158 | | False |\n", + "| 78 | tts_architecture | tts_architecture/glow_tts/raw_audio/glowTTS/enc768/with_sigma/not_silence_preprocessed/ | 15.7 | 3.35 | nan | 200 | 0 | False | 768 | [0: 5e-05, 49: 0.0005, 50: 0.0005, 200: 1e-06] | {'class': 'adam', 'epsilon': 1e-09} | -0.851343 | -0.833649 | | 0.401905 | 0.450686 | | False |\n", + "| 79 | tts_architecture | tts_architecture/glow_tts/raw_audio/glowTTS/enc768/with_sigma/silence_preprocessed/ | 15.3 | 3.4 | nan | 200 | 0 | False | 768 | [0: 5e-05, 49: 0.0005, 50: 0.0005, 200: 1e-06] | {'class': 'adam', 'epsilon': 1e-09} | -0.736796 | -0.463881 | | 0.371603 | 0.409141 | | False |\n", + "| 80 | tts_architecture | tts_architecture/glow_tts/raw_audio/glowTTS/enc768/mean_only/silence_preprocessed/ | 16.9 | 3.52 | nan | 200 | 0 | True | 768 | [0: 5e-05, 49: 0.0005, 50: 0.0005, 200: 1e-06] | {'class': 'adam', 'epsilon': 1e-09} | -0.735579 | -0.714157 | | 0.366752 | 0.410529 | | False |\n", + "| 81 | tts_architecture | tts_architecture/glow_tts/raw_audio/glowTTS/enc768/mean_only/not_silence_preprocessed/ | 15.5 | 3.47 | nan | 200 | 0 | True | 768 | [0: 5e-05, 49: 0.0005, 50: 0.0005, 200: 1e-06] | {'class': 'adam', 'epsilon': 1e-09} | -0.85035 | -0.763903 | | 0.395024 | 0.448581 | | False |\n", + "| 82 | tts_architecture | tts_architecture/glow_tts/raw_audio/glowTTS/enc192/100ep/not_silence_preprocessed/ | 14.3 | 3.31 | nan | 100 | 0 | False | 192 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | {'class': 'adam', 'epsilon': 1e-09} | -0.8365 | -0.770533 | | 0.406652 | 0.423657 | | False |\n", + "| 83 | tts_architecture | tts_architecture/glow_tts/raw_audio/glowTTS/enc192/100ep/silence_preprocessed/ | 13.5 | 3.3 | nan | 100 | 0 | False | 192 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 
100: 1e-05] | {'class': 'adam', 'epsilon': 1e-09} | -0.819508 | -0.80574 | | 0.373986 | 0.385518 | | False |\n", + "| 84 | tts_architecture | tts_architecture/glow_tts/raw_audio/glowTTS/enc192/200ep/long_cooldown/not_silence_preprocessed/ | 14.7 | 3.29 | nan | 200 | 0 | False | 192 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 200: 1e-07] | {'class': 'adam', 'epsilon': 1e-08} | -0.839926 | -0.694879 | | 0.411271 | 0.432932 | | False |\n", + "| 85 | tts_architecture | tts_architecture/glow_tts/raw_audio/glowTTS/enc192/200ep/long_cooldown/silence_preprocessed/ | 13.5 | 3.28 | nan | 200 | 0 | False | 192 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 200: 1e-07] | {'class': 'adam', 'epsilon': 1e-08} | -0.82215 | -0.807288 | | 0.377117 | 0.390598 | | False |\n", + "| 86 | tts_architecture | tts_architecture/glow_tts/raw_audio/glowTTS_simple_encoder/silence_preprocessed/ | 56.4 | 3.33 | nan | 100 | 0 | - | - | [0: 0.0001, 49: 0.0005, 50: 0.0005, 100: 1e-06] | {'class': 'radam', 'epsilon': 1e-09} | -0.842638 | -0.822322 | | 0.384891 | 0.428803 | | False |\n", + "| 87 | tts_architecture | tts_architecture/glow_tts/raw_audio/glowTTS_ddi_actnorm/enc192/100ep/not_silence_preprocessed/LR_scheduled/ | 100 | 3.12 | nan | 100 | 0 | False | 192 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | {'class': 'adam', 'epsilon': 1e-09} | -0.837399 | -0.801737 | | 0.42113 | 0.441682 | | False |\n", + "| 88 | tts_architecture | tts_architecture/glow_tts/raw_audio/glowTTS_x_vector/enc768/100ep/not_silence_preprocessed/ | 20.9 | 3.31 | nan | 100 | 0 | False | 768 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | {'class': 'adam', 'epsilon': 1e-09} | -0.810979 | -0.801251 | | 0.459904 | 0.467786 | | False |\n", + "| 89 | tts_architecture | tts_architecture/glow_tts/raw_audio/glowTTS_x_vector/enc192/100ep/not_silence_preprocessed/ | 95.4 | 2.63 | nan | 100 | 0 | False | 192 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | {'class': 'adam', 'epsilon': 1e-09} | -0.805523 | -0.788462 | | 0.972006 | 1.27149 | | 
False |\n", + "| 90 | tts_architecture | tts_architecture/glow_tts/raw_audio/glowTTS_nar_taco_encoder/not_silence_preprocessed/ | 25.2 | 3.46 | nan | 200 | 0 | - | - | [0: 1e-05, 49: 0.0005, 50: 0.0005, 200: 1e-05] | {'class': 'adam', 'epsilon': 1e-09} | -0.844504 | -0.70768 | | 0.411171 | 0.455673 | | False |\n", + "| 91 | tts_architecture | tts_architecture/glow_tts/raw_audio/glowTTS_nar_taco_encoder/silence_preprocessed/ | 25.9 | 3.39 | nan | 200 | 0 | - | - | [0: 1e-05, 49: 0.0005, 50: 0.0005, 200: 1e-05] | {'class': 'adam', 'epsilon': 1e-09} | -0.829622 | -0.813688 | | 0.386848 | 0.423963 | | False |\n", + "| 92 | tts_architecture | tts_architecture/glow_tts/raw_audio/glowTTS_nar_taco_encoder_no_blstm/silence_preprocessed/ | 105.6 | 3.17 | nan | 100 | 0 | - | - | [0: 0.0001, 49: 0.0005, 50: 0.0001, 100: 1e-06] | {'class': 'radam', 'epsilon': 1e-09} | -0.799977 | -0.788486 | | 0.8847 | 0.873624 | | False |\n" ] } ], @@ -1075,57 +1085,54 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 16, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "| | Group | Experiment | sWER | autoMOS | autoMOS confidence | num_epochs | decoder dropout | mean only | encoder channels | LR | Optimizer | MLE | dev MLE | devtrain MLE | DP loss | DP dev loss | DP devtrain loss | Joint |\n", - "|---:|:----------------------------------|:----------------------------------------------------------------------------------------------------------------------------------------------------|-------:|----------:|---------------------:|-------------:|------------------:|:------------|-------------------:|:-------------------------------------------------|:-------------------------------------|------:|----------:|---------------:|----------:|--------------:|-------------------:|:--------|\n", - "| 1 | joint_training/conformer_coupling | joint_training/conformer_coupling/raw_audio/no_ddi/glowTTS_ASR_conformer_two_forward_pass/ | 20.9 | 2.45 
| nan | 250 | 0.05 | False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | -0.67 | nan | -0.69 | 0.44 | nan | 0.41 | True |\n", - "| 2 | joint_training/conformer_coupling | joint_training/conformer_coupling/raw_audio/no_ddi/glowTTS_ASR_conformer_x_vector_v2/ | 5.2 | 2.14 | nan | 250 | 0.05 | False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | -0.45 | -0.4 | -0.47 | 0.25 | 0.34 | 0.23 | True |\n", - "| 6 | joint_training/conformer_coupling | joint_training/conformer_coupling/raw_audio/ddi/glowTTS_ASR_conformer_two_forward_pass/ | 95.7 | 1.82 | nan | 250 | 0.05 | False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | -0.66 | nan | -0.67 | 1 | nan | 2.09 | True |\n", - "| 8 | joint_training/conformer_coupling | joint_training/conformer_coupling/raw_audio/ddi/glowTTS_ASR_conformer_x_vector/ | 5.2 | 2.14 | nan | 250 | 0.05 | False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | -0.45 | -0.4 | -0.47 | 0.25 | 0.34 | 0.23 | True |\n", - "| 9 | joint_training/default | joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_spec_augment_ctc_scale_0.1/ | 4.6 | 3.11 | 0.02 | 250 | 0.05 | False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | -0.74 | -0.69 | -0.76 | 0.35 | 0.48 | 0.33 | True |\n", - "| 10 | joint_training/default | joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass/ | 9.5 | 2.75 | 0.02 | 250 | 0.05 | False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | -0.61 | -0.56 | -0.63 | 0.42 | 0.58 | 0.4 | True |\n", - "| 11 | joint_training/default | joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2_ctc_scale_0.1/ | 11.2 | 3.18 | 0.02 | 250 | 0.05 | False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 
1e-08] | {'class': 'adam', 'epsilon': 1e-08} | -0.76 | -0.69 | -0.78 | 0.4 | 0.56 | 0.38 | True |\n", - "| 12 | joint_training/default | joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_ctc_scale_0.1/ | 13.3 | 3.12 | 0.02 | 250 | 0.05 | False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | -0.75 | -0.69 | -0.77 | 0.42 | 0.57 | 0.4 | True |\n", - "| 13 | joint_training/default | joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_spec_augment/ | 5.2 | 2.3 | 0.02 | 250 | 0.05 | False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | -0.51 | -0.46 | -0.52 | 0.25 | 0.35 | 0.23 | True |\n", - "| 14 | joint_training/default | joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2/ | 9.9 | 2.64 | 0.02 | 250 | 0.05 | False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | -0.62 | -0.57 | -0.64 | 0.4 | 0.55 | 0.38 | True |\n", - "| 15 | joint_training/default | joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_ctc_scale_0.1/ | 7.9 | 3.19 | 0.02 | 250 | 0.05 | False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | -0.75 | -0.7 | -0.77 | 0.37 | 0.51 | 0.35 | True |\n", - "| 16 | joint_training/default | joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2/ | 6.7 | 2.48 | 0.02 | 250 | 0.05 | False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | -0.58 | -0.53 | -0.59 | 0.35 | 0.46 | 0.33 | True |\n", - "| 17 | joint_training/default | joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_ctc_scale_0.1/ | 7.6 | 3.15 | 0.02 | 250 | 0.05 | False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | -0.75 | -0.7 | -0.77 | 0.38 | 0.5 | 0.36 | True |\n", - "| 18 | joint_training/default | 
joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment_ctc_scale_0.1/ | 4.4 | 3.2 | 0.02 | 250 | 0.05 | False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | -0.75 | -0.69 | -0.76 | 0.33 | 0.46 | 0.31 | True |\n", - "| 19 | joint_training/default | joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass/ | 14.8 | 2.61 | 0.03 | 250 | 0.05 | False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | -0.68 | nan | -0.7 | 0.4 | nan | 0.37 | True |\n", - "| 20 | joint_training/default | joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment/ | 6 | 2.32 | 0.02 | 250 | 0.05 | False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | -0.52 | -0.47 | -0.54 | 0.23 | 0.35 | 0.2 | True |\n", - "| 21 | joint_training/default | joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_ctc_scale_0.1/ | 15.2 | 3.16 | 0.02 | 250 | 0.05 | False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | -0.78 | nan | -0.81 | 0.38 | nan | 0.36 | True |\n", - "| 22 | joint_training/default | joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector/ | 98 | 1.58 | 0.01 | 250 | 0.05 | False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | -0.55 | -0.5 | -0.55 | 1.02 | 0.82 | 0.7 | True |\n", - "| 51 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector/specaug/ce_ls_0.1/ | 25.2 | 3.11 | nan | 250 | 0.05 | False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | -0.8 | -0.82 | -0.82 | 0.43 | 0.46 | 0.41 | True |\n", - "| 52 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector/specaug/ce_ls_1/ | 88.5 | 1.87 | nan | 
250 | 0.05 | False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | -0.72 | -0.73 | -0.74 | 0.75 | 0.79 | 0.69 | True |\n", - "| 53 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector/200ep/basic_init/no_specaug/tts_target_size/ce_ls_0.1/ | 98.1 | 1.82 | nan | 200 | 0.05 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.79 | -0.79 | -0.8 | 0.89 | 0.57 | 0.58 | True |\n", - "| 54 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector/200ep/tts_pretrained/no_specaug/tts_target_size/ce_ls_0.1/ | 32.9 | 2.77 | nan | 200 | 0.05 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.8 | -0.82 | -0.83 | 0.44 | 0.49 | 0.41 | True |\n", - "| 55 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector/no_specaug/ce_ls_1/ | 83.9 | 1.92 | nan | 250 | 0.05 | False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | -0.69 | -0.7 | -0.71 | 0.76 | 0.94 | 0.72 | True |\n", - "| 56 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector/no_specaug/ce_ls_0.1/ | 41.9 | 2.75 | nan | 250 | 0.05 | False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | -0.79 | -0.81 | -0.82 | 0.51 | 0.54 | 0.49 | True |\n", - "| 57 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_cnn_x_vector/tts_pretrained/no_specaug/tts_target_size/ce_ls_0.1/ | 14.1 | 3.2 | nan | 200 | 0.05 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.79 | -0.81 | -0.81 | 0.4 | 0.45 | 0.36 | True |\n", - "| 58 | 
joint_training/given_alignments | joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_cnn_x_vector/basic_init/no_specaug/tts_target_size/ce_ls_0.1/ | 23.3 | 3.13 | nan | 200 | 0.05 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.78 | -0.79 | -0.79 | 0.43 | 0.48 | 0.4 | True |\n", - "| 59 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector_v2/mean_only/200ep/basic_init/no_specaug/tts_target_size/ce_ls_0.1/ | 96.4 | 2.37 | nan | 200 | 0.05 | True | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.8 | -0.8 | -0.81 | 0.93 | 0.72 | 0.73 | True |\n", - "| 60 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector_v2/200ep/tts_pretrained/no_specaug/tts_target_size/ce_ls_0.1/ | 91.7 | 2.66 | nan | 200 | 0.05 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.81 | -0.82 | -0.83 | 0.63 | 0.63 | 0.59 | True |\n", - "| 61 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector_v2/200ep/basic_init/no_specaug/tts_target_size/ce_ls_1.0/ | 97.2 | 2.12 | nan | 200 | 0.05 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.8 | -0.8 | -0.81 | 0.83 | 0.61 | 0.62 | True |\n", - "| 62 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector_v2/200ep/basic_init/no_specaug/tts_target_size/ce_ls_0.1/ | 99.9 | 1.56 | nan | 200 | 0.05 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.8 | -0.77 | -0.8 | 0.82 | 0.53 | 0.54 | True |\n", - "| 63 | joint_training/given_alignments | 
joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_cnn_x_vector_v2/tts_pretrained/ce_ls_0.1/ | 99.7 | 1.83 | nan | 200 | 0.05 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.79 | -0.81 | -0.81 | 0.05 | 0.05 | 0.04 | True |\n", - "| 64 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_cnn_x_vector_v2/tts_pretrained/ce_ls_0.01/ | 97.2 | 1.97 | nan | 200 | 0.05 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.8 | -0.81 | -0.82 | 0.06 | 0.06 | 0.05 | True |\n", - "| 65 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_cnn_x_vector_v2/basic_init/ce_ls_0.1/ | 99.1 | 1.64 | nan | 200 | 0.05 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.78 | -0.8 | -0.8 | 0.05 | 0.05 | 0.04 | True |\n", - "| 66 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_cnn_x_vector_v2/basic_init/ce_ls_0.01/ | 98.2 | 2.18 | nan | 200 | 0.05 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.79 | -0.8 | -0.81 | 0.06 | 0.06 | 0.05 | True |\n", - "| 67 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_ffn_x_vector_v2/200ep/tts_pretrained/ce_ls_1.0/ | 100 | 1.97 | nan | 200 | 0.05 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.73 | -0.75 | -0.75 | 0.08 | 0.08 | 0.07 | True |\n", - "| 68 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_ffn_x_vector_v2/200ep/tts_pretrained/ce_ls_0.1/ | 100 | 
1.98 | nan | 200 | 0.05 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.76 | -0.78 | -0.78 | 0.08 | 0.08 | 0.07 | True |\n", - "| 69 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_ffn_x_vector_v2/200ep/basic_init/ce_ls_1.0/ | 100 | 1.98 | nan | 200 | 0.05 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.73 | -0.74 | -0.75 | 0.07 | 0.08 | 0.06 | True |\n", - "| 70 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_ffn_x_vector_v2/200ep/basic_init/ce_ls_0.1/ | 100 | 1.98 | nan | 200 | 0.05 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.76 | -0.77 | -0.78 | 0.07 | 0.08 | 0.06 | True |\n", - "| 71 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_ffn_x_vector/200ep/basic_init/ce_ls_0.1/ | 97.9 | 1.78 | nan | 200 | 0.05 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.78 | -0.79 | -0.8 | 0.26 | 0.24 | 0.25 | True |\n", - "| 72 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_ffn_x_vector/200ep/tts_pretrained/ce_ls_0.1/ | 95.4 | 2.53 | nan | 200 | 0.05 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.79 | -0.81 | -0.81 | 0.25 | 0.22 | 0.23 | True |\n", - "| 73 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_cnn_x_vector/tts_pretrained/ce_ls_0.1/ | 95.2 | 2.18 | nan | 200 | 0.05 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.78 | -0.79 | 
-0.79 | 0.26 | 0.24 | 0.25 | True |\n", - "| 74 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_cnn_x_vector/basic_init/ce_ls_0.1/ | 96.1 | 2.35 | nan | 200 | 0.05 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.77 | -0.78 | -0.79 | 0.27 | 0.26 | 0.26 | True |\n" + "| | Group | Experiment | sWER | autoMOS | autoMOS confidence | num_epochs | decoder dropout | mean only | encoder channels | LR | Optimizer | MLE | dev MLE | devtrain MLE | DP loss | DP dev loss | DP devtrain loss | Joint |\n", + "|---:|:----------------------------------|:----------------------------------------------------------------------------------------------------------------------------------------------------|-------:|----------:|---------------------:|-------------:|------------------:|:------------|-------------------:|:-------------------------------------------------|:-------------------------------------|----------:|----------:|---------------:|----------:|--------------:|-------------------:|:--------|\n", + "| 6 | joint_training/conformer_coupling | joint_training/conformer_coupling/raw_audio/ddi/glowTTS_ASR_conformer_two_forward_pass/ | 95.7 | 1.82 | nan | 250 | 0.05 | False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | -0.657168 | | -0.670671 | 1.0046 | | 2.09134 | True |\n", + "| 9 | joint_training/default | joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_spec_augment_ctc_scale_0.1/ | 4.6 | 3.11 | 0.02 | 250 | 0.05 | False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | -0.74139 | -0.690156 | -0.759844 | 0.346902 | 0.481245 | 0.329289 | True |\n", + "| 10 | joint_training/default | joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass/ | 9.5 | 2.75 | 0.02 | 250 | 0.05 | False | 768 | [0: 7e-06, 109: 0.0007, 
110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | -0.610471 | -0.555647 | -0.629746 | 0.416831 | 0.583815 | 0.403638 | True |\n", + "| 11 | joint_training/default | joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2_ctc_scale_0.1/ | 11.2 | 3.18 | 0.02 | 250 | 0.05 | False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | -0.758909 | -0.69436 | -0.778751 | 0.400894 | 0.55533 | 0.379635 | True |\n", + "| 12 | joint_training/default | joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_ctc_scale_0.1/ | 13.3 | 3.12 | 0.02 | 250 | 0.05 | False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | -0.753013 | -0.694853 | -0.771774 | 0.417359 | 0.565613 | 0.399888 | True |\n", + "| 13 | joint_training/default | joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_spec_augment/ | 5.2 | 2.3 | 0.02 | 250 | 0.05 | False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | -0.506816 | -0.458788 | -0.521236 | 0.245737 | 0.346906 | 0.225934 | True |\n", + "| 14 | joint_training/default | joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2/ | 9.9 | 2.64 | 0.02 | 250 | 0.05 | False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | -0.621646 | -0.565345 | -0.641523 | 0.402243 | 0.552866 | 0.379653 | True |\n", + "| 15 | joint_training/default | joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_ctc_scale_0.1/ | 7.9 | 3.19 | 0.02 | 250 | 0.05 | False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | -0.754171 | -0.704233 | -0.772961 | 0.369618 | 0.50542 | 0.346386 | True |\n", + "| 16 | joint_training/default | joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2/ | 6.7 | 2.48 | 0.02 | 250 | 0.05 | False | 768 | 
[0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | -0.577228 | -0.531676 | -0.591349 | 0.349734 | 0.46387 | 0.326762 | True |\n", + "| 17 | joint_training/default | joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_ctc_scale_0.1/ | 7.6 | 3.15 | 0.02 | 250 | 0.05 | False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | -0.750811 | -0.700515 | -0.769183 | 0.380467 | 0.502884 | 0.36247 | True |\n", + "| 18 | joint_training/default | joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment_ctc_scale_0.1/ | 4.4 | 3.2 | 0.02 | 250 | 0.05 | False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | -0.745622 | -0.693835 | -0.764738 | 0.330255 | 0.457476 | 0.306534 | True |\n", + "| 19 | joint_training/default | joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass/ | 14.8 | 2.61 | 0.03 | 250 | 0.05 | False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | -0.679734 | | -0.699266 | 0.399065 | | 0.370426 | True |\n", + "| 20 | joint_training/default | joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment/ | 6 | 2.32 | 0.02 | 250 | 0.05 | False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | -0.520932 | -0.470559 | -0.535346 | 0.229065 | 0.345538 | 0.203592 | True |\n", + "| 21 | joint_training/default | joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_ctc_scale_0.1/ | 15.2 | 3.16 | 0.02 | 250 | 0.05 | False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | -0.784652 | | -0.805268 | 0.383526 | | 0.355401 | True |\n", + "| 22 | joint_training/default | joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector/ | 98 | 1.58 | 0.01 | 250 | 0.05 | False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 
1e-08] | {'class': 'adam', 'epsilon': 1e-08} | -0.552728 | -0.497187 | -0.548408 | 1.01904 | 0.823625 | 0.701494 | True |\n", + "| 53 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector/specaug/ce_ls_0.1/ | 25.2 | 3.11 | nan | 250 | 0.05 | False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | -0.802836 | -0.81678 | -0.823897 | 0.434087 | 0.464465 | 0.407059 | True |\n", + "| 54 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector/specaug/ce_ls_1/ | 88.5 | 1.87 | nan | 250 | 0.05 | False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | -0.72108 | -0.734634 | -0.74084 | 0.747725 | 0.789389 | 0.691204 | True |\n", + "| 55 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector/200ep/basic_init/no_specaug/tts_target_size/ce_ls_0.1/ | 98.1 | 1.82 | nan | 200 | 0.05 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.785492 | -0.790627 | -0.79625 | 0.890597 | 0.574966 | 0.575373 | True |\n", + "| 56 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector/200ep/tts_pretrained/no_specaug/tts_target_size/ce_ls_0.1/ | 32.9 | 2.77 | nan | 200 | 0.05 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.803716 | -0.816318 | -0.825113 | 0.439343 | 0.490279 | 0.409061 | True |\n", + "| 57 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector/no_specaug/ce_ls_1/ | 83.9 | 1.92 | nan | 250 | 0.05 | False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | -0.690569 | -0.702706 | -0.708801 | 0.760198 | 0.943391 | 0.716967 | True 
|\n", + "| 58 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector/no_specaug/ce_ls_0.1/ | 41.9 | 2.75 | nan | 250 | 0.05 | False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | -0.794999 | -0.80951 | -0.816001 | 0.505974 | 0.538897 | 0.493393 | True |\n", + "| 59 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_cnn_x_vector/tts_pretrained/no_specaug/tts_target_size/ce_ls_0.1/ | 14.1 | 3.2 | nan | 200 | 0.05 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.793062 | -0.805013 | -0.81407 | 0.39621 | 0.451179 | 0.36325 | True |\n", + "| 60 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_cnn_x_vector/basic_init/no_specaug/tts_target_size/ce_ls_0.1/ | 23.3 | 3.13 | nan | 200 | 0.05 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.77557 | -0.785888 | -0.794856 | 0.432057 | 0.483314 | 0.399083 | True |\n", + "| 61 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector_v2/mean_only/200ep/basic_init/no_specaug/tts_target_size/ce_ls_0.1/ | 96.4 | 2.37 | nan | 200 | 0.05 | True | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.803728 | -0.802102 | -0.808006 | 0.932394 | 0.721534 | 0.725609 | True |\n", + "| 62 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector_v2/200ep/tts_pretrained/no_specaug/tts_target_size/ce_ls_0.1/ | 91.7 | 2.66 | nan | 200 | 0.05 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.805783 | -0.821988 | -0.827055 | 0.625962 | 0.62989 | 0.592278 | True |\n", + "| 63 | 
joint_training/given_alignments | joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector_v2/200ep/basic_init/no_specaug/tts_target_size/ce_ls_1.0/ | 97.2 | 2.12 | nan | 200 | 0.05 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.801181 | -0.803144 | -0.808404 | 0.834689 | 0.611662 | 0.619833 | True |\n", + "| 64 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector_v2/200ep/basic_init/no_specaug/tts_target_size/ce_ls_0.1/ | 99.9 | 1.56 | nan | 200 | 0.05 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.799504 | -0.772613 | -0.798104 | 0.818331 | 0.531823 | 0.536853 | True |\n", + "| 65 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_cnn_x_vector_v2/tts_pretrained/ce_ls_0.1/ | 99.7 | 1.83 | nan | 200 | 0.05 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.790621 | -0.805995 | -0.811551 | 0.0523576 | 0.0489216 | 0.0415039 | True |\n", + "| 66 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_cnn_x_vector_v2/tts_pretrained/ce_ls_0.01/ | 97.2 | 1.97 | nan | 200 | 0.05 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.797074 | -0.812303 | -0.818065 | 0.0631472 | 0.0577708 | 0.0508676 | True |\n", + "| 67 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_cnn_x_vector_v2/basic_init/ce_ls_0.1/ | 99.1 | 1.64 | nan | 200 | 0.05 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.782616 | -0.797878 | -0.803697 | 0.0519058 | 0.0493118 | 0.0413708 | True |\n", + "| 68 | 
joint_training/given_alignments | joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_cnn_x_vector_v2/basic_init/ce_ls_0.01/ | 98.2 | 2.18 | nan | 200 | 0.05 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.787278 | -0.802672 | -0.808277 | 0.0638544 | 0.0582362 | 0.0520775 | True |\n", + "| 69 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_ffn_x_vector_v2/200ep/tts_pretrained/ce_ls_1.0/ | 100 | 1.97 | nan | 200 | 0.05 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.72889 | -0.745734 | -0.75171 | 0.075977 | 0.0820037 | 0.0666421 | True |\n", + "| 70 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_ffn_x_vector_v2/200ep/tts_pretrained/ce_ls_0.1/ | 100 | 1.98 | nan | 200 | 0.05 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.760167 | -0.775606 | -0.781364 | 0.075069 | 0.0806164 | 0.0651269 | True |\n", + "| 71 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_ffn_x_vector_v2/200ep/basic_init/ce_ls_1.0/ | 100 | 1.98 | nan | 200 | 0.05 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.729906 | -0.74441 | -0.750384 | 0.0698329 | 0.0841703 | 0.0607633 | True |\n", + "| 72 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_ffn_x_vector_v2/200ep/basic_init/ce_ls_0.1/ | 100 | 1.98 | nan | 200 | 0.05 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.757141 | -0.771994 | -0.778037 | 0.071102 | 0.0818373 | 0.0614895 | True |\n", + "| 73 | 
joint_training/given_alignments | joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_ffn_x_vector/200ep/basic_init/ce_ls_0.1/ | 97.9 | 1.78 | nan | 200 | 0.05 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.779347 | -0.79411 | -0.800602 | 0.255822 | 0.238816 | 0.246318 | True |\n", + "| 74 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_ffn_x_vector/200ep/tts_pretrained/ce_ls_0.1/ | 95.4 | 2.53 | nan | 200 | 0.05 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.792338 | -0.806682 | -0.812767 | 0.247139 | 0.215995 | 0.232498 | True |\n", + "| 75 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_cnn_x_vector/tts_pretrained/ce_ls_0.1/ | 95.2 | 2.18 | nan | 200 | 0.05 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.775121 | -0.788423 | -0.794789 | 0.263264 | 0.235715 | 0.253516 | True |\n", + "| 76 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_cnn_x_vector/basic_init/ce_ls_0.1/ | 96.1 | 2.35 | nan | 200 | 0.05 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.76667 | -0.781423 | -0.785757 | 0.265068 | 0.264766 | 0.261945 | True |\n" ] } ], @@ -1135,7 +1142,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 17, "metadata": {}, "outputs": [ { @@ -1144,13 +1151,13 @@ "" ] }, - "execution_count": 20, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" }, { "data": { - "image/png": "", + "image/png": "", "text/plain": [ "
" ] @@ -1165,7 +1172,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 18, "metadata": {}, "outputs": [ { @@ -1174,13 +1181,13 @@ "" ] }, - "execution_count": 21, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" }, { "data": { - "image/png": "", + "image/png": "", "text/plain": [ "
" ] @@ -1195,7 +1202,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 19, "metadata": {}, "outputs": [ { @@ -1204,13 +1211,13 @@ "" ] }, - "execution_count": 22, + "execution_count": 19, "metadata": {}, "output_type": "execute_result" }, { "data": { - "image/png": "", + "image/png": "", "text/plain": [ "
" ] @@ -1225,7 +1232,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 20, "metadata": {}, "outputs": [ { @@ -1234,13 +1241,13 @@ "" ] }, - "execution_count": 23, + "execution_count": 20, "metadata": {}, "output_type": "execute_result" }, { "data": { - "image/png": "iVBORw0KGgoAAAANSUhEUgAAAkAAAAGwCAYAAABB4NqyAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjcuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/bCgiHAAAACXBIWXMAAA9hAAAPYQGoP6dpAABGhUlEQVR4nO39f1zV9eH//98PqCAa+IPfRYLBbJk/yB+ImOkiyfmurLYpX/fSnNWl5o+MXNO2rNYPqldW01yustRtoWtLe9Ua5Qjkq+JvqbnMYYCaCgIKR8Ag4fn5o3HWkR+egxzOOTxv18vlXIrH83GePJ5Py3M/j8fj+XhYDMMwBAAAYCI+7m4AAABAZyMAAQAA0yEAAQAA0yEAAQAA0yEAAQAA0yEAAQAA0yEAAQAA0+nm7gZ4osbGRp04cUKXXXaZLBaLu5sDAAAcYBiGzp49q8jISPn4tN3HQwBqwYkTJxQVFeXuZgAAgHY4duyYrrjiijbrEIBacNlll0n69gYGBga6uTUAAMARVqtVUVFRts/xthCAWtA07BUYGEgAAgDAyzgyfYVJ0AAAwHQIQAAAwHQIQAAAwHQIQAAAwHQIQAAAwHQIQAAAwHQIQAAAwHQIQAAAwHQIQAAAwHQIQAAAwHTYCgMAAC9XWFatI6drFd2/l2KCe7m7OV6BAAQAgJeqrK3Xgox85RaU2crGx4VoRWq8ggK6u7Flno8hMAAAvNSCjHxtO1xuV7btcLnmZ+x3U4u8BwEIAAAvVFhWrdyCMjUYhl15g2Eot6BMReU1bmqZdyAAAQDghY6crm3zeHEFAagtBCAAALzQgH4BbR6P7s9k6LYQgAAA8EIDQ3prfFyIfC0Wu3Jfi0Xj40J4GuwiCEAAAHipFanxSooNtitLig3WitR4N7XIe/AYPAAAXioooLvWzRmtovIaFVfUsA6QEwhAAAB4uZhggo+zGAIDAACmQwACAACm49YAlJ6erlGjRumyyy5TaGiopk6dqkOHDl30fe+8846uvvpq+fv7a8iQIfrwww/tjhuGoaVLlyoiIkI9e/ZUcnKyCgoKXHUZAADAy7g1AG3ZskVz587Vjh07tHnzZn3zzTeaNGmSampaX7xp+/btSk1N1Zw5c7R//35NnTpVU6dO1YEDB2x1nn/+eS1fvlyrVq3Szp071atXL6WkpOjrr7/ujMsCAAAezmIYF6yh7UZlZWUKDQ3Vli1bNH78+BbrTJs2TTU1Nfrggw9sZWPGjNHw4cO1atUqGYahyMhIPfTQQ1q0aJEkqaqqSmFhYVqzZo2mT5/e7Jx1dXWqq6uz/Wy1WhUVFaWqqioFBgZ28FUCAABXsFqtCgoKcujz26PmAFVVVUmS+vXr12qdvLw8JScn25WlpKQoLy9PklRUVKSSkhK7OkFBQUpISLDVuVB6erqCgoJsr6ioqEu9FAAA4ME8JgA1NjZq4cKFSkpK0rXXXttqvZKSEoWFhdmVhYWFqaSkxHa8qay1OhdasmSJqqqqbK9jx45dyqUAAAAP5zHrAM2dO1cHDhzQ1q1bO/13+/n5yc/Pr9N/LwAAcA+P6AGaN2+ePvjgA2VnZ+uKK65os254eLhKS0vtykpLSxUeHm473lTWWh0AAGBubg1AhmFo3rx52rhxoz755BPFxMRc9D2JiYnKysqyK9u8ebMSExMlSTExMQoPD7erY7VatXPnTlsdAA
Bgbm4dAps7d67efvttvffee7rssstsc3SCgoLUs2dPSdLMmTN1+eWXKz09XZL0wAMP6IYbbtCyZcs0ZcoUrV+/Xnv27NFrr70mSbJYLFq4cKGeeuopxcXFKSYmRo8++qgiIyM1depUt1wnAADwLG4NQK+++qokacKECXblb731lu666y5J0tGjR+Xj89+OqrFjx+rtt9/Wr3/9az3yyCOKi4vTpk2b7CZOP/zww6qpqdG9996ryspKjRs3TpmZmfL393f5NQEAAM/nUesAeQpn1hEAAACewWvXAQIAAOgMBCAAAGA6BCAAAGA6BCAAAGA6BCAAAGA6BCAAAGA6BCAAAGA6BCAAAGA6BCAAAGA6BCAAAGA6BCAAAGA6BCAAAGA6BCAAAGA6BCAAAGA6BCAAAGA6BCAAAGA6BCAAAGA6BCAAAGA6BCAAAGA63dzdAAAA0H6FZdU6crpW0f17KSa4l7ub4zUIQAAAeKHK2notyMhXbkGZrWx8XIhWpMYrKKC7G1vmHRgCAwDACy3IyNe2w+V2ZdsOl2t+xn43tci7EIAAAPAyhWXVyi0oU4Nh2JU3GIZyC8pUVF7jppZ5DwIQAABe5sjp2jaPF1cQgC6GAAQAgJcZ0C+gzePR/ZkMfTEEIAAAvMzAkN4aHxciX4vFrtzXYtH4uBCeBnMAAQgAAC+0IjVeSbHBdmVJscFakRrvphZ5Fx6DBwDACwUFdNe6OaNVVF6j4ooa1gFyEgEIAAAvFhNM8GkPhsAAAIDpuDUA5ebm6pZbblFkZKQsFos2bdrUZv277rpLFoul2Wvw4MG2Oo8//niz41dffbWLrwQAAHgTtwagmpoaDRs2TCtXrnSo/m9/+1udPHnS9jp27Jj69eunH//4x3b1Bg8ebFdv69atrmg+AADwUm6dAzR58mRNnjzZ4fpBQUEKCgqy/bxp0yadOXNGs2fPtqvXrVs3hYeHd1g7AQBA1+LVc4BWr16t5ORkDRgwwK68oKBAkZGRGjhwoGbMmKGjR4+2eZ66ujpZrVa7FwAA6Lq8NgCdOHFCf//733X33XfblSckJGjNmjXKzMzUq6++qqKiIl1//fU6e/Zsq+dKT0+39S4FBQUpKirK1c0HAABuZDGMC3ZScxOLxaKNGzdq6tSpDtVPT0/XsmXLdOLECfXo0aPVepWVlRowYIBefPFFzZkzp8U6dXV1qqurs/1stVoVFRWlqqoqBQYGOnUdAADAPaxWq4KCghz6/PbKdYAMw9Cbb76p//mf/2kz/EhSnz599L3vfU+HDx9utY6fn5/8/Pw6upkAAMBDeeUQ2JYtW3T48OFWe3S+q7q6Wl9++aUiIiI6oWUAAMAbuDUAVVdXKz8/X/n5+ZKkoqIi5efn2yYtL1myRDNnzmz2vtWrVyshIUHXXntts2OLFi3Sli1bVFxcrO3bt+v222+Xr6+vUlNTXXotAADAe7h1CGzPnj2aOHGi7ee0tDRJ0qxZs7RmzRqdPHmy2RNcVVVV+utf/6rf/va3LZ7zq6++UmpqqioqKhQSEqJx48Zpx44dCgkJcd2FAAAAr+Ixk6A9iTOTqAAAgGdw5vPbK+cAAQAAXAoCEAAAMB0CEAAAMB0CEAAAMB0CEAAAMB0CEAAAMB0CEAAAMB0CEAAAMB0CEAAAMB0CEAAAMB0CEAAAMB0CEAAAMB0CEAAAMB0CEAAAMB0CEAAAMB0CEAAAMB0CEAAAMB0CEAAAMB0CEAAAMB0CEAAAMB0CEAAAMB0CEAAAMB0CEAAAMJ1u7m4AAADofIVl1TpyulbR/XspJriXu5vT6QhAAACYSGVtvRZk5Cu3oMxWNj4uRCtS4xUU0N2NLetcDIEBAGAiCzLyte1wuV3ZtsPlmp+x300tcg8CEAAAJlFYVq3cgjI1GIZdeYNhKLegTEXlNW5qWecjAAEAYBJHTte2eby4ggAEAAC6mAH9Ato8Ht3fPJOhCUAAAJjEwJDeGh8XIl+Lxa7c12LR+LgQUz
0NRgACAMBEVqTGKyk22K4sKTZYK1Lj3dQi93BrAMrNzdUtt9yiyMhIWSwWbdq0qc36OTk5slgszV4lJSV29VauXKno6Gj5+/srISFBu3btcuFVAADgPYICumvdnNHKXjRBb80epexFE7RuzmhTPQIvuTkA1dTUaNiwYVq5cqVT7zt06JBOnjxpe4WGhtqObdiwQWlpaXrssce0b98+DRs2TCkpKTp16lRHNx8AAK8VE9xLEweFmmrY67vcuhDi5MmTNXnyZKffFxoaqj59+rR47MUXX9Q999yj2bNnS5JWrVqlv/3tb3rzzTe1ePHiFt9TV1enuro6289Wq9XpNgEAAO/hlXOAhg8froiICN10003atm2brby+vl579+5VcnKyrczHx0fJycnKy8tr9Xzp6ekKCgqyvaKiolzafgAA4F5eFYAiIiK0atUq/fWvf9Vf//pXRUVFacKECdq3b58kqby8XA0NDQoLC7N7X1hYWLN5Qt+1ZMkSVVVV2V7Hjh1z6XUAAAD38qq9wAYNGqRBgwbZfh47dqy+/PJLvfTSS/rDH/7Q7vP6+fnJz8+vI5oIAAC8gFf1ALVk9OjROnz4sCQpODhYvr6+Ki0ttatTWlqq8PBwdzQPAAB4IK8PQPn5+YqIiJAk9ejRQyNGjFBWVpbteGNjo7KyspSYmOiuJgIAAA/j1iGw6upqW++NJBUVFSk/P1/9+vXTlVdeqSVLluj48eNat26dJOnll19WTEyMBg8erK+//lpvvPGGPvnkE3388ce2c6SlpWnWrFkaOXKkRo8erZdfflk1NTW2p8IAAADcGoD27NmjiRMn2n5OS0uTJM2aNUtr1qzRyZMndfToUdvx+vp6PfTQQzp+/LgCAgI0dOhQ/eMf/7A7x7Rp01RWVqalS5eqpKREw4cPV2ZmZrOJ0QAAwLwshmEY7m6Ep7FarQoKClJVVZUCAwPd3RwAAOAAZz6/vX4OEAAAgLMIQAAAwHQIQAAAwHQIQAAAwHQIQAAAwHQIQAAAwHQIQAAAwHQIQAAAwHS8ajd4AADQPoVl1TpyulbR/XspJriXu5vjdgQgAAC6sMraei3IyFduQZmtbHxciFakxisooLsbW+ZeDIEBANCFLcjI17bD5XZl2w6Xa37Gfje1yDMQgAAA6KIKy6qVW1Cmhgu2/WwwDOUWlKmovMZNLXM/AhAAAF3UkdO1bR4vriAAAQCALmZAv4A2j0f3N+9kaAIQAABd1MCQ3hofFyJfi8Wu3Ndi0fi4EFM/DUYAAgCgC1uRGq+k2GC7sqTYYK1IjXdTizwDj8EDANCFBQV017o5o1VUXqPiihrWAfoPAhAAACYQE0zw+S6GwAAAgOkQgAAAgOkQgAAAgOkQgAAAgOkQgAAAgOkQgAAAgOkQgAAAgOkQgAAAgOkQgAAAgOmwEjQAAF6qsKxaR07Xsr1FOzgdgM6dOyfDMBQQECBJOnLkiDZu3KhrrrlGkyZN6vAGAgAAe5W19VqQka/cgjJb2fi4EK1IjVdQQHc3tsx7OD0Edtttt2ndunWSpMrKSiUkJGjZsmW67bbb9Oqrr3Z4AwEAgL0FGfnadrjcrmzb4XLNz9jvphZ5H6cD0L59+3T99ddLkv7yl78oLCxMR44c0bp167R8+XKnzpWbm6tbbrlFkZGRslgs2rRpU5v13333Xd10000KCQlRYGCgEhMT9dFHH9nVefzxx2WxWOxeV199tVPtAgDAUxWWVSu3oEwNhmFX3mAYyi0oU1F5jZta5l2cDkC1tbW67LLLJEkff/yx7rjjDvn4+GjMmDE6cuSIU+eqqanRsGHDtHLlSofq5+bm6qabbtKHH36ovXv3auLEibrlllu0f7994h08eLBOnjxpe23dutWpdgEA4KmOnK5t83hxBQHIEU7PAYqNjdWmTZt0++2366OPPtKDDz4oSTp16pQCAwOdOtfkyZM1efJkh+u//PLLdj8/88wzeu+99/T+++8rPj7eVt6tWz
eFh4c71RYAALzBgH4BbR6P7s9kaEc43QO0dOlSLVq0SNHR0UpISFBiYqKkb3uDvhtCOkNjY6POnj2rfv362ZUXFBQoMjJSAwcO1IwZM3T06NE2z1NXVyer1Wr3AgDAEw0M6a3xcSHytVjsyn0tFo2PC+FpMAc5HYB+9KMf6ejRo9qzZ48yMzNt5TfeeKNeeumlDm3cxbzwwguqrq7WT37yE1tZQkKC1qxZo8zMTL366qsqKirS9ddfr7Nnz7Z6nvT0dAUFBdleUVFRndF8AADaZUVqvJJig+3KkmKDtSK1czsivJnFMC6YReUkq9WqTz75RIMGDdL3v//99jfEYtHGjRs1depUh+q//fbbuueee/Tee+8pOTm51XqVlZUaMGCAXnzxRc2ZM6fFOnV1daqrq7P9bLVaFRUVpaqqKqeH9QAA6CxF5TUqrqhhHaD/sFqtCgoKcujz2+k5QD/5yU80fvx4zZs3T+fOndPIkSNVXFwswzC0fv163Xnnne1uuKPWr1+vu+++W++8806b4UeS+vTpo+9973s6fPhwq3X8/Pzk5+fX0c0EAMClYoIJPu3l9BBYbm6u7TH4jRs3yjAMVVZWavny5Xrqqac6vIEXysjI0OzZs5WRkaEpU6ZctH51dbW+/PJLRUREuLxtAADAOzgdgKqqqmyTjjMzM3XnnXcqICBAU6ZMUUFBgVPnqq6uVn5+vvLz8yVJRUVFys/Pt01aXrJkiWbOnGmr//bbb2vmzJlatmyZEhISVFJSopKSElVVVdnqLFq0SFu2bFFxcbG2b9+u22+/Xb6+vkpNTXX2UgEAQBfldACKiopSXl6eampqlJmZadv+4syZM/L393fqXHv27FF8fLzt6bG0tDTFx8dr6dKlkqSTJ0/aPcH12muv6fz585o7d64iIiJsrwceeMBW56uvvlJqaqoGDRqkn/zkJ+rfv7927NihkJAQZy8VAAB0UU5Pgv7d736nBx54QL1799aAAQO0b98++fj4aMWKFXr33XeVnZ3tqrZ2GmcmUQEAAM/g0knQP//5zzV69GgdO3ZMN910k3x8vu1EGjhwYKfMAQIAALhUl/QYfNNbLRcsxuTt6AECAMD7OPP57fQcIElat26dhgwZop49e6pnz54aOnSo/vCHP7SrsQAAAJ3N6SGwF198UY8++qjmzZunpKQkSdLWrVt13333qby83LY3GAAAgKdyeggsJiZGTzzxhN3j6ZK0du1aPf744yoqKurQBroDQ2AAAHgfl06CPnnypMaOHdusfOzYsTp58qSzpwMAAO1UWFatI6dr2QqjHZwOQLGxsfrzn/+sRx55xK58w4YNiouL67CGAQCAllXW1mtBRr5yC8psZePjQrQiNV5BAd3d2DLv4XQAeuKJJzRt2jTl5uba5gBt27ZNWVlZ+vOf/9zhDQQAAPYWZORr2+Fyu7Jth8s1P2O/1s0Z7aZWeRennwK78847tXPnTgUHB2vTpk3atGmTgoODtWvXLt1+++2uaCMAAPiPwrJq5RaUqeGCKbwNhqHcgjIVlde4qWXexekeIEkaMWKE/vjHP3Z0WwAAwEUcOV3b5vHiihrmAznAoQBktVodPiFPTQEA4DoD+gW0eTy6P+HHEQ4FoD59+lx0tWfDMGSxWNTQ0NAhDQMAAM0NDOmt8XEh2na43G4YzNdiUVJsML0/DnIoAHWFDU4BAOgqVqTGa37GfrunwJJig7UiNd6NrfIul7QXWFfFQogAAG9QVF6j4ooa1gH6D5cuhAgAADxDTDDBp73atRkqAACANyMAAQAA0yEAAQAA03E6AD311FNdYsd3AADwrcKyamUfOmWqVaSdfgps2LBhOnDggBISEvTTn/5UP/nJTxQcHOyq9rkFT4EBAMygq22q6sznt9M9QJ9++qk+++wzTZgwQS+88IIiIyM1ZcoUvf3226qtbXt5bgAA4Dna2lS1q2vXHKDBgwfrmWeeUWFhobKzsxUdHa2FCxcqPDy8o9
sHAABcwOybql7yJOhevXqpZ8+e6tGjh7755puOaBMAAHAxRzZV7craFYCKior09NNPa/DgwRo5cqT279+vJ554QiUlJR3dPgAA4AJm31TV6ZWgx4wZo927d2vo0KGaPXu2UlNTdfnll7uibQAAwEXMvqmq0wHoxhtv1JtvvqlrrrnGFe0BAACdxMybqrZ7M9T6+noVFRXpqquuUrduXWtLMR6DBwCYSVfZVNWlj8GfO3dOc+bMUUBAgAYPHqyjR49KkubPn69nn322fS0GAABuExPcSxMHhXp1+HGW0wFo8eLF+vTTT5WTkyN/f39beXJysjZs2NChjQMAAHAFp8euNm3apA0bNmjMmDGyWCy28sGDB+vLL7/s0MYBAAC4gtM9QGVlZQoNDW1WXlNTYxeIAAAAPJXTAWjkyJH629/+Zvu5KfS88cYbSkxMdOpcubm5uuWWWxQZGSmLxaJNmzZd9D05OTm67rrr5Ofnp9jYWK1Zs6ZZnZUrVyo6Olr+/v5KSEjQrl27nGoXAADo2pweAnvmmWc0efJkff755zp//rx++9vf6vPPP9f27du1ZcsWp85VU1OjYcOG6Wc/+5nuuOOOi9YvKirSlClTdN999+lPf/qTsrKydPfddysiIkIpKSmSpA0bNigtLU2rVq1SQkKCXn75ZaWkpOjQoUMt9lwBAADzaddj8F9++aWeffZZffrpp6qurtZ1112nX/7ylxoyZEj7G2KxaOPGjZo6dWqrdX75y1/qb3/7mw4cOGArmz59uiorK5WZmSlJSkhI0KhRo/TKK69IkhobGxUVFaX58+dr8eLFDrWFx+ABAPA+znx+t2sBn6uuukqvv/56uxp3KfLy8pScnGxXlpKSooULF0r6dm2ivXv3asmSJbbjPj4+Sk5OVl5eXqvnraurU11dne1nq9XasQ0HAAAexaEA5EwgcGWPSUlJicLCwuzKwsLCZLVade7cOZ05c0YNDQ0t1vniiy9aPW96erqeeOIJl7QZAAB4HocCUJ8+fRx+wquhoeGSGuQOS5YsUVpamu1nq9WqqKgoN7YIAAC4kkMBKDs72/bvxcXFWrx4se666y7bU195eXlau3at0tPTXdPK/wgPD1dpaaldWWlpqQIDA9WzZ0/5+vrK19e3xTrh4eGtntfPz09+fn4uaTMAAPA8DgWgG264wfbvv/nNb/Tiiy8qNTXVVnbrrbdqyJAheu211zRr1qyOb+V/JCYm6sMPP7Qr27x5sy2I9ejRQyNGjFBWVpZtMnVjY6OysrI0b948l7ULAAB4F6fXAcrLy9PIkSOblY8cOdLp9Xaqq6uVn5+v/Px8Sd8+5p6fn2/bX2zJkiWaOXOmrf59992nwsJCPfzww/riiy/0u9/9Tn/+85/14IMP2uqkpaXp9ddf19q1a3Xw4EHdf//9qqmp0ezZs529VAAA0EU5HYCioqJafALsjTfecHrezJ49exQfH6/4+HhJ34aX+Ph4LV26VJJ08uRJWxiSpJiYGP3tb3/T5s2bNWzYMC1btkxvvPGGbQ0gSZo2bZpeeOEFLV26VMOHD1d+fr4yMzObTYwGAJhTYVm1sg+dUlF5jbubAjdyeh2gDz/8UHfeeadiY2OVkJAgSdq1a5cKCgr017/+VT/84Q9d0tDOxDpAAND1VNbWa0FGvnILymxl4+NCtCI1XkEB3d3YMnQUZz6/ne4B+uEPf6iCggLdeuutOn36tE6fPq1bbrlF//73v7tE+AEAdE0LMvK17XC5Xdm2w+Wan7HfTS2CO7VrIcQrrrhCTz/9dEe3BQAAlygsq7br+WnSYBjKLShTUXmNYoJ7uaFlcBene4AAAPA2R07Xtnm8uIL5QGZDAAIAdHkD+gW0eTy6P70/ZkMAAgB0eQNDemt8XIh8L9jVwNdi0fi4EIa/TIgABAAwhRWp8UqKDbYrS4oN1orUeDe1CO7k1CToHTt26P3331d9fb1uvPFG3Xzzza5qFwAAHSoooLvWzR
mtovIaFVfUKLp/L3p+TMzhdYD+8pe/aNq0aerZs6e6d+8uq9Wq5557TosWLXJ1Gzsd6wABAOB9XLIOUHp6uu655x5VVVXpzJkzeuqpp/TMM89ccmMBAAA6m8M9QL1791Z+fr5iY2MlSfX19erVq5eOHz+u0NBQlzays9EDBACA93FJD1Btba3dyXr06CF/f39VV1e3v6UAAMBUPGUvNqcmQb/xxhvq3bu37efz589rzZo1Cg7+76z6BQsWdFzrAACAwwrLqnXkdK1HTvD2tL3YHB4Ci46OluWC9ROancxiUWFhYYc0zJ0YAgMAeBNPCxctmbl6l7YdLlfDd2KHr8WipNhgrZszukN+hzOf3w73ABUXF19quwAAgAu0tdFrR4WLS+GJe7GxECIAAF6sKVw0XDCg891w4W6euBebUwGosbFRb775pv7f//t/uvbaazVkyBDdeuutWrdunRwcSQMAAB3IE8PFhTxxLzaHA5BhGLr11lt199136/jx4xoyZIgGDx6sI0eO6K677tLtt9/uynYCAIAWeGK4uJAn7sXmcABas2aNcnNzlZWVpf379ysjI0Pr16/Xp59+qn/84x/65JNPtG7dOle2FQAAXMATw0VLPG0vNoefAps0aZJ+8IMfaPHixS0ef+aZZ7RlyxZ99NFHHdpAd+ApMACAN6mq/UbzM/Zf9CkwT3hM3pV7sTnz+e1wAAoPD1dmZqaGDx/e4vH9+/dr8uTJKikpcbrBnoYABADwRq2FC294TL4juGQl6NOnTyssLKzV42FhYTpz5ozjrQQAAB0qJriXJg4KlWEYdqstt/WYvFk5vA5QQ0ODunVrvbqvr6/Onz/fIY0CAADOa6mnZ1R0X+0ubt5B4c41eDyBwwHIMAzddddd8vPza/F4XV1dhzUKAAA4r6Wenr1H2h6dKa4gALVp1qxZF60zc+bMS2oMAABon9ZWW268yExfT3hM3h0cDkBvvfWWK9sBAIBX8oQnq6SLL4joI6nxOz837cNlxt4fycnd4AEAwLc87cmqiy2IOGJAX+3+znCYO9fg8QQEIAAA2sHTNiBtWhCxrR3Xc/9dpv3Hzui6K/vq+riQTm+jJyEAAQDgJE/c3Vz6drXlCxdETIoN1lNTr9XM1bs8prfKExCAAABwkiMbkLojAAUFdNe6OaObLYg4c/Uuj+qt8gQEIAAAnOTpG5DGBP93Qran9la5m8MrQQMAgG95ywakkmO9VWbkEQFo5cqVio6Olr+/vxISErRr165W606YMEEWi6XZa8qUKbY6d911V7PjN998c2dcCgDAJDxtd/PWeHpvlbu4fQhsw4YNSktL06pVq5SQkKCXX35ZKSkpOnTokEJDQ5vVf/fdd1VfX2/7uaKiQsOGDdOPf/xju3o333yz3dpFra1gDQBAe7Q238bTXOzpME9sc2dwew/Qiy++qHvuuUezZ8/WNddco1WrVikgIEBvvvlmi/X79eun8PBw22vz5s0KCAhoFoD8/Pzs6vXt27fVNtTV1clqtdq9AABwRNMGpJ4cJLylt6ozubUHqL6+Xnv37tWSJUtsZT4+PkpOTlZeXp5D51i9erWmT5+uXr3s/8PLyclRaGio+vbtqx/84Ad66qmn1L9//xbPkZ6erieeeKL9FwIAgAerqKnT7HHRumd8jM43Gh7bW9WZ3BqAysvL1dDQoLCwMLvysLAwffHFFxd9/65du3TgwAGtXr3arvzmm2/WHXfcoZiYGH355Zd65JFHNHnyZOXl5cnX17fZeZYsWaK0tDTbz1arVVFRUe28KgAA3OPCbTnaWq3a7Nw+B+hSrF69WkOGDNHo0fZrGEyfPt3270OGDNHQoUN11VVXKScnRzfeeGOz8/j5+TFHCADgtVoLOt80NGpX0Wm7umZf/6eJW+cABQcHy9fXV6WlpXblpaWlCg8Pb/O9NTU1Wr9+vebMmXPR3zNw4EAFBwfr8OHDl9ReAAA8UUvbcmw9XKa8wgq7ic+S/f
o/ZubWANSjRw+NGDFCWVlZtrLGxkZlZWUpMTGxzfe+8847qqur009/+tOL/p6vvvpKFRUVioiIuOQ2AwDgSZoWOrww6DQarbzhP8y6/k8Ttz8FlpaWptdff11r167VwYMHdf/996umpkazZ8+WJM2cOdNuknST1atXa+rUqc0mNldXV+sXv/iFduzYoeLiYmVlZem2225TbGysUlJSOuWaAADoLBdb6LA1Zl3/p4nb5wBNmzZNZWVlWrp0qUpKSjR8+HBlZmbaJkYfPXpUPj72Oe3QoUPaunWrPv7442bn8/X11Weffaa1a9eqsrJSkZGRmjRpkp588knm+QAAupyLLXToI6nxOz+bff2fJhbDMC7SSWY+VqtVQUFBqqqqUmBgoLubAwBAm5o2O71wocOEgf3UzcfHNLvAO/P57fYeIAAAcGlWpMZrfsZ+u6DTtNBhUEB3j1+t2h3oAWoBPUAAAG9k9qBDDxAAACYUE2zO4NMebn8KDAAAoLMRgAAAgOkQgAAAgOkQgAAAgOkQgAAAgOnwFBgAAG5SWFatI6drTfvYujsRgAAA6GSVtfVakJFvmhWaPRFDYAAAdLIFGfnadrjcrmzb4XLNz9jvphaZDwEIAIBOVFhWrdyCMrt9uySpwTCUW1CmovKaDv1d2YdOtXjOto6ZAUNgAAB0oiOna9s8XlxRc8nzgdoaYjNkMPwmeoAAAOhUA/oFtHk8uv+lT4Zua4iN4bdvEYAAAOhEA0N6a3xciHwtFrtyX4tF4+NCLrn352JDbJ01/ObpCEAAAHSyFanxSooNtitLig3WitT4Sz73xYbY2lJcYZ4AxBygLoK1JADAewQFdNe6OaNVVF6j4oqaDv27+2JDbG3piOE3b0EA8nKsJQEA3ismuOO/tDYNsW07XG431OVrsei6AX10rr5Bn5+wqvE77/G1WJQUG2yqL9AMgXk5JrMBAC7U0hBbYM9u2l18RgcuCD+S/fCbWR6PpwfIizVNdLvQdyezOZPmGUYDgK7hwiG2331yWPuOVtrV8bFI10QGakXqdYoJ7qXK2nrNXL3LNCMKBCAv1lFrSTCMBgBdU0xwLxmGod1HzjQ71mhIB45bbT+3NaKwbs5ol7e1szEE5sWcWUuirS5NhtEAoOty5MtyZ65O7SnoAfJirU10a/LYe//SU1MH69eb/tVq705HD6MBADyLI1+WL/b4e0esTu1p6AHyci1NdGuy7XC5blu5rc3eHUe+GcAczDLxETAbRxZe7IzVqT0NAcjLBQV01+O3XtPisQbD0Jnab9rs0jTjf/Sw1zTx8QfLtmj2W7s18YUczVy9S1W137i7aQA6yMUWXnT16tSeiCGwLqC9q34WV9Ro4qDQVteLMNuaEGZltomPgBk5svDiitR4zVm7W3u+M2G6o1an9kT0AHUB7V31s6l3x5VLssOzmXHiI2BmMcG9NHFQaLPwU1lbr/kZ++3Cz6jovl36aWB6gLqAtlb9DOzZTdZz59vs3XHlkuzwbB21lAIA79ZST/C+I5VduieYHqAuorVenP+bO87h3p3Wvhmg62IOGACz9gTTA+QlCsuqtbOoQpJFYwb2bxZS2urFoXcHrWmr95A5YIA5mLUn2CN6gFauXKno6Gj5+/srISFBu3btarXumjVrZLFY7F7+/v52dQzD0NKlSxUREaGePXsqOTlZBQUFrr4Ml6isrdf/7/Ud+sGyLVry7gEtefefmvhCjlJf29HiUzqt9eLQu4PWMAfM3Fj+wDxa+7M2a0+w23uANmzYoLS0NK1atUoJCQl6+eWXlZKSokOHDik0NLTF9wQGBurQoUO2ny0XPLb3/PPPa/ny5Vq7dq1iYmL06KOPKiUlRZ9//nmzsOTpFmTka/uXFc3K8woruvTYbFfkqXutMQfMnNgCxzwu9mdt1p5gi2G0sIRwJ0pISNCoUaP0yiuvSJIaGxsVFRWl+fPna/Hixc3qr1mzRgsXLlRlZWWL5zMMQ5GRkXrooY
e0aNEiSVJVVZXCwsK0Zs0aTZ8+/aJtslqtCgoKUlVVlQIDA9t/cZeosKxaP1i2pc062YsmdNn/OLsKPmjgiWau3tXqBx5frLoWR/6sq2q/0fyM/V7/95Qzn99uHQKrr6/X3r17lZycbCvz8fFRcnKy8vLyWn1fdXW1BgwYoKioKN12223617/+ZTtWVFSkkpISu3MGBQUpISGh1XPW1dXJarXavVzFme5mR9b3MftKzd7Qfc9ea/A0Zp30akaO/lk39QRnL5qgt2aPUvaiCVo3Z7RXhR9nuXUIrLy8XA0NDQoLC7MrDwsL0xdffNHiewYNGqQ333xTQ4cOVVVVlV544QWNHTtW//rXv3TFFVeopKTEdo4Lz9l07ELp6el64oknOuCKWteeXgBH1vfpqmOzF+MtvSrstea9PHXIsiOYddKrGTn7Zx0T3PX+e2+NR0yCdkZiYqJmzpyp4cOH64YbbtC7776rkJAQ/f73v2/3OZcsWaKqqirb69ixYx3Y4m+1pxegaVy2NZ60PHlH9sQ4ci5v6VVhrzXvY4atQcw66dWM+LNunVt7gIKDg+Xr66vS0lK78tLSUoWHhzt0ju7duys+Pl6HDx+WJNv7SktLFRERYXfO4cOHt3gOPz8/+fn5teMKHNOeXoCmb5+LUr6n842NzSZCJw7s7xFP6XRkT4yj53Lmfrr7Wzx/+XgfM2wNYtZJr2bEn3Xr3NoD1KNHD40YMUJZWVm2ssbGRmVlZSkxMdGhczQ0NOif//ynLezExMQoPDzc7pxWq1U7d+50+JwdzZlegAu/fd76yjZ18/HR/81N0rN3DFH6HUOUvWiCMu4d02FDPZfSe9ORPTGOnsuR++kp3+LNuMGgNzPT3BiWP+ha2vp7nD/rlrn9Mfi0tDTNmjVLI0eO1OjRo/Xyyy+rpqZGs2fPliTNnDlTl19+udLT0yVJv/nNbzRmzBjFxsaqsrJS//u//6sjR47o7rvvlvTtI/ELFy7UU089pbi4ONtj8JGRkZo6dapbrtGZXoDWQoCkDv/2ebEel9Z6T5oWZSyvruuw+S3O9Oo4cj/v/+M+5RXa95rlFpTpvj/uVca9YxxqU0dZkRrf7OkK/vLxTGaaG8PyB12DIz3n/Fm3zO0BaNq0aSorK9PSpUtVUlKi4cOHKzMz0zaJ+ejRo/Lx+W9H1ZkzZ3TPPfeopKREffv21YgRI7R9+3Zdc801tjoPP/ywampqdO+996qyslLjxo1TZmam29YAGhjSW30DuutMC70PfQO62w3XdOaE2dbC1n1/3Kvuvj52bRnQP0DTRkXp/U9P6ODJsw6d35kPC2c+eC7WpWsYRrPw0ySvsKLTJx7zl4/3MOOQpZkmvXZFzgzZ8mdtz+0BSJLmzZunefPmtXgsJyfH7ueXXnpJL730Upvns1gs+s1vfqPf/OY3HdXES1JYVt1i+JGkM7Xf2D6QO/PbZ1thK6+wQj72IzY6UlGr5zMPNavfFmc+LJz94GmrV+XDAyfbPNeOwgq3/CXAXz6ej/kS8Cau+tLs7rmTncUjAlBX52iw6cxvnxdrU+MlLI9pkTQ40rkFJJ394Gm7V6XtxlvaPAqzY8gS3qKjvzR7y/IiHYUA1AkcDTad+e3TkTWG2suQdOCEVRNfyHHqf572fPC01KuSENO/zd+TMLDt4zCn737rZcgS3qCjvzSb4QnI7yIAdQJngk1nfftsa15SR3Lmf56OmiszMKS3xl7Vv8U91L4ffpnT50PX1ta3XoIPPFlHfmk246Ktbt8LzBO5Yi8wZ/dZaU8IcGbc1pF9xjpSZ+9Z1tL9/q6u3K0L57AnFrxZR+3hlX3olGa/tbvV42/NHqWJg1reoNyTOPP5TQ9QJ3G2d8OZCbPtGbd1ZJ+xjtTZjw9/937Pz9inz09Y7eY1deVuXTjOjN960bV0VM+5GZ+A9LqtMLxdTHAvTRwU2imPs7e1GKEr5wC1xF3/8xiGoQPHrc0mdXfFhe3gPLYqQV
dxqZ8tZly0lQDk5dq7cm3Tf+wXPu7e0dz9Pw8fcGiLGb/1Aq0x24rRDIF5uUt5DHJFarxmrN6hA8etrmiaJOm6K/u49X8ePuDQFtb9Af7LbIu20gPk5S7lAz4ooLuWT3dtOPn5D2LdOtHYjN26cI7ZvvUCF+OKqRqeiB4gL3ep32Cb3t/a01KXyhN6WFjYDm0x27deAN/iMfgWuOIxeFe61McgPz1WqdtWbmv1ePodQ/Tu3q+072hliyFLklc8RswHHAB0bTwGbzKX+g3285NVbR63SHpj1qg2e1G8oYeFvbgAAE0IQF1I+z/g234U7IsSqxIG9m8zZDGEAADwJgQgE2taOfryPj3brLdm+xGt2X5Eo6L76o2Zo1oNN/SwAAC8BQHIjZzZuqIjtbRydFDPbqo6d77N9+0uPqMJL2QrZ9FEtpAAAHg1ApAbtLZ1xUOTvqfTtfUuD0QtrRx99tx5hzZHPVP7je5eu1vv3D/WZe0DAMDVCEBucO+6PdpVfMauLLeg7JI3s3NEa3sfNerbcPOHOaO17+gZvbS5oNVz7D5yhj2SAABejYUQO1lhWXWz8NOSi+3l1V4XWzn6fKOhW4ZGXvQ8bCEBAPBmBKBO9sFnJxyq56rNOh1ZOXpgSG+Niu570XoAAHgrAlAnO11T71T9ju5pcXRriDdmjlLfFobffCS2kAAAeD0CUCebOCjUqfqu6GlxZO+joIDuylk0UaMG2PcEjfvP3CQAALwZk6A72Q2DQh165NyVu1E7unJ0UEB3vXP/WBY4BAB0OewF1gJX7wV2rKJWt67cavfIeTcfi843/vePwlVPgQEA0FWxF5iHi+ofoP1LJ+n/X1CmfUfP6Lor++r6uBB6WgAA6CT0ALXA23aDBwAAzn1+MwkaAACYDgEIAACYDgEIAACYDpOgAQDoAgrLqnXkdC0P0jiIAAQAgBerrK3Xgoz8TtlQuythCAwAAC+2ICNf2w6X25W5akPtrsQjAtDKlSsVHR0tf39/JSQkaNeuXa3Wff3113X99derb9++6tu3r5KTk5vVv+uuu2SxWOxeN998s6svAwCATlVYVq3cgjI1XLCijas21O5K3B6ANmzYoLS0ND322GPat2+fhg0bppSUFJ06darF+jk5OUpNTVV2drby8vIUFRWlSZMm6fjx43b1br75Zp08edL2ysjI6IzLAQCg0xw5Xdvm8e9uqF1YVq3sQ6cIRf/h9oUQExISNGrUKL3yyiuSpMbGRkVFRWn+/PlavHjxRd/f0NCgvn376pVXXtHMmTMlfdsDVFlZqU2bNjnUhrq6OtXV1dl+tlqtioqKYiFEAIDHKiyr1s6i01ry7j9brZO9aIL6BnQ3zRwhr1kIsb6+Xnv37lVycrKtzMfHR8nJycrLy3PoHLW1tfrmm2/Ur18/u/KcnByFhoZq0KBBuv/++1VRUdHqOdLT0xUUFGR7RUVFte+CAABwscraes1cvUs/WLal1fDja7FofFyIYoJ7MUeoFW4NQOXl5WpoaFBYWJhdeVhYmEpKShw6xy9/+UtFRkbahaibb75Z69atU1ZWlp577jlt2bJFkydPVkNDQ4vnWLJkiaqqqmyvY8eOtf+iAABwoZYCzYWSYoO1IjWeOUJt8OrH4J999lmtX79eOTk58vf3t5VPnz7d9u9DhgzR0KFDddVVVyknJ0c33nhjs/P4+fnJz8+vU9oMAEB7NQWa1jx7xxAlDOxvWwdo37EzbZ6vuKLGtGsGubUHKDg4WL6+viotLbUrLy0tVXh4eJvvfeGFF/Tss8/q448/1tChQ9usO3DgQAUHB+vw4cOX3GYAANzlYpOeL/PrZhdoBvQLaLN+dH9zhh/JzQGoR48eGjFihLKysmxljY2NysrKUmJiYqvve/755/Xkk08qMzNTI0eOvOjv+eqrr1RRUaGIiIgOaTcAAO5wsUCzZnux3c8DQ3prfFyIfC0Wu/LvzhEyK7c/Bp+WlqbXX39da9eu1cGDB3
X//ferpqZGs2fPliTNnDlTS5YssdV/7rnn9Oijj+rNN99UdHS0SkpKVFJSourqaklSdXW1fvGLX2jHjh0qLi5WVlaWbrvtNsXGxiolJcUt1wgAQEcYGNJbo6L7tnp895Ezzeb1rEiNV1JssF1Z0xwhM3P7HKBp06aprKxMS5cuVUlJiYYPH67MzEzbxOijR4/Kx+e/Oe3VV19VfX29fvSjH9md57HHHtPjjz8uX19fffbZZ1q7dq0qKysVGRmpSZMm6cknn2SeDwDA680aG63dxa3P7blwXk9QQHetmzNaReU1Kq6oYa+w/3D7OkCeyJl1BAAA6EyFZdX6wbItrR7PXjTBtAHHa9YBAgAAzmFeT8cgAAEA4GWY13Pp3D4HCAAAOId5PZeOAAQAgJeKCSb4tBdDYAAAwHQIQAAAwHQIQAAAwHQIQAAAwHQIQAAAwHQIQAAAwHQIQAAAwHQIQAAAwHQIQAAAwHQIQAAAwHQIQAAAwHQIQAAAwHTYDBUAANhs2HVUeUUVSroqWD8eGeXu5riMxTAMw92N8DRWq1VBQUGqqqpSYGCgu5sDAIDL/fOrSt3+u+063/jfWNDNx6L/m5ukay4PcmPLHOfM5zdDYAAAoFn4kaTzjYZuXbnNTS1yLQIQAAAmt2HX0Wbhp8n5RkPv7DnWyS1yPQIQAABdVGFZtbIPnVJReU2bZZsPlrZ5no/+VeKyNroLk6ABAOhiKmvrtSAjX7kFZbayxIH9ZbFI27+ssJWNjwvRitR4RfXt2eb5BvQLcFlb3YUABABAF7MgI1/bDpfbleUVVjSrt+1wue5et1t3XHdFm+f7aWJ0RzbPIxCAAADoQgrLqu16ftrSYBjaXXxGu4vPtFrnuqggxQT36qjmeQwCEAAAXciR07Uddq6mIbKuiEnQAAB0IR05X+eJ2wYrKKB7h53PkxCAAADoQgaG9Nb4uBD5WiyXfK7iipqLV/JSBCAAALqYFanxSooNtitLHNhfY6/q79R5ovt3vbk/TZgDBABAFxMU0F3r5oxWUXmNiitqFN2/l20ic1PZ7z45rH1HK9XQwo5YvhaLkmKDu+Tk5yYEIAAAuqiY4F7NQkxT2XVRfTU/Y3+LT4wlxQZ32cnPTTxiCGzlypWKjo6Wv7+/EhIStGvXrjbrv/POO7r66qvl7++vIUOG6MMPP7Q7bhiGli5dqoiICPXs2VPJyckqKChw5SUAAOBVmnqJshdN0FuzR+kPc0brrdmjlL1ogtbNGd1lJz83cXsA2rBhg9LS0vTYY49p3759GjZsmFJSUnTq1KkW62/fvl2pqamaM2eO9u/fr6lTp2rq1Kk6cOCArc7zzz+v5cuXa9WqVdq5c6d69eqllJQUff311511WQAAeIWY4F6aOChU18eFaOKg0C497PVdFsNoYfCvEyUkJGjUqFF65ZVXJEmNjY2KiorS/PnztXjx4mb1p02bppqaGn3wwQe2sjFjxmj48OFatWqVDMNQZGSkHnroIS1atEiSVFVVpbCwMK1Zs0bTp0+/aJusVquCgoJUVVWlwMDADrpSAADgSs58fru1B6i+vl579+5VcnKyrczHx0fJycnKy8tr8T15eXl29SUpJSXFVr+oqEglJSV2dYKCgpSQkNDqOevq6mS1Wu1eAACg63JrACovL1dDQ4PCwsLsysPCwlRS0vLOsyUlJW3Wb/qnM+dMT09XUFCQ7RUVFdWu6wEAAN7B7XOAPMGSJUtUVVVlex07dszdTQIAAC7k1gAUHBwsX19flZaW2pWXlpYqPDy8xfeEh4e3Wb/pn86c08/PT4GBgXYvAADQdbk1APXo0UMjRoxQVlaWrayxsVFZWVlKTExs8T2JiYl29SVp8+bNtvoxMTEKDw+3q2O1WrVz585WzwkAAMzF7QshpqWladasWRo5cqRGjx6tl19+WTU1NZo9e7YkaebMmbr88suVnp4uSXrggQd0ww03aNmyZZoyZYrWr1+vPXv26LXXXpMkWS
wWLVy4UE899ZTi4uIUExOjRx99VJGRkZo6daq7LhMAAHgQtwegadOmqaysTEuXLlVJSYmGDx+uzMxM2yTmo0ePysfnvx1VY8eO1dtvv61f//rXeuSRRxQXF6dNmzbp2muvtdV5+OGHVVNTo3vvvVeVlZUaN26cMjMz5e/v3+nXBwAAPI/b1wHyRKwDBACA9/GadYAAAADcwe1DYJ6oqVOMBREBAPAeTZ/bjgxuEYBacPbsWUliQUQAALzQ2bNnFRQU1GYd5gC1oLGxUSdOnNBll12ms2fPKioqSseOHWM+kItYrVbusYtxj12Pe+x63OPO4c332TAMnT17VpGRkXYPULWEHqAW+Pj46IorrpD07WP1klggsRNwj12Pe+x63GPX4x53Dm+9zxfr+WnCJGgAAGA6BCAAAGA6BKCL8PPz02OPPSY/Pz93N6XL4h67HvfY9bjHrsc97hxmuc9MggYAAKZDDxAAADAdAhAAADAdAhAAADAdAhAAADAdAlAbVq5cqejoaPn7+yshIUG7du1yd5O8Vnp6ukaNGqXLLrtMoaGhmjp1qg4dOmRX5+uvv9bcuXPVv39/9e7dW3feeadKS0vd1GLv9+yzz8pisWjhwoW2Mu7xpTt+/Lh++tOfqn///urZs6eGDBmiPXv22I4bhqGlS5cqIiJCPXv2VHJysgoKCtzYYu/T0NCgRx99VDExMerZs6euuuoqPfnkk3b7O3GfnZObm6tbbrlFkZGRslgs2rRpk91xR+7n6dOnNWPGDAUGBqpPnz6aM2eOqqurO/EqOhYBqBUbNmxQWlqaHnvsMe3bt0/Dhg1TSkqKTp065e6meaUtW7Zo7ty52rFjhzZv3qxvvvlGkyZNUk1Nja3Ogw8+qPfff1/vvPOOtmzZohMnTuiOO+5wY6u91+7du/X73/9eQ4cOtSvnHl+aM2fOKCkpSd27d9ff//53ff7551q2bJn69u1rq/P8889r+fLlWrVqlXbu3KlevXopJSVFX3/9tRtb7l2ee+45vfrqq3rllVd08OBBPffcc3r++ee1YsUKWx3us3Nqamo0bNgwrVy5ssXjjtzPGTNm6F//+pc2b96sDz74QLm5ubr33ns76xI6noEWjR492pg7d67t54aGBiMyMtJIT093Y6u6jlOnThmSjC1bthiGYRiVlZVG9+7djXfeecdW5+DBg4YkIy8vz13N9Epnz5414uLijM2bNxs33HCD8cADDxiGwT3uCL/85S+NcePGtXq8sbHRCA8PN/73f//XVlZZWWn4+fkZGRkZndHELmHKlCnGz372M7uyO+64w5gxY4ZhGNznSyXJ2Lhxo+1nR+7n559/bkgydu/ebavz97//3bBYLMbx48c7re0diR6gFtTX12vv3r1KTk62lfn4+Cg5OVl5eXlubFnXUVVVJUnq16+fJGnv3r365ptv7O751VdfrSuvvJJ77qS5c+dqypQpdvdS4h53hP/7v//TyJEj9eMf/1ihoaGKj4/X66+/bjteVFSkkpISu3scFBSkhIQE7rETxo4dq6ysLP373/+WJH366afaunWrJk+eLIn73NEcuZ95eXnq06ePRo4caauTnJwsHx8f7dy5s9Pb3BHYDLUF5eXlamhoUFhYmF15WFiYvvjiCze1qutobGzUwoULlZSUpGuvvVaSVFJSoh49eqhPnz52dcPCwlRSUuKGVnqn9evXa9++fdq9e3ezY9zjS1dYWKhXX31VaWlpeuSRR7R7924tWLBAPXr00KxZs2z3saW/O7jHjlu8eLGsVquuvvpq+fr6qqGhQU8//bRmzJghSdznDubI/SwpKVFoaKjd8W7duqlfv35ee88JQOh0c+fO1YEDB7R161Z3N6VLOXbsmB544AFt3rxZ/v7+7m5Ol9TY2KiRI0fqmWeekSTFx8frwIEDWrVqlWbNmuXm1nUdf/7zn/WnP/1Jb7/9tgYPHqz8/HwtXLhQkZGR3Gd0GIbAWhAcHCxfX99mT8
eUlpYqPDzcTa3qGubNm6cPPvhA2dnZuuKKK2zl4eHhqq+vV2VlpV197rnj9u7dq1OnTum6665Tt27d1K1bN23ZskXLly9Xt27dFBYWxj2+RBEREbrmmmvsyr7//e/r6NGjkmS7j/zdcWl+8YtfaPHixZo+fbqGDBmi//mf/9GDDz6o9PR0SdznjubI/QwPD2/2END58+d1+vRpr73nBKAW9OjRQyNGjFBWVpatrLGxUVlZWUpMTHRjy7yXYRiaN2+eNm7cqE8++UQxMTF2x0eMGKHu3bvb3fNDhw7p6NGj3HMH3XjjjfrnP/+p/Px822vkyJGaMWOG7d+5x5cmKSmp2fIN//73vzVgwABJUkxMjMLDw+3usdVq1c6dO7nHTqitrZWPj/3Hk6+vrxobGyVxnzuaI/czMTFRlZWV2rt3r63OJ598osbGRiUkJHR6mzuEu2dhe6r169cbfn5+xpo1a4zPP//cuPfee40+ffoYJSUl7m6aV7r//vuNoKAgIycnxzh58qTtVVtba6tz3333GVdeeaXxySefGHv27DESExONxMREN7ba+333KTDD4B5fql27dhndunUznn76aaOgoMD405/+ZAQEBBh//OMfbXWeffZZo0+fPsZ7771nfPbZZ8Ztt91mxMTEGOfOnXNjy73LrFmzjMsvv9z44IMPjKKiIuPdd981goODjYcffthWh/vsnLNnzxr79+839u/fb0gyXnzxRWP//v3GkSNHDMNw7H7efPPNRnx8vLFz505j69atRlxcnJGamuquS7pkBKA2rFixwrjyyiuNHj16GKNHjzZ27Njh7iZ5LUktvt566y1bnXPnzhk///nPjb59+xoBAQHG7bffbpw8edJ9je4CLgxA3ONL9/777xvXXnut4efnZ1x99dXGa6+9Zne8sbHRePTRR42wsDDDz8/PuPHGG41Dhw65qbXeyWq1Gg888IBx5ZVXGv7+/sbAgQONX/3qV0ZdXZ2tDvfZOdnZ2S3+HTxr1izDMBy7nxUVFUZqaqrRu3dvIzAw0Jg9e7Zx9uxZN1xNx7AYxneW1gQAADAB5gABAADTIQABAADTIQABAADTIQABAADTIQABAADTIQABAADTIQABAADTIQABAADTIQABAADTIQAB8GqZmZmyWCwqKSmxK4+IiFB0dLRdWXFxsSwWi23TxwkTJshisTR73Xfffbb3fLc8MDBQo0aN0nvvvefy6wLgWgQgAF5t3Lhx6tatm3JycmxlBw8e1Llz53TmzBkVFxfbyrOzs+Xn56ekpCRb2T333KOTJ0/avZ5//nm73/HWW2/p5MmT2rNnj5KSkvSjH/1I//znP119aQBciAAEwGv85S9/0ZAhQ9SzZ0/1799fycnJslgsGjVqlF0AysnJ0bhx45SUlNSsfMyYMfL397eVBQQEKDw83O4VGBho93v79Omj8PBwfe9739OTTz6p8+fPKzs729WXC8CFCEAAvMLJkyeVmpqqn/3sZzp48KBycnJ0xx13yDAMTZw40S6QZGdna8KECbrhhhvsynNycjRx4sR2t+H8+fNavXq1JKlHjx7tvxgAbsdu8AC8wr59+zRixAgVFxdrwIABdsf+8Y9/6KabbtKJEycUERGhsLAwffDBBzp//rxSU1NVXFyswsJCXXXVVdqyZYvGjx8v6ds5QNu3b28WZn7/+99rxowZkr6dA+Tv7y9fX1+dO3dOjY2Nio6O1t69e9WvX7/OuXgAHa6buxsAAI4YNmyYbrzxRg0ZMkQpKSmaNGmSfvSjH6lv374aO3asevTooZycHA0bNkznzp3Tddddp8bGRpWVlamoqEg5OTnq2bOnxowZY3feGTNm6Fe/+pVdWVhYmN3PL730kpKTk1VYWKgHH3xQy5cvJ/wAXo4ABMAr+Pr6avPmzdq+fbs+/vhjrVixQr/61a+0c+dOxcTEaPTo0crOztbp06c1btw4+fr6ytfXV2PHjlV2drays7OVlJTUrLcnKC
hIsbGxbf7u8PBwxcbGKjY2Vm+99ZZ++MMf6vPPP1doaKgrLxmACzEHCIDXsFgsSkpK0hNPPKH9+/erR48e2rhxoyRp4sSJysnJUU5OjiZMmGB7z/jx45WTk6MtW7Zc0vyfJqNHj9aIESP09NNPX/K5ALgPAQiAV9i5c6eeeeYZ7dmzR0ePHtW7776rsrIyff/735f0bQAqKCjQRx99pBtuuMH2vhtuuEGbNm3SsWPHWgxAtbW1KikpsXudOXOmzbYsXLhQv//973X8+PGOvUgAnYZJ0AC8wsGDB/Xggw9q3759slqtGjBggObPn6958+ZJkr7++mv16dNH3bt315kzZ9St27cj/HV1derTp4+6detmVy59Owl6y5YtzX5XSkqKMjMzJX3b67Rx40ZNnTrVdtwwDF1zzTWaOHGifve737nwqgG4CgEIAACYDkNgAADAdAhAAADAdAhAAADAdAhAAADAdAhAAADAdAhAAADAdAhAAADAdAhAAADAdAhAAADAdAhAAADAdAhAAADAdP4/BRXMycXIfuwAAAAASUVORK5CYII=", + "image/png": "", "text/plain": [ "
" ] @@ -1255,7 +1262,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 21, "metadata": {}, "outputs": [ { @@ -1264,13 +1271,13 @@ "" ] }, - "execution_count": 24, + "execution_count": 21, "metadata": {}, "output_type": "execute_result" }, { "data": { - "image/png": "", + "image/png": "", "text/plain": [ "
" ] @@ -1285,7 +1292,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 22, "metadata": {}, "outputs": [ { @@ -1303,7 +1310,7 @@ "\u001b[0;31mKeyError\u001b[0m: 'Experiment'", "\nThe above exception was the direct cause of the following exception:\n", "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[25], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m df_standard_flow_variations \u001b[38;5;241m=\u001b[39m df[\u001b[38;5;241m~\u001b[39mdf[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mJoint\u001b[39m\u001b[38;5;124m\"\u001b[39m]]\u001b[38;5;241m.\u001b[39mreplace(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m-\u001b[39m\u001b[38;5;124m\"\u001b[39m, np\u001b[38;5;241m.\u001b[39mnan)\n\u001b[0;32m----> 2\u001b[0m df_standard_flow_variations[\u001b[38;5;241m~\u001b[39m\u001b[43mdf_standard_flow_variations\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mExperiment\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\u001b[38;5;241m.\u001b[39mstr\u001b[38;5;241m.\u001b[39mcontains(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124msimple_encoder\u001b[39m\u001b[38;5;124m\"\u001b[39m) \u001b[38;5;241m&\u001b[39m \u001b[38;5;241m~\u001b[39mdf_standard_flow_variations[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mExperiment\u001b[39m\u001b[38;5;124m\"\u001b[39m]\u001b[38;5;241m.\u001b[39mstr\u001b[38;5;241m.\u001b[39mcontains(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mddi_actnorm\u001b[39m\u001b[38;5;124m\"\u001b[39m)]\n", + "Cell \u001b[0;32mIn[22], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m df_standard_flow_variations \u001b[38;5;241m=\u001b[39m df[\u001b[38;5;241m~\u001b[39mdf[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mJoint\u001b[39m\u001b[38;5;124m\"\u001b[39m]]\u001b[38;5;241m.\u001b[39mreplace(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m-\u001b[39m\u001b[38;5;124m\"\u001b[39m, np\u001b[38;5;241m.\u001b[39mnan)\n\u001b[0;32m----> 2\u001b[0m 
df_standard_flow_variations[\u001b[38;5;241m~\u001b[39m\u001b[43mdf_standard_flow_variations\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mExperiment\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\u001b[38;5;241m.\u001b[39mstr\u001b[38;5;241m.\u001b[39mcontains(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124msimple_encoder\u001b[39m\u001b[38;5;124m\"\u001b[39m) \u001b[38;5;241m&\u001b[39m \u001b[38;5;241m~\u001b[39mdf_standard_flow_variations[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mExperiment\u001b[39m\u001b[38;5;124m\"\u001b[39m]\u001b[38;5;241m.\u001b[39mstr\u001b[38;5;241m.\u001b[39mcontains(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mddi_actnorm\u001b[39m\u001b[38;5;124m\"\u001b[39m)]\n", "File \u001b[0;32m/work/tools22/users/lukas.rilling/sis_env/lib/python3.10/site-packages/pandas/core/frame.py:3896\u001b[0m, in \u001b[0;36mDataFrame.__getitem__\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 3894\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcolumns\u001b[38;5;241m.\u001b[39mnlevels \u001b[38;5;241m>\u001b[39m \u001b[38;5;241m1\u001b[39m:\n\u001b[1;32m 3895\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_getitem_multilevel(key)\n\u001b[0;32m-> 3896\u001b[0m indexer \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcolumns\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_loc\u001b[49m\u001b[43m(\u001b[49m\u001b[43mkey\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 3897\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m is_integer(indexer):\n\u001b[1;32m 3898\u001b[0m indexer \u001b[38;5;241m=\u001b[39m [indexer]\n", "File \u001b[0;32m/work/tools22/users/lukas.rilling/sis_env/lib/python3.10/site-packages/pandas/core/indexes/base.py:3797\u001b[0m, in \u001b[0;36mIndex.get_loc\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 3792\u001b[0m 
\u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(casted_key, \u001b[38;5;28mslice\u001b[39m) \u001b[38;5;129;01mor\u001b[39;00m (\n\u001b[1;32m 3793\u001b[0m \u001b[38;5;28misinstance\u001b[39m(casted_key, abc\u001b[38;5;241m.\u001b[39mIterable)\n\u001b[1;32m 3794\u001b[0m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28many\u001b[39m(\u001b[38;5;28misinstance\u001b[39m(x, \u001b[38;5;28mslice\u001b[39m) \u001b[38;5;28;01mfor\u001b[39;00m x \u001b[38;5;129;01min\u001b[39;00m casted_key)\n\u001b[1;32m 3795\u001b[0m ):\n\u001b[1;32m 3796\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m InvalidIndexError(key)\n\u001b[0;32m-> 3797\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m(key) \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01merr\u001b[39;00m\n\u001b[1;32m 3798\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mTypeError\u001b[39;00m:\n\u001b[1;32m 3799\u001b[0m \u001b[38;5;66;03m# If we have a listlike key, _check_indexing_error will raise\u001b[39;00m\n\u001b[1;32m 3800\u001b[0m \u001b[38;5;66;03m# InvalidIndexError. 
Otherwise we fall through and re-raise\u001b[39;00m\n\u001b[1;32m 3801\u001b[0m \u001b[38;5;66;03m# the TypeError.\u001b[39;00m\n\u001b[1;32m 3802\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_check_indexing_error(key)\n", "\u001b[0;31mKeyError\u001b[0m: 'Experiment'" diff --git a/users/rilling/evaluation/wer_eval.ipynb b/users/rilling/evaluation/wer_eval.ipynb index 2bf05746d..ebc9639f8 100644 --- a/users/rilling/evaluation/wer_eval.ipynb +++ b/users/rilling/evaluation/wer_eval.ipynb @@ -28,835 +28,1007 @@ { "data": { "text/plain": [ - "['/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glow_ASR_conformer/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glow_ASR_conformer/tuning/lm_1.5/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glow_ASR_conformer/tuning/lm_4.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glow_ASR_conformer/tuning/lm_2.5/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glow_ASR_conformer/tuning/lm_3.5/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glow_ASR_conformer/tuning/lm_2.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glow_ASR_conformer/tuning/lm_3.0/search/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glow_ASR_conformer_specaugment_before/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glow_ASR_conformer_specaugment_before/tuning/lm_1.5/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glow_ASR_conformer_specaugment_before/tuning/lm_4.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glow_ASR_conformer_specaugment_before/tuning/lm_2.5/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glow_ASR_conformer_specaugment_before/tuning/lm_3.5/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glow_ASR_conformer_specaugment_before/tuning/lm_2.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glow_ASR_conformer_specaugment_before/tuning/lm_3.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glowTTS_ASR_conformer_two_forward_pass/tuning/lm_4.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glowTTS_ASR_conformer_two_forward_pass/tuning/lm_1.5/search/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glowTTS_ASR_conformer_two_forward_pass/tuning/lm_3.5/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glowTTS_ASR_conformer_two_forward_pass/tuning/lm_2.5/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glowTTS_ASR_conformer_two_forward_pass/tuning/lm_3.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glowTTS_ASR_conformer_two_forward_pass/tuning/lm_2.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glowTTS_ASR_conformer_two_forward_pass/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glowTTS_ASR_conformer_x_vector_v2/tuning/lm_2.5/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glowTTS_ASR_conformer_x_vector_v2/tuning/lm_3.5/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glowTTS_ASR_conformer_x_vector_v2/tuning/lm_1.5/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glowTTS_ASR_conformer_x_vector_v2/tuning/lm_4.0/search/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glowTTS_ASR_conformer_x_vector_v2/tuning/lm_2.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glowTTS_ASR_conformer_x_vector_v2/tuning/lm_3.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glowTTS_ASR_conformer_x_vector_v2/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/ddi/glowTTS_ASR_conformer_two_forward_pass/tuning/lm_2.5/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/ddi/glowTTS_ASR_conformer_two_forward_pass/tuning/lm_3.5/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/ddi/glowTTS_ASR_conformer_two_forward_pass/tuning/lm_1.5/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/ddi/glowTTS_ASR_conformer_two_forward_pass/tuning/lm_4.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/ddi/glowTTS_ASR_conformer_two_forward_pass/tuning/lm_2.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/ddi/glowTTS_ASR_conformer_two_forward_pass/tuning/lm_3.0/search/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/ddi/glowTTS_ASR_conformer_two_forward_pass/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/ddi/glow_ASR_conformer/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/ddi/glow_ASR_conformer/tuning/lm_4.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/ddi/glow_ASR_conformer/tuning/lm_1.5/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/ddi/glow_ASR_conformer/tuning/lm_3.5/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/ddi/glow_ASR_conformer/tuning/lm_2.5/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/ddi/glow_ASR_conformer/tuning/lm_3.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/ddi/glow_ASR_conformer/tuning/lm_2.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/ddi/glowTTS_ASR_conformer_x_vector/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/ddi/glowTTS_ASR_conformer_x_vector/tuning/lm_3.0/search/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/ddi/glowTTS_ASR_conformer_x_vector/tuning/lm_2.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/ddi/glowTTS_ASR_conformer_x_vector/tuning/lm_3.5/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/ddi/glowTTS_ASR_conformer_x_vector/tuning/lm_2.5/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/ddi/glowTTS_ASR_conformer_x_vector/tuning/lm_4.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/ddi/glowTTS_ASR_conformer_x_vector/tuning/lm_1.5/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_control_radam/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer/tuning/lm_4.5/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer/tuning/lm_2.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer/tuning/lm_3.0/search/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer/tuning/lm_2.5/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer/tuning/lm_3.5/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer/tuning/lm_4.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_spec_augment_ctc_scale_0.1/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_spec_augment_ctc_scale_0.1/tuning/lm_4.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_spec_augment_ctc_scale_0.1/tuning/lm_2.5/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_spec_augment_ctc_scale_0.1/tuning/lm_3.5/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_spec_augment_ctc_scale_0.1/tuning/lm_2.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_spec_augment_ctc_scale_0.1/tuning/lm_3.0/search/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_spec_augment_ctc_scale_0.1/tuning/lm_4.5/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass/tuning/lm_4.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass/tuning/lm_2.5/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass/tuning/lm_3.5/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass/tuning/lm_2.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass/tuning/lm_3.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass/tuning/lm_4.5/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer_spec_augment_tts_train_segments/search/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer_spec_augment_tts_train_segments/tuning/lm_4.5/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer_spec_augment_tts_train_segments/tuning/lm_3.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer_spec_augment_tts_train_segments/tuning/lm_2.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer_spec_augment_tts_train_segments/tuning/lm_3.5/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer_spec_augment_tts_train_segments/tuning/lm_2.5/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer_spec_augment_tts_train_segments/tuning/lm_4.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_xvector/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_spec_augment_ctc_scale_0.1_tts_segments/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_spec_augment_ctc_scale_0.1_tts_segments/tuning/lm_3.5/search/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_spec_augment_ctc_scale_0.1_tts_segments/tuning/lm_2.5/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_spec_augment_ctc_scale_0.1_tts_segments/tuning/lm_4.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_spec_augment_ctc_scale_0.1_tts_segments/tuning/lm_4.5/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_spec_augment_ctc_scale_0.1_tts_segments/tuning/lm_3.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_spec_augment_ctc_scale_0.1_tts_segments/tuning/lm_2.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2_ctc_scale_0.1/tuning/lm_4.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2_ctc_scale_0.1/tuning/lm_3.5/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2_ctc_scale_0.1/tuning/lm_2.5/search/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2_ctc_scale_0.1/tuning/lm_3.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2_ctc_scale_0.1/tuning/lm_2.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2_ctc_scale_0.1/tuning/lm_4.5/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2_ctc_scale_0.1/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_ctc_scale_0.1/tuning/lm_2.5/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_ctc_scale_0.1/tuning/lm_3.5/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_ctc_scale_0.1/tuning/lm_4.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_ctc_scale_0.1/tuning/lm_4.5/search/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_ctc_scale_0.1/tuning/lm_2.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_ctc_scale_0.1/tuning/lm_3.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_ctc_scale_0.1/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_spec_augment/tuning/lm_3.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_spec_augment/tuning/lm_2.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_spec_augment/tuning/lm_4.5/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_spec_augment/tuning/lm_4.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_spec_augment/tuning/lm_3.5/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_spec_augment/tuning/lm_2.5/search/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_spec_augment/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector_v2/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector_v2/tuning/lm_3.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector_v2/tuning/lm_2.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector_v2/tuning/lm_4.5/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector_v2/tuning/lm_4.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector_v2/tuning/lm_3.5/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector_v2/tuning/lm_2.5/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_ddi_actnorm/search/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer_spec_augment/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer_spec_augment/tuning/lm_4.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer_spec_augment/tuning/lm_3.5/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer_spec_augment/tuning/lm_2.5/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer_spec_augment/tuning/lm_3.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer_spec_augment/tuning/lm_2.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer_spec_augment/tuning/lm_4.5/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_xvectorgrad_clip_10/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_xvectorgrad_clip_10/tuning/lm_4.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_xvectorgrad_clip_10/tuning/lm_2.5/search/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_xvectorgrad_clip_10/tuning/lm_3.5/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_xvectorgrad_clip_10/tuning/lm_2.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_xvectorgrad_clip_10/tuning/lm_3.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_xvectorgrad_clip_10/tuning/lm_4.5/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2/tuning/lm_4.5/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2/tuning/lm_3.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2/tuning/lm_2.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2/tuning/lm_3.5/search/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2/tuning/lm_2.5/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2/tuning/lm_4.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer/tuning/lm_3.5/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer/tuning/lm_2.5/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer/tuning/lm_4.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer/tuning/lm_4.5/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer/tuning/lm_3.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer/tuning/lm_2.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_xvector_eval/search/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_xvector_eval/tuning/lm_2.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_xvector_eval/tuning/lm_3.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_xvector_eval/tuning/lm_4.5/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_xvector_eval/tuning/lm_4.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_xvector_eval/tuning/lm_2.5/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_xvector_eval/tuning/lm_3.5/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector/tuning/lm_2.5/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector/tuning/lm_3.5/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector/tuning/lm_4.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector/tuning/lm_4.5/search/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector/tuning/lm_2.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector/tuning/lm_3.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_encoder_sample_ctc_scale_0.1/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_ctc_scale_0.1/tuning/lm_3.5/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_ctc_scale_0.1/tuning/lm_2.5/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_ctc_scale_0.1/tuning/lm_4.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_ctc_scale_0.1/tuning/lm_4.5/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_ctc_scale_0.1/tuning/lm_3.0/search/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_ctc_scale_0.1/tuning/lm_2.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_ctc_scale_0.1/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector_v3/tuning/lm_4.5/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector_v3/tuning/lm_2.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector_v3/tuning/lm_3.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector_v3/tuning/lm_2.5/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector_v3/tuning/lm_3.5/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector_v3/tuning/lm_4.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector_v3/search/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2/tuning/lm_4.5/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2/tuning/lm_3.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2/tuning/lm_2.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2/tuning/lm_3.5/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2/tuning/lm_2.5/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2/tuning/lm_4.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_tts_train_segments/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_tts_train_segments/tuned/lm_2.5/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_tts_train_segments/tuning/lm_2.0/search/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_tts_train_segments/tuning/lm_3.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_tts_train_segments/tuning/lm_4.5/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_tts_train_segments/tuning/lm_4.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_tts_train_segments/tuning/lm_2.5/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_tts_train_segments/tuning/lm_3.5/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_ctc_scale_0.1/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_ctc_scale_0.1/tuning/lm_2.5/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_ctc_scale_0.1/tuning/lm_3.5/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_ctc_scale_0.1/tuning/lm_4.0/search/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_ctc_scale_0.1/tuning/lm_4.5/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_ctc_scale_0.1/tuning/lm_2.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_ctc_scale_0.1/tuning/lm_3.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_coupling_epsilon/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment_ctc_scale_0.1/tuning/lm_4.5/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment_ctc_scale_0.1/tuning/lm_3.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment_ctc_scale_0.1/tuning/lm_2.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment_ctc_scale_0.1/tuning/lm_3.5/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment_ctc_scale_0.1/tuning/lm_2.5/search/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment_ctc_scale_0.1/tuning/lm_4.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment_ctc_scale_0.1/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_trainXvector/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector_control/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_ddi_actnorm/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_control_spec_augment/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_radam/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_control/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass/tuning/lm_3.5/search/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass/tuning/lm_2.5/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass/tuning/lm_4.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass/tuning/lm_4.5/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass/tuning/lm_3.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass/tuning/lm_2.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before/tuning/lm_4.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before/tuning/lm_2.5/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before/tuning/lm_3.5/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before/tuning/lm_2.0/search/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before/tuning/lm_3.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before/tuning/lm_4.5/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment/tuning/lm_2.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment/tuning/lm_3.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment/tuning/lm_4.5/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment/tuning/lm_4.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment/tuning/lm_2.5/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment/tuning/lm_3.5/search/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_ctc_scale_0.1/tuning/lm_4.5/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_ctc_scale_0.1/tuning/lm_3.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_ctc_scale_0.1/tuning/lm_2.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_ctc_scale_0.1/tuning/lm_3.5/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_ctc_scale_0.1/tuning/lm_2.5/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_ctc_scale_0.1/tuning/lm_4.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_ctc_scale_0.1/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_no_jit/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_no_jit/tuning/lm_2.0/search/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_no_jit/tuning/lm_3.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_no_jit/tuning/lm_2.5/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_no_jit/tuning/lm_1.5/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector/tuning/lm_4.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector/tuning/lm_2.5/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector/tuning/lm_3.5/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector/tuning/lm_2.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector/tuning/lm_3.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector/tuning/lm_4.5/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector/search/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/second_step/ga_glowTTS_ASR_ffn_x_vector_v2_blstm_2ndstep/specaug/ce_ls_1.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/second_step/ga_glowTTS_ASR_ffn_x_vector_v2_blstm_2ndstep/no_specaug/ce_ls_1.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/second_step/ga_glowTTS_ASR_ffn_x_vector_v2_blstm_2ndstep_v2/no_specaug/ce_ls_1.0/search/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_4x512_d0.2_b300_fs4/asr_dataset/glow_enc768/not_silence_preprocessed/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/default/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_eval_spec_aug/not_silence_preprocessed/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_eval/not_silence_preprocessed/tuned/lm_4.0_ps_0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_eval/not_silence_preprocessed/tuned/lm_3.0_ps_0.5/default_250/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_eval/not_silence_preprocessed/tuned/lm_3.5_ps_0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_eval/not_silence_preprocessed/tuned/lm_4.0_ps_0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_eval/not_silence_preprocessed/tuned/lm_2.5_ps_0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_eval/not_silence_preprocessed/tuned/lm_2.5_ps_0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_eval/not_silence_preprocessed/tuned/lm_3.5_ps_0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_eval/not_silence_preprocessed/tuned/lm_4.0_ps_0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_eval/not_silence_preprocessed/tuned/lm_3.0_ps_0.3/default_250/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_eval/not_silence_preprocessed/tuned/lm_3.5_ps_0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_eval/not_silence_preprocessed/tuned/lm_2.5_ps_0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_eval/not_silence_preprocessed/tuned/lm_3.0_ps_0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_eval/not_silence_preprocessed/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_not_eval/not_silence_preprocessed/tuned/lm_3.5_ps_0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_not_eval/not_silence_preprocessed/tuned/lm_3.0_ps_0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_not_eval/not_silence_preprocessed/tuned/lm_3.5_ps_0/default_250/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_not_eval/not_silence_preprocessed/tuned/lm_4.0_ps_0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_not_eval/not_silence_preprocessed/tuned/lm_2.5_ps_0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_not_eval/not_silence_preprocessed/tuned/lm_3.0_ps_0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_not_eval/not_silence_preprocessed/tuned/lm_3.5_ps_0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_not_eval/not_silence_preprocessed/tuned/lm_3.0_ps_0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_not_eval/not_silence_preprocessed/tuned/lm_4.0_ps_0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_not_eval/not_silence_preprocessed/tuned/lm_2.5_ps_0/default_250/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_not_eval/not_silence_preprocessed/tuned/lm_4.0_ps_0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_not_eval/not_silence_preprocessed/tuned/lm_2.5_ps_0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_not_eval/not_silence_preprocessed/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_freezed/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm2.5_ps0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm2.0_ps0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm2.0_ps0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm1.5_ps0.5/default_250/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm3.0_ps0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm3.0_ps0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm3.5_ps0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm4.0_ps0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm3.5_ps0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm2.5_ps0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm2.5_ps0/default_250/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm1.5_ps0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm2.0_ps0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm1.5_ps0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm3.0_ps0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm4.0_ps0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm4.0_ps0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm3.5_ps0.5/default_250/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm2.5_ps0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm3.5_ps0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm2.5_ps0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm1.5_ps0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm2.0_ps0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm4.0_ps0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm3.0_ps0.3/default_250/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm1.5_ps0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm4.0_ps0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm3.5_ps0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm2.0_ps0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm2.5_ps0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm3.0_ps0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm2.0_ps0.5/default_250/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm1.5_ps0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm3.0_ps0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm3.5_ps0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm4.0_ps0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_pretrained/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc768_100ep_xvector/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_simple_encoder/silence_preprocessed/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment_before/glow_enc192_200ep_not_freezed/default_250/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment_before/glow_enc192_200ep/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/glow_eval/not_silence_preprocessed/tuning/lm_4.0_ps_0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/glow_eval/not_silence_preprocessed/tuning/lm_3.0_ps_0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/glow_eval/not_silence_preprocessed/tuning/lm_3.5_ps_0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/glow_eval/not_silence_preprocessed/tuning/lm_2.5_ps_0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/glow_eval/not_silence_preprocessed/tuning/lm_2.5_ps_0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/glow_eval/not_silence_preprocessed/tuning/lm_4.0_ps_0/default_250/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/glow_eval/not_silence_preprocessed/tuning/lm_4.0_ps_0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/glow_eval/not_silence_preprocessed/tuning/lm_3.0_ps_0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/glow_eval/not_silence_preprocessed/tuning/lm_3.5_ps_0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/glow_eval/not_silence_preprocessed/tuning/lm_3.5_ps_0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/glow_eval/not_silence_preprocessed/tuning/lm_3.0_ps_0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/glow_eval/not_silence_preprocessed/tuning/lm_2.5_ps_0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/glow_eval/not_silence_preprocessed/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/not_silence_preprocessed/default_250/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/not_silence_preprocessed/tuning/lm_2.5_ps_0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/not_silence_preprocessed/tuning/lm_3.5_ps_0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/not_silence_preprocessed/tuning/lm_3.0_ps_0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/not_silence_preprocessed/tuning/lm_4.0_ps_0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/not_silence_preprocessed/tuning/lm_4.0_ps_0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/not_silence_preprocessed/tuning/lm_2.5_ps_0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/not_silence_preprocessed/tuning/lm_2.5_ps_0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/not_silence_preprocessed/tuning/lm_3.5_ps_0/default_250/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/not_silence_preprocessed/tuning/lm_3.5_ps_0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/not_silence_preprocessed/tuning/lm_3.0_ps_0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/not_silence_preprocessed/tuning/lm_3.0_ps_0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/not_silence_preprocessed/tuning/lm_4.0_ps_0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/with_sigma/silence_preprocessing/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/with_sigma/not_silence_preprocessed/glow_eval/tuning/lm_2.5_ps_0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/with_sigma/not_silence_preprocessed/glow_eval/tuning/lm_4.0_ps_0.3/default_250/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/with_sigma/not_silence_preprocessed/glow_eval/tuning/lm_4.0_ps_0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/with_sigma/not_silence_preprocessed/glow_eval/tuning/lm_3.0_ps_0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/with_sigma/not_silence_preprocessed/glow_eval/tuning/lm_3.5_ps_0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/with_sigma/not_silence_preprocessed/glow_eval/tuning/lm_2.5_ps_0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/with_sigma/not_silence_preprocessed/glow_eval/tuning/lm_2.5_ps_0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/with_sigma/not_silence_preprocessed/glow_eval/tuning/lm_3.5_ps_0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/with_sigma/not_silence_preprocessed/glow_eval/tuning/lm_4.0_ps_0.5/default_250/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/with_sigma/not_silence_preprocessed/glow_eval/tuning/lm_3.0_ps_0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/with_sigma/not_silence_preprocessed/glow_eval/tuning/lm_3.0_ps_0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/with_sigma/not_silence_preprocessed/glow_eval/tuning/lm_3.5_ps_0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/with_sigma/not_silence_preprocessed/glow_eval/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/with_sigma/not_silence_preprocessed/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/silence_preprocessed/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm3.0_ps0.5/default_250/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm4.0_ps0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm3.5_ps0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm2.5_ps0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm2.0_ps0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm1.5_ps0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm2.0_ps0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm3.0_ps0/default_250/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm1.5_ps0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm3.0_ps0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm4.0_ps0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm3.5_ps0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm4.0_ps0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm3.5_ps0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm2.5_ps0.5/default_250/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm2.0_ps0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm1.5_ps0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm2.5_ps0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_freezed/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm3.0_ps0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm4.0_ps0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm3.5_ps0.5/default_250/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm3.0_ps0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm2.5_ps0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm2.0_ps0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm1.5_ps0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm2.0_ps0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm3.0_ps0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm3.5_ps0.3/default_250/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm4.0_ps0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm4.0_ps0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm1.5_ps0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm2.5_ps0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm2.5_ps0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm2.0_ps0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm1.5_ps0.5/default_250/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm3.5_ps0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_pretrained/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/drop_around_blstm/lm5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/drop_around_blstm/spec_augment/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/tts_dataset/glow_nar_taco_encoder_16blocks/not_silence_preprocessed/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/tts_dataset/glow_enc768/with_sigma/not_silence_preprocessed/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/tts_dataset/glow_enc768/with_sigma/silence_preprocessing/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/tts_dataset/glow_nar_taco_encoder/not_silence_preprocessed/default_250/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/tts_dataset/glow_nar_taco_encoder/silence_preprocessed/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/tts_dataset/glow_with_small_enc/silence_preprocessed/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/tts_dataset/glow_simple_encoder/silence_preprocessed/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/tts_dataset/glow_simple_encoder_epoch84/silence_preprocessed/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/tts_dataset/glow_enc192/100epTTS/silence_preprocessed/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/tts_dataset/glow_enc192/200epsTTS_early_eval_ep100/silence_preprocessed/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/tts_dataset/glow_enc192/200epsTTS_early_eval_ep100/not_silence_preprocessed/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/tts_dataset/glow_enc192/not_silence_preprocessed/default_250/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/tts_dataset/glow_enc192/200epsTTS/silence_preprocessed/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/tts_dataset/glow_enc192/200epsTTS/not_silence_preprocessed/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/linear_1x512_d0.2_b300_fs4/glow_nar_taco_encoder_16blocks/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/linear_1x512_d0.2_b300_fs4/glow_enc768/tts_dataset/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x1024_d0.2_b300_fs4/asr_dataset/glow_enc768/not_silence_preprocessed/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/only_blstm_2x512_d0.2_b300_fs4/tts_dataset/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/only_blstm_2x512_d0.2_b300_fs4/asr_dataset/tuning/lm_4.0_ps_0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/only_blstm_2x512_d0.2_b300_fs4/asr_dataset/tuning/lm_2.5_ps_0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/only_blstm_2x512_d0.2_b300_fs4/asr_dataset/tuning/lm_2.5_ps_0.5/default_250/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/only_blstm_2x512_d0.2_b300_fs4/asr_dataset/tuning/lm_4.0_ps_0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/only_blstm_2x512_d0.2_b300_fs4/asr_dataset/tuning/lm_3.0_ps_0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/only_blstm_2x512_d0.2_b300_fs4/asr_dataset/tuning/lm_3.5_ps_0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/only_blstm_2x512_d0.2_b300_fs4/asr_dataset/tuning/lm_2.5_ps_0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/only_blstm_2x512_d0.2_b300_fs4/asr_dataset/tuning/lm_3.0_ps_0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/only_blstm_2x512_d0.2_b300_fs4/asr_dataset/tuning/lm_3.5_ps_0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/only_blstm_2x512_d0.2_b300_fs4/asr_dataset/tuning/lm_4.0_ps_0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/only_blstm_2x512_d0.2_b300_fs4/asr_dataset/tuning/lm_3.0_ps_0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/only_blstm_2x512_d0.2_b300_fs4/asr_dataset/tuning/lm_3.5_ps_0.3/default_250/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/only_blstm_2x512_d0.2_b300_fs4/asr_dataset/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/only_blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/tuning/lm_2.5_ps_0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/only_blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/tuning/lm_2.5_ps_0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/only_blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/tuning/lm_4.0_ps_0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/only_blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/tuning/lm_3.5_ps_0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/only_blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/tuning/lm_3.0_ps_0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/only_blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/tuning/lm_4.0_ps_0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/only_blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/tuning/lm_3.0_ps_0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/only_blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/tuning/lm_2.5_ps_0.3/default_250/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/only_blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/tuning/lm_3.5_ps_0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/only_blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/tuning/lm_3.0_ps_0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/only_blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/tuning/lm_3.5_ps_0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/only_blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/tuning/lm_4.0_ps_0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/only_blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/tts_dataset/spec_augment/no_glow/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_1/search_params/lm_3.5_ps_0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_1/search_params/lm_3.0_ps_0.5/default_250/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_1/search_params/lm_4.0_ps_0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_1/search_params/lm_4.0_ps_0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_1/search_params/lm_2.5_ps_0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_1/search_params/lm_2.5_ps_0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_1/search_params/lm_3.5_ps_0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_1/search_params/lm_3.0_ps_0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_1/search_params/lm_3.0_ps_0.3/default_250/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_1/search_params/lm_4.0_ps_0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_1/search_params/lm_2.5_ps_0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_1/search_params/lm_3.5_ps_0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_1/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed/tuned/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed/layer_norm/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed/batch_norm/default_250/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed/search_params/lm_3.0_ps_0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed/search_params/lm_3.5_ps_0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed/search_params/lm_4.0_ps_0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed/search_params/lm_4.0_ps_0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed/search_params/lm_2.5_ps_0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed/search_params/lm_2.5_ps_0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed/search_params/lm_3.0_ps_0.3/default_250/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed/search_params/lm_3.5_ps_0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed/search_params/lm_3.5_ps_0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed/search_params/lm_4.0_ps_0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed/search_params/lm_2.5_ps_0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed/search_params/lm_3.0_ps_0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed/tuning/lm_4.0_ps_0.3/default_250/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed/tuning/lm_3.0_ps_0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed/tuning/lm_3.0_ps_0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed/tuning/lm_3.5_ps_0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed/tuning/lm_2.5_ps_0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed/tuning/lm_3.5_ps_0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed/tuning/lm_4.0_ps_0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed/tuning/lm_2.5_ps_0/default_250/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed/tuning/lm_3.0_ps_0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed/tuning/lm_3.5_ps_0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed/tuning/lm_4.0_ps_0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed/tuning/lm_2.5_ps_0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.4/default_250/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc768_200ep_not_silence_preprocessed/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc768_200ep_not_silence_preprocessed/tuning/lm_4.0_ps_0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc768_200ep_not_silence_preprocessed/tuning/lm_3.5_ps_0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc768_200ep_not_silence_preprocessed/tuning/lm_3.5_ps_0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc768_200ep_not_silence_preprocessed/tuning/lm_3.0_ps_0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc768_200ep_not_silence_preprocessed/tuning/lm_3.0_ps_0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc768_200ep_not_silence_preprocessed/tuning/lm_2.5_ps_0.3/default_250/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc768_200ep_not_silence_preprocessed/tuning/lm_4.0_ps_0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc768_200ep_not_silence_preprocessed/tuning/lm_3.5_ps_0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc768_200ep_not_silence_preprocessed/tuning/lm_3.0_ps_0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc768_200ep_not_silence_preprocessed/tuning/lm_4.0_ps_0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc768_200ep_not_silence_preprocessed/tuning/lm_2.5_ps_0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc768_200ep_not_silence_preprocessed/tuning/lm_2.5_ps_0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.1/default_250/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.5/search_params/lm_4.0_ps_0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.5/search_params/lm_3.5_ps_0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.5/search_params/lm_3.0_ps_0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.5/search_params/lm_4.0_ps_0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.5/search_params/lm_2.5_ps_0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.5/search_params/lm_2.5_ps_0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.5/search_params/lm_4.0_ps_0.3/default_250/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.5/search_params/lm_3.5_ps_0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.5/search_params/lm_3.5_ps_0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.5/search_params/lm_3.0_ps_0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.5/search_params/lm_3.0_ps_0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.5/search_params/lm_2.5_ps_0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.2/default_250/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed/search_params/lm_4.0_ps_0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed/search_params/lm_3.5_ps_0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed/search_params/lm_3.0_ps_0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed/search_params/lm_4.0_ps_0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed/search_params/lm_2.5_ps_0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed/search_params/lm_2.5_ps_0.3/default_250/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed/search_params/lm_3.5_ps_0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed/search_params/lm_4.0_ps_0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed/search_params/lm_3.5_ps_0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed/search_params/lm_3.0_ps_0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed/search_params/lm_3.0_ps_0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed/search_params/lm_2.5_ps_0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment_before/glow_enc192_200ep_not_silence_preprocessed/default_250/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment_before/glow_enc192_not_pretrained/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment_before/glow_enc192_not_pretrained/lm_tuning/lm2.0_ps0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment_before/glow_enc192_not_pretrained/lm_tuning/lm1.5_ps0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment_before/glow_enc192_not_pretrained/lm_tuning/lm2.5_ps0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment_before/glow_enc192_not_pretrained/lm_tuning/lm3.5_ps0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment_before/glow_enc192_not_pretrained/lm_tuning/lm4.0_ps0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment_before/glow_enc192_not_pretrained/lm_tuning/lm3.0_ps0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment_before/glow_enc192_not_pretrained/lm_tuning/lm3.0_ps0.3/default_250/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment_before/glow_enc192_not_pretrained/lm_tuning/lm2.0_ps0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment_before/glow_enc192_not_pretrained/lm_tuning/lm1.5_ps0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment_before/glow_enc192_not_pretrained/lm_tuning/lm2.0_ps0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment_before/glow_enc192_not_pretrained/lm_tuning/lm2.5_ps0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment_before/glow_enc192_not_pretrained/lm_tuning/lm1.5_ps0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment_before/glow_enc192_not_pretrained/lm_tuning/lm4.0_ps0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment_before/glow_enc192_not_pretrained/lm_tuning/lm4.0_ps0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment_before/glow_enc192_not_pretrained/lm_tuning/lm3.5_ps0.3/default_250/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment_before/glow_enc192_not_pretrained/lm_tuning/lm2.5_ps0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment_before/glow_enc192_not_pretrained/lm_tuning/lm3.0_ps0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment_before/glow_enc192_not_pretrained/lm_tuning/lm3.5_ps0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/no_glow/search_params/lm_4.0_ps_0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/no_glow/search_params/lm_3.0_ps_0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/no_glow/search_params/lm_3.5_ps_0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/no_glow/search_params/lm_4.0_ps_0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/no_glow/search_params/lm_2.5_ps_0/default_250/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/no_glow/search_params/lm_2.5_ps_0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/no_glow/search_params/lm_4.0_ps_0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/no_glow/search_params/lm_3.5_ps_0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/no_glow/search_params/lm_3.0_ps_0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/no_glow/search_params/lm_3.5_ps_0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/no_glow/search_params/lm_2.5_ps_0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/no_glow/search_params/lm_3.0_ps_0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/no_glow/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_not_pretrained/default_250/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_not_pretrained/lm_tuning/lm3.0_ps0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_not_pretrained/lm_tuning/lm2.5_ps0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_not_pretrained/lm_tuning/lm2.0_ps0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_not_pretrained/lm_tuning/lm2.0_ps0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_not_pretrained/lm_tuning/lm1.5_ps0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_not_pretrained/lm_tuning/lm3.0_ps0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_not_pretrained/lm_tuning/lm3.5_ps0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_not_pretrained/lm_tuning/lm4.0_ps0.5/default_250/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_not_pretrained/lm_tuning/lm2.5_ps0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_not_pretrained/lm_tuning/lm2.5_ps0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_not_pretrained/lm_tuning/lm1.5_ps0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_not_pretrained/lm_tuning/lm3.5_ps0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_not_pretrained/lm_tuning/lm2.0_ps0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_not_pretrained/lm_tuning/lm3.0_ps0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_not_pretrained/lm_tuning/lm4.0_ps0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_not_pretrained/lm_tuning/lm4.0_ps0.3/default_250/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_not_pretrained/lm_tuning/lm1.5_ps0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_not_pretrained/lm_tuning/lm3.5_ps0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm3.0_ps0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm2.5_ps0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm2.0_ps0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm2.0_ps0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm1.5_ps0.3/default_250/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm3.0_ps0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm3.5_ps0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm4.0_ps0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm2.5_ps0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm2.5_ps0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm1.5_ps0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm2.0_ps0.5/default_250/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm3.5_ps0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm3.0_ps0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm4.0_ps0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm4.0_ps0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm3.5_ps0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm1.5_ps0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/default_250/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/search_params/lm_4.0_ps_0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/search_params/lm_3.5_ps_0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/search_params/lm_3.5_ps_0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/search_params/lm_3.0_ps_0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/search_params/lm_3.0_ps_0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/search_params/lm_2.5_ps_0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/search_params/lm_4.0_ps_0.5/default_250/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/search_params/lm_3.5_ps_0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/search_params/lm_3.0_ps_0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/search_params/lm_4.0_ps_0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/search_params/lm_2.5_ps_0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/search_params/lm_2.5_ps_0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc768_200ep_dec_0.05_v2/search_params/lm_2.5_ps_0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc768_200ep_dec_0.05_v2/search_params/lm_3.5_ps_0.3/default_250/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc768_200ep_dec_0.05_v2/search_params/lm_3.0_ps_0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc768_200ep_dec_0.05_v2/search_params/lm_4.0_ps_0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc768_200ep_dec_0.05_v2/search_params/lm_4.0_ps_0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc768_200ep_dec_0.05_v2/search_params/lm_2.5_ps_0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc768_200ep_dec_0.05_v2/search_params/lm_3.5_ps_0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc768_200ep_dec_0.05_v2/search_params/lm_3.0_ps_0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc768_200ep_dec_0.05_v2/search_params/lm_3.0_ps_0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc768_200ep_dec_0.05_v2/search_params/lm_4.0_ps_0.3/default_250/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc768_200ep_dec_0.05_v2/search_params/lm_2.5_ps_0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc768_200ep_dec_0.05_v2/search_params/lm_3.5_ps_0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc768_200ep_dec_0.05_v2/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/weak_baseline/lm_tuning/lm3.0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/weak_baseline/lm_tuning/lm2.0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/weak_baseline/lm_tuning/lm3.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/weak_baseline/lm_tuning/lm2.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/weak_baseline/lm_tuning/lm1.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/weak_baseline/lm_tuning/lm4.0/default_250/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/weak_baseline/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/lm_tuning/lm2.5_ps0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/lm_tuning/lm2.0_ps0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/lm_tuning/lm1.5_ps0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/lm_tuning/lm3.0_ps0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/lm_tuning/lm3.0_ps0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/lm_tuning/lm2.0_ps0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/lm_tuning/lm3.5_ps0.5/default_250/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/lm_tuning/lm4.0_ps0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/lm_tuning/lm2.5_ps0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/lm_tuning/lm4.0_ps0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/lm_tuning/lm1.5_ps0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/lm_tuning/lm2.0_ps0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/lm_tuning/lm1.5_ps0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/lm_tuning/lm3.0_ps0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/lm_tuning/lm2.5_ps0/default_250/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/lm_tuning/lm4.0_ps0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/lm_tuning/lm3.5_ps0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/lm_tuning/lm3.5_ps0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_100ep_xvector_v2/search_params/lm_2.5_ps_0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_100ep_xvector_v2/search_params/lm_3.0_ps_0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_100ep_xvector_v2/search_params/lm_3.5_ps_0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_100ep_xvector_v2/search_params/lm_4.0_ps_0/default_250/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_100ep_xvector_v2/search_params/lm_4.0_ps_0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_100ep_xvector_v2/search_params/lm_2.5_ps_0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_100ep_xvector_v2/search_params/lm_3.0_ps_0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_100ep_xvector_v2/search_params/lm_3.5_ps_0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_100ep_xvector_v2/search_params/lm_3.0_ps_0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_100ep_xvector_v2/search_params/lm_4.0_ps_0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_100ep_xvector_v2/search_params/lm_2.5_ps_0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_100ep_xvector_v2/search_params/lm_3.5_ps_0/default_250/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_100ep_xvector_v2/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm2.0_ps0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm4.0_ps0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm1.5_ps0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm1.5_ps0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm2.5_ps0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm3.5_ps0.3/default_250/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm4.0_ps0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm3.5_ps0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm3.0_ps0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm2.5_ps0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm1.5_ps0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm2.0_ps0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm2.5_ps0.5/default_250/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm4.0_ps0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm3.5_ps0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm2.0_ps0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm3.0_ps0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm3.0_ps0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.5/default_250/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.5/search_params/lm_4.0_ps_0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.5/search_params/lm_3.0_ps_0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.5/search_params/lm_3.5_ps_0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.5/search_params/lm_2.5_ps_0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.5/search_params/lm_4.0_ps_0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.5/search_params/lm_2.5_ps_0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.5/search_params/lm_4.0_ps_0.5/default_250/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.5/search_params/lm_3.5_ps_0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.5/search_params/lm_3.0_ps_0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.5/search_params/lm_3.5_ps_0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.5/search_params/lm_2.5_ps_0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.5/search_params/lm_3.0_ps_0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained_control/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained_control/lm_tuning/lm1.5_ps0.3/default_250/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained_control/lm_tuning/lm2.0_ps0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained_control/lm_tuning/lm2.5_ps0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained_control/lm_tuning/lm2.5_ps0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained_control/lm_tuning/lm3.5_ps0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained_control/lm_tuning/lm4.0_ps0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained_control/lm_tuning/lm3.5_ps0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained_control/lm_tuning/lm3.0_ps0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained_control/lm_tuning/lm1.5_ps0/default_250/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained_control/lm_tuning/lm4.0_ps0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained_control/lm_tuning/lm2.0_ps0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained_control/lm_tuning/lm1.5_ps0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained_control/lm_tuning/lm3.0_ps0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained_control/lm_tuning/lm2.5_ps0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained_control/lm_tuning/lm2.0_ps0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained_control/lm_tuning/lm3.5_ps0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained_control/lm_tuning/lm4.0_ps0.3/default_250/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained_control/lm_tuning/lm3.0_ps0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_1/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_1/search_params/lm_2.5_ps_0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_1/search_params/lm_3.5_ps_0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_1/search_params/lm_3.0_ps_0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_1/search_params/lm_3.5_ps_0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_1/search_params/lm_4.0_ps_0.3/default_250/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_1/search_params/lm_3.0_ps_0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_1/search_params/lm_2.5_ps_0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_1/search_params/lm_4.0_ps_0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_1/search_params/lm_3.0_ps_0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_1/search_params/lm_2.5_ps_0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_1/search_params/lm_3.5_ps_0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_1/search_params/lm_4.0_ps_0.5/default_250/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_100ep_xvector/search_params/lm_3.0_ps_0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_100ep_xvector/search_params/lm_4.0_ps_0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_100ep_xvector/search_params/lm_3.0_ps_0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_100ep_xvector/search_params/lm_3.5_ps_0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_100ep_xvector/search_params/lm_2.5_ps_0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_100ep_xvector/search_params/lm_3.5_ps_0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_100ep_xvector/search_params/lm_4.0_ps_0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_100ep_xvector/search_params/lm_3.0_ps_0.3/default_250/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_100ep_xvector/search_params/lm_2.5_ps_0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_100ep_xvector/search_params/lm_3.5_ps_0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_100ep_xvector/search_params/lm_4.0_ps_0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_100ep_xvector/search_params/lm_2.5_ps_0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_100ep_xvector/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed_v3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/no_glow/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/no_glow/tuned_no_prior/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/no_glow/tuned/default_250/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/no_glow/search_params/lm_2.5_ps_0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/no_glow/search_params/lm_4.0_ps_0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/no_glow/search_params/lm_2.5_ps_0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/no_glow/search_params/lm_3.5_ps_0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/no_glow/search_params/lm_3.0_ps_0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/no_glow/search_params/lm_4.0_ps_0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/no_glow/search_params/lm_2.5_ps_0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/no_glow/search_params/lm_3.0_ps_0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/no_glow/search_params/lm_3.5_ps_0.3/default_250/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/no_glow/search_params/lm_3.5_ps_0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/no_glow/search_params/lm_3.0_ps_0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/no_glow/search_params/lm_4.0_ps_0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_200ep_dec_0.05_v2/search_params/lm_3.0_ps_0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_200ep_dec_0.05_v2/search_params/lm_4.0_ps_0.3/default_250/dev-other/sclite/wer',\n", + "['/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment_before/glow_enc192_not_pretrained/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment_before/glow_enc192_not_pretrained/lm_tuning/lm1.5_ps0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment_before/glow_enc192_not_pretrained/lm_tuning/lm2.0_ps0.5/default_250/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment_before/glow_enc192_not_pretrained/lm_tuning/lm2.5_ps0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment_before/glow_enc192_not_pretrained/lm_tuning/lm2.0_ps0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment_before/glow_enc192_not_pretrained/lm_tuning/lm4.0_ps0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment_before/glow_enc192_not_pretrained/lm_tuning/lm3.5_ps0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment_before/glow_enc192_not_pretrained/lm_tuning/lm3.0_ps0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment_before/glow_enc192_not_pretrained/lm_tuning/lm3.0_ps0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment_before/glow_enc192_not_pretrained/lm_tuning/lm2.0_ps0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment_before/glow_enc192_not_pretrained/lm_tuning/lm1.5_ps0.3/default_250/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment_before/glow_enc192_not_pretrained/lm_tuning/lm4.0_ps0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment_before/glow_enc192_not_pretrained/lm_tuning/lm1.5_ps0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment_before/glow_enc192_not_pretrained/lm_tuning/lm2.5_ps0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment_before/glow_enc192_not_pretrained/lm_tuning/lm3.5_ps0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment_before/glow_enc192_not_pretrained/lm_tuning/lm4.0_ps0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment_before/glow_enc192_not_pretrained/lm_tuning/lm3.5_ps0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment_before/glow_enc192_not_pretrained/lm_tuning/lm3.0_ps0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment_before/glow_enc192_not_pretrained/lm_tuning/lm2.5_ps0/default_250/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/search_params/lm_4.0_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/search_params/lm_3.0_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/search_params/lm_1.5_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/search_params/lm_3.0_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/search_params/lm_3.5_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/search_params/lm_1.5_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/search_params/lm_2.5_ps_0.5/default_250/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/search_params/lm_3.5_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/search_params/lm_2.0_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/search_params/lm_4.0_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/search_params/lm_1.5_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/search_params/lm_4.0_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/search_params/lm_3.0_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/search_params/lm_3.5_ps_0.3/default_250/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/search_params/lm_2.5_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/search_params/lm_2.0_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/search_params/lm_2.5_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/search_params/lm_2.0_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc768_200ep_dec_0.05_v2/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc768_200ep_dec_0.05_v2/search_params/lm_2.0_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc768_200ep_dec_0.05_v2/search_params/lm_2.5_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc768_200ep_dec_0.05_v2/search_params/lm_3.0_ps_0/default_250/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc768_200ep_dec_0.05_v2/search_params/lm_3.5_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc768_200ep_dec_0.05_v2/search_params/lm_3.0_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc768_200ep_dec_0.05_v2/search_params/lm_3.5_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc768_200ep_dec_0.05_v2/search_params/lm_1.5_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc768_200ep_dec_0.05_v2/search_params/lm_4.0_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc768_200ep_dec_0.05_v2/search_params/lm_1.5_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc768_200ep_dec_0.05_v2/search_params/lm_2.5_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc768_200ep_dec_0.05_v2/search_params/lm_2.0_ps_0.3/default_250/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc768_200ep_dec_0.05_v2/search_params/lm_2.5_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc768_200ep_dec_0.05_v2/search_params/lm_4.0_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc768_200ep_dec_0.05_v2/search_params/lm_3.5_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc768_200ep_dec_0.05_v2/search_params/lm_3.0_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc768_200ep_dec_0.05_v2/search_params/lm_1.5_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc768_200ep_dec_0.05_v2/search_params/lm_2.0_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc768_200ep_dec_0.05_v2/search_params/lm_4.0_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_not_pretrained/lm_tuning/lm2.5_ps0/default_250/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_not_pretrained/lm_tuning/lm1.5_ps0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_not_pretrained/lm_tuning/lm2.0_ps0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_not_pretrained/lm_tuning/lm3.5_ps0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_not_pretrained/lm_tuning/lm2.5_ps0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_not_pretrained/lm_tuning/lm4.0_ps0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_not_pretrained/lm_tuning/lm3.5_ps0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_not_pretrained/lm_tuning/lm1.5_ps0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_not_pretrained/lm_tuning/lm3.0_ps0.5/default_250/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_not_pretrained/lm_tuning/lm4.0_ps0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_not_pretrained/lm_tuning/lm3.0_ps0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_not_pretrained/lm_tuning/lm2.0_ps0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_not_pretrained/lm_tuning/lm1.5_ps0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_not_pretrained/lm_tuning/lm2.0_ps0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_not_pretrained/lm_tuning/lm2.5_ps0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_not_pretrained/lm_tuning/lm3.5_ps0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_not_pretrained/lm_tuning/lm4.0_ps0.5/default_250/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_not_pretrained/lm_tuning/lm3.0_ps0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_not_pretrained/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm3.0_ps0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm4.0_ps0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm3.5_ps0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm4.0_ps0.3/default_250/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm1.5_ps0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm2.5_ps0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm2.5_ps0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm2.0_ps0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm1.5_ps0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm3.5_ps0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm3.0_ps0.3/default_250/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm4.0_ps0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm3.5_ps0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm3.0_ps0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm2.5_ps0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm2.0_ps0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm1.5_ps0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm2.0_ps0.3/default_250/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/no_glow/search_params/lm_4.0_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/no_glow/search_params/lm_3.5_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/no_glow/search_params/lm_3.0_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/no_glow/search_params/lm_3.0_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/no_glow/search_params/lm_3.5_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/no_glow/search_params/lm_2.5_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/no_glow/search_params/lm_4.0_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/no_glow/search_params/lm_4.0_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/no_glow/search_params/lm_2.5_ps_0/default_250/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/no_glow/search_params/lm_3.5_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/no_glow/search_params/lm_3.0_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/no_glow/search_params/lm_2.5_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/no_glow/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/weak_baseline/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/weak_baseline/lm_tuning/lm3.0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/weak_baseline/lm_tuning/lm2.0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/weak_baseline/lm_tuning/lm3.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/weak_baseline/lm_tuning/lm2.5/default_250/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/weak_baseline/lm_tuning/lm1.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/weak_baseline/lm_tuning/lm4.0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_200ep_dec_0.05_v2/default_250/dev-other/sclite/wer',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_200ep_dec_0.05_v2/search_params/lm_3.5_ps_0.5/default_250/dev-other/sclite/wer',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_200ep_dec_0.05_v2/search_params/lm_3.0_ps_0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_200ep_dec_0.05_v2/search_params/lm_3.5_ps_0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_200ep_dec_0.05_v2/search_params/lm_2.5_ps_0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_200ep_dec_0.05_v2/search_params/lm_4.0_ps_0.5/default_250/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_200ep_dec_0.05_v2/search_params/lm_3.5_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_200ep_dec_0.05_v2/search_params/lm_1.5_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_200ep_dec_0.05_v2/search_params/lm_4.0_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_200ep_dec_0.05_v2/search_params/lm_2.0_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_200ep_dec_0.05_v2/search_params/lm_2.0_ps_0.3/default_250/dev-other/sclite/wer',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_200ep_dec_0.05_v2/search_params/lm_2.5_ps_0/default_250/dev-other/sclite/wer',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_200ep_dec_0.05_v2/search_params/lm_4.0_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_200ep_dec_0.05_v2/search_params/lm_2.5_ps_0.5/default_250/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_200ep_dec_0.05_v2/search_params/lm_3.5_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_200ep_dec_0.05_v2/search_params/lm_3.5_ps_0/default_250/dev-other/sclite/wer',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_200ep_dec_0.05_v2/search_params/lm_3.0_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_200ep_dec_0.05_v2/search_params/lm_1.5_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_200ep_dec_0.05_v2/search_params/lm_1.5_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_200ep_dec_0.05_v2/search_params/lm_4.0_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_200ep_dec_0.05_v2/search_params/lm_2.0_ps_0.5/default_250/dev-other/sclite/wer',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_200ep_dec_0.05_v2/search_params/lm_2.5_ps_0.3/default_250/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_200ep_dec_0.05_v2/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/search_params/lm_2.5_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_200ep_dec_0.05_v2/search_params/lm_3.0_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/default_250/dev-other/sclite/wer',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/search_params/lm_3.5_ps_0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/search_params/lm_3.0_ps_0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/search_params/lm_3.0_ps_0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/search_params/lm_3.5_ps_0.5/default_250/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/search_params/lm_4.0_ps_0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/search_params/lm_2.5_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/search_params/lm_4.0_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/search_params/lm_1.5_ps_0.5/default_250/dev-other/sclite/wer',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/search_params/lm_3.0_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/search_params/lm_1.5_ps_0/default_250/dev-other/sclite/wer',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/search_params/lm_3.5_ps_0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/search_params/lm_2.5_ps_0/default_250/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/search_params/lm_4.0_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/search_params/lm_2.5_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/search_params/lm_3.0_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/search_params/lm_2.0_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/search_params/lm_4.0_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/search_params/lm_1.5_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/search_params/lm_3.0_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/search_params/lm_2.0_ps_0/default_250/dev-other/sclite/wer',\n", 
+ " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/search_params/lm_3.5_ps_0.5/default_250/dev-other/sclite/wer',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/search_params/lm_4.0_ps_0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/search_params/lm_2.5_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/search_params/lm_2.5_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/search_params/lm_2.0_ps_0.3/default_250/dev-other/sclite/wer',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed_v2/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_dec_0.05_v2/search_params/lm_3.5_ps_0/default_250/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_dec_0.05_v2/search_params/lm_4.0_ps_0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_dec_0.05_v2/search_params/lm_3.5_ps_0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_dec_0.05_v2/search_params/lm_3.0_ps_0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_dec_0.05_v2/search_params/lm_3.0_ps_0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_dec_0.05_v2/search_params/lm_2.5_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/no_glow/search_params/lm_3.0_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/no_glow/search_params/lm_3.0_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/no_glow/search_params/lm_3.5_ps_0.3/default_250/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/no_glow/search_params/lm_4.0_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/no_glow/search_params/lm_2.5_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/no_glow/search_params/lm_3.5_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/no_glow/search_params/lm_3.0_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/no_glow/search_params/lm_3.5_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/no_glow/search_params/lm_2.5_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/no_glow/search_params/lm_4.0_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/no_glow/search_params/lm_4.0_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/no_glow/search_params/lm_2.5_ps_0.5/default_250/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/no_glow/tuned_no_prior/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/no_glow/tuned/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/no_glow/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_dec_0.05_v2/default_250/dev-other/sclite/wer',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_dec_0.05_v2/search_params/lm_4.0_ps_0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_dec_0.05_v2/search_params/lm_3.5_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_dec_0.05_v2/search_params/lm_1.5_ps_0.3/default_250/dev-other/sclite/wer',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_dec_0.05_v2/search_params/lm_3.0_ps_0.3/default_250/dev-other/sclite/wer',\n", " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_dec_0.05_v2/search_params/lm_4.0_ps_0/default_250/dev-other/sclite/wer',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_dec_0.05_v2/search_params/lm_2.5_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_dec_0.05_v2/search_params/lm_3.5_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_dec_0.05_v2/search_params/lm_2.0_ps_0/default_250/dev-other/sclite/wer',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_dec_0.05_v2/search_params/lm_2.5_ps_0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_dec_0.05_v2/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_dec_0.05_v2/search_params/lm_2.0_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_dec_0.05_v2/search_params/lm_4.0_ps_0.5/default_250/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_dec_0.05_v2/search_params/lm_3.0_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_dec_0.05_v2/search_params/lm_1.5_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_dec_0.05_v2/search_params/lm_3.0_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_dec_0.05_v2/search_params/lm_3.5_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_dec_0.05_v2/search_params/lm_1.5_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_dec_0.05_v2/search_params/lm_2.5_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_dec_0.05_v2/search_params/lm_3.5_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_dec_0.05_v2/search_params/lm_2.0_ps_0.5/default_250/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc192_200ep_dec_0.05_v2/search_params/lm_3.0_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc192_200ep_dec_0.05_v2/search_params/lm_2.0_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc192_200ep_dec_0.05_v2/search_params/lm_2.5_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc192_200ep_dec_0.05_v2/search_params/lm_1.5_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc192_200ep_dec_0.05_v2/search_params/lm_1.5_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc192_200ep_dec_0.05_v2/search_params/lm_4.0_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc192_200ep_dec_0.05_v2/search_params/lm_3.5_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc192_200ep_dec_0.05_v2/search_params/lm_3.5_ps_0.5/default_250/dev-other/sclite/wer',\n", + 
" '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc192_200ep_dec_0.05_v2/search_params/lm_3.0_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc192_200ep_dec_0.05_v2/search_params/lm_2.0_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc192_200ep_dec_0.05_v2/search_params/lm_4.0_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc192_200ep_dec_0.05_v2/search_params/lm_2.5_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc192_200ep_dec_0.05_v2/search_params/lm_2.5_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc192_200ep_dec_0.05_v2/search_params/lm_1.5_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc192_200ep_dec_0.05_v2/search_params/lm_2.0_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc192_200ep_dec_0.05_v2/search_params/lm_4.0_ps_0.5/default_250/dev-other/sclite/wer',\n", 
+ " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc192_200ep_dec_0.05_v2/search_params/lm_3.5_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc192_200ep_dec_0.05_v2/search_params/lm_3.0_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc192_200ep_dec_0.05_v2/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0/search_params/lm_4.0_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0/search_params/lm_3.0_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0/search_params/lm_3.5_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0/search_params/lm_3.5_ps_0.5/default_250/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0/search_params/lm_3.0_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0/search_params/lm_2.5_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0/search_params/lm_2.0_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0/search_params/lm_2.0_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0/search_params/lm_4.0_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0/search_params/lm_3.0_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0/search_params/lm_3.5_ps_0.3/default_250/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0/search_params/lm_4.0_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0/search_params/lm_2.5_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0/search_params/lm_2.0_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0/search_params/lm_2.5_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/lm_tuning/lm1.5_ps0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/lm_tuning/lm4.0_ps0/default_250/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/lm_tuning/lm3.5_ps0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/lm_tuning/lm4.0_ps0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/lm_tuning/lm3.0_ps0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/lm_tuning/lm3.5_ps0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/lm_tuning/lm2.0_ps0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/lm_tuning/lm1.5_ps0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/lm_tuning/lm2.5_ps0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/lm_tuning/lm2.5_ps0.5/default_250/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/lm_tuning/lm4.0_ps0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/lm_tuning/lm3.5_ps0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/lm_tuning/lm3.0_ps0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/lm_tuning/lm1.5_ps0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/lm_tuning/lm2.0_ps0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/lm_tuning/lm2.0_ps0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/lm_tuning/lm2.5_ps0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/lm_tuning/lm3.0_ps0/default_250/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.3/search_params/lm_4.0_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.3/search_params/lm_2.5_ps_0/default_250/dev-other/sclite/wer',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.3/search_params/lm_4.0_ps_0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.3/search_params/lm_3.0_ps_0.5/default_250/dev-other/sclite/wer',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.3/search_params/lm_3.5_ps_0.3/default_250/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.3/search_params/lm_3.5_ps_0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.3/search_params/lm_3.0_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.3/search_params/lm_3.0_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.3/search_params/lm_2.0_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.3/search_params/lm_2.0_ps_0.5/default_250/dev-other/sclite/wer',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.3/search_params/lm_2.5_ps_0.3/default_250/dev-other/sclite/wer',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.3/search_params/lm_4.0_ps_0.3/default_250/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.3/search_params/lm_3.0_ps_0.3/default_250/dev-other/sclite/wer',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.3/search_params/lm_3.5_ps_0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.3/search_params/lm_2.5_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.3/search_params/lm_3.0_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.3/search_params/lm_3.0_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.3/search_params/lm_3.5_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.3/search_params/lm_2.0_ps_0.3/default_250/dev-other/sclite/wer',\n", " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.3/search_params/lm_2.5_ps_0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.3/search_params/lm_4.0_ps_0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc192_200ep_dec_0.05_v2/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc192_200ep_dec_0.05_v2/search_params/lm_3.5_ps_0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc192_200ep_dec_0.05_v2/search_params/lm_2.5_ps_0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc192_200ep_dec_0.05_v2/search_params/lm_3.5_ps_0.3/default_250/dev-other/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc192_200ep_dec_0.05_v2/search_params/lm_3.0_ps_0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc192_200ep_dec_0.05_v2/search_params/lm_3.0_ps_0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc192_200ep_dec_0.05_v2/search_params/lm_4.0_ps_0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc192_200ep_dec_0.05_v2/search_params/lm_2.5_ps_0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc192_200ep_dec_0.05_v2/search_params/lm_3.5_ps_0.5/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc192_200ep_dec_0.05_v2/search_params/lm_2.5_ps_0/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc192_200ep_dec_0.05_v2/search_params/lm_3.0_ps_0.3/default_250/dev-other/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc192_200ep_dec_0.05_v2/search_params/lm_4.0_ps_0/default_250/dev-other/sclite/wer',\n", - 
" '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc192_200ep_dec_0.05_v2/search_params/lm_4.0_ps_0.3/default_250/dev-other/sclite/wer']" + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc768_100ep_dec_0.05_v2/search_params/lm_1.5_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc768_100ep_dec_0.05_v2/search_params/lm_2.0_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc768_100ep_dec_0.05_v2/search_params/lm_2.5_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc768_100ep_dec_0.05_v2/search_params/lm_3.5_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc768_100ep_dec_0.05_v2/search_params/lm_3.5_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc768_100ep_dec_0.05_v2/search_params/lm_3.0_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc768_100ep_dec_0.05_v2/search_params/lm_3.0_ps_0.3/default_250/dev-other/sclite/wer',\n", + 
" '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc768_100ep_dec_0.05_v2/search_params/lm_1.5_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc768_100ep_dec_0.05_v2/search_params/lm_4.0_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc768_100ep_dec_0.05_v2/search_params/lm_2.0_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc768_100ep_dec_0.05_v2/search_params/lm_2.0_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc768_100ep_dec_0.05_v2/search_params/lm_2.5_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc768_100ep_dec_0.05_v2/search_params/lm_3.5_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc768_100ep_dec_0.05_v2/search_params/lm_3.0_ps_0.5/default_250/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc768_100ep_dec_0.05_v2/search_params/lm_4.0_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc768_100ep_dec_0.05_v2/search_params/lm_1.5_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc768_100ep_dec_0.05_v2/search_params/lm_2.5_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc768_100ep_dec_0.05_v2/search_params/lm_4.0_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc768_100ep_dec_0.05_v2/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_100ep_xvector_v2/search_params/lm_2.5_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_100ep_xvector_v2/search_params/lm_2.5_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_100ep_xvector_v2/search_params/lm_4.0_ps_0.3/default_250/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_100ep_xvector_v2/search_params/lm_4.0_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_100ep_xvector_v2/search_params/lm_3.5_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_100ep_xvector_v2/search_params/lm_3.0_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_100ep_xvector_v2/search_params/lm_3.5_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_100ep_xvector_v2/search_params/lm_2.5_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_100ep_xvector_v2/search_params/lm_4.0_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_100ep_xvector_v2/search_params/lm_3.5_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_100ep_xvector_v2/search_params/lm_3.0_ps_0/default_250/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_100ep_xvector_v2/search_params/lm_3.0_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_100ep_xvector_v2/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm2.5_ps0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm1.5_ps0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm2.0_ps0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm3.0_ps0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm3.0_ps0.3/default_250/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm2.0_ps0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm4.0_ps0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm3.5_ps0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm2.5_ps0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm4.0_ps0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm2.0_ps0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm1.5_ps0.5/default_250/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm1.5_ps0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm3.0_ps0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm2.5_ps0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm3.5_ps0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm4.0_ps0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm3.5_ps0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/default_250/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained_control/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained_control/lm_tuning/lm2.5_ps0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained_control/lm_tuning/lm1.5_ps0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained_control/lm_tuning/lm2.0_ps0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained_control/lm_tuning/lm2.0_ps0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained_control/lm_tuning/lm3.0_ps0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained_control/lm_tuning/lm3.0_ps0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained_control/lm_tuning/lm4.0_ps0.5/default_250/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained_control/lm_tuning/lm3.5_ps0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained_control/lm_tuning/lm1.5_ps0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained_control/lm_tuning/lm2.5_ps0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained_control/lm_tuning/lm4.0_ps0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained_control/lm_tuning/lm2.0_ps0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained_control/lm_tuning/lm1.5_ps0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained_control/lm_tuning/lm3.5_ps0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained_control/lm_tuning/lm3.0_ps0.5/default_250/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained_control/lm_tuning/lm2.5_ps0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained_control/lm_tuning/lm3.5_ps0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained_control/lm_tuning/lm4.0_ps0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed_v3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_1/search_params/lm_2.5_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_1/search_params/lm_3.0_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_1/search_params/lm_2.0_ps_0.5/default_250/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_1/search_params/lm_3.0_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_1/search_params/lm_3.5_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_1/search_params/lm_4.0_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_1/search_params/lm_3.5_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_1/search_params/lm_2.5_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_1/search_params/lm_2.0_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_1/search_params/lm_4.0_ps_0/default_250/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_1/search_params/lm_2.5_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_1/search_params/lm_3.0_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_1/search_params/lm_2.0_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_1/search_params/lm_3.5_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_1/search_params/lm_4.0_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_1/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_100ep_xvector/search_params/lm_3.0_ps_0/default_250/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_100ep_xvector/search_params/lm_2.5_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_100ep_xvector/search_params/lm_4.0_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_100ep_xvector/search_params/lm_3.0_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_100ep_xvector/search_params/lm_3.5_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_100ep_xvector/search_params/lm_3.5_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_100ep_xvector/search_params/lm_2.5_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_100ep_xvector/search_params/lm_4.0_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_100ep_xvector/search_params/lm_2.5_ps_0/default_250/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_100ep_xvector/search_params/lm_4.0_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_100ep_xvector/search_params/lm_3.0_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_100ep_xvector/search_params/lm_3.5_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_100ep_xvector/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.5/search_params/lm_2.0_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.5/search_params/lm_2.5_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.5/search_params/lm_2.0_ps_0/default_250/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.5/search_params/lm_3.5_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.5/search_params/lm_3.0_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.5/search_params/lm_2.5_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.5/search_params/lm_4.0_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.5/search_params/lm_4.0_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.5/search_params/lm_2.0_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.5/search_params/lm_2.5_ps_0.3/default_250/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.5/search_params/lm_3.5_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.5/search_params/lm_3.5_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.5/search_params/lm_3.0_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.5/search_params/lm_3.0_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.5/search_params/lm_4.0_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment_before/glow_enc192_200ep_not_silence_preprocessed/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed/default_250/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed/search_params/lm_3.5_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed/search_params/lm_3.0_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed/search_params/lm_4.0_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed/search_params/lm_2.5_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed/search_params/lm_2.5_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed/search_params/lm_4.0_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed/search_params/lm_3.5_ps_0.5/default_250/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed/search_params/lm_3.0_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed/search_params/lm_3.5_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed/search_params/lm_4.0_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed/search_params/lm_2.5_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed/search_params/lm_3.0_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed/tuned/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed/search_params/lm_3.0_ps_0.3/default_250/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed/search_params/lm_2.5_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed/search_params/lm_4.0_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed/search_params/lm_3.5_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed/search_params/lm_4.0_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed/search_params/lm_2.5_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed/search_params/lm_3.0_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed/search_params/lm_3.5_ps_0.3/default_250/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed/search_params/lm_4.0_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed/search_params/lm_3.0_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed/search_params/lm_2.5_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed/search_params/lm_3.5_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed/batch_norm/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed/layer_norm/default_250/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.2/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.5/search_params/lm_3.0_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.5/search_params/lm_2.5_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.5/search_params/lm_4.0_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.5/search_params/lm_3.0_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.5/search_params/lm_3.5_ps_0/default_250/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.5/search_params/lm_3.5_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.5/search_params/lm_2.5_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.5/search_params/lm_4.0_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.5/search_params/lm_2.5_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.5/search_params/lm_4.0_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.5/search_params/lm_3.0_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.5/search_params/lm_3.5_ps_0.3/default_250/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.1/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed/tuning/lm_3.5_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed/tuning/lm_3.0_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed/tuning/lm_4.0_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed/tuning/lm_2.5_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed/tuning/lm_2.5_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed/tuning/lm_4.0_ps_0/default_250/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed/tuning/lm_3.5_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed/tuning/lm_3.0_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed/tuning/lm_3.5_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed/tuning/lm_4.0_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed/tuning/lm_3.0_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed/tuning/lm_2.5_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed/default_250/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.4/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc768_200ep_not_silence_preprocessed/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc768_200ep_not_silence_preprocessed/tuning/lm_4.0_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc768_200ep_not_silence_preprocessed/tuning/lm_2.5_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc768_200ep_not_silence_preprocessed/tuning/lm_3.0_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc768_200ep_not_silence_preprocessed/tuning/lm_4.0_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc768_200ep_not_silence_preprocessed/tuning/lm_3.5_ps_0.3/default_250/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc768_200ep_not_silence_preprocessed/tuning/lm_2.5_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc768_200ep_not_silence_preprocessed/tuning/lm_3.0_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc768_200ep_not_silence_preprocessed/tuning/lm_4.0_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc768_200ep_not_silence_preprocessed/tuning/lm_3.0_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc768_200ep_not_silence_preprocessed/tuning/lm_3.5_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc768_200ep_not_silence_preprocessed/tuning/lm_2.5_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc768_200ep_not_silence_preprocessed/tuning/lm_3.5_ps_0/default_250/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_1/search_params/lm_2.5_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_1/search_params/lm_2.5_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_1/search_params/lm_4.0_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_1/search_params/lm_3.0_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_1/search_params/lm_3.5_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_1/search_params/lm_4.0_ps_0.5/default_250/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_1/search_params/lm_2.5_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_1/search_params/lm_3.0_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_1/search_params/lm_3.0_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_1/search_params/lm_3.5_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_1/search_params/lm_3.5_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_1/search_params/lm_4.0_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_1/default_250/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/tts_dataset/spec_augment/no_glow/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/only_blstm_2x512_d0.2_b300_fs4/tts_dataset/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/only_blstm_2x512_d0.2_b300_fs4/asr_dataset/tuning/lm_2.5_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/only_blstm_2x512_d0.2_b300_fs4/asr_dataset/tuning/lm_3.5_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/only_blstm_2x512_d0.2_b300_fs4/asr_dataset/tuning/lm_3.0_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/only_blstm_2x512_d0.2_b300_fs4/asr_dataset/tuning/lm_3.5_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/only_blstm_2x512_d0.2_b300_fs4/asr_dataset/tuning/lm_3.0_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/only_blstm_2x512_d0.2_b300_fs4/asr_dataset/tuning/lm_4.0_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/only_blstm_2x512_d0.2_b300_fs4/asr_dataset/tuning/lm_2.5_ps_0.5/default_250/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/only_blstm_2x512_d0.2_b300_fs4/asr_dataset/tuning/lm_2.5_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/only_blstm_2x512_d0.2_b300_fs4/asr_dataset/tuning/lm_3.0_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/only_blstm_2x512_d0.2_b300_fs4/asr_dataset/tuning/lm_3.5_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/only_blstm_2x512_d0.2_b300_fs4/asr_dataset/tuning/lm_4.0_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/only_blstm_2x512_d0.2_b300_fs4/asr_dataset/tuning/lm_4.0_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/only_blstm_2x512_d0.2_b300_fs4/asr_dataset/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/only_blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/linear_1x512_d0.2_b300_fs4/glow_nar_taco_encoder_16blocks/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/linear_1x512_d0.2_b300_fs4/glow_enc768/tts_dataset/default_250/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x1024_d0.2_b300_fs4/asr_dataset/glow_enc768/not_silence_preprocessed/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/tts_dataset/glow_enc768/with_sigma/not_silence_preprocessed/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/tts_dataset/glow_enc768/with_sigma/silence_preprocessing/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/tts_dataset/glow_simple_encoder_epoch84/silence_preprocessed/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/tts_dataset/glow_nar_taco_encoder_16blocks/not_silence_preprocessed/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/tts_dataset/glow_nar_taco_encoder/not_silence_preprocessed/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/tts_dataset/glow_nar_taco_encoder/silence_preprocessed/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/tts_dataset/glow_enc192/200epsTTS/silence_preprocessed/default_250/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/tts_dataset/glow_enc192/200epsTTS/not_silence_preprocessed/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/tts_dataset/glow_enc192/200epsTTS_early_eval_ep100/not_silence_preprocessed/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/tts_dataset/glow_enc192/200epsTTS_early_eval_ep100/silence_preprocessed/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/tts_dataset/glow_enc192/not_silence_preprocessed/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/tts_dataset/glow_enc192/100epTTS/silence_preprocessed/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/tts_dataset/glow_with_small_enc/silence_preprocessed/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/tts_dataset/glow_simple_encoder/silence_preprocessed/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm4.0_ps0.5/default_250/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm3.5_ps0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm3.5_ps0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm3.0_ps0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm2.5_ps0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm1.5_ps0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm4.0_ps0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm2.0_ps0.3/default_250/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm1.5_ps0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm2.5_ps0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm3.5_ps0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm2.0_ps0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm4.0_ps0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm3.0_ps0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm3.0_ps0.5/default_250/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm2.0_ps0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm1.5_ps0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm2.5_ps0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_freezed/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_pretrained/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm3.0_ps0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm4.0_ps0.5/default_250/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm3.5_ps0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm2.5_ps0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm3.0_ps0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm1.5_ps0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm2.0_ps0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm2.0_ps0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm3.0_ps0.5/default_250/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm3.5_ps0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm4.0_ps0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm4.0_ps0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm1.5_ps0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm2.5_ps0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm2.5_ps0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm2.0_ps0.5/default_250/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm1.5_ps0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm3.5_ps0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/default/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_simple_encoder/silence_preprocessed/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment_before/glow_enc192_200ep/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment_before/glow_enc192_200ep_not_freezed/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/drop_around_blstm/spec_augment/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/drop_around_blstm/lm5/default_250/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/with_sigma/silence_preprocessing/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/with_sigma/not_silence_preprocessed/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/with_sigma/not_silence_preprocessed/glow_eval/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/with_sigma/not_silence_preprocessed/glow_eval/tuning/lm_4.0_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/with_sigma/not_silence_preprocessed/glow_eval/tuning/lm_2.5_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/with_sigma/not_silence_preprocessed/glow_eval/tuning/lm_2.5_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/with_sigma/not_silence_preprocessed/glow_eval/tuning/lm_3.5_ps_0.5/default_250/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/with_sigma/not_silence_preprocessed/glow_eval/tuning/lm_3.0_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/with_sigma/not_silence_preprocessed/glow_eval/tuning/lm_4.0_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/with_sigma/not_silence_preprocessed/glow_eval/tuning/lm_2.5_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/with_sigma/not_silence_preprocessed/glow_eval/tuning/lm_3.0_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/with_sigma/not_silence_preprocessed/glow_eval/tuning/lm_3.5_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/with_sigma/not_silence_preprocessed/glow_eval/tuning/lm_3.5_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/with_sigma/not_silence_preprocessed/glow_eval/tuning/lm_3.0_ps_0.5/default_250/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/with_sigma/not_silence_preprocessed/glow_eval/tuning/lm_4.0_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/silence_preprocessed/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/not_silence_preprocessed/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/not_silence_preprocessed/tuning/lm_4.0_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/not_silence_preprocessed/tuning/lm_2.5_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/not_silence_preprocessed/tuning/lm_3.5_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/not_silence_preprocessed/tuning/lm_3.0_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/glow_eval/not_silence_preprocessed/tuning/lm_4.0_ps_0.5/default_250/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/glow_eval/not_silence_preprocessed/tuning/lm_3.0_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/glow_eval/not_silence_preprocessed/tuning/lm_3.5_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/glow_eval/not_silence_preprocessed/tuning/lm_2.5_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/glow_eval/not_silence_preprocessed/tuning/lm_4.0_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/glow_eval/not_silence_preprocessed/tuning/lm_2.5_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/glow_eval/not_silence_preprocessed/tuning/lm_4.0_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/glow_eval/not_silence_preprocessed/tuning/lm_3.5_ps_0/default_250/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/glow_eval/not_silence_preprocessed/tuning/lm_3.0_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/glow_eval/not_silence_preprocessed/tuning/lm_3.5_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/glow_eval/not_silence_preprocessed/tuning/lm_2.5_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/glow_eval/not_silence_preprocessed/tuning/lm_3.0_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/glow_eval/not_silence_preprocessed/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm1.5_ps0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm2.0_ps0.3/default_250/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm3.0_ps0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm2.5_ps0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm2.0_ps0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm4.0_ps0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm3.5_ps0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm3.0_ps0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm2.5_ps0/default_250/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm2.0_ps0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm1.5_ps0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm3.5_ps0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm2.5_ps0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm3.5_ps0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm4.0_ps0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm1.5_ps0/default_250/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm4.0_ps0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm3.0_ps0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_pretrained/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc768_100ep_xvector/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_freezed/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm2.5_ps0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm2.0_ps0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm1.5_ps0.3/default_250/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm2.0_ps0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm3.0_ps0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm3.0_ps0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm4.0_ps0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm3.5_ps0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm2.5_ps0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm3.5_ps0/default_250/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm2.0_ps0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm2.5_ps0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm1.5_ps0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm1.5_ps0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm3.0_ps0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm4.0_ps0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm3.5_ps0.3/default_250/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm4.0_ps0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_not_eval/not_silence_preprocessed/tuned/lm_2.5_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_not_eval/not_silence_preprocessed/tuned/lm_4.0_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_not_eval/not_silence_preprocessed/tuned/lm_2.5_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_not_eval/not_silence_preprocessed/tuned/lm_3.5_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_not_eval/not_silence_preprocessed/tuned/lm_3.0_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_not_eval/not_silence_preprocessed/tuned/lm_4.0_ps_0.5/default_250/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_not_eval/not_silence_preprocessed/tuned/lm_2.5_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_not_eval/not_silence_preprocessed/tuned/lm_3.0_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_not_eval/not_silence_preprocessed/tuned/lm_3.5_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_not_eval/not_silence_preprocessed/tuned/lm_3.5_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_not_eval/not_silence_preprocessed/tuned/lm_3.0_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_not_eval/not_silence_preprocessed/tuned/lm_4.0_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_not_eval/not_silence_preprocessed/default_250/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_eval/not_silence_preprocessed/tuned/lm_2.5_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_eval/not_silence_preprocessed/tuned/lm_4.0_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_eval/not_silence_preprocessed/tuned/lm_2.5_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_eval/not_silence_preprocessed/tuned/lm_4.0_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_eval/not_silence_preprocessed/tuned/lm_3.5_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_eval/not_silence_preprocessed/tuned/lm_3.0_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_eval/not_silence_preprocessed/tuned/lm_3.0_ps_0/default_250/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_eval/not_silence_preprocessed/tuned/lm_2.5_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_eval/not_silence_preprocessed/tuned/lm_4.0_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_eval/not_silence_preprocessed/tuned/lm_3.5_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_eval/not_silence_preprocessed/tuned/lm_3.0_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_eval/not_silence_preprocessed/tuned/lm_3.5_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_eval/not_silence_preprocessed/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_eval_spec_aug/not_silence_preprocessed/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment_before/glow_not_pretrained/default_250/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment_before/glow_not_pretrainedlm_tuning/lm3.0_ps0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment_before/glow_not_pretrainedlm_tuning/lm3.0_ps0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment_before/glow_not_pretrainedlm_tuning/lm2.0_ps0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment_before/glow_not_pretrainedlm_tuning/lm3.5_ps0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment_before/glow_not_pretrainedlm_tuning/lm4.0_ps0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment_before/glow_not_pretrainedlm_tuning/lm2.5_ps0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment_before/glow_not_pretrainedlm_tuning/lm2.0_ps0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment_before/glow_not_pretrainedlm_tuning/lm1.5_ps0.3/default_250/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment_before/glow_not_pretrainedlm_tuning/lm2.5_ps0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment_before/glow_not_pretrainedlm_tuning/lm3.0_ps0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment_before/glow_not_pretrainedlm_tuning/lm3.5_ps0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment_before/glow_not_pretrainedlm_tuning/lm4.0_ps0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment_before/glow_not_pretrainedlm_tuning/lm3.5_ps0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment_before/glow_not_pretrainedlm_tuning/lm2.5_ps0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment_before/glow_not_pretrainedlm_tuning/lm1.5_ps0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment_before/glow_not_pretrainedlm_tuning/lm1.5_ps0.5/default_250/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment_before/glow_not_pretrainedlm_tuning/lm4.0_ps0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment_before/glow_not_pretrainedlm_tuning/lm2.0_ps0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/lm_tuning/lm1.5_ps0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/lm_tuning/lm2.0_ps0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/lm_tuning/lm2.5_ps0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/lm_tuning/lm2.5_ps0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/lm_tuning/lm3.5_ps0/default_250/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/lm_tuning/lm4.0_ps0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/lm_tuning/lm3.5_ps0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/lm_tuning/lm4.0_ps0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/lm_tuning/lm3.0_ps0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/lm_tuning/lm1.5_ps0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/lm_tuning/lm2.0_ps0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/lm_tuning/lm1.5_ps0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/lm_tuning/lm3.0_ps0/default_250/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/lm_tuning/lm2.5_ps0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/lm_tuning/lm2.0_ps0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/lm_tuning/lm3.5_ps0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/lm_tuning/lm4.0_ps0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/lm_tuning/lm3.0_ps0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/search_params/lm_3.0_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/search_params/lm_1.5_ps_0/default_250/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/search_params/lm_3.5_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/search_params/lm_4.0_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/search_params/lm_1.5_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/search_params/lm_3.5_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/search_params/lm_3.0_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/search_params/lm_2.5_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/search_params/lm_2.0_ps_0.3/default_250/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/search_params/lm_3.0_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/search_params/lm_3.5_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/search_params/lm_2.0_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/search_params/lm_4.0_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/search_params/lm_1.5_ps_0.5/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/search_params/lm_2.5_ps_0.3/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/search_params/lm_2.0_ps_0.5/default_250/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/search_params/lm_2.5_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/search_params/lm_4.0_ps_0/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_4x512_d0.2_b300_fs4/asr_dataset/glow_enc768/not_silence_preprocessed/default_250/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_ctc_scale_0.1/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_ctc_scale_0.1/tuning/lm_4.0/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_ctc_scale_0.1/tuning/lm_3.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_ctc_scale_0.1/tuning/lm_2.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_ctc_scale_0.1/tuning/lm_3.0/search/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_ctc_scale_0.1/tuning/lm_2.0/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_ctc_scale_0.1/tuning/lm_4.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before/tuning/lm_2.0/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before/tuning/lm_2.5_ps_0.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before/tuning/lm_3.0/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before/tuning/lm_2.0_ps_0.3/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before/tuning/lm_4.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before/tuning/lm_4.0_ps_0.3/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before/tuning/lm_4.5_ps_0.5/search/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before/tuning/lm_3.0_ps_0.3/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before/tuning/lm_3.5_ps_0.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before/tuning/lm_4.0/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before/tuning/lm_2.5_ps_0.3/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before/tuning/lm_2.0_ps_0.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before/tuning/lm_4.0_ps_0.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before/tuning/lm_2.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before/tuning/lm_4.5_ps_0.3/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before/tuning/lm_3.0_ps_0.5/search/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before/tuning/lm_3.5_ps_0.3/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before/tuning/lm_3.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_tts_train_segments/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_tts_train_segments/tuning/lm_4.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_tts_train_segments/tuning/lm_3.0/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_tts_train_segments/tuning/lm_2.0/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_tts_train_segments/tuning/lm_3.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_tts_train_segments/tuning/lm_2.5/search/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_tts_train_segments/tuning/lm_4.0/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_tts_train_segments/tuned/lm_2.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass/tuning/lm_4.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass/tuning/lm_2.0/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass/tuning/lm_3.0/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass/tuning/lm_2.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass/tuning/lm_3.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass/tuning/lm_4.0/search/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2_ctc_scale_0.1/tuning/lm_4.0_ps_0.3/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2_ctc_scale_0.1/tuning/lm_3.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2_ctc_scale_0.1/tuning/lm_4.5_ps_0.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2_ctc_scale_0.1/tuning/lm_3.0_ps_0.3/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2_ctc_scale_0.1/tuning/lm_3.5_ps_0.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2_ctc_scale_0.1/tuning/lm_2.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2_ctc_scale_0.1/tuning/lm_2.5_ps_0.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2_ctc_scale_0.1/tuning/lm_2.0_ps_0.3/search/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2_ctc_scale_0.1/tuning/lm_4.0/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2_ctc_scale_0.1/tuning/lm_4.0_ps_0.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2_ctc_scale_0.1/tuning/lm_4.5_ps_0.3/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2_ctc_scale_0.1/tuning/lm_3.0_ps_0.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2_ctc_scale_0.1/tuning/lm_4.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2_ctc_scale_0.1/tuning/lm_3.5_ps_0.3/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2_ctc_scale_0.1/tuning/lm_3.0/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2_ctc_scale_0.1/tuning/lm_2.5_ps_0.3/search/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2_ctc_scale_0.1/tuning/lm_2.0_ps_0.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2_ctc_scale_0.1/tuning/lm_2.0/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2_ctc_scale_0.1/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_blstm_x_vector/tuning/lm_4.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_blstm_x_vector/tuning/lm_3.0/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_blstm_x_vector/tuning/lm_2.0/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_blstm_x_vector/tuning/lm_3.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_blstm_x_vector/tuning/lm_2.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_blstm_x_vector/tuning/lm_4.0/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_blstm_x_vector/search/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_radam/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_spec_augment_ctc_scale_0.1/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_spec_augment_ctc_scale_0.1/tuning/lm_4.0/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_spec_augment_ctc_scale_0.1/tuning/lm_2.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_spec_augment_ctc_scale_0.1/tuning/lm_3.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_spec_augment_ctc_scale_0.1/tuning/lm_2.0/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_spec_augment_ctc_scale_0.1/tuning/lm_3.0/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_spec_augment_ctc_scale_0.1/tuning/lm_4.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_control_spec_augment/search/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment_ctc_scale_0.1/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment_ctc_scale_0.1/tuning/lm_2.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment_ctc_scale_0.1/tuning/lm_2.0_ps_0.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment_ctc_scale_0.1/tuning/lm_3.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment_ctc_scale_0.1/tuning/lm_2.5_ps_0.3/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment_ctc_scale_0.1/tuning/lm_4.5_ps_0.3/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment_ctc_scale_0.1/tuning/lm_4.0/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment_ctc_scale_0.1/tuning/lm_4.0_ps_0.5/search/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment_ctc_scale_0.1/tuning/lm_3.5_ps_0.3/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment_ctc_scale_0.1/tuning/lm_3.0_ps_0.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment_ctc_scale_0.1/tuning/lm_4.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment_ctc_scale_0.1/tuning/lm_2.0_ps_0.3/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment_ctc_scale_0.1/tuning/lm_2.5_ps_0.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment_ctc_scale_0.1/tuning/lm_4.5_ps_0.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment_ctc_scale_0.1/tuning/lm_4.0_ps_0.3/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment_ctc_scale_0.1/tuning/lm_2.0/search/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment_ctc_scale_0.1/tuning/lm_3.5_ps_0.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment_ctc_scale_0.1/tuning/lm_3.0/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment_ctc_scale_0.1/tuning/lm_3.0_ps_0.3/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass/tuning/lm_3.5_ps_0.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass/tuning/lm_3.0_ps_0.3/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass/tuning/lm_4.5_ps_0.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass/tuning/lm_3.5_ps_0/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass/tuning/lm_4.0_ps_0.3/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass/tuning/lm_2.0_ps_0.3/search/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass/tuning/lm_3.0_ps_0/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass/tuning/lm_2.5_ps_0.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass/tuning/lm_3.5_ps_0.3/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass/tuning/lm_4.5_ps_0/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass/tuning/lm_2.0_ps_0/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass/tuning/lm_3.0_ps_0.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass/tuning/lm_4.5_ps_0.3/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass/tuning/lm_4.0_ps_0.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass/tuning/lm_2.0_ps_0.5/search/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass/tuning/lm_2.5_ps_0.3/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass/tuning/lm_2.5_ps_0/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass/tuning/lm_4.0_ps_0/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_control/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2/tuning/lm_4.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2/tuning/lm_3.0_ps_0.3/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2/tuning/lm_3.5_ps_0.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2/tuning/lm_4.0_ps_0.3/search/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2/tuning/lm_4.5_ps_0.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2/tuning/lm_2.5_ps_0.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2/tuning/lm_2.0/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2/tuning/lm_2.0_ps_0.3/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2/tuning/lm_3.0/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2/tuning/lm_3.0_ps_0.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2/tuning/lm_2.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2/tuning/lm_3.5_ps_0.3/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2/tuning/lm_4.0_ps_0.5/search/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2/tuning/lm_4.5_ps_0.3/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2/tuning/lm_3.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2/tuning/lm_4.0/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2/tuning/lm_2.5_ps_0.3/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2/tuning/lm_2.0_ps_0.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer_spec_augment/tuning/lm_4.0_ps_0.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer_spec_augment/tuning/lm_4.5_ps_0.3/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer_spec_augment/tuning/lm_4.5/search/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer_spec_augment/tuning/lm_3.0_ps_0.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer_spec_augment/tuning/lm_3.5_ps_0.3/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer_spec_augment/tuning/lm_3.0/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer_spec_augment/tuning/lm_2.5_ps_0.3/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer_spec_augment/tuning/lm_2.0/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer_spec_augment/tuning/lm_2.0_ps_0.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer_spec_augment/tuning/lm_4.0_ps_0.3/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer_spec_augment/tuning/lm_4.5_ps_0.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer_spec_augment/tuning/lm_3.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer_spec_augment/tuning/lm_3.0_ps_0.3/search/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer_spec_augment/tuning/lm_2.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer_spec_augment/tuning/lm_3.5_ps_0.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer_spec_augment/tuning/lm_4.0/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer_spec_augment/tuning/lm_2.5_ps_0.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer_spec_augment/tuning/lm_2.0_ps_0.3/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer_spec_augment/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector_v2/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector_v2/tuning/lm_4.5_ps_0.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector_v2/tuning/lm_4.0_ps_0.3/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector_v2/tuning/lm_3.5_ps_0.5/search/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector_v2/tuning/lm_4.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector_v2/tuning/lm_3.0_ps_0.3/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector_v2/tuning/lm_3.0/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector_v2/tuning/lm_2.0_ps_0.3/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector_v2/tuning/lm_2.0/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector_v2/tuning/lm_2.5_ps_0.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector_v2/tuning/lm_4.5_ps_0.3/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector_v2/tuning/lm_3.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector_v2/tuning/lm_4.0_ps_0.5/search/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector_v2/tuning/lm_3.5_ps_0.3/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector_v2/tuning/lm_2.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector_v2/tuning/lm_3.0_ps_0.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector_v2/tuning/lm_2.0_ps_0.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector_v2/tuning/lm_2.5_ps_0.3/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector_v2/tuning/lm_4.0/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_ctc_scale_0.1/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_ctc_scale_0.1/tuning/lm_4.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_ctc_scale_0.1/tuning/lm_3.0/search/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_ctc_scale_0.1/tuning/lm_2.0/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_ctc_scale_0.1/tuning/lm_3.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_ctc_scale_0.1/tuning/lm_2.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_ctc_scale_0.1/tuning/lm_4.0/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2/tuning/lm_4.0/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2/tuning/lm_2.0_ps_0.3/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2/tuning/lm_2.5_ps_0.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2/tuning/lm_4.5_ps_0.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2/tuning/lm_4.0_ps_0.3/search/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2/tuning/lm_3.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2/tuning/lm_2.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2/tuning/lm_3.5_ps_0.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2/tuning/lm_3.0_ps_0.3/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2/tuning/lm_3.0/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2/tuning/lm_2.0_ps_0.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2/tuning/lm_2.0/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2/tuning/lm_2.5_ps_0.3/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2/tuning/lm_4.5_ps_0.3/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2/tuning/lm_4.0_ps_0.5/search/dev-other/sclite/wer',\n", + 
" '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2/tuning/lm_4.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2/tuning/lm_3.5_ps_0.3/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2/tuning/lm_3.0_ps_0.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_blstm_x_vector_specaug/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_trainXvector/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_no_jit/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_no_jit/tuning/lm_3.0/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_no_jit/tuning/lm_2.0/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_no_jit/tuning/lm_2.5/search/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_no_jit/tuning/lm_1.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector/tuning/lm_3.0/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector/tuning/lm_2.0/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector/tuning/lm_4.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector/tuning/lm_4.0/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector/tuning/lm_3.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector/tuning/lm_2.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_ctc_scale_0.1/search/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_ctc_scale_0.1/tuning/lm_2.5_ps_0.3/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_ctc_scale_0.1/tuning/lm_2.0_ps_0.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_ctc_scale_0.1/tuning/lm_4.5_ps_0/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_ctc_scale_0.1/tuning/lm_2.0_ps_0/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_ctc_scale_0.1/tuning/lm_3.0_ps_0.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_ctc_scale_0.1/tuning/lm_2.5_ps_0/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_ctc_scale_0.1/tuning/lm_3.5_ps_0.3/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_ctc_scale_0.1/tuning/lm_4.0_ps_0/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_ctc_scale_0.1/tuning/lm_4.0_ps_0.5/search/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_ctc_scale_0.1/tuning/lm_4.5_ps_0.3/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_ctc_scale_0.1/tuning/lm_2.5_ps_0.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_ctc_scale_0.1/tuning/lm_3.5_ps_0/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_ctc_scale_0.1/tuning/lm_2.0_ps_0.3/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_ctc_scale_0.1/tuning/lm_3.0_ps_0.3/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_ctc_scale_0.1/tuning/lm_3.5_ps_0.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_ctc_scale_0.1/tuning/lm_3.0_ps_0/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_ctc_scale_0.1/tuning/lm_4.0_ps_0.3/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_ctc_scale_0.1/tuning/lm_4.5_ps_0.5/search/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_ctc_scale_0.1/tuning/lm_2.0_ps_0.3/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_ctc_scale_0.1/tuning/lm_2.5_ps_0.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_ctc_scale_0.1/tuning/lm_4.0/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_ctc_scale_0.1/tuning/lm_3.5_ps_0.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_ctc_scale_0.1/tuning/lm_3.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_ctc_scale_0.1/tuning/lm_3.0_ps_0.3/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_ctc_scale_0.1/tuning/lm_4.5_ps_0.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_ctc_scale_0.1/tuning/lm_4.0_ps_0.3/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_ctc_scale_0.1/tuning/lm_2.5/search/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_ctc_scale_0.1/tuning/lm_3.0/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_ctc_scale_0.1/tuning/lm_2.0_ps_0.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_ctc_scale_0.1/tuning/lm_2.5_ps_0.3/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_ctc_scale_0.1/tuning/lm_2.0/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_ctc_scale_0.1/tuning/lm_3.5_ps_0.3/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_ctc_scale_0.1/tuning/lm_3.0_ps_0.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_ctc_scale_0.1/tuning/lm_4.5_ps_0.3/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_ctc_scale_0.1/tuning/lm_4.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_ctc_scale_0.1/tuning/lm_4.0_ps_0.5/search/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_ctc_scale_0.1/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector/tuning/lm_2.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector/tuning/lm_3.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector/tuning/lm_4.0/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector/tuning/lm_4.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector/tuning/lm_2.0/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector/tuning/lm_3.0/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_xvector/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment/tuning/lm_2.0/search/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment/tuning/lm_2.0_ps_0.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment/tuning/lm_2.5_ps_0.3/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment/tuning/lm_3.0/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment/tuning/lm_4.5_ps_0.3/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment/tuning/lm_4.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment/tuning/lm_4.0_ps_0.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment/tuning/lm_3.5_ps_0.3/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment/tuning/lm_3.0_ps_0.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment/tuning/lm_4.0/search/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment/tuning/lm_2.0_ps_0.3/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment/tuning/lm_2.5_ps_0.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment/tuning/lm_4.5_ps_0.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment/tuning/lm_2.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment/tuning/lm_4.0_ps_0.3/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment/tuning/lm_3.5_ps_0.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment/tuning/lm_3.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment/tuning/lm_3.0_ps_0.3/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment/search/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer_spec_augment_tts_train_segments/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer_spec_augment_tts_train_segments/tuning/lm_4.0/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer_spec_augment_tts_train_segments/tuning/lm_3.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer_spec_augment_tts_train_segments/tuning/lm_2.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer_spec_augment_tts_train_segments/tuning/lm_3.0/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer_spec_augment_tts_train_segments/tuning/lm_2.0/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer_spec_augment_tts_train_segments/tuning/lm_4.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_xvectorgrad_clip_10/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_xvectorgrad_clip_10/tuning/lm_2.5/search/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_xvectorgrad_clip_10/tuning/lm_2.0_ps_0.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_xvectorgrad_clip_10/tuning/lm_2.5_ps_0.3/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_xvectorgrad_clip_10/tuning/lm_3.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_xvectorgrad_clip_10/tuning/lm_4.0/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_xvectorgrad_clip_10/tuning/lm_4.5_ps_0.3/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_xvectorgrad_clip_10/tuning/lm_4.0_ps_0.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_xvectorgrad_clip_10/tuning/lm_3.5_ps_0.3/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_xvectorgrad_clip_10/tuning/lm_3.0_ps_0.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_xvectorgrad_clip_10/tuning/lm_4.5/search/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_xvectorgrad_clip_10/tuning/lm_2.0_ps_0.3/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_xvectorgrad_clip_10/tuning/lm_2.5_ps_0.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_xvectorgrad_clip_10/tuning/lm_2.0/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_xvectorgrad_clip_10/tuning/lm_4.5_ps_0.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_xvectorgrad_clip_10/tuning/lm_4.0_ps_0.3/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_xvectorgrad_clip_10/tuning/lm_3.5_ps_0.5/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_xvectorgrad_clip_10/tuning/lm_3.0_ps_0.3/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_xvectorgrad_clip_10/tuning/lm_3.0/search/dev-other/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector_v3/tuning/lm_4.0/search/dev-other/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector_v3/tuning/lm_2.5/search/dev-other/sclite/wer',\n", + " ...]" ] }, "execution_count": 2, @@ -936,6 +1108,1156 @@ "cell_type": "code", "execution_count": 5, "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "| | WER (dev-other) |\n", + "||------------------:|\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment_before/glow_enc192_not_pretrained/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment_before/glow_enc192_not_pretrained/default_250/dev-other/sclite/wer') | 19 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment_before/glow_enc192_not_pretrained/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment_before/glow_enc192_not_pretrained/lm_tuning/lm1.5_ps0.5/default_250/dev-other/sclite/wer') | 20 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment_before/glow_enc192_not_pretrained/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment_before/glow_enc192_not_pretrained/lm_tuning/lm2.0_ps0.5/default_250/dev-other/sclite/wer') | 18.9 |\n", + "| ('librispeech_glow_asr', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment_before/glow_enc192_not_pretrained/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment_before/glow_enc192_not_pretrained/lm_tuning/lm2.5_ps0.3/default_250/dev-other/sclite/wer') | 18.2 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment_before/glow_enc192_not_pretrained/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment_before/glow_enc192_not_pretrained/lm_tuning/lm2.0_ps0/default_250/dev-other/sclite/wer') | 18.9 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment_before/glow_enc192_not_pretrained/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment_before/glow_enc192_not_pretrained/lm_tuning/lm4.0_ps0.3/default_250/dev-other/sclite/wer') | 17.8 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment_before/glow_enc192_not_pretrained/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment_before/glow_enc192_not_pretrained/lm_tuning/lm3.5_ps0.3/default_250/dev-other/sclite/wer') | 17.6 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment_before/glow_enc192_not_pretrained/', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment_before/glow_enc192_not_pretrained/lm_tuning/lm3.0_ps0/default_250/dev-other/sclite/wer') | 18 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment_before/glow_enc192_not_pretrained/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment_before/glow_enc192_not_pretrained/lm_tuning/lm3.0_ps0.5/default_250/dev-other/sclite/wer') | 17.8 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment_before/glow_enc192_not_pretrained/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment_before/glow_enc192_not_pretrained/lm_tuning/lm2.0_ps0.3/default_250/dev-other/sclite/wer') | 18.9 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment_before/glow_enc192_not_pretrained/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment_before/glow_enc192_not_pretrained/lm_tuning/lm1.5_ps0.3/default_250/dev-other/sclite/wer') | 19.9 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment_before/glow_enc192_not_pretrained/', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment_before/glow_enc192_not_pretrained/lm_tuning/lm4.0_ps0/default_250/dev-other/sclite/wer') | 18.1 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment_before/glow_enc192_not_pretrained/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment_before/glow_enc192_not_pretrained/lm_tuning/lm1.5_ps0/default_250/dev-other/sclite/wer') | 19.9 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment_before/glow_enc192_not_pretrained/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment_before/glow_enc192_not_pretrained/lm_tuning/lm2.5_ps0.5/default_250/dev-other/sclite/wer') | 18.2 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment_before/glow_enc192_not_pretrained/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment_before/glow_enc192_not_pretrained/lm_tuning/lm3.5_ps0.5/default_250/dev-other/sclite/wer') | 17.6 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment_before/glow_enc192_not_pretrained/', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment_before/glow_enc192_not_pretrained/lm_tuning/lm4.0_ps0.5/default_250/dev-other/sclite/wer') | 17.8 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment_before/glow_enc192_not_pretrained/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment_before/glow_enc192_not_pretrained/lm_tuning/lm3.5_ps0/default_250/dev-other/sclite/wer') | 17.9 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment_before/glow_enc192_not_pretrained/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment_before/glow_enc192_not_pretrained/lm_tuning/lm3.0_ps0.3/default_250/dev-other/sclite/wer') | 17.7 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment_before/glow_enc192_not_pretrained/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment_before/glow_enc192_not_pretrained/lm_tuning/lm2.5_ps0/default_250/dev-other/sclite/wer') | 18.3 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/default_250/dev-other/sclite/wer') | 36.2 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/search_params/lm_4.0_ps_0.3/default_250/dev-other/sclite/wer') | 36 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/search_params/lm_3.0_ps_0/default_250/dev-other/sclite/wer') | 36.8 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/search_params/lm_1.5_ps_0.3/default_250/dev-other/sclite/wer') | 40.4 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/search_params/lm_3.0_ps_0.3/default_250/dev-other/sclite/wer') | 36.9 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/search_params/lm_3.5_ps_0.5/default_250/dev-other/sclite/wer') | 36.3 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/search_params/lm_1.5_ps_0/default_250/dev-other/sclite/wer') | 40.2 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/search_params/lm_2.5_ps_0.5/default_250/dev-other/sclite/wer') | 38 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/search_params/lm_3.5_ps_0/default_250/dev-other/sclite/wer') | 36.3 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/search_params/lm_2.0_ps_0.3/default_250/dev-other/sclite/wer') | 39 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/search_params/lm_4.0_ps_0.5/default_250/dev-other/sclite/wer') | 36 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/search_params/lm_1.5_ps_0.5/default_250/dev-other/sclite/wer') | 40.7 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/search_params/lm_4.0_ps_0/default_250/dev-other/sclite/wer') | 36.1 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/search_params/lm_3.0_ps_0.5/default_250/dev-other/sclite/wer') | 36.9 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/search_params/lm_3.5_ps_0.3/default_250/dev-other/sclite/wer') | 36.3 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/search_params/lm_2.5_ps_0/default_250/dev-other/sclite/wer') | 37.7 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/search_params/lm_2.0_ps_0/default_250/dev-other/sclite/wer') | 38.8 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/search_params/lm_2.5_ps_0.3/default_250/dev-other/sclite/wer') | 37.8 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/search_params/lm_2.0_ps_0.5/default_250/dev-other/sclite/wer') | 39.2 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc768_200ep_dec_0.05_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc768_200ep_dec_0.05_v2/default_250/dev-other/sclite/wer') | 49.3 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc768_200ep_dec_0.05_v2/', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc768_200ep_dec_0.05_v2/search_params/lm_2.0_ps_0.5/default_250/dev-other/sclite/wer') | 52.1 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc768_200ep_dec_0.05_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc768_200ep_dec_0.05_v2/search_params/lm_2.5_ps_0.3/default_250/dev-other/sclite/wer') | 50.7 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc768_200ep_dec_0.05_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc768_200ep_dec_0.05_v2/search_params/lm_3.0_ps_0/default_250/dev-other/sclite/wer') | 49.8 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc768_200ep_dec_0.05_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc768_200ep_dec_0.05_v2/search_params/lm_3.5_ps_0.3/default_250/dev-other/sclite/wer') | 49.4 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc768_200ep_dec_0.05_v2/', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc768_200ep_dec_0.05_v2/search_params/lm_3.0_ps_0.5/default_250/dev-other/sclite/wer') | 50 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc768_200ep_dec_0.05_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc768_200ep_dec_0.05_v2/search_params/lm_3.5_ps_0/default_250/dev-other/sclite/wer') | 49.2 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc768_200ep_dec_0.05_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc768_200ep_dec_0.05_v2/search_params/lm_1.5_ps_0.5/default_250/dev-other/sclite/wer') | 53.7 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc768_200ep_dec_0.05_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc768_200ep_dec_0.05_v2/search_params/lm_4.0_ps_0.5/default_250/dev-other/sclite/wer') | 49.1 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc768_200ep_dec_0.05_v2/', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc768_200ep_dec_0.05_v2/search_params/lm_1.5_ps_0/default_250/dev-other/sclite/wer') | 53.2 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc768_200ep_dec_0.05_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc768_200ep_dec_0.05_v2/search_params/lm_2.5_ps_0/default_250/dev-other/sclite/wer') | 50.5 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc768_200ep_dec_0.05_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc768_200ep_dec_0.05_v2/search_params/lm_2.0_ps_0.3/default_250/dev-other/sclite/wer') | 51.9 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc768_200ep_dec_0.05_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc768_200ep_dec_0.05_v2/search_params/lm_2.5_ps_0.5/default_250/dev-other/sclite/wer') | 50.9 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc768_200ep_dec_0.05_v2/', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc768_200ep_dec_0.05_v2/search_params/lm_4.0_ps_0/default_250/dev-other/sclite/wer') | 49.1 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc768_200ep_dec_0.05_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc768_200ep_dec_0.05_v2/search_params/lm_3.5_ps_0.5/default_250/dev-other/sclite/wer') | 49.5 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc768_200ep_dec_0.05_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc768_200ep_dec_0.05_v2/search_params/lm_3.0_ps_0.3/default_250/dev-other/sclite/wer') | 49.9 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc768_200ep_dec_0.05_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc768_200ep_dec_0.05_v2/search_params/lm_1.5_ps_0.3/default_250/dev-other/sclite/wer') | 53.4 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc768_200ep_dec_0.05_v2/', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc768_200ep_dec_0.05_v2/search_params/lm_2.0_ps_0/default_250/dev-other/sclite/wer') | 51.7 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc768_200ep_dec_0.05_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc768_200ep_dec_0.05_v2/search_params/lm_4.0_ps_0.3/default_250/dev-other/sclite/wer') | 49.1 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_not_pretrained/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_not_pretrained/lm_tuning/lm2.5_ps0/default_250/dev-other/sclite/wer') | 30 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_not_pretrained/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_not_pretrained/lm_tuning/lm1.5_ps0.5/default_250/dev-other/sclite/wer') | 32.4 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_not_pretrained/', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_not_pretrained/lm_tuning/lm2.0_ps0.5/default_250/dev-other/sclite/wer') | 31.1 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_not_pretrained/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_not_pretrained/lm_tuning/lm3.5_ps0/default_250/dev-other/sclite/wer') | 28.9 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_not_pretrained/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_not_pretrained/lm_tuning/lm2.5_ps0.3/default_250/dev-other/sclite/wer') | 30 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_not_pretrained/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_not_pretrained/lm_tuning/lm4.0_ps0.3/default_250/dev-other/sclite/wer') | 28.7 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_not_pretrained/', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_not_pretrained/lm_tuning/lm3.5_ps0.3/default_250/dev-other/sclite/wer') | 28.9 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_not_pretrained/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_not_pretrained/lm_tuning/lm1.5_ps0/default_250/dev-other/sclite/wer') | 32.1 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_not_pretrained/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_not_pretrained/lm_tuning/lm3.0_ps0.5/default_250/dev-other/sclite/wer') | 29.4 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_not_pretrained/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_not_pretrained/lm_tuning/lm4.0_ps0/default_250/dev-other/sclite/wer') | 28.7 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_not_pretrained/', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_not_pretrained/lm_tuning/lm3.0_ps0/default_250/dev-other/sclite/wer') | 29.3 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_not_pretrained/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_not_pretrained/lm_tuning/lm2.0_ps0.3/default_250/dev-other/sclite/wer') | 31.1 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_not_pretrained/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_not_pretrained/lm_tuning/lm1.5_ps0.3/default_250/dev-other/sclite/wer') | 32.2 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_not_pretrained/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_not_pretrained/lm_tuning/lm2.0_ps0/default_250/dev-other/sclite/wer') | 30.9 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_not_pretrained/', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_not_pretrained/lm_tuning/lm2.5_ps0.5/default_250/dev-other/sclite/wer') | 30.1 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_not_pretrained/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_not_pretrained/lm_tuning/lm3.5_ps0.5/default_250/dev-other/sclite/wer') | 29 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_not_pretrained/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_not_pretrained/lm_tuning/lm4.0_ps0.5/default_250/dev-other/sclite/wer') | 28.7 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_not_pretrained/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_not_pretrained/lm_tuning/lm3.0_ps0.3/default_250/dev-other/sclite/wer') | 29.3 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_not_pretrained/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_not_pretrained/default_250/dev-other/sclite/wer') | 28.9 |\n", + 
"| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/default_250/dev-other/sclite/wer') | 28.1 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm3.0_ps0.5/default_250/dev-other/sclite/wer') | 29.1 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm4.0_ps0/default_250/dev-other/sclite/wer') | 28.2 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm3.5_ps0.3/default_250/dev-other/sclite/wer') | 28.5 |\n", + "| 
('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm4.0_ps0.3/default_250/dev-other/sclite/wer') | 28.2 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm1.5_ps0/default_250/dev-other/sclite/wer') | 31.6 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm2.5_ps0.3/default_250/dev-other/sclite/wer') | 29.6 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm2.5_ps0/default_250/dev-other/sclite/wer') | 29.5 |\n", + "| 
('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm2.0_ps0.5/default_250/dev-other/sclite/wer') | 30.6 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm1.5_ps0.5/default_250/dev-other/sclite/wer') | 31.9 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm3.5_ps0/default_250/dev-other/sclite/wer') | 28.4 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm3.0_ps0.3/default_250/dev-other/sclite/wer') | 28.9 |\n", + 
"| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm4.0_ps0.5/default_250/dev-other/sclite/wer') | 28.3 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm3.5_ps0.5/default_250/dev-other/sclite/wer') | 28.6 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm3.0_ps0/default_250/dev-other/sclite/wer') | 28.9 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm2.5_ps0.5/default_250/dev-other/sclite/wer') | 29.7 |\n", 
+ "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm2.0_ps0/default_250/dev-other/sclite/wer') | 30.4 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm1.5_ps0.3/default_250/dev-other/sclite/wer') | 31.8 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm2.0_ps0.3/default_250/dev-other/sclite/wer') | 30.5 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/no_glow/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/no_glow/search_params/lm_4.0_ps_0.5/default_250/dev-other/sclite/wer') | 27.1 |\n", + "| ('librispeech_glow_asr', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/no_glow/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/no_glow/search_params/lm_3.5_ps_0.3/default_250/dev-other/sclite/wer') | 27.3 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/no_glow/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/no_glow/search_params/lm_3.0_ps_0.5/default_250/dev-other/sclite/wer') | 27.8 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/no_glow/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/no_glow/search_params/lm_3.0_ps_0/default_250/dev-other/sclite/wer') | 27.7 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/no_glow/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/no_glow/search_params/lm_3.5_ps_0/default_250/dev-other/sclite/wer') | 27.4 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/no_glow/', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/no_glow/search_params/lm_2.5_ps_0.3/default_250/dev-other/sclite/wer') | 28.5 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/no_glow/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/no_glow/search_params/lm_4.0_ps_0/default_250/dev-other/sclite/wer') | 27.1 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/no_glow/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/no_glow/search_params/lm_4.0_ps_0.3/default_250/dev-other/sclite/wer') | 27.1 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/no_glow/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/no_glow/search_params/lm_2.5_ps_0/default_250/dev-other/sclite/wer') | 28.4 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/no_glow/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/no_glow/search_params/lm_3.5_ps_0.5/default_250/dev-other/sclite/wer') | 27.4 |\n", + "| ('librispeech_glow_asr', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/no_glow/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/no_glow/search_params/lm_3.0_ps_0.3/default_250/dev-other/sclite/wer') | 27.8 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/no_glow/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/no_glow/search_params/lm_2.5_ps_0.5/default_250/dev-other/sclite/wer') | 28.5 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/no_glow/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/no_glow/default_250/dev-other/sclite/wer') | 27.3 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/weak_baseline/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/weak_baseline/default_250/dev-other/sclite/wer') | 37.4 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/weak_baseline/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/weak_baseline/lm_tuning/lm3.0/default_250/dev-other/sclite/wer') | 26.2 |\n", + "| ('librispeech_glow_asr', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/weak_baseline/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/weak_baseline/lm_tuning/lm2.0/default_250/dev-other/sclite/wer') | 25.7 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/weak_baseline/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/weak_baseline/lm_tuning/lm3.5/default_250/dev-other/sclite/wer') | 28 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/weak_baseline/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/weak_baseline/lm_tuning/lm2.5/default_250/dev-other/sclite/wer') | 25.5 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/weak_baseline/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/weak_baseline/lm_tuning/lm1.5/default_250/dev-other/sclite/wer') | 27.3 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/weak_baseline/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/weak_baseline/lm_tuning/lm4.0/default_250/dev-other/sclite/wer') | 30.3 |\n", + "| ('librispeech_glow_asr', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_200ep_dec_0.05_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_200ep_dec_0.05_v2/default_250/dev-other/sclite/wer') | 42.1 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_200ep_dec_0.05_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_200ep_dec_0.05_v2/search_params/lm_3.5_ps_0.5/default_250/dev-other/sclite/wer') | 37.7 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_200ep_dec_0.05_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_200ep_dec_0.05_v2/search_params/lm_3.0_ps_0.3/default_250/dev-other/sclite/wer') | 37.7 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_200ep_dec_0.05_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_200ep_dec_0.05_v2/search_params/lm_1.5_ps_0.3/default_250/dev-other/sclite/wer') | 39.9 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_200ep_dec_0.05_v2/', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_200ep_dec_0.05_v2/search_params/lm_4.0_ps_0.3/default_250/dev-other/sclite/wer') | 38.3 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_200ep_dec_0.05_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_200ep_dec_0.05_v2/search_params/lm_2.0_ps_0/default_250/dev-other/sclite/wer') | 38.7 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_200ep_dec_0.05_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_200ep_dec_0.05_v2/search_params/lm_2.0_ps_0.3/default_250/dev-other/sclite/wer') | 38.7 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_200ep_dec_0.05_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_200ep_dec_0.05_v2/search_params/lm_2.5_ps_0/default_250/dev-other/sclite/wer') | 38.2 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_200ep_dec_0.05_v2/', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_200ep_dec_0.05_v2/search_params/lm_4.0_ps_0/default_250/dev-other/sclite/wer') | 39.4 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_200ep_dec_0.05_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_200ep_dec_0.05_v2/search_params/lm_2.5_ps_0.5/default_250/dev-other/sclite/wer') | 37.9 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_200ep_dec_0.05_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_200ep_dec_0.05_v2/search_params/lm_3.5_ps_0.3/default_250/dev-other/sclite/wer') | 37.9 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_200ep_dec_0.05_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_200ep_dec_0.05_v2/search_params/lm_3.5_ps_0/default_250/dev-other/sclite/wer') | 38.5 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_200ep_dec_0.05_v2/', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_200ep_dec_0.05_v2/search_params/lm_3.0_ps_0.5/default_250/dev-other/sclite/wer') | 37.5 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_200ep_dec_0.05_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_200ep_dec_0.05_v2/search_params/lm_1.5_ps_0.5/default_250/dev-other/sclite/wer') | 40 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_200ep_dec_0.05_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_200ep_dec_0.05_v2/search_params/lm_1.5_ps_0/default_250/dev-other/sclite/wer') | 39.9 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_200ep_dec_0.05_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_200ep_dec_0.05_v2/search_params/lm_4.0_ps_0.5/default_250/dev-other/sclite/wer') | 38.1 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_200ep_dec_0.05_v2/', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_200ep_dec_0.05_v2/search_params/lm_2.0_ps_0.5/default_250/dev-other/sclite/wer') | 38.7 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_200ep_dec_0.05_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_200ep_dec_0.05_v2/search_params/lm_2.5_ps_0.3/default_250/dev-other/sclite/wer') | 37.9 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_200ep_dec_0.05_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_200ep_dec_0.05_v2/search_params/lm_3.0_ps_0/default_250/dev-other/sclite/wer') | 38.2 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/default_250/dev-other/sclite/wer') | 26.3 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/search_params/lm_3.5_ps_0/default_250/dev-other/sclite/wer') | 22.6 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/search_params/lm_4.0_ps_0.5/default_250/dev-other/sclite/wer') | 22.2 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/search_params/lm_1.5_ps_0.5/default_250/dev-other/sclite/wer') | 23.3 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/search_params/lm_3.0_ps_0.5/default_250/dev-other/sclite/wer') | 21.4 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/search_params/lm_1.5_ps_0/default_250/dev-other/sclite/wer') | 23.2 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/search_params/lm_3.5_ps_0.3/default_250/dev-other/sclite/wer') | 21.9 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/search_params/lm_2.5_ps_0.3/default_250/dev-other/sclite/wer') | 21.7 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/search_params/lm_3.0_ps_0/default_250/dev-other/sclite/wer') | 22.1 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/search_params/lm_2.0_ps_0.5/default_250/dev-other/sclite/wer') | 22.2 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/search_params/lm_4.0_ps_0.3/default_250/dev-other/sclite/wer') | 22.6 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/search_params/lm_1.5_ps_0.3/default_250/dev-other/sclite/wer') | 23.2 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/search_params/lm_3.0_ps_0.3/default_250/dev-other/sclite/wer') | 21.6 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/search_params/lm_2.0_ps_0/default_250/dev-other/sclite/wer') | 22.4 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/search_params/lm_3.5_ps_0.5/default_250/dev-other/sclite/wer') | 21.7 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/search_params/lm_4.0_ps_0/default_250/dev-other/sclite/wer') | 23.4 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/search_params/lm_2.5_ps_0/default_250/dev-other/sclite/wer') | 22 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/search_params/lm_2.5_ps_0.5/default_250/dev-other/sclite/wer') | 21.6 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/search_params/lm_2.0_ps_0.3/default_250/dev-other/sclite/wer') | 22.2 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed_v2/default_250/dev-other/sclite/wer') | 26.6 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/no_glow/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/no_glow/search_params/lm_3.0_ps_0/default_250/dev-other/sclite/wer') | 18.3 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/no_glow/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/no_glow/search_params/lm_3.0_ps_0.5/default_250/dev-other/sclite/wer') | 18.3 |\n", + "| 
('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/no_glow/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/no_glow/search_params/lm_3.5_ps_0.3/default_250/dev-other/sclite/wer') | 18.1 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/no_glow/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/no_glow/search_params/lm_4.0_ps_0.5/default_250/dev-other/sclite/wer') | 18.1 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/no_glow/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/no_glow/search_params/lm_2.5_ps_0.3/default_250/dev-other/sclite/wer') | 18.7 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/no_glow/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/no_glow/search_params/lm_3.5_ps_0/default_250/dev-other/sclite/wer') | 18.2 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/no_glow/', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/no_glow/search_params/lm_3.0_ps_0.3/default_250/dev-other/sclite/wer') | 18.3 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/no_glow/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/no_glow/search_params/lm_3.5_ps_0.5/default_250/dev-other/sclite/wer') | 18 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/no_glow/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/no_glow/search_params/lm_2.5_ps_0/default_250/dev-other/sclite/wer') | 18.7 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/no_glow/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/no_glow/search_params/lm_4.0_ps_0.3/default_250/dev-other/sclite/wer') | 18.1 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/no_glow/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/no_glow/search_params/lm_4.0_ps_0/default_250/dev-other/sclite/wer') | 18.4 |\n", + "| ('librispeech_glow_asr', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/no_glow/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/no_glow/search_params/lm_2.5_ps_0.5/default_250/dev-other/sclite/wer') | 18.7 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/no_glow/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/no_glow/tuned_no_prior/default_250/dev-other/sclite/wer') | 18.2 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/no_glow/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/no_glow/tuned/default_250/dev-other/sclite/wer') | 18 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/no_glow/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/no_glow/default_250/dev-other/sclite/wer') | 19.4 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_dec_0.05_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_dec_0.05_v2/default_250/dev-other/sclite/wer') | 43.1 |\n", + "| ('librispeech_glow_asr', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_dec_0.05_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_dec_0.05_v2/search_params/lm_4.0_ps_0.3/default_250/dev-other/sclite/wer') | 38.7 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_dec_0.05_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_dec_0.05_v2/search_params/lm_1.5_ps_0.3/default_250/dev-other/sclite/wer') | 39.4 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_dec_0.05_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_dec_0.05_v2/search_params/lm_3.0_ps_0.3/default_250/dev-other/sclite/wer') | 37.6 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_dec_0.05_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_dec_0.05_v2/search_params/lm_4.0_ps_0/default_250/dev-other/sclite/wer') | 39.8 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_dec_0.05_v2/', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_dec_0.05_v2/search_params/lm_2.5_ps_0/default_250/dev-other/sclite/wer') | 38 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_dec_0.05_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_dec_0.05_v2/search_params/lm_3.5_ps_0.5/default_250/dev-other/sclite/wer') | 37.7 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_dec_0.05_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_dec_0.05_v2/search_params/lm_2.0_ps_0/default_250/dev-other/sclite/wer') | 38.5 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_dec_0.05_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_dec_0.05_v2/search_params/lm_2.5_ps_0.5/default_250/dev-other/sclite/wer') | 37.7 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_dec_0.05_v2/', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_dec_0.05_v2/search_params/lm_2.0_ps_0.3/default_250/dev-other/sclite/wer') | 38.3 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_dec_0.05_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_dec_0.05_v2/search_params/lm_4.0_ps_0.5/default_250/dev-other/sclite/wer') | 38.3 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_dec_0.05_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_dec_0.05_v2/search_params/lm_3.0_ps_0/default_250/dev-other/sclite/wer') | 38.2 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_dec_0.05_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_dec_0.05_v2/search_params/lm_1.5_ps_0.5/default_250/dev-other/sclite/wer') | 39.7 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_dec_0.05_v2/', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_dec_0.05_v2/search_params/lm_3.0_ps_0.5/default_250/dev-other/sclite/wer') | 37.5 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_dec_0.05_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_dec_0.05_v2/search_params/lm_3.5_ps_0.3/default_250/dev-other/sclite/wer') | 37.9 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_dec_0.05_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_dec_0.05_v2/search_params/lm_1.5_ps_0/default_250/dev-other/sclite/wer') | 39.6 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_dec_0.05_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_dec_0.05_v2/search_params/lm_2.5_ps_0.3/default_250/dev-other/sclite/wer') | 37.7 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_dec_0.05_v2/', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_dec_0.05_v2/search_params/lm_3.5_ps_0/default_250/dev-other/sclite/wer') | 38.6 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_dec_0.05_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_dec_0.05_v2/search_params/lm_2.0_ps_0.5/default_250/dev-other/sclite/wer') | 38.4 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc192_200ep_dec_0.05_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc192_200ep_dec_0.05_v2/search_params/lm_3.0_ps_0/default_250/dev-other/sclite/wer') | 25.5 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc192_200ep_dec_0.05_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc192_200ep_dec_0.05_v2/search_params/lm_2.0_ps_0.3/default_250/dev-other/sclite/wer') | 25.4 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc192_200ep_dec_0.05_v2/', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc192_200ep_dec_0.05_v2/search_params/lm_2.5_ps_0.5/default_250/dev-other/sclite/wer') | 24.9 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc192_200ep_dec_0.05_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc192_200ep_dec_0.05_v2/search_params/lm_1.5_ps_0/default_250/dev-other/sclite/wer') | 26.5 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc192_200ep_dec_0.05_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc192_200ep_dec_0.05_v2/search_params/lm_1.5_ps_0.3/default_250/dev-other/sclite/wer') | 26.5 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc192_200ep_dec_0.05_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc192_200ep_dec_0.05_v2/search_params/lm_4.0_ps_0.3/default_250/dev-other/sclite/wer') | 26.1 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc192_200ep_dec_0.05_v2/', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc192_200ep_dec_0.05_v2/search_params/lm_3.5_ps_0/default_250/dev-other/sclite/wer') | 26.2 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc192_200ep_dec_0.05_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc192_200ep_dec_0.05_v2/search_params/lm_3.5_ps_0.5/default_250/dev-other/sclite/wer') | 25.1 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc192_200ep_dec_0.05_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc192_200ep_dec_0.05_v2/search_params/lm_3.0_ps_0.3/default_250/dev-other/sclite/wer') | 25 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc192_200ep_dec_0.05_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc192_200ep_dec_0.05_v2/search_params/lm_2.0_ps_0.5/default_250/dev-other/sclite/wer') | 25.5 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc192_200ep_dec_0.05_v2/', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc192_200ep_dec_0.05_v2/search_params/lm_4.0_ps_0/default_250/dev-other/sclite/wer') | 27.1 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc192_200ep_dec_0.05_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc192_200ep_dec_0.05_v2/search_params/lm_2.5_ps_0/default_250/dev-other/sclite/wer') | 25.3 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc192_200ep_dec_0.05_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc192_200ep_dec_0.05_v2/search_params/lm_2.5_ps_0.3/default_250/dev-other/sclite/wer') | 25 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc192_200ep_dec_0.05_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc192_200ep_dec_0.05_v2/search_params/lm_1.5_ps_0.5/default_250/dev-other/sclite/wer') | 26.6 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc192_200ep_dec_0.05_v2/', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc192_200ep_dec_0.05_v2/search_params/lm_2.0_ps_0/default_250/dev-other/sclite/wer') | 25.6 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc192_200ep_dec_0.05_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc192_200ep_dec_0.05_v2/search_params/lm_4.0_ps_0.5/default_250/dev-other/sclite/wer') | 25.7 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc192_200ep_dec_0.05_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc192_200ep_dec_0.05_v2/search_params/lm_3.5_ps_0.3/default_250/dev-other/sclite/wer') | 25.3 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc192_200ep_dec_0.05_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc192_200ep_dec_0.05_v2/search_params/lm_3.0_ps_0.5/default_250/dev-other/sclite/wer') | 24.8 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc192_200ep_dec_0.05_v2/', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc192_200ep_dec_0.05_v2/default_250/dev-other/sclite/wer') | 29.8 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0/search_params/lm_4.0_ps_0.3/default_250/dev-other/sclite/wer') | 23.3 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0/search_params/lm_3.0_ps_0.3/default_250/dev-other/sclite/wer') | 23 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0/search_params/lm_3.5_ps_0/default_250/dev-other/sclite/wer') | 23.4 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0/', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0/search_params/lm_3.5_ps_0.5/default_250/dev-other/sclite/wer') | 22.9 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0/search_params/lm_3.0_ps_0/default_250/dev-other/sclite/wer') | 23.3 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0/search_params/lm_2.5_ps_0.5/default_250/dev-other/sclite/wer') | 23.1 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0/search_params/lm_2.0_ps_0.3/default_250/dev-other/sclite/wer') | 23.9 |\n", + "| ('librispeech_glow_asr', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0/search_params/lm_2.0_ps_0/default_250/dev-other/sclite/wer') | 24 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0/search_params/lm_4.0_ps_0.5/default_250/dev-other/sclite/wer') | 23.2 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0/search_params/lm_3.0_ps_0.5/default_250/dev-other/sclite/wer') | 22.9 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0/search_params/lm_3.5_ps_0.3/default_250/dev-other/sclite/wer') | 23 |\n", + "| 
('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0/search_params/lm_4.0_ps_0/default_250/dev-other/sclite/wer') | 23.9 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0/search_params/lm_2.5_ps_0.3/default_250/dev-other/sclite/wer') | 23.3 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0/search_params/lm_2.0_ps_0.5/default_250/dev-other/sclite/wer') | 23.9 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0/', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0/search_params/lm_2.5_ps_0/default_250/dev-other/sclite/wer') | 23.4 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0/default_250/dev-other/sclite/wer') | 25.9 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/lm_tuning/lm1.5_ps0/default_250/dev-other/sclite/wer') | 29.8 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/lm_tuning/lm4.0_ps0/default_250/dev-other/sclite/wer') | 26.6 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/lm_tuning/lm3.5_ps0.5/default_250/dev-other/sclite/wer') | 26.9 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/lm_tuning/lm4.0_ps0.5/default_250/dev-other/sclite/wer') | 26.6 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/lm_tuning/lm3.0_ps0.3/default_250/dev-other/sclite/wer') | 27.2 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/lm_tuning/lm3.5_ps0/default_250/dev-other/sclite/wer') | 26.8 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/lm_tuning/lm2.0_ps0.3/default_250/dev-other/sclite/wer') | 28.7 |\n", + "| 
('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/lm_tuning/lm1.5_ps0.3/default_250/dev-other/sclite/wer') | 30 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/lm_tuning/lm2.5_ps0/default_250/dev-other/sclite/wer') | 27.8 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/lm_tuning/lm2.5_ps0.5/default_250/dev-other/sclite/wer') | 28 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/lm_tuning/lm4.0_ps0.3/default_250/dev-other/sclite/wer') | 26.5 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/lm_tuning/lm3.5_ps0.3/default_250/dev-other/sclite/wer') | 26.8 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/lm_tuning/lm3.0_ps0.5/default_250/dev-other/sclite/wer') | 27.4 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/lm_tuning/lm1.5_ps0.5/default_250/dev-other/sclite/wer') | 30.1 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/lm_tuning/lm2.0_ps0.5/default_250/dev-other/sclite/wer') | 28.8 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/lm_tuning/lm2.0_ps0/default_250/dev-other/sclite/wer') | 28.6 |\n", + "| 
('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/lm_tuning/lm2.5_ps0.3/default_250/dev-other/sclite/wer') | 27.9 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/lm_tuning/lm3.0_ps0/default_250/dev-other/sclite/wer') | 27.2 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/default_250/dev-other/sclite/wer') | 26.9 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.3/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.3/default_250/dev-other/sclite/wer') | 24.1 |\n", + "| ('librispeech_glow_asr', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.3/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.3/search_params/lm_4.0_ps_0/default_250/dev-other/sclite/wer') | 21.9 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.3/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.3/search_params/lm_2.5_ps_0/default_250/dev-other/sclite/wer') | 20.6 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.3/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.3/search_params/lm_4.0_ps_0.5/default_250/dev-other/sclite/wer') | 20.7 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.3/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.3/search_params/lm_3.5_ps_0.3/default_250/dev-other/sclite/wer') | 
20.4 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.3/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.3/search_params/lm_3.0_ps_0.5/default_250/dev-other/sclite/wer') | 20 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.3/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.3/search_params/lm_2.0_ps_0/default_250/dev-other/sclite/wer') | 21 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.3/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.3/search_params/lm_2.0_ps_0.5/default_250/dev-other/sclite/wer') | 20.8 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.3/', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.3/search_params/lm_2.5_ps_0.3/default_250/dev-other/sclite/wer') | 20.2 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.3/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.3/search_params/lm_4.0_ps_0.3/default_250/dev-other/sclite/wer') | 21.1 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.3/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.3/search_params/lm_3.5_ps_0.5/default_250/dev-other/sclite/wer') | 20.2 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.3/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.3/search_params/lm_3.0_ps_0.3/default_250/dev-other/sclite/wer') | 20.1 |\n", + "| ('librispeech_glow_asr', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.3/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.3/search_params/lm_3.0_ps_0/default_250/dev-other/sclite/wer') | 20.6 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.3/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.3/search_params/lm_3.5_ps_0/default_250/dev-other/sclite/wer') | 21 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.3/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.3/search_params/lm_2.0_ps_0.3/default_250/dev-other/sclite/wer') | 20.7 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.3/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.3/search_params/lm_2.5_ps_0.5/default_250/dev-other/sclite/wer') | 
20.2 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc768_100ep_dec_0.05_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc768_100ep_dec_0.05_v2/search_params/lm_1.5_ps_0/default_250/dev-other/sclite/wer') | 22.9 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc768_100ep_dec_0.05_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc768_100ep_dec_0.05_v2/search_params/lm_2.0_ps_0.3/default_250/dev-other/sclite/wer') | 21.8 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc768_100ep_dec_0.05_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc768_100ep_dec_0.05_v2/search_params/lm_2.5_ps_0.5/default_250/dev-other/sclite/wer') | 21.3 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc768_100ep_dec_0.05_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc768_100ep_dec_0.05_v2/search_params/lm_3.5_ps_0/default_250/dev-other/sclite/wer') | 22.1 |\n", + "| ('librispeech_glow_asr', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc768_100ep_dec_0.05_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc768_100ep_dec_0.05_v2/search_params/lm_3.5_ps_0.5/default_250/dev-other/sclite/wer') | 21.5 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc768_100ep_dec_0.05_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc768_100ep_dec_0.05_v2/search_params/lm_3.0_ps_0/default_250/dev-other/sclite/wer') | 21.7 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc768_100ep_dec_0.05_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc768_100ep_dec_0.05_v2/search_params/lm_3.0_ps_0.3/default_250/dev-other/sclite/wer') | 21.3 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc768_100ep_dec_0.05_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc768_100ep_dec_0.05_v2/search_params/lm_1.5_ps_0.3/default_250/dev-other/sclite/wer') | 22.7 |\n", + "| ('librispeech_glow_asr', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc768_100ep_dec_0.05_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc768_100ep_dec_0.05_v2/search_params/lm_4.0_ps_0.3/default_250/dev-other/sclite/wer') | 22.1 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc768_100ep_dec_0.05_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc768_100ep_dec_0.05_v2/search_params/lm_2.0_ps_0.5/default_250/dev-other/sclite/wer') | 21.8 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc768_100ep_dec_0.05_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc768_100ep_dec_0.05_v2/search_params/lm_2.0_ps_0/default_250/dev-other/sclite/wer') | 22 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc768_100ep_dec_0.05_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc768_100ep_dec_0.05_v2/search_params/lm_2.5_ps_0.3/default_250/dev-other/sclite/wer') | 21.3 |\n", + "| ('librispeech_glow_asr', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc768_100ep_dec_0.05_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc768_100ep_dec_0.05_v2/search_params/lm_3.5_ps_0.3/default_250/dev-other/sclite/wer') | 21.6 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc768_100ep_dec_0.05_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc768_100ep_dec_0.05_v2/search_params/lm_3.0_ps_0.5/default_250/dev-other/sclite/wer') | 21.2 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc768_100ep_dec_0.05_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc768_100ep_dec_0.05_v2/search_params/lm_4.0_ps_0/default_250/dev-other/sclite/wer') | 22.8 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc768_100ep_dec_0.05_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc768_100ep_dec_0.05_v2/search_params/lm_1.5_ps_0.5/default_250/dev-other/sclite/wer') | 22.8 |\n", + "| ('librispeech_glow_asr', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc768_100ep_dec_0.05_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc768_100ep_dec_0.05_v2/search_params/lm_2.5_ps_0/default_250/dev-other/sclite/wer') | 21.7 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc768_100ep_dec_0.05_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc768_100ep_dec_0.05_v2/search_params/lm_4.0_ps_0.5/default_250/dev-other/sclite/wer') | 22 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc768_100ep_dec_0.05_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc768_100ep_dec_0.05_v2/default_250/dev-other/sclite/wer') | 25.3 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_100ep_xvector_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_100ep_xvector_v2/search_params/lm_2.5_ps_0.5/default_250/dev-other/sclite/wer') | 20 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_100ep_xvector_v2/', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_100ep_xvector_v2/search_params/lm_2.5_ps_0/default_250/dev-other/sclite/wer') | 20.3 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_100ep_xvector_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_100ep_xvector_v2/search_params/lm_4.0_ps_0.3/default_250/dev-other/sclite/wer') | 20.5 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_100ep_xvector_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_100ep_xvector_v2/search_params/lm_4.0_ps_0/default_250/dev-other/sclite/wer') | 21.3 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_100ep_xvector_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_100ep_xvector_v2/search_params/lm_3.5_ps_0.5/default_250/dev-other/sclite/wer') | 19.9 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_100ep_xvector_v2/', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_100ep_xvector_v2/search_params/lm_3.0_ps_0.3/default_250/dev-other/sclite/wer') | 19.9 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_100ep_xvector_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_100ep_xvector_v2/search_params/lm_3.5_ps_0/default_250/dev-other/sclite/wer') | 20.6 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_100ep_xvector_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_100ep_xvector_v2/search_params/lm_2.5_ps_0.3/default_250/dev-other/sclite/wer') | 20 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_100ep_xvector_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_100ep_xvector_v2/search_params/lm_4.0_ps_0.5/default_250/dev-other/sclite/wer') | 20.2 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_100ep_xvector_v2/', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_100ep_xvector_v2/search_params/lm_3.5_ps_0.3/default_250/dev-other/sclite/wer') | 20 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_100ep_xvector_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_100ep_xvector_v2/search_params/lm_3.0_ps_0/default_250/dev-other/sclite/wer') | 20.2 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_100ep_xvector_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_100ep_xvector_v2/search_params/lm_3.0_ps_0.5/default_250/dev-other/sclite/wer') | 19.7 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_100ep_xvector_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_100ep_xvector_v2/default_250/dev-other/sclite/wer') | 23.6 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm2.5_ps0.5/default_250/dev-other/sclite/wer') | 27.4 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm1.5_ps0.3/default_250/dev-other/sclite/wer') | 29.6 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm2.0_ps0.3/default_250/dev-other/sclite/wer') | 28.4 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm3.0_ps0/default_250/dev-other/sclite/wer') | 26.8 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm3.0_ps0.3/default_250/dev-other/sclite/wer') | 26.8 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm2.0_ps0/default_250/dev-other/sclite/wer') | 28.3 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm4.0_ps0.5/default_250/dev-other/sclite/wer') | 26.1 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm3.5_ps0.5/default_250/dev-other/sclite/wer') | 26.4 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm2.5_ps0.3/default_250/dev-other/sclite/wer') | 27.4 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm4.0_ps0/default_250/dev-other/sclite/wer') | 26.3 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm2.0_ps0.5/default_250/dev-other/sclite/wer') | 28.4 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm1.5_ps0.5/default_250/dev-other/sclite/wer') | 29.7 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm1.5_ps0/default_250/dev-other/sclite/wer') | 29.5 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm3.0_ps0.5/default_250/dev-other/sclite/wer') | 26.8 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm2.5_ps0/default_250/dev-other/sclite/wer') | 27.3 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm3.5_ps0.3/default_250/dev-other/sclite/wer') | 26.4 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm4.0_ps0.3/default_250/dev-other/sclite/wer') | 26.1 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/lm_tuning/lm3.5_ps0/default_250/dev-other/sclite/wer') | 26.4 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/default_250/dev-other/sclite/wer') | 26.6 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained_control/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained_control/default_250/dev-other/sclite/wer') | 19.7 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained_control/', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained_control/lm_tuning/lm2.5_ps0.5/default_250/dev-other/sclite/wer') | 18.9 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained_control/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained_control/lm_tuning/lm1.5_ps0.3/default_250/dev-other/sclite/wer') | 20.8 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained_control/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained_control/lm_tuning/lm2.0_ps0.3/default_250/dev-other/sclite/wer') | 19.8 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained_control/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained_control/lm_tuning/lm2.0_ps0/default_250/dev-other/sclite/wer') | 19.8 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained_control/', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained_control/lm_tuning/lm3.0_ps0.3/default_250/dev-other/sclite/wer') | 18.6 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained_control/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained_control/lm_tuning/lm3.0_ps0/default_250/dev-other/sclite/wer') | 18.7 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained_control/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained_control/lm_tuning/lm4.0_ps0.5/default_250/dev-other/sclite/wer') | 18.4 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained_control/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained_control/lm_tuning/lm3.5_ps0.5/default_250/dev-other/sclite/wer') | 18.3 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained_control/', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained_control/lm_tuning/lm1.5_ps0/default_250/dev-other/sclite/wer') | 20.9 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained_control/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained_control/lm_tuning/lm2.5_ps0.3/default_250/dev-other/sclite/wer') | 19 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained_control/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained_control/lm_tuning/lm4.0_ps0/default_250/dev-other/sclite/wer') | 18.8 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained_control/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained_control/lm_tuning/lm2.0_ps0.5/default_250/dev-other/sclite/wer') | 19.8 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained_control/', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained_control/lm_tuning/lm1.5_ps0.5/default_250/dev-other/sclite/wer') | 20.9 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained_control/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained_control/lm_tuning/lm3.5_ps0/default_250/dev-other/sclite/wer') | 18.6 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained_control/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained_control/lm_tuning/lm3.0_ps0.5/default_250/dev-other/sclite/wer') | 18.5 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained_control/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained_control/lm_tuning/lm2.5_ps0/default_250/dev-other/sclite/wer') | 19 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained_control/', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained_control/lm_tuning/lm3.5_ps0.3/default_250/dev-other/sclite/wer') | 18.4 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained_control/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained_control/lm_tuning/lm4.0_ps0.3/default_250/dev-other/sclite/wer') | 18.5 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed_v3/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed_v3/default_250/dev-other/sclite/wer') | 26.3 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_1/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_1/search_params/lm_2.5_ps_0.3/default_250/dev-other/sclite/wer') | 19.2 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_1/', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_1/search_params/lm_3.0_ps_0/default_250/dev-other/sclite/wer') | 19.6 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_1/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_1/search_params/lm_2.0_ps_0.5/default_250/dev-other/sclite/wer') | 19.7 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_1/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_1/search_params/lm_3.0_ps_0.5/default_250/dev-other/sclite/wer') | 18.9 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_1/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_1/search_params/lm_3.5_ps_0.3/default_250/dev-other/sclite/wer') | 19.3 |\n", + "| ('librispeech_glow_asr', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_1/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_1/search_params/lm_4.0_ps_0.5/default_250/dev-other/sclite/wer') | 19.6 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_1/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_1/search_params/lm_3.5_ps_0/default_250/dev-other/sclite/wer') | 19.9 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_1/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_1/search_params/lm_2.5_ps_0.5/default_250/dev-other/sclite/wer') | 19.1 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_1/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_1/search_params/lm_2.0_ps_0.3/default_250/dev-other/sclite/wer') | 19.7 |\n", + 
"| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_1/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_1/search_params/lm_4.0_ps_0/default_250/dev-other/sclite/wer') | 20.6 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_1/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_1/search_params/lm_2.5_ps_0/default_250/dev-other/sclite/wer') | 19.5 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_1/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_1/search_params/lm_3.0_ps_0.3/default_250/dev-other/sclite/wer') | 19 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_1/', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_1/search_params/lm_2.0_ps_0/default_250/dev-other/sclite/wer') | 20 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_1/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_1/search_params/lm_3.5_ps_0.5/default_250/dev-other/sclite/wer') | 19.1 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_1/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_1/search_params/lm_4.0_ps_0.3/default_250/dev-other/sclite/wer') | 19.9 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_1/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_1/default_250/dev-other/sclite/wer') | 23 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_100ep_xvector/', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_100ep_xvector/search_params/lm_3.0_ps_0/default_250/dev-other/sclite/wer') | 24.7 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_100ep_xvector/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_100ep_xvector/search_params/lm_2.5_ps_0.5/default_250/dev-other/sclite/wer') | 23.6 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_100ep_xvector/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_100ep_xvector/search_params/lm_4.0_ps_0.3/default_250/dev-other/sclite/wer') | 25.8 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_100ep_xvector/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_100ep_xvector/search_params/lm_3.0_ps_0.3/default_250/dev-other/sclite/wer') | 23.8 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_100ep_xvector/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_100ep_xvector/search_params/lm_3.5_ps_0/default_250/dev-other/sclite/wer') | 
25.8 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_100ep_xvector/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_100ep_xvector/search_params/lm_3.5_ps_0.5/default_250/dev-other/sclite/wer') | 24.1 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_100ep_xvector/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_100ep_xvector/search_params/lm_2.5_ps_0.3/default_250/dev-other/sclite/wer') | 23.7 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_100ep_xvector/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_100ep_xvector/search_params/lm_4.0_ps_0/default_250/dev-other/sclite/wer') | 27.1 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_100ep_xvector/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_100ep_xvector/search_params/lm_2.5_ps_0/default_250/dev-other/sclite/wer') | 24.2 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_100ep_xvector/', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_100ep_xvector/search_params/lm_4.0_ps_0.5/default_250/dev-other/sclite/wer') | 25.2 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_100ep_xvector/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_100ep_xvector/search_params/lm_3.0_ps_0.5/default_250/dev-other/sclite/wer') | 23.6 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_100ep_xvector/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_100ep_xvector/search_params/lm_3.5_ps_0.3/default_250/dev-other/sclite/wer') | 24.6 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_100ep_xvector/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_100ep_xvector/default_250/dev-other/sclite/wer') | 30.8 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.5/', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.5/default_250/dev-other/sclite/wer') | 24 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.5/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.5/search_params/lm_2.0_ps_0.3/default_250/dev-other/sclite/wer') | 20.4 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.5/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.5/search_params/lm_2.5_ps_0.5/default_250/dev-other/sclite/wer') | 19.9 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.5/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.5/search_params/lm_2.0_ps_0/default_250/dev-other/sclite/wer') | 20.6 |\n", + "| ('librispeech_glow_asr', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.5/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.5/search_params/lm_3.5_ps_0.5/default_250/dev-other/sclite/wer') | 19.8 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.5/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.5/search_params/lm_3.0_ps_0.3/default_250/dev-other/sclite/wer') | 19.8 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.5/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.5/search_params/lm_2.5_ps_0/default_250/dev-other/sclite/wer') | 20.1 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.5/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.5/search_params/lm_4.0_ps_0.3/default_250/dev-other/sclite/wer') 
| 20.6 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.5/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.5/search_params/lm_4.0_ps_0/default_250/dev-other/sclite/wer') | 21.4 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.5/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.5/search_params/lm_2.0_ps_0.5/default_250/dev-other/sclite/wer') | 20.4 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.5/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.5/search_params/lm_2.5_ps_0.3/default_250/dev-other/sclite/wer') | 19.8 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.5/', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.5/search_params/lm_3.5_ps_0/default_250/dev-other/sclite/wer') | 20.5 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.5/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.5/search_params/lm_3.5_ps_0.3/default_250/dev-other/sclite/wer') | 19.9 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.5/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.5/search_params/lm_3.0_ps_0/default_250/dev-other/sclite/wer') | 20.1 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.5/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.5/search_params/lm_3.0_ps_0.5/default_250/dev-other/sclite/wer') | 19.7 |\n", + "| ('librispeech_glow_asr', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.5/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.5/search_params/lm_4.0_ps_0.5/default_250/dev-other/sclite/wer') | 20.3 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment_before/glow_enc192_200ep_not_silence_preprocessed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment_before/glow_enc192_200ep_not_silence_preprocessed/default_250/dev-other/sclite/wer') | 100 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed/default_250/dev-other/sclite/wer') | 27.6 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed/search_params/lm_3.5_ps_0.3/default_250/dev-other/sclite/wer') | 25.3 |\n", + "| ('librispeech_glow_asr', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed/search_params/lm_3.0_ps_0.5/default_250/dev-other/sclite/wer') | 25.3 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed/search_params/lm_4.0_ps_0.5/default_250/dev-other/sclite/wer') | 25.4 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed/search_params/lm_2.5_ps_0/default_250/dev-other/sclite/wer') | 25.8 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed/search_params/lm_2.5_ps_0.3/default_250/dev-other/sclite/wer') 
| 25.7 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed/search_params/lm_4.0_ps_0/default_250/dev-other/sclite/wer') | 26 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed/search_params/lm_3.5_ps_0.5/default_250/dev-other/sclite/wer') | 25.2 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed/search_params/lm_3.0_ps_0.3/default_250/dev-other/sclite/wer') | 25.3 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed/', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed/search_params/lm_3.5_ps_0/default_250/dev-other/sclite/wer') | 25.6 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed/search_params/lm_4.0_ps_0.3/default_250/dev-other/sclite/wer') | 25.6 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed/search_params/lm_2.5_ps_0.5/default_250/dev-other/sclite/wer') | 25.7 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed/search_params/lm_3.0_ps_0/default_250/dev-other/sclite/wer') | 25.5 |\n", + "| ('librispeech_glow_asr', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed/tuned/default_250/dev-other/sclite/wer') | 21.8 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed/search_params/lm_3.0_ps_0.3/default_250/dev-other/sclite/wer') | 21.9 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed/search_params/lm_2.5_ps_0/default_250/dev-other/sclite/wer') | 22.5 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed/search_params/lm_4.0_ps_0/default_250/dev-other/sclite/wer') | 24.4 |\n", + "| ('librispeech_glow_asr', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed/search_params/lm_3.5_ps_0.5/default_250/dev-other/sclite/wer') | 22.1 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed/search_params/lm_4.0_ps_0.3/default_250/dev-other/sclite/wer') | 23.3 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed/search_params/lm_2.5_ps_0.5/default_250/dev-other/sclite/wer') | 21.9 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed/search_params/lm_3.0_ps_0.5/default_250/dev-other/sclite/wer') | 21.8 |\n", + "| 
('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed/search_params/lm_3.5_ps_0.3/default_250/dev-other/sclite/wer') | 22.4 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed/search_params/lm_4.0_ps_0.5/default_250/dev-other/sclite/wer') | 22.8 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed/search_params/lm_3.0_ps_0/default_250/dev-other/sclite/wer') | 22.6 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed/search_params/lm_2.5_ps_0.3/default_250/dev-other/sclite/wer') 
| 22 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed/search_params/lm_3.5_ps_0/default_250/dev-other/sclite/wer') | 23.3 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed/default_250/dev-other/sclite/wer') | 27.5 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed/batch_norm/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed/batch_norm/default_250/dev-other/sclite/wer') | 29.4 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed/layer_norm/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed/layer_norm/default_250/dev-other/sclite/wer') | 28.5 |\n", + "| 
('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.2/default_250/dev-other/sclite/wer') | 25.5 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.5/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.5/default_250/dev-other/sclite/wer') | 25.4 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.5/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.5/search_params/lm_3.0_ps_0/default_250/dev-other/sclite/wer') | 20.6 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.5/', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.5/search_params/lm_2.5_ps_0.5/default_250/dev-other/sclite/wer') | 20 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.5/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.5/search_params/lm_4.0_ps_0.3/default_250/dev-other/sclite/wer') | 21.3 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.5/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.5/search_params/lm_3.0_ps_0.3/default_250/dev-other/sclite/wer') | 20.1 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.5/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.5/search_params/lm_3.5_ps_0/default_250/dev-other/sclite/wer') | 21.2 |\n", + "| ('librispeech_glow_asr', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.5/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.5/search_params/lm_3.5_ps_0.5/default_250/dev-other/sclite/wer') | 20.3 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.5/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.5/search_params/lm_2.5_ps_0.3/default_250/dev-other/sclite/wer') | 20 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.5/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.5/search_params/lm_4.0_ps_0/default_250/dev-other/sclite/wer') | 22.2 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.5/', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.5/search_params/lm_2.5_ps_0/default_250/dev-other/sclite/wer') | 20.3 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.5/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.5/search_params/lm_4.0_ps_0.5/default_250/dev-other/sclite/wer') | 20.9 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.5/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.5/search_params/lm_3.0_ps_0.5/default_250/dev-other/sclite/wer') | 20 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.5/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.5/search_params/lm_3.5_ps_0.3/default_250/dev-other/sclite/wer') | 20.5 |\n", + "| ('librispeech_glow_asr', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.1/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.1/default_250/dev-other/sclite/wer') | 25.8 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed/tuning/lm_3.5_ps_0.3/default_250/dev-other/sclite/wer') | 21.6 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed/tuning/lm_3.0_ps_0.5/default_250/dev-other/sclite/wer') | 21.2 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed/tuning/lm_4.0_ps_0.5/default_250/dev-other/sclite/wer') | 22 |\n", + "| 
('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed/tuning/lm_2.5_ps_0.3/default_250/dev-other/sclite/wer') | 21.2 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed/tuning/lm_2.5_ps_0/default_250/dev-other/sclite/wer') | 21.7 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed/tuning/lm_4.0_ps_0/default_250/dev-other/sclite/wer') | 23.2 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed/tuning/lm_3.5_ps_0.5/default_250/dev-other/sclite/wer') | 21.4 |\n", + "| 
('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed/tuning/lm_3.0_ps_0.3/default_250/dev-other/sclite/wer') | 21.2 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed/tuning/lm_3.5_ps_0/default_250/dev-other/sclite/wer') | 22.3 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed/tuning/lm_4.0_ps_0.3/default_250/dev-other/sclite/wer') | 22.3 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed/tuning/lm_3.0_ps_0/default_250/dev-other/sclite/wer') | 21.8 |\n", + "| 
('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed/tuning/lm_2.5_ps_0.5/default_250/dev-other/sclite/wer') | 21.3 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed/default_250/dev-other/sclite/wer') | 26.5 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.4/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.4/default_250/dev-other/sclite/wer') | 25.2 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc768_200ep_not_silence_preprocessed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc768_200ep_not_silence_preprocessed/default_250/dev-other/sclite/wer') | 100 |\n", + "| ('librispeech_glow_asr', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc768_200ep_not_silence_preprocessed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc768_200ep_not_silence_preprocessed/tuning/lm_4.0_ps_0.5/default_250/dev-other/sclite/wer') | 100 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc768_200ep_not_silence_preprocessed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc768_200ep_not_silence_preprocessed/tuning/lm_2.5_ps_0/default_250/dev-other/sclite/wer') | 100 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc768_200ep_not_silence_preprocessed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc768_200ep_not_silence_preprocessed/tuning/lm_3.0_ps_0.5/default_250/dev-other/sclite/wer') | 100 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc768_200ep_not_silence_preprocessed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc768_200ep_not_silence_preprocessed/tuning/lm_4.0_ps_0/default_250/dev-other/sclite/wer') | 100 |\n", + "| ('librispeech_glow_asr', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc768_200ep_not_silence_preprocessed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc768_200ep_not_silence_preprocessed/tuning/lm_3.5_ps_0.3/default_250/dev-other/sclite/wer') | 100 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc768_200ep_not_silence_preprocessed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc768_200ep_not_silence_preprocessed/tuning/lm_2.5_ps_0.3/default_250/dev-other/sclite/wer') | 100 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc768_200ep_not_silence_preprocessed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc768_200ep_not_silence_preprocessed/tuning/lm_3.0_ps_0/default_250/dev-other/sclite/wer') | 100 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc768_200ep_not_silence_preprocessed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc768_200ep_not_silence_preprocessed/tuning/lm_4.0_ps_0.3/default_250/dev-other/sclite/wer') | 100 |\n", + "| ('librispeech_glow_asr', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc768_200ep_not_silence_preprocessed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc768_200ep_not_silence_preprocessed/tuning/lm_3.0_ps_0.3/default_250/dev-other/sclite/wer') | 100 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc768_200ep_not_silence_preprocessed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc768_200ep_not_silence_preprocessed/tuning/lm_3.5_ps_0.5/default_250/dev-other/sclite/wer') | 100 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc768_200ep_not_silence_preprocessed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc768_200ep_not_silence_preprocessed/tuning/lm_2.5_ps_0.5/default_250/dev-other/sclite/wer') | 100 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc768_200ep_not_silence_preprocessed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc768_200ep_not_silence_preprocessed/tuning/lm_3.5_ps_0/default_250/dev-other/sclite/wer') | 100 |\n", + "| ('librispeech_glow_asr', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.3/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.3/default_250/dev-other/sclite/wer') | 26.2 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_1/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_1/search_params/lm_2.5_ps_0.3/default_250/dev-other/sclite/wer') | 19.5 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_1/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_1/search_params/lm_2.5_ps_0/default_250/dev-other/sclite/wer') | 19.9 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_1/', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_1/search_params/lm_4.0_ps_0/default_250/dev-other/sclite/wer') | 21.4 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_1/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_1/search_params/lm_3.0_ps_0.5/default_250/dev-other/sclite/wer') | 19.4 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_1/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_1/search_params/lm_3.5_ps_0.3/default_250/dev-other/sclite/wer') | 19.9 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_1/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_1/search_params/lm_4.0_ps_0.5/default_250/dev-other/sclite/wer') | 20.3 |\n", + "| ('librispeech_glow_asr', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_1/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_1/search_params/lm_2.5_ps_0.5/default_250/dev-other/sclite/wer') | 19.4 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_1/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_1/search_params/lm_3.0_ps_0/default_250/dev-other/sclite/wer') | 20.1 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_1/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_1/search_params/lm_3.0_ps_0.3/default_250/dev-other/sclite/wer') | 19.6 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_1/', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_1/search_params/lm_3.5_ps_0/default_250/dev-other/sclite/wer') | 20.5 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_1/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_1/search_params/lm_3.5_ps_0.5/default_250/dev-other/sclite/wer') | 19.8 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_1/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_1/search_params/lm_4.0_ps_0.3/default_250/dev-other/sclite/wer') | 20.5 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_1/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_1/default_250/dev-other/sclite/wer') | 24.3 |\n", + "| ('librispeech_glow_asr', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/tts_dataset/spec_augment/no_glow/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/tts_dataset/spec_augment/no_glow/default_250/dev-other/sclite/wer') | 20.9 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/only_blstm_2x512_d0.2_b300_fs4/tts_dataset/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/only_blstm_2x512_d0.2_b300_fs4/tts_dataset/default_250/dev-other/sclite/wer') | 59.8 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/only_blstm_2x512_d0.2_b300_fs4/asr_dataset/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/only_blstm_2x512_d0.2_b300_fs4/asr_dataset/tuning/lm_2.5_ps_0.3/default_250/dev-other/sclite/wer') | 49.4 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/only_blstm_2x512_d0.2_b300_fs4/asr_dataset/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/only_blstm_2x512_d0.2_b300_fs4/asr_dataset/tuning/lm_3.5_ps_0/default_250/dev-other/sclite/wer') | 49.4 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/only_blstm_2x512_d0.2_b300_fs4/asr_dataset/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/only_blstm_2x512_d0.2_b300_fs4/asr_dataset/tuning/lm_3.0_ps_0.5/default_250/dev-other/sclite/wer') | 47.9 |\n", + "| ('librispeech_glow_asr', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/only_blstm_2x512_d0.2_b300_fs4/asr_dataset/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/only_blstm_2x512_d0.2_b300_fs4/asr_dataset/tuning/lm_3.5_ps_0.3/default_250/dev-other/sclite/wer') | 48.2 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/only_blstm_2x512_d0.2_b300_fs4/asr_dataset/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/only_blstm_2x512_d0.2_b300_fs4/asr_dataset/tuning/lm_3.0_ps_0/default_250/dev-other/sclite/wer') | 49.4 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/only_blstm_2x512_d0.2_b300_fs4/asr_dataset/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/only_blstm_2x512_d0.2_b300_fs4/asr_dataset/tuning/lm_4.0_ps_0.5/default_250/dev-other/sclite/wer') | 48.1 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/only_blstm_2x512_d0.2_b300_fs4/asr_dataset/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/only_blstm_2x512_d0.2_b300_fs4/asr_dataset/tuning/lm_2.5_ps_0.5/default_250/dev-other/sclite/wer') | 49.2 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/only_blstm_2x512_d0.2_b300_fs4/asr_dataset/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/only_blstm_2x512_d0.2_b300_fs4/asr_dataset/tuning/lm_2.5_ps_0/default_250/dev-other/sclite/wer') | 50 |\n", + "| ('librispeech_glow_asr', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/only_blstm_2x512_d0.2_b300_fs4/asr_dataset/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/only_blstm_2x512_d0.2_b300_fs4/asr_dataset/tuning/lm_3.0_ps_0.3/default_250/dev-other/sclite/wer') | 48.3 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/only_blstm_2x512_d0.2_b300_fs4/asr_dataset/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/only_blstm_2x512_d0.2_b300_fs4/asr_dataset/tuning/lm_3.5_ps_0.5/default_250/dev-other/sclite/wer') | 47.7 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/only_blstm_2x512_d0.2_b300_fs4/asr_dataset/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/only_blstm_2x512_d0.2_b300_fs4/asr_dataset/tuning/lm_4.0_ps_0/default_250/dev-other/sclite/wer') | 50 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/only_blstm_2x512_d0.2_b300_fs4/asr_dataset/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/only_blstm_2x512_d0.2_b300_fs4/asr_dataset/tuning/lm_4.0_ps_0.3/default_250/dev-other/sclite/wer') | 48.5 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/only_blstm_2x512_d0.2_b300_fs4/asr_dataset/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/only_blstm_2x512_d0.2_b300_fs4/asr_dataset/default_250/dev-other/sclite/wer') | 53 |\n", + "| ('librispeech_glow_asr', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/only_blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/only_blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/default_250/dev-other/sclite/wer') | 57 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/linear_1x512_d0.2_b300_fs4/glow_nar_taco_encoder_16blocks/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/linear_1x512_d0.2_b300_fs4/glow_nar_taco_encoder_16blocks/default_250/dev-other/sclite/wer') | 98.8 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/linear_1x512_d0.2_b300_fs4/glow_enc768/tts_dataset/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/linear_1x512_d0.2_b300_fs4/glow_enc768/tts_dataset/default_250/dev-other/sclite/wer') | 99.7 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x1024_d0.2_b300_fs4/asr_dataset/glow_enc768/not_silence_preprocessed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x1024_d0.2_b300_fs4/asr_dataset/glow_enc768/not_silence_preprocessed/default_250/dev-other/sclite/wer') | 66.4 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/tts_dataset/glow_enc768/with_sigma/not_silence_preprocessed/', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/tts_dataset/glow_enc768/with_sigma/not_silence_preprocessed/default_250/dev-other/sclite/wer') | 70.4 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/tts_dataset/glow_enc768/with_sigma/silence_preprocessing/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/tts_dataset/glow_enc768/with_sigma/silence_preprocessing/default_250/dev-other/sclite/wer') | 71.9 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/tts_dataset/glow_simple_encoder_epoch84/silence_preprocessed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/tts_dataset/glow_simple_encoder_epoch84/silence_preprocessed/default_250/dev-other/sclite/wer') | 57.4 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/tts_dataset/glow_nar_taco_encoder_16blocks/not_silence_preprocessed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/tts_dataset/glow_nar_taco_encoder_16blocks/not_silence_preprocessed/default_250/dev-other/sclite/wer') | 55.1 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/tts_dataset/glow_nar_taco_encoder/not_silence_preprocessed/', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/tts_dataset/glow_nar_taco_encoder/not_silence_preprocessed/default_250/dev-other/sclite/wer') | 66.2 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/tts_dataset/glow_nar_taco_encoder/silence_preprocessed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/tts_dataset/glow_nar_taco_encoder/silence_preprocessed/default_250/dev-other/sclite/wer') | 67.7 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/tts_dataset/glow_enc192/200epsTTS/silence_preprocessed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/tts_dataset/glow_enc192/200epsTTS/silence_preprocessed/default_250/dev-other/sclite/wer') | 61.8 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/tts_dataset/glow_enc192/200epsTTS/not_silence_preprocessed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/tts_dataset/glow_enc192/200epsTTS/not_silence_preprocessed/default_250/dev-other/sclite/wer') | 61.8 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/tts_dataset/glow_enc192/200epsTTS_early_eval_ep100/not_silence_preprocessed/', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/tts_dataset/glow_enc192/200epsTTS_early_eval_ep100/not_silence_preprocessed/default_250/dev-other/sclite/wer') | 60.6 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/tts_dataset/glow_enc192/200epsTTS_early_eval_ep100/silence_preprocessed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/tts_dataset/glow_enc192/200epsTTS_early_eval_ep100/silence_preprocessed/default_250/dev-other/sclite/wer') | 60.8 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/tts_dataset/glow_enc192/not_silence_preprocessed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/tts_dataset/glow_enc192/not_silence_preprocessed/default_250/dev-other/sclite/wer') | 61.1 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/tts_dataset/glow_enc192/100epTTS/silence_preprocessed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/tts_dataset/glow_enc192/100epTTS/silence_preprocessed/default_250/dev-other/sclite/wer') | 59.7 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/tts_dataset/glow_with_small_enc/silence_preprocessed/', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/tts_dataset/glow_with_small_enc/silence_preprocessed/default_250/dev-other/sclite/wer') | 79.4 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/tts_dataset/glow_simple_encoder/silence_preprocessed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/tts_dataset/glow_simple_encoder/silence_preprocessed/default_250/dev-other/sclite/wer') | 56.9 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_freezed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm4.0_ps0.5/default_250/dev-other/sclite/wer') | 35 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_freezed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm3.5_ps0.5/default_250/dev-other/sclite/wer') | 34.2 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_freezed/', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm3.5_ps0/default_250/dev-other/sclite/wer') | 35.6 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_freezed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm3.0_ps0.3/default_250/dev-other/sclite/wer') | 34.2 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_freezed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm2.5_ps0/default_250/dev-other/sclite/wer') | 35.2 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_freezed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm1.5_ps0.3/default_250/dev-other/sclite/wer') | 37.6 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_freezed/', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm4.0_ps0/default_250/dev-other/sclite/wer') | 36.7 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_freezed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm2.0_ps0.3/default_250/dev-other/sclite/wer') | 35.9 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_freezed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm1.5_ps0/default_250/dev-other/sclite/wer') | 37.9 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_freezed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm2.5_ps0.5/default_250/dev-other/sclite/wer') | 34.6 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_freezed/', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm3.5_ps0.3/default_250/dev-other/sclite/wer') | 34.5 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_freezed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm2.0_ps0/default_250/dev-other/sclite/wer') | 36.2 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_freezed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm4.0_ps0.3/default_250/dev-other/sclite/wer') | 35.4 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_freezed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm3.0_ps0/default_250/dev-other/sclite/wer') | 35.1 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_freezed/', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm3.0_ps0.5/default_250/dev-other/sclite/wer') | 34 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_freezed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm2.0_ps0.5/default_250/dev-other/sclite/wer') | 35.9 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_freezed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm1.5_ps0.5/default_250/dev-other/sclite/wer') | 37.9 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_freezed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm2.5_ps0.3/default_250/dev-other/sclite/wer') | 34.6 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_freezed/', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_freezed/default_250/dev-other/sclite/wer') | 40.7 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_pretrained/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_pretrained/default_250/dev-other/sclite/wer') | 36.3 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_pretrained/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm3.0_ps0.3/default_250/dev-other/sclite/wer') | 34.5 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_pretrained/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm4.0_ps0.5/default_250/dev-other/sclite/wer') | 33.8 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_pretrained/', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm3.5_ps0.5/default_250/dev-other/sclite/wer') | 33.9 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_pretrained/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm2.5_ps0.5/default_250/dev-other/sclite/wer') | 35.5 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_pretrained/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm3.0_ps0/default_250/dev-other/sclite/wer') | 34.7 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_pretrained/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm1.5_ps0.3/default_250/dev-other/sclite/wer') | 39.5 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_pretrained/', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm2.0_ps0/default_250/dev-other/sclite/wer') | 37.3 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_pretrained/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm2.0_ps0.3/default_250/dev-other/sclite/wer') | 37.2 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_pretrained/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm3.0_ps0.5/default_250/dev-other/sclite/wer') | 34.3 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_pretrained/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm3.5_ps0.3/default_250/dev-other/sclite/wer') | 34 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_pretrained/', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm4.0_ps0/default_250/dev-other/sclite/wer') | 34.7 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_pretrained/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm4.0_ps0.3/default_250/dev-other/sclite/wer') | 34.1 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_pretrained/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm1.5_ps0/default_250/dev-other/sclite/wer') | 39.5 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_pretrained/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm2.5_ps0.3/default_250/dev-other/sclite/wer') | 35.4 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_pretrained/', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm2.5_ps0/default_250/dev-other/sclite/wer') | 35.8 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_pretrained/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm2.0_ps0.5/default_250/dev-other/sclite/wer') | 37.2 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_pretrained/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm1.5_ps0.5/default_250/dev-other/sclite/wer') | 39.6 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_pretrained/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm3.5_ps0/default_250/dev-other/sclite/wer') | 34.4 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/default/', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/default/default_250/dev-other/sclite/wer') | 55.1 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_simple_encoder/silence_preprocessed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_simple_encoder/silence_preprocessed/default_250/dev-other/sclite/wer') | 50.3 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment_before/glow_enc192_200ep/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment_before/glow_enc192_200ep/default_250/dev-other/sclite/wer') | 92.4 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment_before/glow_enc192_200ep_not_freezed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment_before/glow_enc192_200ep_not_freezed/default_250/dev-other/sclite/wer') | 30.9 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/drop_around_blstm/spec_augment/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/drop_around_blstm/spec_augment/default_250/dev-other/sclite/wer') | 94.2 |\n", + 
"| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/drop_around_blstm/lm5/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/drop_around_blstm/lm5/default_250/dev-other/sclite/wer') | 66.3 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/with_sigma/silence_preprocessing/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/with_sigma/silence_preprocessing/default_250/dev-other/sclite/wer') | 76.2 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/with_sigma/not_silence_preprocessed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/with_sigma/not_silence_preprocessed/default_250/dev-other/sclite/wer') | 73.4 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/with_sigma/not_silence_preprocessed/glow_eval/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/with_sigma/not_silence_preprocessed/glow_eval/default_250/dev-other/sclite/wer') | 67.1 |\n", + "| ('librispeech_glow_asr', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/with_sigma/not_silence_preprocessed/glow_eval/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/with_sigma/not_silence_preprocessed/glow_eval/tuning/lm_4.0_ps_0/default_250/dev-other/sclite/wer') | 64 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/with_sigma/not_silence_preprocessed/glow_eval/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/with_sigma/not_silence_preprocessed/glow_eval/tuning/lm_2.5_ps_0/default_250/dev-other/sclite/wer') | 63.8 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/with_sigma/not_silence_preprocessed/glow_eval/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/with_sigma/not_silence_preprocessed/glow_eval/tuning/lm_2.5_ps_0.5/default_250/dev-other/sclite/wer') | 63.5 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/with_sigma/not_silence_preprocessed/glow_eval/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/with_sigma/not_silence_preprocessed/glow_eval/tuning/lm_3.5_ps_0.5/default_250/dev-other/sclite/wer') | 61.8 |\n", + "| 
('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/with_sigma/not_silence_preprocessed/glow_eval/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/with_sigma/not_silence_preprocessed/glow_eval/tuning/lm_3.0_ps_0.3/default_250/dev-other/sclite/wer') | 62.4 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/with_sigma/not_silence_preprocessed/glow_eval/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/with_sigma/not_silence_preprocessed/glow_eval/tuning/lm_4.0_ps_0.3/default_250/dev-other/sclite/wer') | 62.7 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/with_sigma/not_silence_preprocessed/glow_eval/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/with_sigma/not_silence_preprocessed/glow_eval/tuning/lm_2.5_ps_0.3/default_250/dev-other/sclite/wer') | 63.4 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/with_sigma/not_silence_preprocessed/glow_eval/', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/with_sigma/not_silence_preprocessed/glow_eval/tuning/lm_3.0_ps_0/default_250/dev-other/sclite/wer') | 63.2 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/with_sigma/not_silence_preprocessed/glow_eval/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/with_sigma/not_silence_preprocessed/glow_eval/tuning/lm_3.5_ps_0.3/default_250/dev-other/sclite/wer') | 62.3 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/with_sigma/not_silence_preprocessed/glow_eval/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/with_sigma/not_silence_preprocessed/glow_eval/tuning/lm_3.5_ps_0/default_250/dev-other/sclite/wer') | 63.3 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/with_sigma/not_silence_preprocessed/glow_eval/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/with_sigma/not_silence_preprocessed/glow_eval/tuning/lm_3.0_ps_0.5/default_250/dev-other/sclite/wer') | 62.4 |\n", + "| ('librispeech_glow_asr', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/with_sigma/not_silence_preprocessed/glow_eval/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/with_sigma/not_silence_preprocessed/glow_eval/tuning/lm_4.0_ps_0.5/default_250/dev-other/sclite/wer') | 62.1 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/silence_preprocessed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/silence_preprocessed/default_250/dev-other/sclite/wer') | 78.1 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/not_silence_preprocessed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/not_silence_preprocessed/default_250/dev-other/sclite/wer') | 79.1 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/not_silence_preprocessed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/not_silence_preprocessed/tuning/lm_4.0_ps_0/default_250/dev-other/sclite/wer') | 69.5 |\n", + "| ('librispeech_glow_asr', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/not_silence_preprocessed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/not_silence_preprocessed/tuning/lm_2.5_ps_0/default_250/dev-other/sclite/wer') | 55.4 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/not_silence_preprocessed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/not_silence_preprocessed/tuning/lm_3.5_ps_0/default_250/dev-other/sclite/wer') | 64.3 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/not_silence_preprocessed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/not_silence_preprocessed/tuning/lm_3.0_ps_0/default_250/dev-other/sclite/wer') | 59.3 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/glow_eval/not_silence_preprocessed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/glow_eval/not_silence_preprocessed/tuning/lm_4.0_ps_0.5/default_250/dev-other/sclite/wer') | 62.6 |\n", + "| ('librispeech_glow_asr', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/glow_eval/not_silence_preprocessed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/glow_eval/not_silence_preprocessed/tuning/lm_3.0_ps_0.5/default_250/dev-other/sclite/wer') | 62.8 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/glow_eval/not_silence_preprocessed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/glow_eval/not_silence_preprocessed/tuning/lm_3.5_ps_0.3/default_250/dev-other/sclite/wer') | 62.9 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/glow_eval/not_silence_preprocessed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/glow_eval/not_silence_preprocessed/tuning/lm_2.5_ps_0/default_250/dev-other/sclite/wer') | 64.6 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/glow_eval/not_silence_preprocessed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/glow_eval/not_silence_preprocessed/tuning/lm_4.0_ps_0/default_250/dev-other/sclite/wer') | 64.6 |\n", + "| ('librispeech_glow_asr', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/glow_eval/not_silence_preprocessed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/glow_eval/not_silence_preprocessed/tuning/lm_2.5_ps_0.3/default_250/dev-other/sclite/wer') | 63.9 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/glow_eval/not_silence_preprocessed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/glow_eval/not_silence_preprocessed/tuning/lm_4.0_ps_0.3/default_250/dev-other/sclite/wer') | 63.4 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/glow_eval/not_silence_preprocessed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/glow_eval/not_silence_preprocessed/tuning/lm_3.5_ps_0/default_250/dev-other/sclite/wer') | 64 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/glow_eval/not_silence_preprocessed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/glow_eval/not_silence_preprocessed/tuning/lm_3.0_ps_0.3/default_250/dev-other/sclite/wer') | 63.1 |\n", + "| ('librispeech_glow_asr', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/glow_eval/not_silence_preprocessed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/glow_eval/not_silence_preprocessed/tuning/lm_3.5_ps_0.5/default_250/dev-other/sclite/wer') | 62.4 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/glow_eval/not_silence_preprocessed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/glow_eval/not_silence_preprocessed/tuning/lm_2.5_ps_0.5/default_250/dev-other/sclite/wer') | 63.8 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/glow_eval/not_silence_preprocessed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/glow_eval/not_silence_preprocessed/tuning/lm_3.0_ps_0/default_250/dev-other/sclite/wer') | 63.8 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/glow_eval/not_silence_preprocessed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/glow_eval/not_silence_preprocessed/default_250/dev-other/sclite/wer') | 67.8 |\n", + "| ('librispeech_glow_asr', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_pretrained/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm1.5_ps0.3/default_250/dev-other/sclite/wer') | 26.1 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_pretrained/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm2.0_ps0.3/default_250/dev-other/sclite/wer') | 24.7 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_pretrained/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm3.0_ps0/default_250/dev-other/sclite/wer') | 25 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_pretrained/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm2.5_ps0.5/default_250/dev-other/sclite/wer') | 24.1 |\n", + "| ('librispeech_glow_asr', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_pretrained/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm2.0_ps0/default_250/dev-other/sclite/wer') | 25.1 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_pretrained/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm4.0_ps0.5/default_250/dev-other/sclite/wer') | 25.1 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_pretrained/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm3.5_ps0.5/default_250/dev-other/sclite/wer') | 24.3 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_pretrained/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm3.0_ps0.3/default_250/dev-other/sclite/wer') | 24 |\n", + "| ('librispeech_glow_asr', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_pretrained/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm2.5_ps0/default_250/dev-other/sclite/wer') | 24.7 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_pretrained/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm2.0_ps0.5/default_250/dev-other/sclite/wer') | 24.6 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_pretrained/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm1.5_ps0.5/default_250/dev-other/sclite/wer') | 26.1 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_pretrained/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm3.5_ps0/default_250/dev-other/sclite/wer') | 26 |\n", + "| ('librispeech_glow_asr', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_pretrained/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm2.5_ps0.3/default_250/dev-other/sclite/wer') | 24.2 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_pretrained/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm3.5_ps0.3/default_250/dev-other/sclite/wer') | 24.6 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_pretrained/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm4.0_ps0.3/default_250/dev-other/sclite/wer') | 25.7 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_pretrained/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm1.5_ps0/default_250/dev-other/sclite/wer') | 26.4 |\n", + "| ('librispeech_glow_asr', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_pretrained/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm4.0_ps0/default_250/dev-other/sclite/wer') | 27.5 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_pretrained/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_pretrained/lm_tuning/lm3.0_ps0.5/default_250/dev-other/sclite/wer') | 23.9 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_pretrained/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_pretrained/default_250/dev-other/sclite/wer') | 32.2 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc768_100ep_xvector/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc768_100ep_xvector/default_250/dev-other/sclite/wer') | 62.4 |\n", + "| ('librispeech_glow_asr', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_freezed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_freezed/default_250/dev-other/sclite/wer') | 35.3 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_freezed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm2.5_ps0.5/default_250/dev-other/sclite/wer') | 32.4 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_freezed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm2.0_ps0/default_250/dev-other/sclite/wer') | 33.9 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_freezed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm1.5_ps0.3/default_250/dev-other/sclite/wer') | 35.7 |\n", + "| ('librispeech_glow_asr', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_freezed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm2.0_ps0.3/default_250/dev-other/sclite/wer') | 33.6 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_freezed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm3.0_ps0/default_250/dev-other/sclite/wer') | 32.1 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_freezed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm3.0_ps0.3/default_250/dev-other/sclite/wer') | 31.7 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_freezed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm4.0_ps0.5/default_250/dev-other/sclite/wer') | 31.6 |\n", + "| ('librispeech_glow_asr', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_freezed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm3.5_ps0.5/default_250/dev-other/sclite/wer') | 31.4 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_freezed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm2.5_ps0.3/default_250/dev-other/sclite/wer') | 32.4 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_freezed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm3.5_ps0/default_250/dev-other/sclite/wer') | 32.1 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_freezed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm2.0_ps0.5/default_250/dev-other/sclite/wer') | 33.8 |\n", + "| ('librispeech_glow_asr', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_freezed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm2.5_ps0/default_250/dev-other/sclite/wer') | 32.6 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_freezed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm1.5_ps0.5/default_250/dev-other/sclite/wer') | 36 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_freezed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm1.5_ps0/default_250/dev-other/sclite/wer') | 35.6 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_freezed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm3.0_ps0.5/default_250/dev-other/sclite/wer') | 31.6 |\n", + "| ('librispeech_glow_asr', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_freezed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm4.0_ps0/default_250/dev-other/sclite/wer') | 32.6 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_freezed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm3.5_ps0.3/default_250/dev-other/sclite/wer') | 31.5 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_freezed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_freezed/lm_tuning/lm4.0_ps0.3/default_250/dev-other/sclite/wer') | 31.9 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_not_eval/not_silence_preprocessed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_not_eval/not_silence_preprocessed/tuned/lm_2.5_ps_0/default_250/dev-other/sclite/wer') | 44.6 |\n", + "| ('librispeech_glow_asr', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_not_eval/not_silence_preprocessed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_not_eval/not_silence_preprocessed/tuned/lm_4.0_ps_0/default_250/dev-other/sclite/wer') | 52.2 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_not_eval/not_silence_preprocessed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_not_eval/not_silence_preprocessed/tuned/lm_2.5_ps_0.3/default_250/dev-other/sclite/wer') | 42.2 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_not_eval/not_silence_preprocessed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_not_eval/not_silence_preprocessed/tuned/lm_3.5_ps_0.3/default_250/dev-other/sclite/wer') | 45.3 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_not_eval/not_silence_preprocessed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_not_eval/not_silence_preprocessed/tuned/lm_3.0_ps_0.5/default_250/dev-other/sclite/wer') | 41.8 |\n", + "| ('librispeech_glow_asr', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_not_eval/not_silence_preprocessed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_not_eval/not_silence_preprocessed/tuned/lm_4.0_ps_0.5/default_250/dev-other/sclite/wer') | 46.1 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_not_eval/not_silence_preprocessed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_not_eval/not_silence_preprocessed/tuned/lm_2.5_ps_0.5/default_250/dev-other/sclite/wer') | 41.3 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_not_eval/not_silence_preprocessed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_not_eval/not_silence_preprocessed/tuned/lm_3.0_ps_0/default_250/dev-other/sclite/wer') | 46.3 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_not_eval/not_silence_preprocessed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_not_eval/not_silence_preprocessed/tuned/lm_3.5_ps_0.5/default_250/dev-other/sclite/wer') | 43.5 |\n", + "| ('librispeech_glow_asr', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_not_eval/not_silence_preprocessed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_not_eval/not_silence_preprocessed/tuned/lm_3.5_ps_0/default_250/dev-other/sclite/wer') | 48.9 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_not_eval/not_silence_preprocessed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_not_eval/not_silence_preprocessed/tuned/lm_3.0_ps_0.3/default_250/dev-other/sclite/wer') | 43.3 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_not_eval/not_silence_preprocessed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_not_eval/not_silence_preprocessed/tuned/lm_4.0_ps_0.3/default_250/dev-other/sclite/wer') | 48.1 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_not_eval/not_silence_preprocessed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_not_eval/not_silence_preprocessed/default_250/dev-other/sclite/wer') | 60.7 |\n", + "| ('librispeech_glow_asr', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_eval/not_silence_preprocessed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_eval/not_silence_preprocessed/tuned/lm_2.5_ps_0/default_250/dev-other/sclite/wer') | 59.3 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_eval/not_silence_preprocessed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_eval/not_silence_preprocessed/tuned/lm_4.0_ps_0/default_250/dev-other/sclite/wer') | 58.3 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_eval/not_silence_preprocessed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_eval/not_silence_preprocessed/tuned/lm_2.5_ps_0.5/default_250/dev-other/sclite/wer') | 58.5 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_eval/not_silence_preprocessed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_eval/not_silence_preprocessed/tuned/lm_4.0_ps_0.3/default_250/dev-other/sclite/wer') | 57.4 |\n", + "| ('librispeech_glow_asr', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_eval/not_silence_preprocessed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_eval/not_silence_preprocessed/tuned/lm_3.5_ps_0.5/default_250/dev-other/sclite/wer') | 56.6 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_eval/not_silence_preprocessed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_eval/not_silence_preprocessed/tuned/lm_3.0_ps_0.3/default_250/dev-other/sclite/wer') | 57.6 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_eval/not_silence_preprocessed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_eval/not_silence_preprocessed/tuned/lm_3.0_ps_0/default_250/dev-other/sclite/wer') | 58.2 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_eval/not_silence_preprocessed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_eval/not_silence_preprocessed/tuned/lm_2.5_ps_0.3/default_250/dev-other/sclite/wer') | 58.7 |\n", + "| ('librispeech_glow_asr', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_eval/not_silence_preprocessed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_eval/not_silence_preprocessed/tuned/lm_4.0_ps_0.5/default_250/dev-other/sclite/wer') | 56.8 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_eval/not_silence_preprocessed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_eval/not_silence_preprocessed/tuned/lm_3.5_ps_0.3/default_250/dev-other/sclite/wer') | 57.1 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_eval/not_silence_preprocessed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_eval/not_silence_preprocessed/tuned/lm_3.0_ps_0.5/default_250/dev-other/sclite/wer') | 57.2 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_eval/not_silence_preprocessed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_eval/not_silence_preprocessed/tuned/lm_3.5_ps_0/default_250/dev-other/sclite/wer') | 58 |\n", + "| ('librispeech_glow_asr', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_eval/not_silence_preprocessed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_eval/not_silence_preprocessed/default_250/dev-other/sclite/wer') | 60.5 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_eval_spec_aug/not_silence_preprocessed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_eval_spec_aug/not_silence_preprocessed/default_250/dev-other/sclite/wer') | 67.7 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment_before/glow_not_pretrained/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment_before/glow_not_pretrained/default_250/dev-other/sclite/wer') | 30.5 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment_before/glow_not_pretrained', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment_before/glow_not_pretrainedlm_tuning/lm3.0_ps0/default_250/dev-other/sclite/wer') | 24.3 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment_before/glow_not_pretrained', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment_before/glow_not_pretrainedlm_tuning/lm3.0_ps0.3/default_250/dev-other/sclite/wer') | 23.4 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment_before/glow_not_pretrained', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment_before/glow_not_pretrainedlm_tuning/lm2.0_ps0/default_250/dev-other/sclite/wer') | 24.7 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment_before/glow_not_pretrained', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment_before/glow_not_pretrainedlm_tuning/lm3.5_ps0.5/default_250/dev-other/sclite/wer') | 23.5 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment_before/glow_not_pretrained', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment_before/glow_not_pretrainedlm_tuning/lm4.0_ps0.5/default_250/dev-other/sclite/wer') | 24.4 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment_before/glow_not_pretrained', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment_before/glow_not_pretrainedlm_tuning/lm2.5_ps0.5/default_250/dev-other/sclite/wer') | 23.5 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment_before/glow_not_pretrained', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment_before/glow_not_pretrainedlm_tuning/lm2.0_ps0.3/default_250/dev-other/sclite/wer') | 24.6 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment_before/glow_not_pretrained', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment_before/glow_not_pretrainedlm_tuning/lm1.5_ps0.3/default_250/dev-other/sclite/wer') | 26.3 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment_before/glow_not_pretrained', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment_before/glow_not_pretrainedlm_tuning/lm2.5_ps0/default_250/dev-other/sclite/wer') | 24.1 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment_before/glow_not_pretrained', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment_before/glow_not_pretrainedlm_tuning/lm3.0_ps0.5/default_250/dev-other/sclite/wer') | 23.3 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment_before/glow_not_pretrained', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment_before/glow_not_pretrainedlm_tuning/lm3.5_ps0/default_250/dev-other/sclite/wer') | 25 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment_before/glow_not_pretrained', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment_before/glow_not_pretrainedlm_tuning/lm4.0_ps0.3/default_250/dev-other/sclite/wer') | 24.9 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment_before/glow_not_pretrained', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment_before/glow_not_pretrainedlm_tuning/lm3.5_ps0.3/default_250/dev-other/sclite/wer') | 23.9 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment_before/glow_not_pretrained', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment_before/glow_not_pretrainedlm_tuning/lm2.5_ps0.3/default_250/dev-other/sclite/wer') | 23.6 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment_before/glow_not_pretrained', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment_before/glow_not_pretrainedlm_tuning/lm1.5_ps0/default_250/dev-other/sclite/wer') | 26.2 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment_before/glow_not_pretrained', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment_before/glow_not_pretrainedlm_tuning/lm1.5_ps0.5/default_250/dev-other/sclite/wer') | 26.4 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment_before/glow_not_pretrained', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment_before/glow_not_pretrainedlm_tuning/lm4.0_ps0/default_250/dev-other/sclite/wer') | 26.3 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment_before/glow_not_pretrained', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment_before/glow_not_pretrainedlm_tuning/lm2.0_ps0.5/default_250/dev-other/sclite/wer') | 24.6 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/default_250/dev-other/sclite/wer') | 40.9 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/lm_tuning/lm1.5_ps0.5/default_250/dev-other/sclite/wer') | 42 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/lm_tuning/lm2.0_ps0.5/default_250/dev-other/sclite/wer') | 40 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/lm_tuning/lm2.5_ps0/default_250/dev-other/sclite/wer') | 
38.5 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/lm_tuning/lm2.5_ps0.3/default_250/dev-other/sclite/wer') | 38.4 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/lm_tuning/lm3.5_ps0/default_250/dev-other/sclite/wer') | 38.1 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/lm_tuning/lm4.0_ps0.3/default_250/dev-other/sclite/wer') | 37.8 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/lm_tuning/lm3.5_ps0.3/default_250/dev-other/sclite/wer') | 37.4 |\n", + "| ('librispeech_glow_asr', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/lm_tuning/lm4.0_ps0/default_250/dev-other/sclite/wer') | 38.7 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/lm_tuning/lm3.0_ps0.5/default_250/dev-other/sclite/wer') | 37.7 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/lm_tuning/lm1.5_ps0/default_250/dev-other/sclite/wer') | 41.3 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/lm_tuning/lm2.0_ps0.3/default_250/dev-other/sclite/wer') | 39.8 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/lm_tuning/lm1.5_ps0.3/default_250/dev-other/sclite/wer') | 41.7 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/lm_tuning/lm3.0_ps0/default_250/dev-other/sclite/wer') | 38 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/lm_tuning/lm2.5_ps0.5/default_250/dev-other/sclite/wer') | 38.7 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/lm_tuning/lm2.0_ps0/default_250/dev-other/sclite/wer') | 39.6 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/lm_tuning/lm3.5_ps0.5/default_250/dev-other/sclite/wer') | 37.4 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/lm_tuning/lm4.0_ps0.5/default_250/dev-other/sclite/wer') | 37.4 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/lm_tuning/lm3.0_ps0.3/default_250/dev-other/sclite/wer') | 37.7 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/default_250/dev-other/sclite/wer') | 54.6 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/search_params/lm_3.0_ps_0.3/default_250/dev-other/sclite/wer') | 35.5 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/search_params/lm_1.5_ps_0/default_250/dev-other/sclite/wer') | 36.2 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/search_params/lm_3.5_ps_0.5/default_250/dev-other/sclite/wer') | 36.6 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/search_params/lm_4.0_ps_0.3/default_250/dev-other/sclite/wer') | 41.1 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/search_params/lm_1.5_ps_0.3/default_250/dev-other/sclite/wer') | 35.5 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/search_params/lm_3.5_ps_0/default_250/dev-other/sclite/wer') | 41.4 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/search_params/lm_3.0_ps_0/default_250/dev-other/sclite/wer') | 38.4 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/search_params/lm_2.5_ps_0.5/default_250/dev-other/sclite/wer') | 33.8 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/search_params/lm_2.0_ps_0.3/default_250/dev-other/sclite/wer') | 34 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/search_params/lm_3.0_ps_0.5/default_250/dev-other/sclite/wer') | 34.6 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/search_params/lm_3.5_ps_0.3/default_250/dev-other/sclite/wer') | 38 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/search_params/lm_2.0_ps_0/default_250/dev-other/sclite/wer') | 35.5 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/search_params/lm_4.0_ps_0.5/default_250/dev-other/sclite/wer') | 39.5 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/search_params/lm_1.5_ps_0.5/default_250/dev-other/sclite/wer') | 35.7 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/search_params/lm_2.5_ps_0.3/default_250/dev-other/sclite/wer') | 34.2 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/search_params/lm_2.0_ps_0.5/default_250/dev-other/sclite/wer') | 34 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/search_params/lm_2.5_ps_0/default_250/dev-other/sclite/wer') | 36.2 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/search_params/lm_4.0_ps_0/default_250/dev-other/sclite/wer') | 45.4 |\n", + "| ('librispeech_glow_asr', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_4x512_d0.2_b300_fs4/asr_dataset/glow_enc768/not_silence_preprocessed/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_4x512_d0.2_b300_fs4/asr_dataset/glow_enc768/not_silence_preprocessed/default_250/dev-other/sclite/wer') | 52.4 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_ctc_scale_0.1/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_ctc_scale_0.1/search/dev-other/sclite/wer') | 74.9 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_ctc_scale_0.1/', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_ctc_scale_0.1/tuning/lm_4.0/search/dev-other/sclite/wer') | 66.3 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_ctc_scale_0.1/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_ctc_scale_0.1/tuning/lm_3.5/search/dev-other/sclite/wer') | 61.9 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_ctc_scale_0.1/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_ctc_scale_0.1/tuning/lm_2.5/search/dev-other/sclite/wer') | 54.4 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_ctc_scale_0.1/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_ctc_scale_0.1/tuning/lm_3.0/search/dev-other/sclite/wer') | 57.9 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_ctc_scale_0.1/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_ctc_scale_0.1/tuning/lm_2.0/search/dev-other/sclite/wer') | 52.4 |\n", + "| 
('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_ctc_scale_0.1/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_ctc_scale_0.1/tuning/lm_4.5/search/dev-other/sclite/wer') | 70.6 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before/tuning/lm_2.0/search/dev-other/sclite/wer') | 24.7 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before/tuning/lm_2.5_ps_0.5/search/dev-other/sclite/wer') | 23.7 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before/tuning/lm_3.0/search/dev-other/sclite/wer') | 24.2 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before/tuning/lm_2.0_ps_0.3/search/dev-other/sclite/wer') | 24.5 |\n", + 
"| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before/tuning/lm_4.5/search/dev-other/sclite/wer') | 28.2 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before/tuning/lm_4.0_ps_0.3/search/dev-other/sclite/wer') | 24.8 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before/tuning/lm_4.5_ps_0.5/search/dev-other/sclite/wer') | 25.7 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before/tuning/lm_3.0_ps_0.3/search/dev-other/sclite/wer') | 23.5 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before/tuning/lm_3.5_ps_0.5/search/dev-other/sclite/wer') | 23.6 |\n", + "| ('joint_training/default', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before/tuning/lm_4.0/search/dev-other/sclite/wer') | 26.4 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before/tuning/lm_2.5_ps_0.3/search/dev-other/sclite/wer') | 23.7 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before/tuning/lm_2.0_ps_0.5/search/dev-other/sclite/wer') | 24.6 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before/tuning/lm_4.0_ps_0.5/search/dev-other/sclite/wer') | 24.4 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before/tuning/lm_2.5/search/dev-other/sclite/wer') | 24.1 |\n", + "| ('joint_training/default', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before/tuning/lm_4.5_ps_0.3/search/dev-other/sclite/wer') | 26.5 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before/tuning/lm_3.0_ps_0.5/search/dev-other/sclite/wer') | 23.3 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before/tuning/lm_3.5_ps_0.3/search/dev-other/sclite/wer') | 23.9 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before/tuning/lm_3.5/search/dev-other/sclite/wer') | 25.1 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before/search/dev-other/sclite/wer') | 30.7 |\n", + "| ('joint_training/default', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_tts_train_segments/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_tts_train_segments/search/dev-other/sclite/wer') | 30.7 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_tts_train_segments/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_tts_train_segments/tuning/lm_4.5/search/dev-other/sclite/wer') | 28.4 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_tts_train_segments/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_tts_train_segments/tuning/lm_3.0/search/dev-other/sclite/wer') | 24.6 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_tts_train_segments/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_tts_train_segments/tuning/lm_2.0/search/dev-other/sclite/wer') | 25.3 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_tts_train_segments/', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_tts_train_segments/tuning/lm_3.5/search/dev-other/sclite/wer') | 25.3 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_tts_train_segments/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_tts_train_segments/tuning/lm_2.5/search/dev-other/sclite/wer') | 24.6 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_tts_train_segments/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_tts_train_segments/tuning/lm_4.0/search/dev-other/sclite/wer') | 26.6 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_tts_train_segments/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_tts_train_segments/tuned/lm_2.5/search/dev-other/sclite/wer') | 24.6 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass/search/dev-other/sclite/wer') | 56.7 |\n", + "| ('joint_training/default', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass/tuning/lm_4.5/search/dev-other/sclite/wer') | 52.5 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass/tuning/lm_2.0/search/dev-other/sclite/wer') | 39.7 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass/tuning/lm_3.0/search/dev-other/sclite/wer') | 42.3 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass/tuning/lm_2.5/search/dev-other/sclite/wer') | 40.4 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass/tuning/lm_3.5/search/dev-other/sclite/wer') | 45 
|\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass/tuning/lm_4.0/search/dev-other/sclite/wer') | 48.4 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2_ctc_scale_0.1/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2_ctc_scale_0.1/tuning/lm_4.0_ps_0.3/search/dev-other/sclite/wer') | 58.2 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2_ctc_scale_0.1/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2_ctc_scale_0.1/tuning/lm_3.5/search/dev-other/sclite/wer') | 59.2 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2_ctc_scale_0.1/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2_ctc_scale_0.1/tuning/lm_4.5_ps_0.5/search/dev-other/sclite/wer') | 59.8 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2_ctc_scale_0.1/', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2_ctc_scale_0.1/tuning/lm_3.0_ps_0.3/search/dev-other/sclite/wer') | 50.8 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2_ctc_scale_0.1/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2_ctc_scale_0.1/tuning/lm_3.5_ps_0.5/search/dev-other/sclite/wer') | 52.3 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2_ctc_scale_0.1/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2_ctc_scale_0.1/tuning/lm_2.5/search/dev-other/sclite/wer') | 52.2 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2_ctc_scale_0.1/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2_ctc_scale_0.1/tuning/lm_2.5_ps_0.5/search/dev-other/sclite/wer') | 48.1 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2_ctc_scale_0.1/', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2_ctc_scale_0.1/tuning/lm_2.0_ps_0.3/search/dev-other/sclite/wer') | 48.5 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2_ctc_scale_0.1/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2_ctc_scale_0.1/tuning/lm_4.0/search/dev-other/sclite/wer') | 63.3 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2_ctc_scale_0.1/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2_ctc_scale_0.1/tuning/lm_4.0_ps_0.5/search/dev-other/sclite/wer') | 55.9 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2_ctc_scale_0.1/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2_ctc_scale_0.1/tuning/lm_4.5_ps_0.3/search/dev-other/sclite/wer') | 62.4 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2_ctc_scale_0.1/', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2_ctc_scale_0.1/tuning/lm_3.0_ps_0.5/search/dev-other/sclite/wer') | 49.6 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2_ctc_scale_0.1/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2_ctc_scale_0.1/tuning/lm_4.5/search/dev-other/sclite/wer') | 67.8 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2_ctc_scale_0.1/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2_ctc_scale_0.1/tuning/lm_3.5_ps_0.3/search/dev-other/sclite/wer') | 54.3 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2_ctc_scale_0.1/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2_ctc_scale_0.1/tuning/lm_3.0/search/dev-other/sclite/wer') | 55.3 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2_ctc_scale_0.1/', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2_ctc_scale_0.1/tuning/lm_2.5_ps_0.3/search/dev-other/sclite/wer') | 48.8 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2_ctc_scale_0.1/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2_ctc_scale_0.1/tuning/lm_2.0_ps_0.5/search/dev-other/sclite/wer') | 48.2 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2_ctc_scale_0.1/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2_ctc_scale_0.1/tuning/lm_2.0/search/dev-other/sclite/wer') | 50.7 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2_ctc_scale_0.1/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2_ctc_scale_0.1/search/dev-other/sclite/wer') | 72 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_blstm_x_vector/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_blstm_x_vector/tuning/lm_4.5/search/dev-other/sclite/wer') | 48.1 |\n", + "| ('joint_training/default', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_blstm_x_vector/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_blstm_x_vector/tuning/lm_3.0/search/dev-other/sclite/wer') | 48.4 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_blstm_x_vector/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_blstm_x_vector/tuning/lm_2.0/search/dev-other/sclite/wer') | 50.9 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_blstm_x_vector/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_blstm_x_vector/tuning/lm_3.5/search/dev-other/sclite/wer') | 47.8 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_blstm_x_vector/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_blstm_x_vector/tuning/lm_2.5/search/dev-other/sclite/wer') | 49.4 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_blstm_x_vector/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_blstm_x_vector/tuning/lm_4.0/search/dev-other/sclite/wer') | 47.7 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_blstm_x_vector/', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_blstm_x_vector/search/dev-other/sclite/wer') | 48.8 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_radam/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_radam/search/dev-other/sclite/wer') | 43.8 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_spec_augment_ctc_scale_0.1/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_spec_augment_ctc_scale_0.1/search/dev-other/sclite/wer') | 40.1 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_spec_augment_ctc_scale_0.1/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_spec_augment_ctc_scale_0.1/tuning/lm_4.0/search/dev-other/sclite/wer') | 33.8 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_spec_augment_ctc_scale_0.1/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_spec_augment_ctc_scale_0.1/tuning/lm_2.5/search/dev-other/sclite/wer') | 29.4 |\n", + "| ('joint_training/default', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_spec_augment_ctc_scale_0.1/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_spec_augment_ctc_scale_0.1/tuning/lm_3.5/search/dev-other/sclite/wer') | 31.6 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_spec_augment_ctc_scale_0.1/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_spec_augment_ctc_scale_0.1/tuning/lm_2.0/search/dev-other/sclite/wer') | 30 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_spec_augment_ctc_scale_0.1/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_spec_augment_ctc_scale_0.1/tuning/lm_3.0/search/dev-other/sclite/wer') | 30 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_spec_augment_ctc_scale_0.1/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_spec_augment_ctc_scale_0.1/tuning/lm_4.5/search/dev-other/sclite/wer') | 36.6 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_control_spec_augment/', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_control_spec_augment/search/dev-other/sclite/wer') | 37.9 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment_ctc_scale_0.1/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment_ctc_scale_0.1/search/dev-other/sclite/wer') | 42.3 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment_ctc_scale_0.1/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment_ctc_scale_0.1/tuning/lm_2.5/search/dev-other/sclite/wer') | 31.9 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment_ctc_scale_0.1/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment_ctc_scale_0.1/tuning/lm_2.0_ps_0.5/search/dev-other/sclite/wer') | 31.8 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment_ctc_scale_0.1/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment_ctc_scale_0.1/tuning/lm_3.5/search/dev-other/sclite/wer') | 33.8 |\n", + "| ('joint_training/default', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment_ctc_scale_0.1/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment_ctc_scale_0.1/tuning/lm_2.5_ps_0.3/search/dev-other/sclite/wer') | 30.8 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment_ctc_scale_0.1/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment_ctc_scale_0.1/tuning/lm_4.5_ps_0.3/search/dev-other/sclite/wer') | 35.8 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment_ctc_scale_0.1/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment_ctc_scale_0.1/tuning/lm_4.0/search/dev-other/sclite/wer') | 36.3 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment_ctc_scale_0.1/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment_ctc_scale_0.1/tuning/lm_4.0_ps_0.5/search/dev-other/sclite/wer') | 32.6 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment_ctc_scale_0.1/', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment_ctc_scale_0.1/tuning/lm_3.5_ps_0.3/search/dev-other/sclite/wer') | 31.9 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment_ctc_scale_0.1/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment_ctc_scale_0.1/tuning/lm_3.0_ps_0.5/search/dev-other/sclite/wer') | 30.5 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment_ctc_scale_0.1/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment_ctc_scale_0.1/tuning/lm_4.5/search/dev-other/sclite/wer') | 38.9 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment_ctc_scale_0.1/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment_ctc_scale_0.1/tuning/lm_2.0_ps_0.3/search/dev-other/sclite/wer') | 31.9 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment_ctc_scale_0.1/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment_ctc_scale_0.1/tuning/lm_2.5_ps_0.5/search/dev-other/sclite/wer') | 30.6 |\n", + "| 
('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment_ctc_scale_0.1/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment_ctc_scale_0.1/tuning/lm_4.5_ps_0.5/search/dev-other/sclite/wer') | 34.5 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment_ctc_scale_0.1/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment_ctc_scale_0.1/tuning/lm_4.0_ps_0.3/search/dev-other/sclite/wer') | 33.6 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment_ctc_scale_0.1/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment_ctc_scale_0.1/tuning/lm_2.0/search/dev-other/sclite/wer') | 32.4 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment_ctc_scale_0.1/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment_ctc_scale_0.1/tuning/lm_3.5_ps_0.5/search/dev-other/sclite/wer') | 31.1 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment_ctc_scale_0.1/', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment_ctc_scale_0.1/tuning/lm_3.0/search/dev-other/sclite/wer') | 32.4 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment_ctc_scale_0.1/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment_ctc_scale_0.1/tuning/lm_3.0_ps_0.3/search/dev-other/sclite/wer') | 31 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass/tuning/lm_3.5_ps_0.5/search/dev-other/sclite/wer') | 23.4 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass/tuning/lm_3.0_ps_0.3/search/dev-other/sclite/wer') | 23.2 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass/tuning/lm_4.5_ps_0.5/search/dev-other/sclite/wer') | 25.7 |\n", + "| ('joint_training/default', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass/tuning/lm_3.5_ps_0/search/dev-other/sclite/wer') | 24.9 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass/tuning/lm_4.0_ps_0.3/search/dev-other/sclite/wer') | 24.8 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass/tuning/lm_2.0_ps_0.3/search/dev-other/sclite/wer') | 24.3 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass/tuning/lm_3.0_ps_0/search/dev-other/sclite/wer') | 24.1 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass/tuning/lm_2.5_ps_0.5/search/dev-other/sclite/wer') | 23.2 |\n", + "| ('joint_training/default', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass/tuning/lm_3.5_ps_0.3/search/dev-other/sclite/wer') | 23.6 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass/tuning/lm_4.5_ps_0/search/dev-other/sclite/wer') | 28.6 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass/tuning/lm_2.0_ps_0/search/dev-other/sclite/wer') | 24.4 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass/tuning/lm_3.0_ps_0.5/search/dev-other/sclite/wer') | 23 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass/tuning/lm_4.5_ps_0.3/search/dev-other/sclite/wer') | 26.5 |\n", + "| ('joint_training/default', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass/tuning/lm_4.0_ps_0.5/search/dev-other/sclite/wer') | 24.4 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass/tuning/lm_2.0_ps_0.5/search/dev-other/sclite/wer') | 24.4 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass/tuning/lm_2.5_ps_0.3/search/dev-other/sclite/wer') | 23.3 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass/tuning/lm_2.5_ps_0/search/dev-other/sclite/wer') | 23.8 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass/tuning/lm_4.0_ps_0/search/dev-other/sclite/wer') | 26.5 |\n", + "| ('joint_training/default', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass/search/dev-other/sclite/wer') | 31.2 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_control/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_control/search/dev-other/sclite/wer') | 38.5 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2/tuning/lm_4.5/search/dev-other/sclite/wer') | 63.8 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2/tuning/lm_3.0_ps_0.3/search/dev-other/sclite/wer') | 47.7 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2/tuning/lm_3.5_ps_0.5/search/dev-other/sclite/wer') | 48.9 |\n", + "| 
('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2/tuning/lm_4.0_ps_0.3/search/dev-other/sclite/wer') | 54.2 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2/tuning/lm_4.5_ps_0.5/search/dev-other/sclite/wer') | 55.9 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2/tuning/lm_2.5_ps_0.5/search/dev-other/sclite/wer') | 45 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2/tuning/lm_2.0/search/dev-other/sclite/wer') | 47.7 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2/', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2/tuning/lm_2.0_ps_0.3/search/dev-other/sclite/wer') | 45.6 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2/tuning/lm_3.0/search/dev-other/sclite/wer') | 51.7 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2/tuning/lm_3.0_ps_0.5/search/dev-other/sclite/wer') | 46.3 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2/tuning/lm_2.5/search/dev-other/sclite/wer') | 49 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2/tuning/lm_3.5_ps_0.3/search/dev-other/sclite/wer') | 50.6 |\n", + "| ('joint_training/default', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2/tuning/lm_4.0_ps_0.5/search/dev-other/sclite/wer') | 52 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2/tuning/lm_4.5_ps_0.3/search/dev-other/sclite/wer') | 58.3 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2/tuning/lm_3.5/search/dev-other/sclite/wer') | 55.2 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2/tuning/lm_4.0/search/dev-other/sclite/wer') | 59.5 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2/', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2/tuning/lm_2.5_ps_0.3/search/dev-other/sclite/wer') | 45.9 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2/tuning/lm_2.0_ps_0.5/search/dev-other/sclite/wer') | 45.2 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2/search/dev-other/sclite/wer') | 68.2 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer_spec_augment/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer_spec_augment/tuning/lm_4.0_ps_0.5/search/dev-other/sclite/wer') | 27 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer_spec_augment/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer_spec_augment/tuning/lm_4.5_ps_0.3/search/dev-other/sclite/wer') | 30.5 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer_spec_augment/', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer_spec_augment/tuning/lm_4.5/search/dev-other/sclite/wer') | 33.7 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer_spec_augment/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer_spec_augment/tuning/lm_3.0_ps_0.5/search/dev-other/sclite/wer') | 24.3 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer_spec_augment/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer_spec_augment/tuning/lm_3.5_ps_0.3/search/dev-other/sclite/wer') | 25.9 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer_spec_augment/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer_spec_augment/tuning/lm_3.0/search/dev-other/sclite/wer') | 26.4 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer_spec_augment/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer_spec_augment/tuning/lm_2.5_ps_0.3/search/dev-other/sclite/wer') | 24.4 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer_spec_augment/', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer_spec_augment/tuning/lm_2.0/search/dev-other/sclite/wer') | 25.7 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer_spec_augment/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer_spec_augment/tuning/lm_2.0_ps_0.5/search/dev-other/sclite/wer') | 25.1 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer_spec_augment/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer_spec_augment/tuning/lm_4.0_ps_0.3/search/dev-other/sclite/wer') | 27.9 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer_spec_augment/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer_spec_augment/tuning/lm_4.5_ps_0.5/search/dev-other/sclite/wer') | 29.3 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer_spec_augment/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer_spec_augment/tuning/lm_3.5/search/dev-other/sclite/wer') | 28 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer_spec_augment/', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer_spec_augment/tuning/lm_3.0_ps_0.3/search/dev-other/sclite/wer') | 24.8 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer_spec_augment/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer_spec_augment/tuning/lm_2.5/search/dev-other/sclite/wer') | 25.6 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer_spec_augment/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer_spec_augment/tuning/lm_3.5_ps_0.5/search/dev-other/sclite/wer') | 25.3 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer_spec_augment/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer_spec_augment/tuning/lm_4.0/search/dev-other/sclite/wer') | 30.4 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer_spec_augment/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer_spec_augment/tuning/lm_2.5_ps_0.5/search/dev-other/sclite/wer') | 24.2 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer_spec_augment/', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer_spec_augment/tuning/lm_2.0_ps_0.3/search/dev-other/sclite/wer') | 25 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer_spec_augment/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer_spec_augment/search/dev-other/sclite/wer') | 37.3 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector_v2/search/dev-other/sclite/wer') | 30.5 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector_v2/tuning/lm_4.5_ps_0.5/search/dev-other/sclite/wer') | 26 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector_v2/tuning/lm_4.0_ps_0.3/search/dev-other/sclite/wer') | 25.2 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector_v2/', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector_v2/tuning/lm_3.5_ps_0.5/search/dev-other/sclite/wer') | 23.9 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector_v2/tuning/lm_4.5/search/dev-other/sclite/wer') | 28.3 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector_v2/tuning/lm_3.0_ps_0.3/search/dev-other/sclite/wer') | 23.8 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector_v2/tuning/lm_3.0/search/dev-other/sclite/wer') | 24.5 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector_v2/tuning/lm_2.0_ps_0.3/search/dev-other/sclite/wer') | 24.6 |\n", + "| ('joint_training/default', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector_v2/tuning/lm_2.0/search/dev-other/sclite/wer') | 25.1 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector_v2/tuning/lm_2.5_ps_0.5/search/dev-other/sclite/wer') | 23.9 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector_v2/tuning/lm_4.5_ps_0.3/search/dev-other/sclite/wer') | 26.6 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector_v2/tuning/lm_3.5/search/dev-other/sclite/wer') | 25.4 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector_v2/', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector_v2/tuning/lm_4.0_ps_0.5/search/dev-other/sclite/wer') | 24.7 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector_v2/tuning/lm_3.5_ps_0.3/search/dev-other/sclite/wer') | 24.2 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector_v2/tuning/lm_2.5/search/dev-other/sclite/wer') | 24.5 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector_v2/tuning/lm_3.0_ps_0.5/search/dev-other/sclite/wer') | 23.6 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector_v2/tuning/lm_2.0_ps_0.5/search/dev-other/sclite/wer') | 24.7 |\n", + "| ('joint_training/default', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector_v2/tuning/lm_2.5_ps_0.3/search/dev-other/sclite/wer') | 23.9 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector_v2/tuning/lm_4.0/search/dev-other/sclite/wer') | 26.5 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_ctc_scale_0.1/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_ctc_scale_0.1/search/dev-other/sclite/wer') | 40.8 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_ctc_scale_0.1/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_ctc_scale_0.1/tuning/lm_4.5/search/dev-other/sclite/wer') | 38.4 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_ctc_scale_0.1/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_ctc_scale_0.1/tuning/lm_3.0/search/dev-other/sclite/wer') | 34.7 |\n", + "| 
('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_ctc_scale_0.1/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_ctc_scale_0.1/tuning/lm_2.0/search/dev-other/sclite/wer') | 35.7 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_ctc_scale_0.1/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_ctc_scale_0.1/tuning/lm_3.5/search/dev-other/sclite/wer') | 35.2 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_ctc_scale_0.1/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_ctc_scale_0.1/tuning/lm_2.5/search/dev-other/sclite/wer') | 34.7 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_ctc_scale_0.1/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_ctc_scale_0.1/tuning/lm_4.0/search/dev-other/sclite/wer') | 36.4 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2/tuning/lm_4.0/search/dev-other/sclite/wer') | 45.1 |\n", + "| 
('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2/tuning/lm_2.0_ps_0.3/search/dev-other/sclite/wer') | 45.4 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2/tuning/lm_2.5_ps_0.5/search/dev-other/sclite/wer') | 43.9 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2/tuning/lm_4.5_ps_0.5/search/dev-other/sclite/wer') | 44.1 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2/tuning/lm_4.0_ps_0.3/search/dev-other/sclite/wer') | 43.8 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2/tuning/lm_3.5/search/dev-other/sclite/wer') | 44.4 |\n", + "| ('joint_training/default', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2/tuning/lm_2.5/search/dev-other/sclite/wer') | 44.6 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2/tuning/lm_3.5_ps_0.5/search/dev-other/sclite/wer') | 43 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2/tuning/lm_3.0_ps_0.3/search/dev-other/sclite/wer') | 43.3 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2/tuning/lm_3.0/search/dev-other/sclite/wer') | 44.1 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2/tuning/lm_2.0_ps_0.5/search/dev-other/sclite/wer') | 45.6 |\n", + "| ('joint_training/default', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2/tuning/lm_2.0/search/dev-other/sclite/wer') | 45.7 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2/tuning/lm_2.5_ps_0.3/search/dev-other/sclite/wer') | 44 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2/tuning/lm_4.5_ps_0.3/search/dev-other/sclite/wer') | 44.9 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2/tuning/lm_4.0_ps_0.5/search/dev-other/sclite/wer') | 43.4 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2/tuning/lm_4.5/search/dev-other/sclite/wer') | 46.5 |\n", + "| ('joint_training/default', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2/tuning/lm_3.5_ps_0.3/search/dev-other/sclite/wer') | 43.4 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2/tuning/lm_3.0_ps_0.5/search/dev-other/sclite/wer') | 43.2 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2/search/dev-other/sclite/wer') | 48.1 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_blstm_x_vector_specaug/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_blstm_x_vector_specaug/search/dev-other/sclite/wer') | 44 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_trainXvector/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_trainXvector/search/dev-other/sclite/wer') | 47 |\n", + "| ('joint_training/default', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_no_jit/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_no_jit/search/dev-other/sclite/wer') | 30.4 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_no_jit/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_no_jit/tuning/lm_3.0/search/dev-other/sclite/wer') | 24.1 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_no_jit/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_no_jit/tuning/lm_2.0/search/dev-other/sclite/wer') | 24.4 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_no_jit/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_no_jit/tuning/lm_2.5/search/dev-other/sclite/wer') | 23.8 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_no_jit/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_no_jit/tuning/lm_1.5/search/dev-other/sclite/wer') | 26 |\n", + "| ('joint_training/default', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector/tuning/lm_3.0/search/dev-other/sclite/wer') | 35.7 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector/tuning/lm_2.0/search/dev-other/sclite/wer') | 36.1 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector/tuning/lm_4.5/search/dev-other/sclite/wer') | 41.6 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector/tuning/lm_4.0/search/dev-other/sclite/wer') | 39.1 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector/tuning/lm_3.5/search/dev-other/sclite/wer') | 37.1 |\n", + "| 
('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector/tuning/lm_2.5/search/dev-other/sclite/wer') | 35.3 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector/search/dev-other/sclite/wer') | 44.9 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_ctc_scale_0.1/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_ctc_scale_0.1/search/dev-other/sclite/wer') | 35.1 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_ctc_scale_0.1/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_ctc_scale_0.1/tuning/lm_2.5_ps_0.3/search/dev-other/sclite/wer') | 23.1 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_ctc_scale_0.1/', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_ctc_scale_0.1/tuning/lm_2.0_ps_0.5/search/dev-other/sclite/wer') | 23.8 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_ctc_scale_0.1/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_ctc_scale_0.1/tuning/lm_4.5_ps_0/search/dev-other/sclite/wer') | 31.2 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_ctc_scale_0.1/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_ctc_scale_0.1/tuning/lm_2.0_ps_0/search/dev-other/sclite/wer') | 24.4 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_ctc_scale_0.1/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_ctc_scale_0.1/tuning/lm_3.0_ps_0.5/search/dev-other/sclite/wer') | 22.9 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_ctc_scale_0.1/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_ctc_scale_0.1/tuning/lm_2.5_ps_0/search/dev-other/sclite/wer') | 24 |\n", + "| ('joint_training/default', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_ctc_scale_0.1/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_ctc_scale_0.1/tuning/lm_3.5_ps_0.3/search/dev-other/sclite/wer') | 24.4 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_ctc_scale_0.1/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_ctc_scale_0.1/tuning/lm_4.0_ps_0/search/dev-other/sclite/wer') | 28.3 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_ctc_scale_0.1/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_ctc_scale_0.1/tuning/lm_4.0_ps_0.5/search/dev-other/sclite/wer') | 25.1 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_ctc_scale_0.1/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_ctc_scale_0.1/tuning/lm_4.5_ps_0.3/search/dev-other/sclite/wer') | 28.4 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_ctc_scale_0.1/', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_ctc_scale_0.1/tuning/lm_2.5_ps_0.5/search/dev-other/sclite/wer') | 22.9 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_ctc_scale_0.1/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_ctc_scale_0.1/tuning/lm_3.5_ps_0/search/dev-other/sclite/wer') | 26.2 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_ctc_scale_0.1/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_ctc_scale_0.1/tuning/lm_2.0_ps_0.3/search/dev-other/sclite/wer') | 23.7 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_ctc_scale_0.1/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_ctc_scale_0.1/tuning/lm_3.0_ps_0.3/search/dev-other/sclite/wer') | 23.3 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_ctc_scale_0.1/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_ctc_scale_0.1/tuning/lm_3.5_ps_0.5/search/dev-other/sclite/wer') | 23.8 |\n", + "| ('joint_training/default', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_ctc_scale_0.1/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_ctc_scale_0.1/tuning/lm_3.0_ps_0/search/dev-other/sclite/wer') | 24.7 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_ctc_scale_0.1/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_ctc_scale_0.1/tuning/lm_4.0_ps_0.3/search/dev-other/sclite/wer') | 26 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_ctc_scale_0.1/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_ctc_scale_0.1/tuning/lm_4.5_ps_0.5/search/dev-other/sclite/wer') | 27.3 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_ctc_scale_0.1/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_ctc_scale_0.1/tuning/lm_2.0_ps_0.3/search/dev-other/sclite/wer') | 36.9 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_ctc_scale_0.1/', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_ctc_scale_0.1/tuning/lm_2.5_ps_0.5/search/dev-other/sclite/wer') | 35.5 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_ctc_scale_0.1/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_ctc_scale_0.1/tuning/lm_4.0/search/dev-other/sclite/wer') | 37.8 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_ctc_scale_0.1/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_ctc_scale_0.1/tuning/lm_3.5_ps_0.5/search/dev-other/sclite/wer') | 34.9 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_ctc_scale_0.1/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_ctc_scale_0.1/tuning/lm_3.5/search/dev-other/sclite/wer') | 36.6 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_ctc_scale_0.1/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_ctc_scale_0.1/tuning/lm_3.0_ps_0.3/search/dev-other/sclite/wer') | 35.2 |\n", + "| ('joint_training/default', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_ctc_scale_0.1/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_ctc_scale_0.1/tuning/lm_4.5_ps_0.5/search/dev-other/sclite/wer') | 37.1 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_ctc_scale_0.1/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_ctc_scale_0.1/tuning/lm_4.0_ps_0.3/search/dev-other/sclite/wer') | 36.3 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_ctc_scale_0.1/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_ctc_scale_0.1/tuning/lm_2.5/search/dev-other/sclite/wer') | 36.4 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_ctc_scale_0.1/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_ctc_scale_0.1/tuning/lm_3.0/search/dev-other/sclite/wer') | 36.1 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_ctc_scale_0.1/', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_ctc_scale_0.1/tuning/lm_2.0_ps_0.5/search/dev-other/sclite/wer') | 37 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_ctc_scale_0.1/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_ctc_scale_0.1/tuning/lm_2.5_ps_0.3/search/dev-other/sclite/wer') | 35.7 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_ctc_scale_0.1/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_ctc_scale_0.1/tuning/lm_2.0/search/dev-other/sclite/wer') | 37.4 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_ctc_scale_0.1/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_ctc_scale_0.1/tuning/lm_3.5_ps_0.3/search/dev-other/sclite/wer') | 35.4 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_ctc_scale_0.1/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_ctc_scale_0.1/tuning/lm_3.0_ps_0.5/search/dev-other/sclite/wer') | 34.8 |\n", + "| ('joint_training/default', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_ctc_scale_0.1/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_ctc_scale_0.1/tuning/lm_4.5_ps_0.3/search/dev-other/sclite/wer') | 37.8 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_ctc_scale_0.1/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_ctc_scale_0.1/tuning/lm_4.5/search/dev-other/sclite/wer') | 39.8 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_ctc_scale_0.1/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_ctc_scale_0.1/tuning/lm_4.0_ps_0.5/search/dev-other/sclite/wer') | 35.7 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_ctc_scale_0.1/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_ctc_scale_0.1/search/dev-other/sclite/wer') | 42.3 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector/search/dev-other/sclite/wer') | 44.2 |\n", + "| ('joint_training/default', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector/tuning/lm_2.5/search/dev-other/sclite/wer') | 38.6 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector/tuning/lm_3.5/search/dev-other/sclite/wer') | 39 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector/tuning/lm_4.0/search/dev-other/sclite/wer') | 40.1 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector/tuning/lm_4.5/search/dev-other/sclite/wer') | 41.8 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector/tuning/lm_2.0/search/dev-other/sclite/wer') | 39.7 |\n", + "| ('joint_training/default', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector/tuning/lm_3.0/search/dev-other/sclite/wer') | 38.4 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_xvector/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_xvector/search/dev-other/sclite/wer') | 43.5 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment/tuning/lm_2.0/search/dev-other/sclite/wer') | 44.1 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment/tuning/lm_2.0_ps_0.5/search/dev-other/sclite/wer') | 44.2 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment/tuning/lm_2.5_ps_0.3/search/dev-other/sclite/wer') | 42.5 |\n", + "| ('joint_training/default', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment/tuning/lm_3.0/search/dev-other/sclite/wer') | 42.6 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment/tuning/lm_4.5_ps_0.3/search/dev-other/sclite/wer') | 44.3 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment/tuning/lm_4.5/search/dev-other/sclite/wer') | 46.1 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment/tuning/lm_4.0_ps_0.5/search/dev-other/sclite/wer') | 42.3 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment/', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment/tuning/lm_3.5_ps_0.3/search/dev-other/sclite/wer') | 41.9 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment/tuning/lm_3.0_ps_0.5/search/dev-other/sclite/wer') | 41.8 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment/tuning/lm_4.0/search/dev-other/sclite/wer') | 44.3 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment/tuning/lm_2.0_ps_0.3/search/dev-other/sclite/wer') | 44 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment/tuning/lm_2.5_ps_0.5/search/dev-other/sclite/wer') | 42.6 |\n", + "| ('joint_training/default', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment/tuning/lm_4.5_ps_0.5/search/dev-other/sclite/wer') | 43.4 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment/tuning/lm_2.5/search/dev-other/sclite/wer') | 42.9 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment/tuning/lm_4.0_ps_0.3/search/dev-other/sclite/wer') | 42.8 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment/tuning/lm_3.5_ps_0.5/search/dev-other/sclite/wer') | 41.6 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment/', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment/tuning/lm_3.5/search/dev-other/sclite/wer') | 43.1 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment/tuning/lm_3.0_ps_0.3/search/dev-other/sclite/wer') | 41.7 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment/search/dev-other/sclite/wer') | 48.4 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer_spec_augment_tts_train_segments/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer_spec_augment_tts_train_segments/search/dev-other/sclite/wer') | 37.3 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer_spec_augment_tts_train_segments/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer_spec_augment_tts_train_segments/tuning/lm_4.0/search/dev-other/sclite/wer') | 30.5 |\n", + "| ('joint_training/default', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer_spec_augment_tts_train_segments/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer_spec_augment_tts_train_segments/tuning/lm_3.5/search/dev-other/sclite/wer') | 27.9 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer_spec_augment_tts_train_segments/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer_spec_augment_tts_train_segments/tuning/lm_2.5/search/dev-other/sclite/wer') | 25.4 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer_spec_augment_tts_train_segments/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer_spec_augment_tts_train_segments/tuning/lm_3.0/search/dev-other/sclite/wer') | 26.3 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer_spec_augment_tts_train_segments/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer_spec_augment_tts_train_segments/tuning/lm_2.0/search/dev-other/sclite/wer') | 25.8 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer_spec_augment_tts_train_segments/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer_spec_augment_tts_train_segments/tuning/lm_4.5/search/dev-other/sclite/wer') | 33.5 |\n", + 
"| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_xvectorgrad_clip_10/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_xvectorgrad_clip_10/search/dev-other/sclite/wer') | 43.2 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_xvectorgrad_clip_10/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_xvectorgrad_clip_10/tuning/lm_2.5/search/dev-other/sclite/wer') | 32.4 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_xvectorgrad_clip_10/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_xvectorgrad_clip_10/tuning/lm_2.0_ps_0.5/search/dev-other/sclite/wer') | 32.6 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_xvectorgrad_clip_10/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_xvectorgrad_clip_10/tuning/lm_2.5_ps_0.3/search/dev-other/sclite/wer') | 31.7 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_xvectorgrad_clip_10/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_xvectorgrad_clip_10/tuning/lm_3.5/search/dev-other/sclite/wer') | 34.7 |\n", + "| ('joint_training/default', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_xvectorgrad_clip_10/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_xvectorgrad_clip_10/tuning/lm_4.0/search/dev-other/sclite/wer') | 37.1 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_xvectorgrad_clip_10/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_xvectorgrad_clip_10/tuning/lm_4.5_ps_0.3/search/dev-other/sclite/wer') | 37.1 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_xvectorgrad_clip_10/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_xvectorgrad_clip_10/tuning/lm_4.0_ps_0.5/search/dev-other/sclite/wer') | 33.9 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_xvectorgrad_clip_10/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_xvectorgrad_clip_10/tuning/lm_3.5_ps_0.3/search/dev-other/sclite/wer') | 32.9 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_xvectorgrad_clip_10/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_xvectorgrad_clip_10/tuning/lm_3.0_ps_0.5/search/dev-other/sclite/wer') | 31.6 |\n", + "| ('joint_training/default', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_xvectorgrad_clip_10/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_xvectorgrad_clip_10/tuning/lm_4.5/search/dev-other/sclite/wer') | 39.9 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_xvectorgrad_clip_10/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_xvectorgrad_clip_10/tuning/lm_2.0_ps_0.3/search/dev-other/sclite/wer') | 32.4 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_xvectorgrad_clip_10/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_xvectorgrad_clip_10/tuning/lm_2.5_ps_0.5/search/dev-other/sclite/wer') | 31.6 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_xvectorgrad_clip_10/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_xvectorgrad_clip_10/tuning/lm_2.0/search/dev-other/sclite/wer') | 32.8 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_xvectorgrad_clip_10/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_xvectorgrad_clip_10/tuning/lm_4.5_ps_0.5/search/dev-other/sclite/wer') | 36.1 |\n", + "| ('joint_training/default', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_xvectorgrad_clip_10/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_xvectorgrad_clip_10/tuning/lm_4.0_ps_0.3/search/dev-other/sclite/wer') | 34.9 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_xvectorgrad_clip_10/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_xvectorgrad_clip_10/tuning/lm_3.5_ps_0.5/search/dev-other/sclite/wer') | 32.2 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_xvectorgrad_clip_10/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_xvectorgrad_clip_10/tuning/lm_3.0_ps_0.3/search/dev-other/sclite/wer') | 31.9 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_xvectorgrad_clip_10/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_xvectorgrad_clip_10/tuning/lm_3.0/search/dev-other/sclite/wer') | 33 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector_v3/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector_v3/tuning/lm_4.0/search/dev-other/sclite/wer') | 26.8 |\n", + "| ('joint_training/default', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector_v3/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector_v3/tuning/lm_2.5/search/dev-other/sclite/wer') | 24.6 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector_v3/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector_v3/tuning/lm_3.5/search/dev-other/sclite/wer') | 25.4 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector_v3/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector_v3/tuning/lm_2.0/search/dev-other/sclite/wer') | 25.4 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector_v3/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector_v3/tuning/lm_3.0/search/dev-other/sclite/wer') | 24.7 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector_v3/', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector_v3/tuning/lm_4.5/search/dev-other/sclite/wer') | 28.7 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector_v3/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector_v3/search/dev-other/sclite/wer') | 31 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_ddi_actnorm/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_ddi_actnorm/search/dev-other/sclite/wer') | 94.4 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_control_radam/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_control_radam/search/dev-other/sclite/wer') | 38.3 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_ddi_actnorm/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_ddi_actnorm/search/dev-other/sclite/wer') | 66.7 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_spec_augment_ctc_scale_0.1_tts_segments/', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_spec_augment_ctc_scale_0.1_tts_segments/search/dev-other/sclite/wer') | 39.9 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_spec_augment_ctc_scale_0.1_tts_segments/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_spec_augment_ctc_scale_0.1_tts_segments/tuning/lm_4.5/search/dev-other/sclite/wer') | 36.5 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_spec_augment_ctc_scale_0.1_tts_segments/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_spec_augment_ctc_scale_0.1_tts_segments/tuning/lm_3.0/search/dev-other/sclite/wer') | 30.3 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_spec_augment_ctc_scale_0.1_tts_segments/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_spec_augment_ctc_scale_0.1_tts_segments/tuning/lm_2.0/search/dev-other/sclite/wer') | 30 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_spec_augment_ctc_scale_0.1_tts_segments/', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_spec_augment_ctc_scale_0.1_tts_segments/tuning/lm_3.5/search/dev-other/sclite/wer') | 31.5 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_spec_augment_ctc_scale_0.1_tts_segments/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_spec_augment_ctc_scale_0.1_tts_segments/tuning/lm_2.5/search/dev-other/sclite/wer') | 29.6 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_spec_augment_ctc_scale_0.1_tts_segments/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_spec_augment_ctc_scale_0.1_tts_segments/tuning/lm_4.0/search/dev-other/sclite/wer') | 33.6 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_xvector_eval/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_xvector_eval/tuning/lm_2.5_ps_0.5/search/dev-other/sclite/wer') | 36.3 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_xvector_eval/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_xvector_eval/tuning/lm_4.0/search/dev-other/sclite/wer') | 39.8 |\n", + "| ('joint_training/default', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_xvector_eval/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_xvector_eval/tuning/lm_2.0_ps_0.3/search/dev-other/sclite/wer') | 37.4 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_xvector_eval/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_xvector_eval/tuning/lm_3.5/search/dev-other/sclite/wer') | 38.3 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_xvector_eval/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_xvector_eval/tuning/lm_4.0_ps_0.3/search/dev-other/sclite/wer') | 37.9 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_xvector_eval/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_xvector_eval/tuning/lm_4.5_ps_0.5/search/dev-other/sclite/wer') | 39 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_xvector_eval/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_xvector_eval/tuning/lm_3.0_ps_0.3/search/dev-other/sclite/wer') | 36.2 |\n", + "| ('joint_training/default', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_xvector_eval/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_xvector_eval/tuning/lm_3.5_ps_0.5/search/dev-other/sclite/wer') | 36.5 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_xvector_eval/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_xvector_eval/tuning/lm_2.5/search/dev-other/sclite/wer') | 37 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_xvector_eval/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_xvector_eval/tuning/lm_3.0/search/dev-other/sclite/wer') | 37.4 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_xvector_eval/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_xvector_eval/tuning/lm_2.5_ps_0.3/search/dev-other/sclite/wer') | 36.3 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_xvector_eval/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_xvector_eval/tuning/lm_2.0_ps_0.5/search/dev-other/sclite/wer') | 37.6 |\n", + "| ('joint_training/default', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_xvector_eval/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_xvector_eval/tuning/lm_2.0/search/dev-other/sclite/wer') | 37.7 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_xvector_eval/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_xvector_eval/tuning/lm_4.0_ps_0.5/search/dev-other/sclite/wer') | 37.3 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_xvector_eval/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_xvector_eval/tuning/lm_4.5_ps_0.3/search/dev-other/sclite/wer') | 39.8 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_xvector_eval/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_xvector_eval/tuning/lm_3.0_ps_0.5/search/dev-other/sclite/wer') | 36 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_xvector_eval/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_xvector_eval/tuning/lm_3.5_ps_0.3/search/dev-other/sclite/wer') | 36.9 |\n", + "| ('joint_training/default', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_xvector_eval/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_xvector_eval/tuning/lm_4.5/search/dev-other/sclite/wer') | 42.1 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_xvector_eval/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_xvector_eval/search/dev-other/sclite/wer') | 44.7 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer/tuning/lm_3.0/search/dev-other/sclite/wer') | 38.3 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer/tuning/lm_2.0_ps_0.3/search/dev-other/sclite/wer') | 39.8 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer/tuning/lm_2.0/search/dev-other/sclite/wer') | 39.9 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer/', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer/tuning/lm_2.5_ps_0.5/search/dev-other/sclite/wer') | 38.7 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer/tuning/lm_4.5_ps_0.5/search/dev-other/sclite/wer') | 38.5 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer/tuning/lm_4.0_ps_0.3/search/dev-other/sclite/wer') | 38.2 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer/tuning/lm_3.5_ps_0.5/search/dev-other/sclite/wer') | 37.7 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer/tuning/lm_4.5/search/dev-other/sclite/wer') | 39.9 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer/tuning/lm_3.0_ps_0.3/search/dev-other/sclite/wer') | 37.9 |\n", + "| 
('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer/tuning/lm_4.0/search/dev-other/sclite/wer') | 39 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer/tuning/lm_2.0_ps_0.5/search/dev-other/sclite/wer') | 40.2 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer/tuning/lm_2.5_ps_0.3/search/dev-other/sclite/wer') | 38.7 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer/tuning/lm_4.5_ps_0.3/search/dev-other/sclite/wer') | 38.9 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer/tuning/lm_3.5/search/dev-other/sclite/wer') | 38.4 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer/', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer/tuning/lm_4.0_ps_0.5/search/dev-other/sclite/wer') | 38 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer/tuning/lm_3.5_ps_0.3/search/dev-other/sclite/wer') | 37.7 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer/tuning/lm_2.5/search/dev-other/sclite/wer') | 38.7 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer/tuning/lm_3.0_ps_0.5/search/dev-other/sclite/wer') | 37.9 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer/search/dev-other/sclite/wer') | 41.4 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector_control/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector_control/search/dev-other/sclite/wer') | 46.3 |\n", + 
"| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_spec_augment/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_spec_augment/tuning/lm_3.0/search/dev-other/sclite/wer') | 37 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_spec_augment/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_spec_augment/tuning/lm_2.0/search/dev-other/sclite/wer') | 37.9 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_spec_augment/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_spec_augment/tuning/lm_4.5/search/dev-other/sclite/wer') | 41.9 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_spec_augment/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_spec_augment/tuning/lm_4.0/search/dev-other/sclite/wer') | 39.4 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_spec_augment/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_spec_augment/tuning/lm_3.5/search/dev-other/sclite/wer') | 37.9 |\n", + 
"| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_spec_augment/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_spec_augment/tuning/lm_2.5/search/dev-other/sclite/wer') | 37 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_spec_augment/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_spec_augment/search/dev-other/sclite/wer') | 44.7 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_encoder_sample_ctc_scale_0.1/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_encoder_sample_ctc_scale_0.1/search/dev-other/sclite/wer') | 100 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_coupling_epsilon/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_coupling_epsilon/search/dev-other/sclite/wer') | 30.4 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer/tuning/lm_3.5_ps_0.3/search/dev-other/sclite/wer') | 29.1 |\n", + "| ('joint_training/default', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer/tuning/lm_3.0_ps_0.5/search/dev-other/sclite/wer') | 28.3 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer/tuning/lm_4.5_ps_0.3/search/dev-other/sclite/wer') | 32.8 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer/tuning/lm_4.0_ps_0.5/search/dev-other/sclite/wer') | 29.9 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer/tuning/lm_3.0_ps_0/search/dev-other/sclite/wer') | 29.7 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer/tuning/lm_3.5_ps_0/search/dev-other/sclite/wer') | 30.8 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer/', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer/tuning/lm_2.0_ps_0.5/search/dev-other/sclite/wer') | 29.5 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer/tuning/lm_2.5_ps_0.3/search/dev-other/sclite/wer') | 28.5 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer/tuning/lm_3.5_ps_0.5/search/dev-other/sclite/wer') | 28.7 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer/tuning/lm_4.0_ps_0/search/dev-other/sclite/wer') | 32.9 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer/tuning/lm_3.0_ps_0.3/search/dev-other/sclite/wer') | 28.5 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer/tuning/lm_2.5_ps_0/search/dev-other/sclite/wer') | 29.5 |\n", + "| ('joint_training/default', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer/tuning/lm_4.5_ps_0.5/search/dev-other/sclite/wer') | 31.7 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer/tuning/lm_4.0_ps_0.3/search/dev-other/sclite/wer') | 30.6 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer/tuning/lm_2.0_ps_0.3/search/dev-other/sclite/wer') | 29.6 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer/tuning/lm_2.5_ps_0.5/search/dev-other/sclite/wer') | 28.4 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer/tuning/lm_2.0_ps_0/search/dev-other/sclite/wer') | 30.1 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer/', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer/tuning/lm_4.5_ps_0/search/dev-other/sclite/wer') | 35.4 |\n", + "| ('joint_training/default', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer/search/dev-other/sclite/wer') | 38.4 |\n", + "| ('joint_training/conformer_coupling', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/ddi/glow_ASR_conformer/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/ddi/glow_ASR_conformer/search/dev-other/sclite/wer') | 32.4 |\n", + "| ('joint_training/conformer_coupling', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/ddi/glow_ASR_conformer/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/ddi/glow_ASR_conformer/tuning/lm_2.0/search/dev-other/sclite/wer') | 30.6 |\n", + "| ('joint_training/conformer_coupling', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/ddi/glow_ASR_conformer/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/ddi/glow_ASR_conformer/tuning/lm_3.0/search/dev-other/sclite/wer') | 29.6 |\n", + "| ('joint_training/conformer_coupling', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/ddi/glow_ASR_conformer/', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/ddi/glow_ASR_conformer/tuning/lm_2.5/search/dev-other/sclite/wer') | 29.8 |\n", + "| ('joint_training/conformer_coupling', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/ddi/glow_ASR_conformer/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/ddi/glow_ASR_conformer/tuning/lm_3.5/search/dev-other/sclite/wer') | 29.6 |\n", + "| ('joint_training/conformer_coupling', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/ddi/glow_ASR_conformer/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/ddi/glow_ASR_conformer/tuning/lm_4.0/search/dev-other/sclite/wer') | 30.2 |\n", + "| ('joint_training/conformer_coupling', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/ddi/glow_ASR_conformer/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/ddi/glow_ASR_conformer/tuning/lm_1.5/search/dev-other/sclite/wer') | 32 |\n", + "| ('joint_training/conformer_coupling', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glow_ASR_conformer/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glow_ASR_conformer/search/dev-other/sclite/wer') | 32.9 |\n", + "| ('joint_training/conformer_coupling', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glow_ASR_conformer/', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glow_ASR_conformer/tuning/lm_2.5/search/dev-other/sclite/wer') | 30.1 |\n", + "| ('joint_training/conformer_coupling', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glow_ASR_conformer/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glow_ASR_conformer/tuning/lm_3.5/search/dev-other/sclite/wer') | 29.9 |\n", + "| ('joint_training/conformer_coupling', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glow_ASR_conformer/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glow_ASR_conformer/tuning/lm_4.0/search/dev-other/sclite/wer') | 30.7 |\n", + "| ('joint_training/conformer_coupling', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glow_ASR_conformer/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glow_ASR_conformer/tuning/lm_1.5/search/dev-other/sclite/wer') | 32.3 |\n", + "| ('joint_training/conformer_coupling', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glow_ASR_conformer/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glow_ASR_conformer/tuning/lm_2.0/search/dev-other/sclite/wer') | 31 |\n", + "| ('joint_training/conformer_coupling', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glow_ASR_conformer/', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glow_ASR_conformer/tuning/lm_3.0/search/dev-other/sclite/wer') | 29.8 |\n", + "| ('joint_training/conformer_coupling', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glow_ASR_conformer_specaugment_before/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glow_ASR_conformer_specaugment_before/tuning/lm_2.0/search/dev-other/sclite/wer') | 19.3 |\n", + "| ('joint_training/conformer_coupling', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glow_ASR_conformer_specaugment_before/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glow_ASR_conformer_specaugment_before/tuning/lm_3.0/search/dev-other/sclite/wer') | 18.8 |\n", + "| ('joint_training/conformer_coupling', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glow_ASR_conformer_specaugment_before/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glow_ASR_conformer_specaugment_before/tuning/lm_1.5/search/dev-other/sclite/wer') | 20.2 |\n", + "| ('joint_training/conformer_coupling', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glow_ASR_conformer_specaugment_before/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glow_ASR_conformer_specaugment_before/tuning/lm_4.0/search/dev-other/sclite/wer') | 20.2 |\n", + "| ('joint_training/conformer_coupling', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glow_ASR_conformer_specaugment_before/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glow_ASR_conformer_specaugment_before/tuning/lm_2.5/search/dev-other/sclite/wer') | 18.7 |\n", + "| ('joint_training/conformer_coupling', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glow_ASR_conformer_specaugment_before/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glow_ASR_conformer_specaugment_before/tuning/lm_3.5/search/dev-other/sclite/wer') | 19.3 |\n", + "| ('joint_training/conformer_coupling', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glow_ASR_conformer_specaugment_before/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glow_ASR_conformer_specaugment_before/search/dev-other/sclite/wer') | 23.1 |\n", + "| ('joint_training/conformer_coupling', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glowTTS_ASR_conformer_x_vector_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glowTTS_ASR_conformer_x_vector_v2/search/dev-other/sclite/wer') | 40.1 |\n", + "| ('joint_training/conformer_coupling', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glowTTS_ASR_conformer_x_vector_v2/', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glowTTS_ASR_conformer_x_vector_v2/tuning/lm_4.0/search/dev-other/sclite/wer') | 35.3 |\n", + "| ('joint_training/conformer_coupling', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glowTTS_ASR_conformer_x_vector_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glowTTS_ASR_conformer_x_vector_v2/tuning/lm_1.5/search/dev-other/sclite/wer') | 36.2 |\n", + "| ('joint_training/conformer_coupling', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glowTTS_ASR_conformer_x_vector_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glowTTS_ASR_conformer_x_vector_v2/tuning/lm_2.5/search/dev-other/sclite/wer') | 33.3 |\n", + "| ('joint_training/conformer_coupling', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glowTTS_ASR_conformer_x_vector_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glowTTS_ASR_conformer_x_vector_v2/tuning/lm_3.5/search/dev-other/sclite/wer') | 33.9 |\n", + "| ('joint_training/conformer_coupling', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glowTTS_ASR_conformer_x_vector_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glowTTS_ASR_conformer_x_vector_v2/tuning/lm_2.0/search/dev-other/sclite/wer') | 34.2 |\n", + "| ('joint_training/conformer_coupling', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glowTTS_ASR_conformer_x_vector_v2/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glowTTS_ASR_conformer_x_vector_v2/tuning/lm_3.0/search/dev-other/sclite/wer') | 33.1 |\n", + "| ('joint_training/given_alignments', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/second_step_asr/frozen_glowtts/glowASR_conformer_x_vector/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/second_step_asr/frozen_glowtts/glowASR_conformer_x_vector/tuning/lm_2.5_ps_0.5/search/dev-other/sclite/wer') | 21.2 |\n", + "| ('joint_training/given_alignments', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/second_step_asr/frozen_glowtts/glowASR_conformer_x_vector/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/second_step_asr/frozen_glowtts/glowASR_conformer_x_vector/tuning/lm_3.0/search/dev-other/sclite/wer') | 21.3 |\n", + "| ('joint_training/given_alignments', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/second_step_asr/frozen_glowtts/glowASR_conformer_x_vector/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/second_step_asr/frozen_glowtts/glowASR_conformer_x_vector/tuning/lm_3.5_ps_0.5/search/dev-other/sclite/wer') | 21.1 |\n", + "| 
('joint_training/given_alignments', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/second_step_asr/frozen_glowtts/glowASR_conformer_x_vector/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/second_step_asr/frozen_glowtts/glowASR_conformer_x_vector/tuning/lm_2_ps_0.5/search/dev-other/sclite/wer') | 21.7 |\n", + "| ('joint_training/given_alignments', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/second_step_asr/frozen_glowtts/glowASR_conformer_x_vector/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/second_step_asr/frozen_glowtts/glowASR_conformer_x_vector/tuning/lm_4.5/search/dev-other/sclite/wer') | 23.3 |\n", + "| ('joint_training/given_alignments', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/second_step_asr/frozen_glowtts/glowASR_conformer_x_vector/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/second_step_asr/frozen_glowtts/glowASR_conformer_x_vector/tuning/lm_3.0_ps_0.3/search/dev-other/sclite/wer') | 21 |\n", + "| ('joint_training/given_alignments', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/second_step_asr/frozen_glowtts/glowASR_conformer_x_vector/', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/second_step_asr/frozen_glowtts/glowASR_conformer_x_vector/tuning/lm_4.5_ps_0.5/search/dev-other/sclite/wer') | 22.2 |\n", + "| ('joint_training/given_alignments', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/second_step_asr/frozen_glowtts/glowASR_conformer_x_vector/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/second_step_asr/frozen_glowtts/glowASR_conformer_x_vector/tuning/lm_4.0_ps_0.3/search/dev-other/sclite/wer') | 21.6 |\n", + "| ('joint_training/given_alignments', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/second_step_asr/frozen_glowtts/glowASR_conformer_x_vector/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/second_step_asr/frozen_glowtts/glowASR_conformer_x_vector/tuning/lm_2/search/dev-other/sclite/wer') | 21.9 |\n", + "| ('joint_training/given_alignments', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/second_step_asr/frozen_glowtts/glowASR_conformer_x_vector/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/second_step_asr/frozen_glowtts/glowASR_conformer_x_vector/tuning/lm_4.0/search/dev-other/sclite/wer') | 22.3 |\n", + "| ('joint_training/given_alignments', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/second_step_asr/frozen_glowtts/glowASR_conformer_x_vector/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/second_step_asr/frozen_glowtts/glowASR_conformer_x_vector/tuning/lm_2.5_ps_0.3/search/dev-other/sclite/wer') | 21.2 |\n", + "| ('joint_training/given_alignments', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/second_step_asr/frozen_glowtts/glowASR_conformer_x_vector/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/second_step_asr/frozen_glowtts/glowASR_conformer_x_vector/tuning/lm_3.5_ps_0.3/search/dev-other/sclite/wer') | 21.2 |\n", + "| ('joint_training/given_alignments', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/second_step_asr/frozen_glowtts/glowASR_conformer_x_vector/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/second_step_asr/frozen_glowtts/glowASR_conformer_x_vector/tuning/lm_3.0_ps_0.5/search/dev-other/sclite/wer') | 21 |\n", + "| ('joint_training/given_alignments', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/second_step_asr/frozen_glowtts/glowASR_conformer_x_vector/', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/second_step_asr/frozen_glowtts/glowASR_conformer_x_vector/tuning/lm_2.5/search/dev-other/sclite/wer') | 21.3 |\n", + "| ('joint_training/given_alignments', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/second_step_asr/frozen_glowtts/glowASR_conformer_x_vector/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/second_step_asr/frozen_glowtts/glowASR_conformer_x_vector/tuning/lm_2_ps_0.3/search/dev-other/sclite/wer') | 21.7 |\n", + "| ('joint_training/given_alignments', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/second_step_asr/frozen_glowtts/glowASR_conformer_x_vector/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/second_step_asr/frozen_glowtts/glowASR_conformer_x_vector/tuning/lm_4.5_ps_0.3/search/dev-other/sclite/wer') | 22.4 |\n", + "| ('joint_training/given_alignments', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/second_step_asr/frozen_glowtts/glowASR_conformer_x_vector/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/second_step_asr/frozen_glowtts/glowASR_conformer_x_vector/tuning/lm_3.5/search/dev-other/sclite/wer') | 21.7 |\n", + "| ('joint_training/given_alignments', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/second_step_asr/frozen_glowtts/glowASR_conformer_x_vector/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/second_step_asr/frozen_glowtts/glowASR_conformer_x_vector/tuning/lm_4.0_ps_0.5/search/dev-other/sclite/wer') | 21.6 |\n", + "| ('joint_training/given_alignments', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/second_step_asr/frozen_glowtts/glowASR_conformer_x_vector/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/second_step_asr/frozen_glowtts/glowASR_conformer_x_vector/search/dev-other/sclite/wer') | 24.7 |\n", + "| ('joint_training/given_alignments', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/second_step/ga_glowTTS_ASR_ffn_x_vector_v2_blstm_2ndstep/specaug/ce_ls_1.0/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/second_step/ga_glowTTS_ASR_ffn_x_vector_v2_blstm_2ndstep/specaug/ce_ls_1.0/search/dev-other/sclite/wer') | 79.5 |\n", + "| ('joint_training/given_alignments', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/second_step/ga_glowTTS_ASR_ffn_x_vector_v2_blstm_2ndstep/no_specaug/ce_ls_1.0/', 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/second_step/ga_glowTTS_ASR_ffn_x_vector_v2_blstm_2ndstep/no_specaug/ce_ls_1.0/search/dev-other/sclite/wer') | 81 |\n", + "| ('joint_training/given_alignments', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/second_step/ga_glowTTS_ASR_ffn_x_vector_v2_blstm_2ndstep_v2/no_specaug/ce_ls_1.0/', '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/second_step/ga_glowTTS_ASR_ffn_x_vector_v2_blstm_2ndstep_v2/no_specaug/ce_ls_1.0/search/dev-other/sclite/wer') | 86.9 |\n" + ] + } + ], + "source": [ + "print(df.to_markdown())" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, "outputs": [], "source": [ "count_series = df.index.get_level_values(\"Experiment\").value_counts()\n", @@ -946,7 +2268,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ @@ -955,7 +2277,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ @@ -1007,7 +2329,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 9, "metadata": {}, "outputs": [ { @@ -1036,7 +2358,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ @@ -1076,7 +2398,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 11, "metadata": {}, "outputs": [], "source": [ @@ -1094,7 +2416,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 12, "metadata": {}, "outputs": [ { @@ -1103,20 +2425,21 @@ "text": [ "| | Group | Experiment | WER (dev-other) | Count | Tuned | CTC | dev CTC | overfitting | 
dev MLE | dev DP | Joint | Still running | Training data available | Num Epochs | LR | ASR Model Type | Missing glow.eval |\n", "|----:|:--------------------------------|:------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------:|--------:|:--------|------:|----------:|--------------:|:--------------------|:--------------------|:--------|:----------------|:--------------------------|-------------:|:-------------------------------------------------|:-----------------|:--------------------|\n", - "| 9 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_ddi_actnorm/ | 66.7 | 1 | False | 0.282 | 0.583 | 2.068 | -0.696242607904203 | 0.4012426779125676 | True | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", - "| 11 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_control/ | 38.5 | 1 | False | 0.216 | 0.735 | 3.408 | -0.7795215357433666 | 1.5493716010541627 | True | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", - "| 12 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_control_radam/ | 38.3 | 1 | False | 0.214 | 0.731 | 3.414 | -0.7803357613809181 | 1.4600530691219098 | True | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", - "| 13 | joint_training/default | 
/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_control_spec_augment/ | 37.9 | 1 | False | 0.501 | 0.588 | 1.173 | -0.7782901200381193 | 1.4496996285337391 | True | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", - "| 15 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_ddi_actnorm/ | 94.4 | 1 | False | 0.165 | 1.062 | 6.429 | -0.5203935901323954 | 0.21473516580281835 | True | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", - "| 16 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_encoder_sample_ctc_scale_0.1/ | 100 | 1 | False | 3.481 | 6.325 | 1.817 | -0.7816050729968331 | 1.1355000418243986 | True | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", - "| 17 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_radam/ | 43.8 | 1 | False | 0.098 | 1.135 | 11.599 | -0.5829270274350138 | 0.34816459318002063 | True | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", - "| 21 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_trainXvector/ | 47 | 1 | False | 0.046 | 1.485 | 32.45 | -0.5923372669653459 | 0.586278223855929 | True | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", - "| 32 | joint_training/default | 
/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_coupling_epsilon/ | 30.4 | 1 | False | 0.243 | 0.576 | 2.37 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", - "| 36 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector_control/ | 46.3 | 1 | False | 0.233 | 0.916 | 3.927 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", - "| 39 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_xvector/ | 43.5 | 1 | False | 0.251 | 0.762 | 3.03 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", - "| 45 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/second_step/ga_glowTTS_ASR_ffn_x_vector_v2_blstm_2ndstep/no_specaug/ce_ls_1.0/ | 81 | 1 | False | 0.036 | 2.023 | 55.937 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n", - "| 46 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/second_step/ga_glowTTS_ASR_ffn_x_vector_v2_blstm_2ndstep/specaug/ce_ls_1.0/ | 79.5 | 1 | False | 1.182 | 0.749 | 0.634 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n", - "| 47 | joint_training/given_alignments | 
/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/second_step/ga_glowTTS_ASR_ffn_x_vector_v2_blstm_2ndstep_v2/no_specaug/ce_ls_1.0/ | 86.9 | 1 | False | 0.006 | 3.047 | 476.04 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n", + "| 5 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_blstm_x_vector_specaug/ | 44 | 1 | False | 0.027 | 1.443 | 53.493 | -0.6382890479131178 | 0.19790865955027667 | True | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | blstm | False |\n", + "| 8 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_ddi_actnorm/ | 66.7 | 1 | False | 0.282 | 0.583 | 2.068 | -0.696242607904203 | 0.4012426779125676 | True | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", + "| 10 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_control/ | 38.5 | 1 | False | 0.216 | 0.735 | 3.408 | -0.7795215357433666 | 1.5493716010541627 | True | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", + "| 11 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_control_radam/ | 38.3 | 1 | False | 0.214 | 0.731 | 3.414 | -0.7803357613809181 | 1.4600530691219098 | True | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", + "| 12 | joint_training/default | 
/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_control_spec_augment/ | 37.9 | 1 | False | 0.501 | 0.588 | 1.173 | -0.7782901200381193 | 1.4496996285337391 | True | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", + "| 14 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_ddi_actnorm/ | 94.4 | 1 | False | 0.165 | 1.062 | 6.429 | -0.5203935901323954 | 0.21473516580281835 | True | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", + "| 15 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_encoder_sample_ctc_scale_0.1/ | 100 | 1 | False | 3.481 | 6.325 | 1.817 | -0.7816050729968331 | 1.1355000418243986 | True | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", + "| 16 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_radam/ | 43.8 | 1 | False | 0.098 | 1.135 | 11.599 | -0.5829270274350138 | 0.34816459318002063 | True | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", + "| 20 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_trainXvector/ | 47 | 1 | False | 0.046 | 1.485 | 32.45 | -0.5923372669653459 | 0.586278223855929 | True | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", + "| 31 | joint_training/default | 
/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_coupling_epsilon/ | 30.4 | 1 | False | 0.243 | 0.576 | 2.37 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", + "| 35 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector_control/ | 46.3 | 1 | False | 0.233 | 0.916 | 3.927 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", + "| 38 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_xvector/ | 43.5 | 1 | False | 0.251 | 0.762 | 3.03 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", + "| 44 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/second_step/ga_glowTTS_ASR_ffn_x_vector_v2_blstm_2ndstep/no_specaug/ce_ls_1.0/ | 81 | 1 | False | 0.036 | 2.023 | 55.937 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n", + "| 45 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/second_step/ga_glowTTS_ASR_ffn_x_vector_v2_blstm_2ndstep/specaug/ce_ls_1.0/ | 79.5 | 1 | False | 1.182 | 0.749 | 0.634 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n", + "| 46 | joint_training/given_alignments | 
/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/second_step/ga_glowTTS_ASR_ffn_x_vector_v2_blstm_2ndstep_v2/no_specaug/ce_ls_1.0/ | 86.9 | 1 | False | 0.006 | 3.047 | 476.04 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n", "| 48 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x1024_d0.2_b300_fs4/asr_dataset/glow_enc768/not_silence_preprocessed/ | 66.4 | 1 | False | 0.582 | 1.039 | 1.786 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n", "| 49 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/default/ | 55.1 | 1 | False | 0.177 | 1.151 | 6.489 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n", "| 50 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/drop_around_blstm/lm5/ | 66.3 | 1 | False | 0.599 | 0.948 | 1.581 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n", @@ -1144,19 +2467,21 @@ "| 81 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/tts_dataset/glow_simple_encoder_epoch84/silence_preprocessed/ | 57.4 | 1 | False | 0.079 | 1.519 | 19.301 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n", "| 82 | librispeech_glow_asr | 
/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/tts_dataset/glow_with_small_enc/silence_preprocessed/ | 79.4 | 1 | False | 1.103 | 1.283 | 1.163 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n", "| 83 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_4x512_d0.2_b300_fs4/asr_dataset/glow_enc768/not_silence_preprocessed/ | 52.4 | 1 | False | 0.378 | 0.812 | 2.15 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n", - "| 94 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed_v2/ | 26.6 | 1 | False | 0.216 | 0.589 | 2.724 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", - "| 95 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed_v3/ | 26.3 | 1 | False | 0.211 | 0.591 | 2.799 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", - "| 108 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.1/ | 25.8 | 1 | False | 0.406 | 0.422 | 1.041 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | True |\n", - "| 109 | librispeech_glow_asr | 
/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.2/ | 25.5 | 1 | False | 0.413 | 0.417 | 1.01 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | True |\n", - "| 110 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.3/ | 26.2 | 1 | False | 0.44 | 0.415 | 0.943 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | True |\n", - "| 111 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.4/ | 25.2 | 1 | False | 0.416 | 0.413 | 0.993 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | True |\n", - "| 115 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed/batch_norm/ | 29.4 | 1 | False | 0.392 | 0.504 | 1.285 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | True |\n", - "| 116 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed/layer_norm/ | 28.5 | 1 | False | 0.407 | 0.488 | 1.197 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | True |\n", - "| 118 | 
librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment_before/glow_enc192_200ep_not_silence_preprocessed/ | 100 | 1 | False | 3.454 | 5.802 | 1.68 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | True |\n", - "| 119 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/tts_dataset/spec_augment/no_glow/ | 20.9 | 1 | False | 0.091 | 0.419 | 4.591 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", - "| 120 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/linear_1x512_d0.2_b300_fs4/glow_enc768/tts_dataset/ | 99.7 | 1 | False | 2.628 | 2.609 | 0.993 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | unknown | False |\n", - "| 121 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/linear_1x512_d0.2_b300_fs4/glow_nar_taco_encoder_16blocks/ | 98.8 | 1 | False | 2.23 | 2.39 | 1.072 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | unknown | False |\n", - "| 124 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/only_blstm_2x512_d0.2_b300_fs4/tts_dataset/ | 59.8 | 1 | False | 0.071 | 1.537 | 21.521 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n" + "| 95 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed_v2/ | 26.6 | 1 | False | 0.216 | 
0.589 | 2.724 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", + "| 96 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed_v3/ | 26.3 | 1 | False | 0.211 | 0.591 | 2.799 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", + "| 110 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.1/ | 25.8 | 1 | False | 0.406 | 0.422 | 1.041 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | True |\n", + "| 111 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.2/ | 25.5 | 1 | False | 0.413 | 0.417 | 1.01 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | True |\n", + "| 112 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.3/ | 26.2 | 1 | False | 0.44 | 0.415 | 0.943 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | True |\n", + "| 113 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.4/ | 25.2 | 1 | False | 0.416 | 
0.413 | 0.993 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | True |\n", + "| 117 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed/batch_norm/ | 29.4 | 1 | False | 0.392 | 0.504 | 1.285 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | True |\n", + "| 118 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed/layer_norm/ | 28.5 | 1 | False | 0.407 | 0.488 | 1.197 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | True |\n", + "| 120 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment_before/glow_enc192_200ep_not_silence_preprocessed/ | 100 | 1 | False | 3.454 | 5.802 | 1.68 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | True |\n", + "| 121 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/tts_dataset/spec_augment/no_glow/ | 20.9 | 1 | False | 0.091 | 0.419 | 4.591 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", + "| 122 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/linear_1x512_d0.2_b300_fs4/glow_enc768/tts_dataset/ | 99.7 | 1 | False | 2.628 | 2.609 | 0.993 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | unknown | False 
|\n", + "| 123 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/linear_1x512_d0.2_b300_fs4/glow_nar_taco_encoder_16blocks/ | 98.8 | 1 | False | 2.23 | 2.39 | 1.072 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | unknown | False |\n", + "| 125 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/only_blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/ | 57 | 1 | False | 0.508 | 0.863 | 1.698 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n", + "| 126 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/only_blstm_2x512_d0.2_b300_fs4/tts_dataset/ | 59.8 | 1 | False | 0.071 | 1.537 | 21.521 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n", + "| 130 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment_before/glow_not_pretrained/ | 30.5 | 1 | False | 0.245 | 0.58 | 2.37 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n" ] } ], @@ -1166,7 +2491,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 13, "metadata": {}, "outputs": [], "source": [ @@ -1175,7 +2500,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 14, "metadata": {}, "outputs": [ { @@ -1184,17 +2509,16 @@ "text": [ "| | Group | Experiment | WER (dev-other) | Count | Tuned | CTC | dev CTC | overfitting | dev MLE | dev DP | Joint | Still running | Training data available | Num Epochs | LR | ASR Model Type | Missing glow.eval |\n", 
"|----:|:---------------------|:----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------:|--------:|:--------|------:|----------:|--------------:|:----------|:---------|:--------|:----------------|:--------------------------|-------------:|:-----------------------------------------------|:-----------------|:--------------------|\n", - "| 52 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_eval/not_silence_preprocessed/ | 58 | 13 | True | 0.022 | 2.315 | 104.406 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n", - "| 54 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_not_eval/not_silence_preprocessed/ | 44.6 | 13 | True | 0.326 | 1.042 | 3.202 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | True |\n", - "| 55 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/glow_eval/not_silence_preprocessed/ | 63.8 | 13 | True | 0.065 | 2.401 | 36.955 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n", - "| 56 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/not_silence_preprocessed/ | 55.4 | 13 | True | 0.882 | 1.139 | 1.292 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n", - "| 
59 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/with_sigma/not_silence_preprocessed/glow_eval/ | 63.2 | 13 | True | 0.062 | 2.427 | 39.391 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n", + "| 52 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_eval/not_silence_preprocessed/ | 56.6 | 13 | True | 0.022 | 2.315 | 104.406 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n", + "| 54 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_not_eval/not_silence_preprocessed/ | 41.3 | 13 | True | 0.326 | 1.042 | 3.202 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | True |\n", + "| 55 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/glow_eval/not_silence_preprocessed/ | 62.4 | 13 | True | 0.065 | 2.401 | 36.955 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n", + "| 56 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/not_silence_preprocessed/ | 55.4 | 5 | True | 0.882 | 1.139 | 1.292 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n", + "| 59 | librispeech_glow_asr | 
/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/with_sigma/not_silence_preprocessed/glow_eval/ | 61.8 | 13 | True | 0.062 | 2.427 | 39.391 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n", "| 62 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_freezed/ | 34 | 19 | True | 0.058 | 0.913 | 15.782 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n", "| 63 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_pretrained/ | 33.8 | 19 | True | 0.003 | 1.209 | 454.454 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n", "| 64 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_freezed/ | 31.4 | 19 | True | 0.024 | 0.926 | 39.211 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n", "| 65 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_pretrained/ | 23.9 | 19 | True | 0.187 | 0.566 | 3.019 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n", - "| 122 | librispeech_glow_asr | 
/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/only_blstm_2x512_d0.2_b300_fs4/asr_dataset/ | 49.4 | 13 | True | 0.028 | 1.672 | 58.743 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n", - "| 123 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/only_blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/ | 57 | 13 | True | 0.508 | 0.863 | 1.698 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n" + "| 124 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/only_blstm_2x512_d0.2_b300_fs4/asr_dataset/ | 47.7 | 13 | True | 0.028 | 1.672 | 58.743 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n" ] } ], @@ -1211,7 +2535,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 15, "metadata": {}, "outputs": [ { @@ -1220,21 +2544,21 @@ "text": [ "| | Group | Experiment | WER (dev-other) | Count | Tuned | CTC | dev CTC | overfitting | dev MLE | dev DP | Joint | Still running | Training data available | Num Epochs | LR | ASR Model Type | Missing glow.eval |\n", "|----:|:--------------------------------|:------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------:|--------:|:--------|------:|----------:|--------------:|:----------|:---------|:--------|:----------------|:--------------------------|-------------:|:-----------------------------------------------|:-----------------|:--------------------|\n", - "| 45 | joint_training/given_alignments | 
/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/second_step/ga_glowTTS_ASR_ffn_x_vector_v2_blstm_2ndstep/no_specaug/ce_ls_1.0/ | 81 | 1 | False | 0.036 | 2.023 | 55.937 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n", - "| 46 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/second_step/ga_glowTTS_ASR_ffn_x_vector_v2_blstm_2ndstep/specaug/ce_ls_1.0/ | 79.5 | 1 | False | 1.182 | 0.749 | 0.634 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n", - "| 47 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/second_step/ga_glowTTS_ASR_ffn_x_vector_v2_blstm_2ndstep_v2/no_specaug/ce_ls_1.0/ | 86.9 | 1 | False | 0.006 | 3.047 | 476.04 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n", + "| 44 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/second_step/ga_glowTTS_ASR_ffn_x_vector_v2_blstm_2ndstep/no_specaug/ce_ls_1.0/ | 81 | 1 | False | 0.036 | 2.023 | 55.937 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n", + "| 45 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/second_step/ga_glowTTS_ASR_ffn_x_vector_v2_blstm_2ndstep/specaug/ce_ls_1.0/ | 79.5 | 1 | False | 1.182 | 0.749 | 0.634 | - | - | False | False | 
True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n", + "| 46 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/second_step/ga_glowTTS_ASR_ffn_x_vector_v2_blstm_2ndstep_v2/no_specaug/ce_ls_1.0/ | 86.9 | 1 | False | 0.006 | 3.047 | 476.04 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n", "| 48 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x1024_d0.2_b300_fs4/asr_dataset/glow_enc768/not_silence_preprocessed/ | 66.4 | 1 | False | 0.582 | 1.039 | 1.786 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n", "| 49 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/default/ | 55.1 | 1 | False | 0.177 | 1.151 | 6.489 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n", "| 50 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/drop_around_blstm/lm5/ | 66.3 | 1 | False | 0.599 | 0.948 | 1.581 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n", "| 51 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/drop_around_blstm/spec_augment/ | 94.2 | 1 | False | 2.438 | 1.396 | 0.573 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n", - "| 52 | librispeech_glow_asr | 
/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_eval/not_silence_preprocessed/ | 58 | 13 | True | 0.022 | 2.315 | 104.406 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n", + "| 52 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_eval/not_silence_preprocessed/ | 56.6 | 13 | True | 0.022 | 2.315 | 104.406 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n", "| 53 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_eval_spec_aug/not_silence_preprocessed/ | 67.7 | 1 | False | 0.725 | 1.04 | 1.434 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n", - "| 54 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_not_eval/not_silence_preprocessed/ | 44.6 | 13 | True | 0.326 | 1.042 | 3.202 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | True |\n", - "| 55 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/glow_eval/not_silence_preprocessed/ | 63.8 | 13 | True | 0.065 | 2.401 | 36.955 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n", - "| 56 | librispeech_glow_asr | 
/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/not_silence_preprocessed/ | 55.4 | 13 | True | 0.882 | 1.139 | 1.292 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n", + "| 54 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_not_eval/not_silence_preprocessed/ | 41.3 | 13 | True | 0.326 | 1.042 | 3.202 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | True |\n", + "| 55 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/glow_eval/not_silence_preprocessed/ | 62.4 | 13 | True | 0.065 | 2.401 | 36.955 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n", + "| 56 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/not_silence_preprocessed/ | 55.4 | 5 | True | 0.882 | 1.139 | 1.292 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n", "| 57 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/silence_preprocessed/ | 78.1 | 1 | False | 0.779 | 1.163 | 1.493 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n", "| 58 | librispeech_glow_asr | 
/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/with_sigma/not_silence_preprocessed/ | 73.4 | 1 | False | 0.667 | 1.099 | 1.648 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n", - "| 59 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/with_sigma/not_silence_preprocessed/glow_eval/ | 63.2 | 13 | True | 0.062 | 2.427 | 39.391 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n", + "| 59 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/with_sigma/not_silence_preprocessed/glow_eval/ | 61.8 | 13 | True | 0.062 | 2.427 | 39.391 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n", "| 60 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/with_sigma/silence_preprocessing/ | 76.2 | 1 | False | 0.71 | 1.15 | 1.62 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n", "| 61 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_simple_encoder/silence_preprocessed/ | 50.3 | 1 | False | 0.046 | 1.527 | 33.333 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n", "| 62 | librispeech_glow_asr | 
/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_freezed/ | 34 | 19 | True | 0.058 | 0.913 | 15.782 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n", @@ -1259,9 +2583,9 @@ "| 81 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/tts_dataset/glow_simple_encoder_epoch84/silence_preprocessed/ | 57.4 | 1 | False | 0.079 | 1.519 | 19.301 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n", "| 82 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/tts_dataset/glow_with_small_enc/silence_preprocessed/ | 79.4 | 1 | False | 1.103 | 1.283 | 1.163 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n", "| 83 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_4x512_d0.2_b300_fs4/asr_dataset/glow_enc768/not_silence_preprocessed/ | 52.4 | 1 | False | 0.378 | 0.812 | 2.15 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n", - "| 122 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/only_blstm_2x512_d0.2_b300_fs4/asr_dataset/ | 49.4 | 13 | True | 0.028 | 1.672 | 58.743 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n", - "| 123 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/only_blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/ | 
57 | 13 | True | 0.508 | 0.863 | 1.698 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n", - "| 124 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/only_blstm_2x512_d0.2_b300_fs4/tts_dataset/ | 59.8 | 1 | False | 0.071 | 1.537 | 21.521 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n" + "| 124 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/only_blstm_2x512_d0.2_b300_fs4/asr_dataset/ | 47.7 | 13 | True | 0.028 | 1.672 | 58.743 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n", + "| 125 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/only_blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/ | 57 | 1 | False | 0.508 | 0.863 | 1.698 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n", + "| 126 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/only_blstm_2x512_d0.2_b300_fs4/tts_dataset/ | 59.8 | 1 | False | 0.071 | 1.537 | 21.521 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n" ] } ], @@ -1282,7 +2606,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 16, "metadata": {}, "outputs": [ { @@ -1291,60 +2615,67 @@ "text": [ "| | Group | Experiment | WER (dev-other) | Count | Tuned | CTC | dev CTC | overfitting | dev MLE | dev DP | Joint | Still running | Training data available | Num Epochs | LR | ASR Model Type | Missing glow.eval |\n", 
"|----:|:----------------------------------|:-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------:|--------:|:--------|:------|:----------|:--------------|:----------|:---------|:--------|:----------------|:--------------------------|:-------------|:-------------------------------------------------|:-----------------|:--------------------|\n", - "| 2 | joint_training/conformer_coupling | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/ddi/glow_ASR_conformer/ | 29.6 | 7 | True | 0.024 | 0.863 | 36.326 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", - "| 5 | joint_training/conformer_coupling | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glow_ASR_conformer/ | 29.8 | 7 | True | 0.023 | 0.866 | 37.161 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", - "| 6 | joint_training/conformer_coupling | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glow_ASR_conformer_specaugment_before/ | 18.7 | 7 | True | 0.133 | 0.444 | 3.33 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", - "| 30 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer/ | 38.3 | 7 | True | 0.037 | 1.206 | 32.173 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", - "| 31 | joint_training/default | 
/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before/ | 24.1 | 7 | True | 0.244 | 0.575 | 2.358 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", - "| 32 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_coupling_epsilon/ | 30.4 | 1 | False | 0.243 | 0.576 | 2.37 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", - "| 33 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_no_jit/ | 23.8 | 5 | True | - | - | - | - | - | False | False | True | - | - | conformer | False |\n", - "| 34 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_tts_train_segments/ | 24.6 | 8 | True | 0.242 | 0.587 | 2.424 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", - "| 35 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector/ | 35.3 | 7 | True | 0.233 | 0.9 | 3.862 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", - "| 36 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector_control/ | 46.3 | 1 | False | 0.233 | 0.916 | 3.927 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False 
|\n", - "| 37 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector_v2/ | 24.5 | 7 | True | 0.23 | 0.59 | 2.568 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", - "| 38 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector_v3/ | 24.6 | 7 | True | 0.239 | 0.594 | 2.488 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", - "| 39 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_xvector/ | 43.5 | 1 | False | 0.251 | 0.762 | 3.03 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", - "| 40 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_xvector_eval/ | 37 | 7 | True | 0.167 | 0.931 | 5.565 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", - "| 41 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_xvectorgrad_clip_10/ | 32.4 | 7 | True | 0.244 | 0.761 | 3.119 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", - "| 42 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer/ | 29.5 | 7 | True | 0.216 | 0.73 | 3.38 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | 
conformer | False |\n", - "| 43 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer_spec_augment/ | 25.6 | 7 | True | 0.491 | 0.587 | 1.196 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", - "| 44 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer_spec_augment_tts_train_segments/ | 25.4 | 7 | True | 0.502 | 0.592 | 1.181 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", + "| 0 | joint_training/conformer_coupling | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/ddi/glow_ASR_conformer/ | 29.6 | 7 | True | 0.024 | 0.863 | 36.326 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", + "| 2 | joint_training/conformer_coupling | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glow_ASR_conformer/ | 29.8 | 7 | True | 0.023 | 0.866 | 37.161 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", + "| 3 | joint_training/conformer_coupling | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glow_ASR_conformer_specaugment_before/ | 18.7 | 7 | True | 0.133 | 0.444 | 3.33 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", + "| 29 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer/ | 37.7 | 19 | True | 0.037 | 1.206 | 32.173 | - | - | False | False | 
True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", + "| 30 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before/ | 23.3 | 19 | True | 0.244 | 0.575 | 2.358 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", + "| 31 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_coupling_epsilon/ | 30.4 | 1 | False | 0.243 | 0.576 | 2.37 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", + "| 32 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_no_jit/ | 23.8 | 5 | True | - | - | - | - | - | False | False | True | - | - | conformer | False |\n", + "| 33 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_tts_train_segments/ | 24.6 | 8 | True | 0.242 | 0.587 | 2.424 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", + "| 34 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector/ | 35.3 | 7 | True | 0.233 | 0.9 | 3.862 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", + "| 35 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector_control/ | 46.3 | 1 | False | 0.233 | 
0.916 | 3.927 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", + "| 36 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector_v2/ | 23.6 | 19 | True | 0.23 | 0.59 | 2.568 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", + "| 37 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_specaugment_before_xvector_v3/ | 24.6 | 7 | True | 0.239 | 0.594 | 2.488 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", + "| 38 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_xvector/ | 43.5 | 1 | False | 0.251 | 0.762 | 3.03 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", + "| 39 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_xvector_eval/ | 36 | 19 | True | 0.167 | 0.931 | 5.565 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", + "| 40 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glow_ASR_conformer_xvectorgrad_clip_10/ | 31.6 | 19 | True | 0.244 | 0.761 | 3.119 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", + "| 41 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer/ | 28.3 | 19 
| True | 0.216 | 0.73 | 3.38 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", + "| 42 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer_spec_augment/ | 24.2 | 19 | True | 0.491 | 0.587 | 1.196 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", + "| 43 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer_spec_augment_tts_train_segments/ | 25.4 | 7 | True | 0.502 | 0.592 | 1.181 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", + "| 47 | joint_training/given_alignments | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/second_step_asr/frozen_glowtts/glowASR_conformer_x_vector/ | 21 | 19 | True | 0.27 | 0.492 | 1.826 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", "| 84 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/ | 28.1 | 19 | True | 0.001 | 1.248 | 2248.206 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", "| 85 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_not_pretrained/ | 28.7 | 19 | True | 0.001 | 1.295 | 1861.864 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", - "| 86 | 
librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc768_200ep_dec_0.05_v2/ | 49.1 | 13 | True | 0.006 | 2.364 | 381.61 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", - "| 87 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/ | 36.1 | 13 | True | 0.002 | 1.65 | 869.021 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", + "| 86 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc768_200ep_dec_0.05_v2/ | 49.1 | 19 | True | 0.006 | 2.364 | 381.61 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", + "| 87 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/ | 36 | 19 | True | 0.002 | 1.65 | 869.021 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", "| 88 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/no_glow/ | 27.1 | 13 | True | 0.001 | 1.193 | 975.256 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", - "| 89 | librispeech_glow_asr | 
/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.3/ | 20 | 13 | True | 0.331 | 0.453 | 1.37 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", - "| 90 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.5/ | 19.7 | 13 | True | 0.335 | 0.448 | 1.338 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", - "| 91 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_1/ | 18.9 | 13 | True | 0.32 | 0.439 | 1.374 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", - "| 92 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_dec_0.05_v2/ | 38 | 13 | True | 0.321 | 0.894 | 2.788 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", - "| 93 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/ | 26.1 | 19 | True | 0.003 | 1.046 | 335.647 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", - "| 94 | librispeech_glow_asr | 
/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed_v2/ | 26.6 | 1 | False | 0.216 | 0.589 | 2.724 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", - "| 95 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed_v3/ | 26.3 | 1 | False | 0.211 | 0.591 | 2.799 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", - "| 96 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/ | 26.6 | 19 | True | 0.003 | 1.081 | 368.105 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", - "| 97 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained_control/ | 18.3 | 19 | True | 0.08 | 0.548 | 6.834 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", - "| 98 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_100ep_xvector/ | 23.6 | 13 | True | 0.42 | 0.476 | 1.133 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", - "| 99 | librispeech_glow_asr | 
/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_100ep_xvector_v2/ | 19.7 | 13 | True | 0.281 | 0.458 | 1.63 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", - "| 100 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_200ep_dec_0.05_v2/ | 38.2 | 13 | True | 0.285 | 0.975 | 3.417 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", - "| 101 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc192_200ep_dec_0.05_v2/ | 25.3 | 13 | True | 0.33 | 0.543 | 1.644 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", - "| 102 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/ | 22 | 13 | True | 0.279 | 0.488 | 1.75 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", - "| 103 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/no_glow/ | 18 | 15 | True | 0.083 | 0.544 | 6.584 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", - "| 104 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment_before/glow_enc192_not_pretrained/ | 17.6 | 19 | True | 0.086 
| 0.512 | 5.958 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", - "| 105 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/weak_baseline/ | 25.5 | 7 | True | 0.5 | 0.589 | 1.177 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", - "| 106 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed/ | 25.5 | 13 | True | 0.03 | 0.732 | 24.376 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | True |\n", - "| 107 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed/ | 21.2 | 13 | True | 0.345 | 0.462 | 1.339 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | True |\n", - "| 108 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.1/ | 25.8 | 1 | False | 0.406 | 0.422 | 1.041 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | True |\n", - "| 109 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.2/ | 25.5 | 1 | False | 0.413 | 0.417 | 1.01 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 
110: 0.0007, 250: 1e-08] | conformer | True |\n", - "| 110 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.3/ | 26.2 | 1 | False | 0.44 | 0.415 | 0.943 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | True |\n", - "| 111 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.4/ | 25.2 | 1 | False | 0.416 | 0.413 | 0.993 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | True |\n", - "| 112 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.5/ | 20 | 13 | True | 0.416 | 0.418 | 1.005 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | True |\n", - "| 113 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_1/ | 19.4 | 13 | True | 0.364 | 0.417 | 1.144 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | True |\n", - "| 114 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed/ | 21.8 | 14 | True | 0.368 | 0.474 | 1.287 | - | - | False | False | True | 250 | [0: 7e-06, 109: 
0.0007, 110: 0.0007, 250: 1e-08] | conformer | True |\n", - "| 115 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed/batch_norm/ | 29.4 | 1 | False | 0.392 | 0.504 | 1.285 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | True |\n", - "| 116 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed/layer_norm/ | 28.5 | 1 | False | 0.407 | 0.488 | 1.197 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | True |\n", - "| 117 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc768_200ep_not_silence_preprocessed/ | 100 | 13 | True | 3.463 | 5.332 | 1.54 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | True |\n", - "| 118 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment_before/glow_enc192_200ep_not_silence_preprocessed/ | 100 | 1 | False | 3.454 | 5.802 | 1.68 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | True |\n", - "| 119 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/tts_dataset/spec_augment/no_glow/ | 20.9 | 1 | False | 0.091 | 0.419 | 4.591 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n" + "| 89 | 
librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.3/ | 20 | 16 | True | 0.331 | 0.453 | 1.37 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", + "| 90 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.5/ | 19.7 | 16 | True | 0.335 | 0.448 | 1.338 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", + "| 91 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0/ | 22.9 | 16 | True | 0.214 | 0.575 | 2.69 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", + "| 92 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_1/ | 18.9 | 16 | True | 0.32 | 0.439 | 1.374 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", + "| 93 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_dec_0.05_v2/ | 37.5 | 19 | True | 0.321 | 0.894 | 2.788 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", + "| 94 | librispeech_glow_asr | 
/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/ | 26.1 | 19 | True | 0.003 | 1.046 | 335.647 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", + "| 95 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed_v2/ | 26.6 | 1 | False | 0.216 | 0.589 | 2.724 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", + "| 96 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed_v3/ | 26.3 | 1 | False | 0.211 | 0.591 | 2.799 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", + "| 97 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/ | 26.5 | 19 | True | 0.003 | 1.081 | 368.105 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", + "| 98 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained_control/ | 18.3 | 19 | True | 0.08 | 0.548 | 6.834 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", + "| 99 | librispeech_glow_asr | 
/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_100ep_xvector/ | 23.6 | 13 | True | 0.42 | 0.476 | 1.133 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", + "| 100 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_100ep_xvector_v2/ | 19.7 | 13 | True | 0.281 | 0.458 | 1.63 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", + "| 101 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc768_200ep_dec_0.05_v2/ | 37.5 | 19 | True | 0.285 | 0.975 | 3.417 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", + "| 102 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc192_200ep_dec_0.05_v2/ | 24.8 | 19 | True | 0.33 | 0.543 | 1.644 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", + "| 103 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc768_100ep_dec_0.05_v2/ | 21.2 | 19 | True | 0.27 | 0.479 | 1.772 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", + "| 104 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/ 
| 21.4 | 19 | True | 0.279 | 0.488 | 1.75 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", + "| 105 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/no_glow/ | 18 | 15 | True | 0.083 | 0.544 | 6.584 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", + "| 106 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment_before/glow_enc192_not_pretrained/ | 17.6 | 19 | True | 0.086 | 0.512 | 5.958 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", + "| 107 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/weak_baseline/ | 25.5 | 7 | True | 0.5 | 0.589 | 1.177 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", + "| 108 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed/ | 25.2 | 13 | True | 0.03 | 0.732 | 24.376 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | True |\n", + "| 109 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed/ | 21.2 | 13 | True | 0.345 | 0.462 | 1.339 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | True |\n", + "| 110 | 
librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.1/ | 25.8 | 1 | False | 0.406 | 0.422 | 1.041 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | True |\n", + "| 111 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.2/ | 25.5 | 1 | False | 0.413 | 0.417 | 1.01 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | True |\n", + "| 112 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.3/ | 26.2 | 1 | False | 0.44 | 0.415 | 0.943 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | True |\n", + "| 113 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.4/ | 25.2 | 1 | False | 0.416 | 0.413 | 0.993 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | True |\n", + "| 114 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0.5/ | 20 | 13 | True | 0.416 | 0.418 | 1.005 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | 
True |\n", + "| 115 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_1/ | 19.4 | 13 | True | 0.364 | 0.417 | 1.144 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | True |\n", + "| 116 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed/ | 21.8 | 14 | True | 0.368 | 0.474 | 1.287 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | True |\n", + "| 117 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed/batch_norm/ | 29.4 | 1 | False | 0.392 | 0.504 | 1.285 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | True |\n", + "| 118 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed/layer_norm/ | 28.5 | 1 | False | 0.407 | 0.488 | 1.197 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | True |\n", + "| 119 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc768_200ep_not_silence_preprocessed/ | 100 | 13 | True | 3.463 | 5.332 | 1.54 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | True |\n", + "| 120 | 
librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment_before/glow_enc192_200ep_not_silence_preprocessed/ | 100 | 1 | False | 3.454 | 5.802 | 1.68 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | True |\n", + "| 121 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/tts_dataset/spec_augment/no_glow/ | 20.9 | 1 | False | 0.091 | 0.419 | 4.591 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", + "| 127 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/ | 37.4 | 19 | True | 0.039 | 1.198 | 30.666 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", + "| 128 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/ | 33.8 | 19 | True | 0.917 | 0.753 | 0.821 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", + "| 129 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment_before/glow_not_pretrained | 23.3 | 18 | True | - | - | - | - | - | False | False | True | - | - | conformer | False |\n", + "| 130 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment_before/glow_not_pretrained/ | 30.5 | 1 | False | 0.245 
| 0.58 | 2.37 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n" ] } ], @@ -1358,26 +2689,29 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 17, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "| | Group | Experiment | WER (dev-other) | Count | Tuned | CTC | dev CTC | overfitting | dev MLE | dev DP | Joint | Still running | Training data available | Num Epochs | LR | ASR Model Type | Missing glow.eval |\n", - "|----:|:---------------------|:-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------:|--------:|:--------|------:|----------:|--------------:|:----------|:---------|:--------|:----------------|:--------------------------|-------------:|:-------------------------------------------------|:-----------------|:--------------------|\n", - "| 62 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_freezed/ | 34 | 19 | True | 0.058 | 0.913 | 15.782 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n", - "| 63 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_pretrained/ | 33.8 | 19 | True | 0.003 | 1.209 | 454.454 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n", - "| 64 | librispeech_glow_asr | 
/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_freezed/ | 31.4 | 19 | True | 0.024 | 0.926 | 39.211 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n", - "| 65 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_pretrained/ | 23.9 | 19 | True | 0.187 | 0.566 | 3.019 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n", - "| 68 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment_before/glow_enc192_200ep_not_freezed/ | 30.9 | 1 | False | 0.172 | 0.56 | 3.248 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n", - "| 84 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/ | 28.1 | 19 | True | 0.001 | 1.248 | 2248.21 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", - "| 85 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_not_pretrained/ | 28.7 | 19 | True | 0.001 | 1.295 | 1861.86 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", - "| 93 | librispeech_glow_asr | 
/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/ | 26.1 | 19 | True | 0.003 | 1.046 | 335.647 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", - "| 96 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/ | 26.6 | 19 | True | 0.003 | 1.081 | 368.105 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", - "| 97 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained_control/ | 18.3 | 19 | True | 0.08 | 0.548 | 6.834 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", - "| 104 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment_before/glow_enc192_not_pretrained/ | 17.6 | 19 | True | 0.086 | 0.512 | 5.958 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n" + "| | Group | Experiment | WER (dev-other) | Count | Tuned | CTC | dev CTC | overfitting | dev MLE | dev DP | Joint | Still running | Training data available | Num Epochs | LR | ASR Model Type | Missing glow.eval |\n", + 
"|----:|:---------------------|:-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------:|--------:|:--------|:------|:----------|:--------------|:----------|:---------|:--------|:----------------|:--------------------------|:-------------|:-------------------------------------------------|:-----------------|:--------------------|\n", + "| 62 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_freezed/ | 34 | 19 | True | 0.058 | 0.913 | 15.782 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n", + "| 63 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/no_spec_augment/glow_enc192_200ep_not_pretrained/ | 33.8 | 19 | True | 0.003 | 1.209 | 454.454 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n", + "| 64 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_freezed/ | 31.4 | 19 | True | 0.024 | 0.926 | 39.211 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n", + "| 65 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/glow_enc192_200ep_not_pretrained/ | 23.9 | 19 | True | 0.187 | 0.566 | 3.019 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n", + 
"| 68 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment_before/glow_enc192_200ep_not_freezed/ | 30.9 | 1 | False | 0.172 | 0.56 | 3.248 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n", + "| 84 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/ | 28.1 | 19 | True | 0.001 | 1.248 | 2248.206 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", + "| 85 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/glow_enc192_not_pretrained/ | 28.7 | 19 | True | 0.001 | 1.295 | 1861.864 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", + "| 94 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed_not_freezed/ | 26.1 | 19 | True | 0.003 | 1.046 | 335.647 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", + "| 97 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/ | 26.5 | 19 | True | 0.003 | 1.081 | 368.105 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", + "| 98 | librispeech_glow_asr | 
/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained_control/ | 18.3 | 19 | True | 0.08 | 0.548 | 6.834 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", + "| 106 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment_before/glow_enc192_not_pretrained/ | 17.6 | 19 | True | 0.086 | 0.512 | 5.958 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", + "| 127 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/ | 37.4 | 19 | True | 0.039 | 1.198 | 30.666 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", + "| 129 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment_before/glow_not_pretrained | 23.3 | 18 | True | - | - | - | - | - | False | False | True | - | - | conformer | False |\n", + "| 130 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/weak_conformer/asr_dataset/spec_augment_before/glow_not_pretrained/ | 30.5 | 1 | False | 0.245 | 0.58 | 2.37 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n" ] } ], @@ -1399,20 +2733,20 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 18, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "/var/tmp/ipykernel_2362097/226959626.py:22: SettingWithCopyWarning: \n", + 
"/u/lukas.rilling/experiments/glow_tts_asr_v2/recipe/i6_experiments/users/rilling/evaluation/ipykernel_2908543/226959626.py:22: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " df_joint[\"auto MOS\"] = mos\n", - "/var/tmp/ipykernel_2362097/226959626.py:23: SettingWithCopyWarning: \n", + "/u/lukas.rilling/experiments/glow_tts_asr_v2/recipe/i6_experiments/users/rilling/evaluation/ipykernel_2908543/226959626.py:23: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", @@ -1450,7 +2784,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 19, "metadata": {}, "outputs": [ { @@ -1459,33 +2793,32 @@ "text": [ "| | Group | Experiment | WER (dev-other) | Count | Tuned | CTC | dev CTC | overfitting | dev MLE | dev DP | Joint | Still running | Training data available | Num Epochs | LR | ASR Model Type | Missing glow.eval | auto MOS | sWER |\n", "|---:|:----------------------------------|:-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------:|--------:|:--------|------:|----------:|--------------:|----------:|---------:|:--------|:----------------|:--------------------------|-------------:|:-------------------------------------------------|:-----------------|:--------------------|-----------:|-------:|\n", - "| 0 | joint_training/conformer_coupling | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/ddi/glowTTS_ASR_conformer_two_forward_pass/ | 23.9 | 7 | True | 0.292 | 
0.564 | 1.933 | -0.670671 | 2.09134 | True | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False | 1.82 | 95.7 |\n", - "| 1 | joint_training/conformer_coupling | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/ddi/glowTTS_ASR_conformer_x_vector/ | 34.6 | 7 | True | 0.204 | 0.881 | 4.312 | -0.465325 | 0.229336 | True | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False | 2.14 | 5.2 |\n", - "| 3 | joint_training/conformer_coupling | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glowTTS_ASR_conformer_two_forward_pass/ | 24 | 7 | True | 0.289 | 0.569 | 1.967 | -0.692025 | 0.412498 | True | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False | 2.45 | 20.9 |\n", - "| 4 | joint_training/conformer_coupling | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glowTTS_ASR_conformer_x_vector_v2/ | 34.6 | 7 | True | 0.204 | 0.881 | 4.312 | -0.465325 | 0.229336 | True | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False | 2.14 | 5.2 |\n", - "| 7 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass/ | 23.8 | 7 | True | 0.3 | 0.582 | 1.94 | -0.699266 | 0.370426 | True | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False | 2.61 | 14.8 |\n", - "| 8 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_ctc_scale_0.1/ | 24 | 7 | True | 0.431 | 0.558 | 1.294 | -0.805268 | 0.355401 | True | False | True | 250 | [0: 7e-06, 109: 
0.0007, 110: 0.0007, 250: 1e-08] | conformer | False | 3.16 | 15.2 |\n", - "| 9 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_ddi_actnorm/ | 66.7 | 1 | False | 0.282 | 0.583 | 2.068 | -0.696243 | 0.401243 | True | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False | nan | nan |\n", - "| 10 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector/ | 38.4 | 7 | True | 0.103 | 1.099 | 10.685 | -0.548408 | 0.701494 | True | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False | 1.58 | 98 |\n", - "| 11 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_control/ | 38.5 | 1 | False | 0.216 | 0.735 | 3.408 | -0.779522 | 1.54937 | True | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False | nan | nan |\n", - "| 12 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_control_radam/ | 38.3 | 1 | False | 0.214 | 0.731 | 3.414 | -0.780336 | 1.46005 | True | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False | nan | nan |\n", - "| 13 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_control_spec_augment/ | 37.9 | 1 | False | 0.501 | 0.588 | 1.173 | -0.77829 | 1.4497 | True | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False | nan | nan |\n", - "| 14 | joint_training/default | 
/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_ctc_scale_0.1/ | 34.7 | 7 | True | 0.142 | 0.927 | 6.549 | -0.769183 | 0.36247 | True | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False | 3.15 | 7.6 |\n", - "| 15 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_ddi_actnorm/ | 94.4 | 1 | False | 0.165 | 1.062 | 6.429 | -0.520394 | 0.214735 | True | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False | nan | nan |\n", - "| 16 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_encoder_sample_ctc_scale_0.1/ | 100 | 1 | False | 3.481 | 6.325 | 1.817 | -0.781605 | 1.1355 | True | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False | nan | nan |\n", - "| 17 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_radam/ | 43.8 | 1 | False | 0.098 | 1.135 | 11.599 | -0.582927 | 0.348165 | True | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False | nan | nan |\n", - "| 18 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_spec_augment/ | 37 | 7 | True | 0.197 | 0.946 | 4.803 | -0.521236 | 0.225934 | True | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False | 2.3 | 5.2 |\n", - "| 19 | joint_training/default | 
/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_spec_augment_ctc_scale_0.1/ | 29.4 | 7 | True | 0.409 | 0.689 | 1.685 | -0.759844 | 0.329289 | True | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False | 3.11 | 4.6 |\n", - "| 20 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_spec_augment_ctc_scale_0.1_tts_segments/ | 29.6 | 7 | True | 0.409 | 0.696 | 1.703 | -0.75744 | 0.330754 | True | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False | nan | nan |\n", - "| 21 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_trainXvector/ | 47 | 1 | False | 0.046 | 1.485 | 32.45 | -0.592337 | 0.586278 | True | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False | nan | nan |\n", - "| 22 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass/ | 39.7 | 7 | True | 0.432 | 0.863 | 1.998 | -0.629746 | 0.403638 | True | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False | 2.75 | 9.5 |\n", - "| 23 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_ctc_scale_0.1/ | 52.4 | 7 | True | 0.455 | 1.104 | 2.429 | -0.771774 | 0.399888 | True | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False | 3.12 | 13.3 |\n", - "| 24 | joint_training/default | 
/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2/ | 47.7 | 7 | True | 0.386 | 1.015 | 2.63 | -0.641523 | 0.379653 | True | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False | 2.64 | 9.9 |\n", - "| 25 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2_ctc_scale_0.1/ | 50.7 | 7 | True | 0.441 | 1.084 | 2.461 | -0.778751 | 0.379635 | True | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False | 3.18 | 11.2 |\n", - "| 26 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2/ | 44.1 | 7 | True | 0.058 | 1.44 | 25.027 | -0.591349 | 0.326762 | True | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False | 2.48 | 6.7 |\n", - "| 27 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_ctc_scale_0.1/ | 36.1 | 7 | True | 0.128 | 1.01 | 7.915 | -0.772961 | 0.346386 | True | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False | 3.19 | 7.9 |\n", - "| 28 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment/ | 42.6 | 7 | True | 0.136 | 1.228 | 9.014 | -0.535346 | 0.203592 | True | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False | 2.32 | 6 |\n", - "| 29 | joint_training/default | 
/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment_ctc_scale_0.1/ | 31.9 | 7 | True | 0.378 | 0.76 | 2.009 | -0.764738 | 0.306534 | True | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False | 3.2 | 4.4 |\n" + "| 1 | joint_training/conformer_coupling | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glowTTS_ASR_conformer_x_vector_v2/ | 33.1 | 7 | True | 0.271 | 0.863 | 3.185 | -0.803254 | 0.310085 | True | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False | 2.05 | 4.2 |\n", + "| 4 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_blstm_x_vector/ | 47.7 | 7 | True | 0.002 | 2.18 | 1163.98 | -0.692515 | 0.302857 | True | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | blstm | False | 3.11 | 16 |\n", + "| 5 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_blstm_x_vector_specaug/ | 44 | 1 | False | 0.027 | 1.443 | 53.493 | -0.638289 | 0.197909 | True | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | blstm | False | 3.01 | 6.3 |\n", + "| 6 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass/ | 23 | 19 | True | 0.3 | 0.582 | 1.94 | -0.699266 | 0.370426 | True | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False | 2.61 | 14.8 |\n", + "| 7 | joint_training/default | 
/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_ctc_scale_0.1/ | 22.9 | 19 | True | 0.431 | 0.558 | 1.294 | -0.805268 | 0.355401 | True | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False | 3.16 | 15.2 |\n", + "| 8 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_ddi_actnorm/ | 66.7 | 1 | False | 0.282 | 0.583 | 2.068 | -0.696243 | 0.401243 | True | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False | nan | nan |\n", + "| 9 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector/ | 38.4 | 7 | True | 0.103 | 1.099 | 10.685 | -0.548408 | 0.701494 | True | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False | 1.58 | 98 |\n", + "| 10 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_control/ | 38.5 | 1 | False | 0.216 | 0.735 | 3.408 | -0.779522 | 1.54937 | True | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False | nan | nan |\n", + "| 11 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_control_radam/ | 38.3 | 1 | False | 0.214 | 0.731 | 3.414 | -0.780336 | 1.46005 | True | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False | nan | nan |\n", + "| 12 | joint_training/default | 
/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_control_spec_augment/ | 37.9 | 1 | False | 0.501 | 0.588 | 1.173 | -0.77829 | 1.4497 | True | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False | nan | nan |\n", + "| 13 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_ctc_scale_0.1/ | 34.7 | 7 | True | 0.142 | 0.927 | 6.549 | -0.769183 | 0.36247 | True | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False | 3.15 | 7.6 |\n", + "| 14 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_ddi_actnorm/ | 94.4 | 1 | False | 0.165 | 1.062 | 6.429 | -0.520394 | 0.214735 | True | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False | nan | nan |\n", + "| 15 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_encoder_sample_ctc_scale_0.1/ | 100 | 1 | False | 3.481 | 6.325 | 1.817 | -0.781605 | 1.1355 | True | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False | nan | nan |\n", + "| 16 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_radam/ | 43.8 | 1 | False | 0.098 | 1.135 | 11.599 | -0.582927 | 0.348165 | True | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False | nan | nan |\n", + "| 17 | joint_training/default | 
/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_spec_augment/ | 37 | 7 | True | 0.197 | 0.946 | 4.803 | -0.521236 | 0.225934 | True | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False | 2.3 | 5.2 |\n", + "| 18 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_spec_augment_ctc_scale_0.1/ | 29.4 | 7 | True | 0.409 | 0.689 | 1.685 | -0.759844 | 0.329289 | True | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False | 3.11 | 4.6 |\n", + "| 19 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_spec_augment_ctc_scale_0.1_tts_segments/ | 29.6 | 7 | True | 0.409 | 0.696 | 1.703 | -0.75744 | 0.330754 | True | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False | nan | nan |\n", + "| 20 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_trainXvector/ | 47 | 1 | False | 0.046 | 1.485 | 32.45 | -0.592337 | 0.586278 | True | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False | nan | nan |\n", + "| 21 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass/ | 39.7 | 7 | True | 0.432 | 0.863 | 1.998 | -0.629746 | 0.403638 | True | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False | 2.75 | 9.5 |\n", + "| 22 | joint_training/default | 
/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_ctc_scale_0.1/ | 52.4 | 7 | True | 0.455 | 1.104 | 2.429 | -0.771774 | 0.399888 | True | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False | 3.12 | 13.3 |\n", + "| 23 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2/ | 45 | 19 | True | 0.386 | 1.015 | 2.63 | -0.641523 | 0.379653 | True | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False | 2.64 | 9.9 |\n", + "| 24 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2_ctc_scale_0.1/ | 48.1 | 19 | True | 0.441 | 1.084 | 2.461 | -0.778751 | 0.379635 | True | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False | 3.18 | 11.2 |\n", + "| 25 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2/ | 43 | 19 | True | 0.058 | 1.44 | 25.027 | -0.591349 | 0.326762 | True | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False | 2.48 | 6.7 |\n", + "| 26 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_ctc_scale_0.1/ | 34.8 | 19 | True | 0.128 | 1.01 | 7.915 | -0.772961 | 0.346386 | True | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False | 3.19 | 7.9 |\n", + "| 27 | joint_training/default | 
/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment/ | 41.6 | 19 | True | 0.136 | 1.228 | 9.014 | -0.535346 | 0.203592 | True | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False | 2.32 | 6 |\n", + "| 28 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment_ctc_scale_0.1/ | 30.5 | 19 | True | 0.378 | 0.76 | 2.009 | -0.764738 | 0.306534 | True | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False | 3.2 | 4.4 |\n" ] } ], @@ -1502,7 +2835,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 20, "metadata": {}, "outputs": [ { @@ -1511,15 +2844,15 @@ "text": [ "| | Group | Experiment | WER (dev-other) | Count | Tuned | CTC | dev CTC | overfitting | dev MLE | dev DP | Joint | Still running | Training data available | Num Epochs | LR | ASR Model Type | Missing glow.eval |\n", "|----:|:-----------------------|:------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------:|--------:|:--------|------:|----------:|--------------:|:----------|:---------|:--------|:----------------|:--------------------------|-------------:|:-------------------------------------------------|:-----------------|:--------------------|\n", - "| 42 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer/ | 29.5 | 7 | True | 0.216 | 0.73 | 3.38 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", - "| 43 | joint_training/default | 
/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer_spec_augment/ | 25.6 | 7 | True | 0.491 | 0.587 | 1.196 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", - "| 44 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer_spec_augment_tts_train_segments/ | 25.4 | 7 | True | 0.502 | 0.592 | 1.181 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", + "| 41 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer/ | 28.3 | 19 | True | 0.216 | 0.73 | 3.38 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", + "| 42 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer_spec_augment/ | 24.2 | 19 | True | 0.491 | 0.587 | 1.196 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", + "| 43 | joint_training/default | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/only_conformer_spec_augment_tts_train_segments/ | 25.4 | 7 | True | 0.502 | 0.592 | 1.181 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", "| 88 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/no_spec_augment/no_glow/ | 27.1 | 13 | True | 0.001 | 1.193 | 975.256 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", - "| 103 | 
librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/no_glow/ | 18 | 15 | True | 0.083 | 0.544 | 6.584 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", - "| 119 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/tts_dataset/spec_augment/no_glow/ | 20.9 | 1 | False | 0.091 | 0.419 | 4.591 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", - "| 122 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/only_blstm_2x512_d0.2_b300_fs4/asr_dataset/ | 49.4 | 13 | True | 0.028 | 1.672 | 58.743 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n", - "| 123 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/only_blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/ | 57 | 13 | True | 0.508 | 0.863 | 1.698 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n", - "| 124 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/only_blstm_2x512_d0.2_b300_fs4/tts_dataset/ | 59.8 | 1 | False | 0.071 | 1.537 | 21.521 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n" + "| 105 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/no_glow/ | 18 | 15 | True | 0.083 | 0.544 | 6.584 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | 
conformer | False |\n", + "| 121 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/tts_dataset/spec_augment/no_glow/ | 20.9 | 1 | False | 0.091 | 0.419 | 4.591 | - | - | False | False | True | 250 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | conformer | False |\n", + "| 124 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/only_blstm_2x512_d0.2_b300_fs4/asr_dataset/ | 47.7 | 13 | True | 0.028 | 1.672 | 58.743 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n", + "| 125 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/only_blstm_2x512_d0.2_b300_fs4/asr_dataset/spec_augment/ | 57 | 1 | False | 0.508 | 0.863 | 1.698 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n", + "| 126 | librispeech_glow_asr | /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/librispeech_glow_asr/pytorch/only_blstm_2x512_d0.2_b300_fs4/tts_dataset/ | 59.8 | 1 | False | 0.071 | 1.537 | 21.521 | - | - | False | False | True | 100 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | blstm | False |\n" ] } ], @@ -1530,7 +2863,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 21, "metadata": {}, "outputs": [], "source": [ @@ -1539,7 +2872,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 22, "metadata": {}, "outputs": [ { @@ -1549,7 +2882,7 @@ "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[21], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m 
\u001b[43mdf_training_avail\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mplot\u001b[49m\u001b[43m(\u001b[49m\u001b[43mkind\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mscatter\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mx\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mWER (dev-other)\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43my\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mCTC\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n", + "Cell \u001b[0;32mIn[22], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mdf_training_avail\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mplot\u001b[49m\u001b[43m(\u001b[49m\u001b[43mkind\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mscatter\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mx\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mWER (dev-other)\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43my\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mCTC\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n", "File \u001b[0;32m/work/tools22/users/lukas.rilling/sis_env/lib/python3.10/site-packages/pandas/plotting/_core.py:976\u001b[0m, in \u001b[0;36mPlotAccessor.__call__\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 974\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m kind \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_dataframe_kinds:\n\u001b[1;32m 975\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(data, 
ABCDataFrame):\n\u001b[0;32m--> 976\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mplot_backend\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mplot\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdata\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mx\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mx\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43my\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43my\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mkind\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mkind\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 977\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 978\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mplot kind \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mkind\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m can only be used for data frames\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", "File \u001b[0;32m/work/tools22/users/lukas.rilling/sis_env/lib/python3.10/site-packages/pandas/plotting/_matplotlib/__init__.py:71\u001b[0m, in \u001b[0;36mplot\u001b[0;34m(data, kind, **kwargs)\u001b[0m\n\u001b[1;32m 69\u001b[0m kwargs[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124max\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mgetattr\u001b[39m(ax, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mleft_ax\u001b[39m\u001b[38;5;124m\"\u001b[39m, ax)\n\u001b[1;32m 70\u001b[0m plot_obj \u001b[38;5;241m=\u001b[39m PLOT_CLASSES[kind](data, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[0;32m---> 71\u001b[0m \u001b[43mplot_obj\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mgenerate\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 72\u001b[0m 
plot_obj\u001b[38;5;241m.\u001b[39mdraw()\n\u001b[1;32m 73\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m plot_obj\u001b[38;5;241m.\u001b[39mresult\n", "File \u001b[0;32m/work/tools22/users/lukas.rilling/sis_env/lib/python3.10/site-packages/pandas/plotting/_matplotlib/core.py:453\u001b[0m, in \u001b[0;36mMPLPlot.generate\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 451\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compute_plot_data()\n\u001b[1;32m 452\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_setup_subplots()\n\u001b[0;32m--> 453\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_make_plot\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 454\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_add_table()\n\u001b[1;32m 455\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_make_legend()\n", @@ -1651,7 +2984,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.13" + "version": "3.10.12" } }, "nbformat": 4, diff --git a/users/rilling/experiments/librispeech/librispeech_conformer_cb_joint/experiments.py b/users/rilling/experiments/librispeech/librispeech_conformer_cb_joint/experiments.py index 2cda50a90..3504f73c8 100644 --- a/users/rilling/experiments/librispeech/librispeech_conformer_cb_joint/experiments.py +++ b/users/rilling/experiments/librispeech/librispeech_conformer_cb_joint/experiments.py @@ -170,6 +170,7 @@ def tune_lm( asr_test_datasets, num_epochs, search_args, + additional_training_args={}, lm_weights=[1.5, 2.0, 2.5, 3.0, 3.5, 4.0], ): for lm in lm_weights: @@ -179,6 +180,7 @@ def tune_lm( training_datasets, asr_test_datasets, num_epochs, + training_args=additional_training_args, search_args={**search_args, **{"lm_weight": lm}}, tts_forward=False, ) @@ -501,23 +503,23 @@ def tune_lm( experiments[alias] = exp_dict net_module = "glowTTS_ASR_conformer_two_forward_pass" - 
train_args["network_module"] = net_module - alias = "ddi/" + net_module - exp_dict = run_exp( - alias, - train_args, - training_datasets, - asr_test_datasets, - 250, - forward_args=forward_args, - search_args=default_search_args, - tts_forward=True, - tts_eval_datasets=tts_forward_datasets, - ) + # train_args["network_module"] = net_module + # alias = "ddi/" + net_module + # exp_dict = run_exp( + # alias, + # train_args, + # training_datasets, + # asr_test_datasets, + # 250, + # forward_args=forward_args, + # search_args=default_search_args, + # tts_forward=True, + # tts_eval_datasets=tts_forward_datasets, + # ) - experiments[alias] = exp_dict + # experiments[alias] = exp_dict - tune_lm(alias, train_args, training_datasets, asr_test_datasets, 250, search_args=default_search_args) + # tune_lm(alias, train_args, training_datasets, asr_test_datasets, 250, search_args=default_search_args) train_args_no_ddi["network_module"] = net_module alias = "no_ddi/" + net_module @@ -527,6 +529,7 @@ def tune_lm( training_datasets, asr_test_datasets, 250, + training_args={"ctc_scale": 0.1}, forward_args=forward_args, search_args=default_search_args, tts_forward=True, @@ -535,7 +538,7 @@ def tune_lm( experiments[alias] = exp_dict - tune_lm(alias, train_args_no_ddi, training_datasets, asr_test_datasets, 250, search_args=default_search_args) + tune_lm(alias, train_args_no_ddi, training_datasets, asr_test_datasets, 250, search_args=default_search_args, additional_training_args={"ctc_scale": 0.1}) net_module = "glow_ASR_conformer" train_args["network_module"] = net_module @@ -621,34 +624,35 @@ def tune_lm( net_module = "glowTTS_ASR_conformer_x_vector" train_args_with_x_vector["network_module"] = net_module - alias = "ddi/" + net_module - exp_dict = run_exp( - alias, - train_args_with_x_vector, - training_datasets, - asr_test_datasets, - 250, - forward_args=forward_args, - search_args=default_search_args, - tts_forward=True, - tts_eval_datasets=tts_forward_datasets_xvectors, - ) - 
tune_lm(alias, train_args_with_x_vector, training_datasets, asr_test_datasets, 250, search_args=default_search_args) + # alias = "ddi/" + net_module + # exp_dict = run_exp( + # alias, + # train_args_with_x_vector, + # training_datasets, + # asr_test_datasets, + # 250, + # forward_args=forward_args, + # search_args=default_search_args, + # tts_forward=True, + # tts_eval_datasets=tts_forward_datasets_xvectors, + # ) + # tune_lm(alias, train_args_with_x_vector, training_datasets, asr_test_datasets, 250, search_args=default_search_args) net_module = "glowTTS_ASR_conformer_x_vector_v2" train_args_with_x_vector_no_ddi["network_module"] = net_module alias = "no_ddi/" + net_module exp_dict = run_exp( alias, - train_args_with_x_vector, + train_args_with_x_vector_no_ddi, training_datasets, asr_test_datasets, 250, + training_args={"ctc_scale": 0.1}, forward_args=forward_args, search_args=default_search_args, tts_forward=True, tts_eval_datasets=tts_forward_datasets_xvectors, ) - tune_lm(alias, train_args_with_x_vector, training_datasets, asr_test_datasets, 250, search_args=default_search_args) + tune_lm(alias, train_args_with_x_vector_no_ddi, training_datasets, asr_test_datasets, 250, search_args=default_search_args, additional_training_args={"ctc_scale": 0.1}) return experiments diff --git a/users/rilling/experiments/librispeech/librispeech_conformer_cb_joint/pytorch_networks/glowTTS_ASR_conformer_two_forward_pass.py b/users/rilling/experiments/librispeech/librispeech_conformer_cb_joint/pytorch_networks/glowTTS_ASR_conformer_two_forward_pass.py index d1559f461..45d76f495 100644 --- a/users/rilling/experiments/librispeech/librispeech_conformer_cb_joint/pytorch_networks/glowTTS_ASR_conformer_two_forward_pass.py +++ b/users/rilling/experiments/librispeech/librispeech_conformer_cb_joint/pytorch_networks/glowTTS_ASR_conformer_two_forward_pass.py @@ -40,7 +40,6 @@ SpecaugConfig, VGG4LayerActFrontendV1Config_mod, ModelConfig, - FlowDecoderConfig, ConformerCouplingFlowDecoderConfig, 
TextEncoderConfig, ) @@ -99,24 +98,22 @@ def forward(self, x, x_mask): class FlowDecoder(nn.Module): - def __init__(self, cfg: ConformerCouplingFlowDecoderConfig, in_channels, gin_channels): + def __init__( + self, + cfg: ConformerCouplingFlowDecoderConfig, + in_channels, + gin_channels=0, + ): """Flow-based decoder model Args: + cfg (FlowDecoderConfig): Decoder specific parameters wrapped in FlowDecoderConfig in_channels (int): Number of incoming channels - hidden_channels (int): Number of hidden channels - kernel_size (int): Kernel Size for convolutions in coupling blocks - dilation_rate (float): Dilation Rate to define dilation in convolutions of coupling block - n_blocks (int): Number of coupling blocks - n_layers (int): Number of layers in CNN of the coupling blocks - p_dropout (float, optional): Dropout probability for CNN in coupling blocks. Defaults to 0.. - n_split (int, optional): Number of splits for the 1x1 convolution for flows in the decoder. Defaults to 4. - n_sqz (int, optional): Squeeze. Defaults to 1. - sigmoid_scale (bool, optional): Boolean to define if log probs in coupling layers should be rescaled using sigmoid. Defaults to False. - gin_channels (int, optional): Number of speaker embedding channels. Defaults to 0. 
""" super().__init__() + self.cfg = cfg + self.in_channels = in_channels self.flows = nn.ModuleList() @@ -124,12 +121,13 @@ def __init__(self, cfg: ConformerCouplingFlowDecoderConfig, in_channels, gin_cha self.flows.append(modules.ActNorm(channels=in_channels * self.cfg.n_sqz, ddi=self.cfg.ddi)) self.flows.append(modules.InvConvNear(channels=in_channels * self.cfg.n_sqz, n_split=self.cfg.n_split)) self.flows.append( - attentions.CouplingBlock( + attentions.ConformerCouplingBlock( in_channels * self.cfg.n_sqz, self.cfg.hidden_channels, kernel_size=self.cfg.kernel_size, dilation_rate=self.cfg.dilation_rate, n_layers=self.cfg.n_layers, + n_heads=self.cfg.n_heads, gin_channels=gin_channels, p_dropout=self.cfg.p_dropout, sigmoid_scale=self.cfg.sigmoid_scale, @@ -146,6 +144,7 @@ def forward(self, x, x_mask, g=None, reverse=False): if self.cfg.n_sqz > 1: x, x_mask = commons.channel_squeeze(x, x_mask, self.cfg.n_sqz) + for f in flows: if not reverse: x, logdet = f(x, x_mask, g=g, reverse=reverse) diff --git a/users/rilling/experiments/librispeech/librispeech_conformer_cb_joint/pytorch_networks/glowTTS_ASR_conformer_x_vector_v2.py b/users/rilling/experiments/librispeech/librispeech_conformer_cb_joint/pytorch_networks/glowTTS_ASR_conformer_x_vector_v2.py index 037f63ded..8c6850152 100644 --- a/users/rilling/experiments/librispeech/librispeech_conformer_cb_joint/pytorch_networks/glowTTS_ASR_conformer_x_vector_v2.py +++ b/users/rilling/experiments/librispeech/librispeech_conformer_cb_joint/pytorch_networks/glowTTS_ASR_conformer_x_vector_v2.py @@ -40,8 +40,8 @@ SpecaugConfig, VGG4LayerActFrontendV1Config_mod, ModelConfig, - FlowDecoderConfig, TextEncoderConfig, + ConformerCouplingFlowDecoderConfig ) from .shared.configs import DbMelFeatureExtractionConfig @@ -148,24 +148,22 @@ def forward(self, x, x_mask): class FlowDecoder(nn.Module): - def __init__(self, cfg: FlowDecoderConfig, in_channels, gin_channels): + def __init__( + self, + cfg: ConformerCouplingFlowDecoderConfig, + 
in_channels, + gin_channels=0, + ): """Flow-based decoder model Args: + cfg (FlowDecoderConfig): Decoder specific parameters wrapped in FlowDecoderConfig in_channels (int): Number of incoming channels - hidden_channels (int): Number of hidden channels - kernel_size (int): Kernel Size for convolutions in coupling blocks - dilation_rate (float): Dilation Rate to define dilation in convolutions of coupling block - n_blocks (int): Number of coupling blocks - n_layers (int): Number of layers in CNN of the coupling blocks - p_dropout (float, optional): Dropout probability for CNN in coupling blocks. Defaults to 0.. - n_split (int, optional): Number of splits for the 1x1 convolution for flows in the decoder. Defaults to 4. - n_sqz (int, optional): Squeeze. Defaults to 1. - sigmoid_scale (bool, optional): Boolean to define if log probs in coupling layers should be rescaled using sigmoid. Defaults to False. - gin_channels (int, optional): Number of speaker embedding channels. Defaults to 0. """ super().__init__() + self.cfg = cfg + self.in_channels = in_channels self.flows = nn.ModuleList() @@ -173,12 +171,13 @@ def __init__(self, cfg: FlowDecoderConfig, in_channels, gin_channels): self.flows.append(modules.ActNorm(channels=in_channels * self.cfg.n_sqz, ddi=self.cfg.ddi)) self.flows.append(modules.InvConvNear(channels=in_channels * self.cfg.n_sqz, n_split=self.cfg.n_split)) self.flows.append( - attentions.CouplingBlock( + attentions.ConformerCouplingBlock( in_channels * self.cfg.n_sqz, self.cfg.hidden_channels, kernel_size=self.cfg.kernel_size, dilation_rate=self.cfg.dilation_rate, n_layers=self.cfg.n_layers, + n_heads=self.cfg.n_heads, gin_channels=gin_channels, p_dropout=self.cfg.p_dropout, sigmoid_scale=self.cfg.sigmoid_scale, @@ -198,6 +197,7 @@ def forward(self, x, x_mask, g=None, reverse=False): if self.cfg.n_sqz > 1: x, x_mask = commons.channel_squeeze(x, x_mask, self.cfg.n_sqz) + for f in flows: if not reverse: x, logdet = f(x, x_mask, g=g, reverse=reverse) diff 
--git a/users/rilling/experiments/librispeech/librispeech_conformer_cb_joint/training_comparison.ipynb b/users/rilling/experiments/librispeech/librispeech_conformer_cb_joint/training_comparison.ipynb index a3dd2e9b2..dd25ac8a0 100644 --- a/users/rilling/experiments/librispeech/librispeech_conformer_cb_joint/training_comparison.ipynb +++ b/users/rilling/experiments/librispeech/librispeech_conformer_cb_joint/training_comparison.ipynb @@ -23,21 +23,17 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "{'/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/conformer_coupling/raw_audio/ddi/glowTTS/training': '/glowTTS/',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/conformer_coupling/raw_audio/ddi/glowTTS_x_vector/training': '/glowTTS_x_vector/',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/conformer_coupling/raw_audio/ddi/glowTTS_ASR_conformer_x_vector/training': '/glowTTS_ASR_conformer_x_vector/',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/conformer_coupling/raw_audio/ddi/glowTTS_ASR_conformer_two_forward_pass/training': '/glowTTS_ASR_conformer_two_forward_pass/',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/conformer_coupling/raw_audio/ddi/glowTTS_100ep_pe1/training': '/glowTTS_100ep_pe1/',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/conformer_coupling/raw_audio/ddi/glowTTS_100ep_pe1_radam1e-9/training': '/glowTTS_100ep_pe1_radam1e-9/'}" + "{'/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glowTTS_ASR_conformer_two_forward_pass/training': '/glowTTS_ASR_conformer_two_forward_pass/',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glowTTS_ASR_conformer_x_vector_v2/training': '/glowTTS_ASR_conformer_x_vector_v2/'}" ] }, - "execution_count": 3, + "execution_count": 2, "metadata": {}, "output_type": "execute_result" } @@ -55,8 +51,8 @@ " # \"/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/raw_audio/conformer_coupling/ddi/glowTTS_x*/training\",\n", " # \"/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/raw_audio/conformer_coupling/ddi/glow_ASR_conformer*/training\",\n", " # \"/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/raw_audio/conformer_coupling/no_ddi/glow_ASR_conformer*/training\",\n", - " \"/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/conformer_coupling/raw_audio/conformer_coupling/no_ddi/glowTTS*/training\",\n", - " \"/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/conformer_coupling/raw_audio/ddi/glowTTS*/training\",\n", + " \"/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glowTTS_ASR*/training\",\n", + " # \"/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/conformer_coupling/raw_audio/ddi/glowTTS*/training\",\n", " # \"/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/raw_audio/glow_ASR_conformer/training\",\n", "]\n", "# globs = [\"/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/raw_audio/glowTTS_ASR*pass*/training\"]\n", @@ -79,26 +75,20 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - 
"['/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/conformer_coupling/raw_audio/ddi/glowTTS/training',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/conformer_coupling/raw_audio/ddi/glowTTS_x_vector/training',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/conformer_coupling/raw_audio/ddi/glowTTS_ASR_conformer_x_vector/training',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/conformer_coupling/raw_audio/ddi/glowTTS_ASR_conformer_two_forward_pass/training',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/conformer_coupling/raw_audio/ddi/glowTTS_100ep_pe1/training',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/conformer_coupling/raw_audio/ddi/glowTTS_100ep_pe1_radam1e-9/training',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/tts_architecture/glow_tts/raw_audio/glowTTS/enc192/100ep/not_silence_preprocessed/training',\n", + "['/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glowTTS_ASR_conformer_two_forward_pass/training',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glowTTS_ASR_conformer_x_vector_v2/training',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/tts_architecture/glow_tts/raw_audio/glowTTS_x_vector/enc768/100ep/not_silence_preprocessed/training',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/tts_architecture/glow_tts/raw_audio/glowTTS_ddi_actnorm/enc192/100ep/not_silence_preprocessed/LR_scheduled/training/',\n", " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/weak_baseline/training',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/no_glow/training/']" + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment_ctc_scale_0.1/training']" ] }, - "execution_count": 4, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } @@ -115,27 +105,19 @@ "# files[lr_files[-1]] = \"Baseline\"\n", "\n", "lr_files.append(\n", - " \"/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/tts_architecture/glow_tts/raw_audio/glowTTS/enc192/100ep/not_silence_preprocessed/training\"\n", - ")\n", - "files[lr_files[-1]] = \"Baseline GlowTTS\"\n", - "\n", - "lr_files.append(\n", " \"/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/tts_architecture/glow_tts/raw_audio/glowTTS_x_vector/enc768/100ep/not_silence_preprocessed/training\"\n", ")\n", "files[lr_files[-1]] = \"Baseline GlowTTS x-vector\"\n", - "lr_files.append(\n", - " \"/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/tts_architecture/glow_tts/raw_audio/glowTTS_ddi_actnorm/enc192/100ep/not_silence_preprocessed/LR_scheduled/training/\"\n", - ")\n", - "files[lr_files[-1]] = \"Baseline GlowTTS ddi\"\n", "\n", "lr_files.append(\n", " \"/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/weak_baseline/training\"\n", ")\n", "files[lr_files[-1]] = \"Baseline Weak Conformer\"\n", + "\n", "lr_files.append(\n", - " \"/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/no_glow/training/\"\n", + " 
\"/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment_ctc_scale_0.1/training\"\n", ")\n", - "files[lr_files[-1]] = \"Baseline Conformer\"\n", + "files[lr_files[-1]] = \"Joint Training WN coupling\"\n", "\n", "\n", "lr_files" @@ -143,7 +125,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 4, "metadata": {}, "outputs": [ { @@ -153,12 +135,13 @@ " '/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/tts_architecture/glow_tts/raw_audio/glowTTS_x_vector/enc768/100ep/not_silence_preprocessed/training': 1,\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/tts_architecture/glow_tts/raw_audio/glowTTS_x_vector_v3/enc768/100ep/not_silence_preprocessed/training': 1,\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/tts_architecture/glow_tts/raw_audio/glowTTS_x_vector_v3_norm_xvector/enc768/100ep/not_silence_preprocessed/training': 1,\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/conformer_coupling/raw_audio/ddi/glowTTS_100ep_pe1/training': 1,\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/conformer_coupling/raw_audio/ddi/glowTTS_100ep_pe1_radam1e-9/training': 1,\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/conformer_coupling/raw_audio/ddi/glowTTS_100ep_pe1/training': 1,\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glowTTS_100ep_pe1/training': 1,\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/tts_architecture/glow_tts/raw_audio/glowTTS_ddi_actnorm/enc192/100ep/not_silence_preprocessed/LR_scheduled/training/': 1}" ] }, - "execution_count": 8, + "execution_count": 4, "metadata": {}, 
"output_type": "execute_result" } @@ -185,24 +168,17 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/conformer_coupling/raw_audio/ddi/glowTTS/training: 3\n", - "/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/conformer_coupling/raw_audio/ddi/glowTTS_x_vector/training: 3\n", - "/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/conformer_coupling/raw_audio/ddi/glowTTS_ASR_conformer_x_vector/training: 3\n", - "/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/conformer_coupling/raw_audio/ddi/glowTTS_ASR_conformer_two_forward_pass/training: 3\n", - "/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/conformer_coupling/raw_audio/ddi/glowTTS_100ep_pe1/training: 1\n", - "/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/conformer_coupling/raw_audio/ddi/glowTTS_100ep_pe1_radam1e-9/training: 1\n", - "/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/tts_architecture/glow_tts/raw_audio/glowTTS/enc192/100ep/not_silence_preprocessed/training: 1\n", + "/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glowTTS_ASR_conformer_x_vector_v2/training: 3\n", "/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/tts_architecture/glow_tts/raw_audio/glowTTS_x_vector/enc768/100ep/not_silence_preprocessed/training: 1\n", - "/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/tts_architecture/glow_tts/raw_audio/glowTTS_ddi_actnorm/enc192/100ep/not_silence_preprocessed/LR_scheduled/training/: 1\n", 
"/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/weak_baseline/training: 3\n", - "/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/no_glow/training/: 3\n", + "/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment_ctc_scale_0.1/training: 3\n", "Large Font: False\n", "Setup Interactive Legend\n", "Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous view', 'arrow-left', 'back'), ('Forward', 'Forward to next view', 'arrow-right', 'forward'), ('Pan', 'Left button pans, Right button zooms\\nx/y fixes axis, CTRL fixes aspect', 'arrows', 'pan'), ('Zoom', 'Zoom to rectangle\\nx/y fixes axis', 'square-o', 'zoom'), ('Download', 'Download plot', 'floppy-o', 'save_figure')]))\n" @@ -211,18 +187,18 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "ce7c336b10804e61adf7a71ab7821c74", + "model_id": "3c478dd060334d60b8990ebf5cc8ca96", "version_major": 2, "version_minor": 0 }, - "image/png": "", + "image/png": "", "text/html": [ "\n", "
\n", "
\n", " Figure\n", "
\n", - " \n", + " \n", "
\n", " " ], diff --git a/users/rilling/experiments/librispeech/librispeech_glow_asr/config.py b/users/rilling/experiments/librispeech/librispeech_glow_asr/config.py index e35213679..16421c75f 100644 --- a/users/rilling/experiments/librispeech/librispeech_glow_asr/config.py +++ b/users/rilling/experiments/librispeech/librispeech_glow_asr/config.py @@ -103,6 +103,50 @@ def get_prior_config( ) return returnn_config +def get_inv_config( + training_datasets: TrainingDatasets, + network_module: str, + net_args: Dict[str, Any], + config: Dict[str, Any], + debug: bool = False, + use_custom_engine=False, + **kwargs, +): + """ + Returns the RETURNN config serialized by :class:`ReturnnCommonSerializer` in returnn_common for the ctc_aligner + :param returnn_common_root: returnn_common version to be used, usually output of CloneGitRepositoryJob + :param training_datasets: datasets for training + :param kwargs: arguments to be passed to the network construction + :return: RETURNN training config + """ + + # changing these does not change the hash + post_config = { + } + + base_config = { + ############# + "batch_size": 50000 * 160, + "max_seqs": 60, + ############# + "forward": training_datasets.devtrain.as_returnn_opts() + + } + config = {**base_config, **copy.deepcopy(config)} + post_config["backend"] = "torch" + + serializer = get_pytorch_serializer_v3( + network_module=network_module, + net_args=net_args, + debug=debug, + use_custom_engine=use_custom_engine, + invertibility=True + ) + returnn_config = ReturnnConfig( + config=config, post_config=post_config, python_epilog=[serializer] + ) + return returnn_config + def get_search_config( network_module: str, net_args: Dict[str, Any], diff --git a/users/rilling/experiments/librispeech/librispeech_glow_asr/experiments.py b/users/rilling/experiments/librispeech/librispeech_glow_asr/experiments.py index 4bc50602a..de7fb6b1b 100644 --- a/users/rilling/experiments/librispeech/librispeech_glow_asr/experiments.py +++ 
b/users/rilling/experiments/librispeech/librispeech_glow_asr/experiments.py @@ -23,9 +23,9 @@ from .data_ctc import get_text_lexicon as get_text_lexicon_asr from .default_tools import RETURNN_EXE, MINI_RETURNN_ROOT -from .pipeline import training, search, compute_prior, compute_phoneme_pred_accuracy +from .pipeline import training, search, compute_prior, compute_phoneme_pred_accuracy, evaluate_invertibility -from .config import get_training_config, get_search_config, get_prior_config +from .config import get_training_config, get_search_config, get_prior_config, get_inv_config from i6_experiments.users.rilling.experiments.librispeech.librispeech_joint_training_given_alignments.storage import ( tts_models, @@ -75,7 +75,7 @@ def glowASR(TTS_experiments: dict): test_dataset_tuples = {} test_dataset_normal_ctc_tuples = {} - for testset in ["test-clean"]: + for testset in ["test-clean", "test-other"]: test_dataset_tuples[testset] = build_test_dataset( librispeech_key="train-clean-100", dataset_key=testset, @@ -135,8 +135,11 @@ def run_exp( test_datasets=dev_dataset_tuples, large_gpu_training=False, only_forward_no_search=False, + eval_invertibility=False, ): - search_args = search_args if search_args is not None else {} + search_args = copy.deepcopy(search_args) if search_args is not None else {} + + with_prior = with_prior or ("prior_scale" in search_args and search_args["prior_scale"] != 0) returnn_config = get_training_config(training_datasets=datasets, **train_args) returnn_search_config = get_search_config(**train_args, search_args=search_args) @@ -190,6 +193,16 @@ def run_exp( # search(ft_name + "/default_best", returnn_search_config, best_checkpoint, test_dataset_tuples, RETURNN_EXE, RETURNN_ROOT) # search(ft_name + "/average_4", returnn_search_config, averaged_checkpoint, test_dataset_tuples, RETURNN_EXE, RETURNN_ROOT) + if eval_invertibility: + inv_config = get_inv_config(training_datasets=datasets, **train_args) + evaluate_invertibility( + ft_name, + 
inv_config, + checkpoint=train_job.out_checkpoints[num_epochs], + returnn_exe=RETURNN_EXE, + returnn_root=MINI_RETURNN_ROOT, + ) + return train_job train_args = { @@ -581,6 +594,7 @@ def run_exp( train_args=train_args_blstm_frame_stack_v2, search_args={**default_search_args_asr, **additional_search_args}, test_datasets=dev_dataset_normal_ctc_tuples, + with_prior=True, ) # run_exp( # prefix_name + "blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc768/with_sigma/not_silence_preprocessed/lm4", @@ -652,6 +666,7 @@ def run_exp( train_args=train_args_blstm_only, search_args={**default_search_args_asr, **additional_search_args}, test_datasets=dev_dataset_normal_ctc_tuples, + with_prior=True, ) run_exp( @@ -660,6 +675,7 @@ def run_exp( train_args=train_args_blstm_only_spec_aug, search_args=default_search_args_asr, test_datasets=dev_dataset_normal_ctc_tuples, + with_prior=True, ) run_exp( @@ -1325,7 +1341,7 @@ def run_exp( train_args_conformer_speaker_drop_asr_data["config"]["preload_from_files"]["existing-model"]["filename"] = ( TTS_exp_train_job.out_checkpoints[TTS_exp_train_job.returnn_config.get("num_epochs", 100)] ) - for lm_w in [2.5, 3.0, 3.5, 4.0]: + for lm_w in [2.0, 2.5, 3.0, 3.5, 4.0]: for ps in [0, 0.3, 0.5]: additional_search_args = {"lm_weight": lm_w, "prior_scale": ps} run_exp( @@ -1338,6 +1354,39 @@ def run_exp( num_epochs=250, with_prior=True, ) + + train_args_conformer_speaker_drop_asr_data_p0 = copy.deepcopy(train_args_conformer_speaker_drop_asr_data) + tts_exp_name = "glowTTS/enc192/100ep/not_silence_preprocessed" + tts_train_job = TTS_experiments[tts_exp_name]["train_job"] + train_args_conformer_speaker_drop_asr_data_p0["config"]["preload_from_files"]["existing-model"]["filename"] = ( + tts_train_job.out_checkpoints[tts_train_job.returnn_config.get("num_epochs", 100)] + ) + + run_exp( + prefix_name + + f"conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0", + datasets=train_data_normal_ctc, + 
train_args=train_args_conformer_speaker_drop_asr_data_p0, + search_args=default_search_args_asr, + large_gpu_training=True, + num_epochs=250, + ) + + for lm_w in [2.0, 2.5, 3.0, 3.5, 4.0]: + for ps in [0, 0.3, 0.5]: + additional_search_args = {"lm_weight": lm_w, "prior_scale": ps} + run_exp( + prefix_name + + f"conformer/asr_dataset/spec_augment/glow_enc192_100ep_not_silence_preprocessed_speaker_drop_0/search_params/lm_{lm_w}_ps_{ps}", + datasets=train_data_normal_ctc, + train_args=train_args_conformer_speaker_drop_asr_data_p0, + search_args={**default_search_args_asr, **additional_search_args}, + large_gpu_training=True, + num_epochs=250, + with_prior=True, + ) + + # ------------------------------ No Freezing Experiments ------------------------------ train_args_conformer_no_freeze_asr_data = copy.deepcopy(train_args_conformer_asr_data) @@ -1364,6 +1413,21 @@ def run_exp( datasets=train_data_normal_ctc, train_args=train_args_conformer_no_freeze_asr_data_spec_aug_before, search_args=default_search_args_asr, + large_gpu_training=True, + num_epochs=250, + ) + + + train_args_weak_conformer_no_freeze_asr_data_spec_aug_before = copy.deepcopy(train_args_conformer_no_freeze_asr_data_spec_aug_before) + train_args_weak_conformer_no_freeze_asr_data_spec_aug_before["network_module"] = ( + "glowASR_conformer_no_freeze_spec_augment_before_weak_conf" + ) + run_exp( + prefix_name + + "weak_conformer/asr_dataset/spec_augment_before/glow_enc192_200ep_not_silence_preprocessed_not_freezed", + datasets=train_data_normal_ctc, + train_args=train_args_weak_conformer_no_freeze_asr_data_spec_aug_before, + search_args=default_search_args_asr, large_gpu_training=False, num_epochs=250, ) @@ -1390,7 +1454,21 @@ def run_exp( search_args=default_search_args_asr, large_gpu_training=False, num_epochs=250, + eval_invertibility=True ) + + train_args_weak_conformer_no_pretrained_asr_data = copy.deepcopy(train_args_conformer_no_pretrained_asr_data) + 
train_args_weak_conformer_no_pretrained_asr_data["network_module"] = "glowASR_conformer_no_freeze_spec_augment_before_weak_conf" + run_exp( + prefix_name + "weak_conformer/asr_dataset/spec_augment_before/glow_not_pretrained", + datasets=train_data_normal_ctc, + train_args=train_args_weak_conformer_no_pretrained_asr_data, + search_args=default_search_args_asr, + large_gpu_training=False, + num_epochs=250, + eval_invertibility=True + ) + train_args_conformer_no_pretrained_asr_data_no_spec_augment = copy.deepcopy( train_args_conformer_no_pretrained_asr_data ) @@ -1405,6 +1483,7 @@ def run_exp( search_args=default_search_args_asr, large_gpu_training=False, num_epochs=250, + eval_invertibility=True, ) train_args_conformer_no_pretrained_asr_data_spec_augment_after_fe = copy.deepcopy( @@ -1420,6 +1499,7 @@ def run_exp( search_args=default_search_args_asr, large_gpu_training=False, num_epochs=250, + eval_invertibility=True ) train_args_weak_conformer_no_pretrained_asr_data_spec_augment_after_fe = copy.deepcopy( @@ -1553,6 +1633,17 @@ def run_exp( num_epochs=250, with_prior=True, ) + + run_exp( + prefix_name + f"weak_conformer/asr_dataset/spec_augment_before/glow_not_pretrained/lm_tuning/lm{lm_w}_ps{ps}", + datasets=train_data_normal_ctc, + train_args=train_args_weak_conformer_no_pretrained_asr_data, + search_args={**default_search_args_asr, **additional_search_args}, + large_gpu_training=False, + num_epochs=250, + eval_invertibility=True + ) + run_exp( prefix_name + f"conformer/asr_dataset/no_spec_augment/glow_enc192_not_pretrained/lm_tuning/lm{lm_w}_ps{ps}", @@ -1573,6 +1664,16 @@ def run_exp( large_gpu_training=False, num_epochs=250, ) + + run_exp( + prefix_name + f"weak_conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained/lm_tuning/lm{lm_w}_ps{ps}", + datasets=train_data_normal_ctc, + train_args=train_args_weak_conformer_no_pretrained_asr_data_spec_augment_after_fe, + search_args={**default_search_args_asr, **additional_search_args}, + large_gpu_training=False, 
+ num_epochs=250, + ) + run_exp( prefix_name + f"conformer/asr_dataset/spec_augment/glow_enc192_not_pretrained_control/lm_tuning/lm{lm_w}_ps{ps}", @@ -1684,16 +1785,16 @@ def run_exp( num_epochs=250, ) - train_args_weak_conformer_asr_data_v3_768 = copy.deepcopy(train_args_conformer_asr_data_v3_768) - train_args_weak_conformer_asr_data_v3_768["network_module"] = "glowASR_conformer_x_vector_v2_weak_conf" - run_exp( - prefix_name + "weak_conformer/asr_dataset/spec_augment/glow_enc768_200ep_dec_0.05_v2", - datasets=train_data_normal_ctc, - train_args=train_args_weak_conformer_asr_data_v3_768, - search_args=default_search_args_asr, - large_gpu_training=True, - num_epochs=250, - ) + # train_args_weak_conformer_asr_data_v3_768 = copy.deepcopy(train_args_conformer_asr_data_v3_768) + # train_args_weak_conformer_asr_data_v3_768["network_module"] = "glowASR_conformer_v2_weak_conf" + # run_exp( + # prefix_name + "weak_conformer/asr_dataset/spec_augment/glow_enc768_200ep_dec_0.05_v2", + # datasets=train_data_normal_ctc, + # train_args=train_args_weak_conformer_asr_data_v3_768, + # search_args=default_search_args_asr, + # large_gpu_training=True, + # num_epochs=250, + # ) train_args_conformer_asr_data_v3_768_no_specaug = copy.deepcopy(train_args_conformer_asr_data_v3_768) train_args_conformer_asr_data_v3_768_no_specaug["net_args"]["spec_augment"] = False @@ -1706,30 +1807,30 @@ def run_exp( num_epochs=250, ) - train_args_conformer_asr_data_v3_768_400 = copy.deepcopy(train_args_conformer_asr_data_v3_768) - train_args_conformer_asr_data_v3_768_400["config"]["preload_from_files"]["existing-model"]["filename"] = tts_models[ - "glowTTS/enc768/400ep/grad_clip_10/dec_drop_0.05" - ].checkpoint + # train_args_conformer_asr_data_v3_768_400 = copy.deepcopy(train_args_conformer_asr_data_v3_768) + # train_args_conformer_asr_data_v3_768_400["config"]["preload_from_files"]["existing-model"]["filename"] = tts_models[ + # "glowTTS/enc768/400ep/grad_clip_10/dec_drop_0.05" + # ].checkpoint - 
run_exp( - prefix_name + "conformer/asr_dataset/spec_augment/glow_enc768_400ep_dec_0.05_v2", - datasets=train_data_normal_ctc, - train_args=train_args_conformer_asr_data_v3_768_400, - search_args=default_search_args_asr, - large_gpu_training=True, - num_epochs=250, - ) + # run_exp( + # prefix_name + "conformer/asr_dataset/spec_augment/glow_enc768_400ep_dec_0.05_v2", + # datasets=train_data_normal_ctc, + # train_args=train_args_conformer_asr_data_v3_768_400, + # search_args=default_search_args_asr, + # large_gpu_training=True, + # num_epochs=250, + # ) - train_args_conformer_asr_data_v3_768_400_no_specaug = copy.deepcopy(train_args_conformer_asr_data_v3_768_400) - train_args_conformer_asr_data_v3_768_400_no_specaug["net_args"]["spec_augment"] = False - run_exp( - prefix_name + "conformer/asr_dataset/no_spec_augment/glow_enc768_400ep_dec_0.05_v2", - datasets=train_data_normal_ctc, - train_args=train_args_conformer_asr_data_v3_768_400_no_specaug, - search_args=default_search_args_asr, - large_gpu_training=True, - num_epochs=250, - ) + # train_args_conformer_asr_data_v3_768_400_no_specaug = copy.deepcopy(train_args_conformer_asr_data_v3_768_400) + # train_args_conformer_asr_data_v3_768_400_no_specaug["net_args"]["spec_augment"] = False + # run_exp( + # prefix_name + "conformer/asr_dataset/no_spec_augment/glow_enc768_400ep_dec_0.05_v2", + # datasets=train_data_normal_ctc, + # train_args=train_args_conformer_asr_data_v3_768_400_no_specaug, + # search_args=default_search_args_asr, + # large_gpu_training=True, + # num_epochs=250, + # ) train_args_conformer_asr_data_v3_192 = copy.deepcopy(train_args_conformer_asr_data_v3_768) train_args_conformer_asr_data_v3_192["config"]["preload_from_files"]["existing-model"]["filename"] = tts_models[ @@ -1759,6 +1860,19 @@ def run_exp( num_epochs=250, ) + train_args_conformer_asr_data_xvector_v3_100epTTS = copy.deepcopy(train_args_conformer_asr_data_xvector_v3) + 
train_args_conformer_asr_data_xvector_v3_100epTTS["config"]["preload_from_files"]["existing-model"]["filename"] = tts_models[ + "glowTTS_x_vector_v2/enc768/100ep/dec_drop_0.05" + ].checkpoint + run_exp( + prefix_name + "conformer/asr_dataset/spec_augment/glow_x_vector_enc768_100ep_dec_0.05_v2", + datasets=train_data_normal_ctc, + train_args=train_args_conformer_asr_data_xvector_v3_100epTTS, + search_args=default_search_args_asr, + large_gpu_training=False, + num_epochs=250, + ) + train_args_weak_conformer_asr_data_xvector_v3 = copy.deepcopy(train_args_conformer_asr_data_xvector_v3) train_args_weak_conformer_asr_data_xvector_v3["network_module"] = "glowASR_conformer_x_vector_v2_bottleneck_weak_conf" run_exp( @@ -1790,7 +1904,7 @@ def run_exp( datasets=train_data_normal_ctc, train_args=train_args_conformer_asr_data_xvector_v3_400, search_args=default_search_args_asr, - large_gpu_training=False, + large_gpu_training=True, num_epochs=250, ) @@ -1818,7 +1932,7 @@ def run_exp( num_epochs=250, ) - for lm_w in [2.5, 3.0, 3.5, 4.0]: + for lm_w in [1.5, 2.0, 2.5, 3.0, 3.5, 4.0]: for ps in [0, 0.3, 0.5]: additional_search_args = {"lm_weight": lm_w, "prior_scale": ps} run_exp( @@ -1830,7 +1944,6 @@ def run_exp( large_gpu_training=True, num_epochs=250, ) - run_exp( prefix_name + f"conformer/asr_dataset/no_spec_augment/glow_enc768_200ep_dec_0.05_v2/search_params/lm_{lm_w}_ps_{ps}", datasets=train_data_normal_ctc, @@ -1860,6 +1973,15 @@ def run_exp( num_epochs=250, ) + run_exp( + prefix_name + f"weak_conformer/asr_dataset/spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/search_params/lm_{lm_w}_ps_{ps}", + datasets=train_data_normal_ctc, + train_args=train_args_weak_conformer_asr_data_xvector_v3, + search_args={**default_search_args_asr, **additional_search_args}, + large_gpu_training=False, + num_epochs=250, + ) + run_exp( prefix_name + f"conformer/asr_dataset/no_spec_augment/glow_x_vector_enc768_200ep_dec_0.05_v2/search_params/lm_{lm_w}_ps_{ps}", 
datasets=train_data_normal_ctc, @@ -1868,6 +1990,14 @@ def run_exp( large_gpu_training=True, num_epochs=250, ) + run_exp( + prefix_name + f"conformer/asr_dataset/spec_augment/glow_x_vector_enc768_100ep_dec_0.05_v2/search_params/lm_{lm_w}_ps_{ps}", + datasets=train_data_normal_ctc, + train_args=train_args_conformer_asr_data_xvector_v3_100epTTS, + search_args={**default_search_args_asr, **additional_search_args}, + large_gpu_training=False, + num_epochs=250, + ) run_exp( prefix_name diff --git a/users/rilling/experiments/librispeech/librispeech_glow_asr/pipeline.py b/users/rilling/experiments/librispeech/librispeech_glow_asr/pipeline.py index 4cd076b54..fbf095bc9 100644 --- a/users/rilling/experiments/librispeech/librispeech_glow_asr/pipeline.py +++ b/users/rilling/experiments/librispeech/librispeech_glow_asr/pipeline.py @@ -209,3 +209,33 @@ def compute_prior( ) search_job.add_alias(prefix_name + "/prior_job") return search_job.out_files["prior.txt"] + +def evaluate_invertibility( + prefix, + config, + checkpoint, + returnn_exe, + returnn_root, +): + hdf_outputs = [] + + last_forward_job = ReturnnForwardJob( + model_checkpoint=checkpoint, + returnn_config=config, + hdf_outputs=hdf_outputs, + returnn_python_exe=returnn_exe, + returnn_root=returnn_root, + mem_rqmt=20, + device="cpu" + ) + + forward_prefix = prefix + "/forward_invertibility" + + last_forward_job.add_alias(forward_prefix) + + tts_hdf = None + + tts_hdf = last_forward_job.out_hdf_files["output.hdf"] + tk.register_output(forward_prefix, tts_hdf) + + return last_forward_job diff --git a/users/rilling/experiments/librispeech/librispeech_glow_asr/pytorch_networks/glowASR_blstm_frame_stack.py b/users/rilling/experiments/librispeech/librispeech_glow_asr/pytorch_networks/glowASR_blstm_frame_stack.py index 8cee7af0d..5c964055f 100644 --- a/users/rilling/experiments/librispeech/librispeech_glow_asr/pytorch_networks/glowASR_blstm_frame_stack.py +++ 
b/users/rilling/experiments/librispeech/librispeech_glow_asr/pytorch_networks/glowASR_blstm_frame_stack.py @@ -31,7 +31,7 @@ from .shared import attentions from .monotonic_align import maximum_path -from .shared.forward import forward_init_hook, forward_step, forward_finish_hook +from .shared.forward import forward_init_hook, forward_step, forward_finish_hook, prior_step, prior_finish_hook, prior_init_hook from .shared.train import train_step from IPython import embed diff --git a/users/rilling/experiments/librispeech/librispeech_glow_asr/pytorch_networks/glowASR_blstm_frame_stack_v2.py b/users/rilling/experiments/librispeech/librispeech_glow_asr/pytorch_networks/glowASR_blstm_frame_stack_v2.py index d971b6b16..f3c3ef110 100644 --- a/users/rilling/experiments/librispeech/librispeech_glow_asr/pytorch_networks/glowASR_blstm_frame_stack_v2.py +++ b/users/rilling/experiments/librispeech/librispeech_glow_asr/pytorch_networks/glowASR_blstm_frame_stack_v2.py @@ -31,7 +31,7 @@ from .shared import attentions from .monotonic_align import maximum_path -from .shared.forward import forward_init_hook, forward_step, forward_finish_hook +from .shared.forward import forward_init_hook, forward_step, forward_finish_hook, prior_step, prior_finish_hook, prior_init_hook from .shared.train import train_step from IPython import embed diff --git a/users/rilling/experiments/librispeech/librispeech_glow_asr/pytorch_networks/glowASR_blstm_frame_stack_x_vector.py b/users/rilling/experiments/librispeech/librispeech_glow_asr/pytorch_networks/glowASR_blstm_frame_stack_x_vector.py index 4f415af64..90c7aea2b 100644 --- a/users/rilling/experiments/librispeech/librispeech_glow_asr/pytorch_networks/glowASR_blstm_frame_stack_x_vector.py +++ b/users/rilling/experiments/librispeech/librispeech_glow_asr/pytorch_networks/glowASR_blstm_frame_stack_x_vector.py @@ -31,7 +31,7 @@ from .shared import attentions from .monotonic_align import maximum_path -from .shared.forward import forward_init_hook, 
forward_step, forward_finish_hook +from .shared.forward import forward_init_hook, forward_step, forward_finish_hook, prior_step, prior_finish_hook, prior_init_hook from .shared.train import train_step from IPython import embed diff --git a/users/rilling/experiments/librispeech/librispeech_glow_asr/pytorch_networks/glowASR_blstm_frame_stack_x_vector_v2.py b/users/rilling/experiments/librispeech/librispeech_glow_asr/pytorch_networks/glowASR_blstm_frame_stack_x_vector_v2.py index 86a9400cb..842862bca 100644 --- a/users/rilling/experiments/librispeech/librispeech_glow_asr/pytorch_networks/glowASR_blstm_frame_stack_x_vector_v2.py +++ b/users/rilling/experiments/librispeech/librispeech_glow_asr/pytorch_networks/glowASR_blstm_frame_stack_x_vector_v2.py @@ -31,7 +31,7 @@ from .shared import attentions from .monotonic_align import maximum_path -from .shared.forward import forward_init_hook, forward_step, forward_finish_hook +from .shared.forward import forward_init_hook, forward_step, forward_finish_hook, prior_step, prior_finish_hook, prior_init_hook from .shared.train import train_step from IPython import embed diff --git a/users/rilling/experiments/librispeech/librispeech_glow_asr/pytorch_networks/glowASR_conformer.py b/users/rilling/experiments/librispeech/librispeech_glow_asr/pytorch_networks/glowASR_conformer.py index d85c91898..6c8e5a5ee 100644 --- a/users/rilling/experiments/librispeech/librispeech_glow_asr/pytorch_networks/glowASR_conformer.py +++ b/users/rilling/experiments/librispeech/librispeech_glow_asr/pytorch_networks/glowASR_conformer.py @@ -203,7 +203,7 @@ def __init__( final_dropout=0.2, specauc_start_epoch=1 ) - + self.cfg = conformer_model_config frontend_config = self.cfg.frontend_config conformer_size = self.cfg.conformer_size @@ -244,7 +244,6 @@ def __init__( self.final_dropout = nn.Dropout(p=self.cfg.final_dropout) self.specaug_start_epoch = self.cfg.specauc_start_epoch - def forward(self, raw_audio, raw_audio_len): with torch.no_grad(): 
squeezed_audio = torch.squeeze(raw_audio) @@ -273,12 +272,12 @@ def forward(self, raw_audio, raw_audio_len): audio_features_masked_2 = spec_augment_in conformer_in = audio_features_masked_2 - + if self.layer_norm: conformer_in = torch.nn.functional.layer_norm(conformer_in, (conformer_in.size(-1),)) elif self.bn is not None: conformer_in = self.bn(conformer_in.transpose(1,2)).transpose(1,2) - + conformer_out, out_mask = self.conformer(conformer_in, mask) conformer_out = self.final_dropout(conformer_out) logits = self.final_linear(conformer_out) @@ -296,4 +295,3 @@ def preprocess(self, y, y_lengths, y_max_length): def store_inverse(self): self.decoder.store_inverse() - diff --git a/users/rilling/experiments/librispeech/librispeech_glow_asr/pytorch_networks/glowASR_conformer_no_freeze.py b/users/rilling/experiments/librispeech/librispeech_glow_asr/pytorch_networks/glowASR_conformer_no_freeze.py index 01b444d2f..7eb66c389 100644 --- a/users/rilling/experiments/librispeech/librispeech_glow_asr/pytorch_networks/glowASR_conformer_no_freeze.py +++ b/users/rilling/experiments/librispeech/librispeech_glow_asr/pytorch_networks/glowASR_conformer_no_freeze.py @@ -50,6 +50,7 @@ from .monotonic_align import maximum_path from .shared.forward import forward_init_hook, forward_step, forward_finish_hook, prior_init_hook, prior_finish_hook, prior_step +from .shared.eval_invertibility import * from .shared.train import train_step from IPython import embed diff --git a/users/rilling/experiments/librispeech/librispeech_glow_asr/pytorch_networks/glowASR_conformer_no_freeze_spec_augment.py b/users/rilling/experiments/librispeech/librispeech_glow_asr/pytorch_networks/glowASR_conformer_no_freeze_spec_augment.py index d7e6bd6b0..70dca4b54 100644 --- a/users/rilling/experiments/librispeech/librispeech_glow_asr/pytorch_networks/glowASR_conformer_no_freeze_spec_augment.py +++ b/users/rilling/experiments/librispeech/librispeech_glow_asr/pytorch_networks/glowASR_conformer_no_freeze_spec_augment.py 
@@ -51,6 +51,7 @@ from .shared.forward import forward_init_hook, forward_step, forward_finish_hook, prior_init_hook, prior_finish_hook, prior_step from .shared.train import train_step +from .shared.eval_invertibility import * from IPython import embed diff --git a/users/rilling/experiments/librispeech/librispeech_glow_asr/pytorch_networks/glowASR_conformer_no_freeze_spec_augment_before_weak_conf.py b/users/rilling/experiments/librispeech/librispeech_glow_asr/pytorch_networks/glowASR_conformer_no_freeze_spec_augment_before_weak_conf.py new file mode 100644 index 000000000..4676f411e --- /dev/null +++ b/users/rilling/experiments/librispeech/librispeech_glow_asr/pytorch_networks/glowASR_conformer_no_freeze_spec_augment_before_weak_conf.py @@ -0,0 +1,298 @@ +""" +Trying to make the aligner more AppTek-Like + +Extended weight init code +""" + +from dataclasses import dataclass +import torch +import numpy as np +from torch import nn +import multiprocessing +from librosa import filters +import sys +import time +from typing import Any, Dict, Optional, Tuple, Union +import math + +from torchaudio.functional import mask_along_axis + +from i6_models.parts.blstm import BlstmEncoderV1, BlstmEncoderV1Config + + +from i6_models.parts.conformer.norm import LayerNormNC +from i6_models.assemblies.conformer.conformer_v1 import ConformerEncoderV1Config +from i6_models.assemblies.conformer.conformer_v1 import ConformerBlockV1Config, ConformerEncoderV1 +from i6_models.config import ModuleFactoryV1 +from i6_models.parts.frontend.vgg_act import VGG4LayerActFrontendV1, VGG4LayerActFrontendV1Config + +from i6_models.parts.conformer.convolution import ConformerConvolutionV1Config +from i6_models.parts.conformer.feedforward import ConformerPositionwiseFeedForwardV1Config +from i6_models.parts.conformer.mhsa import ConformerMHSAV1Config +from i6_models.primitives.specaugment import specaugment_v1_by_length +from i6_models.primitives.feature_extraction import LogMelFeatureExtractionV1, 
LogMelFeatureExtractionV1Config +from ..i6modelsV1_VGG4LayerActFrontendV1_v4_cfg import ModelConfig + +from ..i6modelsV1_VGG4LayerActFrontendV1_v4_cfg import \ + SpecaugConfig, VGG4LayerActFrontendV1Config_mod, ModelConfig + + +from returnn.torch.context import get_run_ctx + +from .shared.configs import DbMelFeatureExtractionConfig +from .shared.feature_extraction import DbMelFeatureExtraction +from .shared.spec_augment import apply_spec_aug +from .shared.mask import mask_tensor + +from .shared import modules +from .shared import commons +from .shared import attentions +from .monotonic_align import maximum_path + +from .shared.forward import forward_init_hook, forward_step, forward_finish_hook, prior_init_hook, prior_finish_hook, prior_step +from .shared.train import train_step +from .shared.eval_invertibility import * + +from IPython import embed + +class Model(nn.Module): + """ + Flow-based ASR model based on GlowTTS Structure using a pre-trained flow-based decoder + trained to generate spectrograms from given statistics coming from an encoder + + Model was pretrained using the architecture in + users/rilling/experiments/librispeech/librispeech_glowtts/pytorch_networks/glowTTS.py + """ + + def __init__( + self, + n_vocab: int, + hidden_channels: int = 192, + out_channels: int = 80, + n_blocks_dec: int = 12, + kernel_size_dec: int = 5, + dilation_rate: int = 1, + n_block_layers: int = 4, + p_dropout: float = 0.1, + p_dropout_flow: float = 0.05, + gin_channels: int = 0, + n_split: int = 4, + n_sqz: int = 2, + sigmoid_scale: bool = False, + window_size: int = 4, + block_length: int = None, + hidden_channels_dec: int = None, + label_target_size=None, + spec_augment = False, + layer_norm = False, + batch_norm = False, + **kwargs, + ): + """_summary_ + + Args: + n_vocab (int): vocabulary size + hidden_channels (int): Number of hidden channels in encoder + out_channels (int): Number of channels in the output + n_blocks_dec (int, optional): Number of coupling blocks in 
the decoder. Defaults to 12. + kernel_size_dec (int, optional): Kernel size in the decoder. Defaults to 5. + dilation_rate (int, optional): Dilation rate for CNNs of coupling blocks in decoder. Defaults to 5. + n_block_layers (int, optional): Number of layers in the CNN of the coupling blocks in decoder. Defaults to 4. + p_dropout_dec (_type_, optional): Dropout probability in the decoder. Defaults to 0.. + n_speakers (int, optional): Number of speakers. Defaults to 0. + gin_channels (int, optional): Number of speaker embedding channels. Defaults to 0. + n_split (int, optional): Number of splits for the 1x1 convolution for flows in the decoder. Defaults to 4. + n_sqz (int, optional): Squeeze. Defaults to 1. + sigmoid_scale (bool, optional): Boolean to define if log probs in coupling layers should be rescaled using sigmoid. Defaults to False. + window_size (int, optional): Window size in Multi-Head Self-Attention for encoder. Defaults to None. + block_length (_type_, optional): Block length for optional block masking in Multi-Head Attention for encoder. Defaults to None. + hidden_channels_dec (_type_, optional): Number of hidden channels in decodder. Defaults to hidden_channels. 
+ final_hidden_channels: Number of hidden channels in the final network + final_n_layers: Number of layers in the final network + label_target_size: Target size of target vocabulary, target size for final network + """ + super().__init__() + self.n_vocab = n_vocab + self.hidden_channels = hidden_channels + self.out_channels = out_channels + self.n_blocks_dec = n_blocks_dec + self.kernel_size_dec = kernel_size_dec + self.dilation_rate = dilation_rate + self.n_block_layers = n_block_layers + self.p_dropout = p_dropout + self.p_dropout_flow = p_dropout_flow + self.n_split = n_split + self.n_sqz = n_sqz + self.sigmoid_scale = sigmoid_scale + self.window_size = window_size + self.block_length = block_length + self.hidden_channels_dec = hidden_channels_dec + self.spec_augment = spec_augment + self.layer_norm = layer_norm + self.batch_norm = batch_norm + + self.net_kwargs = { + "repeat_per_num_frames": 100, + "max_dim_feat": 8, + "num_repeat_feat": 5, + "max_dim_time": 20, + } + + fe_config = DbMelFeatureExtractionConfig.from_dict(kwargs["fe_config"]) + self.feature_extraction = DbMelFeatureExtraction(config=fe_config) + + if label_target_size is None: + if n_vocab is None: + run_ctx = get_run_ctx() + dataset = run_ctx.engine.train_dataset or run_ctx.engine.forward_dataset + self.label_target_size = len(dataset.datasets["zip_dataset"].targets.labels) + else: + self.label_target_size = n_vocab + else: + self.label_target_size = label_target_size + + self.decoder = modules.Flow( + out_channels, + hidden_channels_dec or hidden_channels, + kernel_size_dec, + dilation_rate, + n_blocks_dec, + n_block_layers, + p_dropout=p_dropout_flow, + n_split=n_split, + n_sqz=n_sqz, + sigmoid_scale=sigmoid_scale, + gin_channels=gin_channels, + ) + + specaug_config = SpecaugConfig( + repeat_per_n_frames=25, + max_dim_time=20, + max_dim_feat=16, + num_repeat_feat=5, + ) + frontend_config = VGG4LayerActFrontendV1Config( + in_features=80, + conv1_channels=16, + conv2_channels=16, + 
conv3_channels=16, + conv4_channels=16, + conv_kernel_size=(3, 3), + conv_padding=None, + pool1_kernel_size=(2, 1), + pool1_stride=(2, 1), + pool1_padding=None, + pool2_kernel_size=(2, 1), + pool2_stride=(2, 1), + pool2_padding=None, + out_features=96, + activation=nn.ReLU(), + ) + + model_config = ModelConfig( + frontend_config=frontend_config, + specaug_config=None, + label_target_size=self.n_vocab, + conformer_size=96, + num_layers=8, + num_heads=2, + ff_dim=384, + att_weights_dropout=0.2, + conv_dropout=0.2, + ff_dropout=0.2, + mhsa_dropout=0.2, + conv_kernel_size=9, + final_dropout=0.2, + specauc_start_epoch=1, + ) + self.cfg = model_config + frontend_config = self.cfg.frontend_config + conformer_size = self.cfg.conformer_size + conformer_config = ConformerEncoderV1Config( + num_layers=self.cfg.num_layers, + frontend=ModuleFactoryV1(module_class=VGG4LayerActFrontendV1, cfg=frontend_config), + block_cfg=ConformerBlockV1Config( + ff_cfg=ConformerPositionwiseFeedForwardV1Config( + input_dim=conformer_size, + hidden_dim=self.cfg.ff_dim, + dropout=self.cfg.ff_dropout, + activation=nn.functional.silu, + ), + mhsa_cfg=ConformerMHSAV1Config( + input_dim=conformer_size, + num_att_heads=self.cfg.num_heads, + att_weights_dropout=self.cfg.att_weights_dropout, + dropout=self.cfg.mhsa_dropout, + ), + conv_cfg=ConformerConvolutionV1Config( + channels=conformer_size, kernel_size=self.cfg.conv_kernel_size, dropout=self.cfg.conv_dropout, activation=nn.functional.silu, + norm=LayerNormNC(conformer_size) + ), + ), + ) + + if self.layer_norm: + print("Using Layer Norm after Flow...") + + if self.batch_norm: + print("Using Batch Norm after Flow...") + self.bn = nn.BatchNorm1d(self.out_channels) + else: + self.bn = None + + self.conformer = ConformerEncoderV1(cfg=conformer_config) + self.final_linear = nn.Linear(conformer_size, self.cfg.label_target_size + 1) # + CTC blank + self.final_dropout = nn.Dropout(p=self.cfg.final_dropout) + self.specaug_start_epoch = 
self.cfg.specauc_start_epoch + + + def forward(self, raw_audio, raw_audio_len): + with torch.no_grad(): + squeezed_audio = torch.squeeze(raw_audio) + log_mel_features, log_mel_features_len = self.feature_extraction(squeezed_audio, raw_audio_len) # [B, T, F] + + audio_max_length = log_mel_features.size(1) + spec_augment_in = log_mel_features + if self.training and self.spec_augment: + audio_features_masked_2 = apply_spec_aug( + spec_augment_in, + num_repeat_time=torch.max(log_mel_features_len).detach().cpu().numpy() + // self.net_kwargs["repeat_per_num_frames"], + max_dim_time=self.net_kwargs["max_dim_time"], + num_repeat_feat=self.net_kwargs["num_repeat_feat"], + max_dim_feat=self.net_kwargs["max_dim_feat"], + ) + else: + audio_features_masked_2 = spec_augment_in + + flow_in = audio_features_masked_2.transpose(1,2) # [B, F, T] + flow_in, flow_in_length, flow_in_max_length = self.preprocess(flow_in, log_mel_features_len, audio_max_length) + mask = torch.unsqueeze(commons.sequence_mask(log_mel_features_len, flow_in.size(2)), 1).to(flow_in.dtype) + flow_out, _ = self.decoder(flow_in, mask, reverse=False) # [B, F, T] + + conformer_in = flow_out.transpose(1,2) + mask = mask_tensor(conformer_in, flow_in_length) + + if self.layer_norm: + conformer_in = torch.nn.functional.layer_norm(conformer_in, (conformer_in.size(-1),)) + elif self.bn is not None: + conformer_in = self.bn(conformer_in.transpose(1,2)).transpose(1,2) + + conformer_out, out_mask = self.conformer(conformer_in, mask) + conformer_out = self.final_dropout(conformer_out) + logits = self.final_linear(conformer_out) + + log_probs = torch.log_softmax(logits, dim=2) + + return log_probs, torch.sum(out_mask, dim=1) + + def preprocess(self, y, y_lengths, y_max_length): + if y_max_length is not None: + y_max_length = (y_max_length // self.n_sqz) * self.n_sqz + y = y[:, :, :y_max_length] + y_lengths = (y_lengths // self.n_sqz) * self.n_sqz + return y, y_lengths, y_max_length + + def store_inverse(self): + 
self.decoder.store_inverse() + diff --git a/users/rilling/experiments/librispeech/librispeech_glow_asr/pytorch_networks/glowASR_conformer_no_pretrained.py b/users/rilling/experiments/librispeech/librispeech_glow_asr/pytorch_networks/glowASR_conformer_no_pretrained.py index 4bd7877ea..4e6da3392 100644 --- a/users/rilling/experiments/librispeech/librispeech_glow_asr/pytorch_networks/glowASR_conformer_no_pretrained.py +++ b/users/rilling/experiments/librispeech/librispeech_glow_asr/pytorch_networks/glowASR_conformer_no_pretrained.py @@ -51,7 +51,7 @@ from .shared.forward import forward_init_hook, forward_step, forward_finish_hook, prior_init_hook, prior_finish_hook, prior_step from .shared.train import train_step - +from .shared.eval_invertibility import * from IPython import embed class Model(nn.Module): diff --git a/users/rilling/experiments/librispeech/librispeech_glow_asr/pytorch_networks/glowASR_conformer_v2.py b/users/rilling/experiments/librispeech/librispeech_glow_asr/pytorch_networks/glowASR_conformer_v2.py new file mode 100644 index 000000000..0c1fd6c21 --- /dev/null +++ b/users/rilling/experiments/librispeech/librispeech_glow_asr/pytorch_networks/glowASR_conformer_v2.py @@ -0,0 +1,298 @@ +""" +Trying to make the aligner more AppTek-Like + +Extended weight init code +""" + +from dataclasses import dataclass +import torch +import numpy as np +from torch import nn +import multiprocessing +from librosa import filters +import sys +import time +from typing import Any, Dict, Optional, Tuple, Union +import math + +from torchaudio.functional import mask_along_axis + +from i6_models.parts.blstm import BlstmEncoderV1, BlstmEncoderV1Config + + +from i6_models.parts.conformer.norm import LayerNormNC +from i6_models.assemblies.conformer.conformer_v1 import ConformerEncoderV1Config +from i6_models.assemblies.conformer.conformer_v1 import ConformerBlockV1Config, ConformerEncoderV1 +from i6_models.config import ModuleFactoryV1 +from i6_models.parts.frontend.vgg_act import 
VGG4LayerActFrontendV1, VGG4LayerActFrontendV1Config + +from i6_models.parts.conformer.convolution import ConformerConvolutionV1Config +from i6_models.parts.conformer.feedforward import ConformerPositionwiseFeedForwardV1Config +from i6_models.parts.conformer.mhsa import ConformerMHSAV1Config +from i6_models.primitives.specaugment import specaugment_v1_by_length +from i6_models.primitives.feature_extraction import LogMelFeatureExtractionV1, LogMelFeatureExtractionV1Config + +from ..i6modelsV1_VGG4LayerActFrontendV1_v4_cfg import \ + SpecaugConfig, VGG4LayerActFrontendV1Config_mod, ModelConfig + + +from returnn.torch.context import get_run_ctx + +from .shared.configs import DbMelFeatureExtractionConfig +from .shared.feature_extraction import DbMelFeatureExtraction +from .shared.spec_augment import apply_spec_aug +from .shared.mask import mask_tensor + +from .shared import modules +from .shared import commons +from .shared import attentions +from .monotonic_align import maximum_path + +from .shared.forward import forward_init_hook, forward_step, forward_finish_hook, prior_init_hook, prior_finish_hook, prior_step +from .shared.train import train_step + +from IPython import embed + +class Model(nn.Module): + """ + Flow-based ASR model based on GlowTTS Structure using a pre-trained flow-based decoder + trained to generate spectrograms from given statistics coming from an encoder + + Model was pretrained using the architecture in + users/rilling/experiments/librispeech/librispeech_glowtts/pytorch_networks/glowTTS.py + """ + + def __init__( + self, + n_vocab: int, + hidden_channels: int = 192, + out_channels: int = 80, + n_blocks_dec: int = 12, + kernel_size_dec: int = 5, + dilation_rate: int = 1, + n_block_layers: int = 4, + p_dropout: float = 0.1, + p_dropout_flow: float = 0.05, + gin_channels: int = 0, + n_split: int = 4, + n_sqz: int = 2, + sigmoid_scale: bool = False, + window_size: int = 4, + block_length: int = None, + hidden_channels_dec: int = None, + 
label_target_size: int = None, + spec_augment: bool = False, + layer_norm: bool = False, + batch_norm: bool = False, + conformer_model_config: ModelConfig = None, + **kwargs, + ): + """_summary_ + + Args: + n_vocab (int): vocabulary size + hidden_channels (int): Number of hidden channels in encoder + out_channels (int): Number of channels in the output + n_blocks_dec (int, optional): Number of coupling blocks in the decoder. Defaults to 12. + kernel_size_dec (int, optional): Kernel size in the decoder. Defaults to 5. + dilation_rate (int, optional): Dilation rate for CNNs of coupling blocks in decoder. Defaults to 5. + n_block_layers (int, optional): Number of layers in the CNN of the coupling blocks in decoder. Defaults to 4. + p_dropout_dec (_type_, optional): Dropout probability in the decoder. Defaults to 0.. + n_speakers (int, optional): Number of speakers. Defaults to 0. + gin_channels (int, optional): Number of speaker embedding channels. Defaults to 0. + n_split (int, optional): Number of splits for the 1x1 convolution for flows in the decoder. Defaults to 4. + n_sqz (int, optional): Squeeze. Defaults to 1. + sigmoid_scale (bool, optional): Boolean to define if log probs in coupling layers should be rescaled using sigmoid. Defaults to False. + window_size (int, optional): Window size in Multi-Head Self-Attention for encoder. Defaults to None. + block_length (_type_, optional): Block length for optional block masking in Multi-Head Attention for encoder. Defaults to None. + hidden_channels_dec (_type_, optional): Number of hidden channels in decodder. Defaults to hidden_channels. 
+ final_hidden_channels: Number of hidden channels in the final network + final_n_layers: Number of layers in the final network + label_target_size: Target size of target vocabulary, target size for final network + """ + super().__init__() + self.n_vocab = n_vocab + self.hidden_channels = hidden_channels + self.out_channels = out_channels + self.n_blocks_dec = n_blocks_dec + self.kernel_size_dec = kernel_size_dec + self.dilation_rate = dilation_rate + self.n_block_layers = n_block_layers + self.p_dropout = p_dropout + self.p_dropout_flow = p_dropout_flow + self.n_split = n_split + self.n_sqz = n_sqz + self.sigmoid_scale = sigmoid_scale + self.window_size = window_size + self.block_length = block_length + self.hidden_channels_dec = hidden_channels_dec + self.spec_augment = spec_augment + self.layer_norm = layer_norm + self.batch_norm = batch_norm + + self.net_kwargs = { + "repeat_per_num_frames": 100, + "max_dim_feat": 8, + "num_repeat_feat": 5, + "max_dim_time": 20, + } + + fe_config = DbMelFeatureExtractionConfig.from_dict(kwargs["fe_config"]) + self.feature_extraction = DbMelFeatureExtraction(config=fe_config) + + if label_target_size is None: + if n_vocab is None: + run_ctx = get_run_ctx() + dataset = run_ctx.engine.train_dataset or run_ctx.engine.forward_dataset + self.label_target_size = len(dataset.datasets["zip_dataset"].targets.labels) + else: + self.label_target_size = n_vocab + else: + self.label_target_size = label_target_size + + self.decoder = modules.Flow( + out_channels, + hidden_channels_dec or hidden_channels, + kernel_size_dec, + dilation_rate, + n_blocks_dec, + n_block_layers, + p_dropout=p_dropout_flow, + n_split=n_split, + n_sqz=n_sqz, + sigmoid_scale=sigmoid_scale, + gin_channels=gin_channels, + ) + if conformer_model_config is None: + specaug_config = SpecaugConfig( + repeat_per_n_frames=25, + max_dim_time=20, + max_dim_feat=16, + num_repeat_feat=5, + ) + frontend_config = VGG4LayerActFrontendV1Config( + in_features=80, + conv1_channels=32, + 
conv2_channels=64, + conv3_channels=64, + conv4_channels=32, + conv_kernel_size=(3, 3), + conv_padding=None, + pool1_kernel_size=(2, 1), + pool1_stride=(2, 1), + pool1_padding=None, + pool2_kernel_size=(2, 1), + pool2_stride=(2, 1), + pool2_padding=None, + out_features=384, + activation=nn.ReLU(), + ) + conformer_model_config = ModelConfig( + frontend_config=frontend_config, + specaug_config=specaug_config, + label_target_size=self.n_vocab, + conformer_size=384, + num_layers=12, + num_heads=4, + ff_dim=1536, + att_weights_dropout=0.2, + conv_dropout=0.2, + ff_dropout=0.2, + mhsa_dropout=0.2, + conv_kernel_size=31, + final_dropout=0.2, + specauc_start_epoch=1 + ) + + self.cfg = conformer_model_config + frontend_config = self.cfg.frontend_config + conformer_size = self.cfg.conformer_size + conformer_config = ConformerEncoderV1Config( + num_layers=self.cfg.num_layers, + frontend=ModuleFactoryV1(module_class=VGG4LayerActFrontendV1, cfg=frontend_config), + block_cfg=ConformerBlockV1Config( + ff_cfg=ConformerPositionwiseFeedForwardV1Config( + input_dim=conformer_size, + hidden_dim=self.cfg.ff_dim, + dropout=self.cfg.ff_dropout, + activation=nn.functional.silu, + ), + mhsa_cfg=ConformerMHSAV1Config( + input_dim=conformer_size, + num_att_heads=self.cfg.num_heads, + att_weights_dropout=self.cfg.att_weights_dropout, + dropout=self.cfg.mhsa_dropout, + ), + conv_cfg=ConformerConvolutionV1Config( + channels=conformer_size, kernel_size=self.cfg.conv_kernel_size, dropout=self.cfg.conv_dropout, activation=nn.functional.silu, + norm=LayerNormNC(conformer_size) + ), + ), + ) + + if self.layer_norm: + print("Using Layer Norm after Flow...") + + if self.batch_norm: + print("Using Batch Norm after Flow...") + self.bn = nn.BatchNorm1d(self.out_channels) + else: + self.bn = None + + self.conformer = ConformerEncoderV1(cfg=conformer_config) + self.final_linear = nn.Linear(conformer_size, self.cfg.label_target_size + 1) # + CTC blank + self.final_dropout = 
nn.Dropout(p=self.cfg.final_dropout) + self.specaug_start_epoch = self.cfg.specauc_start_epoch + + def forward(self, raw_audio, raw_audio_len): + with torch.no_grad(): + self.decoder.eval() + squeezed_audio = torch.squeeze(raw_audio) + log_mel_features, log_mel_features_len = self.feature_extraction(squeezed_audio, raw_audio_len) # [B, T, F] + + audio_max_length = log_mel_features.size(1) + + flow_in = log_mel_features.transpose(1,2) # [B, F, T] + flow_in, flow_in_length, flow_in_max_length = self.preprocess(flow_in, log_mel_features_len, audio_max_length) + mask = torch.unsqueeze(commons.sequence_mask(log_mel_features_len, flow_in.size(2)), 1).to(flow_in.dtype) + flow_out, _ = self.decoder(flow_in, mask, reverse=False) # [B, F, T] + + spec_augment_in = flow_out.transpose(1,2) # [B, T, F] + mask = mask_tensor(spec_augment_in, flow_in_length) + + if self.training and self.spec_augment: + audio_features_masked_2 = apply_spec_aug( + spec_augment_in, + num_repeat_time=torch.max(log_mel_features_len).detach().cpu().numpy() + // self.net_kwargs["repeat_per_num_frames"], + max_dim_time=self.net_kwargs["max_dim_time"], + num_repeat_feat=self.net_kwargs["num_repeat_feat"], + max_dim_feat=self.net_kwargs["max_dim_feat"], + ) + else: + audio_features_masked_2 = spec_augment_in + + conformer_in = audio_features_masked_2 + + if self.layer_norm: + conformer_in = torch.nn.functional.layer_norm(conformer_in, (conformer_in.size(-1),)) + elif self.bn is not None: + conformer_in = self.bn(conformer_in.transpose(1,2)).transpose(1,2) + + conformer_out, out_mask = self.conformer(conformer_in, mask) + conformer_out = self.final_dropout(conformer_out) + logits = self.final_linear(conformer_out) + + log_probs = torch.log_softmax(logits, dim=2) + + return log_probs, torch.sum(out_mask, dim=1) + + def preprocess(self, y, y_lengths, y_max_length): + if y_max_length is not None: + y_max_length = (y_max_length // self.n_sqz) * self.n_sqz + y = y[:, :, :y_max_length] + y_lengths = (y_lengths // 
self.n_sqz) * self.n_sqz + return y, y_lengths, y_max_length + + def store_inverse(self): + self.decoder.store_inverse() diff --git a/users/rilling/experiments/librispeech/librispeech_glow_asr/pytorch_networks/only_blstm_frame_stack.py b/users/rilling/experiments/librispeech/librispeech_glow_asr/pytorch_networks/only_blstm_frame_stack.py index 32be20e41..08866c2cc 100644 --- a/users/rilling/experiments/librispeech/librispeech_glow_asr/pytorch_networks/only_blstm_frame_stack.py +++ b/users/rilling/experiments/librispeech/librispeech_glow_asr/pytorch_networks/only_blstm_frame_stack.py @@ -31,7 +31,7 @@ from .shared import attentions from .monotonic_align import maximum_path -from .shared.forward import forward_init_hook, forward_step, forward_finish_hook +from .shared.forward import forward_init_hook, forward_step, forward_finish_hook, prior_init_hook, prior_finish_hook, prior_step from .shared.train import train_step from IPython import embed diff --git a/users/rilling/experiments/librispeech/librispeech_glow_asr/pytorch_networks/shared/eval_invertibility.py b/users/rilling/experiments/librispeech/librispeech_glow_asr/pytorch_networks/shared/eval_invertibility.py new file mode 100644 index 000000000..127bf3375 --- /dev/null +++ b/users/rilling/experiments/librispeech/librispeech_glow_asr/pytorch_networks/shared/eval_invertibility.py @@ -0,0 +1,83 @@ +import multiprocessing +import torch +import numpy as np +from returnn.datasets.hdf import SimpleHDFWriter +from . 
import commons + + +def forward_init_hook_invertibility(run_ctx, **kwargs): + run_ctx.total_mae = 0 + run_ctx.total_ae_var = 0 + run_ctx.total_ae_max = torch.tensor(-np.inf) + run_ctx.total_ae_min = torch.tensor(np.inf) + run_ctx.num_of_obs = 0 + + +def forward_finish_hook_invertibility(run_ctx, **kwargs): + with open("output.hdf", "w+") as f: + f.write("total, mean, var, max, min \n") + f.write( + f"{run_ctx.num_of_obs}, {str(float(run_ctx.total_mae))}, {str(float(run_ctx.total_ae_var))}, {str(float(run_ctx.total_ae_max))}, {str(float(run_ctx.total_ae_min))}" + ) + + +def forward_step_invertibility(*, model, data, run_ctx, **kwargs): + raw_audio = data["raw_audio"] # [B, N] (sparse) + raw_audio_len = data["raw_audio:size1"] # [B] + phonemes = data["phon_labels"] + phonemes_len = data["phon_labels:size1"] + + squeezed_audio = torch.squeeze(raw_audio) + y, y_lengths = model.feature_extraction(squeezed_audio, raw_audio_len) # [B, T, F] + y = y.transpose(1, 2) # [B, F, T] + + if hasattr(model, "x_vector"): + _, _, g = model.x_vector(y, y_lengths) + + if hasattr(model, "x_vector_bottleneck"): + g = model.x_vector_bottleneck(g) + elif hasattr(model, "emb_g"): + g = torch.nn.functional.normalize(model.emb_g(g.squeeze(-1))).unsqueeze(-1) + else: + g = None + + y_max_length = y.size(2) + + y, y_lengths, y_max_length = model.preprocess(y, y_lengths, y_max_length) + z_mask = torch.unsqueeze(commons.sequence_mask(y_lengths, y_max_length), 1).to(torch.int32) + + z, _ = model.decoder(y, z_mask, g=g, reverse=False) + y_hat, _ = model.decoder(z, z_mask, g=g, reverse=True) + + mae = torch.nn.functional.l1_loss(y_hat * z_mask, y * z_mask, reduction="none") # [B, F, T] + + current_num_of_obs = y_hat.shape[1] * y_lengths.sum() # F * total_number_of_frames_in_batch + + old_mae = run_ctx.total_mae + + current_mae = ( + mae.sum() / current_num_of_obs + ) # This considers the masking by only using the mean over all unmasked elements + + current_var = (mae - current_mae).pow(2).sum() / ( 
+ current_num_of_obs - 1 + ) # Variance over unmasked elements with bias correction 1 + + run_ctx.total_mae = ((run_ctx.num_of_obs / (run_ctx.num_of_obs + current_num_of_obs)) * old_mae) + ( + (current_num_of_obs / (run_ctx.num_of_obs + current_num_of_obs)) * current_mae + ) + + run_ctx.total_ae_var = ( + (run_ctx.num_of_obs / (run_ctx.num_of_obs + current_num_of_obs)) * run_ctx.total_ae_var + + ((current_num_of_obs / (run_ctx.num_of_obs + current_num_of_obs)) * current_var) + + ((run_ctx.num_of_obs * current_num_of_obs) / (run_ctx.num_of_obs + current_num_of_obs) ** 2) + * (old_mae - current_mae) ** 2 + ) + + run_ctx.total_ae_max = torch.max(run_ctx.total_ae_max, mae.max()) + + run_ctx.total_ae_min = torch.min( + run_ctx.total_ae_min, (mae + (-1 * z_mask + 1) * torch.tensor(float("inf")).nan_to_num(0.0)).min() + ) # Masked Min operation + + run_ctx.num_of_obs += current_num_of_obs diff --git a/users/rilling/experiments/librispeech/librispeech_glow_asr/serializer.py b/users/rilling/experiments/librispeech/librispeech_glow_asr/serializer.py index 6c60d9de5..1a926dbee 100644 --- a/users/rilling/experiments/librispeech/librispeech_glow_asr/serializer.py +++ b/users/rilling/experiments/librispeech/librispeech_glow_asr/serializer.py @@ -20,6 +20,7 @@ def get_pytorch_serializer_v3( use_custom_engine=False, search=False, prior=False, + invertibility=False, debug=False, search_args: Dict[str, Any]={}, **kwargs @@ -91,6 +92,27 @@ def get_pytorch_serializer_v3( serializer_objects.extend( [forward_step, init_hook, finish_hook] ) + + if invertibility: + forward_step = Import( + code_object_path=package + ".%s.forward_step_invertibility" % network_module, + unhashed_package_root=PACKAGE, + import_as="forward_step", + ) + init_hook = Import( + code_object_path=package + ".%s.forward_init_hook_invertibility" % network_module, + unhashed_package_root=PACKAGE, + import_as="forward_init_hook", + ) + finish_hook = Import( + code_object_path=package + 
".%s.forward_finish_hook_invertibility" % network_module, + import_as="forward_finish_hook", + unhashed_package_root=PACKAGE, + ) + serializer_objects.extend( + [forward_step, init_hook, finish_hook] + ) + serializer = TorchCollection( serializer_objects=serializer_objects, make_local_package_copy=not debug, diff --git a/users/rilling/experiments/librispeech/librispeech_glow_asr/training_comparison.ipynb b/users/rilling/experiments/librispeech/librispeech_glow_asr/training_comparison.ipynb index 326039076..efbe738bc 100644 --- a/users/rilling/experiments/librispeech/librispeech_glow_asr/training_comparison.ipynb +++ b/users/rilling/experiments/librispeech/librispeech_glow_asr/training_comparison.ipynb @@ -32,25 +32,24 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "['/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_dec_0.05_v2/training', '/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed/training']\n" + "['/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment_before/glow_enc192_200ep_not_silence_preprocessed_not_freezed/training']\n" ] }, { "data": { "text/plain": [ - "({'/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_dec_0.05_v2/training': '/asr_dataset/spec_augment/glow_enc192_200ep_dec_0.05_v2/',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed/training': '/glow_not_eval/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed/'},\n", - " 2)" + "({'/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment_before/glow_enc192_200ep_not_silence_preprocessed_not_freezed/training': ''},\n", + " 1)" ] }, - "execution_count": 17, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -61,8 +60,8 @@ " # \"/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed*/training\",\n", " # \"/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/librispeech_glow_asr/pytorch/blstm_2x512_d0.2_b300_fs4/asr_dataset/glow_enc192/glow_*eval/not_silence_preprocessed/training\"\n", " # \"/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed*/training\"\n", - " \"/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_dec_0.05_v2/training\",\n", - " \"/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed/training\",\n", + " # \"/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_dec_0.05_v2/training\",\n", + " # 
\"/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed/training\",\n", " # \"/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/librispeech_glow_asr/pytorch/encoding_test/enc768/mean_only/*/training\",\n", " # \"/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/librispeech_glow_asr/pytorch/encoding_test/enc768/with_sigma/*/training\",\n", " # \"/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/librispeech_glow_asr/pytorch/encoding_test/enc192/*/training\",\n", @@ -70,6 +69,7 @@ " # \"/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/tts_architecture/glow_tts/raw_audio/decoder_test/enc768/mean_only/*/training\",\n", " # \"/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/tts_architecture/glow_tts/raw_audio/decoder_test/enc192/*/training\",\n", " # \"/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/tts_architecture/glow_tts/raw_audio/decoder_test/enc768/with_sigma/*/training\",\n", + " \"/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment_before/glow_enc192_200ep_not_silence_preprocessed_not_freezed/training\"\n", "]\n", "lr_files = []\n", "for g in globs:\n", @@ -92,7 +92,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 13, "metadata": {}, "outputs": [], "source": [ @@ -120,7 +120,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 14, "metadata": {}, "outputs": [], "source": [ @@ -129,15 +129,13 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 15, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - 
"/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/glow_enc192_200ep_dec_0.05_v2/training: 3\n", - "/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/glow_not_eval/asr_dataset/spec_augment/glow_enc192_200ep_not_silence_preprocessed/training: 3\n", "/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/no_glow/training: 3\n", "Large Font: False\n", "Setup Interactive Legend\n", @@ -147,18 +145,18 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "d2310e7bf4524ee3b98f4cd89b041032", + "model_id": "9711d7e04e59419f918b17db11cf953f", "version_major": 2, "version_minor": 0 }, - "image/png": "", + "image/png": "", "text/html": [ "\n", "
\n", "
\n", " Figure\n", "
\n", - " \n", + " \n", "
\n", " " ], diff --git a/users/rilling/experiments/librispeech/librispeech_joint_training/config.py b/users/rilling/experiments/librispeech/librispeech_joint_training/config.py index 48e1c0dcb..c407b624e 100644 --- a/users/rilling/experiments/librispeech/librispeech_joint_training/config.py +++ b/users/rilling/experiments/librispeech/librispeech_joint_training/config.py @@ -194,3 +194,50 @@ def get_search_config( ) returnn_config = ReturnnConfig(config=config, post_config=post_config, python_epilog=[serializer]) return returnn_config + + +def get_prior_config( + training_datasets: TrainingDataset, + network_module: str, + net_args: Dict[str, Any], + config: Dict[str, Any], + debug: bool = False, + use_custom_engine=False, + target="prior", + **kwargs, +): + """ + Returns the RETURNN config serialized by :class:`ReturnnCommonSerializer` in returnn_common for the ctc_aligner + :param returnn_common_root: returnn_common version to be used, usually output of CloneGitRepositoryJob + :param training_datasets: datasets for training + :param kwargs: arguments to be passed to the network construction + :return: RETURNN training config + """ + + # changing these does not change the hash + post_config = { + } + + base_config = { + ############# + "batch_size": 50000 * 160, + "max_seqs": 60, + ############# + "forward": training_datasets.prior.as_returnn_opts() if target == "prior" else training_datasets.devtrain.as_returnn_opts() + + } + config = {**base_config, **copy.deepcopy(config)} + post_config["backend"] = "torch" + + serializer = get_serializer( + network_module=network_module, + net_args=net_args, + debug=debug, + use_custom_engine=use_custom_engine, + forward=True, + target=target, + ) + returnn_config = ReturnnConfig( + config=config, post_config=post_config, python_epilog=[serializer] + ) + return returnn_config diff --git a/users/rilling/experiments/librispeech/librispeech_joint_training/data.py 
b/users/rilling/experiments/librispeech/librispeech_joint_training/data.py index 457b6ce9d..7910752e9 100644 --- a/users/rilling/experiments/librispeech/librispeech_joint_training/data.py +++ b/users/rilling/experiments/librispeech/librispeech_joint_training/data.py @@ -72,6 +72,7 @@ class TrainingDataset: train: Dataset cv: Dataset devtrain: Dataset + prior: Dataset datastreams: Dict[str, Datastream] @@ -516,7 +517,17 @@ def build_training_dataset( ) devtrain_dataset = make_meta_dataset(devtrain_zip_dataset, joint_speaker_dataset, train_eow_phonemes_dataset) - return TrainingDataset(train=train_dataset, cv=cv_dataset, devtrain=devtrain_dataset, datastreams=datastreams) + prior_zip_dataset = OggZipDataset( + files=train_ogg, + audio_options=training_audio_opts, + target_options=train_phoneme_datastream_tts.as_returnn_targets_opts(), + partition_epoch=1, + seq_ordering="sorted_reverse", + additional_options=additional_opts, + ) + prior_dataset = make_meta_dataset(prior_zip_dataset, joint_speaker_dataset, train_eow_phonemes_dataset) + + return TrainingDataset(train=train_dataset, cv=cv_dataset, devtrain=devtrain_dataset, datastreams=datastreams, prior=prior_dataset) @lru_cache() diff --git a/users/rilling/experiments/librispeech/librispeech_joint_training/experiments.py b/users/rilling/experiments/librispeech/librispeech_joint_training/experiments.py index a4d523046..3b116ab20 100644 --- a/users/rilling/experiments/librispeech/librispeech_joint_training/experiments.py +++ b/users/rilling/experiments/librispeech/librispeech_joint_training/experiments.py @@ -15,8 +15,8 @@ get_arpa_lm, get_text_lexicon, ) -from .config import get_training_config, get_extract_durations_forward__config, get_forward_config, get_search_config -from .pipeline import training, forward, search +from .config import get_training_config, get_extract_durations_forward__config, get_forward_config, get_search_config, get_prior_config +from .pipeline import training, forward, search, compute_prior 
from i6_experiments.users.rilling.experiments.librispeech.common.tts_eval import tts_eval @@ -24,6 +24,7 @@ from .pytorch_networks.shared.i6modelsV1_VGG4LayerActFrontendV1_v4_cfg import ( SpecaugConfig, ModelConfig, + ModelConfigV2, VGG4LayerActFrontendV1Config_mod, TextEncoderConfig, FlowDecoderConfig, @@ -63,10 +64,14 @@ def run_exp( eval_tts=False, tts_eval_datasets=None, eval_invertibility=False, + eval_asr_invertibility=False, large_gpu_training=False, + with_prior=False, ): exp = {} + with_prior = with_prior or ("prior_scale" in search_args and search_args["prior_scale"] != 0) + if given_train_job_for_forward is None: training_config = get_training_config( training_datasets=dataset, @@ -90,6 +95,18 @@ def run_exp( train_job = given_train_job_for_forward exp["train_job"] = train_job + if with_prior: + returnn_config = get_prior_config(training_datasets=dataset, **args) + prior_file = compute_prior( + prefix + name, + returnn_config, + checkpoint=train_job.out_checkpoints[num_epochs], + returnn_exe=RETURNN_PYTORCH_ASR_SEARCH_EXE, + returnn_root=MINI_RETURNN_ROOT, + ) + tk.register_output(prefix + name + "/prior.txt", prior_file) + search_args["prior_file"] = prior_file + if extract_x_vector: forward_x_vector_config = get_forward_config( forward_dataset=dataset, **args, forward_args=forward_args, target="xvector", train_data=True @@ -115,22 +132,12 @@ def run_exp( ), ) - # forward_config_gl2 = get_forward_config( - # forward_dataset=dataset, - # **{**args, **{"config": {"batch_size": 50 * 16000}}}, - # forward_args={ - # **forward_args, - # "gl_net_checkpoint": gl_checkpoint["checkpoint"], - # "gl_net_config": gl_checkpoint["config"], - # }, - # target="corpus_gl", - # cv_data=True, - # ) - if eval_invertibility: - forward_config_invert = get_forward_config( - forward_dataset=dataset, **{**args, **{"config": {"batch_size": 50 * 16000}}}, target="invertibility" - ) + forward_config_invert = get_prior_config(dataset, target="invertibility", **args) + + if 
eval_asr_invertibility: + # Only used to check invertibility for models using two forward passes + forward_config_invert = get_prior_config(dataset, target="asr_invertibility", **args) if asr_search: search_config = get_search_config( @@ -174,18 +181,6 @@ def run_exp( nisqa_confidence=True, ) - # forward_job_gl = tts_eval( - # checkpoint=train_job.out_checkpoints[num_epochs], - # prefix_name=prefix + name, - # returnn_config=forward_config_gl2, - # returnn_exe=RETURNN_PYTORCH_EXE, - # returnn_root=MINI_RETURNN_ROOT, - # vocoder="gl", - # nisqa_eval=True, - # swer_eval=True, - # swer_eval_corpus_key="train-clean-100", - # ) - if eval_invertibility: forward_job = forward( checkpoint=train_job.out_checkpoints[num_epochs], @@ -198,6 +193,18 @@ def run_exp( exp["invertibility_job"] = forward_job + if eval_asr_invertibility: + forward_job = forward( + checkpoint=train_job.out_checkpoints[num_epochs], + config=forward_config_invert, + returnn_exe=RETURNN_PYTORCH_EXE, + returnn_root=MINI_RETURNN_ROOT, + prefix=prefix + name, + target="asr_invertibility", + ) + + exp["asr_invertibility_job"] = forward_job + if asr_search: search( prefix + name + "/search", @@ -302,6 +309,11 @@ def run_exp( ) asr_test_datasets2["dev-clean"] = build_test_dataset(librispeech_key="train-clean-100", dataset_key="dev-clean") + asr_test_datasets3 = copy.deepcopy(asr_test_datasets) + asr_test_datasets3["dev-clean"] = build_test_dataset(librispeech_key="train-clean-100", dataset_key="dev-clean") + asr_test_datasets3["test-clean"] = build_test_dataset(librispeech_key="train-clean-100", dataset_key="test-clean") + asr_test_datasets3["test-other"] = build_test_dataset(librispeech_key="train-clean-100", dataset_key="test-other") + tts_forward_datasets = {} tts_forward_datasets_xvectors = {} @@ -433,6 +445,9 @@ def run_exp( n_speakers=speaker_datastream.vocab_size, ) + model_config_strong_conformer_weak_specaug = copy.deepcopy(model_config_strong_conformer) + 
model_config_strong_conformer_weak_specaug.specaug_config = specaug_config + net_module = "glowTTS_ASR_conformer_x_vector" train_args = { @@ -650,18 +665,23 @@ def run_exp( search_args=default_search_args, eval_tts=True, tts_eval_datasets=tts_forward_datasets_xvectors, + eval_invertibility=True, ) for lm in [2.0, 2.5, 3.0, 3.5, 4.0, 4.5]: - exp_dict = run_exp( - net_module + f"/tuning/lm_{lm}", - train_args_v2, - training_datasets, - asr_test_datasets, - 250, - forward_args=forward_args, - search_args={**default_search_args, **{"lm_weight": lm}}, - ) + for ps in [0, 0.3, 0.5]: + additional_search_args = {"lm_weight": lm} if ps == 0 else {"lm_weight": lm, "prior_scale": ps} + suffix = f"/tuning/lm_{lm}" if ps == 0 else f"/tuning/lm_{lm}_ps_{ps}" + + exp_dict = run_exp( + net_module + suffix, + train_args_v2, + training_datasets, + asr_test_datasets, + 250, + forward_args=forward_args, + search_args={**default_search_args, **additional_search_args}, + ) exp_dict = run_exp( net_module + "_ctc_scale_0.1", train_args_v2, @@ -673,18 +693,23 @@ def run_exp( search_args=default_search_args, eval_tts=True, tts_eval_datasets=tts_forward_datasets_xvectors, + eval_invertibility=True ) for lm in [2.0, 2.5, 3.0, 3.5, 4.0, 4.5]: - exp_dict = run_exp( - net_module + f"_ctc_scale_0.1/tuning/lm_{lm}", - train_args_v2, - training_datasets, - asr_test_datasets, - 250, - training_args={"ctc_scale": 0.1}, - forward_args=forward_args, - search_args={**default_search_args, **{"lm_weight": lm}}, - ) + for ps in [0, 0.3, 0.5]: + additional_search_args = {"lm_weight": lm} if ps == 0 else {"lm_weight": lm, "prior_scale": ps} + suffix = f"_ctc_scale_0.1/tuning/lm_{lm}" if ps == 0 else f"_ctc_scale_0.1/tuning/lm_{lm}_ps_{ps}" + + exp_dict = run_exp( + net_module + suffix, + train_args_v2, + training_datasets, + asr_test_datasets, + 250, + training_args={"ctc_scale": 0.1}, + forward_args=forward_args, + search_args={**default_search_args, **additional_search_args}, + ) train_args_spec_augment_v2 
= copy.deepcopy(train_args_v2) train_args_spec_augment_v2["net_args"]["model_config"]["specaug_config"] = asdict(specaug_config) @@ -701,15 +726,18 @@ def run_exp( ) for lm in [2.0, 2.5, 3.0, 3.5, 4.0, 4.5]: - exp_dict = run_exp( - net_module + f"_spec_augment/tuning/lm_{lm}", - train_args_spec_augment_v2, - training_datasets, - asr_test_datasets, - 250, - forward_args=forward_args, - search_args={**default_search_args, **{"lm_weight": lm}}, - ) + for ps in [0, 0.3, 0.5]: + additional_search_args = {"lm_weight": lm} if ps == 0 else {"lm_weight": lm, "prior_scale": ps} + suffix = f"_spec_augment/tuning/lm_{lm}" if ps == 0 else f"_spec_augment/tuning/lm_{lm}_ps_{ps}" + exp_dict = run_exp( + net_module + suffix, + train_args_spec_augment_v2, + training_datasets, + asr_test_datasets, + 250, + forward_args=forward_args, + search_args={**default_search_args, **additional_search_args}, + ) exp_dict = run_exp( net_module + "_spec_augment_ctc_scale_0.1", @@ -725,16 +753,19 @@ def run_exp( ) for lm in [2.0, 2.5, 3.0, 3.5, 4.0, 4.5]: - exp_dict = run_exp( - net_module + f"_spec_augment_ctc_scale_0.1/tuning/lm_{lm}", - train_args_spec_augment_v2, - training_datasets, - asr_test_datasets, - 250, - training_args={"ctc_scale": 0.1}, - forward_args=forward_args, - search_args={**default_search_args, **{"lm_weight": lm}}, - ) + for ps in [0, 0.3, 0.5]: + additional_search_args = {"lm_weight": lm} if ps == 0 else {"lm_weight": lm, "prior_scale": ps} + suffix = f"_spec_augment_ctc_scale_0.1/tuning/lm_{lm}" if ps == 0 else f"_spec_augment_ctc_scale_0.1/tuning/lm_{lm}_ps_{ps}" + exp_dict = run_exp( + net_module + suffix, + train_args_spec_augment_v2, + training_datasets, + asr_test_datasets, + 250, + training_args={"ctc_scale": 0.1}, + forward_args=forward_args, + search_args={**default_search_args, **additional_search_args}, + ) train_args_control = copy.deepcopy(train_args) net_module = "glowTTS_ASR_conformer_x_vector_control" @@ -985,26 +1016,31 @@ def run_exp( ) for lm in [2.0, 
2.5, 3.0, 3.5, 4.0, 4.5]: - exp_dict = run_exp( - net_module + f"/tuning/lm_{lm}", - train_args_two_forward_v2, - training_datasets, - asr_test_datasets, - 250, - forward_args=forward_args, - search_args={**default_search_args, **{"lm_weight": lm}}, - ) + for ps in [0, 0.3, 0.5]: + additional_search_args = {"lm_weight": lm} if ps == 0 else {"lm_weight": lm, "prior_scale": ps} + suffix = f"/tuning/lm_{lm}" if ps == 0 else f"/tuning/lm_{lm}_ps_{ps}" + exp_dict = run_exp( + net_module + suffix, + train_args_two_forward_v2, + training_datasets, + asr_test_datasets, + 250, + forward_args=forward_args, + search_args={**default_search_args, **additional_search_args}, + ) - exp_dict = run_exp( - net_module + f"_ctc_scale_0.1/tuning/lm_{lm}", - train_args_two_forward_v2, - training_datasets, - asr_test_datasets, - 250, - training_args={"ctc_scale": 0.1}, - forward_args=forward_args, - search_args={**default_search_args, **{"lm_weight": lm}}, - ) + suffix = f"_ctc_scale_0.1/tuning/lm_{lm}" if ps == 0 else f"_ctc_scale_0.1/tuning/lm_{lm}_ps_{ps}" + + exp_dict = run_exp( + net_module + suffix, + train_args_two_forward_v2, + training_datasets, + asr_test_datasets, + 250, + training_args={"ctc_scale": 0.1}, + forward_args=forward_args, + search_args={**default_search_args, **additional_search_args}, + ) net_module = "glowTTS_ASR_conformer_two_forward_pass" train_args_two_forward_no_xvector = copy.deepcopy(train_args_spec_augment) @@ -1023,6 +1059,7 @@ def run_exp( eval_tts=True, tts_eval_datasets=tts_forward_datasets, eval_invertibility=True, + eval_asr_invertibility=True, ) exp_dict = run_exp( net_module + "_ctc_scale_0.1", @@ -1036,28 +1073,57 @@ def run_exp( eval_tts=True, tts_eval_datasets=tts_forward_datasets, eval_invertibility=True, + eval_asr_invertibility=True, ) for lm in [2.0, 2.5, 3.0, 3.5, 4.0, 4.5]: - exp_dict = run_exp( - net_module + f"/tuning/lm_{lm}", - train_args_two_forward_no_xvector, - training_datasets, - asr_test_datasets, - 250, - 
forward_args=forward_args, - search_args={**default_search_args, **{"lm_weight": lm}}, - ) - exp_dict = run_exp( - net_module + f"_ctc_scale_0.1/tuning/lm_{lm}", - train_args_two_forward_no_xvector, - training_datasets, - asr_test_datasets, - 250, - training_args={"ctc_scale": 0.1}, - forward_args=forward_args, - search_args={**default_search_args, **{"lm_weight": lm}}, - ) + for ps in [0, 0.3, 0.5]: + additional_search_args = {"lm_weight": lm, "prior_scale": ps} if ps != 0 else {"lm_weight": lm} + exp_dict = run_exp( + net_module + f"/tuning/lm_{lm}_ps_{ps}", + train_args_two_forward_no_xvector, + training_datasets, + asr_test_datasets, + 250, + forward_args=forward_args, + search_args={**default_search_args, **additional_search_args}, + with_prior=(ps!=0) + ) + exp_dict = run_exp( + net_module + f"_ctc_scale_0.1/tuning/lm_{lm}_ps_{ps}", + train_args_two_forward_no_xvector, + training_datasets, + asr_test_datasets, + 250, + training_args={"ctc_scale": 0.1}, + forward_args=forward_args, + search_args={**default_search_args, **additional_search_args}, + with_prior=(ps!=0) + ) + + tuned_search_args = {"lm_weight": 3.0, "prior_scale": 0.5} + exp_dict = run_exp( + net_module + f"/tuned", + train_args_two_forward_no_xvector, + training_datasets, + asr_test_datasets3, + 250, + forward_args=forward_args, + search_args={**default_search_args, **tuned_search_args}, + with_prior=(ps!=0) + ) + tuned_search_args = {"lm_weight": 2.5, "prior_scale": 0.5} + exp_dict = run_exp( + net_module + f"_ctc_scale_0.1/tuned", + train_args_two_forward_no_xvector, + training_datasets, + asr_test_datasets3, + 250, + training_args={"ctc_scale": 0.1}, + forward_args=forward_args, + search_args={**default_search_args, **tuned_search_args}, + with_prior=(ps!=0) + ) train_args_two_forward_no_xvector_strong_conformer = copy.deepcopy(train_args_two_forward_no_xvector) train_args_two_forward_no_xvector_strong_conformer["net_args"]["model_config"] = asdict(model_config_strong_conformer) @@ -1076,6 
+1142,111 @@ def run_exp( large_gpu_training=True ) + exp_dict = run_exp( + net_module + "_strong_conformer_ctc_scale_1.0", + train_args_two_forward_no_xvector_strong_conformer, + training_datasets, + asr_test_datasets, + 250, + training_args={"ctc_scale": 1.0}, + forward_args=forward_args, + search_args=default_search_args, + eval_tts=True, + tts_eval_datasets=tts_forward_datasets, + large_gpu_training=True + ) + + train_args_two_forward_no_xvector_strong_conformer_weak_specaug = copy.deepcopy(train_args_two_forward_no_xvector_strong_conformer) + train_args_two_forward_no_xvector_strong_conformer_weak_specaug["net_args"]["model_config"] = asdict(model_config_strong_conformer_weak_specaug) + exp_dict = run_exp( + net_module + "_strong_conformer_weak_specaug_ctc_scale_0.1", + train_args_two_forward_no_xvector_strong_conformer_weak_specaug, + training_datasets, + asr_test_datasets, + 250, + training_args={"ctc_scale": 0.1}, + forward_args=forward_args, + search_args=default_search_args, + eval_tts=True, + tts_eval_datasets=tts_forward_datasets, + large_gpu_training=True + ) + + exp_dict = run_exp( + net_module + "_strong_conformer_weak_specaug_ctc_scale_1.0", + train_args_two_forward_no_xvector_strong_conformer_weak_specaug, + training_datasets, + asr_test_datasets, + 250, + training_args={"ctc_scale": 1.0}, + forward_args=forward_args, + search_args=default_search_args, + eval_tts=True, + tts_eval_datasets=tts_forward_datasets, + large_gpu_training=True + ) + + for lm in [2.0, 2.5, 3.0, 3.5, 4.0, 4.5]: + for ps in [0, 0.3, 0.5]: + additional_search_args = {"lm_weight": lm, "prior_scale": ps} if ps != 0 else {"lm_weight": lm} + exp_dict = run_exp( + net_module + f"_strong_conformer_weak_specaug_ctc_scale_0.1/tuning/lm_{lm}_ps_{ps}", + train_args_two_forward_no_xvector_strong_conformer_weak_specaug, + training_datasets, + asr_test_datasets, + 250, + training_args={"ctc_scale": 0.1}, + forward_args=forward_args, + search_args={**default_search_args, 
**additional_search_args}, + eval_tts=True, + tts_eval_datasets=tts_forward_datasets, + large_gpu_training=True, + ) + + exp_dict = run_exp( + net_module + f"_strong_conformer_weak_specaug_ctc_scale_1.0/tuning/lm_{lm}_ps_{ps}", + train_args_two_forward_no_xvector_strong_conformer_weak_specaug, + training_datasets, + asr_test_datasets, + 250, + training_args={"ctc_scale": 1.0}, + forward_args=forward_args, + search_args={**default_search_args, **additional_search_args}, + eval_tts=True, + tts_eval_datasets=tts_forward_datasets, + large_gpu_training=True, + ) + + tuned_search_args = {"lm_weight": 3.5, "prior_scale": 0.3} + exp_dict = run_exp( + net_module + f"_strong_conformer_weak_specaug_ctc_scale_0.1/tuned", + train_args_two_forward_no_xvector_strong_conformer_weak_specaug, + training_datasets, + asr_test_datasets3, + 250, + training_args={"ctc_scale": 0.1}, + forward_args=forward_args, + search_args={**default_search_args, **tuned_search_args}, + eval_tts=True, + tts_eval_datasets=tts_forward_datasets, + large_gpu_training=True, + ) + + tuned_search_args = {"lm_weight": 4.0, "prior_scale": 0.5} + exp_dict = run_exp( + net_module + f"_strong_conformer_weak_specaug_ctc_scale_1.0/tuned", + train_args_two_forward_no_xvector_strong_conformer_weak_specaug, + training_datasets, + asr_test_datasets3, + 250, + training_args={"ctc_scale": 1.0}, + forward_args=forward_args, + search_args={**default_search_args, **tuned_search_args}, + eval_tts=True, + tts_eval_datasets=tts_forward_datasets, + large_gpu_training=True, + ) + train_args_conformer_only = copy.deepcopy(train_args) train_args_conformer_only["net_args"]["model_config"] = asdict(model_config) net_module = "only_conformer" @@ -1093,16 +1264,19 @@ def run_exp( ) for lm in [2.0, 2.5, 3.0, 3.5, 4.0, 4.5]: - exp_dict = run_exp( - net_module + f"/tuning/lm_{lm}", - train_args_conformer_only, - training_datasets, - asr_test_datasets, - 250, - forward_args=forward_args, - search_args={**default_search_args, **{"lm_weight": 
lm}}, - tts_forward=False, - ) + for ps in [0, 0.3, 0.5]: + additional_search_args = {"lm_weight": lm, "prior_scale": ps} if ps != 0 else {"lm_weight": lm} + exp_dict = run_exp( + net_module + f"/tuning/lm_{lm}_ps_{ps}", + train_args_conformer_only, + training_datasets, + asr_test_datasets, + 250, + forward_args=forward_args, + search_args={**default_search_args, **additional_search_args}, + tts_forward=False, + with_prior=(ps!=0) + ) train_args_conformer_only_spec_augment = copy.deepcopy(train_args_spec_augment) train_args_conformer_only_spec_augment["network_module"] = net_module @@ -1119,16 +1293,31 @@ def run_exp( ) for lm in [2.0, 2.5, 3.0, 3.5, 4.0, 4.5]: - exp_dict = run_exp( - net_module + f"_spec_augment/tuning/lm_{lm}", - train_args_conformer_only_spec_augment, - training_datasets, - asr_test_datasets, - 250, - forward_args=forward_args, - search_args={**default_search_args, **{"lm_weight": lm}}, - tts_forward=False, - ) + for ps in [0, 0.3, 0.5]: + additional_search_args = {"lm_weight": lm, "prior_scale": ps} if ps != 0 else {"lm_weight": lm} + suffix = f"_spec_augment/tuning/lm_{lm}_ps_{ps}" if ps != 0 else f"_spec_augment/tuning/lm_{lm}" + exp_dict = run_exp( + net_module + suffix, + train_args_conformer_only_spec_augment, + training_datasets, + asr_test_datasets, + 250, + forward_args=forward_args, + search_args={**default_search_args, **additional_search_args}, + tts_forward=False, + ) + + tuned_search_args = {"lm_weight": 2.5, "prior_scale": 0.5} + exp_dict = run_exp( + net_module + "_spec_augment/tuned", + train_args_conformer_only_spec_augment, + training_datasets, + asr_test_datasets3, + 250, + forward_args=forward_args, + search_args={**default_search_args, **tuned_search_args}, + tts_forward=False, + ) exp_dict = run_exp( net_module + "_spec_augment_tts_train_segments", @@ -1181,16 +1370,19 @@ def run_exp( ) for lm in [2.0, 2.5, 3.0, 3.5, 4.0, 4.5]: - exp_dict = run_exp( - net_module + f"/tuning/lm_{lm}", - train_args_glow_conformer, - 
training_datasets, - asr_test_datasets, - 250, - forward_args=forward_args, - search_args={**default_search_args, **{"lm_weight": lm}}, - tts_forward=False, - ) + for ps in [0, 0.3, 0.5]: + additional_search_args = {"lm_weight": lm} if ps == 0 else {"lm_weight": lm, "prior_scale": ps} + suffix = f"/tuning/lm_{lm}" if ps == 0 else f"/tuning/lm_{lm}_ps_{ps}" + exp_dict = run_exp( + net_module + suffix, + train_args_glow_conformer, + training_datasets, + asr_test_datasets, + 250, + forward_args=forward_args, + search_args={**default_search_args, **additional_search_args}, + tts_forward=False, + ) net_module = "glow_ASR_conformer_specaugment_before" train_args_glow_conformer_specaugment_before = copy.deepcopy(train_args_glow_conformer) @@ -1208,16 +1400,19 @@ def run_exp( ) for lm in [2.0, 2.5, 3.0, 3.5, 4.0, 4.5]: - exp_dict = run_exp( - net_module + f"/tuning/lm_{lm}", - train_args_glow_conformer_specaugment_before, - training_datasets, - asr_test_datasets, - 250, - forward_args=forward_args, - search_args={**default_search_args, **{"lm_weight": lm}}, - tts_forward=False, - ) + for ps in [0, 0.3, 0.5]: + additional_search_args = {"lm_weight": lm} if ps == 0 else {"lm_weight": lm, "prior_scale": ps} + suffix = f"/tuning/lm_{lm}" if ps == 0 else f"/tuning/lm_{lm}_ps_{ps}" + exp_dict = run_exp( + net_module + suffix, + train_args_glow_conformer_specaugment_before, + training_datasets, + asr_test_datasets, + 250, + forward_args=forward_args, + search_args={**default_search_args, **additional_search_args}, + tts_forward=False, + ) exp_dict = run_exp( net_module + "_tts_train_segments", @@ -1310,16 +1505,19 @@ def run_exp( ) for lm in [2.0, 2.5, 3.0, 3.5, 4.0, 4.5]: - exp_dict = run_exp( - net_module + f"/tuning/lm_{lm}", - train_args_glow_conformer_specaugment_before_x_vector_v2, - training_datasets, - asr_test_datasets, - 250, - forward_args=forward_args, - search_args={**default_search_args, **{"lm_weight": lm}}, - tts_forward=False, - ) + for ps in [0, 0.3, 0.5]: + 
additional_search_args = {"lm_weight": lm} if ps == 0 else {"lm_weight": lm, "prior_scale": ps} + suffix = f"/tuning/lm_{lm}" if ps == 0 else f"/tuning/lm_{lm}_ps_{ps}" + exp_dict = run_exp( + net_module + suffix, + train_args_glow_conformer_specaugment_before_x_vector_v2, + training_datasets, + asr_test_datasets, + 250, + forward_args=forward_args, + search_args={**default_search_args, **additional_search_args}, + tts_forward=False, + ) net_module = "glow_ASR_conformer_specaugment_before_xvector_v3" train_args_glow_conformer_specaugment_before_x_vector_v3 = copy.deepcopy( @@ -1391,27 +1589,32 @@ def run_exp( ) for lm in [2.0, 2.5, 3.0, 3.5, 4.0, 4.5]: - exp_dict = run_exp( - train_args_glow_conformer_xvector["network_module"] + f"grad_clip_10/tuning/lm_{lm}", - train_args_glow_conformer_xvector, - training_datasets, - asr_test_datasets, - 250, - forward_args=forward_args, - search_args={**default_search_args, **{"lm_weight": lm}}, - tts_forward=False, - ) + for ps in [0, 0.3, 0.5]: + additional_search_args = {"lm_weight": lm} if ps == 0 else {"lm_weight": lm, "prior_scale": ps} + suffix = f"grad_clip_10/tuning/lm_{lm}" if ps == 0 else f"grad_clip_10/tuning/lm_{lm}_ps_{ps}" + exp_dict = run_exp( + train_args_glow_conformer_xvector["network_module"] + suffix, + train_args_glow_conformer_xvector, + training_datasets, + asr_test_datasets, + 250, + forward_args=forward_args, + search_args={**default_search_args, **additional_search_args}, + tts_forward=False, + ) - exp_dict = run_exp( - train_args_glow_conformer_xvector_eval["network_module"] + f"/tuning/lm_{lm}", - train_args_glow_conformer_xvector_eval, - training_datasets, - asr_test_datasets, - 250, - forward_args=forward_args, - search_args={**default_search_args, **{"lm_weight": lm}}, - tts_forward=False, - ) + suffix = f"/tuning/lm_{lm}" if ps == 0 else f"/tuning/lm_{lm}_ps_{ps}" + + exp_dict = run_exp( + train_args_glow_conformer_xvector_eval["network_module"] + suffix, + train_args_glow_conformer_xvector_eval, 
+ training_datasets, + asr_test_datasets, + 250, + forward_args=forward_args, + search_args={**default_search_args, **additional_search_args}, + tts_forward=False, + ) train_args_glow_conformer_specaug_before_ddi_actnorm = copy.deepcopy(train_args_glow_conformer_specaugment_before) net_module = "glow_ASR_conformer_specaugment_before_ddi_actnorm" @@ -1500,4 +1703,68 @@ def run_exp( search_args=default_search_args, tts_forward=False, ) + # ================== BLSTM ================= + model_config_blstm = ModelConfigV2( + specaug_config=None, + decoder_config=flow_decoder_config, + text_encoder_config=text_encoder_config, + specauc_start_epoch=1, + label_target_size=vocab_size_without_blank_asr, + subsampling_factor=4, + blstm_layers=2, + blstm_hidden_dim=512, + blstm_dropout=0.2, + out_channels=80, + gin_channels=256, + n_speakers=speaker_datastream.vocab_size, + ) + + net_module="glowTTS_ASR_blstm_x_vector" + train_args_blstm = copy.deepcopy(train_args) + train_args_blstm["net_args"]["model_config"] = asdict(model_config_blstm) + train_args_blstm["network_module"] = net_module + exp_dict = run_exp( + net_module, + train_args_blstm, + training_datasets, + asr_test_datasets, + 250, + forward_args=forward_args, + search_args=default_search_args, + eval_tts=True, + tts_forward=False, + tts_eval_datasets=tts_forward_datasets_xvectors, + ) + + for lm_w in [2.0, 2.5, 3.0, 3.5, 4.0, 4.5]: + additional_search_args = {"lm_weight": lm_w} + exp_dict = run_exp( + net_module + f"/tuning/lm_{lm_w}", + train_args_blstm, + training_datasets, + asr_test_datasets, + 250, + forward_args=forward_args, + search_args={**default_search_args, **additional_search_args}, + eval_tts=True, + tts_forward=False, + tts_eval_datasets=tts_forward_datasets_xvectors, + ) + + model_config_blstm_specaug = copy.deepcopy(model_config_blstm) + model_config_blstm_specaug.specaug_config = specaug_config + train_args_blstm["net_args"]["model_config"] = asdict(model_config_blstm_specaug) + exp_dict = run_exp( 
+ net_module + "_specaug", + train_args_blstm, + training_datasets, + asr_test_datasets, + 250, + forward_args=forward_args, + search_args=default_search_args, + eval_tts=True, + tts_forward=False, + tts_eval_datasets=tts_forward_datasets_xvectors, + ) + return experiments diff --git a/users/rilling/experiments/librispeech/librispeech_joint_training/pipeline.py b/users/rilling/experiments/librispeech/librispeech_joint_training/pipeline.py index 1f85c074e..0ba7a3b33 100644 --- a/users/rilling/experiments/librispeech/librispeech_joint_training/pipeline.py +++ b/users/rilling/experiments/librispeech/librispeech_joint_training/pipeline.py @@ -199,6 +199,40 @@ def search(prefix_name, returnn_config, checkpoint, test_dataset_tuples, returnn return format_string_report, values_report +@tk.block() +def compute_prior( + prefix_name, + returnn_config, + checkpoint, + returnn_exe, + returnn_root, + mem_rqmt=8, +): + """ + Run search for a specific test dataset + + :param str prefix_name: + :param ReturnnConfig returnn_config: + :param Checkpoint checkpoint: + :param Path returnn_exe: + :param Path returnn_root: + """ + search_job = ReturnnForwardJobV2( + model_checkpoint=checkpoint, + returnn_config=returnn_config, + log_verbosity=5, + mem_rqmt=mem_rqmt, + time_rqmt=1, + device="gpu", + cpu_rqmt=4, + returnn_python_exe=returnn_exe, + returnn_root=returnn_root, + output_files=["prior.txt"], + ) + search_job.add_alias(prefix_name + "/prior_job") + return search_job.out_files["prior.txt"] + + # def evaluate_invertibility(name, checkpoint, forward_config, returnn_root, returnn_exe): # forward_job = forward( diff --git a/users/rilling/experiments/librispeech/librispeech_joint_training/pytorch_networks/glowTTS_ASR_blstm_two_forward_pass.py b/users/rilling/experiments/librispeech/librispeech_joint_training/pytorch_networks/glowTTS_ASR_blstm_two_forward_pass.py new file mode 100644 index 000000000..5e3a588e3 --- /dev/null +++ 
b/users/rilling/experiments/librispeech/librispeech_joint_training/pytorch_networks/glowTTS_ASR_blstm_two_forward_pass.py @@ -0,0 +1,632 @@ +""" +Trying to make the aligner more AppTek-Like + +Extended weight init code +""" + +from dataclasses import dataclass +import torch +import numpy as np +from torch import nn +import multiprocessing +from librosa import filters +import sys +import time +from typing import Any, Dict, Optional, Tuple, Union +import math +import os +import soundfile + +from torchaudio.functional import mask_along_axis + +from i6_models.parts.blstm import BlstmEncoderV1, BlstmEncoderV1Config + + +from i6_models.parts.conformer.norm import LayerNormNC +from i6_models.assemblies.conformer.conformer_v1 import ConformerEncoderV1Config +from i6_models.assemblies.conformer.conformer_v1 import ConformerBlockV1Config, ConformerEncoderV1 +from i6_models.config import ModuleFactoryV1 +from i6_models.parts.frontend.vgg_act import VGG4LayerActFrontendV1, VGG4LayerActFrontendV1Config + +from i6_models.parts.conformer.convolution import ConformerConvolutionV1Config +from i6_models.parts.conformer.feedforward import ConformerPositionwiseFeedForwardV1Config +from i6_models.parts.conformer.mhsa import ConformerMHSAV1Config +from i6_models.primitives.specaugment import specaugment_v1_by_length +from i6_models.primitives.feature_extraction import LogMelFeatureExtractionV1, LogMelFeatureExtractionV1Config + +from returnn.torch.context import get_run_ctx + +from .shared.i6modelsV1_VGG4LayerActFrontendV1_v4_cfg import ( + SpecaugConfig, + VGG4LayerActFrontendV1Config_mod, + ModelConfigV2, + FlowDecoderConfig, + TextEncoderConfig, +) + +from .shared.configs import DbMelFeatureExtractionConfig +from .shared.feature_extraction import DbMelFeatureExtraction +from .shared.spec_augment import apply_spec_aug +from .shared.mask import mask_tensor + +from .shared import modules +from .shared import commons +from .shared import attentions +from .monotonic_align import 
maximum_path + +from .shared.forward import prior_init_hook, prior_step, prior_finish_hook + +from .shared.eval_forward import * +from .shared.eval_invertibility import * + +from IPython import embed + +class DurationPredictor(nn.Module): + """ + Duration Predictor module, trained using calculated durations coming from monotonic alignment search + """ + + def __init__(self, in_channels, filter_channels, filter_size, p_dropout): + super().__init__() + + self.in_channels = in_channels + self.filter_channels = filter_channels + self.filter_size = filter_size + self.p_dropout = p_dropout + + self.convs = nn.Sequential( + modules.Conv1DBlock( + in_size=self.in_channels, + out_size=self.filter_channels, + filter_size=self.filter_size, + p_dropout=p_dropout, + ), + modules.Conv1DBlock( + in_size=self.filter_channels, + out_size=self.filter_channels, + filter_size=self.filter_size, + p_dropout=p_dropout, + ), + ) + self.proj = nn.Conv1d(in_channels=self.filter_channels, out_channels=1, kernel_size=1) + + def forward(self, x, x_mask): + x_with_mask = (x, x_mask) + (x, x_mask) = self.convs(x_with_mask) + x = self.proj(x * x_mask) + return x + + +class FlowDecoder(nn.Module): + def __init__(self, cfg: FlowDecoderConfig, in_channels, gin_channels): + """Flow-based decoder model + + Args: + in_channels (int): Number of incoming channels + hidden_channels (int): Number of hidden channels + kernel_size (int): Kernel Size for convolutions in coupling blocks + dilation_rate (float): Dilation Rate to define dilation in convolutions of coupling block + n_blocks (int): Number of coupling blocks + n_layers (int): Number of layers in CNN of the coupling blocks + p_dropout (float, optional): Dropout probability for CNN in coupling blocks. Defaults to 0.. + n_split (int, optional): Number of splits for the 1x1 convolution for flows in the decoder. Defaults to 4. + n_sqz (int, optional): Squeeze. Defaults to 1. 
+ sigmoid_scale (bool, optional): Boolean to define if log probs in coupling layers should be rescaled using sigmoid. Defaults to False. + gin_channels (int, optional): Number of speaker embedding channels. Defaults to 0. + """ + super().__init__() + self.cfg = cfg + + self.flows = nn.ModuleList() + + for _ in range(self.cfg.n_blocks): + self.flows.append(modules.ActNorm(channels=in_channels * self.cfg.n_sqz)) + self.flows.append(modules.InvConvNear(channels=in_channels * self.cfg.n_sqz, n_split=self.cfg.n_split)) + self.flows.append( + attentions.CouplingBlock( + in_channels * self.cfg.n_sqz, + self.cfg.hidden_channels, + kernel_size=self.cfg.kernel_size, + dilation_rate=self.cfg.dilation_rate, + n_layers=self.cfg.n_layers, + gin_channels=gin_channels, + p_dropout=self.cfg.p_dropout, + sigmoid_scale=self.cfg.sigmoid_scale, + ) + ) + + def forward(self, x, x_mask, g=None, reverse=False): + if not reverse: + flows = self.flows + logdet_tot = 0 + else: + flows = reversed(self.flows) + logdet_tot = None + + if self.cfg.n_sqz > 1: + x, x_mask = commons.channel_squeeze(x, x_mask, self.cfg.n_sqz) + for f in flows: + if not reverse: + x, logdet = f(x, x_mask, g=g, reverse=reverse) + logdet_tot += logdet + else: + x, logdet = f(x, x_mask, g=g, reverse=reverse) + if self.cfg.n_sqz > 1: + x, x_mask = commons.channel_unsqueeze(x, x_mask, self.cfg.n_sqz) + return x, logdet_tot + + def store_inverse(self): + for f in self.flows: + f.store_inverse() + +class TextEncoder(nn.Module): + """ + Text Encoder model + """ + + def __init__(self, cfg: TextEncoderConfig, out_channels, gin_channels): + """Text Encoder Model based on Multi-Head Self-Attention combined with FF-CCNs + + Args: + n_vocab (int): Size of vocabulary for embeddings + out_channels (int): Number of output channels + hidden_channels (int): Number of hidden channels + filter_channels (int): Number of filter channels + filter_channels_dp (int): Number of filter channels for duration predictor + n_heads (int): Number of 
heads in encoder's Multi-Head Attention + n_layers (int): Number of layers consisting of Multi-Head Attention and CNNs in encoder + kernel_size (int): Kernel Size for CNNs in encoder layers + p_dropout (float): Dropout probability for both encoder and duration predictor + window_size (int, optional): Window size in Multi-Head Self-Attention for encoder. Defaults to None. + block_length (_type_, optional): Block length for optional block masking in Multi-Head Attention for encoder. Defaults to None. + mean_only (bool, optional): Boolean to only project text encodings to mean values instead of mean and std. Defaults to False. + prenet (bool, optional): Boolean to add ConvReluNorm prenet before encoder . Defaults to False. + gin_channels (int, optional): Number of channels for speaker condition. Defaults to 0. + """ + super().__init__() + self.cfg = cfg + + self.emb = nn.Embedding(self.cfg.n_vocab, self.cfg.hidden_channels) + nn.init.normal_(self.emb.weight, 0.0, self.cfg.hidden_channels**-0.5) + + if self.cfg.prenet: + self.pre = modules.ConvReluNorm( + self.cfg.hidden_channels, + self.cfg.hidden_channels, + self.cfg.hidden_channels, + kernel_size=5, + n_layers=3, + p_dropout=0.5, + ) + self.encoder = attentions.Encoder( + self.cfg.hidden_channels, + self.cfg.filter_channels, + self.cfg.n_heads, + self.cfg.n_layers, + self.cfg.kernel_size, + self.cfg.p_dropout, + window_size=self.cfg.window_size, + block_length=self.cfg.block_length, + ) + + self.proj_m = nn.Conv1d(self.cfg.hidden_channels, out_channels, 1) + if not self.cfg.mean_only: + self.proj_s = nn.Conv1d(self.cfg.hidden_channels, out_channels, 1) + self.proj_w = DurationPredictor( + self.cfg.hidden_channels + gin_channels, + self.cfg.filter_channels_dp, + self.cfg.kernel_size, + self.cfg.p_dropout, + ) + + def forward(self, x, x_lengths, g=None): + x = self.emb(x) * math.sqrt(self.cfg.hidden_channels) # [b, t, h] + x = torch.transpose(x, 1, -1) # [b, h, t] + x_mask = 
torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype) + + if self.cfg.prenet: + x = self.pre(x, x_mask) + x = self.encoder(x, x_mask) + + if g is not None: + g_exp = g.expand(-1, -1, x.size(-1)) + # print(f"Dimension of input in Text Encoder: x.shape: {x.shape}; g: {g.shape}, g_exp: {g_exp.shape}") + x_dp = torch.cat([torch.detach(x), g_exp], 1) + else: + x_dp = torch.detach(x) + + x_m = self.proj_m(x) * x_mask + if not self.cfg.mean_only: + x_logs = self.proj_s(x) * x_mask + else: + x_logs = torch.zeros_like(x_m) + + # print(f"Dimension of input in Text Encoder before DP: {x_dp.shape}") + + logw = self.proj_w(x_dp, x_mask) + return x_m, x_logs, logw, x_mask + +class Model(nn.Module): + """ + Flow-based ASR model based on GlowTTS Structure using a pre-trained flow-based decoder + trained to generate spectrograms from given statistics coming from an encoder + + Model was pretrained using the architecture in + users/rilling/experiments/librispeech/librispeech_glowtts/pytorch_networks/glowTTS.py + """ + + def __init__( + self, + model_config: dict, + **kwargs, + ): + """_summary_ + + Args: + n_vocab (int): vocabulary size + hidden_channels (int): Number of hidden channels in encoder + out_channels (int): Number of channels in the output + n_blocks_dec (int, optional): Number of coupling blocks in the decoder. Defaults to 12. + kernel_size_dec (int, optional): Kernel size in the decoder. Defaults to 5. + dilation_rate (int, optional): Dilation rate for CNNs of coupling blocks in decoder. Defaults to 5. + n_block_layers (int, optional): Number of layers in the CNN of the coupling blocks in decoder. Defaults to 4. + p_dropout_dec (_type_, optional): Dropout probability in the decoder. Defaults to 0.. + n_speakers (int, optional): Number of speakers. Defaults to 0. + gin_channels (int, optional): Number of speaker embedding channels. Defaults to 0. + n_split (int, optional): Number of splits for the 1x1 convolution for flows in the decoder. 
Defaults to 4. + n_sqz (int, optional): Squeeze. Defaults to 1. + sigmoid_scale (bool, optional): Boolean to define if log probs in coupling layers should be rescaled using sigmoid. Defaults to False. + window_size (int, optional): Window size in Multi-Head Self-Attention for encoder. Defaults to None. + block_length (_type_, optional): Block length for optional block masking in Multi-Head Attention for encoder. Defaults to None. + hidden_channels_dec (_type_, optional): Number of hidden channels in decodder. Defaults to hidden_channels. + final_hidden_channels: Number of hidden channels in the final network + final_n_layers: Number of layers in the final network + label_target_size: Target size of target vocabulary, target size for final network + """ + super().__init__() + + self.net_kwargs = { + "repeat_per_num_frames": 100, + "max_dim_feat": 8, + "num_repeat_feat": 5, + "max_dim_time": 20, + } + + fe_config = DbMelFeatureExtractionConfig.from_dict(kwargs["fe_config"]) + self.feature_extraction = DbMelFeatureExtraction(config=fe_config) + + # if label_target_size is None: + # if n_vocab is None: + # run_ctx = get_run_ctx() + # dataset = run_ctx.engine.train_dataset or run_ctx.engine.forward_dataset + # self.label_target_size = len(dataset.datasets["zip_dataset"].targets.labels) + # else: + # self.label_target_size = n_vocab + # else: + # self.label_target_size = label_target_size + + self.cfg = ModelConfigV2.from_dict(model_config) + text_encoder_config = self.cfg.text_encoder_config + decoder_config = self.cfg.decoder_config + + if self.cfg.n_speakers > 1: + self.emb_g = nn.Embedding(self.cfg.n_speakers, self.cfg.gin_channels) + nn.init.uniform_(self.emb_g.weight, -0.1, 0.1) + + self.encoder = TextEncoder( + text_encoder_config, out_channels=self.cfg.out_channels, gin_channels=self.cfg.gin_channels + ) + + self.decoder = FlowDecoder( + decoder_config, in_channels=self.cfg.out_channels, gin_channels=self.cfg.gin_channels + ) + + blstm_config = 
BlstmEncoderV1Config(num_layers=self.cfg.blstm_layers, input_dim=self.cfg.out_channels*self.cfg.subsampling_factor, hidden_dim=self.cfg.blstm_hidden_dim, dropout=self.cfg.blstm_dropout, enforce_sorted=False) + + self.final = BlstmEncoderV1(blstm_config) + self.final_linear = nn.Linear(2*self.cfg.blstm_hidden_dim, self.cfg.label_target_size + 1) # + CTC blank + + self.specaug_start_epoch = self.cfg.specauc_start_epoch + + def forward( + self, x=None, x_lengths=None, raw_audio=None, raw_audio_lengths=None, g=None, gen=False, recognition=False, noise_scale=1.0, length_scale=1.0 + ): + if not gen: + with torch.no_grad(): + squeezed_audio = torch.squeeze(raw_audio) + y, y_lengths = self.feature_extraction(squeezed_audio, raw_audio_lengths) # [B, T, F] + y = y.transpose(1, 2) # [B, F, T] + else: + y, y_lengths = (None, None) + + if not recognition: + if g is not None: + g = nn.functional.normalize(self.emb_g(g.squeeze(-1))).unsqueeze(-1) + x_m, x_logs, logw, x_mask = self.encoder(x, x_lengths, g=g) # mean, std logs, duration logs, mask + else: + assert g is None, "Using model for recognition with given speaker identity." 
+ + if gen: # durations from dp only used during generation + w = torch.exp(logw) * x_mask * length_scale # durations + w_ceil = torch.ceil(w) # durations ceiled + y_lengths = torch.clamp_min(torch.sum(w_ceil, [1, 2]), 1).long() + y_max_length = None + else: + y_max_length = y.size(2) + + y, y_lengths, y_max_length = self.preprocess(y, y_lengths, y_max_length) + z_mask = torch.unsqueeze(commons.sequence_mask(y_lengths, y_max_length), 1).to(torch.int32) + + if not recognition: + attn_mask = torch.unsqueeze(x_mask, -1) * torch.unsqueeze(z_mask, 2) + + if gen: + attn = commons.generate_path(w_ceil.squeeze(1), attn_mask.squeeze(1)).unsqueeze(1) + z_m = torch.matmul(attn.squeeze(1).transpose(1, 2), x_m.transpose(1, 2)).transpose(1, 2) + z_logs = torch.matmul(attn.squeeze(1).transpose(1, 2), x_logs.transpose(1, 2)).transpose(1, 2) + logw_ = torch.log(1e-8 + torch.sum(attn, -1)) * x_mask + + z = (z_m + torch.exp(z_logs) * torch.randn_like(z_m) * noise_scale) * z_mask + y, logdet = self.decoder(z, z_mask, g=g, reverse=True) + + return (y, z_m, z_logs, logdet, z_mask, y_lengths), (x_m, x_logs, x_mask), (attn, logw, logw_) + else: + if recognition: + mask = mask_tensor(y.transpose(1, 2), y_lengths) + + if self.training and self.cfg.specaug_config: + spec_augment_in = y.transpose(1, 2) # [B, T, F] + + if self.training and self.cfg.specaug_config is not None: + audio_features_masked_2 = apply_spec_aug( + spec_augment_in, + num_repeat_time=torch.max(y_lengths).detach().cpu().numpy() + // self.cfg.specaug_config.repeat_per_n_frames, + max_dim_time=self.cfg.specaug_config.max_dim_time, + num_repeat_feat=self.cfg.specaug_config.num_repeat_feat, + max_dim_feat=self.cfg.specaug_config.max_dim_feat, + ) + else: + audio_features_masked_2 = y.transpose(1, 2) + + z, logdet = self.decoder(audio_features_masked_2.transpose(1, 2), z_mask, g=g, reverse=False) # [B, F, T] + + blstm_in, mask = commons.channel_squeeze(audio_features_masked_2.transpose(1,2), mask, self.cfg.subsampling_factor) # 
frame stacking for subsampling is equivalent to the channel squeezing operation in glowTTS + blstm_in_length = y_lengths // 4 + + blstm_out = self.final(blstm_in.transpose(1,2), blstm_in_length) # [B, T, F] + logits = self.final_linear(blstm_out) + log_probs = torch.log_softmax(logits, dim=2) + + return log_probs, blstm_in_length + else: + z, logdet = self.decoder(y, z_mask, g=g, reverse=False) # [B, F, T] + + with torch.no_grad(): + x_s_sq_r = torch.exp(-2 * x_logs) + logp1 = torch.sum(-0.5 * math.log(2 * math.pi) - x_logs, [1]).unsqueeze(-1) # [b, t, 1] + logp2 = torch.matmul(x_s_sq_r.transpose(1, 2), -0.5 * (z**2)) # [b, t, d] x [b, d, t'] = [b, t, t'] + logp3 = torch.matmul((x_m * x_s_sq_r).transpose(1, 2), z) # [b, t, d] x [b, d, t'] = [b, t, t'] + logp4 = torch.sum(-0.5 * (x_m**2) * x_s_sq_r, [1]).unsqueeze(-1) # [b, t, 1] + logp = logp1 + logp2 + logp3 + logp4 # [b, t, t'] + + attn = maximum_path(logp, attn_mask.squeeze(1)).unsqueeze(1).detach() + # embed() + + z_m = torch.matmul(attn.squeeze(1).transpose(1, 2), x_m.transpose(1, 2)).transpose( + 1, 2 + ) # [b, t', t], [b, t, d] -> [b, d, t'] + z_logs = torch.matmul(attn.squeeze(1).transpose(1, 2), x_logs.transpose(1, 2)).transpose( + 1, 2 + ) # [b, t', t], [b, t, d] -> [b, d, t'] + + logw_ = torch.log(1e-8 + torch.sum(attn, -1)) * x_mask + return ( + (z, z_m, z_logs, logdet, z_mask), + (x_m, x_logs, x_mask), + y_lengths, + (attn, logw, logw_) + ) + + def preprocess(self, y, y_lengths, y_max_length): + if y_max_length is not None: + y_max_length = (y_max_length // self.cfg.decoder_config.n_sqz) * self.cfg.decoder_config.n_sqz + y = y[:, :, :y_max_length] + y_lengths = (y_lengths // self.cfg.decoder_config.n_sqz) * self.cfg.decoder_config.n_sqz + return y, y_lengths, y_max_length + + def store_inverse(self): + self.decoder.store_inverse() + + +def train_step(*, model: Model, data, run_ctx, **kwargs): + tags = data["seq_tag"] + audio_features = data["audio_features"] # [B, T, F] + # audio_features = 
audio_features.transpose(1, 2) # [B, F, T] necessary because glowTTS expects the channels to be in the 2nd dimension + audio_features_len = data["audio_features:size1"] # [B] + + # perform local length sorting for more efficient packing + audio_features_len, indices = torch.sort(audio_features_len, descending=True) + + audio_features = audio_features[indices, :, :] + phonemes = data["phonemes"][indices, :] # [B, T] (sparse) + phonemes_len = data["phonemes:size1"][indices] # [B, T] + phonemes_eow = data["phonemes_eow"][indices, :] # [B, T] + phonemes_eow_len = data["phonemes_eow:size1"][indices] + if "speaker_labels" in data: + speaker_labels = data["speaker_labels"][indices, :] # [B, 1] (sparse) + else: + speaker_labels = None + tags = list(np.array(tags)[indices.detach().cpu().numpy()]) + + if speaker_labels is not None: + ( + (z, z_m, z_logs, logdet, z_mask), + (x_m, x_logs, x_mask), + y_lengths, + (attn, logw, logw_) + ) = model(phonemes, phonemes_len, audio_features, audio_features_len, g=speaker_labels) + l_mle = commons.mle_loss(z, z_m, z_logs, logdet, z_mask) + l_dp = commons.duration_loss(logw, logw_, phonemes_len) + run_ctx.mark_as_loss(name="mle", loss=l_mle) + run_ctx.mark_as_loss(name="dp", loss=l_dp) + + logprobs, ctc_input_length = model(raw_audio=audio_features, raw_audio_lengths=audio_features_len, recognition=True) + + transposed_logprobs = torch.permute(logprobs, (1, 0, 2)) + l_ctc = nn.functional.ctc_loss( + transposed_logprobs, + phonemes_eow, + input_lengths=ctc_input_length, + target_lengths=phonemes_eow_len, + blank=model.cfg.label_target_size, + reduction="sum", + zero_infinity=True, + ) + + num_phonemes = torch.sum(phonemes_eow_len) + if "ctc_scale" in kwargs: + ctc_scale = kwargs["ctc_scale"] + else: + ctc_scale = 1 + run_ctx.mark_as_loss(name="ctc", loss=l_ctc, inv_norm_factor=num_phonemes, scale=ctc_scale) + + +def forward_init_hook(run_ctx, **kwargs): + import json + import utils + from utils import AttrDict + from inference import 
load_checkpoint + from generator import UnivNet as Generator + import numpy as np + + with open("/u/lukas.rilling/experiments/glow_tts_asr_v2/config_univ.json") as f: + data = f.read() + + json_config = json.loads(data) + h = AttrDict(json_config) + + generator = Generator(h).to(run_ctx.device) + + state_dict_g = load_checkpoint( + "/work/asr3/rossenbach/rilling/vocoder/univnet/glow_finetuning/g_01080000", run_ctx.device + ) + generator.load_state_dict(state_dict_g["generator"]) + + run_ctx.generator = generator + run_ctx.speaker_x_vectors = torch.load( + "/work/asr3/rossenbach/rilling/sisyphus_work_dirs/glow_tts_asr_v2/i6_core/returnn/forward/ReturnnForwardJob.U6UwGhE7ENbp/output/output_pooled.hdf" + ) + + +def forward_finish_hook(run_ctx, **kwargs): + pass + + +MAX_WAV_VALUE = 32768.0 + + +def forward_step(*, model: Model, data, run_ctx, **kwargs): + phonemes = data["phonemes"] # [B, N] (sparse) + phonemes_len = data["phonemes:size1"] # [B] + speaker_labels = data["speaker_labels"] # [B, 1] (sparse) + audio_features = data["audio_features"] + + tags = data["seq_tag"] + + (log_mels, z_m, z_logs, logdet, z_mask, y_lengths), (x_m, x_logs, x_mask), (attn, logw, logw_) = model( + phonemes, + phonemes_len, + g=speaker_labels, + gen=True, + noise_scale=kwargs["noise_scale"], + length_scale=kwargs["length_scale"], + ) + + noise = torch.randn([1, 64, log_mels.shape[-1]]).to(device=log_mels.device) + audios = run_ctx.generator.forward(noise, log_mels) + audios = audios * MAX_WAV_VALUE + audios = audios.cpu().numpy().astype("int16") + + if not os.path.exists("/var/tmp/lukas.rilling/"): + os.makedirs("/var/tmp/lukas.rilling/") + if not os.path.exists("/var/tmp/lukas.rilling/out"): + os.makedirs("/var/tmp/lukas.rilling/out/", exist_ok=True) + for audio, tag in zip(audios, tags): + soundfile.write(f"/var/tmp/lukas.rilling/out/" + tag.replace("/", "_") + ".wav", audio[0], 16000) + +def search_init_hook(run_ctx, **kwargs): + # we are storing durations, but call it output.hdf to 
match + # the default output of the ReturnnForwardJob + from torchaudio.models.decoder import ctc_decoder + with open("output.hdf", "w+"): + print("Created unnecessary output.hdf...") + + run_ctx.recognition_file = open("search_out.py", "wt") + run_ctx.recognition_file.write("{\n") + import subprocess + if kwargs["arpa_lm"] is not None: + lm = subprocess.check_output(["cf", kwargs["arpa_lm"]]).decode().strip() + else: + lm = None + from returnn.datasets.util.vocabulary import Vocabulary + vocab = Vocabulary.create_vocab( + vocab_file=kwargs["returnn_vocab"], unknown_label=None) + labels = vocab.labels + + run_ctx.ctc_decoder = ctc_decoder( + lexicon=kwargs["lexicon"], + lm=lm, + lm_weight=kwargs["lm_weight"], + tokens=labels + ["[blank]", "[SILENCE]", "[UNK]"], + # "[SILENCE]" and "[UNK]" are not actually part of the vocab, + # but the decoder is happy as long they are defined in the token list + # even if they do not exist as label index in the softmax output, + blank_token="[blank]", + sil_token="[SILENCE]", + unk_word="[unknown]", + nbest=1, + beam_size=kwargs["beam_size"], + beam_size_token=kwargs.get("beam_size_token", None), + beam_threshold=kwargs["beam_threshold"], + sil_score=kwargs.get("sil_score", 0.0), + word_score=kwargs.get("word_score", 0.0), + ) + run_ctx.labels = labels + run_ctx.blank_log_penalty = kwargs.get("blank_log_penalty", None) + + if kwargs.get("prior_file", None): + run_ctx.prior = np.loadtxt(kwargs["prior_file"], dtype="float32") + run_ctx.prior_scale = kwargs["prior_scale"] + else: + run_ctx.prior = None + +def search_finish_hook(run_ctx, **kwargs): + run_ctx.recognition_file.write("}\n") + run_ctx.recognition_file.close() + +def search_step(*, model, data, run_ctx, **kwargs): + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"] # [B] + + logprobs, audio_features_len = model( + raw_audio=raw_audio, + raw_audio_lengths=raw_audio_len, + recognition=True + ) + + tags = data["seq_tag"] + + logprobs_cpu = 
logprobs.cpu() + if run_ctx.blank_log_penalty is not None: + # assumes blank is last + logprobs_cpu[:, :, -1] -= run_ctx.blank_log_penalty + if run_ctx.prior is not None: + logprobs_cpu -= run_ctx.prior_scale * run_ctx.prior + hypothesis = run_ctx.ctc_decoder(logprobs_cpu, audio_features_len.cpu()) + + for hyp, tag in zip(hypothesis, tags): + words = hyp[0].words + sequence = " ".join([word for word in words if not word.startswith("[")]) + print(sequence) + run_ctx.recognition_file.write("%s: %s,\n" % (repr(tag), repr(sequence))) diff --git a/users/rilling/experiments/librispeech/librispeech_joint_training/pytorch_networks/glowTTS_ASR_blstm_x_vector.py b/users/rilling/experiments/librispeech/librispeech_joint_training/pytorch_networks/glowTTS_ASR_blstm_x_vector.py new file mode 100644 index 000000000..f128c1c75 --- /dev/null +++ b/users/rilling/experiments/librispeech/librispeech_joint_training/pytorch_networks/glowTTS_ASR_blstm_x_vector.py @@ -0,0 +1,679 @@ +""" +Trying to make the aligner more AppTek-Like + +Extended weight init code +""" + +from dataclasses import dataclass +import torch +import numpy as np +from torch import nn +import multiprocessing +from librosa import filters +import sys +import time +from typing import Any, Dict, Optional, Tuple, Union +import math +import os +import soundfile + +from torchaudio.functional import mask_along_axis + +from i6_models.parts.blstm import BlstmEncoderV1, BlstmEncoderV1Config + + +from i6_models.parts.conformer.norm import LayerNormNC +from i6_models.assemblies.conformer.conformer_v1 import ConformerEncoderV1Config +from i6_models.assemblies.conformer.conformer_v1 import ConformerBlockV1Config, ConformerEncoderV1 +from i6_models.config import ModuleFactoryV1 +from i6_models.parts.frontend.vgg_act import VGG4LayerActFrontendV1, VGG4LayerActFrontendV1Config + +from i6_models.parts.conformer.convolution import ConformerConvolutionV1Config +from i6_models.parts.conformer.feedforward import 
ConformerPositionwiseFeedForwardV1Config +from i6_models.parts.conformer.mhsa import ConformerMHSAV1Config +from i6_models.primitives.specaugment import specaugment_v1_by_length +from i6_models.primitives.feature_extraction import LogMelFeatureExtractionV1, LogMelFeatureExtractionV1Config + +from returnn.torch.context import get_run_ctx + +from .shared.i6modelsV1_VGG4LayerActFrontendV1_v4_cfg import ( + SpecaugConfig, + VGG4LayerActFrontendV1Config_mod, + ModelConfigV2, + FlowDecoderConfig, + TextEncoderConfig, +) + +from .shared.configs import DbMelFeatureExtractionConfig +from .shared.feature_extraction import DbMelFeatureExtraction +from .shared.spec_augment import apply_spec_aug +from .shared.mask import mask_tensor + +from .shared import modules +from .shared import commons +from .shared import attentions +from .monotonic_align import maximum_path + +from .shared.forward import search_init_hook, search_finish_hook +from .shared.eval_forward import * + +from IPython import embed + + +class XVector(nn.Module): + def __init__(self, input_dim=40, num_classes=8, **kwargs): + super(XVector, self).__init__() + self.tdnn1 = modules.TDNN( + input_dim=input_dim, output_dim=512, context_size=5, dilation=1, dropout_p=0.5, batch_norm=True + ) + self.tdnn2 = modules.TDNN( + input_dim=512, output_dim=512, context_size=3, dilation=2, dropout_p=0.5, batch_norm=True + ) + self.tdnn3 = modules.TDNN( + input_dim=512, output_dim=512, context_size=2, dilation=3, dropout_p=0.5, batch_norm=True + ) + self.tdnn4 = modules.TDNN( + input_dim=512, output_dim=512, context_size=1, dilation=1, dropout_p=0.5, batch_norm=True + ) + self.tdnn5 = modules.TDNN( + input_dim=512, output_dim=512, context_size=1, dilation=1, dropout_p=0.5, batch_norm=True + ) + #### Frame levelPooling + self.segment6 = nn.Linear(1024, 512) + self.segment7 = nn.Linear(512, 512) + self.output = nn.Linear(512, num_classes) + self.softmax = nn.Softmax(dim=1) + + # fe_config = 
class DurationPredictor(nn.Module):
    """
    Duration predictor: two stacked ``modules.Conv1DBlock`` layers followed by a
    1x1 convolution down to a single channel.

    Trained using calculated durations coming from monotonic alignment search.
    """

    def __init__(self, in_channels, filter_channels, filter_size, p_dropout):
        """
        :param in_channels: channel count of the features fed into the first block
        :param filter_channels: output channel count of both Conv1D blocks
        :param filter_size: ``filter_size`` handed to both Conv1D blocks
        :param p_dropout: dropout probability handed to both Conv1D blocks
        """
        super().__init__()

        self.in_channels = in_channels
        self.filter_channels = filter_channels
        self.filter_size = filter_size
        self.p_dropout = p_dropout

        # (input size, output size) of the two stacked blocks, in order
        block_sizes = (
            (self.in_channels, self.filter_channels),
            (self.filter_channels, self.filter_channels),
        )
        self.convs = nn.Sequential(
            *(
                modules.Conv1DBlock(
                    in_size=n_in,
                    out_size=n_out,
                    filter_size=self.filter_size,
                    p_dropout=p_dropout,
                )
                for n_in, n_out in block_sizes
            )
        )
        self.proj = nn.Conv1d(in_channels=self.filter_channels, out_channels=1, kernel_size=1)

    def forward(self, x, x_mask):
        """
        Run both conv blocks on ``(x, x_mask)`` and project the masked result.

        :param x: input features
        :param x_mask: mask that is passed through the blocks and applied before the projection
        :return: output of the final 1x1 projection
        """
        hidden, mask = self.convs((x, x_mask))
        return self.proj(hidden * mask)


class FlowDecoder(nn.Module):
    """Stack of ``cfg.n_blocks`` x (ActNorm, InvConvNear, CouplingBlock) flow steps."""

    def __init__(self, cfg: FlowDecoderConfig, in_channels, gin_channels):
        """Flow-based decoder model

        Args:
            cfg (FlowDecoderConfig): decoder settings; the fields read here are
                ``n_blocks``, ``n_sqz``, ``n_split``, ``hidden_channels``, ``kernel_size``,
                ``dilation_rate``, ``n_layers``, ``p_dropout`` and ``sigmoid_scale``
            in_channels (int): Number of incoming channels (before squeezing by ``cfg.n_sqz``)
            gin_channels (int): Number of speaker embedding channels handed to the coupling blocks
        """
        super().__init__()
        self.cfg = cfg

        self.flows = nn.ModuleList()

        for _ in range(self.cfg.n_blocks):
            act_norm = modules.ActNorm(channels=in_channels * self.cfg.n_sqz)
            inv_conv = modules.InvConvNear(channels=in_channels * self.cfg.n_sqz, n_split=self.cfg.n_split)
            coupling = attentions.CouplingBlock(
                in_channels * self.cfg.n_sqz,
                self.cfg.hidden_channels,
                kernel_size=self.cfg.kernel_size,
                dilation_rate=self.cfg.dilation_rate,
                n_layers=self.cfg.n_layers,
                gin_channels=gin_channels,
                p_dropout=self.cfg.p_dropout,
                sigmoid_scale=self.cfg.sigmoid_scale,
            )
            self.flows.append(act_norm)
            self.flows.append(inv_conv)
            self.flows.append(coupling)

    def forward(self, x, x_mask, g=None, reverse=False):
        """
        Apply all flows in order, or in reversed order when ``reverse`` is set.

        :param x: input passed through the flows
        :param x_mask: mask handed to every flow
        :param g: optional conditioning tensor; a trailing axis is appended before use
        :param reverse: run the flows in reversed order
        :return: ``(x, logdet_tot)``; ``logdet_tot`` is the summed log-determinant in the
            non-reversed direction and ``None`` in the reversed direction
        """
        if reverse:
            flow_sequence = reversed(self.flows)
            logdet_tot = None
        else:
            flow_sequence = self.flows
            logdet_tot = 0

        if g is not None:
            g = g.unsqueeze(-1)

        if self.cfg.n_sqz > 1:
            x, x_mask = commons.channel_squeeze(x, x_mask, self.cfg.n_sqz)

        for flow in flow_sequence:
            x, logdet = flow(x, x_mask, g=g, reverse=reverse)
            if not reverse:
                # log-determinants are only accumulated in the non-reversed direction
                logdet_tot += logdet

        if self.cfg.n_sqz > 1:
            x, x_mask = commons.channel_unsqueeze(x, x_mask, self.cfg.n_sqz)
        return x, logdet_tot

    def store_inverse(self):
        """Call ``store_inverse`` on every flow."""
        for flow in self.flows:
            flow.store_inverse()
class Model(nn.Module):
    """
    Flow-based ASR model based on GlowTTS Structure using a pre-trained flow-based decoder
    trained to generate spectrograms from given statistics coming from an encoder

    Model was pretrained using the architecture in
    users/rilling/experiments/librispeech/librispeech_glowtts/pytorch_networks/glowTTS.py
    """

    def __init__(
        self,
        model_config: dict,
        **kwargs,
    ):
        """
        Build feature extraction, x-vector branch, text encoder, flow decoder and BLSTM/CTC head.

        Args:
            model_config (dict): parsed with ``ModelConfigV2.from_dict``. Fields read by this class:
                ``text_encoder_config``, ``decoder_config``, ``n_speakers``, ``out_channels``,
                ``gin_channels``, ``blstm_layers``, ``blstm_hidden_dim``, ``blstm_dropout``,
                ``subsampling_factor``, ``label_target_size``, ``specaug_config`` and
                ``specauc_start_epoch``.
            **kwargs: must contain ``fe_config``, a dict parsed with
                ``DbMelFeatureExtractionConfig.from_dict``.
        """
        super().__init__()

        self.net_kwargs = {
            "repeat_per_num_frames": 100,
            "max_dim_feat": 8,
            "num_repeat_feat": 5,
            "max_dim_time": 20,
        }

        fe_config = DbMelFeatureExtractionConfig.from_dict(kwargs["fe_config"])
        self.feature_extraction = DbMelFeatureExtraction(config=fe_config)

        self.cfg = ModelConfigV2.from_dict(model_config)

        text_encoder_config = self.cfg.text_encoder_config
        decoder_config = self.cfg.decoder_config

        # NOTE: forward() uses both x-vector modules unconditionally, so it only works for
        # configurations with n_speakers > 1.
        if self.cfg.n_speakers > 1:
            self.x_vector = XVector(self.cfg.out_channels, self.cfg.n_speakers)
            self.x_vector_bottleneck = nn.Sequential(
                nn.Linear(512, self.cfg.gin_channels),
                nn.ReLU(),
            )

        self.encoder = TextEncoder(
            text_encoder_config, out_channels=self.cfg.out_channels, gin_channels=self.cfg.gin_channels
        )

        self.decoder = FlowDecoder(
            decoder_config, in_channels=self.cfg.out_channels, gin_channels=self.cfg.gin_channels
        )

        # The BLSTM consumes `subsampling_factor` stacked frames per step (see forward()).
        blstm_config = BlstmEncoderV1Config(
            num_layers=self.cfg.blstm_layers,
            input_dim=self.cfg.out_channels * self.cfg.subsampling_factor,
            hidden_dim=self.cfg.blstm_hidden_dim,
            dropout=self.cfg.blstm_dropout,
            enforce_sorted=False,
        )

        self.final = BlstmEncoderV1(blstm_config)
        self.final_linear = nn.Linear(2 * self.cfg.blstm_hidden_dim, self.cfg.label_target_size + 1)  # + CTC blank

        self.specaug_start_epoch = self.cfg.specauc_start_epoch

    def forward(
        self,
        x=None,
        x_lengths=None,
        raw_audio=None,
        raw_audio_lengths=None,
        g=None,
        gen=False,
        recognition=False,
        noise_scale=1.0,
        length_scale=1.0,
    ):
        """
        Run the model in one of three modes.

        * ``gen=True``: generate features from text ``x`` with the given conditioning ``g``.
          Returns ``(y, z_m, z_logs, logdet, z_mask, y_lengths), (x_m, x_logs, x_mask),
          (attn, logw, logw_)``.
        * ``recognition=True`` (and ``gen=False``): audio only.
          Returns ``(log_probs, blstm_in_length)``.
        * otherwise (training): needs text and audio. Returns
          ``(z, z_m, z_logs, logdet, z_mask), (x_m, x_logs, x_mask), y_lengths,
          (attn, logw, logw_), (log_probs, blstm_in_length)``.

        :param x: token ids handed to the text encoder (unused for recognition)
        :param x_lengths: lengths belonging to ``x``
        :param raw_audio: audio handed to the feature extraction (unused when ``gen``)
        :param raw_audio_lengths: lengths belonging to ``raw_audio``
        :param g: conditioning vector; overwritten by the x-vector output unless ``gen``
        :param gen: generation mode
        :param recognition: skip the text encoder and alignment, return CTC log-probs only
        :param noise_scale: scale of the sampling noise in generation mode
        :param length_scale: scale applied to the predicted durations in generation mode
        """
        if not gen:
            # NOTE(review): the x-vector call is kept under no_grad together with the feature
            # extraction because the module is switched to eval() right before - confirm
            # against the upstream file that x-vector is meant to stay frozen.
            with torch.no_grad():
                # Only drop the trailing singleton axis: a bare torch.squeeze() would also
                # remove the batch axis for a batch of size 1.
                squeezed_audio = torch.squeeze(raw_audio, dim=-1)
                y, y_lengths = self.feature_extraction(squeezed_audio, raw_audio_lengths)  # [B, T, F]
                y = y.transpose(1, 2)  # [B, F, T]
                self.x_vector.eval()
                _, _, g = self.x_vector(y, y_lengths)
        else:
            y, y_lengths = (None, None)

        g = self.x_vector_bottleneck(g)
        if not recognition:
            x_m, x_logs, logw, x_mask = self.encoder(x, x_lengths, g=g)  # mean, std logs, duration logs, mask

        if gen:  # durations from dp only used during generation
            w = torch.exp(logw) * x_mask * length_scale  # durations
            w_ceil = torch.ceil(w)  # durations ceiled
            y_lengths = torch.clamp_min(torch.sum(w_ceil, [1, 2]), 1).long()
            y_max_length = None
        else:
            y_max_length = y.size(2)

        y, y_lengths, y_max_length = self.preprocess(y, y_lengths, y_max_length)
        z_mask = torch.unsqueeze(commons.sequence_mask(y_lengths, y_max_length), 1).to(torch.int32)

        if not recognition:
            attn_mask = torch.unsqueeze(x_mask, -1) * torch.unsqueeze(z_mask, 2)

        if gen:
            attn = commons.generate_path(w_ceil.squeeze(1), attn_mask.squeeze(1)).unsqueeze(1)
            z_m = torch.matmul(attn.squeeze(1).transpose(1, 2), x_m.transpose(1, 2)).transpose(1, 2)
            z_logs = torch.matmul(attn.squeeze(1).transpose(1, 2), x_logs.transpose(1, 2)).transpose(1, 2)
            logw_ = torch.log(1e-8 + torch.sum(attn, -1)) * x_mask

            z = (z_m + torch.exp(z_logs) * torch.randn_like(z_m) * noise_scale) * z_mask
            y, logdet = self.decoder(z, z_mask, g=g, reverse=True)

            return (y, z_m, z_logs, logdet, z_mask, y_lengths), (x_m, x_logs, x_mask), (attn, logw, logw_)

        z, logdet = self.decoder(y, z_mask, g=g, reverse=False)

        spec_augment_in = z.transpose(1, 2)  # [B, T, F]
        mask = torch.unsqueeze(commons.sequence_mask(y_lengths, y.size(2)), 1).to(y.dtype)

        if self.training and self.cfg.specaug_config is not None:
            audio_features_masked_2 = apply_spec_aug(
                spec_augment_in,
                num_repeat_time=torch.max(y_lengths).detach().cpu().numpy()
                // self.cfg.specaug_config.repeat_per_n_frames,
                max_dim_time=self.cfg.specaug_config.max_dim_time,
                num_repeat_feat=self.cfg.specaug_config.num_repeat_feat,
                max_dim_feat=self.cfg.specaug_config.max_dim_feat,
            )
        else:
            audio_features_masked_2 = spec_augment_in

        # frame stacking for subsampling is equivalent to the channel squeezing operation in glowTTS
        blstm_in, mask = commons.channel_squeeze(
            audio_features_masked_2.transpose(1, 2), mask, self.cfg.subsampling_factor
        )
        # Lengths must shrink by the same factor that was used for the frame stacking above
        # (this was a hard-coded 4 and only matched for subsampling_factor == 4).
        blstm_in_length = y_lengths // self.cfg.subsampling_factor

        blstm_out = self.final(blstm_in.transpose(1, 2), blstm_in_length)  # [B, T, F]
        logits = self.final_linear(blstm_out)
        log_probs = torch.log_softmax(logits, dim=2)

        if recognition:
            return log_probs, blstm_in_length

        with torch.no_grad():
            x_s_sq_r = torch.exp(-2 * x_logs)
            logp1 = torch.sum(-0.5 * math.log(2 * math.pi) - x_logs, [1]).unsqueeze(-1)  # [b, t, 1]
            logp2 = torch.matmul(x_s_sq_r.transpose(1, 2), -0.5 * (z**2))  # [b, t, d] x [b, d, t'] = [b, t, t']
            logp3 = torch.matmul((x_m * x_s_sq_r).transpose(1, 2), z)  # [b, t, d] x [b, d, t'] = [b, t, t']
            logp4 = torch.sum(-0.5 * (x_m**2) * x_s_sq_r, [1]).unsqueeze(-1)  # [b, t, 1]
            logp = logp1 + logp2 + logp3 + logp4  # [b, t, t']

            attn = maximum_path(logp, attn_mask.squeeze(1)).unsqueeze(1).detach()

        z_m = torch.matmul(attn.squeeze(1).transpose(1, 2), x_m.transpose(1, 2)).transpose(
            1, 2
        )  # [b, t', t], [b, t, d] -> [b, d, t']
        z_logs = torch.matmul(attn.squeeze(1).transpose(1, 2), x_logs.transpose(1, 2)).transpose(
            1, 2
        )  # [b, t', t], [b, t, d] -> [b, d, t']

        logw_ = torch.log(1e-8 + torch.sum(attn, -1)) * x_mask
        return (
            (z, z_m, z_logs, logdet, z_mask),
            (x_m, x_logs, x_mask),
            y_lengths,
            (attn, logw, logw_),
            (log_probs, blstm_in_length),
        )

    def preprocess(self, y, y_lengths, y_max_length):
        """
        Round lengths down to a multiple of the decoder squeeze factor ``n_sqz``.

        :param y: features, cropped along axis 2 when ``y_max_length`` is given
        :param y_lengths: per-sequence lengths, always rounded down
        :param y_max_length: maximum length or ``None`` (then ``y`` is left untouched)
        :return: ``(y, y_lengths, y_max_length)`` after rounding
        """
        n_sqz = self.cfg.decoder_config.n_sqz
        if y_max_length is not None:
            y_max_length = (y_max_length // n_sqz) * n_sqz
            y = y[:, :, :y_max_length]
        y_lengths = (y_lengths // n_sqz) * n_sqz
        return y, y_lengths, y_max_length

    def store_inverse(self):
        """Forward ``store_inverse`` to the flow decoder."""
        self.decoder.store_inverse()
def forward_init_hook(run_ctx, **kwargs):
    """
    Load the UnivNet vocoder and the pooled speaker x-vectors onto ``run_ctx``.

    Sets ``run_ctx.generator`` and ``run_ctx.speaker_x_vectors``. All paths are hard-coded.
    """
    import json
    from utils import AttrDict
    from inference import load_checkpoint
    from generator import UnivNet as Generator

    with open("/u/lukas.rilling/experiments/glow_tts_asr_v2/config_univ.json") as f:
        json_config = json.load(f)

    h = AttrDict(json_config)

    generator = Generator(h).to(run_ctx.device)

    state_dict_g = load_checkpoint(
        "/work/asr3/rossenbach/rilling/vocoder/univnet/glow_finetuning/g_01080000", run_ctx.device
    )
    generator.load_state_dict(state_dict_g["generator"])

    run_ctx.generator = generator
    run_ctx.speaker_x_vectors = torch.load(
        "/work/asr3/rossenbach/rilling/sisyphus_work_dirs/glow_tts_asr_v2/i6_core/returnn/forward/ReturnnForwardJob.U6UwGhE7ENbp/output/output_pooled.hdf"
    )


def forward_finish_hook(run_ctx, **kwargs):
    """Nothing to clean up after synthesis."""
    pass


MAX_WAV_VALUE = 32768.0


def forward_step(*, model: "Model", data, run_ctx, **kwargs):
    """
    Synthesize audio for one batch and write one 16 kHz wav file per sequence tag.

    :param model: model called in generation mode (``gen=True``)
    :param data: batch dict; reads ``phonemes``, ``phonemes:size1``, ``speaker_labels`` and ``seq_tag``
    :param run_ctx: needs ``speaker_x_vectors``, ``generator`` and ``device`` (see ``forward_init_hook``)
    :param kwargs: must contain ``noise_scale`` and ``length_scale``
    """
    phonemes = data["phonemes"]  # [B, N] (sparse)
    phonemes_len = data["phonemes:size1"]  # [B]
    speaker_labels = data["speaker_labels"]  # [B, 1] (sparse)

    tags = data["seq_tag"]

    speaker_x_vector = run_ctx.speaker_x_vectors[speaker_labels.detach().cpu().numpy(), :].squeeze(1)

    (log_mels, z_m, z_logs, logdet, z_mask, y_lengths), (x_m, x_logs, x_mask), (attn, logw, logw_) = model(
        phonemes,
        phonemes_len,
        g=speaker_x_vector.to(run_ctx.device),
        gen=True,
        noise_scale=kwargs["noise_scale"],
        length_scale=kwargs["length_scale"],
    )

    noise = torch.randn([1, 64, log_mels.shape[-1]]).to(device=log_mels.device)
    audios = run_ctx.generator.forward(noise, log_mels)
    audios = audios * MAX_WAV_VALUE
    audios = audios.cpu().numpy().astype("int16")

    # A single call creates the parent directory as well and does not race with other jobs.
    os.makedirs("/var/tmp/lukas.rilling/out/", exist_ok=True)
    for audio, tag in zip(audios, tags):
        soundfile.write("/var/tmp/lukas.rilling/out/" + tag.replace("/", "_") + ".wav", audio[0], 16000)


def search_init_hook(run_ctx, **kwargs):
    """
    Open the recognition output file and build the torchaudio lexicon CTC decoder.

    :param run_ctx: receives ``recognition_file``, ``ctc_decoder``, ``labels``,
        ``blank_log_penalty``, ``prior`` and, if a prior file is given, ``prior_scale``
    :param kwargs: required: ``arpa_lm`` (may be None), ``returnn_vocab``, ``lexicon``,
        ``lm_weight``, ``beam_size``, ``beam_threshold``; optional: ``beam_size_token``,
        ``sil_score``, ``word_score``, ``blank_log_penalty``, ``prior_file`` (then
        ``prior_scale`` is required as well)
    """
    from torchaudio.models.decoder import ctc_decoder

    # Results are written as a Python dict literal; closed in search_finish_hook.
    run_ctx.recognition_file = open("search_out.py", "wt")
    run_ctx.recognition_file.write("{\n")
    import subprocess

    if kwargs["arpa_lm"] is not None:
        # the stripped stdout of the external `cf` command is used as the LM path
        lm = subprocess.check_output(["cf", kwargs["arpa_lm"]]).decode().strip()
    else:
        lm = None
    from returnn.datasets.util.vocabulary import Vocabulary

    vocab = Vocabulary.create_vocab(vocab_file=kwargs["returnn_vocab"], unknown_label=None)
    labels = vocab.labels

    run_ctx.ctc_decoder = ctc_decoder(
        lexicon=kwargs["lexicon"],
        lm=lm,
        lm_weight=kwargs["lm_weight"],
        tokens=labels + ["[blank]", "[SILENCE]", "[UNK]"],
        # "[SILENCE]" and "[UNK]" are not actually part of the vocab,
        # but the decoder is happy as long they are defined in the token list
        # even if they do not exist as label index in the softmax output,
        blank_token="[blank]",
        sil_token="[SILENCE]",
        unk_word="[unknown]",
        nbest=1,
        beam_size=kwargs["beam_size"],
        beam_size_token=kwargs.get("beam_size_token", None),
        beam_threshold=kwargs["beam_threshold"],
        sil_score=kwargs.get("sil_score", 0.0),
        word_score=kwargs.get("word_score", 0.0),
    )
    run_ctx.labels = labels
    run_ctx.blank_log_penalty = kwargs.get("blank_log_penalty", None)

    if kwargs.get("prior_file", None):
        run_ctx.prior = np.loadtxt(kwargs["prior_file"], dtype="float32")
        run_ctx.prior_scale = kwargs["prior_scale"]
    else:
        run_ctx.prior = None


def search_finish_hook(run_ctx, **kwargs):
    """Terminate the dict literal in the recognition file and close it."""
    run_ctx.recognition_file.write("}\n")
    run_ctx.recognition_file.close()
def search_step(*, model, data, run_ctx, **kwargs):
    """
    Decode one batch with the CTC decoder and append the results to the recognition file.

    :param model: called with ``raw_audio``, ``raw_audio_lengths`` and ``recognition=True``;
        must return ``(log_probs, lengths)``
    :param data: batch dict; reads ``raw_audio``, ``raw_audio:size1`` and ``seq_tag``
    :param run_ctx: needs ``blank_log_penalty``, ``prior`` (and ``prior_scale`` when a prior
        is set), ``ctc_decoder`` and ``recognition_file``
    """
    audio = data["raw_audio"]  # [B, T', F]
    audio_lengths = data["raw_audio:size1"]  # [B]

    logprobs, out_lengths = model(
        raw_audio=audio,
        raw_audio_lengths=audio_lengths,
        recognition=True,
    )

    seq_tags = data["seq_tag"]

    scores = logprobs.cpu()
    penalty = run_ctx.blank_log_penalty
    if penalty is not None:
        # assumes blank is last
        scores[:, :, -1] -= penalty
    if run_ctx.prior is not None:
        scores -= run_ctx.prior_scale * run_ctx.prior

    results = run_ctx.ctc_decoder(scores, out_lengths.cpu())

    for nbest, seq_tag in zip(results, seq_tags):
        # keep the best hypothesis only and drop bracketed special words
        kept_words = []
        for word in nbest[0].words:
            if word.startswith("["):
                continue
            kept_words.append(word)
        sequence = " ".join(kept_words)
        print(sequence)
        run_ctx.recognition_file.write("%s: %s,\n" % (repr(seq_tag), repr(sequence)))
a/users/rilling/experiments/librispeech/librispeech_joint_training/pytorch_networks/glowTTS_ASR_conformer_x_vector_two_forward_pass.py b/users/rilling/experiments/librispeech/librispeech_joint_training/pytorch_networks/glowTTS_ASR_conformer_x_vector_two_forward_pass.py index 05dbaf9d9..6def36536 100644 --- a/users/rilling/experiments/librispeech/librispeech_joint_training/pytorch_networks/glowTTS_ASR_conformer_x_vector_two_forward_pass.py +++ b/users/rilling/experiments/librispeech/librispeech_joint_training/pytorch_networks/glowTTS_ASR_conformer_x_vector_two_forward_pass.py @@ -54,7 +54,7 @@ from .shared import attentions from .monotonic_align import maximum_path -from .shared.forward import search_init_hook, search_finish_hook +from .shared.forward import search_init_hook, search_finish_hook, prior_init_hook, prior_finish_hook, prior_step from .shared.eval_forward import * diff --git a/users/rilling/experiments/librispeech/librispeech_joint_training/pytorch_networks/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2.py b/users/rilling/experiments/librispeech/librispeech_joint_training/pytorch_networks/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2.py index 7de3a79cf..56c1b4fb9 100644 --- a/users/rilling/experiments/librispeech/librispeech_joint_training/pytorch_networks/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2.py +++ b/users/rilling/experiments/librispeech/librispeech_joint_training/pytorch_networks/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2.py @@ -54,7 +54,7 @@ from .shared import attentions from .monotonic_align import maximum_path -from .shared.forward import search_init_hook, search_finish_hook +from .shared.forward import search_init_hook, search_finish_hook, prior_init_hook, prior_finish_hook, prior_step from .shared.eval_forward import * diff --git a/users/rilling/experiments/librispeech/librispeech_joint_training/pytorch_networks/glowTTS_ASR_conformer_x_vector_v2.py 
b/users/rilling/experiments/librispeech/librispeech_joint_training/pytorch_networks/glowTTS_ASR_conformer_x_vector_v2.py index 370350966..65a206701 100644 --- a/users/rilling/experiments/librispeech/librispeech_joint_training/pytorch_networks/glowTTS_ASR_conformer_x_vector_v2.py +++ b/users/rilling/experiments/librispeech/librispeech_joint_training/pytorch_networks/glowTTS_ASR_conformer_x_vector_v2.py @@ -54,8 +54,9 @@ from .shared import attentions from .monotonic_align import maximum_path -from .shared.forward import search_init_hook, search_finish_hook +from .shared.forward import search_init_hook, search_finish_hook, prior_init_hook, prior_finish_hook, prior_step from .shared.eval_forward import * +from .shared.eval_invertibility import * from IPython import embed diff --git a/users/rilling/experiments/librispeech/librispeech_joint_training/pytorch_networks/glow_ASR_conformer.py b/users/rilling/experiments/librispeech/librispeech_joint_training/pytorch_networks/glow_ASR_conformer.py index 181e53bd6..a60cf462d 100644 --- a/users/rilling/experiments/librispeech/librispeech_joint_training/pytorch_networks/glow_ASR_conformer.py +++ b/users/rilling/experiments/librispeech/librispeech_joint_training/pytorch_networks/glow_ASR_conformer.py @@ -54,7 +54,7 @@ from .shared import attentions from .monotonic_align import maximum_path -from .shared.forward import search_init_hook, search_finish_hook +from .shared.forward import search_init_hook, search_finish_hook, prior_finish_hook, prior_init_hook, prior_step from .shared.eval_invertibility import forward_init_hook_invertibility, forward_finish_hook_invertibility, forward_step_invertibility from IPython import embed diff --git a/users/rilling/experiments/librispeech/librispeech_joint_training/pytorch_networks/glow_ASR_conformer_specaugment_before.py b/users/rilling/experiments/librispeech/librispeech_joint_training/pytorch_networks/glow_ASR_conformer_specaugment_before.py index 99c78cbfd..84a7f5656 100644 --- 
a/users/rilling/experiments/librispeech/librispeech_joint_training/pytorch_networks/glow_ASR_conformer_specaugment_before.py +++ b/users/rilling/experiments/librispeech/librispeech_joint_training/pytorch_networks/glow_ASR_conformer_specaugment_before.py @@ -54,7 +54,7 @@ from .shared import attentions from .monotonic_align import maximum_path -from .shared.forward import search_init_hook, search_finish_hook +from .shared.forward import search_init_hook, search_finish_hook, prior_finish_hook, prior_init_hook, prior_step from .shared.eval_invertibility import forward_init_hook_invertibility, forward_finish_hook_invertibility, forward_step_invertibility from IPython import embed diff --git a/users/rilling/experiments/librispeech/librispeech_joint_training/pytorch_networks/glow_ASR_conformer_specaugment_before_xvector.py b/users/rilling/experiments/librispeech/librispeech_joint_training/pytorch_networks/glow_ASR_conformer_specaugment_before_xvector.py index 3565e6b00..54e8d707f 100644 --- a/users/rilling/experiments/librispeech/librispeech_joint_training/pytorch_networks/glow_ASR_conformer_specaugment_before_xvector.py +++ b/users/rilling/experiments/librispeech/librispeech_joint_training/pytorch_networks/glow_ASR_conformer_specaugment_before_xvector.py @@ -54,7 +54,7 @@ from .shared import attentions from .monotonic_align import maximum_path -from .shared.forward import search_init_hook, search_finish_hook +from .shared.forward import search_init_hook, search_finish_hook, prior_finish_hook, prior_init_hook, prior_step from .shared.eval_invertibility import forward_init_hook_invertibility, forward_finish_hook_invertibility, forward_step_invertibility from IPython import embed diff --git a/users/rilling/experiments/librispeech/librispeech_joint_training/pytorch_networks/glow_ASR_conformer_specaugment_before_xvector_v2.py b/users/rilling/experiments/librispeech/librispeech_joint_training/pytorch_networks/glow_ASR_conformer_specaugment_before_xvector_v2.py index 
476060b0c..61468f88d 100644 --- a/users/rilling/experiments/librispeech/librispeech_joint_training/pytorch_networks/glow_ASR_conformer_specaugment_before_xvector_v2.py +++ b/users/rilling/experiments/librispeech/librispeech_joint_training/pytorch_networks/glow_ASR_conformer_specaugment_before_xvector_v2.py @@ -54,7 +54,7 @@ from .shared import attentions from .monotonic_align import maximum_path -from .shared.forward import search_init_hook, search_finish_hook +from .shared.forward import search_init_hook, search_finish_hook, prior_finish_hook, prior_init_hook, prior_step from IPython import embed diff --git a/users/rilling/experiments/librispeech/librispeech_joint_training/pytorch_networks/glow_ASR_conformer_xvector.py b/users/rilling/experiments/librispeech/librispeech_joint_training/pytorch_networks/glow_ASR_conformer_xvector.py index a60d5bad9..bbce70703 100644 --- a/users/rilling/experiments/librispeech/librispeech_joint_training/pytorch_networks/glow_ASR_conformer_xvector.py +++ b/users/rilling/experiments/librispeech/librispeech_joint_training/pytorch_networks/glow_ASR_conformer_xvector.py @@ -54,7 +54,7 @@ from .shared import attentions from .monotonic_align import maximum_path -from .shared.forward import search_init_hook, search_finish_hook +from .shared.forward import search_init_hook, search_finish_hook, prior_finish_hook, prior_init_hook, prior_step from IPython import embed diff --git a/users/rilling/experiments/librispeech/librispeech_joint_training/pytorch_networks/glow_ASR_conformer_xvector_eval.py b/users/rilling/experiments/librispeech/librispeech_joint_training/pytorch_networks/glow_ASR_conformer_xvector_eval.py index 4eaf3b251..f6178e1f9 100644 --- a/users/rilling/experiments/librispeech/librispeech_joint_training/pytorch_networks/glow_ASR_conformer_xvector_eval.py +++ b/users/rilling/experiments/librispeech/librispeech_joint_training/pytorch_networks/glow_ASR_conformer_xvector_eval.py @@ -54,7 +54,7 @@ from .shared import attentions from 
.monotonic_align import maximum_path -from .shared.forward import search_init_hook, search_finish_hook +from .shared.forward import search_init_hook, search_finish_hook, prior_finish_hook, prior_init_hook, prior_step from IPython import embed diff --git a/users/rilling/experiments/librispeech/librispeech_joint_training/pytorch_networks/only_conformer.py b/users/rilling/experiments/librispeech/librispeech_joint_training/pytorch_networks/only_conformer.py index 9b5d9d215..3f236c30f 100644 --- a/users/rilling/experiments/librispeech/librispeech_joint_training/pytorch_networks/only_conformer.py +++ b/users/rilling/experiments/librispeech/librispeech_joint_training/pytorch_networks/only_conformer.py @@ -56,9 +56,6 @@ # search_init_hook, search_step, search_finish_hook, - prior_init_hook, - prior_finish_hook, - prior_step, ) from IPython import embed @@ -278,3 +275,38 @@ def search_init_hook(run_ctx, **kwargs): else: run_ctx.prior = None + +def prior_init_hook(run_ctx, **kwargs): + # we are storing durations, but call it output.hdf to match + # the default output of the ReturnnForwardJob + run_ctx.sum_probs = None + run_ctx.sum_frames = 0 + + +def prior_finish_hook(run_ctx, **kwargs): + all_frames = run_ctx.sum_frames.detach().cpu().numpy() + all_probs = run_ctx.sum_probs.detach().cpu().numpy() + average_probs = all_probs / all_frames + log_average_probs = np.log(average_probs) + print("Prior sum in std-space (should be close to 1.0):", np.sum(average_probs)) + with open("prior.txt", 'w') as f: + np.savetxt(f, log_average_probs, delimiter=' ') + print("Saved prior in prior.txt in +log space.") + + +def prior_step(*, model, data, run_ctx, **kwargs): + raw_audio = data["audio_features"] # [B, T', F] + raw_audio_len = data["audio_features:size1"] # [B] + + logprobs, audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + + probs = torch.exp(logprobs) + run_ctx.sum_frames = run_ctx.sum_frames + torch.sum(audio_features_len) + if 
run_ctx.sum_probs is None: + run_ctx.sum_probs = torch.sum(probs, dim=(0, 1)) + else: + run_ctx.sum_probs += torch.sum(probs, dim=(0, 1)) + diff --git a/users/rilling/experiments/librispeech/librispeech_joint_training/pytorch_networks/shared/eval_invertibility.py b/users/rilling/experiments/librispeech/librispeech_joint_training/pytorch_networks/shared/eval_invertibility.py index 3cdc198d0..7dcde6680 100644 --- a/users/rilling/experiments/librispeech/librispeech_joint_training/pytorch_networks/shared/eval_invertibility.py +++ b/users/rilling/experiments/librispeech/librispeech_joint_training/pytorch_networks/shared/eval_invertibility.py @@ -66,7 +66,91 @@ def forward_step_invertibility(*, model, data, run_ctx, **kwargs): mae.sum() / current_num_of_obs ) # This considers the masking by only using the mean over all unmasked elements - current_var = (mae - current_mae).sum() / ( + current_var = (mae - current_mae).pow(2).sum() / ( + current_num_of_obs - 1 + ) # Variance over unmasked elements with bias correction 1 + + run_ctx.total_mae = ((run_ctx.num_of_obs / (run_ctx.num_of_obs + current_num_of_obs)) * old_mae) + ( + (current_num_of_obs / (run_ctx.num_of_obs + current_num_of_obs)) * current_mae + ) + + run_ctx.total_ae_var = ( + (run_ctx.num_of_obs / (run_ctx.num_of_obs + current_num_of_obs)) * run_ctx.total_ae_var + + ((current_num_of_obs / (run_ctx.num_of_obs + current_num_of_obs)) * current_var) + + ((run_ctx.num_of_obs * current_num_of_obs) / (run_ctx.num_of_obs + current_num_of_obs) ** 2) + * (old_mae - current_mae) ** 2 + ) + + run_ctx.total_ae_max = torch.max(run_ctx.total_ae_max, mae.max()) + + run_ctx.total_ae_min = torch.min( + run_ctx.total_ae_min, (mae + (-1 * z_mask + 1) * torch.tensor(float("inf")).nan_to_num(0.0)).min() + ) # Masked Min operation + + run_ctx.num_of_obs += current_num_of_obs + +def forward_init_hook_asr_invertibility(run_ctx, **kwargs): + run_ctx.total_mae = 0 + run_ctx.total_ae_var = 0 + run_ctx.total_ae_max = torch.tensor(-np.inf) + 
run_ctx.total_ae_min = torch.tensor(np.inf) + run_ctx.num_of_obs = 0 + + +def forward_finish_hook_asr_invertibility(run_ctx, **kwargs): + with open("output.hdf", "w+") as f: + f.write("total, mean, var, max, min \n") + f.write( + f"{run_ctx.num_of_obs}, {str(float(run_ctx.total_mae))}, {str(float(run_ctx.total_ae_var))}, {str(float(run_ctx.total_ae_max))}, {str(float(run_ctx.total_ae_min))}" + ) + + +def forward_step_asr_invertibility(*, model, data, run_ctx, **kwargs): + raw_audio = data["audio_features"] # [B, N] (sparse) + raw_audio_len = data["audio_features:size1"] # [B] + phonemes = data["phonemes"] + phonemes_len = data["phonemes:size1"] + + if "xvectors" in data: + g = data["xvectors"] + elif "speaker_labels" in data: + g = data["speaker_labels"] + else: + raise Exception("Missing speaker embedding!") + + squeezed_audio = torch.squeeze(raw_audio) + y, y_lengths = model.feature_extraction(squeezed_audio, raw_audio_len) # [B, T, F] + y = y.transpose(1, 2) # [B, F, T] + + if hasattr(model, "x_vector"): + _, _, g = model.x_vector(y, y_lengths) + + if hasattr(model, "x_vector_bottleneck"): + g = model.x_vector_bottleneck(g) + elif hasattr(model, "emb_g"): + g = torch.nn.functional.normalize(model.emb_g(g.squeeze(-1))).unsqueeze(-1) + else: + g = None + + y_max_length = y.size(2) + + y, y_lengths, y_max_length = model.preprocess(y, y_lengths, y_max_length) + z_mask = torch.unsqueeze(commons.sequence_mask(y_lengths, y_max_length), 1).to(torch.int32) + + z, _ = model.decoder(y, z_mask, g=g, reverse=False) + y_hat, _ = model.decoder(z, z_mask, g=g, reverse=True) + + mae = torch.nn.functional.l1_loss(y_hat * z_mask, y * z_mask, reduction="none") # [B, F, T] + + current_num_of_obs = y_hat.shape[1] * y_lengths.sum() # F * total_number_of_frames_in_batch + + old_mae = run_ctx.total_mae + + current_mae = ( + mae.sum() / current_num_of_obs + ) # This considers the masking by only using the mean over all unmasked elements + + current_var = (mae - current_mae).pow(2).sum() 
/ ( current_num_of_obs - 1 ) # Variance over unmasked elements with bias correction 1 diff --git a/users/rilling/experiments/librispeech/librispeech_joint_training/pytorch_networks/shared/forward.py b/users/rilling/experiments/librispeech/librispeech_joint_training/pytorch_networks/shared/forward.py index 972b8500f..5d0d956d6 100644 --- a/users/rilling/experiments/librispeech/librispeech_joint_training/pytorch_networks/shared/forward.py +++ b/users/rilling/experiments/librispeech/librispeech_joint_training/pytorch_networks/shared/forward.py @@ -93,12 +93,13 @@ def prior_finish_hook(run_ctx, **kwargs): def prior_step(*, model, data, run_ctx, **kwargs): - raw_audio = data["raw_audio"] # [B, T', F] - raw_audio_len = data["raw_audio:size1"] # [B] + raw_audio = data["audio_features"] # [B, T', F] + raw_audio_len = data["audio_features:size1"] # [B] logprobs, audio_features_len = model( raw_audio=raw_audio, - raw_audio_len=raw_audio_len, + raw_audio_lengths=raw_audio_len, + recognition=True ) probs = torch.exp(logprobs) diff --git a/users/rilling/experiments/librispeech/librispeech_joint_training/pytorch_networks/shared/i6modelsV1_VGG4LayerActFrontendV1_v4_cfg.py b/users/rilling/experiments/librispeech/librispeech_joint_training/pytorch_networks/shared/i6modelsV1_VGG4LayerActFrontendV1_v4_cfg.py index 48df0d70d..eafc1faa9 100644 --- a/users/rilling/experiments/librispeech/librispeech_joint_training/pytorch_networks/shared/i6modelsV1_VGG4LayerActFrontendV1_v4_cfg.py +++ b/users/rilling/experiments/librispeech/librispeech_joint_training/pytorch_networks/shared/i6modelsV1_VGG4LayerActFrontendV1_v4_cfg.py @@ -130,5 +130,28 @@ def from_dict(cls, d): d["decoder_config"] = FlowDecoderConfig.from_dict(d["decoder_config"]) d["text_encoder_config"] = TextEncoderConfig.from_dict(d["text_encoder_config"]) return ModelConfig(**d) + +@dataclass +class ModelConfigV2(): + specaug_config: Union[SpecaugConfig, None] + decoder_config: FlowDecoderConfig + text_encoder_config: 
TextEncoderConfig + specauc_start_epoch: int + label_target_size: int + subsampling_factor: int + blstm_layers: int + blstm_hidden_dim: int + blstm_dropout: float + out_channels: int + gin_channels: int + n_speakers: Union[tk.Variable, int] + + @classmethod + def from_dict(cls, d): + d = d.copy() + d["specaug_config"] = SpecaugConfig.from_dict(d["specaug_config"]) + d["decoder_config"] = FlowDecoderConfig.from_dict(d["decoder_config"]) + d["text_encoder_config"] = TextEncoderConfig.from_dict(d["text_encoder_config"]) + return ModelConfigV2(**d) diff --git a/users/rilling/experiments/librispeech/librispeech_joint_training/serializer.py b/users/rilling/experiments/librispeech/librispeech_joint_training/serializer.py index 03d4e2079..82160af4a 100644 --- a/users/rilling/experiments/librispeech/librispeech_joint_training/serializer.py +++ b/users/rilling/experiments/librispeech/librispeech_joint_training/serializer.py @@ -188,6 +188,25 @@ def get_serializer( import_as="forward_finish_hook", ) serializer_objects.extend([vocoder_dependencies, forward_step, init_hook, finish_hook]) + elif target == "prior": + forward_step = Import( + code_object_path=package + ".%s.prior_step" % network_module, + unhashed_package_root=PACKAGE, + import_as="forward_step", + ) + init_hook = Import( + code_object_path=package + ".%s.prior_init_hook" % network_module, + unhashed_package_root=PACKAGE, + import_as="forward_init_hook", + ) + finish_hook = Import( + code_object_path=package + ".%s.prior_finish_hook" % network_module, + import_as="forward_finish_hook", + unhashed_package_root=PACKAGE, + ) + serializer_objects.extend( + [forward_step, init_hook, finish_hook] + ) else: forward_step = Import( code_object_path=package + f".{network_module}.forward_step_{target}", diff --git a/users/rilling/experiments/librispeech/librispeech_joint_training/training_comparison.ipynb b/users/rilling/experiments/librispeech/librispeech_joint_training/training_comparison.ipynb index 8219973d6..56937615a 
100644 --- a/users/rilling/experiments/librispeech/librispeech_joint_training/training_comparison.ipynb +++ b/users/rilling/experiments/librispeech/librispeech_joint_training/training_comparison.ipynb @@ -2,9 +2,18 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": 15, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The autoreload extension is already loaded. To reload it, use:\n", + " %reload_ext autoreload\n" + ] + } + ], "source": [ "import sys\n", "sys.path.append(\"/u/lukas.rilling/dev/\")\n", @@ -23,24 +32,33 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "{'/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_encoder_sample_ctc_scale_0.1/training': ''}" + "{'/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_ctc_scale_0.1/training': '/glowTTS_ASR_conformer_two_forward_pass_ctc_scale_0.1/',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_strong_conformer_ctc_scale_1.0/training': '/glowTTS_ASR_conformer_two_forward_pass_strong_conformer_ctc_scale_1.0/',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_strong_conformer_weak_specaug_ctc_scale_1.0/training': '/glowTTS_ASR_conformer_two_forward_pass_strong_conformer_weak_specaug_ctc_scale_1.0/',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_ddi_actnorm/training': '/glowTTS_ASR_conformer_two_forward_pass_ddi_actnorm/',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_strong_conformer_ctc_scale_0.1/training': '/glowTTS_ASR_conformer_two_forward_pass_strong_conformer_ctc_scale_0.1/',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_strong_conformer_weak_specaug_ctc_scale_0.1/training': '/glowTTS_ASR_conformer_two_forward_pass_strong_conformer_weak_specaug_ctc_scale_0.1/',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass/training': '/glowTTS_ASR_conformer_two_forward_pass/'}" ] }, - "execution_count": 2, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "globs = [\n", - " # \"/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector/training\",\n", - " \"/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_encoder*/training\"\n", + " \"/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass*/training\",\n", + " # \"/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_strong_conformer_ctc_scale_1.0/training\",\n", + " # \"/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2/training\",\n", + " # \"/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_blstm_x_vector*/training\",\n", + " # 
\"/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_encoder*/training\"\n", " # \"/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/raw_audio/glowTTS_ASR_conformer_x_vector_spec_augment_ctc_scale_0.1/training\",\n", " # \"/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/raw_audio/glowTTS_ASR_conformer_x_vector_control_spec_augment/training\",\n", " # \"/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/raw_audio/glowTTS_ASR_conformer_x_vector_spec_augment_ctc_scale_0.1/training\"\n", @@ -70,19 +88,26 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "['/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_encoder_sample_ctc_scale_0.1/training',\n", + "['/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_ctc_scale_0.1/training',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_strong_conformer_ctc_scale_1.0/training',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_strong_conformer_weak_specaug_ctc_scale_1.0/training',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_ddi_actnorm/training',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_strong_conformer_ctc_scale_0.1/training',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_strong_conformer_weak_specaug_ctc_scale_0.1/training',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass/training',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/tts_architecture/glow_tts/raw_audio/glowTTS/enc192/100ep/not_silence_preprocessed/training',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/tts_architecture/glow_tts/raw_audio/glowTTS_x_vector/enc768/100ep/not_silence_preprocessed/training',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/weak_baseline/training']" + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/weak_baseline/training',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/no_glow/training']" ] }, - "execution_count": 3, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } @@ -112,12 +137,17 @@ ")\n", "files[lr_files[-1]] = \"Baseline Weak Conformer\"\n", "\n", + "lr_files.append(\n", + " \"/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/no_glow/training\"\n", + ")\n", + "files[lr_files[-1]] = \"Baseline Strong Conformer\"\n", + "\n", "lr_files" ] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 18, "metadata": {}, "outputs": [ { @@ 
-125,15 +155,15 @@ "text/plain": [ "{'/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/tts_architecture/glow_tts/raw_audio/glowTTS/enc192/100ep/not_silence_preprocessed/training': 1,\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/tts_architecture/glow_tts/raw_audio/glowTTS_x_vector_v3_norm_xvector/enc768/100ep/not_silence_preprocessed/training': 1,\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/tts_architecture/glow_tts/raw_audio/glowTTS_x_vector_v3/enc768/100ep/not_silence_preprocessed/training': 1,\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/tts_architecture/glow_tts/raw_audio/glowTTS_x_vector/enc768/100ep/not_silence_preprocessed/training': 1,\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/tts_architecture/glow_tts/raw_audio/glowTTS_x_vector_v3/enc768/100ep/not_silence_preprocessed/training': 1,\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/default/raw_audio/glowTTS_x_vector_pe1_radam/training': 1,\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/default/raw_audio/glowTTS_x_vector_pe1_radam_no_dec_dropout/training': 1,\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/default/raw_audio/glowTTS_x_vector_pe1/training': 1,\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/default/raw_audio/glowTTS_x_vector_pe1_radam_no_dec_dropout/training': 1,\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/default/raw_audio/glowTTS/training': 1}" ] }, - "execution_count": 4, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" } @@ -158,17 +188,24 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 19, "metadata": {}, 
"outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_encoder_sample_ctc_scale_0.1/training: 3\n", + "/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_ctc_scale_0.1/training: 3\n", + "/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_strong_conformer_ctc_scale_1.0/training: 3\n", + "/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_strong_conformer_weak_specaug_ctc_scale_1.0/training: 3\n", + "/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_ddi_actnorm/training: 3\n", + "/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_strong_conformer_ctc_scale_0.1/training: 3\n", + "/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_strong_conformer_weak_specaug_ctc_scale_0.1/training: 3\n", + "/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass/training: 3\n", "/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/tts_architecture/glow_tts/raw_audio/glowTTS/enc192/100ep/not_silence_preprocessed/training: 1\n", "/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/tts_architecture/glow_tts/raw_audio/glowTTS_x_vector/enc768/100ep/not_silence_preprocessed/training: 1\n", 
"/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/weak_baseline/training: 3\n", + "/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/no_glow/training: 3\n", "Large Font: False\n", "Setup Interactive Legend\n", "Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous view', 'arrow-left', 'back'), ('Forward', 'Forward to next view', 'arrow-right', 'forward'), ('Pan', 'Left button pans, Right button zooms\\nx/y fixes axis, CTRL fixes aspect', 'arrows', 'pan'), ('Zoom', 'Zoom to rectangle\\nx/y fixes axis', 'square-o', 'zoom'), ('Download', 'Download plot', 'floppy-o', 'save_figure')]))\n" @@ -177,18 +214,18 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "ee80df0cc455421888693e1de3ae055b", + "model_id": "79c0076f856b4e01889533746fe74923", "version_major": 2, "version_minor": 0 }, - "image/png": "", + "image/png": "", "text/html": [ "\n", "
\n", "
\n", " Figure\n", "
\n", - " \n", + " \n", "
\n", " " ], @@ -225,9 +262,9 @@ " plot_lr=False,\n", " large_font=False,\n", " shrink_axes=0.6,\n", - " # ylim_max=4,\n", + " ylim_max=4,\n", " ylim_min=-0.9,\n", - " keys_exclude=\"devtrain\",\n", + " keys_exclude=\"devtrain_loss_(ctc|mle)\",\n", " color_map=\"Set1\",\n", " draggable=True,\n", ")" diff --git a/users/rilling/experiments/librispeech/librispeech_joint_training_given_alignments/config.py b/users/rilling/experiments/librispeech/librispeech_joint_training_given_alignments/config.py index 669e1954b..c96fd1fcb 100644 --- a/users/rilling/experiments/librispeech/librispeech_joint_training_given_alignments/config.py +++ b/users/rilling/experiments/librispeech/librispeech_joint_training_given_alignments/config.py @@ -110,6 +110,7 @@ def get_forward_config( target="audio", train_data=False, joint_data=False, + cv_asr=False, ): """ Returns the RETURNN config serialized by :class:`ReturnnCommonSerializer` in returnn_common for forward_ctc_aligner @@ -125,6 +126,8 @@ def get_forward_config( fd = forward_dataset.train.as_returnn_opts() elif joint_data: fd = forward_dataset.joint.as_returnn_opts() + elif cv_asr: + fd = forward_dataset.cv_asr.as_returnn_opts() else: fd = forward_dataset.cv.as_returnn_opts() @@ -191,3 +194,49 @@ def get_search_config( ) returnn_config = ReturnnConfig(config=config, post_config=post_config, python_epilog=[serializer]) return returnn_config + +def get_prior_config( + training_datasets: TrainingDataset, + network_module: str, + net_args: Dict[str, Any], + config: Dict[str, Any], + debug: bool = False, + use_custom_engine=False, + target="prior", + **kwargs, +): + """ + Returns the RETURNN config serialized by :class:`ReturnnCommonSerializer` in returnn_common for the ctc_aligner + :param returnn_common_root: returnn_common version to be used, usually output of CloneGitRepositoryJob + :param training_datasets: datasets for training + :param kwargs: arguments to be passed to the network construction + :return: RETURNN training config + 
""" + + # changing these does not change the hash + post_config = { + } + + base_config = { + ############# + "batch_size": 50000 * 160, + "max_seqs": 60, + ############# + "forward": training_datasets.prior.as_returnn_opts() if target == "prior" else training_datasets.devtrain.as_returnn_opts() + + } + config = {**base_config, **copy.deepcopy(config)} + post_config["backend"] = "torch" + + serializer = get_serializer( + network_module=network_module, + net_args=net_args, + debug=debug, + use_custom_engine=use_custom_engine, + forward=True, + target=target, + ) + returnn_config = ReturnnConfig( + config=config, post_config=post_config, python_epilog=[serializer] + ) + return returnn_config diff --git a/users/rilling/experiments/librispeech/librispeech_joint_training_given_alignments/data.py b/users/rilling/experiments/librispeech/librispeech_joint_training_given_alignments/data.py index 8e1229403..32960285d 100644 --- a/users/rilling/experiments/librispeech/librispeech_joint_training_given_alignments/data.py +++ b/users/rilling/experiments/librispeech/librispeech_joint_training_given_alignments/data.py @@ -74,6 +74,7 @@ class TrainingDataset: devtrain: Dataset cv_asr: Dataset datastreams: Dict[str, Datastream] + prior:Dataset @dataclass() @@ -555,7 +556,17 @@ def build_training_dataset( ) devtrain_dataset = make_meta_dataset(devtrain_zip_dataset, joint_speaker_dataset, train_eow_phonemes_dataset, duration_dataset=duration_dataset, xvector_dataset=xvector_dataset) - return TrainingDataset(train=train_dataset, cv=cv_dataset, cv_asr=cv_dataset_asr, devtrain=devtrain_dataset, datastreams=datastreams) + prior_zip_dataset = OggZipDataset( + files=train_ogg, + audio_options=training_audio_opts, + target_options=train_phoneme_datastream_tts.as_returnn_targets_opts(), + partition_epoch=1, + seq_ordering="sorted_reverse", + additional_options=additional_opts, + ) + prior_dataset = make_meta_dataset(prior_zip_dataset, joint_speaker_dataset, train_eow_phonemes_dataset) + + 
return TrainingDataset(train=train_dataset, cv=cv_dataset, cv_asr=cv_dataset_asr, devtrain=devtrain_dataset, datastreams=datastreams, prior=prior_dataset) @lru_cache() diff --git a/users/rilling/experiments/librispeech/librispeech_joint_training_given_alignments/exp_joint_2step/experiments.py b/users/rilling/experiments/librispeech/librispeech_joint_training_given_alignments/exp_joint_2step/experiments.py index 277fae3fd..167f98973 100644 --- a/users/rilling/experiments/librispeech/librispeech_joint_training_given_alignments/exp_joint_2step/experiments.py +++ b/users/rilling/experiments/librispeech/librispeech_joint_training_given_alignments/exp_joint_2step/experiments.py @@ -15,8 +15,8 @@ get_text_lexicon, get_bliss_corpus_dict ) -from ..config import get_training_config, get_extract_durations_forward__config, get_forward_config, get_search_config -from ..pipeline import training, forward, search, compute_phoneme_pred_accuracy +from ..config import get_training_config, get_extract_durations_forward__config, get_forward_config, get_search_config, get_prior_config +from ..pipeline import training, forward, search, compute_phoneme_pred_accuracy, compute_prior from i6_experiments.users.rilling.experiments.librispeech.common.tts_eval import tts_eval @@ -74,6 +74,8 @@ def run_exp( assert num_epochs == len(args["config"]["learning_rates"]), "Number of Epochs and Number of LR steps differs!" 
+ with_prior = "prior_scale" in search_args + if given_train_job_for_forward is None: training_config = get_training_config( training_datasets=dataset, @@ -121,6 +123,18 @@ def run_exp( train_job = given_train_job_for_forward exp["train_job"] = train_job + if with_prior: + returnn_config = get_prior_config(training_datasets=dataset, **args) + prior_file = compute_prior( + prefix + name, + returnn_config, + checkpoint=train_job.out_checkpoints[num_epochs], + returnn_exe=RETURNN_PYTORCH_ASR_SEARCH_EXE, + returnn_root=MINI_RETURNN_ROOT, + ) + tk.register_output(prefix + name + "/prior.txt", prior_file) + search_args["prior_file"] = prior_file + if tts_forward: forward_job_gl = tts_eval( checkpoint=train_job.out_checkpoints[num_epochs], @@ -587,4 +601,22 @@ def run_exp( search_args=default_search_args, asr_search=True, asr_cv_set=True, - ) \ No newline at end of file + phoneme_pred=False, + ) + + for lm_w in [2, 2.5, 3.0, 3.5, 4.0, 4.5]: + for ps in [0, 0.3, 0.5]: + additional_search_args = {"lm_weight": lm_w} if ps == 0 else {"lm_weight": lm_w, "prior_scale": ps} + suffix = f"/tuning/lm_{lm_w}" if ps == 0 else f"/tuning/lm_{lm_w}_ps_{ps}" + exp_dict = run_exp( + "second_step_asr/" + net_module.replace(".", "/") + suffix, + train_args, + training_datasets_pe3, + asr_test_datasets, + 250, + forward_args=forward_args, + search_args={**default_search_args, **additional_search_args}, + asr_search=True, + asr_cv_set=True, + phoneme_pred=False, + ) \ No newline at end of file diff --git a/users/rilling/experiments/librispeech/librispeech_joint_training_given_alignments/exp_joint_2step/training_comparison.ipynb b/users/rilling/experiments/librispeech/librispeech_joint_training_given_alignments/exp_joint_2step/training_comparison.ipynb index 51ba86d4c..d5f74bf85 100644 --- a/users/rilling/experiments/librispeech/librispeech_joint_training_given_alignments/exp_joint_2step/training_comparison.ipynb +++ 
b/users/rilling/experiments/librispeech/librispeech_joint_training_given_alignments/exp_joint_2step/training_comparison.ipynb @@ -144,18 +144,18 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "8879ad67b2a94651b6ff26731340937c", + "model_id": "6b994cc2a036408b805ee43ad71b84b6", "version_major": 2, "version_minor": 0 }, - "image/png": "", + "image/png": "", "text/html": [ "\n", "
\n", "
\n", " Figure\n", "
\n", - " \n", + " \n", "
\n", " " ], @@ -237,33 +237,33 @@ " \n", " /\n", " 1\n", - " 3.854557\n", - " 3.863975\n", - " 3.834595\n", + " 3.864722\n", + " 3.874153\n", + " 3.841878\n", " \n", " \n", " 2\n", - " 3.742074\n", - " 3.567398\n", - " 3.467777\n", + " 3.768973\n", + " 3.639531\n", + " 3.550206\n", " \n", " \n", " 3\n", - " 2.479751\n", - " 2.167383\n", - " 1.838922\n", + " 2.512775\n", + " 2.184052\n", + " 1.858979\n", " \n", " \n", " 4\n", - " 2.051774\n", - " 1.811539\n", - " 1.417094\n", + " 2.055174\n", + " 1.809958\n", + " 1.418657\n", " \n", " \n", " 5\n", - " 1.835545\n", - " 1.586035\n", - " 1.171362\n", + " 1.839511\n", + " 1.598297\n", + " 1.182285\n", " \n", " \n", "\n", @@ -271,11 +271,11 @@ ], "text/plain": [ " ctc dev_loss_ctc devtrain_loss_ctc\n", - "/ 1 3.854557 3.863975 3.834595\n", - " 2 3.742074 3.567398 3.467777\n", - " 3 2.479751 2.167383 1.838922\n", - " 4 2.051774 1.811539 1.417094\n", - " 5 1.835545 1.586035 1.171362" + "/ 1 3.864722 3.874153 3.841878\n", + " 2 3.768973 3.639531 3.550206\n", + " 3 2.512775 2.184052 1.858979\n", + " 4 2.055174 1.809958 1.418657\n", + " 5 1.839511 1.598297 1.182285" ] }, "execution_count": 6, @@ -333,7 +333,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.13" + "version": "3.10.12" } }, "nbformat": 4, diff --git a/users/rilling/experiments/librispeech/librispeech_joint_training_given_alignments/exp_tts/experiments.py b/users/rilling/experiments/librispeech/librispeech_joint_training_given_alignments/exp_tts/experiments.py index 4e74cd979..72af00dba 100644 --- a/users/rilling/experiments/librispeech/librispeech_joint_training_given_alignments/exp_tts/experiments.py +++ b/users/rilling/experiments/librispeech/librispeech_joint_training_given_alignments/exp_tts/experiments.py @@ -16,7 +16,7 @@ get_text_lexicon, get_bliss_corpus_dict, ) -from ..config import get_training_config, get_extract_durations_forward__config, get_forward_config, get_search_config +from ..config import 
get_training_config, get_extract_durations_forward__config, get_forward_config, get_search_config, get_prior_config from ..pipeline import training, forward, search, compute_phoneme_pred_accuracy from i6_experiments.users.rilling.experiments.librispeech.common.tts_eval import tts_eval @@ -39,11 +39,8 @@ def get_glow_tts(x_vector_exp, joint_exps, tts_exps, gl_checkpoint): """ - Baseline for the glow TTS in returnn_common with serialization - - Uses updated RETURNN_COMMON - - :return: durations_hdf + Contains TTS-only experiments similar to the experiments in ../../librispeech_glowtts/glowTTS/experiments.py + but in a cleaner setup similar to the setup used for joint training and joint training with external alignments """ prefix = "experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/" @@ -52,44 +49,56 @@ def run_exp( name, args, dataset, - test_dataset, num_epochs=100, - use_custom_engine=False, training_args={}, forward_args={}, keep_epochs=None, - extract_x_vector=False, - asr_cv_set=False, - given_train_job_for_forward=None, nisqa_evaluation=True, swer_evaluation=True, tts_eval_datasets=None, eval_invertibility=False, nisqa_confidence=False, + large_gpu=False, + compute_feature_stats=False, ): + """Creates the training and forward configs and runs additional evaluations according to the given parameters + + :param str name: name to be used in alias creation + :param dict args: General training/forward arguments passed to the config creation + :param TrainingDataset dataset: Dataset to be used for training + :param int num_epochs: Number of Epochs for training, defaults to 100 + :param dict training_args: Arguments passed to the train steps, defaults to {} + :param dict forward_args: Arguments passed to the forward steps, defaults to {} + :param list keep_epochs: List of epoch numbers to be kept during training, defaults to None + :param bool nisqa_evaluation: whether autoMOS should be evaluated using NISQA, defaults to True + :param bool 
swer_evaluation: whether synthetic WER should be evaluated, defaults to True + :param dict[MetaDataset] tts_eval_datasets: Datasets to be used for TTS evaluation, defaults to None + :param bool eval_invertibility: whether invertibility of coupling blocks should be evaluated, defaults to False + :param bool nisqa_confidence: if autoMOS is evaluated, whether confidence intervals should be computed, defaults to False + :param bool large_gpu: whether 24gb should be required for training, defaults to False + :param bool compute_feature_stats: for invertibility, also run a forward pass calculating the dataset statistics for comparison; typically only necessary once not for all experiments, defaults to False + :return dict: Dictionary of this experiment containing all jobs that are created + """ exp = {} assert len(args["config"]["learning_rates"]) == num_epochs, "Length of LR schedule and number of epochs differ." - if given_train_job_for_forward is None: - training_config = get_training_config( - training_datasets=dataset, - **args, - training_args=training_args, - use_custom_engine=use_custom_engine, - keep_epochs=keep_epochs, - asr_cv_set=asr_cv_set, - ) # implicit reconstruction loss - - if given_train_job_for_forward is None: - train_job = training( - config=training_config, - returnn_exe=RETURNN_PYTORCH_EXE, - returnn_root=MINI_RETURNN_ROOT, - prefix=prefix + name, - num_epochs=num_epochs, - ) - else: - train_job = given_train_job_for_forward + + training_config = get_training_config( + training_datasets=dataset, + **args, + training_args=training_args, + use_custom_engine=False, + keep_epochs=keep_epochs, + ) # implicit reconstruction loss + + train_job = training( + config=training_config, + returnn_exe=RETURNN_PYTORCH_EXE, + returnn_root=MINI_RETURNN_ROOT, + prefix=prefix + name, + num_epochs=num_epochs, + large_gpu=large_gpu, + ) exp["train_job"] = train_job for ds_k, ds in tts_eval_datasets.items(): @@ -117,33 +126,29 @@ def run_exp( 
nisqa_confidence=nisqa_confidence, ) - if extract_x_vector: - forward_x_vector_config = get_forward_config( - forward_dataset=dataset, **args, forward_args=forward_args, target="xvector", train_data=True - ) + if eval_invertibility: + forward_invertibility_config = get_prior_config(dataset, target="invertibility", **args) forward_xvector_job = forward( checkpoint=train_job.out_checkpoints[num_epochs], - config=forward_x_vector_config, + config=forward_invertibility_config, returnn_exe=RETURNN_PYTORCH_EXE, returnn_root=MINI_RETURNN_ROOT, prefix=prefix + name, - target="xvector", + target="invertibility", ) - exp["forward_xvector_job"] = forward_xvector_job + exp["forward_invertibility_job"] = forward_xvector_job - if eval_invertibility: - forward_x_vector_config = get_forward_config( - forward_dataset=dataset, **args, forward_args=forward_args, target="invertibility" - ) + if compute_feature_stats: + forward_statistics_config = get_prior_config(dataset, target="statistics", **args) forward_xvector_job = forward( checkpoint=train_job.out_checkpoints[num_epochs], - config=forward_x_vector_config, + config=forward_statistics_config, returnn_exe=RETURNN_PYTORCH_EXE, returnn_root=MINI_RETURNN_ROOT, prefix=prefix + name, - target="invertibility", + target="statistics", ) - exp["forward_invertibility_job"] = forward_xvector_job + exp["forward_features_statistics"] = forward_xvector_job return exp @@ -250,23 +255,6 @@ def run_exp( xvectors_file=x_vector_extractions["x_vector_cnn/1e-3_not_silence_preprocessed/test-clean"]["hdf"], ) - asr_test_datasets = {} - - asr_test_datasets["dev-other"] = build_test_dataset(librispeech_key="train-clean-100", dataset_key="dev-other") - - asr_test_datasets2 = copy.deepcopy(asr_test_datasets) - asr_test_datasets2["train-clean-100-cv"] = build_test_dataset( - librispeech_key="train-clean-100", dataset_key="train-clean-100", test_on_tts_cv=True - ) - asr_test_datasets2["dev-clean"] = build_test_dataset(librispeech_key="train-clean-100", 
dataset_key="dev-clean") - - dev_dataset_tuples_with_phon = {} - for testset in ["train-clean"]: - dev_dataset_tuples_with_phon[testset] = ( - training_datasets_pe1_tts_segments.cv, - get_bliss_corpus_dict()["train-clean-100"], - ) - specaug_config = SpecaugConfig( repeat_per_n_frames=100, max_dim_time=20, @@ -387,7 +375,6 @@ def run_exp( net_module + "/enc768/100ep/dec_drop_0.05", train_args_TTS_xvector, training_datasets_pe1_tts_segments, - asr_test_datasets, 100, forward_args=forward_args, tts_eval_datasets=tts_forward_datasets_xvectors, @@ -409,7 +396,6 @@ def run_exp( net_module + "/enc768/100ep/dec_drop_0.05", train_args_TTS_xvector, training_datasets_pe1_tts_segments, - asr_test_datasets, 100, forward_args=forward_args, tts_eval_datasets=tts_forward_datasets_xvectors, @@ -423,13 +409,20 @@ def run_exp( ), ) + add_tts_model( + net_module + "/enc768/100ep/dec_drop_0.05", + TTSModel( + config=ModelConfigV1.from_dict(train_args_TTS_xvector["net_args"]["model_config"]), + checkpoint=exp_dict["train_job"].out_checkpoints[100], + ), + ) + train_args_TTS_xvector_200ep = copy.deepcopy(train_args_TTS_xvector) train_args_TTS_xvector_200ep["config"]["learning_rates"] = lr_schedule_200ep exp_dict = run_exp( net_module + "/enc768/200ep/dec_drop_0.05", train_args_TTS_xvector_200ep, training_datasets_pe1_tts_segments, - asr_test_datasets, 200, forward_args=forward_args, tts_eval_datasets=tts_forward_datasets_xvectors, @@ -450,7 +443,6 @@ def run_exp( net_module + "/enc192/200ep/dec_drop_0.05", train_args_TTS_xvector_200ep, training_datasets_pe1_tts_segments, - asr_test_datasets, 200, forward_args=forward_args, tts_eval_datasets=tts_forward_datasets_xvectors, @@ -476,7 +468,6 @@ def run_exp( net_module + "/enc768/200ep/dec_drop_0.0", train_args_TTS_xvector_200ep_no_dec_dropout, training_datasets_pe1_tts_segments, - asr_test_datasets, 200, forward_args=forward_args, tts_eval_datasets=tts_forward_datasets_xvectors, @@ -490,7 +481,6 @@ def run_exp( net_module + 
"/enc192/200ep/dec_drop_0.0", train_args_TTS_xvector_200ep_no_dec_dropout, training_datasets_pe1_tts_segments, - asr_test_datasets, 200, forward_args=forward_args, tts_eval_datasets=tts_forward_datasets_xvectors, @@ -507,7 +497,6 @@ def run_exp( net_module + "/enc768/200ep_long_cooldown/dec_drop_0.05", train_args_xvector_altLR, training_datasets_pe1_tts_segments, - asr_test_datasets, 200, forward_args=forward_args, tts_eval_datasets=tts_forward_datasets_xvectors, @@ -522,7 +511,6 @@ def run_exp( net_module + "/enc768/200ep_long_cooldown/dec_drop_0.0", train_args_xvector_altLR_no_dec_drop, training_datasets_pe1_tts_segments, - asr_test_datasets, 200, forward_args=forward_args, tts_eval_datasets=tts_forward_datasets_xvectors, @@ -538,7 +526,6 @@ def run_exp( net_module + "/enc768/100ep/dec_drop_0.05", train_args_TTS, training_datasets_pe1_tts_segments, - asr_test_datasets, 100, forward_args=forward_args, tts_eval_datasets=tts_forward_datasets, @@ -553,7 +540,6 @@ def run_exp( net_module + "/enc768/100ep/dec_drop_0.00", train_args_TTS_100ep_no_dec_dropout, training_datasets_pe1_tts_segments, - asr_test_datasets, 100, forward_args=forward_args, tts_eval_datasets=tts_forward_datasets, @@ -575,13 +561,13 @@ def run_exp( net_module + "/enc768/200ep/dec_drop_0.05", train_args_TTS_200ep, training_datasets_pe1_tts_segments, - asr_test_datasets, 200, forward_args=forward_args, swer_evaluation=True, tts_eval_datasets=tts_forward_datasets, eval_invertibility=True, nisqa_confidence=True, + compute_feature_stats=True, ) add_tts_model( net_module + "/enc768/200ep/dec_drop_0.05", @@ -595,7 +581,6 @@ def run_exp( net_module + "/enc768/200ep/dec_drop_0.05_epsilon_1e-8", train_args_TTS_200ep_alt_epsilon, training_datasets_pe1_tts_segments, - asr_test_datasets, 200, forward_args=forward_args, tts_eval_datasets=tts_forward_datasets, @@ -608,7 +593,6 @@ def run_exp( net_module + "/enc192/100ep/dec_drop_0.05", train_args_TTS, training_datasets_pe1_tts_segments, - asr_test_datasets, 100, 
forward_args=forward_args, swer_evaluation=True, @@ -619,7 +603,6 @@ def run_exp( net_module + "/enc192/200ep/dec_drop_0.05", train_args_TTS_200ep, training_datasets_pe1_tts_segments, - asr_test_datasets, 200, forward_args=forward_args, tts_eval_datasets=tts_forward_datasets, @@ -636,7 +619,6 @@ def run_exp( net_module + "/enc192/200ep/dec_drop_0.05_epsilon_1e-8", train_args_TTS_200ep_alt_epsilon, training_datasets_pe1_tts_segments, - asr_test_datasets, 200, forward_args=forward_args, tts_eval_datasets=tts_forward_datasets, @@ -651,7 +633,6 @@ def run_exp( net_module + "/enc768/200ep/dec_drop_0.0/epsilon_1e-8", train_args_TTS_no_dec_dropout, training_datasets_pe1_tts_segments, - asr_test_datasets, 200, forward_args=forward_args, tts_eval_datasets=tts_forward_datasets, @@ -662,7 +643,6 @@ def run_exp( net_module + "/enc192/200ep/dec_drop_0.0/epsilon_1e-8", train_args_TTS_no_dec_dropout, training_datasets_pe1_tts_segments, - asr_test_datasets, 200, forward_args=forward_args, tts_eval_datasets=tts_forward_datasets, @@ -676,7 +656,6 @@ def run_exp( net_module + "/enc768/200ep/dec_drop_0.0/grad_clip_10", train_args_TTS_no_dec_dropout, training_datasets_pe1_tts_segments, - asr_test_datasets, 200, forward_args=forward_args, tts_eval_datasets=tts_forward_datasets, @@ -687,7 +666,6 @@ def run_exp( net_module + "/enc192/200ep/dec_drop_0.0/grad_clip_10", train_args_TTS_no_dec_dropout, training_datasets_pe1_tts_segments, - asr_test_datasets, 200, forward_args=forward_args, tts_eval_datasets=tts_forward_datasets, @@ -729,7 +707,6 @@ def run_exp( net_module + "/12cb/200ep/dec_drop_0.05", train_args_TTS_simple_encoder, training_datasets_pe1_tts_segments, - asr_test_datasets, 200, forward_args=forward_args, swer_evaluation=True, @@ -742,7 +719,6 @@ def run_exp( net_module + "/20cb/200ep/dec_drop_0.05", train_args_TTS_simple_encoder, training_datasets_pe1_tts_segments, - asr_test_datasets, 200, forward_args=forward_args, swer_evaluation=True, @@ -788,7 +764,6 @@ def run_exp( 
net_module + "/enc768/200ep/dec_drop_0.05", train_args_TTS_xvector_200ep_conformer_coupling, training_datasets_pe1_tts_segments, - asr_test_datasets, 200, forward_args=forward_args, tts_eval_datasets=tts_forward_datasets_xvectors, @@ -820,7 +795,6 @@ def run_exp( net_module + "/enc768/200ep/dec_drop_0.05", train_args_TTS_xvector_200ep_multiscale, training_datasets_pe1_tts_segments, - asr_test_datasets, 200, forward_args=forward_args, tts_eval_datasets=tts_forward_datasets_xvectors, @@ -837,7 +811,6 @@ def run_exp( net_module + f"/ed_scale_{s}", train_args_xvector_dist_loss, training_datasets_pe1_tts_segments, - asr_test_datasets, 200, training_args={"ed_scale": s}, forward_args=forward_args, @@ -852,7 +825,6 @@ def run_exp( net_module + f"/ed_scale_{s}", train_args_xvector_dist_loss, training_datasets_pe1_tts_segments, - asr_test_datasets, 200, training_args={"ed_scale": s}, forward_args=forward_args, @@ -865,7 +837,6 @@ def run_exp( net_module + f"_grad_clip_10/ed_scale_{s}", train_args_xvector_dist_loss, training_datasets_pe1_tts_segments, - asr_test_datasets, 200, training_args={"ed_scale": s}, forward_args=forward_args, @@ -882,7 +853,6 @@ def run_exp( net_module + "/enc768/200ep/dec_drop_0.05", train_args_TTS_200ep_batch_norm, training_datasets_pe1_tts_segments, - asr_test_datasets, 200, forward_args=forward_args, swer_evaluation=True, @@ -923,7 +893,6 @@ def run_exp( net_module + "/enc768/400ep/gin512/dec_drop_0.05", train_args_400_gin512, training_datasets_pe1_tts_segments, - asr_test_datasets, 400, forward_args=forward_args, swer_evaluation=True, @@ -938,7 +907,6 @@ def run_exp( net_module + "/enc768/400ep/gin512/grad_clip_10/dec_drop_0.05", train_args_400_gin512_grad_norm, training_datasets_pe1_tts_segments, - asr_test_datasets, 400, forward_args=forward_args, swer_evaluation=True, @@ -959,7 +927,6 @@ def run_exp( net_module + "/enc768/400ep/dec_drop_0.05", train_args_400, training_datasets_pe1_tts_segments, - asr_test_datasets, 400, 
forward_args=forward_args, swer_evaluation=True, @@ -974,7 +941,6 @@ def run_exp( net_module + "/enc768/400ep/grad_clip_10/dec_drop_0.05", train_args_400_grad_norm, training_datasets_pe1_tts_segments, - asr_test_datasets, 400, forward_args=forward_args, swer_evaluation=True, @@ -996,7 +962,6 @@ def run_exp( net_module + "/enc192/400ep/grad_clip_10/dec_drop_0.05", train_args_400_grad_norm, training_datasets_pe1_tts_segments, - asr_test_datasets, 400, forward_args=forward_args, swer_evaluation=True, @@ -1023,7 +988,6 @@ def run_exp( net_module + "/enc768/400ep/gin512/dec_drop_0.05", train_args_400_xvector_gin512, training_datasets_pe1_tts_segments, - asr_test_datasets, 400, forward_args=forward_args, swer_evaluation=True, @@ -1044,13 +1008,13 @@ def run_exp( net_module + "/enc768/400ep/dec_drop_0.05", train_args_400_xvector, training_datasets_pe1_tts_segments, - asr_test_datasets, 400, forward_args=forward_args, swer_evaluation=True, nisqa_evaluation=True, tts_eval_datasets=tts_forward_datasets_xvectors, nisqa_confidence=True, + large_gpu=True ) add_tts_model( @@ -1067,7 +1031,6 @@ def run_exp( net_module + "/enc192/400ep/dec_drop_0.05", train_args_400_xvector, training_datasets_pe1_tts_segments, - asr_test_datasets, 400, forward_args=forward_args, swer_evaluation=True, diff --git a/users/rilling/experiments/librispeech/librispeech_joint_training_given_alignments/exp_tts/training_comparison.ipynb b/users/rilling/experiments/librispeech/librispeech_joint_training_given_alignments/exp_tts/training_comparison.ipynb index 72d769622..f611192ee 100644 --- a/users/rilling/experiments/librispeech/librispeech_joint_training_given_alignments/exp_tts/training_comparison.ipynb +++ b/users/rilling/experiments/librispeech/librispeech_joint_training_given_alignments/exp_tts/training_comparison.ipynb @@ -23,21 +23,36 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - 
"{'/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc192/200ep/dec_drop_0.0/grad_clip_10/training': '/enc192/200ep/dec_drop_0.0/grad_clip_10/',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc768/200ep/dec_drop_0.0/grad_clip_10/training': '/enc768/200ep/dec_drop_0.0/grad_clip_10/',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc192/200ep/dec_drop_0.05_epsilon_1e-8/training': '/enc192/200ep/dec_drop_0.05_epsilon_1e-8/',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc192/200ep/dec_drop_0.05/training': '/enc192/200ep/dec_drop_0.05/',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc768/200ep/dec_drop_0.05_epsilon_1e-8/training': '/enc768/200ep/dec_drop_0.05_epsilon_1e-8/',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc768/200ep/dec_drop_0.05/training': '/enc768/200ep/dec_drop_0.05/'}" + "{'/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc192/200ep/dec_drop_0.0/grad_clip_10/training': '/glowTTS/enc192/200ep/dec_drop_0.0/grad_clip_10/',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc768/200ep/dec_drop_0.0/grad_clip_10/training': '/glowTTS/enc768/200ep/dec_drop_0.0/grad_clip_10/',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc192/200ep/dec_drop_0.05_epsilon_1e-8/training': '/glowTTS/enc192/200ep/dec_drop_0.05_epsilon_1e-8/',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc192/200ep/dec_drop_0.05/training': '/glowTTS/enc192/200ep/dec_drop_0.05/',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc192/100ep/dec_drop_0.05/training': '/glowTTS/enc192/100ep/dec_drop_0.05/',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc768/100ep/dec_drop_0.00/training': '/glowTTS/enc768/100ep/dec_drop_0.00/',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc768/100ep/dec_drop_0.05/training': '/glowTTS/enc768/100ep/dec_drop_0.05/',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc768/200ep/dec_drop_0.05_epsilon_1e-8/training': '/glowTTS/enc768/200ep/dec_drop_0.05_epsilon_1e-8/',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc768/200ep/dec_drop_0.05/training': '/glowTTS/enc768/200ep/dec_drop_0.05/',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc768/400ep/dec_drop_0.05/training': '/glowTTS/enc768/400ep/dec_drop_0.05/',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc192/400ep/grad_clip_10/dec_drop_0.05/training': '/glowTTS/enc192/400ep/grad_clip_10/dec_drop_0.05/',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc768/400ep/grad_clip_10/dec_drop_0.05/training': '/glowTTS/enc768/400ep/grad_clip_10/dec_drop_0.05/',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2/enc768/400ep/dec_drop_0.05/training': '/glowTTS_x_vector_v2/enc768/400ep/dec_drop_0.05/',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2/enc768/200ep/dec_drop_0.0/training': '/glowTTS_x_vector_v2/enc768/200ep/dec_drop_0.0/',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2/enc768/200ep/dec_drop_0.05/training': '/glowTTS_x_vector_v2/enc768/200ep/dec_drop_0.05/',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2/enc768/100ep/dec_drop_0.05/training': '/glowTTS_x_vector_v2/enc768/100ep/dec_drop_0.05/',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2/enc768/200ep_long_cooldown/dec_drop_0.0/training': '/glowTTS_x_vector_v2/enc768/200ep_long_cooldown/dec_drop_0.0/',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2/enc768/200ep_long_cooldown/dec_drop_0.05/training': 
'/glowTTS_x_vector_v2/enc768/200ep_long_cooldown/dec_drop_0.05/',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2/enc192/200ep/dec_drop_0.0/training': '/glowTTS_x_vector_v2/enc192/200ep/dec_drop_0.0/',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2/enc192/200ep/dec_drop_0.05/training': '/glowTTS_x_vector_v2/enc192/200ep/dec_drop_0.05/',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2/enc192/400ep/dec_drop_0.05/training': '/glowTTS_x_vector_v2/enc192/400ep/dec_drop_0.05/'}" ] }, - "execution_count": 3, + "execution_count": 2, "metadata": {}, "output_type": "execute_result" } @@ -47,8 +62,10 @@ " # \"/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS*/enc768/400ep/dec_drop_0.05/training\",\n", " # \"/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc768/400ep/grad_clip_10/dec_drop_0.05/training\",\n", " # \"/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc192/200ep/dec_drop_0.0*/training\",\n", - " \"/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc*/200ep/dec_drop_0.0*/grad_clip_10/training\",\n", - " \"/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc*/200ep/dec_drop_0.0*/training\",\n", + " 
\"/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc*/*/dec_drop_0.0*/grad_clip_10/training\",\n", + " \"/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc*/*/dec_drop_0.0*/training\",\n", + " \"/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc*/*/grad_clip_10/dec_drop_0.0*/training\",\n", + " \"/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2/enc*/*/dec_drop_0.0*/training\",\n", "]\n", "breakpoint()\n", "lr_files = []\n", @@ -68,7 +85,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -78,6 +95,17 @@ "files[lr_files[-1]] = \"Baseline Glow-TTS 768\"" ] }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "pes = {\n", + " x: int(x.split(\"/enc\")[1][4:].split(\"00ep\")[0]) for x in lr_files\n", + "}" + ] + }, { "cell_type": "code", "execution_count": 5, @@ -87,13 +115,28 @@ "name": "stdout", "output_type": "stream", "text": [ - "/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc192/200ep/dec_drop_0.0/grad_clip_10/training: 1\n", - "/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc768/200ep/dec_drop_0.0/grad_clip_10/training: 1\n", - "/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc192/200ep/dec_drop_0.05_epsilon_1e-8/training: 1\n", - 
"/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc192/200ep/dec_drop_0.05/training: 1\n", - "/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc768/200ep/dec_drop_0.05_epsilon_1e-8/training: 1\n", - "/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc768/200ep/dec_drop_0.05/training: 1\n", - "/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc768/200ep/dec_drop_0.05/training/: 1\n", + "/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc192/200ep/dec_drop_0.0/grad_clip_10/training: 2\n", + "/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc768/200ep/dec_drop_0.0/grad_clip_10/training: 2\n", + "/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc192/200ep/dec_drop_0.05_epsilon_1e-8/training: 2\n", + "/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc192/200ep/dec_drop_0.05/training: 2\n", + "/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc192/100ep/dec_drop_0.05/training: 1\n", + "/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc768/100ep/dec_drop_0.00/training: 1\n", + 
"/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc768/100ep/dec_drop_0.05/training: 1\n", + "/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc768/200ep/dec_drop_0.05_epsilon_1e-8/training: 2\n", + "/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc768/200ep/dec_drop_0.05/training: 2\n", + "/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc768/400ep/dec_drop_0.05/training: 4\n", + "/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc192/400ep/grad_clip_10/dec_drop_0.05/training: 4\n", + "/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc768/400ep/grad_clip_10/dec_drop_0.05/training: 4\n", + "/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2/enc768/400ep/dec_drop_0.05/training: 4\n", + "/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2/enc768/200ep/dec_drop_0.0/training: 2\n", + "/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2/enc768/200ep/dec_drop_0.05/training: 2\n", + "/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2/enc768/100ep/dec_drop_0.05/training: 1\n", + 
"/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2/enc768/200ep_long_cooldown/dec_drop_0.0/training: 2\n", + "/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2/enc768/200ep_long_cooldown/dec_drop_0.05/training: 2\n", + "/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2/enc192/200ep/dec_drop_0.0/training: 2\n", + "/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2/enc192/200ep/dec_drop_0.05/training: 2\n", + "/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2/enc192/400ep/dec_drop_0.05/training: 4\n", + "/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc768/200ep/dec_drop_0.05/training/: 2\n", "Large Font: False\n", "Setup Interactive Legend\n", "Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous view', 'arrow-left', 'back'), ('Forward', 'Forward to next view', 'arrow-right', 'forward'), ('Pan', 'Left button pans, Right button zooms\\nx/y fixes axis, CTRL fixes aspect', 'arrows', 'pan'), ('Zoom', 'Zoom to rectangle\\nx/y fixes axis', 'square-o', 'zoom'), ('Download', 'Download plot', 'floppy-o', 'save_figure')]))\n" @@ -102,18 +145,18 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "2084e635587446d2aab96b81f3fa6006", + "model_id": "8a172637e75a4d7c9eb3ccf1e74c6551", "version_major": 2, "version_minor": 0 }, - "image/png": "", + "image/png": "", "text/html": [ "\n", "
\n", "
\n", " Figure\n", "
\n", - " \n", + " \n", "
\n", " " ], @@ -132,7 +175,7 @@ " data = get_epoch_data(lr_file, epoch=None)\n", " if data is None:\n", " continue\n", - " p_e = 1\n", + " p_e = pes[lr_file]\n", " print(f\"{lr_file}: {p_e}\")\n", " error_data_tmp = []\n", " for ep, ep_data in data.items():\n", @@ -152,7 +195,7 @@ " shrink_axes=0.6,\n", " ylim_max=2,\n", " ylim_min=-0.9,\n", - " keys_exclude=\"devtrain|ctc|100ep|ed\",\n", + " keys_exclude=\"devtrain|ctc|ed\",\n", " # color_map=\"Set1\",\n", " draggable=True\n", ")" @@ -160,7 +203,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ @@ -169,7 +212,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 7, "metadata": {}, "outputs": [ { @@ -197,34 +240,32 @@ " \n", " \n", " \n", - " dev_loss_dp\n", - " dev_loss_mle\n", + " dev_loss_dp\n", " ...\n", - " dp\n", - " mle\n", + " mle\n", " \n", " \n", " level_0\n", - " /enc192/200ep/dec_drop_0.0/grad_clip_10/\n", - " /enc192/200ep/dec_drop_0.05/\n", - " /enc192/200ep/dec_drop_0.05_epsilon_1e-8/\n", - " /enc768/200ep/dec_drop_0.0/grad_clip_10/\n", - " /enc768/200ep/dec_drop_0.05/\n", - " /enc768/200ep/dec_drop_0.05_epsilon_1e-8/\n", - " Baseline Glow-TTS 768\n", - " /enc192/200ep/dec_drop_0.0/grad_clip_10/\n", - " /enc192/200ep/dec_drop_0.05/\n", - " /enc192/200ep/dec_drop_0.05_epsilon_1e-8/\n", + " /glowTTS/enc192/100ep/dec_drop_0.05/\n", + " /glowTTS/enc192/200ep/dec_drop_0.0/grad_clip_10/\n", + " /glowTTS/enc192/200ep/dec_drop_0.05/\n", + " /glowTTS/enc192/200ep/dec_drop_0.05_epsilon_1e-8/\n", + " /glowTTS/enc192/400ep/grad_clip_10/dec_drop_0.05/\n", + " /glowTTS/enc768/100ep/dec_drop_0.00/\n", + " /glowTTS/enc768/100ep/dec_drop_0.05/\n", + " /glowTTS/enc768/200ep/dec_drop_0.0/grad_clip_10/\n", + " /glowTTS/enc768/200ep/dec_drop_0.05/\n", + " /glowTTS/enc768/200ep/dec_drop_0.05_epsilon_1e-8/\n", " ...\n", - " /enc768/200ep/dec_drop_0.05/\n", - " /enc768/200ep/dec_drop_0.05_epsilon_1e-8/\n", - " Baseline Glow-TTS 
768\n", - " /enc192/200ep/dec_drop_0.0/grad_clip_10/\n", - " /enc192/200ep/dec_drop_0.05/\n", - " /enc192/200ep/dec_drop_0.05_epsilon_1e-8/\n", - " /enc768/200ep/dec_drop_0.0/grad_clip_10/\n", - " /enc768/200ep/dec_drop_0.05/\n", - " /enc768/200ep/dec_drop_0.05_epsilon_1e-8/\n", + " /glowTTS_x_vector_v2/enc192/200ep/dec_drop_0.0/\n", + " /glowTTS_x_vector_v2/enc192/200ep/dec_drop_0.05/\n", + " /glowTTS_x_vector_v2/enc192/400ep/dec_drop_0.05/\n", + " /glowTTS_x_vector_v2/enc768/100ep/dec_drop_0.05/\n", + " /glowTTS_x_vector_v2/enc768/200ep/dec_drop_0.0/\n", + " /glowTTS_x_vector_v2/enc768/200ep/dec_drop_0.05/\n", + " /glowTTS_x_vector_v2/enc768/200ep_long_cooldown/dec_drop_0.0/\n", + " /glowTTS_x_vector_v2/enc768/200ep_long_cooldown/dec_drop_0.05/\n", + " /glowTTS_x_vector_v2/enc768/400ep/dec_drop_0.05/\n", " Baseline Glow-TTS 768\n", " \n", " \n", @@ -255,123 +296,123 @@ " \n", " \n", " 1\n", - " 1.238803\n", - " 1.230068\n", - " 1.229908\n", - " 1.076835\n", - " 1.238104\n", - " 1.235275\n", - " 1.238104\n", - " 0.046450\n", - " 0.073396\n", - " 0.073425\n", + " 1.230100\n", + " 1.221815\n", + " 1.235547\n", + " 1.237038\n", + " 1.256802\n", + " 1.076987\n", + " 1.237908\n", + " 1.094651\n", + " 1.295716\n", + " 1.296690\n", " ...\n", - " 1.145111\n", - " 1.145046\n", - " 1.145111\n", - " 0.537252\n", - " 0.568119\n", - " 0.569599\n", - " 0.502620\n", - " 0.530890\n", - " 0.532545\n", - " 0.530890\n", + " 0.287960\n", + " 0.290018\n", + " 0.029518\n", + " 0.541159\n", + " 0.266416\n", + " 0.267837\n", + " 0.798228\n", + " 0.798930\n", + " 0.004251\n", + " 0.232372\n", " \n", " \n", " 2\n", - " 1.204827\n", - " 1.241025\n", - " 1.244168\n", - " 1.112467\n", - " 1.353328\n", - " 1.358104\n", - " 1.353328\n", - " -0.219648\n", - " -0.152898\n", - " -0.153953\n", + " 1.246446\n", + " 1.173005\n", + " 1.247849\n", + " 1.251819\n", + " 1.329956\n", + " 1.101618\n", + " 1.429635\n", + " 1.071850\n", + " 1.645345\n", + " 1.645638\n", " ...\n", - " 1.166236\n", - " 
1.166261\n", - " 1.166236\n", - " -0.101299\n", - " -0.082530\n", - " -0.082966\n", - " -0.083080\n", - " -0.066146\n", - " -0.066379\n", - " -0.066146\n", + " -0.235335\n", + " -0.231702\n", + " -0.455761\n", + " -0.029426\n", + " -0.264689\n", + " -0.259478\n", + " -0.206659\n", + " -0.201726\n", + " -0.479359\n", + " -0.332803\n", " \n", " \n", " 3\n", - " 1.180695\n", - " 1.246691\n", - " 1.245888\n", - " 1.089663\n", - " 1.722623\n", - " 1.748128\n", - " 1.722623\n", - " -0.341818\n", - " -0.295465\n", - " -0.295729\n", + " 1.255786\n", + " 1.157437\n", + " 1.267252\n", + " 1.275061\n", + " 1.074281\n", + " 1.093754\n", + " 1.723272\n", + " 1.056306\n", + " 1.434315\n", + " 1.437929\n", " ...\n", - " 1.148909\n", - " 1.149577\n", - " 1.148909\n", - " -0.294475\n", - " -0.281159\n", - " -0.281597\n", - " -0.293310\n", - " -0.281814\n", - " -0.282104\n", - " -0.281814\n", + " -0.421064\n", + " -0.418663\n", + " -0.553655\n", + " -0.247951\n", + " -0.447814\n", + " -0.443378\n", + " -0.453987\n", + " -0.448675\n", + " -0.573150\n", + " -0.450035\n", " \n", " \n", " 4\n", - " 1.165315\n", - " 1.249006\n", - " 1.257749\n", - " 1.054037\n", - " 1.568067\n", - " 1.543148\n", - " 1.568067\n", - " -0.392180\n", - " -0.361857\n", - " -0.362248\n", + " 1.246023\n", + " 1.232874\n", + " 1.308826\n", + " 1.298183\n", + " 1.002807\n", + " 1.051502\n", + " 1.547274\n", + " 1.042766\n", + " 1.238428\n", + " 1.253040\n", " ...\n", - " 1.126033\n", - " 1.126183\n", - " 1.126033\n", - " -0.371938\n", - " -0.375815\n", - " -0.375824\n", - " -0.381286\n", - " -0.383792\n", - " -0.383782\n", - " -0.383792\n", + " -0.494228\n", + " -0.492276\n", + " -0.606141\n", + " -0.392323\n", + " -0.518874\n", + " -0.515698\n", + " -0.540485\n", + " -0.536202\n", + " -0.618904\n", + " -0.507752\n", " \n", " \n", " 5\n", - " 1.159592\n", - " 1.247790\n", - " 1.248264\n", - " 1.055813\n", - " 1.463525\n", - " 1.484941\n", - " 1.463525\n", - " -0.419574\n", - " -0.413436\n", - " -0.414346\n", + " 
1.236534\n", + " 1.229356\n", + " 1.346720\n", + " 1.328731\n", + " 1.011019\n", + " 1.074268\n", + " 1.308311\n", + " 1.062859\n", + " 1.124593\n", + " 1.129739\n", " ...\n", - " 1.104397\n", - " 1.104511\n", - " 1.104397\n", - " -0.410438\n", - " -0.422834\n", - " -0.422658\n", - " -0.422354\n", - " -0.433206\n", - " -0.433054\n", - " -0.433206\n", + " -0.533924\n", + " -0.536283\n", + " -0.640664\n", + " -0.467505\n", + " -0.559829\n", + " -0.558296\n", + " -0.586900\n", + " -0.583516\n", + " -0.647800\n", + " -0.551041\n", " \n", " \n", " ...\n", @@ -398,315 +439,420 @@ " ...\n", " \n", " \n", - " 196\n", - " 0.430796\n", - " 0.678780\n", - " 0.824717\n", - " 0.445126\n", - " 0.442368\n", - " 0.438285\n", - " 0.442368\n", - " -0.837367\n", - " -0.813777\n", - " -0.816068\n", + " 96\n", + " 0.795653\n", + " 0.431044\n", + " 0.795220\n", + " 0.750872\n", + " NaN\n", + " 0.438743\n", + " 0.433290\n", + " 0.445215\n", + " 0.442588\n", + " 0.438332\n", " ...\n", - " 0.388870\n", - " 0.387081\n", - " 0.388870\n", - " -0.850404\n", - " -0.808053\n", - " -0.806879\n", - " -0.848083\n", - " -0.811209\n", - " -0.810240\n", - " -0.811209\n", + " -0.836890\n", + " -0.808481\n", + " NaN\n", + " -0.792415\n", + " -0.808796\n", + " -0.806530\n", + " -0.842475\n", + " -0.805711\n", + " NaN\n", + " -0.809823\n", " \n", " \n", - " 197\n", - " 0.429870\n", - " 0.745400\n", - " 0.824725\n", - " 0.444218\n", - " 0.440721\n", - " 0.437338\n", - " 0.440721\n", - " -0.837755\n", - " -0.810379\n", - " -0.815329\n", + " 97\n", + " 0.805976\n", + " 0.430928\n", + " 0.700293\n", + " 0.790482\n", + " NaN\n", + " 0.438660\n", + " 0.434222\n", + " 0.446601\n", + " 0.442310\n", + " 0.439489\n", " ...\n", - " 0.388216\n", - " 0.386526\n", - " 0.388216\n", - " -0.850987\n", - " -0.808369\n", - " -0.807225\n", - " -0.848792\n", - " -0.811539\n", - " -0.810735\n", - " -0.811539\n", + " -0.837922\n", + " -0.809084\n", + " NaN\n", + " -0.793009\n", + " -0.809899\n", + " -0.807328\n", + " 
-0.843173\n", + " -0.806196\n", + " NaN\n", + " -0.810414\n", " \n", " \n", - " 198\n", - " 0.432065\n", - " 0.682156\n", - " 0.812607\n", - " 0.445028\n", - " 0.441705\n", - " 0.437170\n", - " 0.441705\n", - " -0.830906\n", - " -0.814995\n", - " -0.817098\n", + " 98\n", + " 0.772572\n", + " 0.430030\n", + " 0.646661\n", + " 0.820845\n", + " NaN\n", + " 0.438703\n", + " 0.433427\n", + " 0.445311\n", + " 0.442427\n", + " 0.437876\n", " ...\n", - " 0.388245\n", - " 0.386545\n", - " 0.388245\n", - " -0.851446\n", - " -0.808689\n", - " -0.807542\n", - " -0.849257\n", - " -0.810812\n", - " -0.811058\n", - " -0.810812\n", + " -0.838980\n", + " -0.809520\n", + " NaN\n", + " -0.793083\n", + " -0.811023\n", + " -0.808048\n", + " -0.844026\n", + " -0.806705\n", + " NaN\n", + " -0.811041\n", " \n", " \n", - " 199\n", - " 0.430398\n", - " 0.600570\n", - " 0.808616\n", - " 0.446232\n", - " 0.442230\n", - " 0.437704\n", - " 0.442230\n", - " -0.835626\n", - " -0.814634\n", - " -0.816613\n", + " 99\n", + " 0.793428\n", + " 0.430968\n", + " 0.713778\n", + " 0.818666\n", + " NaN\n", + " 0.439357\n", + " 0.433838\n", + " 0.444623\n", + " 0.441213\n", + " 0.437254\n", " ...\n", - " 0.387994\n", - " 0.386290\n", - " 0.387994\n", - " -0.852212\n", - " -0.809059\n", - " -0.807872\n", - " -0.849940\n", - " -0.811891\n", - " -0.811443\n", - " -0.811891\n", + " -0.840099\n", + " -0.810355\n", + " NaN\n", + " -0.793955\n", + " -0.812145\n", + " -0.808773\n", + " -0.845038\n", + " -0.807293\n", + " NaN\n", + " -0.811176\n", " \n", " \n", - " 200\n", - " 0.431975\n", - " 0.615026\n", - " 0.814173\n", - " 0.445801\n", - " 0.441218\n", - " 0.438248\n", - " 0.441218\n", - " -0.773976\n", - " -0.815064\n", - " -0.817505\n", + " 100\n", + " 0.777391\n", + " 0.431187\n", + " 0.607798\n", + " 0.811394\n", + " NaN\n", + " 0.437995\n", + " 0.432689\n", + " 0.446017\n", + " 0.441724\n", + " 0.437976\n", " ...\n", - " 0.387377\n", - " 0.386067\n", - " 0.387377\n", - " -0.852834\n", - " -0.809477\n", - " 
-0.808277\n", - " -0.850649\n", - " -0.812429\n", - " -0.811807\n", - " -0.812429\n", + " -0.841319\n", + " -0.811221\n", + " NaN\n", + " -0.795013\n", + " -0.813296\n", + " -0.809496\n", + " -0.846297\n", + " -0.807831\n", + " NaN\n", + " -0.812160\n", " \n", " \n", "\n", - "

200 rows × 28 columns

\n", + "

100 rows × 88 columns

\n", "" ], "text/plain": [ - " dev_loss_dp \\\n", - "level_0 /enc192/200ep/dec_drop_0.0/grad_clip_10/ /enc192/200ep/dec_drop_0.05/ \n", - "level_1 \n", - "1 1.238803 1.230068 \n", - "2 1.204827 1.241025 \n", - "3 1.180695 1.246691 \n", - "4 1.165315 1.249006 \n", - "5 1.159592 1.247790 \n", - "... ... ... \n", - "196 0.430796 0.678780 \n", - "197 0.429870 0.745400 \n", - "198 0.432065 0.682156 \n", - "199 0.430398 0.600570 \n", - "200 0.431975 0.615026 \n", + " dev_loss_dp \\\n", + "level_0 /glowTTS/enc192/100ep/dec_drop_0.05/ \n", + "level_1 \n", + "1 1.230100 \n", + "2 1.246446 \n", + "3 1.255786 \n", + "4 1.246023 \n", + "5 1.236534 \n", + "... ... \n", + "96 0.795653 \n", + "97 0.805976 \n", + "98 0.772572 \n", + "99 0.793428 \n", + "100 0.777391 \n", "\n", - " \\\n", - "level_0 /enc192/200ep/dec_drop_0.05_epsilon_1e-8/ \n", - "level_1 \n", - "1 1.229908 \n", - "2 1.244168 \n", - "3 1.245888 \n", - "4 1.257749 \n", - "5 1.248264 \n", - "... ... \n", - "196 0.824717 \n", - "197 0.824725 \n", - "198 0.812607 \n", - "199 0.808616 \n", - "200 0.814173 \n", + " \\\n", + "level_0 /glowTTS/enc192/200ep/dec_drop_0.0/grad_clip_10/ \n", + "level_1 \n", + "1 1.221815 \n", + "2 1.173005 \n", + "3 1.157437 \n", + "4 1.232874 \n", + "5 1.229356 \n", + "... ... \n", + "96 0.431044 \n", + "97 0.430928 \n", + "98 0.430030 \n", + "99 0.430968 \n", + "100 0.431187 \n", "\n", - " \\\n", - "level_0 /enc768/200ep/dec_drop_0.0/grad_clip_10/ /enc768/200ep/dec_drop_0.05/ \n", - "level_1 \n", - "1 1.076835 1.238104 \n", - "2 1.112467 1.353328 \n", - "3 1.089663 1.722623 \n", - "4 1.054037 1.568067 \n", - "5 1.055813 1.463525 \n", - "... ... ... \n", - "196 0.445126 0.442368 \n", - "197 0.444218 0.440721 \n", - "198 0.445028 0.441705 \n", - "199 0.446232 0.442230 \n", - "200 0.445801 0.441218 \n", + " \\\n", + "level_0 /glowTTS/enc192/200ep/dec_drop_0.05/ \n", + "level_1 \n", + "1 1.235547 \n", + "2 1.247849 \n", + "3 1.267252 \n", + "4 1.308826 \n", + "5 1.346720 \n", + "... ... 
\n", + "96 0.795220 \n", + "97 0.700293 \n", + "98 0.646661 \n", + "99 0.713778 \n", + "100 0.607798 \n", "\n", - " \\\n", - "level_0 /enc768/200ep/dec_drop_0.05_epsilon_1e-8/ Baseline Glow-TTS 768 \n", - "level_1 \n", - "1 1.235275 1.238104 \n", - "2 1.358104 1.353328 \n", - "3 1.748128 1.722623 \n", - "4 1.543148 1.568067 \n", - "5 1.484941 1.463525 \n", - "... ... ... \n", - "196 0.438285 0.442368 \n", - "197 0.437338 0.440721 \n", - "198 0.437170 0.441705 \n", - "199 0.437704 0.442230 \n", - "200 0.438248 0.441218 \n", + " \\\n", + "level_0 /glowTTS/enc192/200ep/dec_drop_0.05_epsilon_1e-8/ \n", + "level_1 \n", + "1 1.237038 \n", + "2 1.251819 \n", + "3 1.275061 \n", + "4 1.298183 \n", + "5 1.328731 \n", + "... ... \n", + "96 0.750872 \n", + "97 0.790482 \n", + "98 0.820845 \n", + "99 0.818666 \n", + "100 0.811394 \n", "\n", - " dev_loss_mle \\\n", - "level_0 /enc192/200ep/dec_drop_0.0/grad_clip_10/ /enc192/200ep/dec_drop_0.05/ \n", - "level_1 \n", - "1 0.046450 0.073396 \n", - "2 -0.219648 -0.152898 \n", - "3 -0.341818 -0.295465 \n", - "4 -0.392180 -0.361857 \n", - "5 -0.419574 -0.413436 \n", - "... ... ... \n", - "196 -0.837367 -0.813777 \n", - "197 -0.837755 -0.810379 \n", - "198 -0.830906 -0.814995 \n", - "199 -0.835626 -0.814634 \n", - "200 -0.773976 -0.815064 \n", + " \\\n", + "level_0 /glowTTS/enc192/400ep/grad_clip_10/dec_drop_0.05/ \n", + "level_1 \n", + "1 1.256802 \n", + "2 1.329956 \n", + "3 1.074281 \n", + "4 1.002807 \n", + "5 1.011019 \n", + "... ... \n", + "96 NaN \n", + "97 NaN \n", + "98 NaN \n", + "99 NaN \n", + "100 NaN \n", "\n", - " ... \\\n", - "level_0 /enc192/200ep/dec_drop_0.05_epsilon_1e-8/ ... \n", - "level_1 ... \n", - "1 0.073425 ... \n", - "2 -0.153953 ... \n", - "3 -0.295729 ... \n", - "4 -0.362248 ... \n", - "5 -0.414346 ... \n", - "... ... ... \n", - "196 -0.816068 ... \n", - "197 -0.815329 ... \n", - "198 -0.817098 ... \n", - "199 -0.816613 ... \n", - "200 -0.817505 ... 
\n", + " \\\n", + "level_0 /glowTTS/enc768/100ep/dec_drop_0.00/ \n", + "level_1 \n", + "1 1.076987 \n", + "2 1.101618 \n", + "3 1.093754 \n", + "4 1.051502 \n", + "5 1.074268 \n", + "... ... \n", + "96 0.438743 \n", + "97 0.438660 \n", + "98 0.438703 \n", + "99 0.439357 \n", + "100 0.437995 \n", "\n", - " dp \\\n", - "level_0 /enc768/200ep/dec_drop_0.05/ \n", - "level_1 \n", - "1 1.145111 \n", - "2 1.166236 \n", - "3 1.148909 \n", - "4 1.126033 \n", - "5 1.104397 \n", - "... ... \n", - "196 0.388870 \n", - "197 0.388216 \n", - "198 0.388245 \n", - "199 0.387994 \n", - "200 0.387377 \n", + " \\\n", + "level_0 /glowTTS/enc768/100ep/dec_drop_0.05/ \n", + "level_1 \n", + "1 1.237908 \n", + "2 1.429635 \n", + "3 1.723272 \n", + "4 1.547274 \n", + "5 1.308311 \n", + "... ... \n", + "96 0.433290 \n", + "97 0.434222 \n", + "98 0.433427 \n", + "99 0.433838 \n", + "100 0.432689 \n", "\n", - " \\\n", - "level_0 /enc768/200ep/dec_drop_0.05_epsilon_1e-8/ Baseline Glow-TTS 768 \n", - "level_1 \n", - "1 1.145046 1.145111 \n", - "2 1.166261 1.166236 \n", - "3 1.149577 1.148909 \n", - "4 1.126183 1.126033 \n", - "5 1.104511 1.104397 \n", - "... ... ... \n", - "196 0.387081 0.388870 \n", - "197 0.386526 0.388216 \n", - "198 0.386545 0.388245 \n", - "199 0.386290 0.387994 \n", - "200 0.386067 0.387377 \n", + " \\\n", + "level_0 /glowTTS/enc768/200ep/dec_drop_0.0/grad_clip_10/ \n", + "level_1 \n", + "1 1.094651 \n", + "2 1.071850 \n", + "3 1.056306 \n", + "4 1.042766 \n", + "5 1.062859 \n", + "... ... \n", + "96 0.445215 \n", + "97 0.446601 \n", + "98 0.445311 \n", + "99 0.444623 \n", + "100 0.446017 \n", "\n", - " mle \\\n", - "level_0 /enc192/200ep/dec_drop_0.0/grad_clip_10/ /enc192/200ep/dec_drop_0.05/ \n", - "level_1 \n", - "1 0.537252 0.568119 \n", - "2 -0.101299 -0.082530 \n", - "3 -0.294475 -0.281159 \n", - "4 -0.371938 -0.375815 \n", - "5 -0.410438 -0.422834 \n", - "... ... ... 
\n", - "196 -0.850404 -0.808053 \n", - "197 -0.850987 -0.808369 \n", - "198 -0.851446 -0.808689 \n", - "199 -0.852212 -0.809059 \n", - "200 -0.852834 -0.809477 \n", + " \\\n", + "level_0 /glowTTS/enc768/200ep/dec_drop_0.05/ \n", + "level_1 \n", + "1 1.295716 \n", + "2 1.645345 \n", + "3 1.434315 \n", + "4 1.238428 \n", + "5 1.124593 \n", + "... ... \n", + "96 0.442588 \n", + "97 0.442310 \n", + "98 0.442427 \n", + "99 0.441213 \n", + "100 0.441724 \n", "\n", - " \\\n", - "level_0 /enc192/200ep/dec_drop_0.05_epsilon_1e-8/ \n", - "level_1 \n", - "1 0.569599 \n", - "2 -0.082966 \n", - "3 -0.281597 \n", - "4 -0.375824 \n", - "5 -0.422658 \n", - "... ... \n", - "196 -0.806879 \n", - "197 -0.807225 \n", - "198 -0.807542 \n", - "199 -0.807872 \n", - "200 -0.808277 \n", + " ... \\\n", + "level_0 /glowTTS/enc768/200ep/dec_drop_0.05_epsilon_1e-8/ ... \n", + "level_1 ... \n", + "1 1.296690 ... \n", + "2 1.645638 ... \n", + "3 1.437929 ... \n", + "4 1.253040 ... \n", + "5 1.129739 ... \n", + "... ... ... \n", + "96 0.438332 ... \n", + "97 0.439489 ... \n", + "98 0.437876 ... \n", + "99 0.437254 ... \n", + "100 0.437976 ... \n", "\n", - " \\\n", - "level_0 /enc768/200ep/dec_drop_0.0/grad_clip_10/ /enc768/200ep/dec_drop_0.05/ \n", - "level_1 \n", - "1 0.502620 0.530890 \n", - "2 -0.083080 -0.066146 \n", - "3 -0.293310 -0.281814 \n", - "4 -0.381286 -0.383792 \n", - "5 -0.422354 -0.433206 \n", - "... ... ... \n", - "196 -0.848083 -0.811209 \n", - "197 -0.848792 -0.811539 \n", - "198 -0.849257 -0.810812 \n", - "199 -0.849940 -0.811891 \n", - "200 -0.850649 -0.812429 \n", + " mle \\\n", + "level_0 /glowTTS_x_vector_v2/enc192/200ep/dec_drop_0.0/ \n", + "level_1 \n", + "1 0.287960 \n", + "2 -0.235335 \n", + "3 -0.421064 \n", + "4 -0.494228 \n", + "5 -0.533924 \n", + "... ... 
\n", + "96 -0.836890 \n", + "97 -0.837922 \n", + "98 -0.838980 \n", + "99 -0.840099 \n", + "100 -0.841319 \n", + "\n", + " \\\n", + "level_0 /glowTTS_x_vector_v2/enc192/200ep/dec_drop_0.05/ \n", + "level_1 \n", + "1 0.290018 \n", + "2 -0.231702 \n", + "3 -0.418663 \n", + "4 -0.492276 \n", + "5 -0.536283 \n", + "... ... \n", + "96 -0.808481 \n", + "97 -0.809084 \n", + "98 -0.809520 \n", + "99 -0.810355 \n", + "100 -0.811221 \n", + "\n", + " \\\n", + "level_0 /glowTTS_x_vector_v2/enc192/400ep/dec_drop_0.05/ \n", + "level_1 \n", + "1 0.029518 \n", + "2 -0.455761 \n", + "3 -0.553655 \n", + "4 -0.606141 \n", + "5 -0.640664 \n", + "... ... \n", + "96 NaN \n", + "97 NaN \n", + "98 NaN \n", + "99 NaN \n", + "100 NaN \n", + "\n", + " \\\n", + "level_0 /glowTTS_x_vector_v2/enc768/100ep/dec_drop_0.05/ \n", + "level_1 \n", + "1 0.541159 \n", + "2 -0.029426 \n", + "3 -0.247951 \n", + "4 -0.392323 \n", + "5 -0.467505 \n", + "... ... \n", + "96 -0.792415 \n", + "97 -0.793009 \n", + "98 -0.793083 \n", + "99 -0.793955 \n", + "100 -0.795013 \n", "\n", - " \n", - "level_0 /enc768/200ep/dec_drop_0.05_epsilon_1e-8/ Baseline Glow-TTS 768 \n", + " \\\n", + "level_0 /glowTTS_x_vector_v2/enc768/200ep/dec_drop_0.0/ \n", + "level_1 \n", + "1 0.266416 \n", + "2 -0.264689 \n", + "3 -0.447814 \n", + "4 -0.518874 \n", + "5 -0.559829 \n", + "... ... \n", + "96 -0.808796 \n", + "97 -0.809899 \n", + "98 -0.811023 \n", + "99 -0.812145 \n", + "100 -0.813296 \n", + "\n", + " \\\n", + "level_0 /glowTTS_x_vector_v2/enc768/200ep/dec_drop_0.05/ \n", + "level_1 \n", + "1 0.267837 \n", + "2 -0.259478 \n", + "3 -0.443378 \n", + "4 -0.515698 \n", + "5 -0.558296 \n", + "... ... \n", + "96 -0.806530 \n", + "97 -0.807328 \n", + "98 -0.808048 \n", + "99 -0.808773 \n", + "100 -0.809496 \n", + "\n", + " \\\n", + "level_0 /glowTTS_x_vector_v2/enc768/200ep_long_cooldown/dec_drop_0.0/ \n", + "level_1 \n", + "1 0.798228 \n", + "2 -0.206659 \n", + "3 -0.453987 \n", + "4 -0.540485 \n", + "5 -0.586900 \n", + "... ... 
\n", + "96 -0.842475 \n", + "97 -0.843173 \n", + "98 -0.844026 \n", + "99 -0.845038 \n", + "100 -0.846297 \n", + "\n", + " \\\n", + "level_0 /glowTTS_x_vector_v2/enc768/200ep_long_cooldown/dec_drop_0.05/ \n", "level_1 \n", - "1 0.532545 0.530890 \n", - "2 -0.066379 -0.066146 \n", - "3 -0.282104 -0.281814 \n", - "4 -0.383782 -0.383792 \n", - "5 -0.433054 -0.433206 \n", - "... ... ... \n", - "196 -0.810240 -0.811209 \n", - "197 -0.810735 -0.811539 \n", - "198 -0.811058 -0.810812 \n", - "199 -0.811443 -0.811891 \n", - "200 -0.811807 -0.812429 \n", + "1 0.798930 \n", + "2 -0.201726 \n", + "3 -0.448675 \n", + "4 -0.536202 \n", + "5 -0.583516 \n", + "... ... \n", + "96 -0.805711 \n", + "97 -0.806196 \n", + "98 -0.806705 \n", + "99 -0.807293 \n", + "100 -0.807831 \n", "\n", - "[200 rows x 28 columns]" + " \n", + "level_0 /glowTTS_x_vector_v2/enc768/400ep/dec_drop_0.05/ Baseline Glow-TTS 768 \n", + "level_1 \n", + "1 0.004251 0.232372 \n", + "2 -0.479359 -0.332803 \n", + "3 -0.573150 -0.450035 \n", + "4 -0.618904 -0.507752 \n", + "5 -0.647800 -0.551041 \n", + "... ... ... 
\n", + "96 NaN -0.809823 \n", + "97 NaN -0.810414 \n", + "98 NaN -0.811041 \n", + "99 NaN -0.811176 \n", + "100 NaN -0.812160 \n", + "\n", + "[100 rows x 88 columns]" ] }, - "execution_count": 14, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -717,7 +863,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ @@ -755,7 +901,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ diff --git a/users/rilling/experiments/librispeech/librispeech_joint_training_given_alignments/pipeline.py b/users/rilling/experiments/librispeech/librispeech_joint_training_given_alignments/pipeline.py index ed5402169..a2e532e79 100644 --- a/users/rilling/experiments/librispeech/librispeech_joint_training_given_alignments/pipeline.py +++ b/users/rilling/experiments/librispeech/librispeech_joint_training_given_alignments/pipeline.py @@ -14,7 +14,7 @@ from .default_tools import SCTK_BINARY_PATH, NISQA_REPO -def training(config, returnn_exe, returnn_root, prefix, num_epochs=65): +def training(config, returnn_exe, returnn_root, prefix, num_epochs=65, large_gpu=False): train_job = ReturnnTrainingJob( config, log_verbosity=5, @@ -25,6 +25,10 @@ def training(config, returnn_exe, returnn_root, prefix, num_epochs=65): returnn_python_exe=returnn_exe, returnn_root=returnn_root, ) + + if large_gpu: + train_job.rqmt["gpu_mem"] = 24 + train_job.add_alias(prefix + "/training") tk.register_output(prefix + "/training.models", train_job.out_model_dir) @@ -240,3 +244,36 @@ def compute_phoneme_pred_accuracy( tk.register_output(prefix_name + f"/{target}_pred/{key}/accuracy", mean_accuracy_job.out_accuracy) jobs.append(search_job) return jobs + +@tk.block() +def compute_prior( + prefix_name, + returnn_config, + checkpoint, + returnn_exe, + returnn_root, + mem_rqmt=8, +): + """ + Run search for a specific test dataset + + :param str prefix_name: + 
:param ReturnnConfig returnn_config: + :param Checkpoint checkpoint: + :param Path returnn_exe: + :param Path returnn_root: + """ + search_job = ReturnnForwardJobV2( + model_checkpoint=checkpoint, + returnn_config=returnn_config, + log_verbosity=5, + mem_rqmt=mem_rqmt, + time_rqmt=1, + device="gpu", + cpu_rqmt=4, + returnn_python_exe=returnn_exe, + returnn_root=returnn_root, + output_files=["prior.txt"], + ) + search_job.add_alias(prefix_name + "/prior_job") + return search_job.out_files["prior.txt"] diff --git a/users/rilling/experiments/librispeech/librispeech_joint_training_given_alignments/pytorch_networks/frozen_glowtts/glowASR_conformer_x_vector.py b/users/rilling/experiments/librispeech/librispeech_joint_training_given_alignments/pytorch_networks/frozen_glowtts/glowASR_conformer_x_vector.py index 0d34a2d53..ae4660b7a 100644 --- a/users/rilling/experiments/librispeech/librispeech_joint_training_given_alignments/pytorch_networks/frozen_glowtts/glowASR_conformer_x_vector.py +++ b/users/rilling/experiments/librispeech/librispeech_joint_training_given_alignments/pytorch_networks/frozen_glowtts/glowASR_conformer_x_vector.py @@ -380,7 +380,7 @@ def train_step(*, model: nn.Module, data, run_ctx, **kwargs): phon_labels, input_lengths=audio_features_len, target_lengths=phon_labels_len, - blank=model.cfg.label_target_size, + blank=model.conf_cfg.label_target_size, reduction="sum", zero_infinity=True ) @@ -388,7 +388,7 @@ def train_step(*, model: nn.Module, data, run_ctx, **kwargs): run_ctx.mark_as_loss(name="ctc", loss=ctc_loss, inv_norm_factor=num_phonemes) -def forward_init_hook(run_ctx, **kwargs): +def search_init_hook(run_ctx, **kwargs): # we are storing durations, but call it output.hdf to match # the default output of the ReturnnForwardJob from torchaudio.models.decoder import ctc_decoder @@ -403,44 +403,26 @@ def forward_init_hook(run_ctx, **kwargs): vocab = Vocabulary.create_vocab( vocab_file=kwargs["returnn_vocab"], unknown_label=None) labels = vocab.labels - 
print(f"labels from vocab:{labels}") - if "asr_data" in kwargs.keys() and kwargs["asr_data"]: - print(f"Using ctc_decoder for ASR data...") - run_ctx.ctc_decoder = ctc_decoder( - lexicon=kwargs["lexicon"], - lm=lm, - lm_weight=kwargs["lm_weight"], - tokens=labels + ["[blank]", "[SILENCE]", "[UNK]"], - # "[SILENCE]" and "[UNK]" are not actually part of the vocab, - # but the decoder is happy as long they are defined in the token list - # even if they do not exist as label index in the softmax output, - blank_token="[blank]", - sil_token="[SILENCE]", - unk_word="[unknown]", - nbest=1, - beam_size=kwargs["beam_size"], - beam_size_token=kwargs.get("beam_size_token", None), - beam_threshold=kwargs["beam_threshold"], - sil_score=kwargs.get("sil_score", 0.0), - word_score=kwargs.get("word_score", 0.0), - ) - else: - print(f"Using ctc_decoder for TTS data...") - - run_ctx.ctc_decoder = ctc_decoder( - lexicon=kwargs["lexicon"], - lm=lm, - lm_weight=kwargs["lm_weight"], - tokens=labels, - blank_token="[blank]", - sil_token="[space]", # [space] is our actual silence - unk_word="[UNKNOWN]", - nbest=1, - beam_size=kwargs["beam_size"], - beam_threshold=kwargs["beam_threshold"], - sil_score=kwargs.get("sil_score", 0.0), - word_score=kwargs.get("word_score", 0.0), - ) + print(f"Using ctc_decoder for ASR data...") + run_ctx.ctc_decoder = ctc_decoder( + lexicon=kwargs["lexicon"], + lm=lm, + lm_weight=kwargs["lm_weight"], + tokens=labels + ["[blank]", "[SILENCE]", "[UNK]"], + # "[SILENCE]" and "[UNK]" are not actually part of the vocab, + # but the decoder is happy as long they are defined in the token list + # even if they do not exist as label index in the softmax output, + blank_token="[blank]", + sil_token="[SILENCE]", + unk_word="[unknown]", + nbest=1, + beam_size=kwargs["beam_size"], + beam_size_token=kwargs.get("beam_size_token", None), + beam_threshold=kwargs["beam_threshold"], + sil_score=kwargs.get("sil_score", 0.0), + word_score=kwargs.get("word_score", 0.0), + ) + 
run_ctx.labels = labels run_ctx.blank_log_penalty = kwargs.get("blank_log_penalty", None) @@ -451,12 +433,12 @@ def forward_init_hook(run_ctx, **kwargs): run_ctx.prior = None -def forward_finish_hook(run_ctx, **kwargs): +def search_finish_hook(run_ctx, **kwargs): run_ctx.recognition_file.write("}\n") run_ctx.recognition_file.close() -def forward_step(*, model, data, run_ctx, **kwargs): +def search_step(*, model, data, run_ctx, **kwargs): raw_audio = data["raw_audio"] # [B, T', F] raw_audio_len = data["raw_audio:size1"] # [B] logprobs, audio_features_len = model( @@ -480,4 +462,38 @@ def forward_step(*, model, data, run_ctx, **kwargs): print(sequence) run_ctx.recognition_file.write("%s: %s,\n" % (repr(tag), repr(sequence))) +def forward_init_hook_prior(run_ctx, **kwargs): + # we are storing durations, but call it output.hdf to match + # the default output of the ReturnnForwardJob + run_ctx.sum_probs = None + run_ctx.sum_frames = 0 + + +def forward_finish_hook_prior(run_ctx, **kwargs): + all_frames = run_ctx.sum_frames.detach().cpu().numpy() + all_probs = run_ctx.sum_probs.detach().cpu().numpy() + average_probs = all_probs / all_frames + log_average_probs = np.log(average_probs) + print("Prior sum in std-space (should be close to 1.0):", np.sum(average_probs)) + with open("prior.txt", 'w') as f: + np.savetxt(f, log_average_probs, delimiter=' ') + print("Saved prior in prior.txt in +log space.") + + +def forward_step_prior(*, model, data, run_ctx, **kwargs): + raw_audio = data["audio_features"] # [B, T', F] + raw_audio_len = data["audio_features:size1"] # [B] + + logprobs, audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len + ) + + probs = torch.exp(logprobs) + run_ctx.sum_frames = run_ctx.sum_frames + torch.sum(audio_features_len) + if run_ctx.sum_probs is None: + run_ctx.sum_probs = torch.sum(probs, dim=(0, 1)) + else: + run_ctx.sum_probs += torch.sum(probs, dim=(0, 1)) + diff --git 
a/users/rilling/experiments/librispeech/librispeech_joint_training_given_alignments/pytorch_networks/glowTTS.py b/users/rilling/experiments/librispeech/librispeech_joint_training_given_alignments/pytorch_networks/glowTTS.py index 4ebb7b991..caacb4de7 100644 --- a/users/rilling/experiments/librispeech/librispeech_joint_training_given_alignments/pytorch_networks/glowTTS.py +++ b/users/rilling/experiments/librispeech/librispeech_joint_training_given_alignments/pytorch_networks/glowTTS.py @@ -20,6 +20,8 @@ from .shared.eval_invertibility import * +from .shared.feature_statistics import * + class DurationPredictor(nn.Module): """ Duration Predictor module, trained using calculated durations coming from monotonic alignment search diff --git a/users/rilling/experiments/librispeech/librispeech_joint_training_given_alignments/pytorch_networks/shared/eval_invertibility.py b/users/rilling/experiments/librispeech/librispeech_joint_training_given_alignments/pytorch_networks/shared/eval_invertibility.py index 3cdc198d0..92ee59238 100644 --- a/users/rilling/experiments/librispeech/librispeech_joint_training_given_alignments/pytorch_networks/shared/eval_invertibility.py +++ b/users/rilling/experiments/librispeech/librispeech_joint_training_given_alignments/pytorch_networks/shared/eval_invertibility.py @@ -66,7 +66,7 @@ def forward_step_invertibility(*, model, data, run_ctx, **kwargs): mae.sum() / current_num_of_obs ) # This considers the masking by only using the mean over all unmasked elements - current_var = (mae - current_mae).sum() / ( + current_var = (mae - current_mae).pow(2).sum() / ( current_num_of_obs - 1 ) # Variance over unmasked elements with bias correction 1 diff --git a/users/rilling/experiments/librispeech/librispeech_joint_training_given_alignments/pytorch_networks/shared/feature_statistics.py b/users/rilling/experiments/librispeech/librispeech_joint_training_given_alignments/pytorch_networks/shared/feature_statistics.py new file mode 100644 index 
000000000..26642ded3 --- /dev/null +++ b/users/rilling/experiments/librispeech/librispeech_joint_training_given_alignments/pytorch_networks/shared/feature_statistics.py @@ -0,0 +1,65 @@ +import multiprocessing +import torch +import numpy as np +from returnn.datasets.hdf import SimpleHDFWriter +from . import commons + +def forward_init_hook_statistics(run_ctx, **kwargs): + run_ctx.total_mean = 0 + run_ctx.total_var = 0 + run_ctx.total_max = torch.tensor(-np.inf) + run_ctx.total_min = torch.tensor(np.inf) + run_ctx.num_of_obs = 0 + +def forward_finish_hook_statistics(run_ctx, **kwargs): + with open("output.hdf", "w+") as f: + f.write("total, mean, var, max, min \n") + f.write( + f"{run_ctx.num_of_obs}, {str(float(run_ctx.total_mean))}, {str(float(run_ctx.total_var))}, {str(float(run_ctx.total_max))}, {str(float(run_ctx.total_min))}" + ) + +def forward_step_statistics(*, model, data, run_ctx, **kwargs): + raw_audio = data["audio_features"] # [B, N] (sparse) + raw_audio_len = data["audio_features:size1"] # [B] + + + squeezed_audio = torch.squeeze(raw_audio) + y, y_lengths = model.feature_extraction(squeezed_audio, raw_audio_len) # [B, T, F] + y = y.transpose(1,2) + y_max_length = y.size(2) + + y, y_lengths, y_max_length = model.preprocess(y, y_lengths, y_max_length) + z_mask = torch.unsqueeze(commons.sequence_mask(y_lengths, y_max_length), 1).to(torch.int32) + + # y = torch.nn.functional.l1_loss(y_hat * z_mask, y * z_mask, reduction="none") # [B, F, T] + current_num_of_obs = y.shape[1] * y_lengths.sum() # F * total_number_of_frames_in_batch + + old_mean = run_ctx.total_mean + + masked_y = y * z_mask + current_mean = ( + masked_y.sum() / current_num_of_obs + ) # This considers the masking by only using the mean over all unmasked elements + + current_var = (masked_y - current_mean).pow(2).sum() / ( + current_num_of_obs - 1 + ) # Variance over unmasked elements with bias correction 1 + + run_ctx.total_mean = ((run_ctx.num_of_obs / (run_ctx.num_of_obs + 
current_num_of_obs)) * old_mean) + ( + (current_num_of_obs / (run_ctx.num_of_obs + current_num_of_obs)) * current_mean + ) + + run_ctx.total_var = ( + (run_ctx.num_of_obs / (run_ctx.num_of_obs + current_num_of_obs)) * run_ctx.total_var + + ((current_num_of_obs / (run_ctx.num_of_obs + current_num_of_obs)) * current_var) + + ((run_ctx.num_of_obs * current_num_of_obs) / (run_ctx.num_of_obs + current_num_of_obs) ** 2) + * (old_mean - current_mean) ** 2 + ) + + run_ctx.total_max = torch.max(run_ctx.total_max, masked_y.max()) + + run_ctx.total_min = torch.min( + run_ctx.total_min, (masked_y + (-1 * z_mask + 1) * torch.tensor(float("inf")).nan_to_num(0.0)).min() + ) # Masked Min operation + + run_ctx.num_of_obs += current_num_of_obs diff --git a/users/rilling/joint_training/text_hdf/text_hdf_from_bliss.py b/users/rilling/joint_training/text_hdf/text_hdf_from_bliss.py index 821678647..9aff54205 100644 --- a/users/rilling/joint_training/text_hdf/text_hdf_from_bliss.py +++ b/users/rilling/joint_training/text_hdf/text_hdf_from_bliss.py @@ -12,7 +12,7 @@ class TextHDFFromBliss(Job): """ Extract text from a bliss corpus and save to HDF format - Used for joint training, where additional ASR corpus is necessary. + E.g. used for joint training, where additional ASR corpus is necessary. 
""" def __init__( @@ -43,7 +43,7 @@ def run(self): vocab_num_labels=vocabs[-1].num_labels else: assert vocabs[-1].num_labels == vocab_num_labels - + hdf_writer = SimpleHDFWriter(self.out_text_hdf.get_path(), dim=vocab_num_labels, ndim=1) for b, v in zip(self.bliss_corpora, vocabs): diff --git a/users/rilling/speakers/pooling.py b/users/rilling/speakers/pooling.py index 1213f7c61..c112be5c8 100644 --- a/users/rilling/speakers/pooling.py +++ b/users/rilling/speakers/pooling.py @@ -6,8 +6,15 @@ class AverageXVectorSpeakerEmbeddingsJob(Job): + """ + Job to compute the average x-vector for each speaker in a corpus + Takes an HDF containing an x-vector for each sequence in a corpus in combination with the speaker id, + collects all x-vectors for every speaker id and pools them using the mean vector. + + :param tk.Path x_vector_hdf: Path to HDF from x-vector forward job + :param tk.Path returnn_root: Path to Returnn Root + """ def __init__(self, x_vector_hdf: tk.Path, returnn_root: tk.Path): - # def __init__(self, x_vector_hdf: str, returnn_root: str): self.x_vector_hdf = x_vector_hdf self.returnn_root = returnn_root @@ -17,10 +24,7 @@ def task(self): yield Task("run", mini_task=True, rqmt={"sbatch_args": ["-p", "cpu_slow"]}) def run(self): - print(f"self.x_vector_hdf: {self.x_vector_hdf}") x_vectors, seq_tags, speaker_labels = self.load_xvector_data(self.x_vector_hdf.get_path()) - # x_vectors, seq_tags, speaker_labels = self.load_xvector_data(self.x_vector_hdf) - print(f"x_vectors.shape: {x_vectors.shape}") speaker_labels, indices = torch.sort(torch.Tensor(speaker_labels)) x_vectors = x_vectors[indices, :] @@ -61,23 +65,11 @@ def load_xvector_data(self, hdf_filename): seq_tags = input_data["seqTags"] lengths = input_data["seqLengths"] + # break if targets are not given hence no speaker_labels are in the HDF assert "targets" in input_data.keys() assert "speaker_labels" in input_data["targets"]["data"] speaker_labels = input_data["targets"]["data"]["speaker_labels"] - # 
data_seqs = [] - # data_tags = [] - # data_speaker_label = [] - # offset = 0 - - # for tag, length, speaker_label in zip(seq_tags, lengths, speaker_labels): - # tag = tag if isinstance(tag, str) else tag.decode() - # in_data = inputs[offset : offset + length[0]] - # data_seqs.append(in_data) - # offset += length[0] - # data_tags.append(tag) - # data_speaker_label.append(speaker_label) - x_vectors = np.array(inputs) x_vectors = x_vectors.reshape((lengths.shape[0], 512)) From 8ac7da62a88a79cce00c024fb5f47f8efa115203 Mon Sep 17 00:00:00 2001 From: Mohammad Zeineldeen Date: Thu, 16 May 2024 10:36:47 +0000 Subject: [PATCH 027/227] fix --- users/zeineldeen/models/asr/encoder/conformer_encoder_v2.py | 2 -- users/zeineldeen/models/asr/encoder/ebranchformer_encoder.py | 2 -- 2 files changed, 4 deletions(-) diff --git a/users/zeineldeen/models/asr/encoder/conformer_encoder_v2.py b/users/zeineldeen/models/asr/encoder/conformer_encoder_v2.py index 89d0d2557..9831b8e61 100644 --- a/users/zeineldeen/models/asr/encoder/conformer_encoder_v2.py +++ b/users/zeineldeen/models/asr/encoder/conformer_encoder_v2.py @@ -151,8 +151,6 @@ def __init__( self.enc_val_per_head_dim = enc_key_dim // att_num_heads self.ff_dim = ff_dim - if self.ff_dim is None: - self.ff_dim = 2 * self.enc_key_dim self.target = target diff --git a/users/zeineldeen/models/asr/encoder/ebranchformer_encoder.py b/users/zeineldeen/models/asr/encoder/ebranchformer_encoder.py index 3f06572c9..01c65076c 100644 --- a/users/zeineldeen/models/asr/encoder/ebranchformer_encoder.py +++ b/users/zeineldeen/models/asr/encoder/ebranchformer_encoder.py @@ -106,8 +106,6 @@ def __init__( self.enc_val_per_head_dim = enc_key_dim // att_num_heads self.ff_dim = ff_dim - if self.ff_dim is None: - self.ff_dim = 2 * self.enc_key_dim self.target = target From 6e1a700213ef7a0c1a1b3fffeae56c3b310ca429 Mon Sep 17 00:00:00 2001 From: Lukas Rilling Date: Thu, 16 May 2024 17:11:20 +0200 Subject: [PATCH 028/227] Glow-TTS-ASR: Cleanup and 
comments/documentation --- .../configs/__init__.py | 0 .../configs/legacy_baseline.py | 320 ------------ .../configs/pytorch_baseline.py | 204 -------- .../configs/rc_baseline.py | 220 -------- .../librispeech_100_hybrid/data.py | 304 ----------- .../librispeech_100_hybrid/default_tools.py | 26 - .../librispeech_100_hybrid/experiments.py | 86 ---- .../librispeech_100_hybrid/gmm_baseline.py | 77 --- .../librispeech_100_hybrid/lm/__init__.py | 9 - .../librispeech_100_hybrid/lm/config.py | 92 ---- .../librispeech_100_hybrid/lm/data.py | 154 ------ .../lm/default_tools.py | 13 - .../librispeech_100_hybrid/lm/experiment.py | 36 -- .../pytorch_networks/__init__.py | 0 .../pytorch_networks/blstm8x1024.py | 94 ---- .../blstm8x1024_custom_engine.py | 121 ----- .../blstm8x1024_more_specaug.py | 115 ----- .../blstm8x1024_more_specaug_backup.py | 75 --- .../blstm8x1024_more_specaug_fp16.py | 176 ------- .../espnet_conformer_large.py | 121 ----- .../pytorch_networks/espnet_conformer_test.py | 124 ----- .../pytorch_networks/mlp_test.py | 31 -- .../pytorch_networks/torchaudio_conformer.py | 423 ---------------- .../torchaudio_conformer_debug.py | 425 ---------------- .../torchaudio_conformer_large_fp16.py | 476 ------------------ .../torchaudio_conformer_subsample.py | 426 ---------------- ...torchaudio_conformer_subsample_upsample.py | 440 ---------------- .../rc_networks/__init__.py | 0 .../rc_networks/debug.py | 142 ------ .../rc_networks/debug_random_mask.py | 54 -- .../rc_networks/default_hybrid.py | 162 ------ .../rc_networks/default_hybrid_v2.py | 167 ------ .../specaugment_clean_legacy.py | 141 ------ .../experiments.py | 76 ++- .../librispeech_glow_asr/experiments.py | 19 + .../{glowTTS => }/feature_config.py | 0 .../glowTTS/experiments.py | 39 +- .../glowTTS/gt_extraction.py | 2 +- .../glowTTS/vocoder/__init__.py | 0 .../pytorch_networks/feature_extraction.py | 3 +- .../pytorch_networks/glowTTS.py | 2 +- .../pytorch_networks/glowTTS_ddi_actnorm.py | 2 +- 
.../glowTTS_decoder_test_blstm.py | 2 +- .../glowTTS_decoder_test_multi_layer_ffn.py | 2 +- .../glowTTS_decoder_test_simple_linear.py | 2 +- .../glowTTS_encoder_sample_test_blstm.py | 2 +- ...der_sample_test_maxlike_alignment_blstm.py | 2 +- ..._test_maxlike_alignment_multi_layer_ffn.py | 2 +- ...le_test_maxlike_alignment_simple_linear.py | 2 +- ...TTS_encoder_sample_test_multi_layer_ffn.py | 2 +- ...owTTS_encoder_sample_test_simple_linear.py | 2 +- .../glowTTS_nar_taco_encoder.py | 2 +- .../glowTTS_nar_taco_encoder_no_blstm.py | 2 +- .../glowTTS_one_hot_encoder_mean.py | 2 +- .../glowTTS_one_hot_encoder_std.py | 2 +- .../glowTTS_simple_encoder.py | 2 +- ..._test_maxlike_alignment_multi_layer_ffn.py | 2 +- ...st_maxlike_alignment_multi_layer_ffn_v2.py | 2 +- .../pytorch_networks/glowTTS_x_vector.py | 2 +- .../pytorch_networks/glowTTS_x_vector_eval.py | 2 +- .../pytorch_networks/glowTTS_x_vector_v2.py | 2 +- .../pytorch_networks/glowTTS_x_vector_v3.py | 2 +- .../glowTTS_x_vector_v3_norm_xvector.py | 2 +- .../pytorch_networks/gt_extractor.py | 2 +- .../vocoder}/__init__.py | 0 .../vocoder}/config.py | 93 ++-- .../librispeech_glowtts/vocoder/data.py | 370 ++++++++++++++ .../librispeech_glowtts/vocoder/pipeline.py | 81 +++ .../{glowTTS => }/vocoder/simple_gl.py | 10 +- .../librispeech_joint_training/experiments.py | 36 +- .../README.md | 15 + .../exp_joint/experiments.py | 111 ++-- .../exp_joint_2step/experiments.py | 77 ++- .../exp_joint_flow_ga/experiments.py | 169 ++----- .../experiments.py | 133 ++--- .../x_vectors/experiments.py | 9 +- .../__init__.py | 0 .../ctc_aligner/__init__.py | 0 .../ctc_aligner/data.py | 115 ----- .../ctc_aligner/experiments.py | 309 ------------ .../ctc_aligner/pipeline.py | 35 -- .../tts_architecture_improvement_23/data.py | 320 ------------ .../default_tools.py | 17 - .../pytorch_networks/__init__.py | 0 .../pytorch_networks/ctc_aligner_rf.py | 120 ----- .../pytorch_networks/ctc_aligner_v1.py | 296 ----------- 
.../ctc_aligner_v1_ctc_sum.py | 109 ---- .../ctc_aligner_v1_ctc_sum_nobroad.py | 103 ---- .../ctc_aligner_v1_gradaccum.py | 149 ------ .../pytorch_networks/ctc_aligner_v2.py | 265 ---------- .../rc_networks/__init__.py | 0 .../rc_networks/ctc_aligner/__init__.py | 0 .../rc_networks/ctc_aligner/conv_blstm_rec.py | 211 -------- .../rc_networks/ctc_aligner/parameters.py | 16 - .../rc_networks/shared/__init__.py | 0 .../rc_networks/shared/convolution.py | 128 ----- .../serializer.py | 117 ----- .../storage.py | 13 - 98 files changed, 839 insertions(+), 8029 deletions(-) delete mode 100644 users/rilling/experiments/librispeech/librispeech_100_hybrid/configs/__init__.py delete mode 100644 users/rilling/experiments/librispeech/librispeech_100_hybrid/configs/legacy_baseline.py delete mode 100644 users/rilling/experiments/librispeech/librispeech_100_hybrid/configs/pytorch_baseline.py delete mode 100644 users/rilling/experiments/librispeech/librispeech_100_hybrid/configs/rc_baseline.py delete mode 100644 users/rilling/experiments/librispeech/librispeech_100_hybrid/data.py delete mode 100644 users/rilling/experiments/librispeech/librispeech_100_hybrid/default_tools.py delete mode 100644 users/rilling/experiments/librispeech/librispeech_100_hybrid/experiments.py delete mode 100644 users/rilling/experiments/librispeech/librispeech_100_hybrid/gmm_baseline.py delete mode 100644 users/rilling/experiments/librispeech/librispeech_100_hybrid/lm/__init__.py delete mode 100644 users/rilling/experiments/librispeech/librispeech_100_hybrid/lm/config.py delete mode 100644 users/rilling/experiments/librispeech/librispeech_100_hybrid/lm/data.py delete mode 100644 users/rilling/experiments/librispeech/librispeech_100_hybrid/lm/default_tools.py delete mode 100644 users/rilling/experiments/librispeech/librispeech_100_hybrid/lm/experiment.py delete mode 100644 users/rilling/experiments/librispeech/librispeech_100_hybrid/pytorch_networks/__init__.py delete mode 100644 
users/rilling/experiments/librispeech/librispeech_100_hybrid/pytorch_networks/blstm8x1024.py delete mode 100644 users/rilling/experiments/librispeech/librispeech_100_hybrid/pytorch_networks/blstm8x1024_custom_engine.py delete mode 100644 users/rilling/experiments/librispeech/librispeech_100_hybrid/pytorch_networks/blstm8x1024_more_specaug.py delete mode 100644 users/rilling/experiments/librispeech/librispeech_100_hybrid/pytorch_networks/blstm8x1024_more_specaug_backup.py delete mode 100644 users/rilling/experiments/librispeech/librispeech_100_hybrid/pytorch_networks/blstm8x1024_more_specaug_fp16.py delete mode 100644 users/rilling/experiments/librispeech/librispeech_100_hybrid/pytorch_networks/espnet_conformer_large.py delete mode 100644 users/rilling/experiments/librispeech/librispeech_100_hybrid/pytorch_networks/espnet_conformer_test.py delete mode 100644 users/rilling/experiments/librispeech/librispeech_100_hybrid/pytorch_networks/mlp_test.py delete mode 100644 users/rilling/experiments/librispeech/librispeech_100_hybrid/pytorch_networks/torchaudio_conformer.py delete mode 100644 users/rilling/experiments/librispeech/librispeech_100_hybrid/pytorch_networks/torchaudio_conformer_debug.py delete mode 100644 users/rilling/experiments/librispeech/librispeech_100_hybrid/pytorch_networks/torchaudio_conformer_large_fp16.py delete mode 100644 users/rilling/experiments/librispeech/librispeech_100_hybrid/pytorch_networks/torchaudio_conformer_subsample.py delete mode 100644 users/rilling/experiments/librispeech/librispeech_100_hybrid/pytorch_networks/torchaudio_conformer_subsample_upsample.py delete mode 100644 users/rilling/experiments/librispeech/librispeech_100_hybrid/rc_networks/__init__.py delete mode 100644 users/rilling/experiments/librispeech/librispeech_100_hybrid/rc_networks/debug.py delete mode 100644 users/rilling/experiments/librispeech/librispeech_100_hybrid/rc_networks/debug_random_mask.py delete mode 100644 
users/rilling/experiments/librispeech/librispeech_100_hybrid/rc_networks/default_hybrid.py delete mode 100644 users/rilling/experiments/librispeech/librispeech_100_hybrid/rc_networks/default_hybrid_v2.py delete mode 100644 users/rilling/experiments/librispeech/librispeech_100_hybrid/specaugment_clean_legacy.py rename users/rilling/experiments/librispeech/librispeech_glowtts/{glowTTS => }/feature_config.py (100%) delete mode 100644 users/rilling/experiments/librispeech/librispeech_glowtts/glowTTS/vocoder/__init__.py rename users/rilling/experiments/librispeech/{librispeech_100_hybrid => librispeech_glowtts/vocoder}/__init__.py (100%) rename users/rilling/experiments/librispeech/{tts_architecture_improvement_23/ctc_aligner => librispeech_glowtts/vocoder}/config.py (64%) create mode 100644 users/rilling/experiments/librispeech/librispeech_glowtts/vocoder/data.py create mode 100644 users/rilling/experiments/librispeech/librispeech_glowtts/vocoder/pipeline.py rename users/rilling/experiments/librispeech/librispeech_glowtts/{glowTTS => }/vocoder/simple_gl.py (93%) create mode 100644 users/rilling/experiments/librispeech/librispeech_joint_training_given_alignments/README.md delete mode 100644 users/rilling/experiments/librispeech/tts_architecture_improvement_23/__init__.py delete mode 100644 users/rilling/experiments/librispeech/tts_architecture_improvement_23/ctc_aligner/__init__.py delete mode 100644 users/rilling/experiments/librispeech/tts_architecture_improvement_23/ctc_aligner/data.py delete mode 100644 users/rilling/experiments/librispeech/tts_architecture_improvement_23/ctc_aligner/experiments.py delete mode 100644 users/rilling/experiments/librispeech/tts_architecture_improvement_23/ctc_aligner/pipeline.py delete mode 100644 users/rilling/experiments/librispeech/tts_architecture_improvement_23/data.py delete mode 100644 users/rilling/experiments/librispeech/tts_architecture_improvement_23/default_tools.py delete mode 100644 
users/rilling/experiments/librispeech/tts_architecture_improvement_23/pytorch_networks/__init__.py delete mode 100644 users/rilling/experiments/librispeech/tts_architecture_improvement_23/pytorch_networks/ctc_aligner_rf.py delete mode 100644 users/rilling/experiments/librispeech/tts_architecture_improvement_23/pytorch_networks/ctc_aligner_v1.py delete mode 100644 users/rilling/experiments/librispeech/tts_architecture_improvement_23/pytorch_networks/ctc_aligner_v1_ctc_sum.py delete mode 100644 users/rilling/experiments/librispeech/tts_architecture_improvement_23/pytorch_networks/ctc_aligner_v1_ctc_sum_nobroad.py delete mode 100644 users/rilling/experiments/librispeech/tts_architecture_improvement_23/pytorch_networks/ctc_aligner_v1_gradaccum.py delete mode 100644 users/rilling/experiments/librispeech/tts_architecture_improvement_23/pytorch_networks/ctc_aligner_v2.py delete mode 100644 users/rilling/experiments/librispeech/tts_architecture_improvement_23/rc_networks/__init__.py delete mode 100644 users/rilling/experiments/librispeech/tts_architecture_improvement_23/rc_networks/ctc_aligner/__init__.py delete mode 100644 users/rilling/experiments/librispeech/tts_architecture_improvement_23/rc_networks/ctc_aligner/conv_blstm_rec.py delete mode 100644 users/rilling/experiments/librispeech/tts_architecture_improvement_23/rc_networks/ctc_aligner/parameters.py delete mode 100644 users/rilling/experiments/librispeech/tts_architecture_improvement_23/rc_networks/shared/__init__.py delete mode 100644 users/rilling/experiments/librispeech/tts_architecture_improvement_23/rc_networks/shared/convolution.py delete mode 100644 users/rilling/experiments/librispeech/tts_architecture_improvement_23/serializer.py delete mode 100644 users/rilling/experiments/librispeech/tts_architecture_improvement_23/storage.py diff --git a/users/rilling/experiments/librispeech/librispeech_100_hybrid/configs/__init__.py b/users/rilling/experiments/librispeech/librispeech_100_hybrid/configs/__init__.py 
deleted file mode 100644 index e69de29bb..000000000 diff --git a/users/rilling/experiments/librispeech/librispeech_100_hybrid/configs/legacy_baseline.py b/users/rilling/experiments/librispeech/librispeech_100_hybrid/configs/legacy_baseline.py deleted file mode 100644 index f7ff3c35d..000000000 --- a/users/rilling/experiments/librispeech/librispeech_100_hybrid/configs/legacy_baseline.py +++ /dev/null @@ -1,320 +0,0 @@ -import copy -import numpy as np -from typing import List - -from i6_core.returnn.config import ReturnnConfig -from i6_experiments.common.setups.rasr.util import HybridArgs - - -def blstm_network(layers, input_layers, dropout=0.1, l2=0.0): - num_layers = len(layers) - assert num_layers > 0 - - network = {} - - for idx, size in enumerate(layers): - idx += 1 - for direction, name in [(1, "fwd"), (-1, "bwd")]: - if idx == 1: - from_layers = input_layers - else: - from_layers = [ - "lstm_fwd_{}".format(idx - 1), - "lstm_bwd_{}".format(idx - 1), - ] - network["lstm_{}_{}".format(name, idx)] = { - "class": "rec", - "unit": "nativelstm2", - "direction": direction, - "n_out": size, - "dropout": dropout, - "L2": l2, - "from": from_layers, - } - - output_layers = ["lstm_fwd_{}".format(num_layers), "lstm_bwd_{}".format(num_layers)] - - return network, output_layers - - -def get_nn_args(num_outputs: int = 12001, num_epochs: int = 250, extra_exps=False): - evaluation_epochs = list(np.arange(num_epochs, num_epochs + 1, 10)) - - returnn_configs = get_returnn_configs( - num_inputs=50, num_outputs=num_outputs, batch_size=5000, - evaluation_epochs=evaluation_epochs, extra_exps=extra_exps, - ) - - returnn_recog_configs = get_returnn_configs( - num_inputs=50, num_outputs=num_outputs, batch_size=5000, - evaluation_epochs=evaluation_epochs, - recognition=True, extra_exps=extra_exps, - ) - - - training_args = { - "log_verbosity": 5, - "num_epochs": num_epochs, - "num_classes": num_outputs, - "save_interval": 1, - "keep_epochs": None, - "time_rqmt": 168, - "mem_rqmt": 7, - 
"cpu_rqmt": 3, - "partition_epochs": {"train": 40, "dev": 20}, - "use_python_control": False, - } - recognition_args = { - "dev-other": { - "epochs": evaluation_epochs, - "feature_flow_key": "gt", - "prior_scales": [0.3], - "pronunciation_scales": [6.0], - "lm_scales": [20.0], - "lm_lookahead": True, - "lookahead_options": None, - "create_lattice": True, - "eval_single_best": True, - "eval_best_in_lattice": True, - "search_parameters": { - "beam-pruning": 12.0, - "beam-pruning-limit": 100000, - "word-end-pruning": 0.5, - "word-end-pruning-limit": 15000, - }, - "lattice_to_ctm_kwargs": { - "fill_empty_segments": True, - "best_path_algo": "bellman-ford", - }, - "optimize_am_lm_scale": True, - "rtf": 50, - "mem": 8, - "lmgc_mem": 16, - "cpu": 4, - "parallelize_conversion": True, - "forward_output_layer": "log_output", - "native_ops": ["NativeLstm2"], - }, - } - test_recognition_args = None - - nn_args = HybridArgs( - returnn_training_configs=returnn_configs, - returnn_recognition_configs=returnn_recog_configs, - training_args=training_args, - recognition_args=recognition_args, - test_recognition_args=test_recognition_args, - ) - - return nn_args - -def get_realign_args(num_outputs: int = 12001, num_epochs: int = 250, extra_exps=False): - evaluation_epochs = list(np.arange(num_epochs, num_epochs + 1, 10)) - - returnn_configs = get_returnn_configs( - num_inputs=50, num_outputs=num_outputs, batch_size=5000, - evaluation_epochs=evaluation_epochs, extra_exps=extra_exps, - ) - - returnn_recog_configs = get_returnn_configs( - num_inputs=50, num_outputs=num_outputs, batch_size=5000, - evaluation_epochs=evaluation_epochs, - recognition=True, extra_exps=extra_exps, - ) - - - training_args = { - "log_verbosity": 5, - "num_epochs": num_epochs, - "num_classes": num_outputs, - "save_interval": 1, - "keep_epochs": None, - "time_rqmt": 168, - "mem_rqmt": 7, - "cpu_rqmt": 3, - "partition_epochs": {"train": 40, "dev": 20}, - "use_python_control": False, - } - recognition_args = { - 
"dev-other": { - "epochs": evaluation_epochs, - "feature_flow_key": "gt", - "prior_scales": [0.3], - "pronunciation_scales": [6.0], - "lm_scales": [20.0], - "lm_lookahead": True, - "lookahead_options": None, - "create_lattice": True, - "eval_single_best": True, - "eval_best_in_lattice": True, - "search_parameters": { - "beam-pruning": 12.0, - "beam-pruning-limit": 100000, - "word-end-pruning": 0.5, - "word-end-pruning-limit": 15000, - }, - "lattice_to_ctm_kwargs": { - "fill_empty_segments": True, - "best_path_algo": "bellman-ford", - }, - "optimize_am_lm_scale": True, - "rtf": 50, - "mem": 8, - "lmgc_mem": 16, - "cpu": 4, - "parallelize_conversion": True, - "forward_output_layer": "log_output", - "native_ops": ["NativeLstm2"], - }, - } - test_recognition_args = None - - nn_args = HybridArgs( - returnn_training_configs=returnn_configs, - returnn_recognition_configs=returnn_recog_configs, - training_args=training_args, - recognition_args=recognition_args, - test_recognition_args=test_recognition_args, - ) - - return nn_args - - -def get_feature_extraction_args(fix_features_output=False): - dc_detection = False - samples_options = { - 'audio_format': "wav", - 'dc_detection': dc_detection, - } - - return { - "gt": { - "gt_options": { - "minfreq": 100, - "maxfreq": 7500, - "channels": 50, - # "warp_freqbreak": 7400, - "tempint_type": "hanning", - "tempint_shift": 0.01, - "tempint_length": 0.025, - "flush_before_gap": True, - "do_specint": False, - "specint_type": "hanning", - "specint_shift": 4, - "specint_length": 9, - "normalize": True, - "preemphasis": True, - "legacy_scaling": False, - "without_samples": False, - "samples_options": samples_options, - "normalization_options": {}, - "add_features_output": fix_features_output, - } - }, - } - -def get_returnn_configs( - num_inputs: int, num_outputs: int, batch_size: int, evaluation_epochs: List[int], - recognition=False, extra_exps=False -): - # ******************** blstm base ******************** - - base_config = { - 
"extern_data": { - "data": {"dim": num_inputs}, - "classes": {"dim": num_outputs, "sparse": True}, - }, - } - base_post_config = { - "use_tensorflow": True, - "debug_print_layer_output_template": True, - "log_batch_size": True, - "tf_log_memory_usage": True, - "cache_size": "0", - - } - if not recognition: - base_post_config["cleanup_old_models"] = { - "keep_last_n": 5, - "keep_best_n": 5, - "keep": evaluation_epochs, - } - - network, last_layer = blstm_network([1024]*8, ["specaug"], dropout=0.0, l2=0.0) - from ..specaugment_clean_legacy import specaug_layer, get_funcs - - network["specaug"] = specaug_layer(["data"]) - network["out_linear"] = { - "class": "linear", - "activation": None, - "from": last_layer, - "n_out": num_outputs, - } - network["output"] = { - "class": "activation", - "activation": "softmax", - "from": ["out_linear"], - "loss": "ce", - 'loss_opts': {'focal_loss_factor': 2.0}, - "target": "classes" - } - network["log_output"] = { - "class": "activation", - "activation": "log_softmax", - "from": ["out_linear"], - } - - if recognition: - network["log_output"]["is_output_layer"] = True - - - blstm_base_config = copy.deepcopy(base_config) - blstm_base_config.update( - { - "batch_size": batch_size, # {"classes": batch_size, "data": batch_size}, - "chunking": "50:25", - "optimizer": {"class": "nadam", "epsilon": 1e-8}, - "gradient_noise": 0.3, - "learning_rates": list(np.linspace(2.5e-5, 2.5e-4, 10)), - "learning_rate_control": "newbob_multi_epoch", - "learning_rate_control_min_num_epochs_per_new_lr": 3, - "learning_rate_control_relative_error_relative_lr": True, - #"min_learning_rate": 1e-5, - "newbob_learning_rate_decay": 0.707, - "newbob_multi_num_epochs": 40, - "newbob_multi_update_interval": 1, - "network": network, - } - ) - - def make_returnn_config(config): - return ReturnnConfig( - config=config, - post_config=base_post_config, - hash_full_python_code=True, - python_prolog=get_funcs(), - pprint_kwargs={"sort_dicts": False}, - ) - 
blstm_base_returnn_config = make_returnn_config(blstm_base_config) - - if extra_exps: - oclr_v1_config = copy.deepcopy(blstm_base_config) - oclr_v1_config["learning_rates"] = list(np.linspace(2.5e-5, 3e-4, 50)) + list(np.linspace(3e-4, 2.5e-5, 50)) - oclr_v1_config["newbob_multi_num_epochs"] = 3 - oclr_v1_nofl_config = copy.deepcopy(oclr_v1_config) - oclr_v1_nofl_config["network"]["output"]["loss_opts"] = None - oclr_v2_config = copy.deepcopy(blstm_base_config) - oclr_v2_config["learning_rates"] = list(np.linspace(2.5e-5, 4e-4, 50)) + list(np.linspace(4e-4, 2.5e-5, 50)) - oclr_v2_config["newbob_multi_num_epochs"] = 3 - - return { - "blstm_oclr_v1": make_returnn_config(oclr_v1_config), - "blstm_oclr_v1_nofl": make_returnn_config(oclr_v1_nofl_config), - "blstm_oclr_v2": make_returnn_config(oclr_v2_config), - "blstm_base": blstm_base_returnn_config, - } - else: - return { - "blstm_base": blstm_base_returnn_config, - } - diff --git a/users/rilling/experiments/librispeech/librispeech_100_hybrid/configs/pytorch_baseline.py b/users/rilling/experiments/librispeech/librispeech_100_hybrid/configs/pytorch_baseline.py deleted file mode 100644 index 888adbc40..000000000 --- a/users/rilling/experiments/librispeech/librispeech_100_hybrid/configs/pytorch_baseline.py +++ /dev/null @@ -1,204 +0,0 @@ -import copy -from dataclasses import dataclass -import numpy as np -from typing import List, Dict, Any - -from i6_core.returnn.config import ReturnnConfig -from i6_experiments.common.setups.rasr.util import HybridArgs - -from i6_experiments.common.setups.serialization import Import, ExplicitHash, ExternalImport -from i6_experiments.common.setups.returnn_pytorch.serialization import PyTorchModel, Collection - - -from ..default_tools import PACKAGE - - -def get_nn_args(num_outputs: int = 12001, num_epochs: int = 250, use_rasr_returnn_training=True, debug=False, **net_kwargs): - evaluation_epochs = list(range(num_epochs, num_epochs + 1, 10)) - - returnn_configs = 
get_pytorch_returnn_configs( - num_inputs=50, num_outputs=num_outputs, batch_size=5000, - evaluation_epochs=evaluation_epochs, debug=debug, - ) - - returnn_recog_configs = get_pytorch_returnn_configs( - num_inputs=50, num_outputs=num_outputs, batch_size=5000, - evaluation_epochs=evaluation_epochs, - recognition=True, debug=debug, - ) - - - training_args = { - "log_verbosity": 5, - "num_epochs": num_epochs, - "save_interval": 1, - "keep_epochs": None, - "time_rqmt": 168, - "mem_rqmt": 8, - "cpu_rqmt": 3, - } - - if use_rasr_returnn_training: - training_args["num_classes"] = num_outputs - training_args["use_python_control"] = False - training_args["partition_epochs"] = {"train": 40, "dev": 20} - - recognition_args = { - "dev-other": { - "epochs": evaluation_epochs, - "feature_flow_key": "gt", - "prior_scales": [0.3], - "pronunciation_scales": [6.0], - "lm_scales": [20.0], - "lm_lookahead": True, - "lookahead_options": None, - "create_lattice": True, - "eval_single_best": True, - "eval_best_in_lattice": True, - "search_parameters": { - "beam-pruning": 12.0, - "beam-pruning-limit": 100000, - "word-end-pruning": 0.5, - "word-end-pruning-limit": 15000, - }, - "lattice_to_ctm_kwargs": { - "fill_empty_segments": True, - "best_path_algo": "bellman-ford", - }, - "optimize_am_lm_scale": True, - "rtf": 50, - "mem": 7, - "lmgc_mem": 16, - "needs_features_size": False, - "cpu": 2, - "parallelize_conversion": True, - } - } - test_recognition_args = None - - nn_args = HybridArgs( - returnn_training_configs=returnn_configs, - returnn_recognition_configs=returnn_recog_configs, - training_args=training_args, - recognition_args=recognition_args, - test_recognition_args=test_recognition_args, - ) - - return nn_args - - -def get_pytorch_returnn_configs( - num_inputs: int, num_outputs: int, batch_size: int, evaluation_epochs: List[int], - recognition=False, debug=False, -): - # ******************** blstm base ******************** - - base_config = { - "extern_data": { - "data": {"dim": 
num_inputs}, - "classes": {"dim": num_outputs, "sparse": True}, - }, - } - base_post_config = { - "backend": "torch", - "debug_print_layer_output_template": True, - "log_batch_size": True, - "tf_log_memory_usage": True, - "cache_size": "0", - - } - - blstm_base_config = copy.deepcopy(base_config) - blstm_base_config.update( - { - "behavior_version": 15, - "batch_size": batch_size, # {"classes": batch_size, "data": batch_size}, - "chunking": "50:25", - "optimizer": {"class": "nadam", "epsilon": 1e-8}, - "gradient_noise": 0.3, - "learning_rates": list(np.linspace(2.5e-5, 3e-4, 50)) + list(np.linspace(3e-4, 2.5e-5, 50)), - "learning_rate_control": "newbob_multi_epoch", - "learning_rate_control_min_num_epochs_per_new_lr": 3, - "learning_rate_control_relative_error_relative_lr": True, - #"min_learning_rate": 1e-5, - "newbob_learning_rate_decay": 0.707, - "newbob_multi_num_epochs": 40, - "newbob_multi_update_interval": 1, - } - ) - if not recognition: - base_post_config["cleanup_old_models"] = { - "keep_last_n": 5, - "keep_best_n": 5, - "keep": evaluation_epochs, - } - - high_lr_config = copy.deepcopy(blstm_base_config) - high_lr_config["learning_rates"] = list(np.linspace(2.5e-4, 3e-3, 50)) + list(np.linspace(3e-3, 2.5e-4, 50)) - - # those are hashed - pytorch_package = PACKAGE + ".pytorch_networks" - - def construct_from_net_kwargs(base_config, net_kwargs, explicit_hash=None, use_tracing=False, use_custom_engine=False, use_espnet=False): - model_type = net_kwargs.pop("model_type") - pytorch_model_import = Import( - PACKAGE + ".pytorch_networks.%s.Model" % model_type - ) - pytorch_train_step = Import( - PACKAGE + ".pytorch_networks.%s.train_step" % model_type - ) - pytorch_model = PyTorchModel( - model_class_name=pytorch_model_import.object_name, - model_kwargs=net_kwargs, - ) - serializer_objects = [ - pytorch_model_import, - pytorch_train_step, - pytorch_model, - ] - if use_espnet: - from i6_core.tools.git import CloneGitRepositoryJob - espnet_path = 
CloneGitRepositoryJob( - url="https://github.com/espnet/espnet", - checkout_folder_name="espnet" - ).out_repository - espnet_path.hash_overwrite = "DEFAULT_ESPNET" - serializer_objects.insert(0, ExternalImport(espnet_path)) - if use_custom_engine: - pytorch_engine = Import( - PACKAGE + ".pytorch_networks.%s.CustomEngine" % model_type - ) - serializer_objects.append(pytorch_engine) - if recognition: - if use_tracing: - pytorch_export = Import( - PACKAGE + ".pytorch_networks.%s.export_trace" % model_type, - import_as="export" - ) - else: - pytorch_export = Import( - PACKAGE + ".pytorch_networks.%s.export" % model_type - ) - serializer_objects.append(pytorch_export) - if explicit_hash: - serializer_objects.append(ExplicitHash(explicit_hash)) - serializer = Collection( - serializer_objects=serializer_objects, - make_local_package_copy=not debug, - packages={ - pytorch_package, - }, - ) - - blstm_base_returnn_config = ReturnnConfig( - config=base_config, - post_config=base_post_config, - python_epilog=[serializer], - pprint_kwargs={"sort_dicts": False}, - ) - - return blstm_base_returnn_config - - return { - "torchaudio_conformer": construct_from_net_kwargs(high_lr_config, {"model_type": "torchaudio_conformer"}, use_tracing=False),# here the config is wrong, it does use tracing - } diff --git a/users/rilling/experiments/librispeech/librispeech_100_hybrid/configs/rc_baseline.py b/users/rilling/experiments/librispeech/librispeech_100_hybrid/configs/rc_baseline.py deleted file mode 100644 index b0ccd6eed..000000000 --- a/users/rilling/experiments/librispeech/librispeech_100_hybrid/configs/rc_baseline.py +++ /dev/null @@ -1,220 +0,0 @@ -import copy -from dataclasses import dataclass -import numpy as np -from typing import List, Dict, Any - -from i6_core.returnn.config import ReturnnConfig -from i6_experiments.common.setups.rasr.util import HybridArgs - -from i6_experiments.common.setups.returnn_common.serialization import ( - DataInitArgs, - DimInitArgs, - Collection, - 
Network, - ExternData, - Import, - ExplicitHash -) - - -from ..default_tools import RETURNN_COMMON - - -def get_nn_args(num_outputs: int = 12001, num_epochs: int = 250, use_rasr_returnn_training=True, **net_kwargs): - evaluation_epochs = list(range(num_epochs, num_epochs + 1, 10)) - - returnn_configs = get_rc_returnn_configs( - num_inputs=50, num_outputs=num_outputs, batch_size=5000, - evaluation_epochs=evaluation_epochs, - ) - - returnn_recog_configs = get_rc_returnn_configs( - num_inputs=50, num_outputs=num_outputs, batch_size=5000, - evaluation_epochs=evaluation_epochs, - recognition=True, - ) - - - training_args = { - "log_verbosity": 5, - "num_epochs": num_epochs, - "save_interval": 1, - "keep_epochs": None, - "time_rqmt": 168, - "mem_rqmt": 8, - "cpu_rqmt": 3, - } - - if use_rasr_returnn_training: - training_args["num_classes"] = num_outputs - training_args["use_python_control"] = False - training_args["partition_epochs"] = {"train": 40, "dev": 20} - - recognition_args = { - "dev-other": { - "epochs": evaluation_epochs, - "feature_flow_key": "gt", - "prior_scales": [0.3], - "pronunciation_scales": [6.0], - "lm_scales": [20.0], - "lm_lookahead": True, - "lookahead_options": None, - "create_lattice": True, - "eval_single_best": True, - "eval_best_in_lattice": True, - "search_parameters": { - "beam-pruning": 12.0, - "beam-pruning-limit": 100000, - "word-end-pruning": 0.5, - "word-end-pruning-limit": 15000, - }, - "lattice_to_ctm_kwargs": { - "fill_empty_segments": True, - "best_path_algo": "bellman-ford", - }, - "optimize_am_lm_scale": True, - "rtf": 50, - "mem": 8, - "lmgc_mem": 16, - "cpu": 4, - "parallelize_conversion": True, - "use_epoch_for_compile": True, - "native_ops": ["NativeLstm2"], - }, - } - test_recognition_args = None - - nn_args = HybridArgs( - returnn_training_configs=returnn_configs, - returnn_recognition_configs=returnn_recog_configs, - training_args=training_args, - recognition_args=recognition_args, - 
test_recognition_args=test_recognition_args, - ) - - return nn_args - - -def get_default_data_init_args(num_inputs: int, num_outputs: int): - """ - default for this hybrid model - - :param num_inputs: - :param num_outputs: - :return: - """ - time_dim = DimInitArgs("data_time", dim=None) - data_feature = DimInitArgs("data_feature", dim=num_inputs, is_feature=True) - classes_feature = DimInitArgs("classes_feature", dim=num_outputs, is_feature=True) - - return [ - DataInitArgs(name="data", available_for_inference=True, dim_tags=[time_dim, data_feature], sparse_dim=None), - DataInitArgs(name="classes", available_for_inference=False, dim_tags=[time_dim], sparse_dim=classes_feature) - ] - - - -def get_rc_returnn_configs( - num_inputs: int, num_outputs: int, batch_size: int, evaluation_epochs: List[int], - recognition=False -): - # ******************** blstm base ******************** - - base_config = { - } - base_post_config = { - "use_tensorflow": True, - "debug_print_layer_output_template": True, - "log_batch_size": True, - "tf_log_memory_usage": True, - "cache_size": "0", - - } - - blstm_base_config = copy.deepcopy(base_config) - blstm_base_config.update( - { - "behavior_version": 15, - "batch_size": batch_size, # {"classes": batch_size, "data": batch_size}, - "chunking": "50:25", - "optimizer": {"class": "nadam", "epsilon": 1e-8}, - "gradient_noise": 0.3, - "learning_rates": list(np.linspace(2.5e-5, 3e-4, 50)) + list(np.linspace(3e-4, 2.5e-5, 50)), - "learning_rate_control": "newbob_multi_epoch", - "learning_rate_control_min_num_epochs_per_new_lr": 3, - "learning_rate_control_relative_error_relative_lr": True, - #"min_learning_rate": 1e-5, - "newbob_learning_rate_decay": 0.707, - "newbob_multi_num_epochs": 40, - "newbob_multi_update_interval": 1, - } - ) - if not recognition: - base_post_config["cleanup_old_models"] = { - "keep_last_n": 5, - "keep_best_n": 5, - "keep": evaluation_epochs, - } - - rc_extern_data = 
ExternData(extern_data=get_default_data_init_args(num_inputs=num_inputs, num_outputs=num_outputs)) - - # those are hashed - rc_package = "i6_experiments.users.rossenbach.experiments.librispeech.librispeech_100_hybrid.rc_networks" - rc_construction_code = Import(rc_package + ".default_hybrid_v2.construct_hybrid_network") - - - net_kwargs = { - "train": not recognition, - "num_layers": 8, - "size": 1024, - "dropout": 0.0, - "specaugment_options": { - "min_frame_masks": 1, - "mask_each_n_frames": 100, - "max_frames_per_mask": 20, - "min_feature_masks": 1, - "max_feature_masks": 2, - "max_features_per_mask": 10 - } - } - - net_kwargs_focal_loss = copy.deepcopy(net_kwargs) - net_kwargs_focal_loss["focal_loss_scale"] = 2.0 - - def construct_from_net_kwargs(base_config, net_kwargs, explicit_hash=None): - rc_network = Network( - net_func_name=rc_construction_code.object_name, - net_func_map={ - "audio_data": "data", # name of the constructor parameter vs name of the data object in RETURNN - "label_data": "classes" - }, # this is hashed - net_kwargs=net_kwargs, - ) - serializer_objects = [ - rc_extern_data, - rc_construction_code, - rc_network, - ] - if explicit_hash: - serializer_objects.append(ExplicitHash(explicit_hash)) - serializer = Collection( - serializer_objects=serializer_objects, - returnn_common_root=RETURNN_COMMON, - make_local_package_copy=True, - packages={ - rc_package, - }, - ) - - blstm_base_returnn_config = ReturnnConfig( - config=base_config, - post_config=base_post_config, - python_epilog=[serializer], - pprint_kwargs={"sort_dicts": False}, - ) - return blstm_base_returnn_config - - return { - "blstm_oclr_v1": construct_from_net_kwargs(blstm_base_config, net_kwargs), - "blstm_oclr_v1_focal_loss": construct_from_net_kwargs(blstm_base_config, net_kwargs_focal_loss), - } diff --git a/users/rilling/experiments/librispeech/librispeech_100_hybrid/data.py b/users/rilling/experiments/librispeech/librispeech_100_hybrid/data.py deleted file mode 100644 index 
0f4a7d88b..000000000 --- a/users/rilling/experiments/librispeech/librispeech_100_hybrid/data.py +++ /dev/null @@ -1,304 +0,0 @@ -from i6_core import corpus as corpus_recipe -from i6_core import text - -from i6_experiments.common.setups.rasr.gmm_system import GmmSystem - -from .default_tools import RETURNN_EXE, RETURNN_RC_ROOT - - -def get_corpus_data_inputs(gmm_system): - """ - - :param GmmSystem gmm_system: - :return: - """ - - - train_corpus_path = gmm_system.corpora["train-clean-100"].corpus_file - total_train_num_segments = 28539 - cv_size = 1000 / total_train_num_segments - - all_segments = corpus_recipe.SegmentCorpusJob( - train_corpus_path, 1 - ).out_single_segment_files[1] - - splitted_segments_job = corpus_recipe.ShuffleAndSplitSegmentsJob( - all_segments, {"train": 1 - cv_size, "cv": cv_size} - ) - train_segments = splitted_segments_job.out_segments["train"] - cv_segments = splitted_segments_job.out_segments["cv"] - devtrain_segments = text.TailJob( - train_segments, num_lines=1000, zip_output=False - ).out - - # ******************** NN Init ******************** - - nn_train_data = gmm_system.outputs["train-clean-100"][ - "final" - ].as_returnn_rasr_data_input(shuffle_data=True) - nn_train_data.update_crp_with(segment_path=train_segments, concurrent=1) - nn_train_data_inputs = { - "train-clean-100.train": nn_train_data, - } - - nn_cv_data = gmm_system.outputs["train-clean-100"][ - "final" - ].as_returnn_rasr_data_input() - nn_cv_data.update_crp_with(segment_path=cv_segments, concurrent=1) - nn_cv_data_inputs = { - "train-clean-100.cv": nn_cv_data, - } - - nn_devtrain_data = gmm_system.outputs["train-clean-100"][ - "final" - ].as_returnn_rasr_data_input() - nn_devtrain_data.update_crp_with(segment_path=devtrain_segments, concurrent=1) - nn_devtrain_data_inputs = { - "train-clean-100.devtrain": nn_devtrain_data, - } - nn_dev_data_inputs = { - # "dev-clean": gmm_system.outputs["dev-clean"][ - # "final" - # ].as_returnn_rasr_data_input(), - "dev-other": 
gmm_system.outputs["dev-other"][ - "final" - ].as_returnn_rasr_data_input(), - } - nn_test_data_inputs = { - # "test-clean": gmm_system.outputs["test-clean"][ - # "final" - # ].as_returnn_rasr_data_input(), - # "test-other": gmm_system.outputs["test-other"][ - # "final" - # ].as_returnn_rasr_data_input(), - } - - - return nn_train_data_inputs, nn_cv_data_inputs, nn_devtrain_data_inputs, nn_dev_data_inputs, nn_test_data_inputs - - -def get_corpus_data_inputs_newcv(gmm_system): - """ - - :param GmmSystem gmm_system: - :return: - """ - train_corpus_path = gmm_system.corpora["train-clean-100"].corpus_file - total_train_num_segments = 28539 - cv_size = 300 / total_train_num_segments - - all_segments = corpus_recipe.SegmentCorpusJob( - train_corpus_path, 1 - ).out_single_segment_files[1] - - splitted_segments_job = corpus_recipe.ShuffleAndSplitSegmentsJob( - all_segments, {"train": 1 - cv_size, "cv": cv_size} - ) - train_segments = splitted_segments_job.out_segments["train"] - cv_segments = splitted_segments_job.out_segments["cv"] - devtrain_segments = text.TailJob( - train_segments, num_lines=1000, zip_output=False - ).out - - # ******************** NN Init ******************** - - nn_train_data = gmm_system.outputs["train-clean-100"][ - "final" - ].as_returnn_rasr_data_input(shuffle_data=True) - nn_train_data.update_crp_with(segment_path=train_segments, concurrent=1) - nn_train_data_inputs = { - "train-clean-100.train": nn_train_data, - } - - nn_cv_data = gmm_system.outputs["train-clean-100"][ - "final" - ].as_returnn_rasr_data_input() - nn_cv_data.update_crp_with(segment_path=cv_segments, concurrent=1) - nn_cv_data_inputs = { - "train-clean-100.cv": nn_cv_data, - } - - nn_devtrain_data = gmm_system.outputs["train-clean-100"][ - "final" - ].as_returnn_rasr_data_input() - nn_devtrain_data.update_crp_with(segment_path=devtrain_segments, concurrent=1) - nn_devtrain_data_inputs = { - "train-clean-100.devtrain": nn_devtrain_data, - } - nn_dev_data_inputs = { - # 
"dev-clean": gmm_system.outputs["dev-clean"][ - # "final" - # ].as_returnn_rasr_data_input(), - "dev-other": gmm_system.outputs["dev-other"][ - "final" - ].as_returnn_rasr_data_input(), - } - nn_test_data_inputs = { - # "test-clean": gmm_system.outputs["test-clean"][ - # "final" - # ].as_returnn_rasr_data_input(), - # "test-other": gmm_system.outputs["test-other"][ - # "final" - # ].as_returnn_rasr_data_input(), - } - - - return nn_train_data_inputs, nn_cv_data_inputs, nn_devtrain_data_inputs, nn_dev_data_inputs, nn_test_data_inputs - - -def get_corpus_data_inputs_newcv_hdf(gmm_system): - """ - - :param GmmSystem gmm_system: - :return: - """ - from i6_experiments.common.setups.rasr.util.nn import SingleHdfDataInput - train_corpus_path = gmm_system.corpora["train-clean-100"].corpus_file - total_train_num_segments = 28539 - cv_size = 300 / total_train_num_segments - - all_segments = corpus_recipe.SegmentCorpusJob( - train_corpus_path, 1 - ).out_single_segment_files[1] - - splitted_segments_job = corpus_recipe.ShuffleAndSplitSegmentsJob( - all_segments, {"train": 1 - cv_size, "cv": cv_size} - ) - train_segments = splitted_segments_job.out_segments["train"] - cv_segments = splitted_segments_job.out_segments["cv"] - devtrain_segments = text.TailJob( - train_segments, num_lines=300, zip_output=False - ).out - - # ******************** NN Init ******************** - - nn_train_data = gmm_system.outputs["train-clean-100"][ - "final" - ].as_returnn_rasr_data_input(shuffle_data=True) - nn_train_data.update_crp_with(segment_path=train_segments, concurrent=1) - nn_train_data_inputs = { - "train-clean-100.train": SingleHdfDataInput.from_returnn_rasr_data( - nn_train_data, - feature_flow_key="uncached_gt", - returnn_python_exe=RETURNN_EXE, - returnn_root=RETURNN_RC_ROOT, - parition_epoch=10, - seq_ordering="laplace:.1000" - ) - } - - nn_cv_data = gmm_system.outputs["train-clean-100"][ - "final" - ].as_returnn_rasr_data_input() - 
nn_cv_data.update_crp_with(segment_path=cv_segments, concurrent=1) - nn_cv_data_inputs = { - "train-clean-100.cv": SingleHdfDataInput.from_returnn_rasr_data( - nn_cv_data, - feature_flow_key="uncached_gt", - returnn_python_exe=RETURNN_EXE, - returnn_root=RETURNN_RC_ROOT, - parition_epoch=1, - seq_ordering="sorted" - ) - } - - nn_devtrain_data = gmm_system.outputs["train-clean-100"][ - "final" - ].as_returnn_rasr_data_input() - nn_devtrain_data.update_crp_with(segment_path=devtrain_segments, concurrent=1) - nn_devtrain_data_inputs = { - "train-clean-100.devtrain": SingleHdfDataInput.from_returnn_rasr_data( - nn_devtrain_data, - feature_flow_key="uncached_gt", - returnn_python_exe=RETURNN_EXE, - returnn_root=RETURNN_RC_ROOT, - parition_epoch=1, - seq_ordering="sorted" - ), - } - - nn_dev_data_inputs = { - # "dev-clean": gmm_system.outputs["dev-clean"][ - # "final" - # ].as_returnn_rasr_data_input(), - "dev-other": gmm_system.outputs["dev-other"][ - "final" - ].as_returnn_rasr_data_input(), - } - nn_test_data_inputs = { - # "test-clean": gmm_system.outputs["test-clean"][ - # "final" - # ].as_returnn_rasr_data_input(), - # "test-other": gmm_system.outputs["test-other"][ - # "final" - # ].as_returnn_rasr_data_input(), - } - - - return nn_train_data_inputs, nn_cv_data_inputs, nn_devtrain_data_inputs, nn_dev_data_inputs, nn_test_data_inputs - - -def get_corpus_data_inputs_devcv(gmm_system): - """ - - :param GmmSystem gmm_system: - :return: - """ - train_corpus_path = gmm_system.corpora["train-clean-100"].corpus_file - dev_corpus_path = gmm_system.corpora["dev-other"].corpus_file - total_train_num_segments = 28539 - devtrain_size = 300 / total_train_num_segments - - all_segments = corpus_recipe.SegmentCorpusJob( - train_corpus_path, 1 - ).out_single_segment_files[1] - splitted_segments_job = corpus_recipe.ShuffleAndSplitSegmentsJob( - all_segments, {"_": 1 - devtrain_size, "devtrain": devtrain_size} - ) - devtrain_segments = splitted_segments_job.out_segments["devtrain"] - 
- - # ******************** NN Init ******************** - - nn_train_data = gmm_system.outputs["train-clean-100"][ - "final" - ].as_returnn_rasr_data_input(shuffle_data=True) - nn_train_data.update_crp_with(concurrent=1) - nn_train_data_inputs = { - "train-clean-100.train": nn_train_data, - } - - nn_cv_data = gmm_system.outputs["dev-other"][ - "final" - ].as_returnn_rasr_data_input() - nn_cv_data.update_crp_with(concurrent=1) - nn_cv_data_inputs = { - "train-clean-100.cv": nn_cv_data, - } - - nn_devtrain_data = gmm_system.outputs["train-clean-100"][ - "final" - ].as_returnn_rasr_data_input() - nn_devtrain_data.update_crp_with(segment_path=devtrain_segments, concurrent=1) - nn_devtrain_data_inputs = { - "train-clean-100.devtrain": nn_devtrain_data, - } - nn_dev_data_inputs = { - # "dev-clean": gmm_system.outputs["dev-clean"][ - # "final" - # ].as_returnn_rasr_data_input(), - "dev-other": gmm_system.outputs["dev-other"][ - "final" - ].as_returnn_rasr_data_input(), - } - nn_test_data_inputs = { - # "test-clean": gmm_system.outputs["test-clean"][ - # "final" - # ].as_returnn_rasr_data_input(), - # "test-other": gmm_system.outputs["test-other"][ - # "final" - # ].as_returnn_rasr_data_input(), - } - - - return nn_train_data_inputs, nn_cv_data_inputs, nn_devtrain_data_inputs, nn_dev_data_inputs, nn_test_data_inputs \ No newline at end of file diff --git a/users/rilling/experiments/librispeech/librispeech_100_hybrid/default_tools.py b/users/rilling/experiments/librispeech/librispeech_100_hybrid/default_tools.py deleted file mode 100644 index d769bd62a..000000000 --- a/users/rilling/experiments/librispeech/librispeech_100_hybrid/default_tools.py +++ /dev/null @@ -1,26 +0,0 @@ -from sisyphus import tk -from i6_core.tools.git import CloneGitRepositoryJob -from i6_experiments.common.tools.rasr import compile_rasr_binaries_i6mode - - -PACKAGE = __package__ - -#RASR_BINARY_PATH = compile_rasr_binaries_i6mode(commit="907eec4f4e36c11153f6ab6b5dd7675116f909f6") -RASR_BINARY_PATH = 
compile_rasr_binaries_i6mode(branch="bene_unpushed_assert") -RASR_BINARY_PATH.hash_overwrite = "LIBRISPEECH_DEFAULT_RASR_BINARY_PATH" - -RASR_BINARY_PATH_U22 = tk.Path("/work/asr4/rossenbach/neon_test/rasr_versions/rasr_no_tf/arch/linux-x86_64-standard/") -RASR_BINARY_PATH_U22.hash_overwrite = "LIBRISPEECH_DEFAULT_RASR_BINARY_PATH" - -RASR_BINARY_PATH_APPTAINER = tk.Path("/work/asr4/rossenbach/rescale/pytorch_mixed_precision/onnx_extended_rasr/arch/linux-x86_64-standard") -RASR_BINARY_PATH_APPTAINER.hash_overwrite = "LIBRISPEECH_DEFAULT_RASR_BINARY_PATH" - -RETURNN_EXE = tk.Path("/u/rossenbach/bin/returnn/returnn_tf2.3.4_mkl_launcher.sh", hash_overwrite="GENERIC_RETURNN_LAUNCHER") -RETURNN_DATA_ROOT = CloneGitRepositoryJob("https://github.com/rwth-i6/returnn", - commit="37ba06ab2697e7af4de96037565fdf4f78acdb80").out_repository - -RETURNN_RC_ROOT = CloneGitRepositoryJob("https://github.com/rwth-i6/returnn", commit="ac52150e2309e00d55af70e879c176c6400cc536").out_repository -RETURNN_RC_ROOT.hash_overwrite = "LIBRISPEECH_DEFAULT_RETURNN_RC_ROOT" - -RETURNN_COMMON = CloneGitRepositoryJob("https://github.com/rwth-i6/returnn_common", commit="e3083fac1899bb764710ca46ff9257247e4e6b14", checkout_folder_name="returnn_common").out_repository -RETURNN_COMMON.hash_overwrite = "LIBRISPEECH_DEFAULT_RETURNN_COMMON" diff --git a/users/rilling/experiments/librispeech/librispeech_100_hybrid/experiments.py b/users/rilling/experiments/librispeech/librispeech_100_hybrid/experiments.py deleted file mode 100644 index 96782b6d6..000000000 --- a/users/rilling/experiments/librispeech/librispeech_100_hybrid/experiments.py +++ /dev/null @@ -1,86 +0,0 @@ -import copy -from sisyphus import tk, gs - -from i6_core.tools.git import CloneGitRepositoryJob - -from i6_experiments.common.setups.rasr.util import RasrSteps -from i6_experiments.common.setups.rasr.hybrid_system import HybridSystem -from i6_experiments.common.baselines.librispeech.default_tools import RASR_BINARY_PATH - - -from .data import 
get_corpus_data_inputs -from .configs.legacy_baseline import get_nn_args, get_feature_extraction_args -from .configs.rc_baseline import get_nn_args as get_rc_nn_args -from .configs.pytorch_baseline import get_nn_args as get_pytorch_nn_args -from .default_tools import RETURNN_RC_ROOT, RASR_BINARY_PATH_APPTAINER - -from .data import get_corpus_data_inputs_newcv, get_corpus_data_inputs_newcv_hdf - - -def run_gmm_system(): - from .gmm_baseline import run_librispeech_100_common_baseline - - system = run_librispeech_100_common_baseline(extract_additional_rasr_features=get_feature_extraction_args()) - return system - - -def run_gmm_system_v2(): - from .gmm_baseline import run_librispeech_100_common_baseline - - system = run_librispeech_100_common_baseline( - extract_additional_rasr_features=get_feature_extraction_args(fix_features_output=True) - ) - return system - - -def run_hybrid_baseline_pytorch(): - from i6_experiments.users.rossenbach.common_setups.rasr.pytorch_onnx_hybrid_system import PyTorchOnnxHybridSystem - - gs.ALIAS_AND_OUTPUT_SUBDIR = "experiments/librispeech/librispeech_100_hybrid/common_baseline_pytorch" - - gmm_system = run_gmm_system() - rasr_init_args = copy.deepcopy(gmm_system.rasr_init_args) - rasr_init_args.feature_extraction_args = get_feature_extraction_args() - ( - nn_train_data_inputs, - nn_cv_data_inputs, - nn_devtrain_data_inputs, - nn_dev_data_inputs, - nn_test_data_inputs, - ) = get_corpus_data_inputs_newcv(gmm_system) - - nn_args = get_pytorch_nn_args(num_epochs=125, debug=True) - nn_args.training_args["partition_epochs"] = {"train": 10, "dev": 1} - nn_steps = RasrSteps() - nn_steps.add_step("nn", nn_args) - - # ******************** NN System ******************** - - # image only, so just python3 - returnn_exe = tk.Path("/usr/bin/python3", hash_overwrite="GENERIC_RETURNN_LAUNCHER") - - # returnn_root = tk.Path("/u/rossenbach/src/returnn", hash_overwrite="LIBRISPEECH_DEFAULT_RETURNN_ROOT") - returnn_root_experimental = tk.Path( - 
"/u/lukas.rilling/github/MiniReturnn", hash_overwrite="LIBRISPEECH_DEFAULT_MINIRETURNN_ROOT" - ) - lbs_nn_system = PyTorchOnnxHybridSystem( - returnn_root=returnn_root_experimental, - returnn_python_exe=returnn_exe, - rasr_arch="linux-x86_64-standard", - rasr_binary_path=RASR_BINARY_PATH_APPTAINER, - ) - # manually override RASR binary for trainer - # lbs_nn_system.crp["base"].nn_trainer_exe = RASR_BINARY_PATH_U22.join_right(f"nn-trainer.linux-x86_64-standard") - lbs_nn_system.init_system( - rasr_init_args=rasr_init_args, - train_data=nn_train_data_inputs, - cv_data=nn_cv_data_inputs, - devtrain_data=nn_devtrain_data_inputs, - dev_data=nn_dev_data_inputs, - test_data=nn_test_data_inputs, - train_cv_pairing=[tuple(["train-clean-100.train", "train-clean-100.cv"])], - ) - - lbs_nn_system.run(nn_steps) - - gs.ALIAS_AND_OUTPUT_SUBDIR = "" diff --git a/users/rilling/experiments/librispeech/librispeech_100_hybrid/gmm_baseline.py b/users/rilling/experiments/librispeech/librispeech_100_hybrid/gmm_baseline.py deleted file mode 100644 index 510f8db3e..000000000 --- a/users/rilling/experiments/librispeech/librispeech_100_hybrid/gmm_baseline.py +++ /dev/null @@ -1,77 +0,0 @@ -""" -Definition of the pipeline in terms of inputs and steps that are executed -""" -from sisyphus import gs - -from typing import Any, Dict, Optional - -from i6_experiments.common.setups.rasr import gmm_system -from i6_experiments.common.setups.rasr.util import RasrSteps, OutputArgs - -from i6_experiments.common.baselines.librispeech.ls100.gmm import baseline_args -from i6_experiments.common.baselines.librispeech.data import get_corpus_data_inputs - -from i6_experiments.common.baselines.librispeech.default_tools import RASR_BINARY_PATH - - -def run_librispeech_100_common_baseline( - alias_prefix="baselines/librispeech/ls100/gmm/common_baseline", - extract_additional_rasr_features: Optional[Dict[str, Any]] = None, -) -> gmm_system.GmmSystem: - """ - - :param alias_prefix: - :param 
extract_additional_rasr_features: a dict of : , - used to extract additional RASR-based features to be used on subsequent systems such as Hybrid - """ - - stored_alias_subdir = gs.ALIAS_AND_OUTPUT_SUBDIR - gs.ALIAS_AND_OUTPUT_SUBDIR = alias_prefix - - rasr_init_args = baseline_args.get_init_args() - - if extract_additional_rasr_features is not None: - for feature_key, feature_options in extract_additional_rasr_features.items(): - rasr_init_args.feature_extraction_args[feature_key] = feature_options - - mono_args = baseline_args.get_monophone_args() - # no unknown question needed when G2P is used - cart_args = baseline_args.get_cart_args(add_unknown=False) - tri_args = baseline_args.get_triphone_args() - vtln_args = baseline_args.get_vtln_args() - sat_args = baseline_args.get_sat_args() - vtln_sat_args = baseline_args.get_vtln_sat_args() - - final_output_args = OutputArgs("final") - final_output_args.define_corpus_type("train-clean-100", "train") - final_output_args.define_corpus_type("dev-clean", "dev") - final_output_args.define_corpus_type("dev-other", "dev") - - if extract_additional_rasr_features: - for feature_key in extract_additional_rasr_features.keys(): - final_output_args.add_feature_to_extract(feature_key) - - steps = RasrSteps() - steps.add_step("extract", rasr_init_args.feature_extraction_args) - steps.add_step("mono", mono_args) - steps.add_step("cart", cart_args) - steps.add_step("tri", tri_args) - steps.add_step("vtln", vtln_args) - steps.add_step("sat", sat_args) - steps.add_step("vtln+sat", vtln_sat_args) - steps.add_step("output", final_output_args) - - corpus_data = get_corpus_data_inputs(corpus_key="train-clean-100", use_g2p_training=True, use_stress_marker=False) - - system = gmm_system.GmmSystem(rasr_binary_path=RASR_BINARY_PATH) - system.init_system( - rasr_init_args=rasr_init_args, - train_data=corpus_data.train_data, - dev_data=corpus_data.dev_data, - test_data=corpus_data.test_data, - ) - system.run(steps) - - gs.ALIAS_AND_OUTPUT_SUBDIR = 
stored_alias_subdir - - return system \ No newline at end of file diff --git a/users/rilling/experiments/librispeech/librispeech_100_hybrid/lm/__init__.py b/users/rilling/experiments/librispeech/librispeech_100_hybrid/lm/__init__.py deleted file mode 100644 index 6b28960dd..000000000 --- a/users/rilling/experiments/librispeech/librispeech_100_hybrid/lm/__init__.py +++ /dev/null @@ -1,9 +0,0 @@ -from i6_experiments.common.datasets.librispeech.language_model import get_librispeech_normalized_lm_data -from i6_experiments.common.datasets.librispeech.vocab import get_lm_vocab - - -def test_train_lm(): - pass - # lm_index_vocab = get_lm_vocab() - # lm_training_data = get_librispeech_normalized_lm_data() - diff --git a/users/rilling/experiments/librispeech/librispeech_100_hybrid/lm/config.py b/users/rilling/experiments/librispeech/librispeech_100_hybrid/lm/config.py deleted file mode 100644 index 541d27bb7..000000000 --- a/users/rilling/experiments/librispeech/librispeech_100_hybrid/lm/config.py +++ /dev/null @@ -1,92 +0,0 @@ -from i6_core.returnn.config import ReturnnConfig - -from .data import TrainingDatasets, SOURCE_DATASTREAM_KEY, TARGET_DATASTREAM_KEY - -# changing these does not change the hash -post_config = { - 'use_tensorflow': True, - 'tf_log_memory_usage': True, - 'cleanup_old_models': True, - 'log_batch_size': True, - 'debug_print_layer_output_template': True, - 'debug_mode': False, - 'batching': 'random' -} - -CF_CODE = """ -import os -from subprocess import check_output - -_cf_cache = {} - -def cf(filename): - if filename in _cf_cache: - return _cf_cache[filename] - cached_fn = check_output(["cf", filename]).strip().decode("utf8") - assert os.path.exists(cached_fn) - _cf_cache[filename] = cached_fn - return cached_fn -""" - -STACK_CODE = """ -# https://github.com/rwth-i6/returnn/issues/957 -# https://stackoverflow.com/a/16248113/133374 -import resource -import sys -try: - resource.setrlimit(resource.RLIMIT_STACK, (2 ** 29, -1)) -except Exception as exc: - 
print(f"resource.setrlimit {type(exc).__name__}: {exc}") -sys.setrecursionlimit(10 ** 6) -""" - - -def get_training_config(datasets: TrainingDatasets): - config = { - "batch_size": 900, - "max_seq_length": 602, - "max_seqs": 32, - "chunking": "0", - "calculate_exp_loss": True, - "gradient_clip_global_norm": 2.0, - "gradient_noise": 0., - "learning_rate": 1., - "learning_rate_control": "newbob_rel", - "learning_rate_control_relative_error_relative_lr": False, - "newbob_learning_rate_decay": 0.8, - "newbob_relative_error_threshold": 0, - "learning_rate_control_error_measure": "dev_score_output:exp", - ############# - "train": datasets.train.as_returnn_opts(), - "dev": datasets.cv.as_returnn_opts(), - } - - config["network"] = { - "input": {"class": "linear", "n_out": 128, "activation": "identity", - "forward_weights_init": "random_normal_initializer(mean=0.0, stddev=0.1)", - "from": [f"data:{SOURCE_DATASTREAM_KEY}"]}, - "lstm0": {"class": "rec", "unit": "lstm", - "forward_weights_init" : "random_normal_initializer(mean=0.0, stddev=0.1)", - "recurrent_weights_init": "random_normal_initializer(mean=0.0, stddev=0.1)", - "bias_init": "random_normal_initializer(mean=0.0, stddev=0.1)", - "n_out": 4096, "dropout": 0.2, "L2": 0.0, "direction": 1, "from": ["input"]}, - "lstm1": {"class": "rec", "unit": "lstm", - "forward_weights_init" : "random_normal_initializer(mean=0.0, stddev=0.1)", - "recurrent_weights_init": "random_normal_initializer(mean=0.0, stddev=0.1)", - "bias_init": "random_normal_initializer(mean=0.0, stddev=0.1)", - "n_out": 4096, "dropout": 0.2, "L2": 0.0, "direction": 1, "from": ["lstm0"]}, - "output": {"class": "softmax", "dropout": 0.2, "use_transposed_weights": True, - "loss_opts": {'num_sampled': 16384, 'use_full_softmax': True, 'nce_loss': False}, - "forward_weights_init": "random_normal_initializer(mean=0.0, stddev=0.1)", - "bias_init": "random_normal_initializer(mean=0.0, stddev=0.1)", - "loss": "sampling_loss", "target": 
f"data:{TARGET_DATASTREAM_KEY}", "from": ["lstm1"]} - } - - returnn_config = ReturnnConfig( - config=config, - post_config=post_config, - python_prolog=[CF_CODE, STACK_CODE], - python_prolog_hash="", - ) - - return returnn_config diff --git a/users/rilling/experiments/librispeech/librispeech_100_hybrid/lm/data.py b/users/rilling/experiments/librispeech/librispeech_100_hybrid/lm/data.py deleted file mode 100644 index c88cfc87f..000000000 --- a/users/rilling/experiments/librispeech/librispeech_100_hybrid/lm/data.py +++ /dev/null @@ -1,154 +0,0 @@ -from sisyphus import tk -from sisyphus.delayed_ops import DelayedFormat - -from dataclasses import dataclass -from typing import Any, Dict, Optional, Union - -from i6_core.corpus.convert import CorpusToTxtJob -from i6_core.text.processing import ConcatenateJob -from i6_core.returnn.config import CodeWrapper - -from i6_experiments.common.datasets.librispeech import get_bliss_corpus_dict -from i6_experiments.common.datasets.librispeech.language_model import get_librispeech_normalized_lm_data -from i6_experiments.common.datasets.librispeech.vocab import get_lm_vocab - -from returnn_common.datasets import Dataset, ControlDataset -from i6_experiments.users.rossenbach.common_setups.returnn.datastreams.vocabulary import LmLabelDatastream - - -SOURCE_DATASTREAM_KEY = "delayed" -TARGET_DATASTREAM_KEY = "data" - - -@dataclass(frozen=True) -class TrainingDatasets: - train: Dataset - cv: Dataset - devtrain: Dataset - extern_data: Dict[str, Dict[str, Any]] - - -class LmDataset(ControlDataset): - - def __init__( - self, - *, - corpus_file: tk.Path, - vocab_file: tk.Path, - unknown_symbol: Union[str, tk.Variable] = "", - auto_replace_unknown_symbol: bool = False, - # super parameters - partition_epoch: Optional[int] = None, - segment_file: Optional[tk.Path] = None, - seq_ordering: Optional[str] = None, - random_subset: Optional[int] = None, - additional_options: Optional[Dict] = None, - ): - super().__init__( - 
partition_epoch=partition_epoch, - segment_file=segment_file, - seq_ordering=seq_ordering, - random_subset=random_subset, - additional_options=additional_options - ) - - self.corpus_file = corpus_file - self.vocab_file = vocab_file - self.unknown_symbol = unknown_symbol - self.auto_replace_unknown_symbol = auto_replace_unknown_symbol - - def as_returnn_opts(self) -> Dict[str, Any]: - d = { - "class": "LmDataset", - "corpus_file": CodeWrapper(DelayedFormat('lambda: cf("{}")', self.corpus_file)), - "orth_symbols_map_file": self.vocab_file, - "orth_replace_map_file": "", - "word_based": True, - "seq_end_symbol": "", - "auto_replace_unknown_symbol": self.auto_replace_unknown_symbol, - "unknown_symbol": self.unknown_symbol, - "add_delayed_seq_data": True, - "delayed_seq_data_start_symbol": "", - } - sd = super().as_returnn_opts() - assert all([k not in sd.keys() for k in d.keys()]), ( - "conflicting keys in %s and %s" - % (str(list(sd.keys())), str(list(d.keys()))), - ) - d.update(sd) - - return d - - -def build_training_data(output_prefix="", partition_epoch=20): - # conversion factor for PPL computation is 1.448 - ls_bliss_corpus_dict = get_bliss_corpus_dict() - lm_vocab = get_lm_vocab(output_prefix=output_prefix) - label_datastream = LmLabelDatastream( - available_for_inference=True, - lm_index_vocab=lm_vocab - ) - - #### Training Data #### - - lm_data = get_librispeech_normalized_lm_data() - ls_train_bliss = ls_bliss_corpus_dict["train-other-960"] - ls_train_text = CorpusToTxtJob( - bliss_corpus=ls_train_bliss, - gzip=True, - ).out_txt - full_train_text = ConcatenateJob( - text_files=[lm_data, ls_train_text], - zip_out=True, - ).out - - #### Dev Data #### - - dev_clean_text = CorpusToTxtJob(bliss_corpus=ls_bliss_corpus_dict["dev-clean"], gzip=True).out_txt - dev_other_text = CorpusToTxtJob(bliss_corpus=ls_bliss_corpus_dict["dev-other"], gzip=True).out_txt - cv_text = ConcatenateJob( - text_files=[dev_clean_text, dev_other_text], - zip_out=True, - ).out - - #### 
datasets #### - lm_train_dataset = LmDataset( - corpus_file=full_train_text, - vocab_file=lm_vocab.vocab, - unknown_symbol=lm_vocab.unknown_token, - auto_replace_unknown_symbol=True, - partition_epoch=partition_epoch, - segment_file=None, - seq_ordering="sort_bin_shuffle:.32" - ) - - lm_cv_dataset = LmDataset( - corpus_file=cv_text, - vocab_file=lm_vocab.vocab, - unknown_symbol=lm_vocab.unknown_token, - auto_replace_unknown_symbol=True, - partition_epoch=1, - segment_file=None, - seq_ordering="sorted" - ) - - lm_devtrain_dataset = LmDataset( - corpus_file=full_train_text, - vocab_file=lm_vocab.vocab, - unknown_symbol=lm_vocab.unknown_token, - auto_replace_unknown_symbol=True, - partition_epoch=1, - segment_file=None, - seq_ordering="sorted", - random_subset=3000, - ) - - return TrainingDatasets( - train=lm_train_dataset, - cv=lm_cv_dataset, - devtrain=lm_devtrain_dataset, - extern_data={ - SOURCE_DATASTREAM_KEY: label_datastream.as_returnn_extern_data_opts(available_for_inference=True), - TARGET_DATASTREAM_KEY: label_datastream.as_returnn_extern_data_opts(available_for_inference=False) - } - ) diff --git a/users/rilling/experiments/librispeech/librispeech_100_hybrid/lm/default_tools.py b/users/rilling/experiments/librispeech/librispeech_100_hybrid/lm/default_tools.py deleted file mode 100644 index 78a0ce27a..000000000 --- a/users/rilling/experiments/librispeech/librispeech_100_hybrid/lm/default_tools.py +++ /dev/null @@ -1,13 +0,0 @@ -from sisyphus import tk -from i6_core.tools.git import CloneGitRepositoryJob - - -RETURNN_EXE = tk.Path("/u/rossenbach/bin/returnn/returnn_tf2.3.4_mkl_launcher.sh", hash_overwrite="GENERIC_RETURNN_LAUNCHER") - -# RETURNN_ROOT = CloneGitRepositoryJob("https://github.com/rwth-i6/returnn", commit="45fad83c785a45fa4abfeebfed2e731dd96f960c").out_repository - -# run with LM vocab fix -RETURNN_ROOT = CloneGitRepositoryJob("https://github.com/rwth-i6/returnn", commit="9e7b7f24b090f4a4909c9be3e276194f026a6932").out_repository 
-RETURNN_ROOT.hash_overwrite = "LIBRISPEECH_DEFAULT_RETURNN_ROOT" - - diff --git a/users/rilling/experiments/librispeech/librispeech_100_hybrid/lm/experiment.py b/users/rilling/experiments/librispeech/librispeech_100_hybrid/lm/experiment.py deleted file mode 100644 index de06613e1..000000000 --- a/users/rilling/experiments/librispeech/librispeech_100_hybrid/lm/experiment.py +++ /dev/null @@ -1,36 +0,0 @@ -from sisyphus import tk - -from i6_core.returnn.training import ReturnnTrainingJob - -from .default_tools import RETURNN_EXE, RETURNN_ROOT -from .data import build_training_data -from .config import get_training_config - -def train(config, num_epochs=50): - default_rqmt = { - 'mem_rqmt': 15, - 'time_rqmt': 168, - 'log_verbosity': 5, - 'returnn_python_exe': RETURNN_EXE, - 'returnn_root': RETURNN_ROOT, - } - - train_job = ReturnnTrainingJob( - returnn_config=config, - num_epochs=num_epochs, - **default_rqmt - ) - return train_job - - -def test_train_lm(): - - exp_prefix = "experiments/librispeech/librispeech_100_hybrid/lm/kazuki_lstm_2x4k_samp16k_v2" - training_data = build_training_data(output_prefix=exp_prefix) - - config = get_training_config(training_data) - config.config["batch_size"] = 950 - train_job = train(config) - train_job.add_alias(exp_prefix + "/training") - tk.register_output(exp_prefix + "/learning_rates", train_job.out_learning_rates) - diff --git a/users/rilling/experiments/librispeech/librispeech_100_hybrid/pytorch_networks/__init__.py b/users/rilling/experiments/librispeech/librispeech_100_hybrid/pytorch_networks/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/users/rilling/experiments/librispeech/librispeech_100_hybrid/pytorch_networks/blstm8x1024.py b/users/rilling/experiments/librispeech/librispeech_100_hybrid/pytorch_networks/blstm8x1024.py deleted file mode 100644 index 0b7c34f08..000000000 --- a/users/rilling/experiments/librispeech/librispeech_100_hybrid/pytorch_networks/blstm8x1024.py +++ /dev/null @@ -1,94 
+0,0 @@ -import time -import torch -from torch import nn -from torch.onnx import export as onnx_export -from torchaudio.functional import mask_along_axis - -from returnn.torch.engine import TrainCtx - - -class Model(torch.nn.Module): - - def __init__(self): - super().__init__() - lstm_size = 1024 - target_size=12001 - self.blstm1 = nn.LSTM(input_size=50, hidden_size=lstm_size, bidirectional=True, batch_first=False) - self.blstm_stack = nn.LSTM(input_size=2*lstm_size, hidden_size=lstm_size, bidirectional=True, num_layers=7, batch_first=False) - self.final_linear = nn.Linear(2*lstm_size, target_size) - self.lstm_size = lstm_size - - def forward( - self, - audio_features: torch.Tensor, - audio_features_len: torch.Tensor, - ): - if self.training: - audio_features_time_masked = mask_along_axis(audio_features, mask_param=20, mask_value=0.0, axis=1) - audio_features_masked = mask_along_axis(audio_features_time_masked, mask_param=10, mask_value=0.0, axis=2) - else: - audio_features_masked = audio_features - blstm_in = torch.swapaxes(audio_features_masked, 0, 1) # [B, T, F] -> [T, B, F] - - blstm_packed_in = nn.utils.rnn.pack_padded_sequence(blstm_in, audio_features_len) - blstm_first, _ = self.blstm1(blstm_packed_in) - blstm_packed_out, _ = self.blstm_stack(blstm_first) - blstm_out, _ = nn.utils.rnn.pad_packed_sequence(blstm_packed_out, padding_value=0.0, batch_first=False) # [T, B, F] - - logits = self.final_linear(blstm_out) # [T, B, F] - logits_rasr_order = torch.permute(logits, dims=(1, 0, 2)) # RASR expects [B, T, F] - logits_ce_order = torch.permute(logits, dims=(1, 2, 0)) # CE expects [B, F, T] - log_probs = torch.log_softmax(logits_rasr_order, dim=2) - - return log_probs, logits_ce_order - - -scripted_model = None - -def train_step(*, model: Model, data, train_ctx, **_kwargs): - global scripted_model - audio_features = data["data"] - audio_features_len = data["data:seq_len"] - - audio_features_len, indices = torch.sort(audio_features_len, descending=True) - 
audio_features = audio_features[indices, :, :] - - phonemes = data["classes"][indices, :] - phonemes_len = data["classes:seq_len"][indices] - - if scripted_model is None: - scripted_model = torch.jit.script(model) - - log_probs, logits = model( - audio_features=audio_features, - audio_features_len=audio_features_len, - ) - - targets_packed = nn.utils.rnn.pack_padded_sequence(phonemes, phonemes_len, batch_first=True, enforce_sorted=False) - targets_masked, _ = nn.utils.rnn.pad_packed_sequence(targets_packed, batch_first=True, padding_value=-100) - - loss = nn.functional.cross_entropy(logits, targets_masked) - - train_ctx.mark_as_loss(name="CE", loss=loss) - - -def export(*, model: Model, model_filename: str): - scripted_model = torch.jit.optimize_for_inference(torch.jit.script(model.eval())) - dummy_data = torch.randn(1, 30, 50, device="cpu") - dummy_data_len, _ = torch.sort(torch.randint(low=10, high=30, size=(1,), device="cpu", dtype=torch.int32), descending=True) - onnx_export( - scripted_model, - (dummy_data, dummy_data_len), - f=model_filename, - verbose=True, - input_names=["data", "data_len"], - output_names=["classes"], - dynamic_axes={ - # dict value: manually named axes - "data": {0: "batch", 1: "time"}, - "data_len": {0: "batch"}, - "classes": {0: "batch", 1: "time"} - } - ) - - diff --git a/users/rilling/experiments/librispeech/librispeech_100_hybrid/pytorch_networks/blstm8x1024_custom_engine.py b/users/rilling/experiments/librispeech/librispeech_100_hybrid/pytorch_networks/blstm8x1024_custom_engine.py deleted file mode 100644 index f9d1bc455..000000000 --- a/users/rilling/experiments/librispeech/librispeech_100_hybrid/pytorch_networks/blstm8x1024_custom_engine.py +++ /dev/null @@ -1,121 +0,0 @@ -from random import random -import torch -import time -from typing import Dict, Optional -from torch import nn -from torch.onnx import export as onnx_export -from torchaudio.functional import mask_along_axis - -from returnn.torch.engine import Engine as 
TorchEngine -from returnn.util import NumbersDict -from returnn.log import log - - - -class Model(torch.nn.Module): - - def __init__(self): - super().__init__() - lstm_size = 1024 - target_size=12001 - self.blstm1 = nn.LSTM(input_size=50, hidden_size=lstm_size, bidirectional=True, batch_first=False) - self.blstm_stack = nn.LSTM(input_size=2*lstm_size, hidden_size=lstm_size, bidirectional=True, num_layers=7, batch_first=False) - self.final_linear = nn.Linear(2*lstm_size, target_size) - self.lstm_size = lstm_size - - def forward( - self, - audio_features: torch.Tensor, - audio_features_len: torch.Tensor, - ): - if self.training: - audio_features_time_masked = mask_along_axis(audio_features, mask_param=20, mask_value=0.0, axis=1) - audio_features_time_masked_2 = mask_along_axis(audio_features_time_masked, mask_param=20, mask_value=0.0, axis=1) - audio_features_masked = mask_along_axis(audio_features_time_masked_2, mask_param=10, mask_value=0.0, axis=2) - audio_features_masked_2 = mask_along_axis(audio_features_masked, mask_param=10, mask_value=0.0, axis=2) - else: - audio_features_masked_2 = audio_features - blstm_in = torch.swapaxes(audio_features_masked_2, 0, 1) # [B, T, F] -> [T, B, F] - - blstm_packed_in = nn.utils.rnn.pack_padded_sequence(blstm_in, audio_features_len) - blstm_first, _ = self.blstm1(blstm_packed_in) - blstm_packed_out, _ = self.blstm_stack(blstm_first) - blstm_out, _ = nn.utils.rnn.pad_packed_sequence(blstm_packed_out, padding_value=0.0, batch_first=False) # [T, B, F] - - logits = self.final_linear(blstm_out) # [T, B, F] - logits_rasr_order = torch.permute(logits, dims=(1, 0, 2)) # RASR expects [B, T, F] - logits_ce_order = torch.permute(logits, dims=(1, 2, 0)) # CE expects [B, F, T] - log_probs = torch.log_softmax(logits_rasr_order, dim=2) - - return log_probs, logits_ce_order - - -scripted_model = None - -def train_step(*, model: Model, data, run_ctx, **_kwargs): - global scripted_model - audio_features = data["data"] - audio_features_len = 
data["data:seq_len"] - - audio_features_len, indices = torch.sort(audio_features_len, descending=True) - audio_features = audio_features[indices, :, :] - - phonemes = data["classes"][indices, :] - phonemes_len = data["classes:seq_len"][indices] - - if scripted_model is None: - scripted_model = torch.jit.script(model) - - # distributed_model = DataParallel(model) - log_probs, logits = model( - audio_features=audio_features, - audio_features_len=audio_features_len, - ) - - targets_packed = nn.utils.rnn.pack_padded_sequence(phonemes, phonemes_len, batch_first=True, enforce_sorted=False) - targets_masked, _ = nn.utils.rnn.pad_packed_sequence(targets_packed, batch_first=True, padding_value=-100) - - loss = nn.functional.cross_entropy(logits, targets_masked) - - run_ctx.mark_as_loss(name="ce", loss=loss) - - -def export(*, model: Model, model_filename: str): - scripted_model = torch.jit.optimize_for_inference(torch.jit.script(model.eval())) - dummy_data = torch.randn(1, 30, 50, device="cpu") - dummy_data_len, _ = torch.sort(torch.randint(low=10, high=30, size=(1,), device="cpu", dtype=torch.int32), descending=True) - onnx_export( - scripted_model, - (dummy_data, dummy_data_len), - f=model_filename, - verbose=True, - input_names=["data", "data_len"], - output_names=["classes"], - dynamic_axes={ - # dict value: manually named axes - "data": {0: "batch", 1: "time"}, - "data_len": {0: "batch"}, - "classes": {0: "batch", 1: "time"} - } - ) - - -def export_trace(*, model: Model, model_filename: str): - dummy_data = torch.randn(1, 30, 50, device="cpu") - dummy_data_len, _ = torch.sort(torch.randint(low=10, high=30, size=(1,), device="cpu", dtype=torch.int32), descending=True) - scripted_model = torch.jit.optimize_for_inference(torch.jit.trace(model.eval(), example_inputs=(dummy_data, dummy_data_len))) - onnx_export( - scripted_model, - (dummy_data, dummy_data_len), - f=model_filename, - verbose=True, - input_names=["data", "data_len"], - output_names=["classes"], - 
dynamic_axes={ - # dict value: manually named axes - "data": {0: "batch", 1: "time"}, - "data_len": {0: "batch"}, - "classes": {0: "batch", 1: "time"} - } - ) - diff --git a/users/rilling/experiments/librispeech/librispeech_100_hybrid/pytorch_networks/blstm8x1024_more_specaug.py b/users/rilling/experiments/librispeech/librispeech_100_hybrid/pytorch_networks/blstm8x1024_more_specaug.py deleted file mode 100644 index b2e0a2fd5..000000000 --- a/users/rilling/experiments/librispeech/librispeech_100_hybrid/pytorch_networks/blstm8x1024_more_specaug.py +++ /dev/null @@ -1,115 +0,0 @@ -import time -import torch -from torch import nn -from torch.onnx import export as onnx_export -from torchaudio.functional import mask_along_axis - - -class Model(torch.nn.Module): - - def __init__(self): - super().__init__() - lstm_size = 1024 - target_size=12001 - self.blstm1 = nn.LSTM(input_size=50, hidden_size=lstm_size, bidirectional=True, batch_first=False) - self.blstm_stack = nn.LSTM(input_size=2*lstm_size, hidden_size=lstm_size, bidirectional=True, num_layers=7, batch_first=False) - self.final_linear = nn.Linear(2*lstm_size, target_size) - self.lstm_size = lstm_size - - def forward( - self, - audio_features: torch.Tensor, - audio_features_len: torch.Tensor, - ): - if self.training: - audio_features_time_masked = mask_along_axis(audio_features, mask_param=20, mask_value=0.0, axis=1) - audio_features_time_masked_2 = mask_along_axis(audio_features_time_masked, mask_param=20, mask_value=0.0, axis=1) - audio_features_masked = mask_along_axis(audio_features_time_masked_2, mask_param=10, mask_value=0.0, axis=2) - audio_features_masked_2 = mask_along_axis(audio_features_masked, mask_param=10, mask_value=0.0, axis=2) - else: - audio_features_masked_2 = audio_features - blstm_in = torch.swapaxes(audio_features_masked_2, 0, 1) # [B, T, F] -> [T, B, F] - - blstm_packed_in = nn.utils.rnn.pack_padded_sequence(blstm_in, audio_features_len) - blstm_first, _ = self.blstm1(blstm_packed_in) - 
blstm_packed_out, _ = self.blstm_stack(blstm_first) - blstm_out, _ = nn.utils.rnn.pad_packed_sequence(blstm_packed_out, padding_value=0.0, batch_first=False) # [T, B, F] - - logits = self.final_linear(blstm_out) # [T, B, F] - logits_rasr_order = torch.permute(logits, dims=(1, 0, 2)) # RASR expects [B, T, F] - logits_ce_order = torch.permute(logits, dims=(1, 2, 0)) # CE expects [B, F, T] - log_probs = torch.log_softmax(logits_rasr_order, dim=2) - - return log_probs, logits_ce_order - - -scripted_model = None - -def train_step(*, model: Model, data, run_ctx, **_kwargs): - global scripted_model - audio_features = data["data"] - audio_features_len = data["data:seq_len"] - - audio_features_len, indices = torch.sort(audio_features_len, descending=True) - audio_features = audio_features[indices, :, :] - - phonemes = data["classes"][indices, :] - phonemes_len = data["classes:seq_len"][indices] - - if scripted_model is None: - scripted_model = torch.jit.script(model) - - # distributed_model = DataParallel(model) - log_probs, logits = model( - audio_features=audio_features, - audio_features_len=audio_features_len, - ) - - targets_packed = nn.utils.rnn.pack_padded_sequence(phonemes, phonemes_len, batch_first=True, enforce_sorted=False) - targets_masked, _ = nn.utils.rnn.pad_packed_sequence(targets_packed, batch_first=True, padding_value=-100) - - loss = nn.functional.cross_entropy(logits, targets_masked) - - run_ctx.mark_as_loss(name="CE", loss=loss) - - -def export(*, model: Model, model_filename: str): - scripted_model = torch.jit.optimize_for_inference(torch.jit.script(model.eval())) - dummy_data = torch.randn(1, 30, 50, device="cpu") - dummy_data_len, _ = torch.sort(torch.randint(low=10, high=30, size=(1,), device="cpu", dtype=torch.int32), descending=True) - onnx_export( - scripted_model, - (dummy_data, dummy_data_len), - f=model_filename, - verbose=True, - input_names=["data", "data_len"], - output_names=["classes"], - dynamic_axes={ - # dict value: manually named axes 
- "data": {0: "batch", 1: "time"}, - "data_len": {0: "batch"}, - "classes": {0: "batch", 1: "time"} - } - ) - - -def export_trace(*, model: Model, model_filename: str): - dummy_data = torch.randn(1, 30, 50, device="cpu") - dummy_data_len, _ = torch.sort(torch.randint(low=10, high=30, size=(1,), device="cpu", dtype=torch.int32), descending=True) - scripted_model = torch.jit.optimize_for_inference(torch.jit.trace(model.eval(), example_inputs=(dummy_data, dummy_data_len))) - onnx_export( - scripted_model, - (dummy_data, dummy_data_len), - f=model_filename, - verbose=True, - input_names=["data", "data_len"], - output_names=["classes"], - dynamic_axes={ - # dict value: manually named axes - "data": {0: "batch", 1: "time"}, - "data_len": {0: "batch"}, - "classes": {0: "batch", 1: "time"} - } - ) - - diff --git a/users/rilling/experiments/librispeech/librispeech_100_hybrid/pytorch_networks/blstm8x1024_more_specaug_backup.py b/users/rilling/experiments/librispeech/librispeech_100_hybrid/pytorch_networks/blstm8x1024_more_specaug_backup.py deleted file mode 100644 index d553b6c08..000000000 --- a/users/rilling/experiments/librispeech/librispeech_100_hybrid/pytorch_networks/blstm8x1024_more_specaug_backup.py +++ /dev/null @@ -1,75 +0,0 @@ -import time -import torch -from torch import nn -from torch.onnx import export -from torchaudio.functional import mask_along_axis - -from returnn.torch.engine import TrainCtx - - -class Model(torch.nn.Module): - - def __init__(self): - super().__init__() - lstm_size = 1024 - target_size=12001 - self.blstm1 = nn.LSTM(input_size=50, hidden_size=lstm_size, bidirectional=True, batch_first=False) - self.blstm_stack = nn.LSTM(input_size=2*lstm_size, hidden_size=lstm_size, bidirectional=True, num_layers=7, batch_first=False) - self.final_linear = nn.Linear(2*lstm_size, target_size) - self.lstm_size = lstm_size - - def forward( - self, - audio_features: torch.Tensor, - audio_features_len: torch.Tensor, - ): - audio_features_time_masked = 
mask_along_axis(audio_features, mask_param=20, mask_value=0.0, axis=1) - audio_features_time_masked_2 = mask_along_axis(audio_features_time_masked, mask_param=20, mask_value=0.0, axis=1) - audio_features_masked = mask_along_axis(audio_features_time_masked_2, mask_param=10, mask_value=0.0, axis=2) - audio_features_masked_2 = mask_along_axis(audio_features_masked, mask_param=10, mask_value=0.0, axis=2) - blstm_in = torch.swapaxes(audio_features_masked_2, 0, 1) # [B, T, F] -> [T, B, F] - - blstm_packed_in = nn.utils.rnn.pack_padded_sequence(blstm_in, audio_features_len) - blstm_first, _ = self.blstm1(blstm_packed_in) - blstm_packed_out, _ = self.blstm_stack(blstm_first) - blstm_out, _ = nn.utils.rnn.pad_packed_sequence(blstm_packed_out, padding_value=0.0, batch_first=False) # [T, B, F] - - logits = self.final_linear(blstm_out) # [T, B, F] - logits = torch.permute(logits, dims=(1, 2, 0)) # CE expects [B, F, T] - log_probs = torch.log_softmax(logits, dim=2) - - return log_probs, logits - -scripted_model = None - -def train_step(*, model: Model, data, train_ctx, **_kwargs): - global scripted_model - audio_features = data["data"] - audio_features_len = data["data:seq_len"] - - audio_features_len, indices = torch.sort(audio_features_len, descending=True) - audio_features = audio_features[indices, :, :] - - phonemes = data["classes"][indices, :] - phonemes_len = data["classes:seq_len"][indices] - - if scripted_model is None: - scripted_model = torch.jit.script(model) - - log_probs, logits = model( - audio_features=audio_features, - audio_features_len=audio_features_len, - ) - - targets_packed = nn.utils.rnn.pack_padded_sequence(phonemes, phonemes_len, batch_first=True, enforce_sorted=False) - targets_masked, _ = nn.utils.rnn.pad_packed_sequence(targets_packed, batch_first=True, padding_value=-100) - - loss = nn.functional.cross_entropy(logits, targets_masked) - - train_ctx.mark_as_loss(name="CE", loss=loss) - - - - - - diff --git 
a/users/rilling/experiments/librispeech/librispeech_100_hybrid/pytorch_networks/blstm8x1024_more_specaug_fp16.py b/users/rilling/experiments/librispeech/librispeech_100_hybrid/pytorch_networks/blstm8x1024_more_specaug_fp16.py deleted file mode 100644 index e41fe158f..000000000 --- a/users/rilling/experiments/librispeech/librispeech_100_hybrid/pytorch_networks/blstm8x1024_more_specaug_fp16.py +++ /dev/null @@ -1,176 +0,0 @@ -import time -import torch -from torch import nn, autocast -from torch.onnx import export as onnx_export -from torch.cuda.amp import GradScaler -from torchaudio.functional import mask_along_axis - -from returnn.log import log -from returnn.torch.engine import Engine as TorchEngine -from returnn.util.basic import NumbersDict -from returnn.torch.context import get_run_ctx, init_train_step_run_ctx - - -class CustomEngine(TorchEngine): - - def train_epoch(self): - """ - train one (sub)epoch - """ - print("start", self.get_epoch_str(), "with learning rate", self.learning_rate, "...", file=log.v4) - - self._model.train() - init_train_step_run_ctx(device=self._device) - - # Creates a GradScaler once at the beginning of training. 
- scaler = GradScaler() - - accumulated_losses_dict = NumbersDict() - step_idx = 0 - for data in self._train_dataloader: - step_time_start = time.time() - run_ctx = get_run_ctx() - run_ctx.init_step() - - self._updater.get_optimizer().zero_grad() - with autocast(device_type='cuda', dtype=torch.bfloat16): - self._run_step(data) - - losses_dict = run_ctx.losses - total_loss = run_ctx.total_loss() - - scaler.scale(total_loss).backward() - scaler.step(self._updater.get_optimizer()) - scaler.update() - - losses_dict = { - "train_loss_" + name: float(loss.loss.detach().cpu().numpy()) - for name, loss in losses_dict.items() - } - accumulated_losses_dict += NumbersDict(losses_dict) - print("step %i, loss: %f, took: %.3fs" % ( - step_idx, total_loss.detach().cpu().numpy(), time.time() - step_time_start - ), file=log.v4) - - step_idx += 1 - - print("Trained %i steps" % step_idx) - - accumulated_losses_dict = accumulated_losses_dict / step_idx - self.learning_rate_control.set_epoch_error(self.epoch, dict(accumulated_losses_dict)) - self.learning_rate_control.save() - - if self.epoch % self._save_model_epoch_interval == 0 or self.epoch == self._final_epoch: - self._save_model() - self._save_optimizer() - - self.eval_model() - -class Model(torch.nn.Module): - - def __init__(self): - super().__init__() - lstm_size = 1024 - target_size=12001 - self.blstm1 = nn.LSTM(input_size=50, hidden_size=lstm_size, bidirectional=True, batch_first=False) - self.blstm_stack = nn.LSTM(input_size=2*lstm_size, hidden_size=lstm_size, bidirectional=True, num_layers=7, batch_first=False) - self.final_linear = nn.Linear(2*lstm_size, target_size) - self.lstm_size = lstm_size - - def forward( - self, - audio_features: torch.Tensor, - audio_features_len: torch.Tensor, - ): - if self.training: - audio_features_time_masked = mask_along_axis(audio_features, mask_param=20, mask_value=0.0, axis=1) - audio_features_time_masked_2 = mask_along_axis(audio_features_time_masked, mask_param=20, mask_value=0.0, 
axis=1) - audio_features_masked = mask_along_axis(audio_features_time_masked_2, mask_param=10, mask_value=0.0, axis=2) - audio_features_masked_2 = mask_along_axis(audio_features_masked, mask_param=10, mask_value=0.0, axis=2) - else: - audio_features_masked_2 = audio_features - blstm_in = torch.swapaxes(audio_features_masked_2, 0, 1) # [B, T, F] -> [T, B, F] - - blstm_packed_in = nn.utils.rnn.pack_padded_sequence(blstm_in, audio_features_len) - blstm_first, _ = self.blstm1(blstm_packed_in) - blstm_packed_out, _ = self.blstm_stack(blstm_first) - blstm_out, _ = nn.utils.rnn.pad_packed_sequence(blstm_packed_out, padding_value=0.0, batch_first=False) # [T, B, F] - - logits = self.final_linear(blstm_out) # [T, B, F] - logits_rasr_order = torch.permute(logits, dims=(1, 0, 2)) # RASR expects [B, T, F] - logits_ce_order = torch.permute(logits, dims=(1, 2, 0)) # CE expects [B, F, T] - log_probs = torch.log_softmax(logits_rasr_order, dim=2) - - return log_probs, logits_ce_order - - -scripted_model = None - -def train_step(*, model: Model, data, run_ctx, **_kwargs): - global scripted_model - audio_features = data["data"] - audio_features_len = data["data:seq_len"] - - audio_features_len, indices = torch.sort(audio_features_len, descending=True) - audio_features = audio_features[indices, :, :] - - phonemes = data["classes"][indices, :] - phonemes_len = data["classes:seq_len"][indices] - - if scripted_model is None: - scripted_model = torch.jit.script(model) - - # distributed_model = DataParallel(model) - log_probs, logits = model( - audio_features=audio_features, - audio_features_len=audio_features_len, - ) - - targets_packed = nn.utils.rnn.pack_padded_sequence(phonemes, phonemes_len, batch_first=True, enforce_sorted=False) - targets_masked, _ = nn.utils.rnn.pad_packed_sequence(targets_packed, batch_first=True, padding_value=-100) - - loss = nn.functional.cross_entropy(logits, targets_masked) - - run_ctx.mark_as_loss(name="CE", loss=loss) - - -def export(*, model: Model, 
model_filename: str): - scripted_model = torch.jit.optimize_for_inference(torch.jit.script(model.eval())) - dummy_data = torch.randn(1, 30, 50, device="cpu") - dummy_data_len, _ = torch.sort(torch.randint(low=10, high=30, size=(1,), device="cpu", dtype=torch.int32), descending=True) - onnx_export( - scripted_model, - (dummy_data, dummy_data_len), - f=model_filename, - verbose=True, - input_names=["data", "data_len"], - output_names=["classes"], - dynamic_axes={ - # dict value: manually named axes - "data": {0: "batch", 1: "time"}, - "data_len": {0: "batch"}, - "classes": {0: "batch", 1: "time"} - } - ) - - -def export_trace(*, model: Model, model_filename: str): - dummy_data = torch.randn(1, 30, 50, device="cpu") - dummy_data_len, _ = torch.sort(torch.randint(low=10, high=30, size=(1,), device="cpu", dtype=torch.int32), descending=True) - scripted_model = torch.jit.optimize_for_inference(torch.jit.trace(model.eval(), example_inputs=(dummy_data, dummy_data_len))) - onnx_export( - scripted_model, - (dummy_data, dummy_data_len), - f=model_filename, - verbose=True, - input_names=["data", "data_len"], - output_names=["classes"], - dynamic_axes={ - # dict value: manually named axes - "data": {0: "batch", 1: "time"}, - "data_len": {0: "batch"}, - "classes": {0: "batch", 1: "time"} - } - ) - - diff --git a/users/rilling/experiments/librispeech/librispeech_100_hybrid/pytorch_networks/espnet_conformer_large.py b/users/rilling/experiments/librispeech/librispeech_100_hybrid/pytorch_networks/espnet_conformer_large.py deleted file mode 100644 index 44188e011..000000000 --- a/users/rilling/experiments/librispeech/librispeech_100_hybrid/pytorch_networks/espnet_conformer_large.py +++ /dev/null @@ -1,121 +0,0 @@ -import os -from random import random -import torch -import time -from typing import Dict, Optional -from torch import nn -from torch.onnx import export as onnx_export -from torchaudio.functional import mask_along_axis - -from returnn.torch.engine import Engine as 
TorchEngine -from returnn.util import NumbersDict -from returnn.log import log -from returnn.torch.context import get_run_ctx - -from espnet.nets.pytorch_backend.e2e_asr_conformer import Encoder - - -def lengths_to_mask(lengths, max_len): - return torch.arange(max_len).expand(lengths.size()[0], max_len) < lengths.unsqueeze(1) - - -class Model(torch.nn.Module): - - def __init__(self): - super().__init__() - target_size=12001 - attention_dim = 512 - self.espnet_conformer = Encoder(idim=50, num_blocks=12, attention_dim=attention_dim, attention_heads=8, input_layer="linear") - self.final_linear = nn.Linear(attention_dim, target_size) - - def forward( - self, - audio_features: torch.Tensor, # [B T D] - audio_features_len: torch.Tensor, # [B] - ): - - if self.training: - audio_features_time_masked = mask_along_axis(audio_features, mask_param=20, mask_value=0.0, axis=1) - audio_features_time_masked_2 = mask_along_axis(audio_features_time_masked, mask_param=20, mask_value=0.0, axis=1) - audio_features_masked = mask_along_axis(audio_features_time_masked_2, mask_param=10, mask_value=0.0, axis=2) - audio_features_masked_2 = mask_along_axis(audio_features_masked, mask_param=10, mask_value=0.0, axis=2) - else: - audio_features_masked_2 = audio_features - - run_ctx = get_run_ctx() - mask = lengths_to_mask(audio_features_len, max_len=audio_features.size()[1]) - mask = torch.unsqueeze(mask, dim=1).to(run_ctx.device) - - - conformer_out, mask = self.espnet_conformer(audio_features_masked_2, mask) - - logits = self.final_linear(conformer_out) # [B, T, F] - logits_ce_order = torch.permute(logits, dims=(0, 2, 1)) # CE expects [B, F, T] - log_probs = torch.log_softmax(logits, dim=2) - - return log_probs, logits_ce_order - - -scripted_model = None - -def train_step(*, model: Model, data, run_ctx, **_kwargs): - global scripted_model - audio_features = data["data"] - audio_features_len = data["data:seq_len"] - - audio_features_len, indices = torch.sort(audio_features_len, descending=True) 
- audio_features = audio_features[indices, :, :] - - phonemes = data["classes"][indices, :] - phonemes_len = data["classes:seq_len"][indices] - - #if scripted_model is None: - # # check exportability - # tmp_filename = "/var/tmp/onnx_export/export.onnx" - # os.makedirs(os.path.dirname(tmp_filename), exist_ok=True) - # scripted_model = export(model=model, model_filename=tmp_filename) - # os.unlink(tmp_filename) - - # distributed_model = DataParallel(model) - log_probs, logits = model( - audio_features=audio_features, - audio_features_len=audio_features_len, - ) - - targets_packed = nn.utils.rnn.pack_padded_sequence(phonemes, phonemes_len, batch_first=True, enforce_sorted=False) - targets_masked, _ = nn.utils.rnn.pad_packed_sequence(targets_packed, batch_first=True, padding_value=-100) - - loss = nn.functional.cross_entropy(logits, targets_masked) - - run_ctx.mark_as_loss(name="ce", loss=loss) - - -def export(*, model: Model, model_filename: str): - from returnn.torch import context - context._run_ctx = context.RunCtx(stage="forward_step", device="cpu") - dummy_data = torch.randn(1, 30, 50, device="cpu") * 10 - #dummy_data_len, _ = torch.sort(torch.randint(low=10, high=30, size=(1,), device="cpu", dtype=torch.int32), descending=True) - dummy_data_len = torch.ones((1,)) * 25 - dummy_data_len2 = torch.ones((1,)) * 25 - scripted_model = torch.jit.optimize_for_inference(torch.jit.trace(model.eval(), example_inputs=(dummy_data, dummy_data_len))) - l1 = scripted_model(dummy_data, dummy_data_len) - l2 = scripted_model(dummy_data, dummy_data_len2) - print(l1) - print(l2) - assert False - onnx_export( - scripted_model, - (dummy_data, dummy_data_len), - f=model_filename, - verbose=True, - input_names=["data", "data_len"], - output_names=["classes"], - dynamic_axes={ - # dict value: manually named axes - "data": {0: "batch", 1: "time"}, - "data_len": {0: "batch"}, - "classes": {0: "batch", 1: "time"} - } - ) - return scripted_model - diff --git 
a/users/rilling/experiments/librispeech/librispeech_100_hybrid/pytorch_networks/espnet_conformer_test.py b/users/rilling/experiments/librispeech/librispeech_100_hybrid/pytorch_networks/espnet_conformer_test.py deleted file mode 100644 index db32f29f4..000000000 --- a/users/rilling/experiments/librispeech/librispeech_100_hybrid/pytorch_networks/espnet_conformer_test.py +++ /dev/null @@ -1,124 +0,0 @@ -import os -from random import random -import torch -import time -from typing import Dict, Optional -from torch import nn -from torch.onnx import export as onnx_export -from torchaudio.functional import mask_along_axis - -from returnn.torch.engine import Engine as TorchEngine -from returnn.util import NumbersDict -from returnn.log import log -from returnn.torch.context import get_run_ctx - -from espnet.nets.pytorch_backend.e2e_asr_conformer import Encoder - - -def lengths_to_mask(lengths, max_len): - return torch.arange(max_len).expand(lengths.size()[0], max_len) < lengths.unsqueeze(1) - - -class Model(torch.nn.Module): - - def __init__(self): - super().__init__() - target_size=12001 - attention_dim = 256 - self.espnet_conformer = Encoder(idim=50, attention_dim=attention_dim, input_layer="linear") - self.final_linear = nn.Linear(attention_dim, target_size) - - def forward( - self, - audio_features: torch.Tensor, # [B T D] - audio_features_len: torch.Tensor, # [B] - ): - - if self.training: - audio_features_time_masked = mask_along_axis(audio_features, mask_param=20, mask_value=0.0, axis=1) - audio_features_time_masked_2 = mask_along_axis(audio_features_time_masked, mask_param=20, mask_value=0.0, axis=1) - audio_features_masked = mask_along_axis(audio_features_time_masked_2, mask_param=10, mask_value=0.0, axis=2) - audio_features_masked_2 = mask_along_axis(audio_features_masked, mask_param=10, mask_value=0.0, axis=2) - else: - audio_features_masked_2 = audio_features - - run_ctx = get_run_ctx() - mask = lengths_to_mask(audio_features_len, 
max_len=audio_features.size()[1]) - mask = torch.unsqueeze(mask, dim=1).to(run_ctx.device) - - conformer_out, mask = self.espnet_conformer(audio_features_masked_2, mask) - - logits = self.final_linear(conformer_out) # [B, T, F] - logits_ce_order = torch.permute(logits, dims=(0, 2, 1)) # CE expects [B, F, T] - log_probs = torch.log_softmax(logits, dim=2) - - return log_probs, logits_ce_order - - -scripted_model = None - -def train_step(*, model: Model, data, run_ctx, **_kwargs): - global scripted_model - audio_features = data["data"] - audio_features_len = data["data:seq_len"] - - audio_features_len, indices = torch.sort(audio_features_len, descending=True) - audio_features = audio_features[indices, :, :] - - phonemes = data["classes"][indices, :] - phonemes_len = data["classes:seq_len"][indices] - - #if scripted_model is None: - # # check exportability - # tmp_filename = "/var/tmp/onnx_export/export.onnx" - # os.makedirs(os.path.dirname(tmp_filename), exist_ok=True) - # scripted_model = export(model=model, model_filename=tmp_filename) - # os.unlink(tmp_filename) - - # distributed_model = DataParallel(model) - log_probs, logits = model( - audio_features=audio_features, - audio_features_len=audio_features_len, - ) - - targets_packed = nn.utils.rnn.pack_padded_sequence(phonemes, phonemes_len, batch_first=True, enforce_sorted=False) - targets_masked, _ = nn.utils.rnn.pad_packed_sequence(targets_packed, batch_first=True, padding_value=-100) - - loss = nn.functional.cross_entropy(logits, targets_masked) - - run_ctx.mark_as_loss(name="ce", loss=loss) - - -def export(*, model: Model, model_filename: str): - # create new run context - from returnn.torch import context - context._run_ctx = context.RunCtx(stage="forward_step", device="cpu") - model.eval() - dummy_data = torch.randn(1, 30, 50, device="cpu") - dummy_data2 = torch.randn(1, 50, 50, device="cpu") - #dummy_data_len, _ = torch.sort(torch.randint(low=29, high=30, size=(1,), device="cpu", dtype=torch.int32), 
descending=True) - dummy_data_len = torch.ones((1,), device="cpu", dtype=torch.int32)*30 - dummy_data2_len = torch.ones((1,), device="cpu", dtype=torch.int32)*40 - log_probs, logits = model(dummy_data2, dummy_data2_len) - scripted_model = torch.jit.optimize_for_inference(torch.jit.trace(model.eval(), example_inputs=(dummy_data, dummy_data_len))) - s_log_probs, s_logits = scripted_model(dummy_data2, dummy_data2_len) - print(log_probs) - print(s_log_probs) - assert False - #scripted_model = torch.jit.optimize_for_inference(torch.jit.script(model.eval())) - onnx_export( - scripted_model, - (dummy_data, dummy_data_len), - f=model_filename, - verbose=True, - input_names=["data", "data_len"], - output_names=["classes"], - dynamic_axes={ - # dict value: manually named axes - "data": {0: "batch", 1: "time"}, - "data_len": {0: "batch"}, - "classes": {0: "batch", 1: "time"} - } - ) - return scripted_model - diff --git a/users/rilling/experiments/librispeech/librispeech_100_hybrid/pytorch_networks/mlp_test.py b/users/rilling/experiments/librispeech/librispeech_100_hybrid/pytorch_networks/mlp_test.py deleted file mode 100644 index 07a3e2bae..000000000 --- a/users/rilling/experiments/librispeech/librispeech_100_hybrid/pytorch_networks/mlp_test.py +++ /dev/null @@ -1,31 +0,0 @@ -import torch -from torch import nn -from torch.onnx import export - -class Model(torch.nn.Module): - def __init__(self): - super().__init__() - self.linear_relu_stack = nn.Sequential( - nn.Linear(50, 512), - nn.ReLU(), - nn.Linear(512, 512), - nn.ReLU(), - nn.Linear(512, 12001), - ) - - def forward(self, x): - y = self.linear_relu_stack(x) - return y - - -def train_step(*, model: Model, data, train_ctx, **_kwargs): - frames = data["data"] - scripted_model = torch.jit.script(model) - dummy_input = torch.randn(10, 3, 50, device="cuda") - export(scripted_model, dummy_input, "model.onnx", verbose=True, input_names=["features"], output_names=["output"]) - assert False - outputs = scripted_model(frames) - 
print(scripted_model.graph) - targets = data["classes"] - loss = nn.CrossEntropyLoss(reduction="sum")(torch.swapaxes(outputs, 1, 2), targets) - train_ctx.mark_as_loss(name="ce", loss=loss) diff --git a/users/rilling/experiments/librispeech/librispeech_100_hybrid/pytorch_networks/torchaudio_conformer.py b/users/rilling/experiments/librispeech/librispeech_100_hybrid/pytorch_networks/torchaudio_conformer.py deleted file mode 100644 index 4e9ea9828..000000000 --- a/users/rilling/experiments/librispeech/librispeech_100_hybrid/pytorch_networks/torchaudio_conformer.py +++ /dev/null @@ -1,423 +0,0 @@ -import time -import torch -from torch import nn -from torch.onnx import export as onnx_export -from torchaudio.functional import mask_along_axis -from torchaudio.models.conformer import Conformer - - - -from typing import Optional, Tuple - -import torch - - -__all__ = ["Conformer"] - - -def _lengths_to_padding_mask(lengths: torch.Tensor) -> torch.Tensor: - batch_size = lengths.shape[0] - max_length = int(torch.max(lengths).item()) - padding_mask = torch.arange(max_length, device=lengths.device, dtype=lengths.dtype).expand( - batch_size, max_length - ) >= lengths.unsqueeze(1) - return padding_mask - - -class _ConvolutionModule(torch.nn.Module): - r"""Conformer convolution module. - - Args: - input_dim (int): input dimension. - num_channels (int): number of depthwise convolution layer input channels. - depthwise_kernel_size (int): kernel size of depthwise convolution layer. - dropout (float, optional): dropout probability. (Default: 0.0) - bias (bool, optional): indicates whether to add bias term to each convolution layer. (Default: ``False``) - use_group_norm (bool, optional): use GroupNorm rather than BatchNorm. 
(Default: ``False``) - """ - - def __init__( - self, - input_dim: int, - num_channels: int, - depthwise_kernel_size: int, - dropout: float = 0.0, - bias: bool = False, - use_group_norm: bool = False, - ) -> None: - super().__init__() - if (depthwise_kernel_size - 1) % 2 != 0: - raise ValueError("depthwise_kernel_size must be odd to achieve 'SAME' padding.") - self.layer_norm = torch.nn.LayerNorm(input_dim) - self.sequential = torch.nn.Sequential( - torch.nn.Conv1d( - input_dim, - 2 * num_channels, - 1, - stride=1, - padding=0, - bias=bias, - ), - torch.nn.GLU(dim=1), - torch.nn.Conv1d( - num_channels, - num_channels, - depthwise_kernel_size, - stride=1, - padding=(depthwise_kernel_size - 1) // 2, - groups=num_channels, - bias=bias, - ), - torch.nn.GroupNorm(num_groups=1, num_channels=num_channels) - if use_group_norm - else torch.nn.BatchNorm1d(num_channels), - torch.nn.SiLU(), - torch.nn.Conv1d( - num_channels, - input_dim, - kernel_size=1, - stride=1, - padding=0, - bias=bias, - ), - torch.nn.Dropout(dropout), - ) - - def forward(self, input: torch.Tensor) -> torch.Tensor: - r""" - Args: - input (torch.Tensor): with shape `(B, T, D)`. - - Returns: - torch.Tensor: output, with shape `(B, T, D)`. - """ - x = self.layer_norm(input) - x = x.transpose(1, 2) - x = self.sequential(x) - return x.transpose(1, 2) - - -class _FeedForwardModule(torch.nn.Module): - r"""Positionwise feed forward layer. - - Args: - input_dim (int): input dimension. - hidden_dim (int): hidden dimension. - dropout (float, optional): dropout probability. 
(Default: 0.0) - """ - - def __init__(self, input_dim: int, hidden_dim: int, dropout: float = 0.0) -> None: - super().__init__() - self.sequential = torch.nn.Sequential( - torch.nn.LayerNorm(input_dim), - torch.nn.Linear(input_dim, hidden_dim, bias=True), - torch.nn.SiLU(), - torch.nn.Dropout(dropout), - torch.nn.Linear(hidden_dim, input_dim, bias=True), - torch.nn.Dropout(dropout), - ) - - def forward(self, input: torch.Tensor) -> torch.Tensor: - r""" - Args: - input (torch.Tensor): with shape `(*, D)`. - - Returns: - torch.Tensor: output, with shape `(*, D)`. - """ - return self.sequential(input) - - -class ConformerLayer(torch.nn.Module): - r"""Conformer layer that constitutes Conformer. - - Args: - input_dim (int): input dimension. - ffn_dim (int): hidden layer dimension of feedforward network. - num_attention_heads (int): number of attention heads. - depthwise_conv_kernel_size (int): kernel size of depthwise convolution layer. - dropout (float, optional): dropout probability. (Default: 0.0) - use_group_norm (bool, optional): use ``GroupNorm`` rather than ``BatchNorm1d`` - in the convolution module. (Default: ``False``) - convolution_first (bool, optional): apply the convolution module ahead of - the attention module. 
(Default: ``False``) - """ - - def __init__( - self, - input_dim: int, - ffn_dim: int, - num_attention_heads: int, - depthwise_conv_kernel_size: int, - dropout: float = 0.0, - use_group_norm: bool = False, - convolution_first: bool = False, - ) -> None: - super().__init__() - - self.ffn1 = _FeedForwardModule(input_dim, ffn_dim, dropout=dropout) - - self.self_attn_layer_norm = torch.nn.LayerNorm(input_dim) - self.self_attn = torch.nn.MultiheadAttention(input_dim, num_attention_heads, dropout=dropout) - self.self_attn_dropout = torch.nn.Dropout(dropout) - - self.conv_module = _ConvolutionModule( - input_dim=input_dim, - num_channels=input_dim, - depthwise_kernel_size=depthwise_conv_kernel_size, - dropout=dropout, - bias=True, - use_group_norm=use_group_norm, - ) - - self.ffn2 = _FeedForwardModule(input_dim, ffn_dim, dropout=dropout) - self.final_layer_norm = torch.nn.LayerNorm(input_dim) - self.convolution_first = convolution_first - - def _apply_convolution(self, input: torch.Tensor) -> torch.Tensor: - residual = input - input = input.transpose(0, 1) - input = self.conv_module(input) - input = input.transpose(0, 1) - input = residual + input - return input - - def forward(self, input: torch.Tensor, key_padding_mask: Optional[torch.Tensor]) -> torch.Tensor: - r""" - Args: - input (torch.Tensor): input, with shape `(T, B, D)`. - key_padding_mask (torch.Tensor or None): key padding mask to use in self attention layer. - - Returns: - torch.Tensor: output, with shape `(T, B, D)`. 
- """ - residual = input - x = self.ffn1(input) - x = x * 0.5 + residual - - if self.convolution_first: - x = self._apply_convolution(x) - - residual = x - x = self.self_attn_layer_norm(x) - x, _ = self.self_attn( - query=x, - key=x, - value=x, - key_padding_mask=key_padding_mask, - need_weights=False, - ) - x = self.self_attn_dropout(x) - x = x + residual - - if not self.convolution_first: - x = self._apply_convolution(x) - - residual = x - x = self.ffn2(x) - x = x * 0.5 + residual - - x = self.final_layer_norm(x) - return x - - -class Conformer(torch.nn.Module): - r"""Conformer architecture introduced in - *Conformer: Convolution-augmented Transformer for Speech Recognition* - :cite:`gulati2020conformer`. - - Args: - input_dim (int): input dimension. - num_heads (int): number of attention heads in each Conformer layer. - ffn_dim (int): hidden layer dimension of feedforward networks. - num_layers (int): number of Conformer layers to instantiate. - depthwise_conv_kernel_size (int): kernel size of each Conformer layer's depthwise convolution layer. - dropout (float, optional): dropout probability. (Default: 0.0) - use_group_norm (bool, optional): use ``GroupNorm`` rather than ``BatchNorm1d`` - in the convolution module. (Default: ``False``) - convolution_first (bool, optional): apply the convolution module ahead of - the attention module. 
(Default: ``False``) - - Examples: - >>> conformer = Conformer( - >>> input_dim=80, - >>> num_heads=4, - >>> ffn_dim=128, - >>> num_layers=4, - >>> depthwise_conv_kernel_size=31, - >>> ) - >>> lengths = torch.randint(1, 400, (10,)) # (batch,) - >>> input = torch.rand(10, int(lengths.max()), input_dim) # (batch, num_frames, input_dim) - >>> output = conformer(input, lengths) - """ - - def __init__( - self, - input_dim: int, - num_heads: int, - ffn_dim: int, - num_layers: int, - depthwise_conv_kernel_size: int, - dropout: float = 0.0, - use_group_norm: bool = False, - convolution_first: bool = False, - ): - super().__init__() - - self.conformer_layers = torch.nn.ModuleList( - [ - ConformerLayer( - input_dim, - ffn_dim, - num_heads, - depthwise_conv_kernel_size, - dropout=dropout, - use_group_norm=use_group_norm, - convolution_first=convolution_first, - ) - for _ in range(num_layers) - ] - ) - self.export_mode = False - - def forward(self, input: torch.Tensor, lengths: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: - r""" - Args: - input (torch.Tensor): with shape `(B, T, input_dim)`. - lengths (torch.Tensor): with shape `(B,)` and i-th element representing - number of valid frames for i-th batch element in ``input``. - - Returns: - (torch.Tensor, torch.Tensor) - torch.Tensor - output frames, with shape `(B, T, input_dim)` - torch.Tensor - output lengths, with shape `(B,)` and i-th element representing - number of valid frames for i-th batch element in output frames. 
- """ - encoder_padding_mask = None if self.export_mode else _lengths_to_padding_mask(lengths) - - x = input.transpose(0, 1) - for layer in self.conformer_layers: - x = layer(x, encoder_padding_mask) - return x.transpose(0, 1), lengths - -class Model(torch.nn.Module): - - def __init__(self, **kwargs): - super().__init__() - conformer_size = 384 - target_size=12001 - self.initial_linear = nn.Linear(50, conformer_size) - self.conformer = Conformer( - input_dim=conformer_size, - num_heads=4, - ffn_dim=1024, - num_layers=8, - depthwise_conv_kernel_size=31, - dropout=0.1 - ) - self.final_linear = nn.Linear(conformer_size, target_size) - - def forward( - self, - audio_features: torch.Tensor, - audio_features_len: torch.Tensor, - ): - if self.training: - audio_features_time_masked = mask_along_axis(audio_features, mask_param=20, mask_value=0.0, axis=1) - audio_features_time_masked_2 = mask_along_axis(audio_features_time_masked, mask_param=20, mask_value=0.0, axis=1) - audio_features_masked = mask_along_axis(audio_features_time_masked_2, mask_param=10, mask_value=0.0, axis=2) - audio_features_masked_2 = mask_along_axis(audio_features_masked, mask_param=10, mask_value=0.0, axis=2) - else: - audio_features_masked_2 = audio_features - - - conformer_in = self.initial_linear(audio_features_masked_2) - - conformer_out, _ = self.conformer(conformer_in, audio_features_len) - - logits = self.final_linear(conformer_out) # [B, T, F] - logits_ce_order = torch.permute(logits, dims=(0, 2, 1)) # CE expects [B, F, T] - log_probs = torch.log_softmax(logits, dim=2) - - return log_probs, logits_ce_order - - -scripted_model = None - -def train_step(*, model: Model, data, run_ctx, **_kwargs): - global scripted_model - audio_features = data["data"] - audio_features_len = data["data:size1"] - - audio_features_len, indices = torch.sort(audio_features_len, descending=True) - audio_features = audio_features[indices, :, :] - - phonemes = data["classes"][indices, :] - phonemes_len = 
data["classes:size1"][indices] - - #if scripted_model is None: - # model.to("cpu") - # export_trace(model=model, model_filename="testdump.onnx") - # model.to("cuda") - # model.train() - - # distributed_model = DataParallel(model) - log_probs, logits = model( - audio_features=audio_features, - audio_features_len=audio_features_len, - ) - - targets_packed = nn.utils.rnn.pack_padded_sequence(phonemes, phonemes_len.to("cpu"), batch_first=True, enforce_sorted=False) - targets_masked, _ = nn.utils.rnn.pad_packed_sequence(targets_packed, batch_first=True, padding_value=-100) - - loss = nn.functional.cross_entropy(logits, targets_masked, reduction="sum") - - num_frames = torch.sum(phonemes_len) - - run_ctx.mark_as_loss(name="CE", loss=loss, inv_norm_factor=num_frames) - - -# def export(*, model: Model, model_filename: str): -# scripted_model = torch.jit.optimize_for_inference(torch.jit.script(model.eval())) -# dummy_data = torch.randn(1, 30, 50, device="cpu") -# dummy_data_len, _ = torch.sort(torch.randint(low=10, high=30, size=(1,), device="cpu", dtype=torch.int32), descending=True) -# onnx_export( -# scripted_model, -# (dummy_data, dummy_data_len), -# f=model_filename, -# verbose=True, -# input_names=["data", "data_len"], -# output_names=["classes"], -# dynamic_axes={ -# # dict value: manually named axes -# "data": {0: "batch", 1: "time"}, -# "data_len": {0: "batch"}, -# "classes": {0: "batch", 1: "time"} -# } -# ) -# - -def export(*, model: Model, model_filename: str): - model.conformer.export_mode = True - dummy_data = torch.randn(1, 30, 50, device="cpu") - # dummy_data_len, _ = torch.sort(torch.randint(low=10, high=30, size=(1,), device="cpu", dtype=torch.int32), descending=True) - dummy_data_len = torch.ones((1,))*30 - scripted_model = torch.jit.optimize_for_inference(torch.jit.trace(model.eval(), example_inputs=(dummy_data, dummy_data_len))) - onnx_export( - scripted_model, - (dummy_data, dummy_data_len), - f=model_filename, - verbose=True, - input_names=["data", 
"data_len"], - output_names=["classes"], - dynamic_axes={ - # dict value: manually named axes - "data": {0: "batch", 1: "time"}, - "data_len": {0: "batch"}, - "classes": {0: "batch", 1: "time"} - } - ) - - diff --git a/users/rilling/experiments/librispeech/librispeech_100_hybrid/pytorch_networks/torchaudio_conformer_debug.py b/users/rilling/experiments/librispeech/librispeech_100_hybrid/pytorch_networks/torchaudio_conformer_debug.py deleted file mode 100644 index 0282fa789..000000000 --- a/users/rilling/experiments/librispeech/librispeech_100_hybrid/pytorch_networks/torchaudio_conformer_debug.py +++ /dev/null @@ -1,425 +0,0 @@ -import time -import torch -from torch import nn -from torch.onnx import export as onnx_export -from torchaudio.functional import mask_along_axis -from torchaudio.models.conformer import Conformer - - - -from typing import Optional, Tuple - -import torch - - -__all__ = ["Conformer"] - - -def lengths_to_mask(lengths, max_len: int): - return torch.arange(max_len).expand(lengths.size()[0], max_len) < lengths.unsqueeze(1) - - -class _ConvolutionModule(torch.nn.Module): - r"""Conformer convolution module. - - Args: - input_dim (int): input dimension. - num_channels (int): number of depthwise convolution layer input channels. - depthwise_kernel_size (int): kernel size of depthwise convolution layer. - dropout (float, optional): dropout probability. (Default: 0.0) - bias (bool, optional): indicates whether to add bias term to each convolution layer. (Default: ``False``) - use_group_norm (bool, optional): use GroupNorm rather than BatchNorm. 
(Default: ``False``) - """ - - def __init__( - self, - input_dim: int, - num_channels: int, - depthwise_kernel_size: int, - dropout: float = 0.0, - bias: bool = False, - use_group_norm: bool = False, - ) -> None: - super().__init__() - if (depthwise_kernel_size - 1) % 2 != 0: - raise ValueError("depthwise_kernel_size must be odd to achieve 'SAME' padding.") - self.layer_norm = torch.nn.LayerNorm(input_dim) - self.sequential = torch.nn.Sequential( - torch.nn.Conv1d( - input_dim, - 2 * num_channels, - 1, - stride=1, - padding=0, - bias=bias, - ), - torch.nn.GLU(dim=1), - torch.nn.Conv1d( - num_channels, - num_channels, - depthwise_kernel_size, - stride=1, - padding=(depthwise_kernel_size - 1) // 2, - groups=num_channels, - bias=bias, - ), - torch.nn.GroupNorm(num_groups=1, num_channels=num_channels) - if use_group_norm - else torch.nn.BatchNorm1d(num_channels), - torch.nn.SiLU(), - torch.nn.Conv1d( - num_channels, - input_dim, - kernel_size=1, - stride=1, - padding=0, - bias=bias, - ), - torch.nn.Dropout(dropout), - ) - - def forward(self, input: torch.Tensor) -> torch.Tensor: - r""" - Args: - input (torch.Tensor): with shape `(B, T, D)`. - - Returns: - torch.Tensor: output, with shape `(B, T, D)`. - """ - x = self.layer_norm(input) - x = x.transpose(1, 2) - x = self.sequential(x) - return x.transpose(1, 2) - - -class _FeedForwardModule(torch.nn.Module): - r"""Positionwise feed forward layer. - - Args: - input_dim (int): input dimension. - hidden_dim (int): hidden dimension. - dropout (float, optional): dropout probability. 
(Default: 0.0) - """ - - def __init__(self, input_dim: int, hidden_dim: int, dropout: float = 0.0) -> None: - super().__init__() - self.sequential = torch.nn.Sequential( - torch.nn.LayerNorm(input_dim), - torch.nn.Linear(input_dim, hidden_dim, bias=True), - torch.nn.SiLU(), - torch.nn.Dropout(dropout), - torch.nn.Linear(hidden_dim, input_dim, bias=True), - torch.nn.Dropout(dropout), - ) - - def forward(self, input: torch.Tensor) -> torch.Tensor: - r""" - Args: - input (torch.Tensor): with shape `(*, D)`. - - Returns: - torch.Tensor: output, with shape `(*, D)`. - """ - return self.sequential(input) - - -class ConformerLayer(torch.nn.Module): - r"""Conformer layer that constitutes Conformer. - - Args: - input_dim (int): input dimension. - ffn_dim (int): hidden layer dimension of feedforward network. - num_attention_heads (int): number of attention heads. - depthwise_conv_kernel_size (int): kernel size of depthwise convolution layer. - dropout (float, optional): dropout probability. (Default: 0.0) - use_group_norm (bool, optional): use ``GroupNorm`` rather than ``BatchNorm1d`` - in the convolution module. (Default: ``False``) - convolution_first (bool, optional): apply the convolution module ahead of - the attention module. 
(Default: ``False``) - """ - - def __init__( - self, - input_dim: int, - ffn_dim: int, - num_attention_heads: int, - depthwise_conv_kernel_size: int, - dropout: float = 0.0, - use_group_norm: bool = False, - convolution_first: bool = False, - ) -> None: - super().__init__() - - self.ffn1 = _FeedForwardModule(input_dim, ffn_dim, dropout=dropout) - - self.self_attn_layer_norm = torch.nn.LayerNorm(input_dim) - self.self_attn = torch.nn.MultiheadAttention(input_dim, num_attention_heads, dropout=dropout) - self.self_attn_dropout = torch.nn.Dropout(dropout) - - self.conv_module = _ConvolutionModule( - input_dim=input_dim, - num_channels=input_dim, - depthwise_kernel_size=depthwise_conv_kernel_size, - dropout=dropout, - bias=True, - use_group_norm=use_group_norm, - ) - - self.ffn2 = _FeedForwardModule(input_dim, ffn_dim, dropout=dropout) - self.final_layer_norm = torch.nn.LayerNorm(input_dim) - self.convolution_first = convolution_first - - def _apply_convolution(self, input: torch.Tensor) -> torch.Tensor: - residual = input - input = input.transpose(0, 1) - input = self.conv_module(input) - input = input.transpose(0, 1) - input = residual + input - return input - - def forward(self, input: torch.Tensor, key_padding_mask: Optional[torch.Tensor]) -> torch.Tensor: - r""" - Args: - input (torch.Tensor): input, with shape `(T, B, D)`. - key_padding_mask (torch.Tensor or None): key padding mask to use in self attention layer. - - Returns: - torch.Tensor: output, with shape `(T, B, D)`. 
- """ - residual = input - x = self.ffn1(input) - x = x * 0.5 + residual - - if self.convolution_first: - x = self._apply_convolution(x) - - residual = x - x = self.self_attn_layer_norm(x) - x, _ = self.self_attn( - query=x, - key=x, - value=x, - key_padding_mask=None, - need_weights=False, - ) - x = self.self_attn_dropout(x) - x = x + residual - - if not self.convolution_first: - x = self._apply_convolution(x) - - residual = x - x = self.ffn2(x) - x = x * 0.5 + residual - - x = self.final_layer_norm(x) - return x - - -class Conformer(torch.nn.Module): - r"""Conformer architecture introduced in - *Conformer: Convolution-augmented Transformer for Speech Recognition* - :cite:`gulati2020conformer`. - - Args: - input_dim (int): input dimension. - num_heads (int): number of attention heads in each Conformer layer. - ffn_dim (int): hidden layer dimension of feedforward networks. - num_layers (int): number of Conformer layers to instantiate. - depthwise_conv_kernel_size (int): kernel size of each Conformer layer's depthwise convolution layer. - dropout (float, optional): dropout probability. (Default: 0.0) - use_group_norm (bool, optional): use ``GroupNorm`` rather than ``BatchNorm1d`` - in the convolution module. (Default: ``False``) - convolution_first (bool, optional): apply the convolution module ahead of - the attention module. 
(Default: ``False``) - - Examples: - >>> conformer = Conformer( - >>> input_dim=80, - >>> num_heads=4, - >>> ffn_dim=128, - >>> num_layers=4, - >>> depthwise_conv_kernel_size=31, - >>> ) - >>> lengths = torch.randint(1, 400, (10,)) # (batch,) - >>> input = torch.rand(10, int(lengths.max()), input_dim) # (batch, num_frames, input_dim) - >>> output = conformer(input, lengths) - """ - - def __init__( - self, - input_dim: int, - num_heads: int, - ffn_dim: int, - num_layers: int, - depthwise_conv_kernel_size: int, - dropout: float = 0.0, - use_group_norm: bool = False, - convolution_first: bool = False, - ): - super().__init__() - - self.conformer_layers = torch.nn.ModuleList( - [ - ConformerLayer( - input_dim, - ffn_dim, - num_heads, - depthwise_conv_kernel_size, - dropout=dropout, - use_group_norm=use_group_norm, - convolution_first=convolution_first, - ) - for _ in range(num_layers) - ] - ) - - def forward(self, input: torch.Tensor, lengths: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: - r""" - Args: - input (torch.Tensor): with shape `(B, T, input_dim)`. - lengths (torch.Tensor): with shape `(B,)` and i-th element representing - number of valid frames for i-th batch element in ``input``. - - Returns: - (torch.Tensor, torch.Tensor) - torch.Tensor - output frames, with shape `(B, T, input_dim)` - torch.Tensor - output lengths, with shape `(B,)` and i-th element representing - number of valid frames for i-th batch element in output frames. 
- """ - encoder_padding_mask = lengths_to_mask(lengths, input.size()[1]) - - x = input.transpose(0, 1) - for layer in self.conformer_layers: - x = layer(x, encoder_padding_mask) - return x.transpose(0, 1), lengths - -class Model_(torch.nn.Module): - - def __init__(self): - super().__init__() - conformer_size = 384 - target_size=12001 - self.initial_linear = nn.Linear(50, conformer_size) - #self.conformer = Conformer( - # input_dim=conformer_size, - # num_heads=4, - # ffn_dim=1024, - # num_layers=8, - # depthwise_conv_kernel_size=31, - # dropout=0.1 - #) - self.self_attn = torch.nn.MultiheadAttention(conformer_size, 4, dropout=0.1) - self.final_linear = nn.Linear(conformer_size, target_size) - - def forward( - self, - audio_features: torch.Tensor, - audio_features_len: torch.Tensor, - ): - if self.training: - audio_features_time_masked = mask_along_axis(audio_features, mask_param=20, mask_value=0.0, axis=1) - audio_features_time_masked_2 = mask_along_axis(audio_features_time_masked, mask_param=20, mask_value=0.0, axis=1) - audio_features_masked = mask_along_axis(audio_features_time_masked_2, mask_param=10, mask_value=0.0, axis=2) - audio_features_masked_2 = mask_along_axis(audio_features_masked, mask_param=10, mask_value=0.0, axis=2) - else: - audio_features_masked_2 = audio_features - - - conformer_in = self.initial_linear(audio_features_masked_2) - - conformer_out, _ = self.self_attn(query=conformer_in, key=conformer_in, value=conformer_in) - - logits = self.final_linear(conformer_out) # [B, T, F] - logits_ce_order = torch.permute(logits, dims=(0, 2, 1)) # CE expects [B, F, T] - log_probs = torch.log_softmax(logits, dim=2) - - return log_probs, logits_ce_order - - -class Model(torch.nn.Module): - - def __init__(self): - super().__init__() - self.self_attn = torch.nn.MultiheadAttention(50, 5, dropout=0.1) - - def forward( - self, - input: torch.Tensor, - ): - conformer_out, _ = self.self_attn(query=input, key=input, value=input) - - return conformer_out - - 
-scripted_model = None - -def train_step(*, model: Model, data, run_ctx, **_kwargs): - global scripted_model - audio_features = data["data"] - audio_features_len = data["data:seq_len"] - - audio_features_len, indices = torch.sort(audio_features_len, descending=True) - audio_features = audio_features[indices, :, :] - - phonemes = data["classes"][indices, :] - phonemes_len = data["classes:seq_len"][indices] - - if scripted_model is None: - scripted_model = torch.jit.script(model) - export(model=model, model_filename="testdump.onnx") - - # distributed_model = DataParallel(model) - log_probs, logits = model( - audio_features=audio_features, - audio_features_len=audio_features_len.to("cuda"), - ) - - targets_packed = nn.utils.rnn.pack_padded_sequence(phonemes, phonemes_len, batch_first=True, enforce_sorted=False) - targets_masked, _ = nn.utils.rnn.pad_packed_sequence(targets_packed, batch_first=True, padding_value=-100) - - loss = nn.functional.cross_entropy(logits, targets_masked) - - run_ctx.mark_as_loss(name="CE", loss=loss) - - -def export(*, model: Model, model_filename: str): - scripted_model = torch.jit.optimize_for_inference(torch.jit.script(model.eval())) - dummy_data = torch.randn(1, 30, 50, device="cpu") - dummy_data_len, _ = torch.sort(torch.randint(low=10, high=30, size=(1,), device="cpu", dtype=torch.int32), descending=True) - onnx_export( - scripted_model, - (dummy_data, dummy_data_len), - f=model_filename, - verbose=True, - input_names=["in"], - output_names=["out"], - dynamic_axes={ - # dict value: manually named axes - "in": {0: "batch", 1: "time"}, - "out": {0: "batch", 1: "time"} - } - ) - - -def export_trace(*, model: Model, model_filename: str): - dummy_data = torch.randn(1, 30, 50, device="cpu") - dummy_data_len, _ = torch.sort(torch.randint(low=10, high=30, size=(1,), device="cpu", dtype=torch.int32), descending=True) - scripted_model = torch.jit.optimize_for_inference(torch.jit.trace(model.eval(), example_inputs=(dummy_data,))) - onnx_export( - 
scripted_model, - (dummy_data,), - f=model_filename, - verbose=True, - input_names=["in"], - output_names=["out"], - dynamic_axes={ - # dict value: manually named axes - "in": {0: "batch", 1: "time"}, - "out": {0: "batch", 1: "time"} - } - ) - - diff --git a/users/rilling/experiments/librispeech/librispeech_100_hybrid/pytorch_networks/torchaudio_conformer_large_fp16.py b/users/rilling/experiments/librispeech/librispeech_100_hybrid/pytorch_networks/torchaudio_conformer_large_fp16.py deleted file mode 100644 index 678e4a4c2..000000000 --- a/users/rilling/experiments/librispeech/librispeech_100_hybrid/pytorch_networks/torchaudio_conformer_large_fp16.py +++ /dev/null @@ -1,476 +0,0 @@ -import time - -from typing import Optional, Tuple - -import torch -from torch import nn, autocast -from torch.onnx import export as onnx_export -from torch.cuda.amp import GradScaler -from torchaudio.functional import mask_along_axis - -from returnn.log import log -from returnn.torch.engine import Engine as TorchEngine -from returnn.util.basic import NumbersDict -from returnn.torch.context import get_run_ctx, init_train_step_run_ctx - - -class CustomEngine(TorchEngine): - - def train_epoch(self): - """ - train one (sub)epoch - """ - print("start", self.get_epoch_str(), "with learning rate", self.learning_rate, "...", file=log.v4) - - self._model.train() - init_train_step_run_ctx(device=self._device) - - # Creates a GradScaler once at the beginning of training. 
- scaler = GradScaler() - - accumulated_losses_dict = NumbersDict() - step_idx = 0 - for data in self._train_dataloader: - step_time_start = time.time() - run_ctx = get_run_ctx() - run_ctx.init_step() - - self._updater.get_optimizer().zero_grad() - with autocast(device_type='cuda', dtype=torch.bfloat16): - self._run_step(data) - - losses_dict = run_ctx.losses - total_loss = run_ctx.total_loss() - - scaler.scale(total_loss).backward() - scaler.step(self._updater.get_optimizer()) - scaler.update() - - losses_dict = { - "train_loss_" + name: float(loss.loss.detach().cpu().numpy()) - for name, loss in losses_dict.items() - } - accumulated_losses_dict += NumbersDict(losses_dict) - print("step %i, loss: %f, took: %.3fs" % ( - step_idx, total_loss.detach().cpu().numpy(), time.time() - step_time_start - ), file=log.v4) - - step_idx += 1 - - print("Trained %i steps" % step_idx) - - accumulated_losses_dict = accumulated_losses_dict / step_idx - self.learning_rate_control.set_epoch_error(self.epoch, dict(accumulated_losses_dict)) - self.learning_rate_control.save() - - if self.epoch % self._save_model_epoch_interval == 0 or self.epoch == self._final_epoch: - self._save_model() - self._save_optimizer() - - self.eval_model() - - -def _lengths_to_padding_mask(lengths: torch.Tensor) -> torch.Tensor: - batch_size = lengths.shape[0] - max_length = int(torch.max(lengths).item()) - padding_mask = torch.arange(max_length, device=lengths.device, dtype=lengths.dtype).expand( - batch_size, max_length - ) >= lengths.unsqueeze(1) - return padding_mask - - -class _ConvolutionModule(torch.nn.Module): - r"""Conformer convolution module. - - Args: - input_dim (int): input dimension. - num_channels (int): number of depthwise convolution layer input channels. - depthwise_kernel_size (int): kernel size of depthwise convolution layer. - dropout (float, optional): dropout probability. (Default: 0.0) - bias (bool, optional): indicates whether to add bias term to each convolution layer. 
(Default: ``False``) - use_group_norm (bool, optional): use GroupNorm rather than BatchNorm. (Default: ``False``) - """ - - def __init__( - self, - input_dim: int, - num_channels: int, - depthwise_kernel_size: int, - dropout: float = 0.0, - bias: bool = False, - use_group_norm: bool = False, - ) -> None: - super().__init__() - if (depthwise_kernel_size - 1) % 2 != 0: - raise ValueError("depthwise_kernel_size must be odd to achieve 'SAME' padding.") - self.layer_norm = torch.nn.LayerNorm(input_dim) - self.sequential = torch.nn.Sequential( - torch.nn.Conv1d( - input_dim, - 2 * num_channels, - 1, - stride=1, - padding=0, - bias=bias, - ), - torch.nn.GLU(dim=1), - torch.nn.Conv1d( - num_channels, - num_channels, - depthwise_kernel_size, - stride=1, - padding=(depthwise_kernel_size - 1) // 2, - groups=num_channels, - bias=bias, - ), - torch.nn.GroupNorm(num_groups=1, num_channels=num_channels) - if use_group_norm - else torch.nn.BatchNorm1d(num_channels), - torch.nn.SiLU(), - torch.nn.Conv1d( - num_channels, - input_dim, - kernel_size=1, - stride=1, - padding=0, - bias=bias, - ), - torch.nn.Dropout(dropout), - ) - - def forward(self, input: torch.Tensor) -> torch.Tensor: - r""" - Args: - input (torch.Tensor): with shape `(B, T, D)`. - - Returns: - torch.Tensor: output, with shape `(B, T, D)`. - """ - x = self.layer_norm(input) - x = x.transpose(1, 2) - x = self.sequential(x) - return x.transpose(1, 2) - - -class _FeedForwardModule(torch.nn.Module): - r"""Positionwise feed forward layer. - - Args: - input_dim (int): input dimension. - hidden_dim (int): hidden dimension. - dropout (float, optional): dropout probability. 
(Default: 0.0) - """ - - def __init__(self, input_dim: int, hidden_dim: int, dropout: float = 0.0) -> None: - super().__init__() - self.sequential = torch.nn.Sequential( - torch.nn.LayerNorm(input_dim), - torch.nn.Linear(input_dim, hidden_dim, bias=True), - torch.nn.SiLU(), - torch.nn.Dropout(dropout), - torch.nn.Linear(hidden_dim, input_dim, bias=True), - torch.nn.Dropout(dropout), - ) - - def forward(self, input: torch.Tensor) -> torch.Tensor: - r""" - Args: - input (torch.Tensor): with shape `(*, D)`. - - Returns: - torch.Tensor: output, with shape `(*, D)`. - """ - return self.sequential(input) - - -class ConformerLayer(torch.nn.Module): - r"""Conformer layer that constitutes Conformer. - - Args: - input_dim (int): input dimension. - ffn_dim (int): hidden layer dimension of feedforward network. - num_attention_heads (int): number of attention heads. - depthwise_conv_kernel_size (int): kernel size of depthwise convolution layer. - dropout (float, optional): dropout probability. (Default: 0.0) - use_group_norm (bool, optional): use ``GroupNorm`` rather than ``BatchNorm1d`` - in the convolution module. (Default: ``False``) - convolution_first (bool, optional): apply the convolution module ahead of - the attention module. 
(Default: ``False``) - """ - - def __init__( - self, - input_dim: int, - ffn_dim: int, - num_attention_heads: int, - depthwise_conv_kernel_size: int, - dropout: float = 0.0, - use_group_norm: bool = False, - convolution_first: bool = False, - ) -> None: - super().__init__() - - self.ffn1 = _FeedForwardModule(input_dim, ffn_dim, dropout=dropout) - - self.self_attn_layer_norm = torch.nn.LayerNorm(input_dim) - self.self_attn = torch.nn.MultiheadAttention(input_dim, num_attention_heads, dropout=dropout) - self.self_attn_dropout = torch.nn.Dropout(dropout) - - self.conv_module = _ConvolutionModule( - input_dim=input_dim, - num_channels=input_dim, - depthwise_kernel_size=depthwise_conv_kernel_size, - dropout=dropout, - bias=True, - use_group_norm=use_group_norm, - ) - - self.ffn2 = _FeedForwardModule(input_dim, ffn_dim, dropout=dropout) - self.final_layer_norm = torch.nn.LayerNorm(input_dim) - self.convolution_first = convolution_first - - def _apply_convolution(self, input: torch.Tensor) -> torch.Tensor: - residual = input - input = input.transpose(0, 1) - input = self.conv_module(input) - input = input.transpose(0, 1) - input = residual + input - return input - - def forward(self, input: torch.Tensor, key_padding_mask: Optional[torch.Tensor]) -> torch.Tensor: - r""" - Args: - input (torch.Tensor): input, with shape `(T, B, D)`. - key_padding_mask (torch.Tensor or None): key padding mask to use in self attention layer. - - Returns: - torch.Tensor: output, with shape `(T, B, D)`. 
- """ - residual = input - x = self.ffn1(input) - x = x * 0.5 + residual - - if self.convolution_first: - x = self._apply_convolution(x) - - residual = x - x = self.self_attn_layer_norm(x) - x, _ = self.self_attn( - query=x, - key=x, - value=x, - key_padding_mask=key_padding_mask, - need_weights=False, - ) - x = self.self_attn_dropout(x) - x = x + residual - - if not self.convolution_first: - x = self._apply_convolution(x) - - residual = x - x = self.ffn2(x) - x = x * 0.5 + residual - - x = self.final_layer_norm(x) - return x - - -class Conformer(torch.nn.Module): - r"""Conformer architecture introduced in - *Conformer: Convolution-augmented Transformer for Speech Recognition* - :cite:`gulati2020conformer`. - - Args: - input_dim (int): input dimension. - num_heads (int): number of attention heads in each Conformer layer. - ffn_dim (int): hidden layer dimension of feedforward networks. - num_layers (int): number of Conformer layers to instantiate. - depthwise_conv_kernel_size (int): kernel size of each Conformer layer's depthwise convolution layer. - dropout (float, optional): dropout probability. (Default: 0.0) - use_group_norm (bool, optional): use ``GroupNorm`` rather than ``BatchNorm1d`` - in the convolution module. (Default: ``False``) - convolution_first (bool, optional): apply the convolution module ahead of - the attention module. 
(Default: ``False``) - - Examples: - >>> conformer = Conformer( - >>> input_dim=80, - >>> num_heads=4, - >>> ffn_dim=128, - >>> num_layers=4, - >>> depthwise_conv_kernel_size=31, - >>> ) - >>> lengths = torch.randint(1, 400, (10,)) # (batch,) - >>> input = torch.rand(10, int(lengths.max()), input_dim) # (batch, num_frames, input_dim) - >>> output = conformer(input, lengths) - """ - - def __init__( - self, - input_dim: int, - num_heads: int, - ffn_dim: int, - num_layers: int, - depthwise_conv_kernel_size: int, - dropout: float = 0.0, - use_group_norm: bool = False, - convolution_first: bool = False, - ): - super().__init__() - - self.conformer_layers = torch.nn.ModuleList( - [ - ConformerLayer( - input_dim, - ffn_dim, - num_heads, - depthwise_conv_kernel_size, - dropout=dropout, - use_group_norm=use_group_norm, - convolution_first=convolution_first, - ) - for _ in range(num_layers) - ] - ) - self.export_mode = False - - def forward(self, input: torch.Tensor, lengths: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: - r""" - Args: - input (torch.Tensor): with shape `(B, T, input_dim)`. - lengths (torch.Tensor): with shape `(B,)` and i-th element representing - number of valid frames for i-th batch element in ``input``. - - Returns: - (torch.Tensor, torch.Tensor) - torch.Tensor - output frames, with shape `(B, T, input_dim)` - torch.Tensor - output lengths, with shape `(B,)` and i-th element representing - number of valid frames for i-th batch element in output frames. 
- """ - encoder_padding_mask = None if self.export_mode else _lengths_to_padding_mask(lengths) - - x = input.transpose(0, 1) - for layer in self.conformer_layers: - x = layer(x, encoder_padding_mask) - return x.transpose(0, 1), lengths - -class Model(torch.nn.Module): - - def __init__(self): - super().__init__() - conformer_size = 512 - target_size=12001 - self.initial_linear = nn.Linear(50, conformer_size) - self.conformer = Conformer( - input_dim=conformer_size, - num_heads=8, - ffn_dim=2048, - num_layers=12, - depthwise_conv_kernel_size=31, - dropout=0.1 - ) - self.final_linear = nn.Linear(conformer_size, target_size) - - def forward( - self, - audio_features: torch.Tensor, - audio_features_len: torch.Tensor, - ): - if self.training: - audio_features_time_masked = mask_along_axis(audio_features, mask_param=20, mask_value=0.0, axis=1) - audio_features_time_masked_2 = mask_along_axis(audio_features_time_masked, mask_param=20, mask_value=0.0, axis=1) - audio_features_masked = mask_along_axis(audio_features_time_masked_2, mask_param=10, mask_value=0.0, axis=2) - audio_features_masked_2 = mask_along_axis(audio_features_masked, mask_param=10, mask_value=0.0, axis=2) - else: - audio_features_masked_2 = audio_features - - - conformer_in = self.initial_linear(audio_features_masked_2) - - conformer_out, _ = self.conformer(conformer_in, audio_features_len) - - logits = self.final_linear(conformer_out) # [B, T, F] - logits_ce_order = torch.permute(logits, dims=(0, 2, 1)) # CE expects [B, F, T] - log_probs = torch.log_softmax(logits, dim=2) - - return log_probs, logits_ce_order - - -scripted_model = None - -def train_step(*, model: Model, data, run_ctx, **_kwargs): - global scripted_model - audio_features = data["data"] - audio_features_len = data["data:seq_len"] - - audio_features_len, indices = torch.sort(audio_features_len, descending=True) - audio_features = audio_features[indices, :, :] - - phonemes = data["classes"][indices, :] - phonemes_len = 
data["classes:seq_len"][indices] - - #if scripted_model is None: - # model.to("cpu") - # export_trace(model=model, model_filename="testdump.onnx") - # model.to("cuda") - # model.train() - - # distributed_model = DataParallel(model) - log_probs, logits = model( - audio_features=audio_features, - audio_features_len=audio_features_len.to("cuda"), - ) - - targets_packed = nn.utils.rnn.pack_padded_sequence(phonemes, phonemes_len, batch_first=True, enforce_sorted=False) - targets_masked, _ = nn.utils.rnn.pad_packed_sequence(targets_packed, batch_first=True, padding_value=-100) - - loss = nn.functional.cross_entropy(logits, targets_masked) - - run_ctx.mark_as_loss(name="CE", loss=loss) - - -# def export(*, model: Model, model_filename: str): -# scripted_model = torch.jit.optimize_for_inference(torch.jit.script(model.eval())) -# dummy_data = torch.randn(1, 30, 50, device="cpu") -# dummy_data_len, _ = torch.sort(torch.randint(low=10, high=30, size=(1,), device="cpu", dtype=torch.int32), descending=True) -# onnx_export( -# scripted_model, -# (dummy_data, dummy_data_len), -# f=model_filename, -# verbose=True, -# input_names=["data", "data_len"], -# output_names=["classes"], -# dynamic_axes={ -# # dict value: manually named axes -# "data": {0: "batch", 1: "time"}, -# "data_len": {0: "batch"}, -# "classes": {0: "batch", 1: "time"} -# } -# ) -# - -def export_trace(*, model: Model, model_filename: str): - model.conformer.export_mode = True - dummy_data = torch.randn(1, 30, 50, device="cpu") - # dummy_data_len, _ = torch.sort(torch.randint(low=10, high=30, size=(1,), device="cpu", dtype=torch.int32), descending=True) - dummy_data_len = torch.ones((1,))*30 - scripted_model = torch.jit.optimize_for_inference(torch.jit.trace(model.eval(), example_inputs=(dummy_data, dummy_data_len))) - onnx_export( - scripted_model, - (dummy_data, dummy_data_len), - f=model_filename, - verbose=True, - input_names=["data", "data_len"], - output_names=["classes"], - dynamic_axes={ - # dict value: 
manually named axes - "data": {0: "batch", 1: "time"}, - "data_len": {0: "batch"}, - "classes": {0: "batch", 1: "time"} - } - ) - - diff --git a/users/rilling/experiments/librispeech/librispeech_100_hybrid/pytorch_networks/torchaudio_conformer_subsample.py b/users/rilling/experiments/librispeech/librispeech_100_hybrid/pytorch_networks/torchaudio_conformer_subsample.py deleted file mode 100644 index f48d2322c..000000000 --- a/users/rilling/experiments/librispeech/librispeech_100_hybrid/pytorch_networks/torchaudio_conformer_subsample.py +++ /dev/null @@ -1,426 +0,0 @@ -import time -import torch -from torch import nn -from torch.onnx import export as onnx_export -from torchaudio.functional import mask_along_axis -from torchaudio.models.conformer import Conformer - - - -from typing import Optional, Tuple - -import torch - - -__all__ = ["Conformer"] - - -def _lengths_to_padding_mask(lengths: torch.Tensor) -> torch.Tensor: - batch_size = lengths.shape[0] - max_length = int(torch.max(lengths).item()) - padding_mask = torch.arange(max_length, device=lengths.device, dtype=lengths.dtype).expand( - batch_size, max_length - ) >= lengths.unsqueeze(1) - return padding_mask - - -class _ConvolutionModule(torch.nn.Module): - r"""Conformer convolution module. - - Args: - input_dim (int): input dimension. - num_channels (int): number of depthwise convolution layer input channels. - depthwise_kernel_size (int): kernel size of depthwise convolution layer. - dropout (float, optional): dropout probability. (Default: 0.0) - bias (bool, optional): indicates whether to add bias term to each convolution layer. (Default: ``False``) - use_group_norm (bool, optional): use GroupNorm rather than BatchNorm. 
(Default: ``False``) - """ - - def __init__( - self, - input_dim: int, - num_channels: int, - depthwise_kernel_size: int, - dropout: float = 0.0, - bias: bool = False, - use_group_norm: bool = False, - ) -> None: - super().__init__() - if (depthwise_kernel_size - 1) % 2 != 0: - raise ValueError("depthwise_kernel_size must be odd to achieve 'SAME' padding.") - self.layer_norm = torch.nn.LayerNorm(input_dim) - self.sequential = torch.nn.Sequential( - torch.nn.Conv1d( - input_dim, - 2 * num_channels, - 1, - stride=1, - padding=0, - bias=bias, - ), - torch.nn.GLU(dim=1), - torch.nn.Conv1d( - num_channels, - num_channels, - depthwise_kernel_size, - stride=1, - padding=(depthwise_kernel_size - 1) // 2, - groups=num_channels, - bias=bias, - ), - torch.nn.GroupNorm(num_groups=1, num_channels=num_channels) - if use_group_norm - else torch.nn.BatchNorm1d(num_channels), - torch.nn.SiLU(), - torch.nn.Conv1d( - num_channels, - input_dim, - kernel_size=1, - stride=1, - padding=0, - bias=bias, - ), - torch.nn.Dropout(dropout), - ) - - def forward(self, input: torch.Tensor) -> torch.Tensor: - r""" - Args: - input (torch.Tensor): with shape `(B, T, D)`. - - Returns: - torch.Tensor: output, with shape `(B, T, D)`. - """ - x = self.layer_norm(input) - x = x.transpose(1, 2) - x = self.sequential(x) - return x.transpose(1, 2) - - -class _FeedForwardModule(torch.nn.Module): - r"""Positionwise feed forward layer. - - Args: - input_dim (int): input dimension. - hidden_dim (int): hidden dimension. - dropout (float, optional): dropout probability. 
(Default: 0.0) - """ - - def __init__(self, input_dim: int, hidden_dim: int, dropout: float = 0.0) -> None: - super().__init__() - self.sequential = torch.nn.Sequential( - torch.nn.LayerNorm(input_dim), - torch.nn.Linear(input_dim, hidden_dim, bias=True), - torch.nn.SiLU(), - torch.nn.Dropout(dropout), - torch.nn.Linear(hidden_dim, input_dim, bias=True), - torch.nn.Dropout(dropout), - ) - - def forward(self, input: torch.Tensor) -> torch.Tensor: - r""" - Args: - input (torch.Tensor): with shape `(*, D)`. - - Returns: - torch.Tensor: output, with shape `(*, D)`. - """ - return self.sequential(input) - - -class ConformerLayer(torch.nn.Module): - r"""Conformer layer that constitutes Conformer. - - Args: - input_dim (int): input dimension. - ffn_dim (int): hidden layer dimension of feedforward network. - num_attention_heads (int): number of attention heads. - depthwise_conv_kernel_size (int): kernel size of depthwise convolution layer. - dropout (float, optional): dropout probability. (Default: 0.0) - use_group_norm (bool, optional): use ``GroupNorm`` rather than ``BatchNorm1d`` - in the convolution module. (Default: ``False``) - convolution_first (bool, optional): apply the convolution module ahead of - the attention module. 
(Default: ``False``) - """ - - def __init__( - self, - input_dim: int, - ffn_dim: int, - num_attention_heads: int, - depthwise_conv_kernel_size: int, - dropout: float = 0.0, - use_group_norm: bool = False, - convolution_first: bool = False, - ) -> None: - super().__init__() - - self.ffn1 = _FeedForwardModule(input_dim, ffn_dim, dropout=dropout) - - self.self_attn_layer_norm = torch.nn.LayerNorm(input_dim) - self.self_attn = torch.nn.MultiheadAttention(input_dim, num_attention_heads, dropout=dropout) - self.self_attn_dropout = torch.nn.Dropout(dropout) - - self.conv_module = _ConvolutionModule( - input_dim=input_dim, - num_channels=input_dim, - depthwise_kernel_size=depthwise_conv_kernel_size, - dropout=dropout, - bias=True, - use_group_norm=use_group_norm, - ) - - self.ffn2 = _FeedForwardModule(input_dim, ffn_dim, dropout=dropout) - self.final_layer_norm = torch.nn.LayerNorm(input_dim) - self.convolution_first = convolution_first - - def _apply_convolution(self, input: torch.Tensor) -> torch.Tensor: - residual = input - input = input.transpose(0, 1) - input = self.conv_module(input) - input = input.transpose(0, 1) - input = residual + input - return input - - def forward(self, input: torch.Tensor, key_padding_mask: Optional[torch.Tensor]) -> torch.Tensor: - r""" - Args: - input (torch.Tensor): input, with shape `(T, B, D)`. - key_padding_mask (torch.Tensor or None): key padding mask to use in self attention layer. - - Returns: - torch.Tensor: output, with shape `(T, B, D)`. 
- """ - residual = input - x = self.ffn1(input) - x = x * 0.5 + residual - - if self.convolution_first: - x = self._apply_convolution(x) - - residual = x - x = self.self_attn_layer_norm(x) - x, _ = self.self_attn( - query=x, - key=x, - value=x, - key_padding_mask=key_padding_mask, - need_weights=False, - ) - x = self.self_attn_dropout(x) - x = x + residual - - if not self.convolution_first: - x = self._apply_convolution(x) - - residual = x - x = self.ffn2(x) - x = x * 0.5 + residual - - x = self.final_layer_norm(x) - return x - - -class Conformer(torch.nn.Module): - r"""Conformer architecture introduced in - *Conformer: Convolution-augmented Transformer for Speech Recognition* - :cite:`gulati2020conformer`. - - Args: - input_dim (int): input dimension. - num_heads (int): number of attention heads in each Conformer layer. - ffn_dim (int): hidden layer dimension of feedforward networks. - num_layers (int): number of Conformer layers to instantiate. - depthwise_conv_kernel_size (int): kernel size of each Conformer layer's depthwise convolution layer. - dropout (float, optional): dropout probability. (Default: 0.0) - use_group_norm (bool, optional): use ``GroupNorm`` rather than ``BatchNorm1d`` - in the convolution module. (Default: ``False``) - convolution_first (bool, optional): apply the convolution module ahead of - the attention module. 
(Default: ``False``) - - Examples: - >>> conformer = Conformer( - >>> input_dim=80, - >>> num_heads=4, - >>> ffn_dim=128, - >>> num_layers=4, - >>> depthwise_conv_kernel_size=31, - >>> ) - >>> lengths = torch.randint(1, 400, (10,)) # (batch,) - >>> input = torch.rand(10, int(lengths.max()), input_dim) # (batch, num_frames, input_dim) - >>> output = conformer(input, lengths) - """ - - def __init__( - self, - input_dim: int, - num_heads: int, - ffn_dim: int, - num_layers: int, - depthwise_conv_kernel_size: int, - dropout: float = 0.0, - use_group_norm: bool = False, - convolution_first: bool = False, - ): - super().__init__() - - self.downsample_conv = torch.nn.Conv1d(in_channels=input_dim, out_channels=input_dim, kernel_size=5, stride=2, padding=2) - self.conformer_layers = torch.nn.ModuleList( - [ - ConformerLayer( - input_dim, - ffn_dim, - num_heads, - depthwise_conv_kernel_size, - dropout=dropout, - use_group_norm=use_group_norm, - convolution_first=convolution_first, - ) - for _ in range(num_layers) - ] - ) - self.upsample_conv = torch.nn.ConvTranspose1d(in_channels=input_dim, out_channels=input_dim, kernel_size=5, stride=2, padding=1) - self.export_mode = False - - def forward(self, input: torch.Tensor, lengths: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: - r""" - Args: - input (torch.Tensor): with shape `(B, T, input_dim)`. - lengths (torch.Tensor): with shape `(B,)` and i-th element representing - number of valid frames for i-th batch element in ``input``. - - Returns: - (torch.Tensor, torch.Tensor) - torch.Tensor - output frames, with shape `(B, T, input_dim)` - torch.Tensor - output lengths, with shape `(B,)` and i-th element representing - number of valid frames for i-th batch element in output frames. 
- """ - - - input = self.downsample_conv.forward(input.transpose(1,2)).transpose(1, 2) - encoder_padding_mask = None if self.export_mode else _lengths_to_padding_mask((lengths+1)//2) - - x = input.transpose(0, 1) - for layer in self.conformer_layers: - x = layer(x, encoder_padding_mask) - return x.transpose(0, 1), lengths - -class Model(torch.nn.Module): - - def __init__(self): - super().__init__() - conformer_size = 384 - target_size=12001 - self.initial_linear = nn.Linear(50, conformer_size) - self.conformer = Conformer( - input_dim=conformer_size, - num_heads=4, - ffn_dim=1024, - num_layers=8, - depthwise_conv_kernel_size=31, - dropout=0.1 - ) - self.final_linear = nn.Linear(conformer_size, target_size) - - def forward( - self, - audio_features: torch.Tensor, - audio_features_len: torch.Tensor, - ): - if self.training: - audio_features_time_masked = mask_along_axis(audio_features, mask_param=20, mask_value=0.0, axis=1) - audio_features_time_masked_2 = mask_along_axis(audio_features_time_masked, mask_param=20, mask_value=0.0, axis=1) - audio_features_masked = mask_along_axis(audio_features_time_masked_2, mask_param=10, mask_value=0.0, axis=2) - audio_features_masked_2 = mask_along_axis(audio_features_masked, mask_param=10, mask_value=0.0, axis=2) - else: - audio_features_masked_2 = audio_features - - - conformer_in = self.initial_linear(audio_features_masked_2) - - conformer_out, _ = self.conformer(conformer_in, audio_features_len) - - logits = self.final_linear(conformer_out) # [B, T, F] - logits_ce_order = torch.permute(logits, dims=(0, 2, 1)) # CE expects [B, F, T] - log_probs = torch.log_softmax(logits, dim=2) - - return log_probs, logits_ce_order - - -scripted_model = None - -def train_step(*, model: Model, data, run_ctx, **_kwargs): - global scripted_model - audio_features = data["data"] - audio_features_len = data["data:seq_len"] - - audio_features_len, indices = torch.sort(audio_features_len, descending=True) - audio_features = audio_features[indices, :, 
:] - - phonemes = data["classes"][indices, :] - phonemes_len = data["classes:seq_len"][indices] - - #if scripted_model is None: - # model.to("cpu") - # export_trace(model=model, model_filename="testdump.onnx") - # model.to("cuda") - # model.train() - - # distributed_model = DataParallel(model) - log_probs, logits = model( - audio_features=audio_features, - audio_features_len=audio_features_len.to("cuda"), - ) - - targets_packed = nn.utils.rnn.pack_padded_sequence(phonemes[:,::2], (phonemes_len+1)//2, batch_first=True, enforce_sorted=False) - targets_masked, _ = nn.utils.rnn.pad_packed_sequence(targets_packed, batch_first=True, padding_value=-100) - - loss = nn.functional.cross_entropy(logits, targets_masked) - - run_ctx.mark_as_loss(name="CE", loss=loss) - - -# def export(*, model: Model, model_filename: str): -# scripted_model = torch.jit.optimize_for_inference(torch.jit.script(model.eval())) -# dummy_data = torch.randn(1, 30, 50, device="cpu") -# dummy_data_len, _ = torch.sort(torch.randint(low=10, high=30, size=(1,), device="cpu", dtype=torch.int32), descending=True) -# onnx_export( -# scripted_model, -# (dummy_data, dummy_data_len), -# f=model_filename, -# verbose=True, -# input_names=["data", "data_len"], -# output_names=["classes"], -# dynamic_axes={ -# # dict value: manually named axes -# "data": {0: "batch", 1: "time"}, -# "data_len": {0: "batch"}, -# "classes": {0: "batch", 1: "time"} -# } -# ) -# - -def export_trace(*, model: Model, model_filename: str): - model.conformer.export_mode = True - dummy_data = torch.randn(1, 30, 50, device="cpu") - # dummy_data_len, _ = torch.sort(torch.randint(low=10, high=30, size=(1,), device="cpu", dtype=torch.int32), descending=True) - dummy_data_len = torch.ones((1,))*30 - scripted_model = torch.jit.optimize_for_inference(torch.jit.trace(model.eval(), example_inputs=(dummy_data, dummy_data_len))) - onnx_export( - scripted_model, - (dummy_data, dummy_data_len), - f=model_filename, - verbose=True, - input_names=["data", 
"data_len"], - output_names=["classes"], - dynamic_axes={ - # dict value: manually named axes - "data": {0: "batch", 1: "time"}, - "data_len": {0: "batch"}, - "classes": {0: "batch", 1: "time"} - } - ) - - diff --git a/users/rilling/experiments/librispeech/librispeech_100_hybrid/pytorch_networks/torchaudio_conformer_subsample_upsample.py b/users/rilling/experiments/librispeech/librispeech_100_hybrid/pytorch_networks/torchaudio_conformer_subsample_upsample.py deleted file mode 100644 index 8251c4d1e..000000000 --- a/users/rilling/experiments/librispeech/librispeech_100_hybrid/pytorch_networks/torchaudio_conformer_subsample_upsample.py +++ /dev/null @@ -1,440 +0,0 @@ -import time -import torch -from torch import nn -from torch.onnx import export as onnx_export -from torchaudio.functional import mask_along_axis -from torchaudio.models.conformer import Conformer - - - -from typing import Optional, Tuple - -import torch - - -__all__ = ["Conformer"] - - -def _lengths_to_padding_mask(lengths: torch.Tensor) -> torch.Tensor: - batch_size = lengths.shape[0] - max_length = int(torch.max(lengths).item()) - padding_mask = torch.arange(max_length, device=lengths.device, dtype=lengths.dtype).expand( - batch_size, max_length - ) >= lengths.unsqueeze(1) - return padding_mask - - -class _ConvolutionModule(torch.nn.Module): - r"""Conformer convolution module. - - Args: - input_dim (int): input dimension. - num_channels (int): number of depthwise convolution layer input channels. - depthwise_kernel_size (int): kernel size of depthwise convolution layer. - dropout (float, optional): dropout probability. (Default: 0.0) - bias (bool, optional): indicates whether to add bias term to each convolution layer. (Default: ``False``) - use_group_norm (bool, optional): use GroupNorm rather than BatchNorm. 
(Default: ``False``) - """ - - def __init__( - self, - input_dim: int, - num_channels: int, - depthwise_kernel_size: int, - dropout: float = 0.0, - bias: bool = False, - use_group_norm: bool = False, - ) -> None: - super().__init__() - if (depthwise_kernel_size - 1) % 2 != 0: - raise ValueError("depthwise_kernel_size must be odd to achieve 'SAME' padding.") - self.layer_norm = torch.nn.LayerNorm(input_dim) - self.sequential = torch.nn.Sequential( - torch.nn.Conv1d( - input_dim, - 2 * num_channels, - 1, - stride=1, - padding=0, - bias=bias, - ), - torch.nn.GLU(dim=1), - torch.nn.Conv1d( - num_channels, - num_channels, - depthwise_kernel_size, - stride=1, - padding=(depthwise_kernel_size - 1) // 2, - groups=num_channels, - bias=bias, - ), - torch.nn.GroupNorm(num_groups=1, num_channels=num_channels) - if use_group_norm - else torch.nn.BatchNorm1d(num_channels), - torch.nn.SiLU(), - torch.nn.Conv1d( - num_channels, - input_dim, - kernel_size=1, - stride=1, - padding=0, - bias=bias, - ), - torch.nn.Dropout(dropout), - ) - - def forward(self, input: torch.Tensor) -> torch.Tensor: - r""" - Args: - input (torch.Tensor): with shape `(B, T, D)`. - - Returns: - torch.Tensor: output, with shape `(B, T, D)`. - """ - x = self.layer_norm(input) - x = x.transpose(1, 2) - x = self.sequential(x) - return x.transpose(1, 2) - - -class _FeedForwardModule(torch.nn.Module): - r"""Positionwise feed forward layer. - - Args: - input_dim (int): input dimension. - hidden_dim (int): hidden dimension. - dropout (float, optional): dropout probability. 
(Default: 0.0) - """ - - def __init__(self, input_dim: int, hidden_dim: int, dropout: float = 0.0) -> None: - super().__init__() - self.sequential = torch.nn.Sequential( - torch.nn.LayerNorm(input_dim), - torch.nn.Linear(input_dim, hidden_dim, bias=True), - torch.nn.SiLU(), - torch.nn.Dropout(dropout), - torch.nn.Linear(hidden_dim, input_dim, bias=True), - torch.nn.Dropout(dropout), - ) - - def forward(self, input: torch.Tensor) -> torch.Tensor: - r""" - Args: - input (torch.Tensor): with shape `(*, D)`. - - Returns: - torch.Tensor: output, with shape `(*, D)`. - """ - return self.sequential(input) - - -class ConformerLayer(torch.nn.Module): - r"""Conformer layer that constitutes Conformer. - - Args: - input_dim (int): input dimension. - ffn_dim (int): hidden layer dimension of feedforward network. - num_attention_heads (int): number of attention heads. - depthwise_conv_kernel_size (int): kernel size of depthwise convolution layer. - dropout (float, optional): dropout probability. (Default: 0.0) - use_group_norm (bool, optional): use ``GroupNorm`` rather than ``BatchNorm1d`` - in the convolution module. (Default: ``False``) - convolution_first (bool, optional): apply the convolution module ahead of - the attention module. 
(Default: ``False``) - """ - - def __init__( - self, - input_dim: int, - ffn_dim: int, - num_attention_heads: int, - depthwise_conv_kernel_size: int, - dropout: float = 0.0, - use_group_norm: bool = False, - convolution_first: bool = False, - ) -> None: - super().__init__() - - self.ffn1 = _FeedForwardModule(input_dim, ffn_dim, dropout=dropout) - - self.self_attn_layer_norm = torch.nn.LayerNorm(input_dim) - self.self_attn = torch.nn.MultiheadAttention(input_dim, num_attention_heads, dropout=dropout) - self.self_attn_dropout = torch.nn.Dropout(dropout) - - self.conv_module = _ConvolutionModule( - input_dim=input_dim, - num_channels=input_dim, - depthwise_kernel_size=depthwise_conv_kernel_size, - dropout=dropout, - bias=True, - use_group_norm=use_group_norm, - ) - - self.ffn2 = _FeedForwardModule(input_dim, ffn_dim, dropout=dropout) - self.final_layer_norm = torch.nn.LayerNorm(input_dim) - self.convolution_first = convolution_first - - def _apply_convolution(self, input: torch.Tensor) -> torch.Tensor: - residual = input - input = input.transpose(0, 1) - input = self.conv_module(input) - input = input.transpose(0, 1) - input = residual + input - return input - - def forward(self, input: torch.Tensor, key_padding_mask: Optional[torch.Tensor]) -> torch.Tensor: - r""" - Args: - input (torch.Tensor): input, with shape `(T, B, D)`. - key_padding_mask (torch.Tensor or None): key padding mask to use in self attention layer. - - Returns: - torch.Tensor: output, with shape `(T, B, D)`. 
- """ - residual = input - x = self.ffn1(input) - x = x * 0.5 + residual - - if self.convolution_first: - x = self._apply_convolution(x) - - residual = x - x = self.self_attn_layer_norm(x) - x, _ = self.self_attn( - query=x, - key=x, - value=x, - key_padding_mask=key_padding_mask, - need_weights=False, - ) - x = self.self_attn_dropout(x) - x = x + residual - - if not self.convolution_first: - x = self._apply_convolution(x) - - residual = x - x = self.ffn2(x) - x = x * 0.5 + residual - - x = self.final_layer_norm(x) - return x - - -class Conformer(torch.nn.Module): - r"""Conformer architecture introduced in - *Conformer: Convolution-augmented Transformer for Speech Recognition* - :cite:`gulati2020conformer`. - - Extended version with very simple downsampling and upsampling - - Args: - input_dim (int): input dimension. - num_heads (int): number of attention heads in each Conformer layer. - ffn_dim (int): hidden layer dimension of feedforward networks. - num_layers (int): number of Conformer layers to instantiate. - depthwise_conv_kernel_size (int): kernel size of each Conformer layer's depthwise convolution layer. - dropout (float, optional): dropout probability. (Default: 0.0) - use_group_norm (bool, optional): use ``GroupNorm`` rather than ``BatchNorm1d`` - in the convolution module. (Default: ``False``) - convolution_first (bool, optional): apply the convolution module ahead of - the attention module. 
(Default: ``False``) - - Examples: - >>> conformer = Conformer( - >>> input_dim=80, - >>> num_heads=4, - >>> ffn_dim=128, - >>> num_layers=4, - >>> depthwise_conv_kernel_size=31, - >>> ) - >>> lengths = torch.randint(1, 400, (10,)) # (batch,) - >>> input = torch.rand(10, int(lengths.max()), input_dim) # (batch, num_frames, input_dim) - >>> output = conformer(input, lengths) - """ - - def __init__( - self, - input_dim: int, - num_heads: int, - ffn_dim: int, - num_layers: int, - depthwise_conv_kernel_size: int, - dropout: float = 0.0, - use_group_norm: bool = False, - convolution_first: bool = False, - ): - super().__init__() - - self.downsample_conv = torch.nn.Conv1d(in_channels=input_dim, out_channels=input_dim, kernel_size=5, stride=2, padding=2) - self.conformer_layers = torch.nn.ModuleList( - [ - ConformerLayer( - input_dim, - ffn_dim, - num_heads, - depthwise_conv_kernel_size, - dropout=dropout, - use_group_norm=use_group_norm, - convolution_first=convolution_first, - ) - for _ in range(num_layers) - ] - ) - self.upsample_conv = torch.nn.ConvTranspose1d(in_channels=input_dim, out_channels=input_dim, kernel_size=5, stride=2, padding=1) - self.export_mode = False - - def forward(self, input: torch.Tensor, lengths: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: - r""" - Args: - input (torch.Tensor): with shape `(B, T, input_dim)`. - lengths (torch.Tensor): with shape `(B,)` and i-th element representing - number of valid frames for i-th batch element in ``input``. - - Returns: - (torch.Tensor, torch.Tensor) - torch.Tensor - output frames, with shape `(B, T, input_dim)` - torch.Tensor - output lengths, with shape `(B,)` and i-th element representing - number of valid frames for i-th batch element in output frames. 
- """ - - # downsampling is done as [B, F, T] - input_downsampled = self.downsample_conv.forward(input.transpose(1,2)).transpose(1, 2) - - # also downsample the mask for training, in ONNX export we currently ignore the mask - encoder_padding_mask = None if self.export_mode else _lengths_to_padding_mask((lengths+1)//2) - - # Conformer is applied as [T, B, F] - x = input_downsampled.transpose(0, 1) - for layer in self.conformer_layers: - x = layer(x, encoder_padding_mask) # [T, B, F] - - conf_output = torch.permute(x, (1, 2 ,0)) # [B, F, T] for upsampling - upsampled = self.upsample_conv(conf_output).transpose(1, 2) # final upsampled [B, T, F] - - # slice for correct length - out_upsampled = upsampled[:,0:input.size()[1],:] - - return out_upsampled, lengths - -class Model(torch.nn.Module): - - def __init__(self, epoch, step, **kwargs): - super().__init__() - conformer_size = 384 - target_size=12001 - self.initial_linear = nn.Linear(50, conformer_size) - self.conformer = Conformer( - input_dim=conformer_size, - num_heads=4, - ffn_dim=1024, - num_layers=8, - depthwise_conv_kernel_size=31, - dropout=0.1 - ) - self.final_linear = nn.Linear(conformer_size, target_size) - - def forward( - self, - audio_features: torch.Tensor, - audio_features_len: torch.Tensor, - ): - if self.training: - audio_features_time_masked = mask_along_axis(audio_features, mask_param=20, mask_value=0.0, axis=1) - audio_features_time_masked_2 = mask_along_axis(audio_features_time_masked, mask_param=20, mask_value=0.0, axis=1) - audio_features_masked = mask_along_axis(audio_features_time_masked_2, mask_param=10, mask_value=0.0, axis=2) - audio_features_masked_2 = mask_along_axis(audio_features_masked, mask_param=10, mask_value=0.0, axis=2) - else: - audio_features_masked_2 = audio_features - - - conformer_in = self.initial_linear(audio_features_masked_2) - - conformer_out, _ = self.conformer(conformer_in, audio_features_len) - - logits = self.final_linear(conformer_out) # [B, T, F] - logits_ce_order 
= torch.permute(logits, dims=(0, 2, 1)) # CE expects [B, F, T] - log_probs = torch.log_softmax(logits, dim=2) - - return log_probs, logits_ce_order - - -# scripted_model = None - -def train_step(*, model: Model, data, run_ctx, **_kwargs): - global scripted_model - audio_features = data["data"] - audio_features_len = data["data:size1"] - - audio_features_len, indices = torch.sort(audio_features_len, descending=True) - audio_features = audio_features[indices, :, :] - - phonemes = data["classes"][indices, :] - phonemes_len = data["classes:size1"][indices] - - #if scripted_model is None: - # model.eval() - # model.to("cpu") - # export_trace(model=model, model_filename="testdump.onnx") - # assert False - - # distributed_model = DataParallel(model) - log_probs, logits = model( - audio_features=audio_features, - audio_features_len=audio_features_len.to("cuda"), - ) - - - targets_packed = nn.utils.rnn.pack_padded_sequence(phonemes, phonemes_len.to("cpu"), batch_first=True, enforce_sorted=False) - targets_masked, _ = nn.utils.rnn.pad_packed_sequence(targets_packed, batch_first=True, padding_value=-100) - - loss = nn.functional.cross_entropy(logits, targets_masked) - - run_ctx.mark_as_loss(name="CE", loss=loss) - - -# def export(*, model: Model, model_filename: str): -# scripted_model = torch.jit.optimize_for_inference(torch.jit.script(model.eval())) -# dummy_data = torch.randn(1, 30, 50, device="cpu") -# dummy_data_len, _ = torch.sort(torch.randint(low=10, high=30, size=(1,), device="cpu", dtype=torch.int32), descending=True) -# onnx_export( -# scripted_model, -# (dummy_data, dummy_data_len), -# f=model_filename, -# verbose=True, -# input_names=["data", "data_len"], -# output_names=["classes"], -# dynamic_axes={ -# # dict value: manually named axes -# "data": {0: "batch", 1: "time"}, -# "data_len": {0: "batch"}, -# "classes": {0: "batch", 1: "time"} -# } -# ) -# - -def export_trace(*, model: Model, model_filename: str): - model.conformer.export_mode = True - dummy_data = 
torch.randn(1, 30, 50, device="cpu") - # dummy_data_len, _ = torch.sort(torch.randint(low=10, high=30, size=(1,), device="cpu", dtype=torch.int32), descending=True) - dummy_data_len = torch.ones((1,))*30 - scripted_model = torch.jit.optimize_for_inference(torch.jit.trace(model.eval(), example_inputs=(dummy_data, dummy_data_len))) - onnx_export( - scripted_model, - (dummy_data, dummy_data_len), - f=model_filename, - verbose=True, - input_names=["data", "data_len"], - output_names=["classes"], - opset_version=14, - dynamic_axes={ - # dict value: manually named axes - "data": {0: "batch", 1: "time"}, - "data_len": {0: "batch"}, - "classes": {0: "batch", 1: "time"} - } - ) - - diff --git a/users/rilling/experiments/librispeech/librispeech_100_hybrid/rc_networks/__init__.py b/users/rilling/experiments/librispeech/librispeech_100_hybrid/rc_networks/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/users/rilling/experiments/librispeech/librispeech_100_hybrid/rc_networks/debug.py b/users/rilling/experiments/librispeech/librispeech_100_hybrid/rc_networks/debug.py deleted file mode 100644 index 06f22404b..000000000 --- a/users/rilling/experiments/librispeech/librispeech_100_hybrid/rc_networks/debug.py +++ /dev/null @@ -1,142 +0,0 @@ -from .default_hybrid import construct_hybrid_network, BLSTMEncoder -from returnn_common import nn -from returnn_common.asr.specaugment import random_mask_v2 -from returnn_common.nn.encoder import ISeqFramewiseEncoder - -from returnn.datasets import init_dataset -from returnn.config import Config -from returnn.tf.engine import Engine - -import better_exchook -better_exchook.install() - - -def specaugment_v2(x: nn.Tensor, *, - spatial_dim: nn.Dim, - feature_dim: nn.Dim = nn.NotSpecified, - global_train_step_dependent: bool = True, - only_on_train: bool = True, - ) -> nn.Tensor: - """ - SpecAugment reimplementation of :func:`specaugment_v1` - """ - if feature_dim is nn.NotSpecified: - assert x.feature_dim - feature_dim = 
x.feature_dim - if global_train_step_dependent: - step = nn.global_train_step() - step1 = nn.where(step >= 1000, 1, 0) - step2 = nn.where(step >= 2000, 1, 0) - else: - step1 = step2 = 1 - time_factor = 1 - - #with nn.Cond(nn.train_flag() | (not only_on_train)) as cond: - # x_masked = x - # spatial_len = nn.dim_value(spatial_dim) - # # time mask - # x_masked = random_mask_v2( - # x_masked, mask_axis=spatial_dim, broadcast_axis=feature_dim, - # min_num=nn.minimum(step1 + step2, spatial_len), - # max_num=nn.minimum(nn.maximum(spatial_len // 100, 2) * (1 + step1 + step2 * 2), spatial_len), - # max_dims=20 // time_factor) - # # feature mask - # # x_masked = random_mask_v2( - # # x_masked, mask_axis=feature_dim, broadcast_axis=spatial_dim, - # # min_num=step1 + step2, max_num=2 + step1 + step2 * 2, - # # max_dims=feature_dim.dimension // 5) - # # cond.true = x_masked - # cond.false = x - - spatial_len = nn.dim_value(spatial_dim) - # return cond.result - x_masked = random_mask_v2( - x, mask_axis=spatial_dim, broadcast_axis=feature_dim, - min_num=nn.minimum(step1 + step2, spatial_len), - max_num=nn.minimum(nn.maximum(spatial_len // 100, 2) * (1 + step1 + step2 * 2), spatial_len), - max_dims=20 // time_factor) - return x_masked - - - - - -class BLSTMEncoderMinimal(ISeqFramewiseEncoder): - """ - BLSTM encoder with specaugment - """ - - def __init__(self, label_feature_dim): - super().__init__() - - def __call__(self, source: nn.Tensor, *, spatial_dim: nn.Dim) -> nn.Tensor: - return specaugment_v2(source, spatial_dim=spatial_dim, feature_dim=source.feature_dim) - - -data_time = nn.SpatialDim("data_time", None) -data_feature = nn.FeatureDim("data_feature", 5) -classes_feature = nn.FeatureDim("classes_feature", 12) -data = nn.Data( - name="data", - dim_tags=[nn.batch_dim, data_time, data_feature], - available_for_inference=True, -) -classes = nn.Data( - name="classes", - dim_tags=[nn.batch_dim, data_time], - sparse_dim=classes_feature, - available_for_inference=False, -) - -def 
_config_get_network(epoch: int, **_kwargs) -> dict: - nn.reset_default_root_name_ctx() - net = construct_hybrid_network( - epoch=0, - train=True, - encoder=BLSTMEncoderMinimal, - audio_data=data, - label_data=classes, - ) - return nn.get_returnn_config().get_net_dict_raw_dict(net) - -class DummyNet(nn.Module): - - def __init__(self): - super().__init__() - - def __call__(self, source: nn.Tensor, spatial_dim: nn.Dim): - return random_mask_v2(x=source, mask_axis=spatial_dim, broadcast_axis=source.feature_dim, min_num=1, max_num=2, max_dims=3) - -def _config_get_spec_network(epoch: int) -> dict: - nn.reset_default_root_name_ctx() - net = DummyNet() - data_tensor = nn.get_extern_data(data) - spatial_dim = data.dim_tags[data.time_dim_axis] - out = net( - source=data_tensor, - spatial_dim=spatial_dim - ) - out.mark_as_default_output() - return nn.get_returnn_config().get_net_dict_raw_dict(net) - - -data_args = data.get_kwargs() -data_args.pop("name") -classes_args = classes.get_kwargs() -classes_args.pop("name") - -extern_data = { - "data": data_args, - "classes": classes_args, -} - -config = Config({ - "task": "train", "num_epochs": 1, "start_epoch": 1, - "get_network": _config_get_spec_network, - "extern_data": extern_data, -}) -train_dataset = init_dataset( - {"class": "DummyDataset", "input_dim": 5, "output_dim": 12, "num_seqs": 3}) -engine = Engine(config) -engine.init_train_from_config(config, train_data=train_dataset) -engine.train() \ No newline at end of file diff --git a/users/rilling/experiments/librispeech/librispeech_100_hybrid/rc_networks/debug_random_mask.py b/users/rilling/experiments/librispeech/librispeech_100_hybrid/rc_networks/debug_random_mask.py deleted file mode 100644 index b1d2fd202..000000000 --- a/users/rilling/experiments/librispeech/librispeech_100_hybrid/rc_networks/debug_random_mask.py +++ /dev/null @@ -1,54 +0,0 @@ -from returnn_common import nn -from returnn_common.asr.specaugment import random_mask_v2, specaugment_v2 - -from 
returnn.datasets import init_dataset -from returnn.config import Config -from returnn.tf.engine import Engine - -import better_exchook -better_exchook.install() - -data_time = nn.SpatialDim("data_time", None) -data_feature = nn.FeatureDim("data_feature", 5) -data = nn.Data( - name="data", - dim_tags=[nn.batch_dim, data_time, data_feature], - available_for_inference=True, -) - -class DummyNet(nn.Module): - - def __init__(self): - super().__init__() - - def __call__(self, source: nn.Tensor, spatial_dim: nn.Dim): - return specaugment_v2(x=source, spatial_dim=spatial_dim) - # return random_mask_v2(x=source, mask_axis=spatial_dim, broadcast_axis=source.feature_dim, min_num=1, max_num=2, max_dims=3) - - -def _config_get_spec_network(epoch: int) -> dict: - nn.reset_default_root_name_ctx() - net = DummyNet() - data_tensor = nn.get_extern_data(data) - spatial_dim = data.dim_tags[data.time_dim_axis] - out = net(source=data_tensor, spatial_dim=spatial_dim) - out.mark_as_default_output() - config_code = nn.get_returnn_config().get_complete_py_code_str(net) - print(config_code) # I will also provide this as gist - return nn.get_returnn_config().get_net_dict_raw_dict(net) - -data_args = data.get_kwargs() -data_args.pop("name") -extern_data = {"data": data_args,} - -config = Config({ - "task": "train", "num_epochs": 1, "start_epoch": 1, - "get_network": _config_get_spec_network, - "extern_data": extern_data, - "behavior_version": 12 -}) -train_dataset = init_dataset( - {"class": "DummyDataset", "input_dim": 5, "output_dim": 12, "num_seqs": 3}) -engine = Engine(config) -engine.init_train_from_config(config, train_data=train_dataset) -engine.train() diff --git a/users/rilling/experiments/librispeech/librispeech_100_hybrid/rc_networks/default_hybrid.py b/users/rilling/experiments/librispeech/librispeech_100_hybrid/rc_networks/default_hybrid.py deleted file mode 100644 index 1b718644b..000000000 --- 
a/users/rilling/experiments/librispeech/librispeech_100_hybrid/rc_networks/default_hybrid.py +++ /dev/null @@ -1,162 +0,0 @@ -from typing import Any, Callable, Optional, Tuple - -from returnn_common import nn -from returnn_common.nn.hybrid_hmm import IHybridHMM, EncoderType, ISeqFramewiseEncoder, ISeqDownsamplingEncoder -from returnn_common.asr.specaugment import specaugment_v2, random_mask_v2 - - -def flexible_specaugment( - x: nn.Tensor, *, - spatial_dim: nn.Dim, - feature_dim: nn.Dim = nn.NotSpecified, - only_on_train: bool = True, - min_frame_masks=2, - mask_each_n_frames=25, - max_frames_per_mask=20, - min_feature_masks=2, - max_feature_masks=5, - max_features_per_mask=8 - ) -> nn.Tensor: - """ - SpecAugment :func:`specaugment_v2` but with adjustable parameters - """ - if feature_dim is nn.NotSpecified: - assert x.feature_dim - feature_dim = x.feature_dim - - with nn.Cond(nn.train_flag() | (not only_on_train)) as cond: - x_masked = x - spatial_len = nn.dim_value(spatial_dim) - # time mask - x_masked = random_mask_v2( - x_masked, mask_axis=spatial_dim, broadcast_axis=feature_dim, - min_num=nn.minimum(min_frame_masks, spatial_len), - max_num=nn.minimum(nn.maximum(spatial_len // mask_each_n_frames, min_frame_masks), spatial_len), - max_dims=max_frames_per_mask) - # feature mask - x_masked = random_mask_v2( - x_masked, mask_axis=feature_dim, broadcast_axis=spatial_dim, - min_num=min_feature_masks, max_num=max_feature_masks, - max_dims=max_features_per_mask) - cond.true = x_masked - cond.false = x - return cond.result - - -class BLSTMLayer(nn.Module): - """ - BLSTM with time broadcasted dropout - """ - def __init__(self, size=512, dropout: float = 0.0): - super().__init__() - self.lstm_dim = nn.FeatureDim(description="BLSTM-out-dim", dimension=size) - self.out_dim = self.lstm_dim * 2 - self.fwd_lstm = nn.LSTM(out_dim=self.lstm_dim) - self.bwd_lstm = nn.LSTM(out_dim=self.lstm_dim) - self.dropout = dropout - - def __call__(self, source: nn.Tensor, time_dim: nn.Dim): 
- fwd, _ = self.fwd_lstm(source, axis=time_dim, direction=1) - bwd, _ = self.bwd_lstm(source, axis=time_dim, direction=-1) - concat = nn.concat((fwd, self.lstm_dim), (bwd, self.lstm_dim)) - if self.dropout > 0.0: - return nn.dropout(concat, self.dropout, axis=[nn.batch_dim, concat.feature_dim]) - else: - return concat - - -class BLSTMEncoder(ISeqFramewiseEncoder): - """ - BLSTM encoder with specaugment - """ - - def __init__(self, label_dim: nn.Dim, num_layers: int, size: int, dropout: float, specaugment_options=None): - super().__init__() - self.specaugment_options = specaugment_options - self.out_dim = label_dim - self.blstm_stack = nn.Sequential( - [ - BLSTMLayer(size=size, dropout=dropout) - for _ in range(num_layers) - ] - ) - - def __call__(self, source: nn.Tensor, *, spatial_dim: nn.Dim) -> nn.Tensor: - if self.specaugment_options: - source = flexible_specaugment( - source, - spatial_dim=spatial_dim, - feature_dim=source.feature_dim, - **self.specaugment_options) - return self.blstm_stack(source, time_dim=spatial_dim) - - -class HybridHMM(IHybridHMM): - """ - Hybrid NN-HMM - """ - - def __init__(self, *, encoder: EncoderType, out_dim: nn.Dim, focal_loss_scale: float = 1.0): - super().__init__() - self.encoder = encoder - self.out_dim = out_dim - self.focal_loss_scale = focal_loss_scale - self.out_projection = nn.Linear(out_dim) - - def __call__(self, source: nn.Tensor, *, - state: Optional[nn.LayerState] = None, - train: bool = False, targets: Optional[nn.Tensor] = None) -> Tuple[nn.Tensor, Optional[nn.LayerState]]: - assert source.data.time_dim_axis is not None - in_spatial_dim = source.data.dim_tags[source.data.time_dim_axis] - assert state is None, f"{self} stateful hybrid HMM not supported yet" - if isinstance(self.encoder, ISeqFramewiseEncoder): - encoder_output = self.encoder(source, spatial_dim=in_spatial_dim) - out_spatial_dim = in_spatial_dim - elif isinstance(self.encoder, ISeqDownsamplingEncoder): - encoder_output, out_spatial_dim = 
self.encoder(source, in_spatial_dim=in_spatial_dim) - else: - raise TypeError(f"unsupported encoder type {type(self.encoder)}") - out_embed = self.out_projection(encoder_output) - if train: - assert out_spatial_dim in targets.shape - ce_loss = nn.sparse_softmax_cross_entropy_with_logits(logits=out_embed, targets=targets, axis=self.out_dim) - # focal loss (= more emphasis on "low" scores), might not be correct yet - if self.focal_loss_scale != 1.0: - ce_loss *= (1.0 - nn.exp(-ce_loss)) ** self.focal_loss_scale - ce_loss.mark_as_loss(name="default_ce") - return nn.log_softmax(out_embed, axis=self.out_dim), None - - -def construct_hybrid_network( - epoch: int, - train: bool, - encoder: Callable[[nn.Dim, Any], EncoderType], - audio_data: nn.Data, - label_data: nn.Data, - **kwargs -): - """ - :param epoch: - :param train: - :param encoder: - :param audio_data: - :param label_data: - :param kwargs: encoder kwargs - :return: - """ - label_feature_dim = label_data.sparse_dim - focal_loss_scale = kwargs.pop("focal_loss_scale", 1.0) - enc = encoder(label_feature_dim, **kwargs) - net = HybridHMM( - encoder=enc, - out_dim=label_feature_dim, - focal_loss_scale=focal_loss_scale - ) - out, _ = net( - source=nn.get_extern_data(audio_data), - train=train, - targets=nn.get_extern_data(label_data) - ) - out.mark_as_default_output() - - return net diff --git a/users/rilling/experiments/librispeech/librispeech_100_hybrid/rc_networks/default_hybrid_v2.py b/users/rilling/experiments/librispeech/librispeech_100_hybrid/rc_networks/default_hybrid_v2.py deleted file mode 100644 index a4fd55263..000000000 --- a/users/rilling/experiments/librispeech/librispeech_100_hybrid/rc_networks/default_hybrid_v2.py +++ /dev/null @@ -1,167 +0,0 @@ -""" -Like the original, but with the updated syntax -""" -from typing import Any, Callable, Optional, Tuple - -from returnn_common import nn -from returnn_common.nn.hybrid_hmm import IHybridHMM, EncoderType, ISeqFramewiseEncoder, ISeqDownsamplingEncoder -from 
returnn_common.asr.specaugment import specaugment_v2, random_mask_v2 - - -def flexible_specaugment( - x: nn.Tensor, *, - spatial_dim: nn.Dim, - feature_dim: nn.Dim = nn.NotSpecified, - only_on_train: bool = True, - min_frame_masks=2, - mask_each_n_frames=25, - max_frames_per_mask=20, - min_feature_masks=2, - max_feature_masks=5, - max_features_per_mask=8 - ) -> nn.Tensor: - """ - SpecAugment :func:`specaugment_v2` but with adjustable parameters - """ - if feature_dim is nn.NotSpecified: - assert x.feature_dim - feature_dim = x.feature_dim - - with nn.Cond(nn.train_flag() | (not only_on_train)) as cond: - x_masked = x - spatial_len = nn.dim_value(spatial_dim) - # time mask - x_masked = random_mask_v2( - x_masked, mask_axis=spatial_dim, broadcast_axis=feature_dim, - min_num=nn.minimum(min_frame_masks, spatial_len), - max_num=nn.minimum(nn.maximum(spatial_len // mask_each_n_frames, min_frame_masks), spatial_len), - max_dims=max_frames_per_mask) - # feature mask - x_masked = random_mask_v2( - x_masked, mask_axis=feature_dim, broadcast_axis=spatial_dim, - min_num=min_feature_masks, max_num=max_feature_masks, - max_dims=max_features_per_mask) - cond.true = x_masked - cond.false = x - return cond.result - - -class BLSTMLayer(nn.Module): - """ - BLSTM with time broadcasted dropout - """ - def __init__(self, in_dim: nn.Dim, size=512, dropout: float = 0.0): - super().__init__() - self.lstm_dim = nn.FeatureDim(description="BLSTM-out-dim", dimension=size) - self.out_dim = 2*self.lstm_dim - self.fwd_lstm = nn.LSTM(in_dim=in_dim, out_dim=self.lstm_dim) - self.bwd_lstm = nn.LSTM(in_dim=in_dim, out_dim=self.lstm_dim) - self.dropout = dropout - - def __call__(self, source: nn.Tensor, time_dim: nn.Dim): - fwd, _ = self.fwd_lstm(source, spatial_dim=time_dim, direction=1) - bwd, _ = self.bwd_lstm(source, spatial_dim=time_dim, direction=-1) - concat, _ = nn.concat((fwd, self.lstm_dim), (bwd, self.lstm_dim)) - if self.dropout > 0.0: - return nn.dropout(concat, self.dropout, 
axis=[nn.batch_dim, concat.feature_dim]) - else: - return concat - - -class BLSTMEncoder(ISeqFramewiseEncoder): - """ - BLSTM encoder with specaugment - """ - - def __init__(self, in_dim: nn.Dim, num_layers: int, size: int, dropout: float, specaugment_options=None): - super().__init__() - self.specaugment_options = specaugment_options - - running_dim = in_dim - blstm_layers = [] - for _ in range(num_layers): - blstm_layer = BLSTMLayer(in_dim=running_dim, size=size, dropout=dropout) - blstm_layers.append(blstm_layer) - running_dim = blstm_layer.out_dim - - self.out_dim = running_dim - self.blstm_stack = nn.Sequential(blstm_layers) - - def __call__(self, source: nn.Tensor, *, spatial_dim: nn.Dim) -> nn.Tensor: - if self.specaugment_options: - source = flexible_specaugment( - source, - spatial_dim=spatial_dim, - feature_dim=source.feature_dim, - **self.specaugment_options) - return self.blstm_stack(source, time_dim=spatial_dim) - - -class HybridHMM(IHybridHMM): - """ - Hybrid NN-HMM - """ - - def __init__(self, *, encoder: EncoderType, out_dim: nn.Dim, focal_loss_scale: float = 0.0): - super().__init__() - self.encoder = encoder - self.focal_loss_scale = focal_loss_scale - self.out_dim = out_dim - self.out_projection = nn.Linear(in_dim=self.encoder.out_dim, out_dim=out_dim) - - def __call__(self, source: nn.Tensor, *, - state: Optional[nn.LayerState] = None, - train: bool = False, targets: Optional[nn.Tensor] = None) -> Tuple[nn.Tensor, Optional[nn.LayerState]]: - assert source.data.time_dim_axis is not None - in_spatial_dim = source.data.dim_tags[source.data.time_dim_axis] - assert state is None, f"{self} stateful hybrid HMM not supported yet" - if isinstance(self.encoder, ISeqFramewiseEncoder): - encoder_output = self.encoder(source, spatial_dim=in_spatial_dim) - out_spatial_dim = in_spatial_dim - elif isinstance(self.encoder, ISeqDownsamplingEncoder): - encoder_output, out_spatial_dim = self.encoder(source, in_spatial_dim=in_spatial_dim) - else: - raise 
TypeError(f"unsupported encoder type {type(self.encoder)}") - out_embed = self.out_projection(encoder_output) - if train: - assert out_spatial_dim in targets.shape - ce_loss = nn.sparse_softmax_cross_entropy_with_logits(logits=out_embed, targets=targets, axis=self.out_dim) - # focal loss (= more emphasis on "low" scores), might not be correct yet - if self.focal_loss_scale > 0.0: - ce_loss *= (1.0 - nn.exp(-ce_loss)) ** self.focal_loss_scale - ce_loss.mark_as_loss(name="default_ce") - return nn.log_softmax(out_embed, axis=self.out_dim), None - - -def construct_hybrid_network( - epoch: int, - train: bool, - audio_data: nn.Data, - label_data: nn.Data, - **kwargs -): - """ - :param epoch: - :param train: - :param encoder: - :param audio_data: - :param label_data: - :param kwargs: encoder kwargs - :return: - """ - label_feature_dim = label_data.sparse_dim - focal_loss_scale = kwargs.pop("focal_loss_scale", 0.0) - enc = BLSTMEncoder(in_dim=audio_data.feature_dim_or_sparse_dim, **kwargs) - net = HybridHMM( - encoder=enc, - out_dim=label_feature_dim, - focal_loss_scale=focal_loss_scale - ) - out, _ = net( - source=nn.get_extern_data(audio_data), - train=train, - targets=nn.get_extern_data(label_data) - ) - out.mark_as_default_output() - - return net diff --git a/users/rilling/experiments/librispeech/librispeech_100_hybrid/specaugment_clean_legacy.py b/users/rilling/experiments/librispeech/librispeech_100_hybrid/specaugment_clean_legacy.py deleted file mode 100644 index 28bf33463..000000000 --- a/users/rilling/experiments/librispeech/librispeech_100_hybrid/specaugment_clean_legacy.py +++ /dev/null @@ -1,141 +0,0 @@ -from dataclasses import dataclass - - -def _mask(x, batch_axis, axis, pos, max_amount): - """ - :param tf.Tensor x: (batch,time,feature) - :param int batch_axis: - :param int axis: - :param tf.Tensor pos: (batch,) - :param int|tf.Tensor max_amount: inclusive - """ - from returnn.tf.compat import v1 as tf - ndim = x.get_shape().ndims - n_batch = 
tf.shape(x)[batch_axis] - dim = tf.shape(x)[axis] - amount = tf.random_uniform(shape=(n_batch,), minval=1, maxval=max_amount + 1, dtype=tf.int32) - pos2 = tf.minimum(pos + amount, dim) - idxs = tf.expand_dims(tf.range(0, dim), 0) # (1,dim) - pos_bc = tf.expand_dims(pos, 1) # (batch,1) - pos2_bc = tf.expand_dims(pos2, 1) # (batch,1) - cond = tf.logical_and(tf.greater_equal(idxs, pos_bc), tf.less(idxs, pos2_bc)) # (batch,dim) - if batch_axis > axis: - cond = tf.transpose(cond) # (dim,batch) - cond = tf.reshape(cond, [tf.shape(x)[i] if i in (batch_axis, axis) else 1 for i in range(ndim)]) - from TFUtil import where_bc - x = where_bc(cond, 0.0, x) - return x - - -def _random_mask(x, batch_axis, axis, min_num, max_num, max_dims): - """ - :param tf.Tensor x: (batch,time,feature) - :param int batch_axis: - :param int axis: - :param int|tf.Tensor min_num: - :param int|tf.Tensor max_num: inclusive - :param int|tf.Tensor max_dims: inclusive - """ - from returnn.tf.compat import v1 as tf - n_batch = tf.shape(x)[batch_axis] - if isinstance(min_num, int) and isinstance(max_num, int) and min_num == max_num: - num = min_num - else: - num = tf.random_uniform(shape=(n_batch,), minval=min_num, maxval=max_num + 1, dtype=tf.int32) - # https://github.com/tensorflow/tensorflow/issues/9260 - # https://timvieira.github.io/blog/post/2014/08/01/gumbel-max-trick-and-weighted-reservoir-sampling/ - z = -tf.log(-tf.log(tf.random_uniform((n_batch, tf.shape(x)[axis]), 0, 1))) - _, indices = tf.nn.top_k(z, num if isinstance(num, int) else tf.reduce_max(num)) - # indices should be sorted, and of shape (batch,num), entries (int32) in [0,dim) - # indices = tf.Print(indices, ["indices", indices, tf.shape(indices)]) - if isinstance(num, int): - for i in range(num): - x = _mask(x, batch_axis=batch_axis, axis=axis, pos=indices[:, i], max_amount=max_dims) - else: - _, x = tf.while_loop( - cond=lambda i, _: tf.less(i, tf.reduce_max(num)), - body=lambda i, x: ( - i + 1, - tf.where( - tf.less(i, num), - 
_mask(x, batch_axis=batch_axis, axis=axis, pos=indices[:, i], max_amount=max_dims), - x)), - loop_vars=(0, x)) - return x - - -def specaugment_eval_func(data, network, - min_frame_masks=2, mask_each_n_frames=25, max_frames_per_mask=20, - min_feature_masks=2, max_feature_masks=5, max_features_per_mask=8): - x = data.placeholder - from returnn.tf.compat import v1 as tf - def get_masked(): - x_masked = x - x_masked = _random_mask( - x_masked, batch_axis=data.batch_dim_axis, axis=data.time_dim_axis, - min_num=min_frame_masks, - max_num=tf.maximum(tf.shape(x)[data.time_dim_axis] // mask_each_n_frames, min_frame_masks), - max_dims=max_frames_per_mask) - x_masked = _random_mask( - x_masked, batch_axis=data.batch_dim_axis, axis=data.feature_dim_axis, - min_num=min_feature_masks, max_num=max_feature_masks, - max_dims=max_features_per_mask) - return x_masked - x = network.cond_on_train(get_masked, lambda: x) - return x - - -@dataclass(eq=False, frozen=True) -class SpecAugmentSettings: - """ - default hybrid settings from chris - """ - min_frame_masks: int = 1 - mask_each_n_frames: int = 100 - max_frames_per_mask: int = 20 - min_feature_masks: int = 1 - max_feature_masks: int = 2 - max_features_per_mask: int = 10 - - def get_options(self): - return self.__dict__ - - -def specaug_layer(in_layer, min_frame_masks=1, mask_each_n_frames=100, max_frames_per_mask=20, - min_feature_masks=1, max_feature_masks=2, max_features_per_mask=10): - """ - specaug layer with default hybrid settings - - :param in_layer: - :param min_frame_masks: - :param mask_each_n_frames: - :param max_frames_per_mask: - :param min_feature_masks: - :param max_feature_masks: - :param max_features_per_mask: - :return: - """ - return { - "class": "eval", - "from": in_layer, - "eval":"self.network.get_config().typed_value('specaugment_eval_func')(" - "source(0, as_data=True), " - "network=self.network, " - "min_frame_masks=%i, " - "mask_each_n_frames=%i, " - "max_frames_per_mask=%i, " - "min_feature_masks=%i, " - 
"max_feature_masks=%i, " - "max_features_per_mask=%i)" % ( - min_frame_masks, mask_each_n_frames, max_frames_per_mask, - min_feature_masks, max_feature_masks, max_features_per_mask - ), - } - - -def get_funcs(): - funcs = [] - for k, v in list(globals().items()): - if k in ["_mask", "_random_mask", "specaugment_eval_func"]: - funcs.append(v) - return funcs \ No newline at end of file diff --git a/users/rilling/experiments/librispeech/librispeech_conformer_cb_joint/experiments.py b/users/rilling/experiments/librispeech/librispeech_conformer_cb_joint/experiments.py index 3504f73c8..30ce07a10 100644 --- a/users/rilling/experiments/librispeech/librispeech_conformer_cb_joint/experiments.py +++ b/users/rilling/experiments/librispeech/librispeech_conformer_cb_joint/experiments.py @@ -44,12 +44,11 @@ def get_conformer_coupling_glow(x_vector_exp, gl_checkpoint): - """ - Baseline for the glow TTS in returnn_common with serialization - - Uses updated RETURNN_COMMON + """Different experiments similar to the experiments in ../librispeech_glowtts and ../librispeech_joint_training but using Conformers as coupling function instead of wavenet-like CNN - :return: durations_hdf + :param dict x_vector_exp: Dictionary containing experiments from ../librispeech_x_vectors to import pre-trained x-vector models for on-the-fly speaker embedding generation + : param dict gl_checkpoint: Dictionary containing checkpoint and config of BLSTM transforming log-mel into linear spectrograms for G&L vocoding + :return dict: Dictionary containing experiment dictionaries """ prefix = "experiments/librispeech/joint_training/conformer_coupling/raw_audio/" @@ -66,28 +65,43 @@ def run_exp( forward_args={}, search_args={}, keep_epochs=None, - extract_x_vector=False, tts_forward=True, asr_search=True, use_speaker_labels_in_dev=False, - given_train_job_for_forward=None, tts_eval_datasets=None, ): + """Creates jobs for training, TTS forward/evaluation and ASR search + + :param str name: Name of the experiment 
for aliases + :param dict args: General arguments for training, forwarding and search + :param TrainingDataset dataset: Dataset for training and TTS forwarding without evaluation + :param dict test_dataset: Dictionary of datasets for ASR evaluation + :param int num_epochs: Number of epochs for training, defaults to 100 + :param bool use_custom_engine: whether custom engine should be used in Returnn, defaults to False + :param dict training_args: Additional arguments for training passed to train_step function, defaults to {} + :param dict forward_args: Additional arguments for TTS forward passed to forward_step, defaults to {} + :param dict search_args: Additional arguments for ASR search passed to search_step_init, defaults to {} + :param list[int] keep_epochs: List of checkpoints that should be kept during training, defaults to None + :param bool tts_forward: whether TTS should be evaluated, defaults to True + :param bool asr_search: whether ASR search should be performed and evaluated, defaults to True + :param bool use_speaker_labels_in_dev: whether validation set for training should contain speaker labels (uses devtrain split instead of dev-other/-clean for validation), defaults to False + :param dict tts_eval_datasets: Dictionary of datasets used for TTS evaluation, defaults to None + :return dict: Dictionary containing all jobs of the experiment + """ assert not tts_forward or ( "x_vector" not in name or tts_eval_datasets is not None ), "Attempting to evaluate a model with x-vector speaker embeddings, but missing explicit forward dataset with precalculated x-vector speaker embeddings." 
exp = {} - if given_train_job_for_forward is None: - training_config = get_training_config( - training_datasets=dataset, - **args, - training_args=training_args, - use_custom_engine=use_custom_engine, - keep_epochs=keep_epochs, - use_speaker_labels_in_dev=use_speaker_labels_in_dev, - ) # implicit reconstruction loss + training_config = get_training_config( + training_datasets=dataset, + **args, + training_args=training_args, + use_custom_engine=use_custom_engine, + keep_epochs=keep_epochs, + use_speaker_labels_in_dev=use_speaker_labels_in_dev, + ) if tts_forward: forward_config = get_forward_config( @@ -102,16 +116,13 @@ def run_exp( search_args=search_args, ) - if given_train_job_for_forward is None: - train_job = training( - config=training_config, - returnn_exe=RETURNN_PYTORCH_EXE, - returnn_root=MINI_RETURNN_ROOT, - prefix=prefix + name, - num_epochs=num_epochs, - ) - else: - train_job = given_train_job_for_forward + train_job = training( + config=training_config, + returnn_exe=RETURNN_PYTORCH_EXE, + returnn_root=MINI_RETURNN_ROOT, + prefix=prefix + name, + num_epochs=num_epochs, + ) exp["train_job"] = train_job if tts_forward: @@ -139,19 +150,6 @@ def run_exp( swer_eval_corpus_key=ds_key ) - if extract_x_vector: - forward_x_vector_config = get_forward_config( - forward_dataset=dataset, **args, forward_args=forward_args, target="xvector", train_data=True - ) - forward_xvector_job = forward( - checkpoint=train_job.out_checkpoints[num_epochs], - config=forward_x_vector_config, - returnn_exe=RETURNN_PYTORCH_EXE, - returnn_root=MINI_RETURNN_ROOT, - prefix=prefix + name, - target="xvector", - ) - exp["forward_xvector_job"] = forward_xvector_job if asr_search: search( prefix + name + "/search", diff --git a/users/rilling/experiments/librispeech/librispeech_glow_asr/experiments.py b/users/rilling/experiments/librispeech/librispeech_glow_asr/experiments.py index de7fb6b1b..90c4d0fbd 100644 --- a/users/rilling/experiments/librispeech/librispeech_glow_asr/experiments.py 
+++ b/users/rilling/experiments/librispeech/librispeech_glow_asr/experiments.py @@ -33,6 +33,10 @@ def glowASR(TTS_experiments: dict): + """Contains the ASR-only experiments using both frozen Glow-TTS decoder and unfrozen coupling blocks as a frontend for BLSTMs or Conformers trained on ASR + + :param dict TTS_experiments: Dictionary containing the TTS-only experiments from ../librispeech_glowtts to import Glow-TTS decoder parameters + """ prefix_name = "experiments/librispeech/librispeech_glow_asr/pytorch/" train_settings = TrainingDatasetSettings( @@ -137,6 +141,21 @@ def run_exp( only_forward_no_search=False, eval_invertibility=False, ): + """Creates the jobs for training and search of the experiment + + :param str ft_name: Name of the experiment to be used for aliases + :param TrainingDataset datasets: Dataset to be used for training + :param dict train_args: Dictionary containing arguments for training + :param dict search_args: Dictionary containing arguments for search used in search init step, defaults to None + :param bool with_prior: whether the prior on the test dataset should be evaluated for internal language model correction (if set to False will default to True if search_args["prior_scale"]!= 0), defaults to False + :param int num_epochs: Number of epochs in training, defaults to 100 + :param int extra_evaluate_epoch: Number of checkpoint that should be additionally evaluated to evaluate WER during training, defaults to None + :param dict test_datasets: Dictionary of datasets to be used for evaluation, keys are used for aliases, defaults to dev_dataset_tuples + :param bool large_gpu_training: whether training should require 24GB of memory, defaults to False + :param bool only_forward_no_search: whether ASR search / evaluation should be skipped, defaults to False + :param bool eval_invertibility: whether coupling blocks should be evaluated for invertibility, defaults to False + :return dict: Dictionary containing all the jobs of the experiment + """ 
search_args = copy.deepcopy(search_args) if search_args is not None else {} with_prior = with_prior or ("prior_scale" in search_args and search_args["prior_scale"] != 0) diff --git a/users/rilling/experiments/librispeech/librispeech_glowtts/glowTTS/feature_config.py b/users/rilling/experiments/librispeech/librispeech_glowtts/feature_config.py similarity index 100% rename from users/rilling/experiments/librispeech/librispeech_glowtts/glowTTS/feature_config.py rename to users/rilling/experiments/librispeech/librispeech_glowtts/feature_config.py diff --git a/users/rilling/experiments/librispeech/librispeech_glowtts/glowTTS/experiments.py b/users/rilling/experiments/librispeech/librispeech_glowtts/glowTTS/experiments.py index 84b577d6a..56164bf9f 100644 --- a/users/rilling/experiments/librispeech/librispeech_glowtts/glowTTS/experiments.py +++ b/users/rilling/experiments/librispeech/librispeech_glowtts/glowTTS/experiments.py @@ -21,13 +21,13 @@ def get_pytorch_glowTTS(x_vector_exp: dict, gl_checkpoint: dict): - """ - Baseline for the glow TTS in returnn_common with serialization + """Experiments training Glow-TTS on TTS-only. 
Most experiments have the bug, that the decoder dropout is set using the wrong keyword, which is why the value of 0.05 is overwritten by the default of 0.0 + The TTS-only experiments in ../librispeech_joing_training_given_alignment/exp_TTS fixed that and are therefore more current - Uses updated RETURNN_COMMON - - :return: durations_hdf - """ + :param dict x_vector_exp: Dictionary containing x-vector experiments from ../librispeech_x_vectors to import x-vector model for on-the-fly speaker embedding generation + :param dict gl_checkpoint: Dictionary containing checkpoint and config of a BLSTM network to transform log-mel into linear for G&L vocoding + :return dict: Dictionary containing the experiment dictionaries to import attributes of experiment jobs in other experiment folders (should be done using storage) + """ prefix = "experiments/librispeech/tts_architecture/glow_tts/raw_audio/" experiments = {} @@ -53,6 +53,29 @@ def run_exp( tts_eval_datasets=None, forward_device="gpu", ): + """Creates the jobs for training, forwarding and evaluation of the experiment + + :param str name: Name of the experiment to be used in aliases + :param dict args: General arguments to be used in Returnn configs in training and forward + :param TrainingDataset dataset: Dataset used for training and TTS forwarding (not evaluation) + :param int num_epochs: Number of epochs in training, defaults to 100 + :param bool use_custom_engine: whether a custom engine is used in Returnn, defaults to False + :param Optional[int] extra_evaluate_epoch: if an epoch is given the forward run is also performed using that epoch's checkpoint, defaults to None + :param dict forward_args: Additional arguments for the forward step, defaults to {} + :param bool further_training: whether an additional training should be run after the first num_epochs epochs for again num_epochs, defaults to False + :param bool spectrogram_foward: whether an additional forward run should produce spectrograms, defaults to False + 
:param bool durations_forward: whether an additional forward run should compute the phoneme durations / alignments, defaults to False + :param bool latent_space_forward: whether an additional forward run should output the latent space into an HDF, defaults to False + :param bool joint_data_forward: whether the forwarding/generation should be run on the full data (train and val. split), defaults to False + :param bool train_data_forward: whether the forwarding/generation should be run on the train split of the dataset, defaults to False + :param bool joint_durations_forward: whether the phoneme durations extraction should be run on the full dataset (train + val. split), defaults to False + :param list[int] keep_epochs: List of checkpoints that should be kept during training (not cleaned up), defaults to None + :param bool skip_forward: whether forwarding/generation should be skipped, defaults to False + :param bool nisqa_evaluation: whether the autoMOS should be evaluated using NISQA, defaults to False + :param dict tts_eval_datasets: Dictionary containing the datasets for TTS evaluation (autoMOS + sWER), defaults to None + :param str forward_device: define whether the forward steps are run on "gpu" or "cpu", defaults to "gpu" + :return dict: Dictionary containing the jobs of the experiment + """ exp = {} assert not nisqa_evaluation or (nisqa_evaluation and not skip_forward), "NISQA evaluation with skipping forward jobs is not possible" @@ -154,7 +177,7 @@ def run_exp( # vocoder="univnet" # ) # exp["forward_job_univnet"] = forward_job_univnet - + for ds_k, ds in tts_eval_datasets.items(): forward_config_gl = get_forward_config( returnn_common_root=RETURNN_COMMON, @@ -339,7 +362,7 @@ def get_lr_scale(dim_model, step_num, warmup_steps): ) from .data import get_tts_log_mel_datastream - from .feature_config import DbMelFeatureExtractionConfig + from ..feature_config import DbMelFeatureExtractionConfig from 
i6_experiments.users.rossenbach.common_setups.returnn.datastreams.audio import DBMelFilterbankOptions log_mel_datastream = get_tts_log_mel_datastream(silence_preprocessing=False) diff --git a/users/rilling/experiments/librispeech/librispeech_glowtts/glowTTS/gt_extraction.py b/users/rilling/experiments/librispeech/librispeech_glowtts/glowTTS/gt_extraction.py index df81f04c2..d63eda851 100644 --- a/users/rilling/experiments/librispeech/librispeech_glowtts/glowTTS/gt_extraction.py +++ b/users/rilling/experiments/librispeech/librispeech_glowtts/glowTTS/gt_extraction.py @@ -66,7 +66,7 @@ def run_exp(name, args, dataset): training_datasets_silence_preprocessed = build_training_dataset(settings=train_settings, librispeech_key="train-clean-100", silence_preprocessing=True) from .data import get_tts_log_mel_datastream - from .feature_config import DbMelFeatureExtractionConfig + from ..feature_config import DbMelFeatureExtractionConfig from i6_experiments.users.rossenbach.common_setups.returnn.datastreams.audio import DBMelFilterbankOptions log_mel_datastream = get_tts_log_mel_datastream(silence_preprocessing=False) diff --git a/users/rilling/experiments/librispeech/librispeech_glowtts/glowTTS/vocoder/__init__.py b/users/rilling/experiments/librispeech/librispeech_glowtts/glowTTS/vocoder/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/users/rilling/experiments/librispeech/librispeech_glowtts/pytorch_networks/feature_extraction.py b/users/rilling/experiments/librispeech/librispeech_glowtts/pytorch_networks/feature_extraction.py index e4fefba90..283631dbe 100644 --- a/users/rilling/experiments/librispeech/librispeech_glowtts/pytorch_networks/feature_extraction.py +++ b/users/rilling/experiments/librispeech/librispeech_glowtts/pytorch_networks/feature_extraction.py @@ -4,8 +4,7 @@ from librosa import filters -from ..glowTTS.feature_config import DbMelFeatureExtractionConfig - +from ..feature_config import DbMelFeatureExtractionConfig class 
DbMelFeatureExtraction(nn.Module): diff --git a/users/rilling/experiments/librispeech/librispeech_glowtts/pytorch_networks/glowTTS.py b/users/rilling/experiments/librispeech/librispeech_glowtts/pytorch_networks/glowTTS.py index f68e79991..05d00a75d 100644 --- a/users/rilling/experiments/librispeech/librispeech_glowtts/pytorch_networks/glowTTS.py +++ b/users/rilling/experiments/librispeech/librispeech_glowtts/pytorch_networks/glowTTS.py @@ -16,7 +16,7 @@ from .monotonic_align import maximum_path from .feature_extraction import DbMelFeatureExtraction -from ..glowTTS.feature_config import DbMelFeatureExtractionConfig +from ..feature_config import DbMelFeatureExtractionConfig from .eval_forward import * diff --git a/users/rilling/experiments/librispeech/librispeech_glowtts/pytorch_networks/glowTTS_ddi_actnorm.py b/users/rilling/experiments/librispeech/librispeech_glowtts/pytorch_networks/glowTTS_ddi_actnorm.py index 17e1d9e9a..bcf11bd06 100644 --- a/users/rilling/experiments/librispeech/librispeech_glowtts/pytorch_networks/glowTTS_ddi_actnorm.py +++ b/users/rilling/experiments/librispeech/librispeech_glowtts/pytorch_networks/glowTTS_ddi_actnorm.py @@ -16,7 +16,7 @@ from .monotonic_align import maximum_path from .feature_extraction import DbMelFeatureExtraction -from ..glowTTS.feature_config import DbMelFeatureExtractionConfig +from ..feature_config import DbMelFeatureExtractionConfig from .eval_forward import * diff --git a/users/rilling/experiments/librispeech/librispeech_glowtts/pytorch_networks/glowTTS_decoder_test_blstm.py b/users/rilling/experiments/librispeech/librispeech_glowtts/pytorch_networks/glowTTS_decoder_test_blstm.py index cbb99c159..9a3618ccf 100644 --- a/users/rilling/experiments/librispeech/librispeech_glowtts/pytorch_networks/glowTTS_decoder_test_blstm.py +++ b/users/rilling/experiments/librispeech/librispeech_glowtts/pytorch_networks/glowTTS_decoder_test_blstm.py @@ -16,7 +16,7 @@ from .monotonic_align import maximum_path from .feature_extraction 
import DbMelFeatureExtraction -from ..glowTTS.feature_config import DbMelFeatureExtractionConfig +from ..feature_config import DbMelFeatureExtractionConfig from i6_models.parts.blstm import BlstmEncoderV1, BlstmEncoderV1Config diff --git a/users/rilling/experiments/librispeech/librispeech_glowtts/pytorch_networks/glowTTS_decoder_test_multi_layer_ffn.py b/users/rilling/experiments/librispeech/librispeech_glowtts/pytorch_networks/glowTTS_decoder_test_multi_layer_ffn.py index ae813a8e7..db30fb87b 100644 --- a/users/rilling/experiments/librispeech/librispeech_glowtts/pytorch_networks/glowTTS_decoder_test_multi_layer_ffn.py +++ b/users/rilling/experiments/librispeech/librispeech_glowtts/pytorch_networks/glowTTS_decoder_test_multi_layer_ffn.py @@ -16,7 +16,7 @@ from .monotonic_align import maximum_path from .feature_extraction import DbMelFeatureExtraction -from ..glowTTS.feature_config import DbMelFeatureExtractionConfig +from ..feature_config import DbMelFeatureExtractionConfig class DurationPredictor(nn.Module): diff --git a/users/rilling/experiments/librispeech/librispeech_glowtts/pytorch_networks/glowTTS_decoder_test_simple_linear.py b/users/rilling/experiments/librispeech/librispeech_glowtts/pytorch_networks/glowTTS_decoder_test_simple_linear.py index 1167aab99..7ead486c4 100644 --- a/users/rilling/experiments/librispeech/librispeech_glowtts/pytorch_networks/glowTTS_decoder_test_simple_linear.py +++ b/users/rilling/experiments/librispeech/librispeech_glowtts/pytorch_networks/glowTTS_decoder_test_simple_linear.py @@ -16,7 +16,7 @@ from .monotonic_align import maximum_path from .feature_extraction import DbMelFeatureExtraction -from ..glowTTS.feature_config import DbMelFeatureExtractionConfig +from ..feature_config import DbMelFeatureExtractionConfig class DurationPredictor(nn.Module): diff --git a/users/rilling/experiments/librispeech/librispeech_glowtts/pytorch_networks/glowTTS_encoder_sample_test_blstm.py 
b/users/rilling/experiments/librispeech/librispeech_glowtts/pytorch_networks/glowTTS_encoder_sample_test_blstm.py index 90bc2b906..bd8f2a551 100644 --- a/users/rilling/experiments/librispeech/librispeech_glowtts/pytorch_networks/glowTTS_encoder_sample_test_blstm.py +++ b/users/rilling/experiments/librispeech/librispeech_glowtts/pytorch_networks/glowTTS_encoder_sample_test_blstm.py @@ -16,7 +16,7 @@ from .monotonic_align import maximum_path from .feature_extraction import DbMelFeatureExtraction -from ..glowTTS.feature_config import DbMelFeatureExtractionConfig +from ..feature_config import DbMelFeatureExtractionConfig from i6_models.parts.blstm import BlstmEncoderV1, BlstmEncoderV1Config diff --git a/users/rilling/experiments/librispeech/librispeech_glowtts/pytorch_networks/glowTTS_encoder_sample_test_maxlike_alignment_blstm.py b/users/rilling/experiments/librispeech/librispeech_glowtts/pytorch_networks/glowTTS_encoder_sample_test_maxlike_alignment_blstm.py index 5db783e86..fe3d788ef 100644 --- a/users/rilling/experiments/librispeech/librispeech_glowtts/pytorch_networks/glowTTS_encoder_sample_test_maxlike_alignment_blstm.py +++ b/users/rilling/experiments/librispeech/librispeech_glowtts/pytorch_networks/glowTTS_encoder_sample_test_maxlike_alignment_blstm.py @@ -16,7 +16,7 @@ from .monotonic_align import maximum_path from .feature_extraction import DbMelFeatureExtraction -from ..glowTTS.feature_config import DbMelFeatureExtractionConfig +from ..feature_config import DbMelFeatureExtractionConfig from i6_models.parts.blstm import BlstmEncoderV1, BlstmEncoderV1Config diff --git a/users/rilling/experiments/librispeech/librispeech_glowtts/pytorch_networks/glowTTS_encoder_sample_test_maxlike_alignment_multi_layer_ffn.py b/users/rilling/experiments/librispeech/librispeech_glowtts/pytorch_networks/glowTTS_encoder_sample_test_maxlike_alignment_multi_layer_ffn.py index d46e6a0e9..19b94415f 100644 --- 
a/users/rilling/experiments/librispeech/librispeech_glowtts/pytorch_networks/glowTTS_encoder_sample_test_maxlike_alignment_multi_layer_ffn.py +++ b/users/rilling/experiments/librispeech/librispeech_glowtts/pytorch_networks/glowTTS_encoder_sample_test_maxlike_alignment_multi_layer_ffn.py @@ -16,7 +16,7 @@ from .monotonic_align import maximum_path from .feature_extraction import DbMelFeatureExtraction -from ..glowTTS.feature_config import DbMelFeatureExtractionConfig +from ..feature_config import DbMelFeatureExtractionConfig class DurationPredictor(nn.Module): diff --git a/users/rilling/experiments/librispeech/librispeech_glowtts/pytorch_networks/glowTTS_encoder_sample_test_maxlike_alignment_simple_linear.py b/users/rilling/experiments/librispeech/librispeech_glowtts/pytorch_networks/glowTTS_encoder_sample_test_maxlike_alignment_simple_linear.py index 572791718..1e3b7b311 100644 --- a/users/rilling/experiments/librispeech/librispeech_glowtts/pytorch_networks/glowTTS_encoder_sample_test_maxlike_alignment_simple_linear.py +++ b/users/rilling/experiments/librispeech/librispeech_glowtts/pytorch_networks/glowTTS_encoder_sample_test_maxlike_alignment_simple_linear.py @@ -16,7 +16,7 @@ from .monotonic_align import maximum_path from .feature_extraction import DbMelFeatureExtraction -from ..glowTTS.feature_config import DbMelFeatureExtractionConfig +from ..feature_config import DbMelFeatureExtractionConfig class DurationPredictor(nn.Module): diff --git a/users/rilling/experiments/librispeech/librispeech_glowtts/pytorch_networks/glowTTS_encoder_sample_test_multi_layer_ffn.py b/users/rilling/experiments/librispeech/librispeech_glowtts/pytorch_networks/glowTTS_encoder_sample_test_multi_layer_ffn.py index 5042be9c8..f67c5211e 100644 --- a/users/rilling/experiments/librispeech/librispeech_glowtts/pytorch_networks/glowTTS_encoder_sample_test_multi_layer_ffn.py +++ 
b/users/rilling/experiments/librispeech/librispeech_glowtts/pytorch_networks/glowTTS_encoder_sample_test_multi_layer_ffn.py @@ -16,7 +16,7 @@ from .monotonic_align import maximum_path from .feature_extraction import DbMelFeatureExtraction -from ..glowTTS.feature_config import DbMelFeatureExtractionConfig +from ..feature_config import DbMelFeatureExtractionConfig class DurationPredictor(nn.Module): diff --git a/users/rilling/experiments/librispeech/librispeech_glowtts/pytorch_networks/glowTTS_encoder_sample_test_simple_linear.py b/users/rilling/experiments/librispeech/librispeech_glowtts/pytorch_networks/glowTTS_encoder_sample_test_simple_linear.py index bf0e52169..29f1886a8 100644 --- a/users/rilling/experiments/librispeech/librispeech_glowtts/pytorch_networks/glowTTS_encoder_sample_test_simple_linear.py +++ b/users/rilling/experiments/librispeech/librispeech_glowtts/pytorch_networks/glowTTS_encoder_sample_test_simple_linear.py @@ -16,7 +16,7 @@ from .monotonic_align import maximum_path from .feature_extraction import DbMelFeatureExtraction -from ..glowTTS.feature_config import DbMelFeatureExtractionConfig +from ..feature_config import DbMelFeatureExtractionConfig class DurationPredictor(nn.Module): diff --git a/users/rilling/experiments/librispeech/librispeech_glowtts/pytorch_networks/glowTTS_nar_taco_encoder.py b/users/rilling/experiments/librispeech/librispeech_glowtts/pytorch_networks/glowTTS_nar_taco_encoder.py index a75ebe0ee..23a8ce473 100644 --- a/users/rilling/experiments/librispeech/librispeech_glowtts/pytorch_networks/glowTTS_nar_taco_encoder.py +++ b/users/rilling/experiments/librispeech/librispeech_glowtts/pytorch_networks/glowTTS_nar_taco_encoder.py @@ -17,7 +17,7 @@ from .monotonic_align import maximum_path from .feature_extraction import DbMelFeatureExtraction -from ..glowTTS.feature_config import DbMelFeatureExtractionConfig +from ..feature_config import DbMelFeatureExtractionConfig from .eval_forward import * class Config: diff --git 
a/users/rilling/experiments/librispeech/librispeech_glowtts/pytorch_networks/glowTTS_nar_taco_encoder_no_blstm.py b/users/rilling/experiments/librispeech/librispeech_glowtts/pytorch_networks/glowTTS_nar_taco_encoder_no_blstm.py index 6328749d5..0fbf5aa39 100644 --- a/users/rilling/experiments/librispeech/librispeech_glowtts/pytorch_networks/glowTTS_nar_taco_encoder_no_blstm.py +++ b/users/rilling/experiments/librispeech/librispeech_glowtts/pytorch_networks/glowTTS_nar_taco_encoder_no_blstm.py @@ -17,7 +17,7 @@ from .monotonic_align import maximum_path from .feature_extraction import DbMelFeatureExtraction -from ..glowTTS.feature_config import DbMelFeatureExtractionConfig +from ..feature_config import DbMelFeatureExtractionConfig from .eval_forward import * class Config: diff --git a/users/rilling/experiments/librispeech/librispeech_glowtts/pytorch_networks/glowTTS_one_hot_encoder_mean.py b/users/rilling/experiments/librispeech/librispeech_glowtts/pytorch_networks/glowTTS_one_hot_encoder_mean.py index 38192665c..2c4a5afd7 100644 --- a/users/rilling/experiments/librispeech/librispeech_glowtts/pytorch_networks/glowTTS_one_hot_encoder_mean.py +++ b/users/rilling/experiments/librispeech/librispeech_glowtts/pytorch_networks/glowTTS_one_hot_encoder_mean.py @@ -16,7 +16,7 @@ from .monotonic_align import maximum_path from .feature_extraction import DbMelFeatureExtraction -from ..glowTTS.feature_config import DbMelFeatureExtractionConfig +from ..feature_config import DbMelFeatureExtractionConfig class DurationPredictor(nn.Module): diff --git a/users/rilling/experiments/librispeech/librispeech_glowtts/pytorch_networks/glowTTS_one_hot_encoder_std.py b/users/rilling/experiments/librispeech/librispeech_glowtts/pytorch_networks/glowTTS_one_hot_encoder_std.py index 64d10af1a..ffeb99945 100644 --- a/users/rilling/experiments/librispeech/librispeech_glowtts/pytorch_networks/glowTTS_one_hot_encoder_std.py +++ 
b/users/rilling/experiments/librispeech/librispeech_glowtts/pytorch_networks/glowTTS_one_hot_encoder_std.py @@ -16,7 +16,7 @@ from .monotonic_align import maximum_path from .feature_extraction import DbMelFeatureExtraction -from ..glowTTS.feature_config import DbMelFeatureExtractionConfig +from ..feature_config import DbMelFeatureExtractionConfig class DurationPredictor(nn.Module): diff --git a/users/rilling/experiments/librispeech/librispeech_glowtts/pytorch_networks/glowTTS_simple_encoder.py b/users/rilling/experiments/librispeech/librispeech_glowtts/pytorch_networks/glowTTS_simple_encoder.py index 9fe7baaa9..7f3d982e3 100644 --- a/users/rilling/experiments/librispeech/librispeech_glowtts/pytorch_networks/glowTTS_simple_encoder.py +++ b/users/rilling/experiments/librispeech/librispeech_glowtts/pytorch_networks/glowTTS_simple_encoder.py @@ -16,7 +16,7 @@ from .monotonic_align import maximum_path from .feature_extraction import DbMelFeatureExtraction -from ..glowTTS.feature_config import DbMelFeatureExtractionConfig +from ..feature_config import DbMelFeatureExtractionConfig from .eval_forward import * diff --git a/users/rilling/experiments/librispeech/librispeech_glowtts/pytorch_networks/glowTTS_simple_encoder_test_maxlike_alignment_multi_layer_ffn.py b/users/rilling/experiments/librispeech/librispeech_glowtts/pytorch_networks/glowTTS_simple_encoder_test_maxlike_alignment_multi_layer_ffn.py index 6261dd1a1..a3e3a1451 100644 --- a/users/rilling/experiments/librispeech/librispeech_glowtts/pytorch_networks/glowTTS_simple_encoder_test_maxlike_alignment_multi_layer_ffn.py +++ b/users/rilling/experiments/librispeech/librispeech_glowtts/pytorch_networks/glowTTS_simple_encoder_test_maxlike_alignment_multi_layer_ffn.py @@ -16,7 +16,7 @@ from .monotonic_align import maximum_path from .feature_extraction import DbMelFeatureExtraction -from ..glowTTS.feature_config import DbMelFeatureExtractionConfig +from ..feature_config import DbMelFeatureExtractionConfig from .eval_forward 
import * diff --git a/users/rilling/experiments/librispeech/librispeech_glowtts/pytorch_networks/glowTTS_simple_encoder_test_maxlike_alignment_multi_layer_ffn_v2.py b/users/rilling/experiments/librispeech/librispeech_glowtts/pytorch_networks/glowTTS_simple_encoder_test_maxlike_alignment_multi_layer_ffn_v2.py index cdfeb90ee..8321f2187 100644 --- a/users/rilling/experiments/librispeech/librispeech_glowtts/pytorch_networks/glowTTS_simple_encoder_test_maxlike_alignment_multi_layer_ffn_v2.py +++ b/users/rilling/experiments/librispeech/librispeech_glowtts/pytorch_networks/glowTTS_simple_encoder_test_maxlike_alignment_multi_layer_ffn_v2.py @@ -16,7 +16,7 @@ from .monotonic_align import maximum_path from .feature_extraction import DbMelFeatureExtraction -from ..glowTTS.feature_config import DbMelFeatureExtractionConfig +from ..feature_config import DbMelFeatureExtractionConfig from .eval_forward import * diff --git a/users/rilling/experiments/librispeech/librispeech_glowtts/pytorch_networks/glowTTS_x_vector.py b/users/rilling/experiments/librispeech/librispeech_glowtts/pytorch_networks/glowTTS_x_vector.py index 9f41c67fa..19c7f04bf 100644 --- a/users/rilling/experiments/librispeech/librispeech_glowtts/pytorch_networks/glowTTS_x_vector.py +++ b/users/rilling/experiments/librispeech/librispeech_glowtts/pytorch_networks/glowTTS_x_vector.py @@ -16,7 +16,7 @@ from .monotonic_align import maximum_path from .feature_extraction import DbMelFeatureExtraction -from ..glowTTS.feature_config import DbMelFeatureExtractionConfig +from ..feature_config import DbMelFeatureExtractionConfig from .eval_forward import * diff --git a/users/rilling/experiments/librispeech/librispeech_glowtts/pytorch_networks/glowTTS_x_vector_eval.py b/users/rilling/experiments/librispeech/librispeech_glowtts/pytorch_networks/glowTTS_x_vector_eval.py index dd69ca93e..33c4579e7 100644 --- a/users/rilling/experiments/librispeech/librispeech_glowtts/pytorch_networks/glowTTS_x_vector_eval.py +++ 
b/users/rilling/experiments/librispeech/librispeech_glowtts/pytorch_networks/glowTTS_x_vector_eval.py @@ -16,7 +16,7 @@ from .monotonic_align import maximum_path from .feature_extraction import DbMelFeatureExtraction -from ..glowTTS.feature_config import DbMelFeatureExtractionConfig +from ..feature_config import DbMelFeatureExtractionConfig class XVector(nn.Module): diff --git a/users/rilling/experiments/librispeech/librispeech_glowtts/pytorch_networks/glowTTS_x_vector_v2.py b/users/rilling/experiments/librispeech/librispeech_glowtts/pytorch_networks/glowTTS_x_vector_v2.py index 64a6397fb..0c1e3f652 100644 --- a/users/rilling/experiments/librispeech/librispeech_glowtts/pytorch_networks/glowTTS_x_vector_v2.py +++ b/users/rilling/experiments/librispeech/librispeech_glowtts/pytorch_networks/glowTTS_x_vector_v2.py @@ -17,7 +17,7 @@ from .monotonic_align import maximum_path from .feature_extraction import DbMelFeatureExtraction -from ..glowTTS.feature_config import DbMelFeatureExtractionConfig +from ..feature_config import DbMelFeatureExtractionConfig class XVector(nn.Module): diff --git a/users/rilling/experiments/librispeech/librispeech_glowtts/pytorch_networks/glowTTS_x_vector_v3.py b/users/rilling/experiments/librispeech/librispeech_glowtts/pytorch_networks/glowTTS_x_vector_v3.py index b4f7c4df2..1a341b9a9 100644 --- a/users/rilling/experiments/librispeech/librispeech_glowtts/pytorch_networks/glowTTS_x_vector_v3.py +++ b/users/rilling/experiments/librispeech/librispeech_glowtts/pytorch_networks/glowTTS_x_vector_v3.py @@ -17,7 +17,7 @@ from .monotonic_align import maximum_path from .feature_extraction import DbMelFeatureExtraction -from ..glowTTS.feature_config import DbMelFeatureExtractionConfig +from ..feature_config import DbMelFeatureExtractionConfig class DurationPredictor(nn.Module): """ diff --git a/users/rilling/experiments/librispeech/librispeech_glowtts/pytorch_networks/glowTTS_x_vector_v3_norm_xvector.py 
b/users/rilling/experiments/librispeech/librispeech_glowtts/pytorch_networks/glowTTS_x_vector_v3_norm_xvector.py index ee42f1b39..498c83e28 100644 --- a/users/rilling/experiments/librispeech/librispeech_glowtts/pytorch_networks/glowTTS_x_vector_v3_norm_xvector.py +++ b/users/rilling/experiments/librispeech/librispeech_glowtts/pytorch_networks/glowTTS_x_vector_v3_norm_xvector.py @@ -17,7 +17,7 @@ from .monotonic_align import maximum_path from .feature_extraction import DbMelFeatureExtraction -from ..glowTTS.feature_config import DbMelFeatureExtractionConfig +from ..feature_config import DbMelFeatureExtractionConfig class DurationPredictor(nn.Module): diff --git a/users/rilling/experiments/librispeech/librispeech_glowtts/pytorch_networks/gt_extractor.py b/users/rilling/experiments/librispeech/librispeech_glowtts/pytorch_networks/gt_extractor.py index 20c973003..859b54970 100644 --- a/users/rilling/experiments/librispeech/librispeech_glowtts/pytorch_networks/gt_extractor.py +++ b/users/rilling/experiments/librispeech/librispeech_glowtts/pytorch_networks/gt_extractor.py @@ -16,7 +16,7 @@ from .monotonic_align import maximum_path from .feature_extraction import DbMelFeatureExtraction -from ..glowTTS.feature_config import DbMelFeatureExtractionConfig +from ..feature_config import DbMelFeatureExtractionConfig class Model(nn.Module): """ diff --git a/users/rilling/experiments/librispeech/librispeech_100_hybrid/__init__.py b/users/rilling/experiments/librispeech/librispeech_glowtts/vocoder/__init__.py similarity index 100% rename from users/rilling/experiments/librispeech/librispeech_100_hybrid/__init__.py rename to users/rilling/experiments/librispeech/librispeech_glowtts/vocoder/__init__.py diff --git a/users/rilling/experiments/librispeech/tts_architecture_improvement_23/ctc_aligner/config.py b/users/rilling/experiments/librispeech/librispeech_glowtts/vocoder/config.py similarity index 64% rename from 
users/rilling/experiments/librispeech/tts_architecture_improvement_23/ctc_aligner/config.py rename to users/rilling/experiments/librispeech/librispeech_glowtts/vocoder/config.py index a1ec66639..83c9d41e5 100644 --- a/users/rilling/experiments/librispeech/tts_architecture_improvement_23/ctc_aligner/config.py +++ b/users/rilling/experiments/librispeech/librispeech_glowtts/vocoder/config.py @@ -7,19 +7,20 @@ from i6_experiments.users.rossenbach.common_setups.returnn.datasets import ( GenericDataset, ) -from .data import AlignmentTrainingDatasets +from .data import TrainingDataset from ..serializer import get_network_serializer, get_pytorch_serializer def get_training_config( - returnn_common_root: tk.Path, - training_datasets: AlignmentTrainingDatasets, - network_module: str, - net_args: Dict[str, Any], - config: Dict[str, Any], - debug: bool = False, - pytorch_mode=False, - use_custom_engine=False, + returnn_common_root: tk.Path, + training_datasets: TrainingDataset, + network_module: str, + net_args: Dict[str, Any], + config: Dict[str, Any], + debug: bool = False, + pytorch_mode=False, + use_custom_engine=False, + keep_epochs: set = None, ): """ Returns the RETURNN config serialized by :class:`ReturnnCommonSerializer` in returnn_common for the ctc_aligner @@ -31,14 +32,15 @@ def get_training_config( # changing these does not change the hash post_config = { - "cleanup_old_models": True, - "stop_on_nonfinite_train_score": False, # this might break now with True + "cleanup_old_models": True if keep_epochs is None else {"keep": keep_epochs}, + "stop_on_nonfinite_train_score": True, # this might break now with True + "allow_missing_optimizer_checkpoint": True, } base_config = { ############# "train": training_datasets.train.as_returnn_opts(), - "dev": training_datasets.cv.as_returnn_opts() + "dev": training_datasets.cv.as_returnn_opts(), } config = {**base_config, **copy.deepcopy(config)} @@ -56,16 +58,19 @@ def get_training_config( network_module=network_module, 
net_args=net_args, debug=debug, - use_custom_engine=use_custom_engine - ) - returnn_config = ReturnnConfig( - config=config, post_config=post_config, python_epilog=[serializer] + use_custom_engine=use_custom_engine, ) + returnn_config = ReturnnConfig(config=config, post_config=post_config, python_epilog=[serializer]) return returnn_config -def get_forward_config( - returnn_common_root, forward_dataset: GenericDataset, datastreams, network_module, net_args, debug=False, pytorch_mode=False, +def get_extract_durations_forward__config( + returnn_common_root, + forward_dataset: GenericDataset, + network_module, + net_args, + debug=False, + pytorch_mode=False, ): """ Returns the RETURNN config serialized by :class:`ReturnnCommonSerializer` in returnn_common for forward_ctc_aligner @@ -76,30 +81,39 @@ def get_forward_config( """ config = { "behavior_version": 16, - "forward_batch_size": 28000, + "batch_size": 28000, "max_seq_length": {"audio_features": 1000}, "max_seqs": 200, "forward_use_search": True, - "target": "extract_alignment", ############# - "eval": forward_dataset.as_returnn_opts() + "eval": forward_dataset.joint.as_returnn_opts(), } get_serializer = get_pytorch_serializer if pytorch_mode else get_network_serializer serializer = get_serializer( training=False, returnn_common_root=returnn_common_root, - datastreams=datastreams, network_module=network_module, net_args=net_args, - debug=debug + forward=True, + debug=debug, ) returnn_config = ReturnnConfig(config=config, python_epilog=[serializer]) return returnn_config -def get_pt_forward_config( - returnn_common_root, forward_dataset: GenericDataset, datastreams, network_module, net_args, debug=False, pytorch_mode=False, +def get_forward_config( + returnn_common_root, + forward_dataset: GenericDataset, + network_module, + net_args, + config, + debug=False, + pytorch_mode=False, + forward_args={}, + target="audio", + train_data=False, + joint_data=False, ): """ Returns the RETURNN config serialized by 
:class:`ReturnnCommonSerializer` in returnn_common for forward_ctc_aligner @@ -108,24 +122,35 @@ def get_pt_forward_config( :param kwargs: arguments to be passed to the network construction :return: RETURNN forward config """ - config = { - "batch_size": 28000, - "max_seqs": 200, + fd = None + if isinstance(forward_dataset, tuple): + fd = forward_dataset[0].as_returnn_opts() + elif train_data: + fd = forward_dataset.train.as_returnn_opts() + elif joint_data: + fd = forward_dataset.joint.as_returnn_opts() + else: + fd = forward_dataset.cv.as_returnn_opts() + + base_config = { + "behavior_version": 16, + "forward_use_search": True, ############# - "forward": forward_dataset.as_returnn_opts() + "forward": fd, } + + config = {**base_config, **copy.deepcopy(config)} get_serializer = get_pytorch_serializer if pytorch_mode else get_network_serializer serializer = get_serializer( training=False, returnn_common_root=returnn_common_root, - datastreams=datastreams, network_module=network_module, - forward=True, net_args=net_args, - debug=debug + forward_args=forward_args, + forward=True, + debug=debug, + target=target, ) returnn_config = ReturnnConfig(config=config, python_epilog=[serializer]) return returnn_config - - diff --git a/users/rilling/experiments/librispeech/librispeech_glowtts/vocoder/data.py b/users/rilling/experiments/librispeech/librispeech_glowtts/vocoder/data.py new file mode 100644 index 000000000..3d92b0075 --- /dev/null +++ b/users/rilling/experiments/librispeech/librispeech_glowtts/vocoder/data.py @@ -0,0 +1,370 @@ +from dataclasses import dataclass +import os +from functools import lru_cache +from sisyphus import tk +from typing import Dict, List, Optional, Tuple + +from i6_core.returnn.dataset import SpeakerLabelHDFFromBlissJob +from i6_core.returnn import CodeWrapper, BlissToOggZipJob + +from returnn_common.datasets import Dataset, OggZipDataset, HDFDataset, MetaDataset +from i6_experiments.users.rossenbach.common_setups.returnn.datastreams.base import 
Datastream +from i6_experiments.users.rossenbach.common_setups.returnn.datastreams.vocabulary import LabelDatastream + +from i6_experiments.users.rossenbach.datasets.librispeech import get_librispeech_tts_segments + +from ..data import ( + get_tts_log_mel_datastream, + get_audio_raw_datastream, + get_train_bliss_and_zip, + get_vocab_datastream, + get_mixed_cv_segments, + get_bliss_corpus_dict, + get_lexicon, +) +from ..default_tools import MINI_RETURNN_ROOT, RETURNN_PYTORCH_EXE + +EpochWiseFilter = Tuple[int, int, int] + +@dataclass(frozen=True) +class TrainingDataset: + """ + Dataclass for Alignment Datasets + """ + + train: Dataset + cv: Dataset + joint: Dataset + datastreams: Dict[str, Datastream] + +@dataclass() +class TrainingDatasetSettings: + # features settings + custom_processing_function: Optional[str] + + # training settings + partition_epoch: int + epoch_wise_filters: List[EpochWiseFilter] + seq_ordering: str + +def make_meta_dataset(audio_dataset, speaker_dataset, duration_dataset=None, xvector_dataset=None): + """ + Shared function to create a meta-dataset with joined audio and speaker information + + :param datasets.OggZipDataset audio_dataset: + :param datasets.HDFDataset speaker_dataset: + :return: + :rtype: MetaDataset + """ + data_map = { + "audio_features": ("audio", "data"), + "phonemes": ("audio", "classes"), + "speaker_labels": ("speaker", "data"), + } + + ds = { + "audio": audio_dataset.as_returnn_opts(), + "speaker": speaker_dataset.as_returnn_opts() + } + + if duration_dataset: + data_map["durations"] = ("durations", "data") + ds["durations"] = duration_dataset.as_returnn_opts() + + if xvector_dataset is not None: + data_map["xvectors"] = ("xvectors", "data") + ds["xvectors"] = xvector_dataset.as_returnn_opts() + + meta_dataset = MetaDataset( + data_map=data_map, + datasets=ds, + seq_order_control_dataset="audio", + ) + return meta_dataset + +def build_training_dataset( + librispeech_key: str, + settings: TrainingDatasetSettings, + 
silence_preprocessing=False, + xvectors_file=None, +) -> TrainingDataset: + """ + + :param settings: + :param output_path: + """ + train_bliss, train_ogg = get_train_bliss_and_zip("train-clean-100", silence_preprocessed=silence_preprocessing) + # _, dev_clean_ogg = get_train_bliss_and_zip("dev-clean", silence_preprocessed=silence_preprocessing, remove_unk_seqs=True) + # _, dev_other_ogg = get_train_bliss_and_zip("dev-other", silence_preprocessed=silence_preprocessing, remove_unk_seqs=True) + + train_bpe_datastream = get_vocab_datastream(corpus_key=librispeech_key, with_blank=True) + audio_datastream = get_audio_raw_datastream() + + train_segments, cv_segments = get_librispeech_tts_segments(ls_corpus_key=librispeech_key) + + speaker_label_job = SpeakerLabelHDFFromBlissJob( + bliss_corpus=train_bliss, + returnn_root=MINI_RETURNN_ROOT, + ) + joint_speaker_hdf = speaker_label_job.out_speaker_hdf + + joint_speaker_dataset = HDFDataset( + files=[joint_speaker_hdf] + ) + speaker_datastream = LabelDatastream( + available_for_inference=True, + vocab_size=speaker_label_job.out_num_speakers, + vocab=speaker_label_job.out_speaker_dict, + ) + + datastreams = { + "audio_features": audio_datastream, + "phonemes": train_bpe_datastream, + "speaker_labels": speaker_datastream, + } + + if xvectors_file is not None: + xvector_dataset = HDFDataset(files=[xvectors_file]) + xvector_dataset_train = HDFDataset(files=[xvectors_file], segment_file=train_segments) + xvector_dataset_cv = HDFDataset(files=[xvectors_file], segment_file=cv_segments) + else: + xvector_dataset, xvector_dataset_train, xvector_dataset_cv = (None, None, None) + + training_audio_opts = audio_datastream.as_returnn_audio_opts() + if settings.custom_processing_function: + training_audio_opts["pre_process"] = CodeWrapper(settings.custom_processing_function) + + additional_opts = {} + if settings.epoch_wise_filters: + additional_opts["epoch_wise_filter"] = {} + for fr, to, max_mean_len in settings.epoch_wise_filters: + 
additional_opts["epoch_wise_filter"][(fr, to)] = {"max_mean_len": max_mean_len} + + train_zip_dataset = OggZipDataset( + files=train_ogg, + audio_options=training_audio_opts, + target_options=train_bpe_datastream.as_returnn_targets_opts(), + partition_epoch=settings.partition_epoch, + segment_file=train_segments, + seq_ordering=settings.seq_ordering, + additional_options=additional_opts, + ) + train_dataset = make_meta_dataset(train_zip_dataset, joint_speaker_dataset, xvector_dataset=xvector_dataset_train) + + cv_zip_dataset = OggZipDataset( + files=train_ogg, + audio_options=audio_datastream.as_returnn_audio_opts(), + target_options=train_bpe_datastream.as_returnn_targets_opts(), + segment_file=cv_segments, + seq_ordering="sorted_reverse", + ) + cv_dataset = make_meta_dataset(cv_zip_dataset, joint_speaker_dataset, xvector_dataset=xvector_dataset_cv) + + devtrain_zip_dataset = OggZipDataset( + files=train_ogg, + audio_options=audio_datastream.as_returnn_audio_opts(), + target_options=train_bpe_datastream.as_returnn_targets_opts(), + seq_ordering="sorted_reverse", + # random_subset=3000, + ) + devtrain_dataset = make_meta_dataset(devtrain_zip_dataset, joint_speaker_dataset, xvector_dataset=xvector_dataset) + + return TrainingDataset(train=train_dataset, cv=cv_dataset, joint=devtrain_dataset, datastreams=datastreams) + +def build_training_dataset2( + settings: TrainingDatasetSettings, + ls_corpus_key="train-clean-100", + durations_file=None, + silence_preprocessed=True, + ) -> TrainingDataset: + """ + + :param center: do feature centering + """ + # bliss_dataset, zip_dataset = get_train_bliss_and_zip(ls_corpus_key=ls_corpus_key, silence_preprocessed=silence_preprocessed) + + train_bliss, train_ogg = get_train_bliss_and_zip(ls_corpus_key=ls_corpus_key, silence_preprocessed=silence_preprocessed) + _, dev_clean_ogg = get_train_bliss_and_zip("dev-clean", silence_preprocessed=False, remove_unk_seqs=True) + _, dev_other_ogg = get_train_bliss_and_zip("dev-other", 
silence_preprocessed=False, remove_unk_seqs=True) + + # segments for train-clean-100-tts-train and train-clean-100-tts-dev + # (1004 segments for dev, 4 segments for each of the 251 speakers) + train_segments, cv_segments = get_librispeech_tts_segments(ls_corpus_key=ls_corpus_key) + + vocab_datastream = get_vocab_datastream(with_blank=True, corpus_key=ls_corpus_key) + # log_mel_datastream = get_tts_log_mel_datastream(center=center) + + audio_datastream = get_audio_raw_datastream() + + # we currently assume that train and cv share the same corpus file + speaker_label_job = SpeakerLabelHDFFromBlissJob( + bliss_corpus=train_bliss, + returnn_root=MINI_RETURNN_ROOT, + ) + joint_speaker_hdf = speaker_label_job.out_speaker_hdf + + joint_speaker_dataset = HDFDataset( + files=[joint_speaker_hdf] + ) + speaker_datastream = LabelDatastream( + available_for_inference=True, + vocab_size=speaker_label_job.out_num_speakers, + vocab=speaker_label_job.out_speaker_dict, + ) + + if durations_file: + duration_dataset = HDFDataset( + files=[durations_file] + ) + + # ----- Ogg and Meta datasets + training_audio_opts = audio_datastream.as_returnn_audio_opts() + + if settings.custom_processing_function: + training_audio_opts["pre_process"] = CodeWrapper(settings.custom_processing_function) + + additional_opts = {} + if settings.epoch_wise_filters: + additional_opts["epoch_wise_filter"] = {} + for fr, to, max_mean_len in settings.epoch_wise_filters: + additional_opts["epoch_wise_filter"][(fr, to)] = {"max_mean_len": max_mean_len} + + train_ogg_dataset = OggZipDataset( + path=train_ogg, + audio_options=training_audio_opts, + target_options=vocab_datastream.as_returnn_targets_opts(), + segment_file=train_segments, + partition_epoch=settings.partition_epoch, + seq_ordering="laplace:.1000" + ) + if durations_file: + train_dataset = make_meta_dataset(train_ogg_dataset, joint_speaker_dataset, duration_dataset=duration_dataset) + else: + train_dataset = make_meta_dataset(train_ogg_dataset, 
joint_speaker_dataset) + + cv_ogg_dataset = OggZipDataset( + path=train_ogg, + audio_options=audio_datastream.as_returnn_audio_opts(), + target_options=vocab_datastream.as_returnn_targets_opts(), + segment_file=cv_segments, + partition_epoch=1, + seq_ordering="sorted", + ) + if durations_file: + cv_dataset = make_meta_dataset(cv_ogg_dataset, joint_speaker_dataset, duration_dataset=duration_dataset) + else: + cv_dataset = make_meta_dataset(cv_ogg_dataset, joint_speaker_dataset) + + joint_ogg_zip = OggZipDataset( + path=train_ogg, + audio_options=audio_datastream.as_returnn_audio_opts(), + target_options=vocab_datastream.as_returnn_targets_opts(), + partition_epoch=1, + seq_ordering="sorted", + ) + joint_metadataset = make_meta_dataset(joint_ogg_zip, joint_speaker_dataset) + + # ----- final outputs + + datastreams = { + "audio_features": audio_datastream, + "phonemes": vocab_datastream, + "speaker_labels": speaker_datastream, + } + + align_datasets = TrainingDataset( + train=train_dataset, + cv=cv_dataset, + joint=joint_metadataset, + datastreams=datastreams, + ) + + return align_datasets + +def build_swer_test_dataset(synthetic_bliss, preemphasis: Optional[float] = None, peak_normalization: bool = False): + """ + + :param synthetic_bliss: + :param preemphasis: + :param peak_normalization: + """ + zip_dataset_job = BlissToOggZipJob( + bliss_corpus=synthetic_bliss, + no_conversion=True, # for Librispeech we are already having ogg + returnn_python_exe=RETURNN_PYTORCH_EXE, + returnn_root=MINI_RETURNN_ROOT, + ) + + audio_datastream = get_audio_raw_datastream(preemphasis, peak_normalization) + + data_map = {"raw_audio": ("zip_dataset", "data")} + + test_zip_dataset = OggZipDataset( + files=[zip_dataset_job.out_ogg_zip], + audio_options=audio_datastream.as_returnn_audio_opts(), + seq_ordering="sorted_reverse", + ) + test_dataset = MetaDataset( + data_map=data_map, datasets={"zip_dataset": test_zip_dataset}, seq_order_control_dataset="zip_dataset" + ) + + return 
test_dataset + + +@lru_cache() +def build_tts_forward_dataset(librispeech_key: str, dataset_key: str, xvectors_file: tk.Path = None): + """ + + :param librispeech_key: base librispeech training set for vocab generation + :param dataset_key: test dataset to generate + :param silence_preprocessing: use a setup with silence preprocessing + """ + + _, test_ogg = get_train_bliss_and_zip(ls_corpus_key=dataset_key, silence_preprocessed=False, remove_unk_seqs=True) + bliss_dict = get_bliss_corpus_dict() + # audio_datastream = get_audio_raw_datastream() + tts_lexicon = get_lexicon(with_blank=True) + phonemes_datastream = get_vocab_datastream(corpus_key=librispeech_key, with_blank=True) + + bliss_corpus = bliss_dict[dataset_key] + + data_map = {"audio": ("zip_dataset", "data"), "phonemes": ("zip_dataset", "classes")} + + audio_datastream = get_audio_raw_datastream() + test_zip_dataset = OggZipDataset( + files=[test_ogg], + audio_options=audio_datastream.as_returnn_audio_opts(), + target_options=phonemes_datastream.as_returnn_targets_opts(), + seq_ordering="sorted_reverse", + ) + + if xvectors_file is not None: + x_vectors_dataset = HDFDataset(files=[xvectors_file]) + data_map["xvectors"] = ("xvectors", "data") + test_dataset = MetaDataset( + data_map=data_map, + datasets={"zip_dataset": test_zip_dataset, "xvectors": x_vectors_dataset}, + seq_order_control_dataset="zip_dataset", + ) + else: + speaker_label_job = SpeakerLabelHDFFromBlissJob( + bliss_corpus=bliss_corpus, + returnn_root=MINI_RETURNN_ROOT, + ) + joint_speaker_hdf = speaker_label_job.out_speaker_hdf + + joint_speaker_dataset = HDFDataset(files=[joint_speaker_hdf]) + speaker_datastream = LabelDatastream( + available_for_inference=True, + vocab_size=speaker_label_job.out_num_speakers, + vocab=speaker_label_job.out_speaker_dict, + ) + + data_map["speaker_labels"] = ("speakers", "data") + test_dataset = MetaDataset( + data_map=data_map, + datasets={"zip_dataset": test_zip_dataset, "speakers": joint_speaker_dataset}, + 
seq_order_control_dataset="zip_dataset", + ) + + return test_dataset, bliss_corpus diff --git a/users/rilling/experiments/librispeech/librispeech_glowtts/vocoder/pipeline.py b/users/rilling/experiments/librispeech/librispeech_glowtts/vocoder/pipeline.py new file mode 100644 index 000000000..49a7915e7 --- /dev/null +++ b/users/rilling/experiments/librispeech/librispeech_glowtts/vocoder/pipeline.py @@ -0,0 +1,81 @@ +import os +import copy +from sisyphus import tk +from i6_core.returnn import ReturnnTrainingJob +from i6_core.returnn.forward import ReturnnForwardJob, ReturnnForwardJobV2 +from i6_core.returnn.search import SearchBPEtoWordsJob +from i6_experiments.users.rossenbach.common_setups.returnn.datasets import GenericDataset +from i6_experiments.users.rossenbach.tts.evaluation.nisqa import NISQAMosPredictionJob + +from ..default_tools import NISQA_REPO, SCTK_BINARY_PATH + +def training(config, returnn_exe, returnn_root, prefix, num_epochs=65): + + train_job = ReturnnTrainingJob( + config, + log_verbosity=5, + num_epochs=num_epochs, + time_rqmt=100, + mem_rqmt=10, + cpu_rqmt=4, + returnn_python_exe=returnn_exe, + returnn_root=returnn_root, +) + train_job.add_alias(prefix + "/training") + tk.register_output(prefix + "/training.models", train_job.out_model_dir) + + return train_job + +def glowTTS_forward(checkpoint, config, returnn_exe, returnn_root, prefix, alias_addition=None, target="audio", extra_evaluation_epoch=None, joint_data=False, device="gpu"): + hdf_outputs = [] if target != "audio" else ["/var/tmp/lukas.rilling/out"] + if target == "audio": + hdf_outputs = ["/var/tmp/lukas.rilling/out"] + elif target == "latent_space": + hdf_outputs = ["samples.hdf", "mean.hdf"] + # hdf_outputs = ["samples.hdf"] + else: + hdf_outputs = [] + + last_forward_job = ReturnnForwardJob( + model_checkpoint=checkpoint, + returnn_config=config, + hdf_outputs=hdf_outputs, + returnn_python_exe=returnn_exe, + returnn_root=returnn_root, + mem_rqmt=20, + device=device + ) + + if 
(target == "spectrogram" and joint_data): + last_forward_job.rqmt["gpu_mem"] = 24 + + forward_prefix = prefix + "/forward" + + if target != "audio": + forward_prefix += f"_{target}" + + if extra_evaluation_epoch is not None: + forward_prefix += f"_extra_evaluation_{extra_evaluation_epoch}" + + if alias_addition: + forward_prefix += alias_addition + + forward_suffix = f"/{target}" + + last_forward_job.add_alias(forward_prefix) + + tts_hdf = None + + if target == "audio": + tts_hdf = last_forward_job.out_hdf_files["/var/tmp/lukas.rilling/out"] + tk.register_output(forward_prefix + forward_suffix, tts_hdf) + elif target == "latent_space": + samples_hdf = last_forward_job.out_hdf_files["samples.hdf"] + mean_hdf = last_forward_job.out_hdf_files["mean.hdf"] + tk.register_output(forward_prefix + forward_suffix + "/samples", samples_hdf) + tk.register_output(forward_prefix + forward_suffix + "/mean", mean_hdf) + else: + tts_hdf = last_forward_job.out_hdf_files["output.hdf"] + tk.register_output(forward_prefix + forward_suffix, tts_hdf) + + return last_forward_job diff --git a/users/rilling/experiments/librispeech/librispeech_glowtts/glowTTS/vocoder/simple_gl.py b/users/rilling/experiments/librispeech/librispeech_glowtts/vocoder/simple_gl.py similarity index 93% rename from users/rilling/experiments/librispeech/librispeech_glowtts/glowTTS/vocoder/simple_gl.py rename to users/rilling/experiments/librispeech/librispeech_glowtts/vocoder/simple_gl.py index 7cc513e06..841d37dd9 100644 --- a/users/rilling/experiments/librispeech/librispeech_glowtts/glowTTS/vocoder/simple_gl.py +++ b/users/rilling/experiments/librispeech/librispeech_glowtts/vocoder/simple_gl.py @@ -7,10 +7,10 @@ from i6_core.tools.git import CloneGitRepositoryJob -from ..data import build_training_dataset, TrainingDatasetSettings -from ..config import get_training_config -from ..pipeline import glowTTS_training as training -from ...data import get_tts_log_mel_datastream +from .data import build_training_dataset, 
TrainingDatasetSettings +from .config import get_training_config +from .pipeline import training +from ..data import get_tts_log_mel_datastream from i6_experiments.users.rossenbach.common_setups.returnn.datastreams.audio import DBMelFilterbankOptions @@ -75,7 +75,7 @@ def run_exp(name, params, net_module, config, use_custom_engine=False, debug=Fal norm = (log_mel_datastream.additional_options["norm_mean"], log_mel_datastream.additional_options["norm_std_dev"]) - from ...pytorch_networks.vocoder.simple_gl.blstm_gl_predictor import BlstmGLPredictorConfig + from ..pytorch_networks.vocoder.simple_gl.blstm_gl_predictor import BlstmGLPredictorConfig from ..feature_config import DbMelFeatureExtractionConfig assert isinstance(log_mel_datastream.options.feature_options, DBMelFilterbankOptions) fe_config = DbMelFeatureExtractionConfig( diff --git a/users/rilling/experiments/librispeech/librispeech_joint_training/experiments.py b/users/rilling/experiments/librispeech/librispeech_joint_training/experiments.py index 3b116ab20..dd1ecffe2 100644 --- a/users/rilling/experiments/librispeech/librispeech_joint_training/experiments.py +++ b/users/rilling/experiments/librispeech/librispeech_joint_training/experiments.py @@ -34,13 +34,12 @@ def get_glow_joint(x_vector_exp, gl_checkpoint): - """ - Baseline for the glow TTS in returnn_common with serialization + """Experiments on joint training of Glow-TTS and a Conformer ASR using the latent space of Glow-TTS as features. 
- Uses updated RETURNN_COMMON - - :return: durations_hdf - """ + :param dict x_vector_exp: Dictionary of x-vector experiments from ../librispeech_x_vectors to import x-vector model for on-the-fly speaker embedding generation for TTS and ASR + :param dict gl_checkpoint: Dictionary containing checkpoint and config of a BLSTM transforming log-mel into linear spectrogram for G&L vocoding + :return dict: Dictionary containing the experiments with all their jobs to be used to import checkpoints or other job attributes in other experiments + """ prefix = "experiments/librispeech/joint_training/default/raw_audio/" experiments = {} @@ -68,6 +67,31 @@ def run_exp( large_gpu_training=False, with_prior=False, ): + """Creates the Jobs for training, TTS generation/forwarding, ASR search and evaluations + + :param str name: Name of the experiment for alias creation + :param dict args: General arguments for training, forward and search configs + :param TrainingDataset dataset: Dataset used for training and TTS forwarding (without eval.) 
+ :param dict test_dataset: Dictionary containing datasets to be used for ASR evaluation + :param int num_epochs: Number of epochs in training, defaults to 100 + :param bool use_custom_engine: whether a custom engine is to be used in Returnn, defaults to False + :param dict training_args: Additional arguments for training, passed to the train step, defaults to {} + :param dict forward_args: Additional arguments for forwarding passed to the forward step, defaults to {} + :param dict search_args: Additional arguments for search passed to the search init step, defaults to {} + :param list[int] keep_epochs: List of checkpoints that should be kept during training, defaults to None + :param bool extract_x_vector: whether the x-vectors should be extracted into an HDF (only useful if x-vector model is unfrozen), defaults to False + :param bool tts_forward: whether TTS forwarding should be run (not evaluation, uses training dataset), defaults to True + :param bool asr_search: whether ASR search should be run, defaults to True + :param bool use_speaker_labels_in_dev: whether the validation set should contain speaker labels, defaults to False + :param ReturnnTrainingJob given_train_job_for_forward: , defaults to None + :param bool eval_tts: whether TTS should be evaluated, defaults to False + :param dict tts_eval_datasets: Dictionary containing datasets for TTS evaluation, defaults to None + :param bool eval_invertibility: whether invertibility of coupling blocks should be evaluated, defaults to False + :param bool eval_asr_invertibility: whether the invertibility of the ASR usage of the coupling blocks should be evaluated (only useful if separate passes are used for TTS and ASR), defaults to False + :param bool large_gpu_training: whether the GPU memory requirement for training should be set to 24GB, defaults to False + :param bool with_prior: Whether the prior of the internal language model should be estimated for prior correction (defaults to True if
search_args["prior_scale"] != 0), defaults to False + :return dict: Dictionary containing all the jobs for this experiment + """ exp = {} with_prior = with_prior or ("prior_scale" in search_args and search_args["prior_scale"] != 0) diff --git a/users/rilling/experiments/librispeech/librispeech_joint_training_given_alignments/README.md b/users/rilling/experiments/librispeech/librispeech_joint_training_given_alignments/README.md new file mode 100644 index 000000000..13872af24 --- /dev/null +++ b/users/rilling/experiments/librispeech/librispeech_joint_training_given_alignments/README.md @@ -0,0 +1,15 @@ +# Joint Training using given alignments + +The naming of this folder might be confusing: Despite the name "joint training", this folder does not contain experiments jointly training TTS and ASR as is done in "librispeech_joint_training"; instead, it contains experiments using an additional auxiliary loss to influence the latent space of Glow-TTS during TTS training. + +Additionally, this setup is the newest of the setups dealing with joint modelling of TTS and ASR and therefore contains functions to create datasets for TTS and ASR with additional external durations and x-vector speaker embeddings for TTS forwarding, as well as explicit model configs that are written to the Returnn config to enforce that all model parameters are stored in the Returnn config and considered for hash computation of the Returnn jobs. + +Therefore, the folder [`exp_tts`](./exp_tts/) contains additional TTS-only experiments similar to the experiments in [`librispeech_glowtts`](../librispeech_glowtts/). + +The folder [`exp_joint`](./exp_joint/) contains TTS trainings with auxiliary loss, where the aux. loss is computed on the phoneme labels that are upsampled using an external Glow-TTS Viterbi alignment, given as an HDF.
+ +The folder [`exp_joint_flow_ga`](./exp_joint_flow_ga/) contains similar experiments, but instead of using MAS to compute the Viterbi alignment during training, the external alignment is also used for the TTS itself. + +[`exp_joint_flow_ga_frozen_glowtts`](./exp_joint_flow_ga_frozen_glowtts/) additionally freezes the Glow-TTS parameters and only trains the phoneme reconstruction from different parts of the latent space, making it similar to the "encoder_test/decoder_test/encoder_sample" experiments in [`librispeech_glowtts`](../librispeech_glowtts/) and [`librispeech_glow_asr`](../librispeech_glow_asr/). + +[`exp_joint_2step`](./exp_joint_2step/) contains a mixture of experiments using two steps of training. The first block in the respective `experiments.py` contains trainings where a very strong auxiliary loss was used in the first step, after which the TTS is further trained without an aux. loss in a second training. Additionally, it contains further ASR trainings using BLSTM or Conformer on a TTS trained with aux. loss to see the effect of the aux. loss on WER.
diff --git a/users/rilling/experiments/librispeech/librispeech_joint_training_given_alignments/exp_joint/experiments.py b/users/rilling/experiments/librispeech/librispeech_joint_training_given_alignments/exp_joint/experiments.py index adf39a1bc..9e4f161a4 100644 --- a/users/rilling/experiments/librispeech/librispeech_joint_training_given_alignments/exp_joint/experiments.py +++ b/users/rilling/experiments/librispeech/librispeech_joint_training_given_alignments/exp_joint/experiments.py @@ -36,14 +36,13 @@ from ..storage import tts_models, add_tts_model, TTSModel -def get_glow_joint(x_vector_exp, joint_exps, tts_exps, gl_checkpoint): - """ - Baseline for the glow TTS in returnn_common with serialization +def get_glow_joint(x_vector_exp, tts_exps, gl_checkpoint): + """Experiments training TTS jointly with phoneme prediction network as an auxiliary loss - Uses updated RETURNN_COMMON - - :return: durations_hdf - """ + :param dict x_vector_exp: Dictionary containing x-vector experiments from ../../librispeech_x_vectors + :param dict tts_exps: Dictionary containing TTS-only experiments from ../../librispeech_glowtts + :param dict gl_checkpoint: _description_ + """ prefix = "experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/" experiments = {} @@ -59,26 +58,40 @@ def run_exp( forward_args={}, search_args={}, keep_epochs=None, - extract_x_vector=False, tts_forward=True, asr_search=True, phoneme_pred=True, encoder_phoneme_pred=False, - asr_cv_set=False, - given_train_job_for_forward=None, tts_eval_datasets=None, ): + """Creates the job for an experiment + + :param str name: Name of the experiment used for aliases + :param dict args: General arguments used for the Returnn Configs + :param TrainingDataset dataset: Dataset for training + :param dict test_dataset: Dictionary of datasets used for evaluation + :param int num_epochs: Number of epochs for training, defaults to 100 + :param bool use_custom_engine: whether a custom engine is used in Returnn, 
defaults to False + :param dict training_args: Additional arguments for training, defaults to {} + :param dict forward_args: Additional arguments for TTS forward, defaults to {} + :param dict search_args: Additional arguments for phoneme prediction, defaults to {} + :param list[int] keep_epochs: List of numbers of checkpoints that are supposed to be kept during training, defaults to None + :param bool tts_forward: whether TTS evaluation should be run (autoMOS, NISQA), defaults to True + :param bool asr_search: whether ASR search should be run, defaults to True + :param bool phoneme_pred: whether phoneme prediction evaluation should be run, defaults to True + :param bool encoder_phoneme_pred: whether phoneme prediction evaluation should be run using encoder output instead of inverse decoder output as the input, defaults to False + :param dict tts_eval_datasets: Dictionary containing datasets for TTS evaluation, defaults to None + :return dict: Dictionary containing all the job references for the given experiment + """ exp = {} - if given_train_job_for_forward is None: - training_config = get_training_config( - training_datasets=dataset, - **args, - training_args=training_args, - use_custom_engine=use_custom_engine, - keep_epochs=keep_epochs, - asr_cv_set=asr_cv_set, - ) # implicit reconstruction loss + training_config = get_training_config( + training_datasets=dataset, + **args, + training_args=training_args, + use_custom_engine=use_custom_engine, + keep_epochs=keep_epochs, + ) if asr_search or phoneme_pred: search_config = get_search_config( @@ -95,16 +108,13 @@ def run_exp( if encoder_phoneme_pred: encoder_phoneme_pred_config = get_search_config(**args, search_args=search_args, target="encoder_phoneme") - if given_train_job_for_forward is None: - train_job = training( - config=training_config, - returnn_exe=RETURNN_PYTORCH_EXE, - returnn_root=MINI_RETURNN_ROOT, - prefix=prefix + name, - num_epochs=num_epochs, - ) - else: - train_job = given_train_job_for_forward + 
train_job = training( + config=training_config, + returnn_exe=RETURNN_PYTORCH_EXE, + returnn_root=MINI_RETURNN_ROOT, + prefix=prefix + name, + num_epochs=num_epochs, + ) exp["train_job"] = train_job if tts_forward: @@ -133,19 +143,6 @@ def run_exp( ) exp["forward_job_gl"] = forward_job_gl - if extract_x_vector: - forward_x_vector_config = get_forward_config( - forward_dataset=dataset, **args, forward_args=forward_args, target="xvector", train_data=True - ) - forward_xvector_job = forward( - checkpoint=train_job.out_checkpoints[num_epochs], - config=forward_x_vector_config, - returnn_exe=RETURNN_PYTORCH_EXE, - returnn_root=MINI_RETURNN_ROOT, - prefix=prefix + name, - target="xvector", - ) - exp["forward_xvector_job"] = forward_xvector_job if asr_search: search( prefix + name + "/search", @@ -175,19 +172,10 @@ def run_exp( ) return exp - # def get_lr_scale(dim_model, step_num, warmup_steps): - # return np.power(dim_model, -0.5) * np.min( - # [np.power(step_num + 1, -0.5), step_num + 1 * np.power(warmup_steps, -1.5)] - # ) - train_settings = TrainingDatasetSettings( custom_processing_function=None, partition_epoch=3, epoch_wise_filters=[], seq_ordering="laplace:.1000" ) - # training_datasets = build_training_dataset( - # settings=train_settings, librispeech_key="train-clean-100", silence_preprocessing=False - # ) - glowTTS_durations_job = tts_exps["glowTTS/enc192/200ep/long_cooldown/not_silence_preprocessed"]["forward_job_joint_durations"] training_datasets_tts_segments = build_training_dataset( settings=train_settings, @@ -196,9 +184,6 @@ def run_exp( use_tts_train_segments=True, durations_file=glowTTS_durations_job.out_hdf_files["output.hdf"] ) - # training_datasets_silence_preprocessed = build_training_dataset( - # settings=train_settings, librispeech_key="train-clean-100", silence_preprocessing=True - # ) train_settings_pe1 = TrainingDatasetSettings( custom_processing_function=None, partition_epoch=1, epoch_wise_filters=[], seq_ordering="laplace:.1000" ) @@ 
-306,24 +291,6 @@ def run_exp( max_dim_feat=8, num_repeat_feat=5, ) - frontend_config = VGG4LayerActFrontendV1Config_mod( - in_features=80, - conv1_channels=16, - conv2_channels=16, - conv3_channels=16, - conv4_channels=16, - conv_kernel_size=(3, 3), - conv_padding=None, - pool1_kernel_size=(2, 1), - pool1_stride=(2, 1), - pool1_padding=None, - pool2_kernel_size=(2, 1), - pool2_stride=(2, 1), - pool2_padding=None, - activation_str="ReLU", - out_features=96, - activation=None, - ) text_encoder_config = TextEncoderConfig( n_vocab=label_datastream_tts.vocab_size, hidden_channels=192, diff --git a/users/rilling/experiments/librispeech/librispeech_joint_training_given_alignments/exp_joint_2step/experiments.py b/users/rilling/experiments/librispeech/librispeech_joint_training_given_alignments/exp_joint_2step/experiments.py index 167f98973..119e2e8c7 100644 --- a/users/rilling/experiments/librispeech/librispeech_joint_training_given_alignments/exp_joint_2step/experiments.py +++ b/users/rilling/experiments/librispeech/librispeech_joint_training_given_alignments/exp_joint_2step/experiments.py @@ -42,12 +42,10 @@ def get_glow_joint_2step(x_vector_exp, joint_exps, tts_exps, gl_checkpoint): """ - Baseline for the glow TTS in returnn_common with serialization - - Uses updated RETURNN_COMMON - - :return: durations_hdf - """ + Experiments performing two step training (first block) or further training on pre-trained models from ../exp_joint and ../exp_joint_ga (second and third block) + The experiments contain further TTS training after a first train step with very strong auxiliary loss (first block) + but also ASR and phoneme prediction trainings after a pre-training (last blocks) + """ prefix = "experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/" experiments = {} @@ -63,28 +61,43 @@ def run_exp( forward_args={}, search_args={}, keep_epochs=None, - extract_x_vector=False, tts_forward=True, asr_search=True, phoneme_pred=True, 
asr_cv_set=False, - given_train_job_for_forward=None, ): + """Creates the jobs to run training, forwarding/TTS evaluation and phoneme prediction evaluation for the given experiment + + :param str name: Name of the experiment used for aliases + :param dict args: General arguments used for the Returnn Configs + :param TrainingDataset dataset: Dataset for training + :param dict test_dataset: Dictionary of datasets used for evaluation + :param int num_epochs: Number of epochs for training, defaults to 100 + :param bool use_custom_engine: whether a custom engine is used in Returnn, defaults to False + :param dict training_args: Additional arguments for training, defaults to {} + :param dict forward_args: Additional arguments for TTS forward, defaults to {} + :param dict search_args: Additional arguments for phoneme prediction, defaults to {} + :param list[int] keep_epochs: List of numbers of checkpoints that are supposed to be kept during training, defaults to None + :param bool tts_forward: whether TTS evaluation should be run (autoMOS, NISQA), defaults to True + :param bool asr_search: whether ASR search should be run, defaults to True + :param bool phoneme_pred: whether phoneme prediction evaluation should be run, defaults to True + :param bool asr_cv_set: whether the training should use the ASR validation set for CV during training (necessary for ASR trainings), defaults to False + :return dict: Dictionary containing all the jobs of the experiment + """ exp = {} assert num_epochs == len(args["config"]["learning_rates"]), "Number of Epochs and Number of LR steps differs!" 
with_prior = "prior_scale" in search_args - if given_train_job_for_forward is None: - training_config = get_training_config( - training_datasets=dataset, - **args, - training_args=training_args, - use_custom_engine=use_custom_engine, - keep_epochs=keep_epochs, - asr_cv_set=asr_cv_set, - ) # implicit reconstruction loss + training_config = get_training_config( + training_datasets=dataset, + **args, + training_args=training_args, + use_custom_engine=use_custom_engine, + keep_epochs=keep_epochs, + asr_cv_set=asr_cv_set, + ) if tts_forward: forward_config_gl = get_forward_config( @@ -111,16 +124,13 @@ def run_exp( target="phoneme" ) - if given_train_job_for_forward is None: - train_job = training( - config=training_config, - returnn_exe=RETURNN_PYTORCH_EXE, - returnn_root=MINI_RETURNN_ROOT, - prefix=prefix + name, - num_epochs=num_epochs, - ) - else: - train_job = given_train_job_for_forward + train_job = training( + config=training_config, + returnn_exe=RETURNN_PYTORCH_EXE, + returnn_root=MINI_RETURNN_ROOT, + prefix=prefix + name, + num_epochs=num_epochs, + ) exp["train_job"] = train_job if with_prior: @@ -147,19 +157,6 @@ def run_exp( ) exp["forward_job_gl"] = forward_job_gl - if extract_x_vector: - forward_x_vector_config = get_forward_config( - forward_dataset=dataset, **args, forward_args=forward_args, target="xvector", train_data=True - ) - forward_xvector_job = forward( - checkpoint=train_job.out_checkpoints[num_epochs], - config=forward_x_vector_config, - returnn_exe=RETURNN_PYTORCH_EXE, - returnn_root=MINI_RETURNN_ROOT, - prefix=prefix + name, - target="xvector", - ) - exp["forward_xvector_job"] = forward_xvector_job if asr_search: search( prefix + name + "/search", diff --git a/users/rilling/experiments/librispeech/librispeech_joint_training_given_alignments/exp_joint_flow_ga/experiments.py b/users/rilling/experiments/librispeech/librispeech_joint_training_given_alignments/exp_joint_flow_ga/experiments.py index 08197c92d..1ca31d2b2 100644 --- 
a/users/rilling/experiments/librispeech/librispeech_joint_training_given_alignments/exp_joint_flow_ga/experiments.py +++ b/users/rilling/experiments/librispeech/librispeech_joint_training_given_alignments/exp_joint_flow_ga/experiments.py @@ -38,11 +38,8 @@ def get_glow_joint_flow_ga(x_vector_exp, joint_exps, tts_exps, gl_checkpoint): """ - Baseline for the glow TTS in returnn_common with serialization - - Uses updated RETURNN_COMMON - - :return: durations_hdf + Experiments training Glow-TTS with an additional auxiliary loss using a simple model for phoneme reconstruction (FFN or CNN). + Other than the models in ../exp_joint these models also use the given external alignment for the Glow-TTS training so MAS/Viterbi is completely omitted """ prefix = "experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/" @@ -59,33 +56,38 @@ def run_exp( forward_args={}, search_args={}, keep_epochs=None, - extract_x_vector=False, - tts_forward=True, - asr_search=True, phoneme_pred=True, encoder_phoneme_pred=False, - asr_cv_set=False, - given_train_job_for_forward=None, eval_invertibility=False, tts_eval_datasets=None, ): + """Creates the jobs for training, TTS generation/forwarding and phoneme prediction evaluation + + :param str name: Name of the experiment used for alias + :param dict args: General arguments used for Returnn config generation + :param TrainingDataset dataset: Dataset used for training + :param dict test_dataset: Dictionary containing datasets used for phoneme prediction evaluation + :param int num_epochs: Number of epochs for training, defaults to 100 + :param bool use_custom_engine: whether a custom engine should be used in Returnn, defaults to False + :param dict training_args: Additional arguments passed to the training config and the train steps, defaults to {} + :param dict forward_args: Additional arguments passed to the TTS forward config and forward steps, defaults to {} + :param dict search_args: Additional 
arguments passed to the phoneme prediction and respective steps, defaults to {} + :param list[int] keep_epochs: List containing the number of checkpoints that are not supposed to be deleted during training, defaults to None + :param bool phoneme_pred: whether phoneme prediction using the inverse decoder output should be run and evaluated, uses test_dataset for evaluation, defaults to True + :param bool encoder_phoneme_pred: whether phoneme prediction using the encoder output should be run, uses test_dataset for evaluation, defaults to False + :param bool eval_invertibility: whether the invertibility of the model's coupling blocks should be evaluated, defaults to False + :param dict tts_eval_datasets: Dictionary of datasets to be used for TTS evaluation (autoMOS, NISQA), defaults to None + :return dict: Dictionary containing all the jobs created for this experiment + """ exp = {} - if given_train_job_for_forward is None: - training_config = get_training_config( - training_datasets=dataset, - **args, - training_args=training_args, - use_custom_engine=use_custom_engine, - keep_epochs=keep_epochs, - asr_cv_set=asr_cv_set, - ) # implicit reconstruction loss - - if asr_search or phoneme_pred: - search_config = get_search_config( - **args, - search_args=search_args, - ) + training_config = get_training_config( + training_datasets=dataset, + **args, + training_args=training_args, + use_custom_engine=use_custom_engine, + keep_epochs=keep_epochs, + ) if phoneme_pred: phoneme_pred_config = get_search_config( @@ -102,65 +104,40 @@ def run_exp( if encoder_phoneme_pred: encoder_phoneme_pred_config = get_search_config(**args, search_args=search_args, target="encoder_phoneme") - if given_train_job_for_forward is None: - train_job = training( - config=training_config, - returnn_exe=RETURNN_PYTORCH_EXE, - returnn_root=MINI_RETURNN_ROOT, - prefix=prefix + name, - num_epochs=num_epochs, - ) - else: - train_job = given_train_job_for_forward + train_job = training( + config=training_config, + 
returnn_exe=RETURNN_PYTORCH_EXE, + returnn_root=MINI_RETURNN_ROOT, + prefix=prefix + name, + num_epochs=num_epochs, + ) + exp["train_job"] = train_job - if tts_forward: - for ds_k, ds in tts_eval_datasets.items(): - forward_config_gl = get_forward_config( - forward_dataset=ds, - **{**args, **{"config": {"batch_size": 50 * 16000}}}, - forward_args={ - **forward_args, - "gl_net_checkpoint": gl_checkpoint["checkpoint"], - "gl_net_config": gl_checkpoint["config"], - }, - target="corpus_gl", - ) - forward_job_gl = tts_eval( - checkpoint=train_job.out_checkpoints[num_epochs], - prefix_name=prefix + name, - returnn_config=forward_config_gl, - returnn_exe=RETURNN_PYTORCH_EXE, - returnn_exe_asr=RETURNN_PYTORCH_ASR_SEARCH_EXE, - returnn_root=MINI_RETURNN_ROOT, - vocoder="gl", - nisqa_eval=True, - swer_eval=True, - swer_eval_corpus_key=ds_k - ) - - if extract_x_vector: - forward_x_vector_config = get_forward_config( - forward_dataset=dataset, **args, forward_args=forward_args, target="xvector", train_data=True + for ds_k, ds in tts_eval_datasets.items(): + forward_config_gl = get_forward_config( + forward_dataset=ds, + **{**args, **{"config": {"batch_size": 50 * 16000}}}, + forward_args={ + **forward_args, + "gl_net_checkpoint": gl_checkpoint["checkpoint"], + "gl_net_config": gl_checkpoint["config"], + }, + target="corpus_gl", ) - forward_xvector_job = forward( + forward_job_gl = tts_eval( checkpoint=train_job.out_checkpoints[num_epochs], - config=forward_x_vector_config, + prefix_name=prefix + name, + returnn_config=forward_config_gl, returnn_exe=RETURNN_PYTORCH_EXE, + returnn_exe_asr=RETURNN_PYTORCH_ASR_SEARCH_EXE, returnn_root=MINI_RETURNN_ROOT, - prefix=prefix + name, - target="xvector", - ) - exp["forward_xvector_job"] = forward_xvector_job - if asr_search: - search( - prefix + name + "/search", - search_config, - train_job.out_checkpoints[num_epochs], - test_dataset, - RETURNN_PYTORCH_EXE, - MINI_RETURNN_ROOT, + vocoder="gl", + nisqa_eval=True, + swer_eval=True, + 
swer_eval_corpus_key=ds_k ) + if phoneme_pred: compute_phoneme_pred_accuracy( prefix + name, @@ -192,8 +169,6 @@ def run_exp( target="invertibility", ) exp["invertibility_job"] = forward_job - # if "ce_loss_scale" in training_args and training_args["ce_loss_scale"] == 0.01: - # breakpoint() return exp glowTTS_durations_job = tts_exps["glowTTS/enc192/200ep/long_cooldown/not_silence_preprocessed"]["forward_job_joint_durations"] @@ -296,30 +271,6 @@ def run_exp( get_bliss_corpus_dict()["train-clean-100"], ) - specaug_config = SpecaugConfig( - repeat_per_n_frames=100, - max_dim_time=20, - max_dim_feat=8, - num_repeat_feat=5, - ) - frontend_config = VGG4LayerActFrontendV1Config_mod( - in_features=80, - conv1_channels=16, - conv2_channels=16, - conv3_channels=16, - conv4_channels=16, - conv_kernel_size=(3, 3), - conv_padding=None, - pool1_kernel_size=(2, 1), - pool1_stride=(2, 1), - pool1_padding=None, - pool2_kernel_size=(2, 1), - pool2_stride=(2, 1), - pool2_padding=None, - activation_str="ReLU", - out_features=96, - activation=None, - ) text_encoder_config = TextEncoderConfig( n_vocab=label_datastream_tts.vocab_size, hidden_channels=192, @@ -428,7 +379,6 @@ def run_exp( training_args={"ce_loss_scale": 0.1}, search_args=default_search_args, tts_eval_datasets=tts_forward_datasets_xvectors, - asr_search=False, encoder_phoneme_pred=True, ) @@ -442,7 +392,6 @@ def run_exp( training_args={"ce_loss_scale": 0.1}, search_args=default_search_args, tts_eval_datasets=tts_forward_datasets_xvectors, - asr_search=False, encoder_phoneme_pred=True, ) @@ -460,7 +409,6 @@ def run_exp( training_args={"ce_loss_scale": 0.1}, search_args=default_search_args, tts_eval_datasets=tts_forward_datasets_xvectors, - asr_search=False, encoder_phoneme_pred=True, ) @@ -474,7 +422,6 @@ def run_exp( training_args={"ce_loss_scale": 0.1}, search_args=default_search_args, tts_eval_datasets=tts_forward_datasets_xvectors, - asr_search=False, encoder_phoneme_pred=True, ) @@ -488,7 +435,6 @@ def run_exp( 
training_args={"ce_loss_scale": 1.0}, search_args=default_search_args, tts_eval_datasets=tts_forward_datasets_xvectors, - asr_search=False, encoder_phoneme_pred=True, ) @@ -502,7 +448,6 @@ def run_exp( training_args={"ce_loss_scale": 1.0}, search_args=default_search_args, tts_eval_datasets=tts_forward_datasets_xvectors, - asr_search=False, encoder_phoneme_pred=True, ) @@ -537,7 +482,6 @@ def run_exp( training_args={"ce_loss_scale": 0.1}, search_args=default_search_args, tts_eval_datasets=tts_forward_datasets_xvectors, - asr_search=False, ) exp_dict = run_exp( @@ -550,7 +494,6 @@ def run_exp( training_args={"ce_loss_scale": 0.1}, search_args=default_search_args, tts_eval_datasets=tts_forward_datasets_xvectors, - asr_search=False, ) net_module = "ga_glowTTS_ASR_cnn_x_vector_v2" @@ -566,7 +509,6 @@ def run_exp( training_args={"ce_loss_scale": 0.1}, search_args=default_search_args, tts_eval_datasets=tts_forward_datasets_xvectors, - asr_search=False, encoder_phoneme_pred=True, ) @@ -580,7 +522,6 @@ def run_exp( training_args={"ce_loss_scale": 0.1}, search_args=default_search_args, tts_eval_datasets=tts_forward_datasets_xvectors, - asr_search=False, encoder_phoneme_pred=True, ) @@ -594,7 +535,6 @@ def run_exp( training_args={"ce_loss_scale": 0.01}, search_args=default_search_args, tts_eval_datasets=tts_forward_datasets_xvectors, - asr_search=False, encoder_phoneme_pred=True, ) @@ -608,6 +548,5 @@ def run_exp( training_args={"ce_loss_scale": 0.01}, search_args=default_search_args, tts_eval_datasets=tts_forward_datasets_xvectors, - asr_search=False, encoder_phoneme_pred=True, ) diff --git a/users/rilling/experiments/librispeech/librispeech_joint_training_given_alignments/exp_joint_flow_ga_frozen_glowtts/experiments.py b/users/rilling/experiments/librispeech/librispeech_joint_training_given_alignments/exp_joint_flow_ga_frozen_glowtts/experiments.py index a460aa1ed..5274592d7 100644 --- 
a/users/rilling/experiments/librispeech/librispeech_joint_training_given_alignments/exp_joint_flow_ga_frozen_glowtts/experiments.py +++ b/users/rilling/experiments/librispeech/librispeech_joint_training_given_alignments/exp_joint_flow_ga_frozen_glowtts/experiments.py @@ -37,11 +37,10 @@ def get_glow_joint_flow_ga_frozen_glowtts(x_vector_exp, joint_exps, tts_exps, gl_checkpoint): """ - Baseline for the glow TTS in returnn_common with serialization - - Uses updated RETURNN_COMMON - - :return: durations_hdf + Experiments using frozen coupling blocks pre-trained on TTS in combination with a trainable simple phoneme reconstruction network. + The experiments are therefore similar to the "decoder_test"/"encoder_test"/"encoder_sample" experiments in ../../librispeech_glowtts and ../../librispeech_glow_asr + but instead of the MAS/Viterbi alignment, the experiments here make use of the external given alignment. The only exceptions for this are marked with "_mas" in the + module filename. """ prefix = "experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/" @@ -55,48 +54,37 @@ def run_exp( num_epochs=100, use_custom_engine=False, training_args={}, - forward_args={}, search_args={}, keep_epochs=None, - extract_x_vector=False, - tts_forward=False, - asr_search=False, phoneme_pred=True, - asr_cv_set=False, - given_train_job_for_forward=None, eval_invertibility=False, ): + """Creates the jobs for training, generation/search and evaluation + + :param str name: Name of the experiment group to be used for aliases + :param dict args: Arguments passed to the training and forward configs + :param TrainingDataset dataset: Dataset to be used for training and the tts forwarding not used for automatic evaluation + :param dict test_dataset: Dictionary of dataset to test phoneme prediction accuracy, keys should be corpus names/corpus keys and are used for aliases + :param int num_epochs: Number of Epochs for training, defaults to 100 + :param 
bool use_custom_engine: whether a custom engine is to be used in Returnn, defaults to False + :param dict training_args: Additional training only arguments for Returnn config and train steps, defaults to {} + :param dict search_args: Additional search/phoneme prediction only arguments for Returnn config and search* steps, defaults to {} + :param list keep_epochs: List of numbers marking the model checkpoints that are not supposed to be cleaned, defaults to None + :param bool phoneme_pred: whether a phoneme prediction and evaluation should be run after training, defaults to True + :param bool eval_invertibility: whether the invertibility of the coupling blocks should be evaluated, defaults to False + :return dict: Dictionary containing all the jobs for this experiment + """ exp = {} assert len(args["config"]["learning_rates"]) == num_epochs, "Number of epochs and number of learning rates differ!" - if given_train_job_for_forward is None: - training_config = get_training_config( - training_datasets=dataset, - **args, - training_args=training_args, - use_custom_engine=use_custom_engine, - keep_epochs=keep_epochs, - asr_cv_set=asr_cv_set, - ) # implicit reconstruction loss - - if tts_forward: - forward_config_gl = get_forward_config( - forward_dataset=dataset, - **{**args, **{"config": {"batch_size": 50 * 16000}}}, - forward_args={ - **forward_args, - "gl_net_checkpoint": gl_checkpoint["checkpoint"], - "gl_net_config": gl_checkpoint["config"], - }, - target="corpus_gl", - ) - - if asr_search or phoneme_pred: - search_config = get_search_config( - **args, - search_args=search_args, - ) + training_config = get_training_config( + training_datasets=dataset, + **args, + training_args=training_args, + use_custom_engine=use_custom_engine, + keep_epochs=keep_epochs, + ) if phoneme_pred: phoneme_pred_config_encoder = get_search_config( @@ -115,54 +103,15 @@ def run_exp( forward_dataset=dataset, **{**args, **{"config": {"batch_size": 50 * 16000}}}, target="invertibility" ) - if 
given_train_job_for_forward is None: - train_job = training( - config=training_config, - returnn_exe=RETURNN_PYTORCH_EXE, - returnn_root=MINI_RETURNN_ROOT, - prefix=prefix + name, - num_epochs=num_epochs, - ) - else: - train_job = given_train_job_for_forward + train_job = training( + config=training_config, + returnn_exe=RETURNN_PYTORCH_EXE, + returnn_root=MINI_RETURNN_ROOT, + prefix=prefix + name, + num_epochs=num_epochs, + ) exp["train_job"] = train_job - if tts_forward: - forward_job_gl = tts_eval( - checkpoint=train_job.out_checkpoints[num_epochs], - prefix_name=prefix + name, - returnn_config=forward_config_gl, - returnn_exe=RETURNN_PYTORCH_EXE, - returnn_exe_asr=RETURNN_PYTORCH_ASR_SEARCH_EXE, - returnn_root=MINI_RETURNN_ROOT, - vocoder="gl", - nisqa_eval=True, - swer_eval=True - ) - exp["forward_job_gl"] = forward_job_gl - - if extract_x_vector: - forward_x_vector_config = get_forward_config( - forward_dataset=dataset, **args, forward_args=forward_args, target="xvector", train_data=True - ) - forward_xvector_job = forward( - checkpoint=train_job.out_checkpoints[num_epochs], - config=forward_x_vector_config, - returnn_exe=RETURNN_PYTORCH_EXE, - returnn_root=MINI_RETURNN_ROOT, - prefix=prefix + name, - target="xvector", - ) - exp["forward_xvector_job"] = forward_xvector_job - if asr_search: - search( - prefix + name + "/search", - search_config, - train_job.out_checkpoints[num_epochs], - test_dataset, - RETURNN_PYTORCH_EXE, - MINI_RETURNN_ROOT, - ) if phoneme_pred: compute_phoneme_pred_accuracy( prefix + name + "/encoder_eval/", @@ -192,8 +141,6 @@ def run_exp( target="invertibility", ) exp["invertibility_job"] = forward_job - # if "ce_loss_scale" in training_args and training_args["ce_loss_scale"] == 0.01: - # breakpoint() return exp glowTTS_durations_job = tts_exps["glowTTS/enc192/200ep/long_cooldown/not_silence_preprocessed"]["forward_job_joint_durations"] @@ -406,7 +353,6 @@ def run_exp( training_datasets_pe1_tts_segments, dev_dataset_tuples_with_phon, 
100, - forward_args=forward_args, training_args={"recognition_input": "encoder"}, search_args=default_search_args, phoneme_pred=True, @@ -418,7 +364,6 @@ def run_exp( training_datasets_pe1_tts_segments, dev_dataset_tuples_with_phon, 100, - forward_args=forward_args, training_args={"recognition_input": "decoder"}, search_args=default_search_args, phoneme_pred=True, @@ -434,7 +379,6 @@ def run_exp( training_datasets_pe1_tts_segments, dev_dataset_tuples_with_phon, 100, - forward_args=forward_args, training_args={"recognition_input": "encoder"}, search_args=default_search_args, phoneme_pred=True, @@ -446,7 +390,6 @@ def run_exp( training_datasets_pe1_tts_segments, dev_dataset_tuples_with_phon, 100, - forward_args=forward_args, training_args={"recognition_input": "decoder"}, search_args=default_search_args, phoneme_pred=True, @@ -462,7 +405,6 @@ def run_exp( training_datasets_pe1_tts_segments, dev_dataset_tuples_with_phon, 100, - forward_args=forward_args, training_args={"recognition_input": "encoder"}, search_args=default_search_args, phoneme_pred=True, @@ -474,7 +416,6 @@ def run_exp( training_datasets_pe1_tts_segments, dev_dataset_tuples_with_phon, 100, - forward_args=forward_args, training_args={"recognition_input": "decoder"}, search_args=default_search_args, phoneme_pred=True, @@ -498,7 +439,6 @@ def run_exp( training_datasets_pe1_tts_segments, dev_dataset_tuples_with_phon, 100, - forward_args=forward_args, training_args={"recognition_input": "encoder"}, search_args=default_search_args, phoneme_pred=True, @@ -510,7 +450,6 @@ def run_exp( training_datasets_pe1_tts_segments, dev_dataset_tuples_with_phon, 100, - forward_args=forward_args, training_args={"recognition_input": "decoder"}, search_args=default_search_args, phoneme_pred=True, @@ -526,7 +465,6 @@ def run_exp( training_datasets_pe1_tts_segments, dev_dataset_tuples_with_phon, 100, - forward_args=forward_args, training_args={"recognition_input": "encoder"}, search_args=default_search_args, phoneme_pred=True, 
@@ -538,7 +476,6 @@ def run_exp( training_datasets_pe1_tts_segments, dev_dataset_tuples_with_phon, 100, - forward_args=forward_args, training_args={"recognition_input": "decoder"}, search_args=default_search_args, phoneme_pred=True, @@ -554,7 +491,6 @@ def run_exp( training_datasets_pe1_tts_segments, dev_dataset_tuples_with_phon, 100, - forward_args=forward_args, training_args={"recognition_input": "encoder"}, search_args=default_search_args, phoneme_pred=True, @@ -566,7 +502,6 @@ def run_exp( training_datasets_pe1_tts_segments, dev_dataset_tuples_with_phon, 100, - forward_args=forward_args, training_args={"recognition_input": "decoder"}, search_args=default_search_args, phoneme_pred=True, @@ -582,7 +517,6 @@ def run_exp( training_datasets_pe1_tts_segments, dev_dataset_tuples_with_phon, 100, - forward_args=forward_args, training_args={"recognition_input": "encoder"}, search_args=default_search_args, phoneme_pred=True, @@ -594,7 +528,6 @@ def run_exp( training_datasets_pe1_tts_segments, dev_dataset_tuples_with_phon, 100, - forward_args=forward_args, training_args={"recognition_input": "decoder"}, search_args=default_search_args, phoneme_pred=True, diff --git a/users/rilling/experiments/librispeech/librispeech_x_vectors/x_vectors/experiments.py b/users/rilling/experiments/librispeech/librispeech_x_vectors/x_vectors/experiments.py index 39e1a8d84..677ea57ca 100644 --- a/users/rilling/experiments/librispeech/librispeech_x_vectors/x_vectors/experiments.py +++ b/users/rilling/experiments/librispeech/librispeech_x_vectors/x_vectors/experiments.py @@ -14,13 +14,10 @@ def get_pytorch_xvector(): - """ - Baseline for the glow TTS in returnn_common with serialization + """Contains experiments training x-vector TDNN models, which can for example be used for Glow-TTS with x-vector embeddings - Uses updated RETURNN_COMMON - - :return: durations_hdf - """ + :return dict: Dictionary containing the experiments used by some Glow-TTS experiments to load the x-vector model parameters 
from the training jobs checkpoint (should be done using storage instead) + """ prefix = "experiments/librispeech/x_vector/" diff --git a/users/rilling/experiments/librispeech/tts_architecture_improvement_23/__init__.py b/users/rilling/experiments/librispeech/tts_architecture_improvement_23/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/users/rilling/experiments/librispeech/tts_architecture_improvement_23/ctc_aligner/__init__.py b/users/rilling/experiments/librispeech/tts_architecture_improvement_23/ctc_aligner/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/users/rilling/experiments/librispeech/tts_architecture_improvement_23/ctc_aligner/data.py b/users/rilling/experiments/librispeech/tts_architecture_improvement_23/ctc_aligner/data.py deleted file mode 100644 index 7344b4e5a..000000000 --- a/users/rilling/experiments/librispeech/tts_architecture_improvement_23/ctc_aligner/data.py +++ /dev/null @@ -1,115 +0,0 @@ -from dataclasses import dataclass -import os -from sisyphus import tk -from typing import Dict - -from i6_core.returnn.dataset import SpeakerLabelHDFFromBlissJob -from i6_core.returnn.oggzip import BlissToOggZipJob - -from i6_experiments.users.rossenbach.common_setups.returnn.datasets import GenericDataset, OggZipDataset, HDFDataset -from i6_experiments.users.rossenbach.common_setups.returnn.datastreams.base import Datastream -from i6_experiments.users.rossenbach.common_setups.returnn.datastreams.vocabulary import LabelDatastream - -from i6_experiments.users.rossenbach.datasets.librispeech import get_librispeech_tts_segments - -from ..data import ( - get_tts_log_mel_datastream, - get_bliss_and_zip, - get_vocab_datastream, - make_meta_dataset -) -from ..default_tools import RETURNN_ROOT - -@dataclass(frozen=True) -class AlignmentTrainingDatasets: - """ - Dataclass for Alignment Datasets - """ - - train: GenericDataset - cv: GenericDataset - joint: GenericDataset - datastreams: Dict[str, Datastream] 
- - -def build_training_dataset( - ls_corpus_key="train-clean-100", - silence_preprocessed=True, - partition_epoch=1, - center : bool = False) -> AlignmentTrainingDatasets: - """ - - :param center: do feature centering - """ - bliss_dataset, zip_dataset = get_bliss_and_zip(ls_corpus_key=ls_corpus_key, silence_preprocessed=silence_preprocessed) - - # segments for train-clean-100-tts-train and train-clean-100-tts-dev - # (1004 segments for dev, 4 segments for each of the 251 speakers) - train_segments, cv_segments = get_librispeech_tts_segments(ls_corpus_key=ls_corpus_key) - - vocab_datastream = get_vocab_datastream(with_blank=True, corpus_key=ls_corpus_key) - log_mel_datastream = get_tts_log_mel_datastream() - - # we currently assume that train and cv share the same corpus file - speaker_label_job = SpeakerLabelHDFFromBlissJob( - bliss_corpus=bliss_dataset, - returnn_root=RETURNN_ROOT, - ) - joint_speaker_hdf = speaker_label_job.out_speaker_hdf - - joint_speaker_dataset = HDFDataset( - files=[joint_speaker_hdf] - ) - speaker_datastream = LabelDatastream( - available_for_inference=True, - vocab_size=speaker_label_job.out_num_speakers, - vocab=speaker_label_job.out_speaker_dict, - ) - - # ----- Ogg and Meta datasets - - train_ogg_dataset = OggZipDataset( - path=zip_dataset, - audio_options=log_mel_datastream.as_returnn_audio_opts(), - target_options=vocab_datastream.as_returnn_targets_opts(), - segment_file=train_segments, - partition_epoch=partition_epoch, - seq_ordering="laplace:.1000" - ) - train_dataset = make_meta_dataset(train_ogg_dataset, joint_speaker_dataset) - - cv_ogg_dataset = OggZipDataset( - path=zip_dataset, - audio_options=log_mel_datastream.as_returnn_audio_opts(), - target_options=vocab_datastream.as_returnn_targets_opts(), - segment_file=cv_segments, - partition_epoch=1, - seq_ordering="sorted", - ) - cv_dataset = make_meta_dataset(cv_ogg_dataset, joint_speaker_dataset) - - joint_ogg_zip = OggZipDataset( - path=zip_dataset, - 
audio_options=log_mel_datastream.as_returnn_audio_opts(), - target_options=vocab_datastream.as_returnn_targets_opts(), - partition_epoch=1, - seq_ordering="sorted", - ) - joint_metadataset = make_meta_dataset(joint_ogg_zip, joint_speaker_dataset) - - # ----- final outputs - - datastreams = { - "audio_features": log_mel_datastream, - "phonemes": vocab_datastream, - "speaker_labels": speaker_datastream, - } - - align_datasets = AlignmentTrainingDatasets( - train=train_dataset, - cv=cv_dataset, - joint=joint_metadataset, - datastreams=datastreams, - ) - - return align_datasets \ No newline at end of file diff --git a/users/rilling/experiments/librispeech/tts_architecture_improvement_23/ctc_aligner/experiments.py b/users/rilling/experiments/librispeech/tts_architecture_improvement_23/ctc_aligner/experiments.py deleted file mode 100644 index 7b7c32829..000000000 --- a/users/rilling/experiments/librispeech/tts_architecture_improvement_23/ctc_aligner/experiments.py +++ /dev/null @@ -1,309 +0,0 @@ -import copy -import os -import numpy as np -from sisyphus import tk -from dataclasses import asdict - -from .data import build_training_dataset -from .config import get_training_config, get_forward_config, get_pt_forward_config -from .pipeline import ctc_training, ctc_forward -from ..data import get_tts_log_mel_datastream - -from i6_experiments.users.rossenbach.common_setups.returnn.datastreams.audio import DBMelFilterbankOptions - -from ..default_tools import RETURNN_EXE, RETURNN_ROOT, RETURNN_COMMON, RETURNN_PYTORCH_EXE, MINI_RETURNN_ROOT -from ..storage import add_duration - - -from ..rc_networks.ctc_aligner.parameters import ConvBlstmRecParams - -def get_baseline_ctc_alignment(): - """ - Baseline for the ctc aligner in returnn_common with serialization - - Uses updated RETURNN_COMMON - - :return: durations_hdf - """ - - name = "experiments/librispeech/tts_architecture/ctc_aligner/baseline" - training_datasets = build_training_dataset(silence_preprocessed=True) - - - config = 
{ - "behavior_version": 16, - ############ - "optimizer": {"class": "adam", "epsilon": 1e-8}, - "learning_rate_control": "newbob_multi_epoch", - "learning_rate_control_min_num_epochs_per_new_lr": 5, - "learning_rate_control_relative_error_relative_lr": True, - "learning_rates": [0.001], - "use_learning_rate_control_always": True, - ############ - "accum_grad_multiple_step": 2, - "gradient_clip": 1, - "gradient_noise": 0, - "learning_rate_control_error_measure": "dev_score_reconstruction_output", - ############ - "newbob_learning_rate_decay": 0.9, - "newbob_multi_num_epochs": 5, - "newbob_multi_update_interval": 1, - "newbob_relative_error_threshold": 0, - ############# - "batch_size": 28000, - "max_seq_length": {"audio_features": 1600}, - "max_seqs": 200, - } - - net_module = "ctc_aligner.conv_blstm_rec" - params = ConvBlstmRecParams( - audio_emb_size=256, - speaker_emb_size=256, - conv_hidden_size=256, - enc_lstm_size=256, - rec_lstm_size=512, - dropout=0.5, - reconstruction_scale=0.5, - training=True - ) - - aligner_config = get_training_config( - returnn_common_root=RETURNN_COMMON, - training_datasets=training_datasets, - network_module=net_module, - net_args=asdict(params), - config=config, - ) # implicit reconstruction loss - params.training = False - forward_config = get_forward_config( - returnn_common_root=RETURNN_COMMON, - forward_dataset=training_datasets.joint, - datastreams=training_datasets.datastreams, - network_module=net_module, - net_args=asdict(params) - ) - train_job = ctc_training( - config=aligner_config, - returnn_exe=RETURNN_EXE, - returnn_root=RETURNN_ROOT, - prefix=name, - ) - duration_hdf = ctc_forward( - checkpoint=train_job.out_checkpoints[100], - config=forward_config, - returnn_exe=RETURNN_EXE, - returnn_root=RETURNN_ROOT, - prefix=name - ) - return duration_hdf - -def get_pytorch_ctc_alignment(): - """ - Baseline for the ctc aligner in returnn_common with serialization - - Uses updated RETURNN_COMMON - - :return: durations_hdf - """ - 
- config = { - "optimizer": {"class": "adam", "epsilon": 1e-8}, - "learning_rate_control": "newbob_multi_epoch", - "learning_rate_control_min_num_epochs_per_new_lr": 5, - "learning_rate_control_relative_error_relative_lr": True, - "learning_rates": [0.001], - "gradient_clip": 1.0, - "use_learning_rate_control_always": True, - "learning_rate_control_error_measure": "dev_ctc", - ############ - "newbob_learning_rate_decay": 0.9, - "newbob_multi_num_epochs": 5, - "newbob_multi_update_interval": 1, - "newbob_relative_error_threshold": 0, - ############# - "batch_size": 56000, - "max_seq_length": {"audio_features": 1600}, - "max_seqs": 200, - } - - prefix = "experiments/librispeech/tts_architecture/ctc_aligner/pytorch/" - training_datasets = build_training_dataset(silence_preprocessed=True) - - def run_exp(name, params, net_module, config, use_custom_engine=False, debug=False): - aligner_config = get_training_config( - returnn_common_root=RETURNN_COMMON, - training_datasets=training_datasets, - network_module=net_module, - net_args=params, - config=config, - debug=debug, - use_custom_engine=use_custom_engine, - pytorch_mode=True - ) # implicit reconstruction loss - forward_config = get_pt_forward_config( - returnn_common_root=RETURNN_COMMON, - forward_dataset=training_datasets.joint, - datastreams=training_datasets.datastreams, - network_module=net_module, - net_args=params, - pytorch_mode=True - ) - train_job = ctc_training( - config=aligner_config, - returnn_exe=RETURNN_PYTORCH_EXE, - returnn_root=MINI_RETURNN_ROOT, - prefix=prefix + name, - ) - duration_hdf = ctc_forward( - checkpoint=train_job.out_checkpoints[100], - config=forward_config, - returnn_exe=RETURNN_PYTORCH_EXE, - returnn_root=MINI_RETURNN_ROOT, - prefix=prefix + name - ) - return duration_hdf - - net_module = "ctc_aligner_v1" - params = { - "conv_hidden_size": 256, - "lstm_size": 512, - "speaker_embedding_size": 256, - "dropout": 0.35, - "target_size": 44 - } - - duration_hdf = run_exp(net_module + 
"_drop035_bs56k", params, net_module, config, debug=True) - - #net_module = "ctc_aligner_v1_gradaccum" - config_gradaccum = copy.deepcopy(config) - #run_exp(net_module + "_drop1d035_bs28k_accum2", params, net_module, config, use_custom_engine=True, debug=True) - - - # net_module = "ctc_aligner_v2" - # params_v2 = { - # "conv_hidden_size": 256, - # "lstm_size": 512, - # "speaker_embedding_size": 256, - # "conv_dropout": 0.5, - # "final_dropout": 0.1, - # "target_size": 44 - # } - # run_exp(net_module + "_drop05_01", params_v2, net_module, config, use_custom_engine=False, debug=False) - - #net_module = "ctc_aligner_v1_ctc_sum" - #run_exp(net_module + "_drop01", params, net_module, config) - - #net_module = "ctc_aligner_v1_ctc_sum_nobroad" - #params = copy.deepcopy(params) - #params["dropout"] = 0.35 - #run_exp(net_module + "_drop035", params, net_module, config) - - return duration_hdf - - -def get_pytorch_raw_ctc_alignment(): - """ - Baseline for the ctc aligner in returnn_common with serialization - - Uses updated RETURNN_COMMON - - :return: durations_hdf - """ - - samples_per_frame = int(16000*0.0125) - config = { - "optimizer": {"class": "adam", "epsilon": 1e-8}, - "learning_rate_control": "newbob_multi_epoch", - "learning_rate_control_min_num_epochs_per_new_lr": 5, - "learning_rate_control_relative_error_relative_lr": True, - "learning_rates": [0.001], - "gradient_clip": 1.0, - "use_learning_rate_control_always": True, - "learning_rate_control_error_measure": "dev_ctc", - ############ - "newbob_learning_rate_decay": 0.9, - "newbob_multi_num_epochs": 5, - "newbob_multi_update_interval": 1, - "newbob_relative_error_threshold": 0, - ############# - "batch_size": 56000*samples_per_frame, - "max_seq_length": {"audio_features": 1600*samples_per_frame}, - "max_seqs": 200, - } - - prefix = "experiments/librispeech/tts_architecture/ctc_aligner/pytorch/" - training_datasets = build_training_dataset(silence_preprocessed=True, raw_audio=True) - - def run_exp(name, params, 
net_module, config, use_custom_engine=False, debug=False): - aligner_config = get_training_config( - returnn_common_root=RETURNN_COMMON, - training_datasets=training_datasets, - network_module=net_module, - net_args=params, - config=config, - debug=debug, - use_custom_engine=use_custom_engine, - pytorch_mode=True - ) # implicit reconstruction loss - forward_config = get_forward_config( - returnn_common_root=RETURNN_COMMON, - forward_dataset=training_datasets.joint, - datastreams=training_datasets.datastreams, - network_module=net_module, - net_args=params, - pytorch_mode=True - ) - train_job = ctc_training( - config=aligner_config, - returnn_exe=RETURNN_PYTORCH_EXE, - returnn_root=MINI_RETURNN_ROOT, - prefix=prefix + name, - ) - duration_hdf = ctc_forward( - checkpoint=train_job.out_checkpoints[100], - config=forward_config, - returnn_exe=RETURNN_PYTORCH_EXE, - returnn_root=MINI_RETURNN_ROOT, - prefix=prefix + name - ) - return duration_hdf - - net_module = "ctc_aligner_v1_fe" - log_mel_datastream = get_tts_log_mel_datastream() - - # verify that normalization exists - assert "norm_mean" in log_mel_datastream.additional_options - assert "norm_std_dev" in log_mel_datastream.additional_options - - norm = (log_mel_datastream.additional_options["norm_mean"], log_mel_datastream.additional_options["norm_std_dev"]) - - from ..pytorch_networks.ctc_aligner_v1_fe import DbMelFeatureExtractionConfig, Config - assert isinstance(log_mel_datastream.options.feature_options, DBMelFilterbankOptions) - fe_config = DbMelFeatureExtractionConfig( - sample_rate=log_mel_datastream.options.sample_rate, - win_size=log_mel_datastream.options.window_len, - hop_size=log_mel_datastream.options.step_len, - f_min=log_mel_datastream.options.feature_options.fmin, - f_max=log_mel_datastream.options.feature_options.fmax, - min_amp=log_mel_datastream.options.feature_options.min_amp, - num_filters=log_mel_datastream.options.num_feature_filters, - 
center=log_mel_datastream.options.feature_options.center, - norm=norm - ) - model_config = Config( - conv_hidden_size=256, - lstm_size=512, - speaker_embedding_size=256, - dropout=0.35, - target_size=44, - feature_extraction_config=fe_config, - ) - - params = { - "config": asdict(model_config) - } - - - duration_hdf = run_exp(net_module + "_drop035_bs56k", params, net_module, config, debug=True) - - return duration_hdf diff --git a/users/rilling/experiments/librispeech/tts_architecture_improvement_23/ctc_aligner/pipeline.py b/users/rilling/experiments/librispeech/tts_architecture_improvement_23/ctc_aligner/pipeline.py deleted file mode 100644 index d055f1b2f..000000000 --- a/users/rilling/experiments/librispeech/tts_architecture_improvement_23/ctc_aligner/pipeline.py +++ /dev/null @@ -1,35 +0,0 @@ -from sisyphus import tk -from i6_core.returnn import ReturnnTrainingJob -from i6_core.returnn.forward import ReturnnForwardJob - -def ctc_training(config, returnn_exe, returnn_root, prefix, num_epochs=100): - - train_job = ReturnnTrainingJob( - config, - log_verbosity=5, - num_epochs=num_epochs, - time_rqmt=100, - mem_rqmt=16, - cpu_rqmt=4, - returnn_python_exe=returnn_exe, - returnn_root=returnn_root, - ) - train_job.add_alias(prefix + "/training") - tk.register_output(prefix + "/training.models", train_job.out_model_dir) - - return train_job - - -def ctc_forward(checkpoint, config, returnn_exe, returnn_root, prefix): - last_forward_job = ReturnnForwardJob( - model_checkpoint=checkpoint, - returnn_config=config, - hdf_outputs=[], - returnn_python_exe=returnn_exe, - returnn_root=returnn_root, - ) - last_forward_job.add_alias(prefix + "/forward") - alignment_hdf = last_forward_job.out_hdf_files["output.hdf"] - tk.register_output(prefix + "/training.alignment", alignment_hdf) - - return alignment_hdf diff --git a/users/rilling/experiments/librispeech/tts_architecture_improvement_23/data.py b/users/rilling/experiments/librispeech/tts_architecture_improvement_23/data.py 
deleted file mode 100644 index 1888c91ae..000000000 --- a/users/rilling/experiments/librispeech/tts_architecture_improvement_23/data.py +++ /dev/null @@ -1,320 +0,0 @@ -from dataclasses import asdict -from functools import lru_cache -import os -from sisyphus import tk -from typing import List - -from i6_core.returnn.oggzip import BlissToOggZipJob -from i6_core.returnn.vocabulary import ReturnnVocabFromPhonemeInventory - -from i6_experiments.common.datasets.librispeech import get_g2p_augmented_bliss_lexicon_dict, get_bliss_corpus_dict - -from i6_experiments.users.rossenbach.common_setups.returnn.datastreams.audio import AudioFeatureDatastream, DBMelFilterbankOptions, ReturnnAudioFeatureOptions, FeatureType -from i6_experiments.users.rossenbach.common_setups.returnn.datastreams.vocabulary import LabelDatastream -from i6_experiments.users.rossenbach.common_setups.returnn import datasets - -from i6_experiments.users.rossenbach.datasets.librispeech import ( - get_librispeech_tts_segments, - get_ls_train_clean_100_tts_silencepreprocessed, - get_ls_train_clean_360_tts_silencepreprocessed, - get_ls_train_other_500_tts_silencepreprocessed -) - - -from i6_experiments.users.rossenbach.setups.tts.preprocessing import ( - process_corpus_text_with_extended_lexicon, - extend_lexicon_with_tts_lemmas, - extend_lexicon_with_blank -) - -from .default_tools import RETURNN_EXE, RETURNN_ROOT - -DATA_PREFIX = "experiments/alignment_analysis_tts/data/" - - -@lru_cache -def get_librispeech_lexicon(corpus_key="train-clean-100") -> tk.Path: - """ - get the TTS-extended g2p bliss lexicon with [start], [end] and [space] marker - :return: - """ - return extend_lexicon_with_tts_lemmas(get_g2p_augmented_bliss_lexicon_dict(use_stress_marker=False)[corpus_key]) - - -def get_tts_extended_bliss(ls_corpus_key) -> tk.Path: - """ - get a modified ls corpus using the TTS processing - :return: - """ - ls_bliss = get_bliss_corpus_dict(audio_format="ogg")[ls_corpus_key] - tts_ls_bliss = 
process_corpus_text_with_extended_lexicon( - bliss_corpus=ls_bliss, - lexicon=get_librispeech_lexicon(corpus_key=ls_corpus_key)) - - return tts_ls_bliss - - -@lru_cache -def get_ls100_silence_preprocessed_bliss() -> tk.Path: - """ - Get the modified ls100 corpus for the TTS task with silence preprocessing - :return: Bliss xml file - """ - # this is the FFmpeg silence preprocessed version of LibriSpeech train-clean-100 - sil_pp_train_clean_100_co = get_ls_train_clean_100_tts_silencepreprocessed() - - # convert the corpus transcriptions into phoneme and marker representation - sil_pp_train_clean_100_tts = process_corpus_text_with_extended_lexicon( - bliss_corpus=sil_pp_train_clean_100_co.corpus_file, - lexicon=get_librispeech_lexicon()) - - return sil_pp_train_clean_100_tts - - -@lru_cache -def get_ls460_silence_preprocessed_bliss() -> tk.Path: - """ - Get the modified ls100 corpus for the TTS task - :return: Bliss xml file - """ - # this is the FFmpeg silence preprocessed version of LibriSpeech train-clean-100 - sil_pp_train_clean_100_co = get_ls_train_clean_100_tts_silencepreprocessed() - sil_pp_train_clean_360_co = get_ls_train_clean_360_tts_silencepreprocessed() - - from i6_core.corpus.transform import MergeCorporaJob, MergeStrategy - spp_460_corpus = MergeCorporaJob( - bliss_corpora=[ - sil_pp_train_clean_100_co.corpus_file, - sil_pp_train_clean_360_co.corpus_file, - ], - merge_strategy=MergeStrategy.FLAT, - name="train-clean-460" - ).out_merged_corpus - - # convert the corpus transcriptions into phoneme and marker representation - sil_pp_train_clean_100_tts = process_corpus_text_with_extended_lexicon( - bliss_corpus=spp_460_corpus, - lexicon=get_librispeech_lexicon("train-clean-460")) - - return sil_pp_train_clean_100_tts - - -@lru_cache -def get_ls960_silence_preprocessed_bliss() -> tk.Path: - """ - Get the modified ls960 corpus for the TTS task - :return: Bliss xml file - """ - # this is the FFmpeg silence preprocessed version of LibriSpeech train-clean-100 - 
sil_pp_train_clean_100_co = get_ls_train_clean_100_tts_silencepreprocessed() - sil_pp_train_clean_360_co = get_ls_train_clean_360_tts_silencepreprocessed() - sil_pp_train_other_500_co = get_ls_train_other_500_tts_silencepreprocessed() - - from i6_core.corpus.transform import MergeCorporaJob, MergeStrategy - spp_960_corpus = MergeCorporaJob( - bliss_corpora=[ - sil_pp_train_clean_100_co.corpus_file, - sil_pp_train_clean_360_co.corpus_file, - sil_pp_train_other_500_co.corpus_file, - ], - merge_strategy=MergeStrategy.FLAT, - name="train-other-960" - ).out_merged_corpus - - # convert the corpus transcriptions into phoneme and marker representation - sil_pp_train_other_960_tts = process_corpus_text_with_extended_lexicon( - bliss_corpus=spp_960_corpus, - lexicon=get_librispeech_lexicon("train-other-960")) - - return sil_pp_train_other_960_tts - - -@lru_cache -def get_ls360_zip_for_synthesis_only() -> tk.Path: - """ - TTS label processed librispeech 360 without audio - - :return: - """ - ls460_lexicon = extend_lexicon_with_tts_lemmas(get_g2p_augmented_bliss_lexicon_dict(use_stress_marker=False)["train-clean-460"]) - corpus = get_bliss_corpus_dict()["train-clean-360"] # original corpus as .flac - tts_corpus = process_corpus_text_with_extended_lexicon( - bliss_corpus=corpus, - lexicon=ls460_lexicon - ) - zip_dataset = BlissToOggZipJob( - bliss_corpus=tts_corpus, - no_audio=True, - returnn_python_exe=RETURNN_EXE, - returnn_root=RETURNN_ROOT, - ).out_ogg_zip - - return zip_dataset - - -@lru_cache -def get_ls100_silence_preprocess_ogg_zip() -> tk.Path: - """ - :return: Returnn OggZip .zip file - """ - - sil_pp_train_clean_100_tts = get_ls100_silence_preprocessed_bliss() - - zip_dataset = BlissToOggZipJob( - bliss_corpus=sil_pp_train_clean_100_tts, - no_conversion=True, - returnn_python_exe=RETURNN_EXE, - returnn_root=RETURNN_ROOT, - ).out_ogg_zip - - return zip_dataset - - -@lru_cache -def get_ls460_silence_preprocess_ogg_zip() -> tk.Path: - """ - :return: Returnn OggZip .zip 
file - """ - - sil_pp_train_clean_460_tts = get_ls460_silence_preprocessed_bliss() - - zip_dataset = BlissToOggZipJob( - bliss_corpus=sil_pp_train_clean_460_tts, - no_conversion=True, - returnn_python_exe=RETURNN_EXE, - returnn_root=RETURNN_ROOT, - ).out_ogg_zip - - return zip_dataset - - -def get_bliss_and_zip(ls_corpus_key, silence_preprocessed=True): - """ - :param ls_corpus_key: e.g. train-clean-100, see LibriSpeech data definition - :param silence_preprocessed: - :return: - """ - if silence_preprocessed: - if ls_corpus_key == "train-clean-100": - bliss_dataset = get_ls100_silence_preprocessed_bliss() - elif ls_corpus_key == "train-clean-460": - bliss_dataset = get_ls460_silence_preprocessed_bliss() - elif ls_corpus_key == "train-other-960": - bliss_dataset = get_ls960_silence_preprocessed_bliss() - else: - assert False, "invalid key %s" % ls_corpus_key - else: - bliss_dataset = get_tts_extended_bliss(ls_corpus_key=ls_corpus_key) - - zip_dataset = BlissToOggZipJob( - bliss_corpus=bliss_dataset, - no_conversion=True, - returnn_python_exe=RETURNN_EXE, - returnn_root=RETURNN_ROOT, - ).out_ogg_zip - - return bliss_dataset, zip_dataset - - -def make_meta_dataset(audio_dataset, speaker_dataset): - """ - Shared function to create a metadatset with joined audio and speaker information - - :param datasets.OggZipDataset audio_dataset: - :param datasets.HDFDataset speaker_dataset: - :return: - :rtype: MetaDataset - """ - meta_dataset = datasets.MetaDataset( - data_map={'audio_features': ('audio', 'data'), - 'phonemes': ('audio', 'classes'), - 'speaker_labels': ('speaker', 'data'), - }, - datasets={ - 'audio': audio_dataset.as_returnn_opts(), - 'speaker': speaker_dataset.as_returnn_opts() - }, - seq_order_control_dataset="audio", - ) - return meta_dataset - - -def get_tts_log_mel_datastream() -> AudioFeatureDatastream: - """ - Returns the AudioFeatureDatastream using the default feature parameters - (non-adjustable for now) based on statistics calculated over the provided 
dataset - - This function serves as an example for ASR Systems, and should be copied and modified in the - specific experiments if changes to the default parameters are needed - - Supports both centered and non-centered windowing, as we need non-centered windowing for RASR-compatible - feature extraction, but centered windowing to support linear-features for the vocoder mel-to-linear training. - - :param center: use center for CTC and Attention alignment, but not for GMM for RASR compatibility - """ - feature_options_center = ReturnnAudioFeatureOptions( - window_len=0.050, - step_len=0.0125, - num_feature_filters=80, - features=FeatureType.DB_MEL_FILTERBANK, - peak_normalization=False, - preemphasis=0.97, - sample_rate=16000, - feature_options=DBMelFilterbankOptions( - fmin=60, - fmax=7600, - min_amp=1e-10, - center=True, - ) - ) - audio_datastream = AudioFeatureDatastream( - available_for_inference=False, options=feature_options_center - ) - - ls100_ogg_zip = get_ls100_silence_preprocess_ogg_zip() - train_segments, _ = get_librispeech_tts_segments() - - audio_datastream.add_global_statistics_to_audio_feature_datastream( - [ls100_ogg_zip], - segment_file=train_segments, - use_scalar_only=True, - returnn_python_exe=RETURNN_EXE, - returnn_root=RETURNN_ROOT, - alias_path=DATA_PREFIX + "/ls100/", - ) - return audio_datastream - - -def get_lexicon(with_blank: bool = False, corpus_key="train-clean-100") -> tk.Path: - """ - Get the TTS/CTC lexicon - - :param with_blank: add blank (e.g. 
for CTC training or extraction) - :return: path to bliss lexicon file - """ - lexicon = get_librispeech_lexicon(corpus_key=corpus_key) - lexicon = extend_lexicon_with_tts_lemmas(lexicon) - if with_blank: - lexicon = extend_lexicon_with_blank(lexicon) - return lexicon - - -def get_vocab_datastream(with_blank: bool = False, corpus_key="train-clean-100") -> LabelDatastream: - """ - Default VocabularyDatastream for LibriSpeech (uppercase ARPA phoneme symbols) - - :param with_blank: datastream for CTC training - """ - lexicon = get_lexicon(with_blank, corpus_key=corpus_key) - blacklist = {"[SILENCE]"} - returnn_vocab_job = ReturnnVocabFromPhonemeInventory(lexicon, blacklist=blacklist) - name = "returnn_vocab_from_lexicon_with_blank" if with_blank else "returnn_vocab_from_lexicon" - returnn_vocab_job.add_alias(os.path.join(DATA_PREFIX, name)) - - vocab_datastream = LabelDatastream( - available_for_inference=True, - vocab=returnn_vocab_job.out_vocab, - vocab_size=returnn_vocab_job.out_vocab_size - ) - - return vocab_datastream diff --git a/users/rilling/experiments/librispeech/tts_architecture_improvement_23/default_tools.py b/users/rilling/experiments/librispeech/tts_architecture_improvement_23/default_tools.py deleted file mode 100644 index dbc5ae58c..000000000 --- a/users/rilling/experiments/librispeech/tts_architecture_improvement_23/default_tools.py +++ /dev/null @@ -1,17 +0,0 @@ -from sisyphus import tk -from i6_core.tools.git import CloneGitRepositoryJob -from i6_experiments.common.tools.rasr import compile_rasr_binaries_i6mode - - - -RETURNN_EXE = tk.Path("/u/rossenbach/bin/returnn/returnn_tf_dynamic_version_mkl_launcher.sh", hash_overwrite="GENERIC_RETURNN_LAUNCHER") -RETURNN_PYTORCH_EXE = tk.Path("/u/lukas.rilling/bin/returnn/returnn_pt20_experimental.sh", hash_overwrite="GENERIC_RETURNN_LAUNCHER") - -# outdated version using DataLoader v1 and the Tensor fix for RC networks -RETURNN_ROOT = CloneGitRepositoryJob("https://github.com/rwth-i6/returnn", 
commit="d98a6c606d2c007e2a6771684e77a7650bb3fad6").out_repository.copy() -RETURNN_ROOT.hash_overwrite = "LIBRISPEECH_DEFAULT_RETURNN_ROOT" - -MINI_RETURNN_ROOT = tk.Path("/u/lukas.rilling/github/MiniReturnn", hash_overwrite="LIBRISPEECH_DEFAULT_RETURNN_ROOT") - -RETURNN_COMMON = CloneGitRepositoryJob("https://github.com/rwth-i6/returnn_common", commit="d1fc1c7dc6ae63658e5aa01dc2aad41eb2758573", checkout_folder_name="returnn_common").out_repository.copy() -RETURNN_COMMON.hash_overwrite = "LIBRISPEECH_DEFAULT_RETURNN_COMMON" diff --git a/users/rilling/experiments/librispeech/tts_architecture_improvement_23/pytorch_networks/__init__.py b/users/rilling/experiments/librispeech/tts_architecture_improvement_23/pytorch_networks/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/users/rilling/experiments/librispeech/tts_architecture_improvement_23/pytorch_networks/ctc_aligner_rf.py b/users/rilling/experiments/librispeech/tts_architecture_improvement_23/pytorch_networks/ctc_aligner_rf.py deleted file mode 100644 index 953abf6a3..000000000 --- a/users/rilling/experiments/librispeech/tts_architecture_improvement_23/pytorch_networks/ctc_aligner_rf.py +++ /dev/null @@ -1,120 +0,0 @@ -import time -import torch -from torch import nn -from typing import Dict -from torch.onnx import export - - -from returnn.tensor import Tensor, Dim, batch_dim -from returnn import frontend as rf - -feature_time_dim = Dim(None, name="feature_time") -phoneme_time_dim = Dim(None, name="phoneme_time") -speaker_time_dim = Dim(None, name="speaker_time") -in_dim = Dim(80, name="in") -out_dim = Dim(44, name="out") -spk_dim = Dim(1, name="spk") - -def get_extern_data(**kwargs): - - extern_data = { - "audio_features": {"dims": (batch_dim, feature_time_dim, in_dim), "dtype": "float32"}, - "phonemes": {"dims": (batch_dim, phoneme_time_dim), "sparse_dim": out_dim, "dtype": "int32"}, - "speaker_labels": {"dims": (batch_dim, spk_dim), "dtype": "int32"}, - } - return extern_data - - - - 
-class Conv1DBlock(rf.Module): - - def __init__(self, in_dim, out_dim, filter_size, dropout): - super().__init__() - self.out_dim = out_dim - self.conv = rf.Conv1d(in_dim=in_dim, out_dim=out_dim, filter_size=filter_size, padding="same") - # self.bn = rf.BatchNorm(in_dim=out_dim, epsilon=1e-5, use_mask=True) - self.dropout = dropout - - def __call__(self, x: Tensor, spatial_dim: Dim): - x, out_spatial_dim = self.conv(x, in_spatial_dim=spatial_dim) - x = rf.relu(x) - # x = self.bn(x) - # x = rf.dropout(x, drop_prob=0.1, axis=x.dims) # does this broadcast also over batch? - return x, out_spatial_dim - -class Model(rf.Module): - - def __init__(self, conv_hidden_size: int, lstm_size: int, target_size: int, **kwargs): - super().__init__() - self.conv_hidden_size = rf.Dim(name="conv_hidden", dimension=conv_hidden_size) - self.audio_embedding = rf.Linear(in_dim, out_dim=self.conv_hidden_size) - - out_dims = [ - rf.Dim(name="conv_dim_%s" % str(x), dimension=conv_hidden_size) - for x in range(5) - ] - - sequential_list = [] - temp_in_dim = self.conv_hidden_size - for x in range(5): - sequential_list.append( - Conv1DBlock( - in_dim=temp_in_dim, - out_dim=out_dims[x], - filter_size=5, - dropout=0.1, - ) - ) - temp_in_dim = out_dims[x] - - self.convs = rf.Sequential(sequential_list) - #self.blstm = rf.LSTM( - # input_size=conv_hidden_size, - # hidden_size=lstm_size, - # bidirectional=True, - # batch_first=False) - self.out_dim = rf.Dim(name="target_out", dimension=target_size) - self.final_linear = rf.Linear(out_dims[-1], self.out_dim) - self.lstm_size = lstm_size - - def __call__( - self, - audio_features: Tensor, - audio_features_time_dim: Dim, - ): - - for dim in audio_features.dim_tags: - print(dim.get_dim_value()) - audio_embedding = self.audio_embedding(audio_features) - conv_out, _ = self.convs((audio_embedding, audio_features_time_dim)) - softmax_in = self.final_linear(conv_out) - - log_probs = rf.log_softmax(softmax_in, axis=self.out_dim, use_mask=True) - - return 
log_probs - -def train_step(*, model: Model, extern_data: Dict[str, Tensor], **_kwargs): - features = extern_data["audio_features"] - speaker_labels = extern_data["speaker_labels"] - logprobs = model(features, feature_time_dim) - - phonemes = extern_data["phonemes"] - - - raw_phonemes = phonemes.raw_tensor - raw_logprobs = logprobs.raw_tensor - - logprobs_len = feature_time_dim.dyn_size - phonemes_len = phoneme_time_dim.dyn_size - - print(logprobs_len) - print(phonemes_len) - print(logprobs_len.size()) - print(phonemes_len.size()) - print(raw_logprobs.size()) - - - loss = torch.nn.functional.ctc_loss(raw_logprobs.transpose(0, 1), raw_phonemes, input_lengths=logprobs_len, target_lengths=phonemes_len, blank=43) - - rf.get_run_ctx().mark_as_loss(loss, name="ce") diff --git a/users/rilling/experiments/librispeech/tts_architecture_improvement_23/pytorch_networks/ctc_aligner_v1.py b/users/rilling/experiments/librispeech/tts_architecture_improvement_23/pytorch_networks/ctc_aligner_v1.py deleted file mode 100644 index 28b7de571..000000000 --- a/users/rilling/experiments/librispeech/tts_architecture_improvement_23/pytorch_networks/ctc_aligner_v1.py +++ /dev/null @@ -1,296 +0,0 @@ -from dataclasses import dataclass -import torch -import numpy -from torch import nn -import multiprocessing - -from returnn.datasets.hdf import SimpleHDFWriter - - -class Conv1DBlock(torch.nn.Module): - """ - A 1D-Convolution with ReLU, batch-norm and non-broadcasted dropout - Will pad to the same output length - """ - - def __init__(self, in_size, out_size, filter_size, dropout): - """ - :param in_size: input feature size - :param out_size: output feature size - :param filter_size: filter size - :param dropout: dropout probability - """ - super().__init__() - assert filter_size % 2 == 1, "Only odd filter sizes allowed" - self.conv = nn.Conv1d(in_size, out_size, filter_size, padding=filter_size // 2) - self.bn = nn.BatchNorm1d(num_features=out_size) - self.dropout = dropout - - def forward(self, 
x): - """ - :param x: [B, F_in, T] - :return: [B, F_out, T] - """ - x = self.conv(x) - x = nn.functional.relu(x) - # TODO: does not consider masking! - x = self.bn(x) - x = nn.functional.dropout(x, p=self.dropout, training=self.training) - return x - - -class Model(torch.nn.Module): - """ - Default TTS aligner with 5 convolution blocks of size 5 followed by a BLSTM - """ - - def __init__( - self, - conv_hidden_size: int, - lstm_size: int, - speaker_embedding_size: int, - dropout: float, - target_size: int, - **kwargs, - ): - super().__init__() - self.audio_embedding = nn.Linear(80, conv_hidden_size) - self.speaker_embedding = nn.Embedding(251, speaker_embedding_size) - self.convs = nn.Sequential( - Conv1DBlock(conv_hidden_size + speaker_embedding_size, conv_hidden_size, filter_size=5, dropout=dropout), - Conv1DBlock(conv_hidden_size, conv_hidden_size, filter_size=5, dropout=dropout), - Conv1DBlock(conv_hidden_size, conv_hidden_size, filter_size=5, dropout=dropout), - Conv1DBlock(conv_hidden_size, conv_hidden_size, filter_size=5, dropout=dropout), - Conv1DBlock(conv_hidden_size, conv_hidden_size, filter_size=5, dropout=dropout), - ) - self.blstm = nn.LSTM(input_size=conv_hidden_size, hidden_size=lstm_size, bidirectional=True, batch_first=True) - self.final_linear = nn.Linear(2 * lstm_size, target_size) - self.lstm_size = lstm_size - self.target_size = target_size - self.dropout = dropout - - # initialize weights - self.apply(self._weight_init) - - @staticmethod - def _weight_init(module: torch.nn.Module): - if isinstance(module, torch.nn.Conv1d): - nn.init.xavier_normal_(module.weight) - - def forward( - self, - audio_features: torch.Tensor, - speaker_labels: torch.Tensor, - audio_features_len: torch.Tensor, - ): - """ - :param audio_features: [B, T, F] - :param speaker_labels: [B, 1] - :param audio_features_len: length of T as [B] - :return: logprobs as [B, T, #PHONES] - """ - speaker_embeddings: torch.Tensor = self.speaker_embedding(torch.squeeze(speaker_labels, 
dim=1)) - # manually broadcast speaker embeddings to each time step - speaker_embeddings = torch.repeat_interleave( - torch.unsqueeze(speaker_embeddings, 1), audio_features.size()[1], dim=1 - ) # [B, T, #SPK_EMB_SIZE] - audio_embedding = self.audio_embedding(audio_features) # [B, T, F] - - conv_in = torch.concat([speaker_embeddings, audio_embedding], dim=2) # [B, T, F] - conv_in = torch.swapaxes(conv_in, 1, 2) # [B, F, T] - conv_out = self.convs(conv_in) - blstm_in = torch.permute(conv_out, dims=(0, 2, 1)) # [B, F, T] -> [B, T, F] - - blstm_packed_in = nn.utils.rnn.pack_padded_sequence(blstm_in, audio_features_len.to("cpu"), batch_first=True) - blstm_packed_out, _ = self.blstm(blstm_packed_in) - blstm_out, _ = nn.utils.rnn.pad_packed_sequence( - blstm_packed_out, padding_value=0.0, batch_first=True - ) # [B, T, F] - blstm_out = nn.functional.dropout(blstm_out, p=self.dropout, training=self.training) - logits = self.final_linear(blstm_out) # [B, T, #PHONES] - log_probs = torch.log_softmax(logits, dim=2) # [B, T, #PHONES] - - return log_probs - - -def train_step(*, model: Model, data, run_ctx, **kwargs): - audio_features = data["audio_features"] # [B, T, F] - audio_features_len = data["audio_features:size1"] # [B] - - # perform local length sorting for more efficient packing - audio_features_len, indices = torch.sort(audio_features_len, descending=True) - - audio_features = audio_features[indices, :, :] - phonemes = data["phonemes"][indices, :] # [B, T] (sparse) - phonemes_len = data["phonemes:size1"][indices] # [B, T] - speaker_labels = data["speaker_labels"][indices, :] # [B, 1] (sparse) - - logprobs = model( - audio_features=audio_features, - audio_features_len=audio_features_len, - speaker_labels=speaker_labels, - ) - - transposed_logprobs = torch.permute(logprobs, (1, 0, 2)) # CTC needs [T, B, F] - ctc_loss = nn.functional.ctc_loss( - transposed_logprobs, - phonemes, - input_lengths=audio_features_len, - target_lengths=phonemes_len, - blank=model.target_size - 
1, - reduction="sum", - ) - num_frames = torch.sum(phonemes_len) - run_ctx.mark_as_loss(name="ctc", loss=ctc_loss, inv_norm_factor=num_frames) - - -############# FORWARD STUFF ################ -import numpy as np -from scipy.sparse import coo_matrix -from scipy.sparse.csgraph import dijkstra - - -# Duration extraction helpers are taken from -# https://github.com/as-ideas/DeepForcedAligner/blob/main/dfa/duration_extraction.py -# with commit id d1f565604bba25d4c56e3e12b289ab335980e069 -# MIT license -def to_node_index(i, j, cols): - return cols * i + j - - -def from_node_index(node_index, cols): - return node_index // cols, node_index % cols - - -def to_adj_matrix(mat): - """ - :param mat: [T x N] matrix where for each time frame we have the N scores of our target phoneme labels - :return: A sparse CTC-style adjacent lattice matrix where the connection weight is the score of the "target" node - of each connection. - """ - rows = mat.shape[0] - cols = mat.shape[1] - - row_ind = [] - col_ind = [] - data = [] - - for i in range(rows): - for j in range(cols): - - node = to_node_index(i, j, cols) - - if j < cols - 1: - right_node = to_node_index(i, j + 1, cols) - weight_right = mat[i, j + 1] - row_ind.append(node) - col_ind.append(right_node) - data.append(weight_right) - - if i < rows - 1 and j < cols: - bottom_node = to_node_index(i + 1, j, cols) - weight_bottom = mat[i + 1, j] - row_ind.append(node) - col_ind.append(bottom_node) - data.append(weight_bottom) - - if i < rows - 1 and j < cols - 1: - bottom_right_node = to_node_index(i + 1, j + 1, cols) - weight_bottom_right = mat[i + 1, j + 1] - row_ind.append(node) - col_ind.append(bottom_right_node) - data.append(weight_bottom_right) - - adj_mat = coo_matrix((data, (row_ind, col_ind)), shape=(rows * cols, rows * cols)) - return adj_mat.tocsr() - - -@dataclass() -class AlignSequence: - """ - :param logprobs: [T x F] log probabilities - :param phonemes: [N] indexed tokens with indices in the range [0, F-1] - """ - 
logprobs: np.ndarray - phonemes: np.ndarray - - -def extract_durations_with_dijkstra(sequence: AlignSequence) -> np.array: - """ - Extracts durations from the attention matrix by finding the shortest monotonic path from - top left to bottom right. - - :return durations: [N] durations which sum to T - """ - - neg_log_weights = -sequence.logprobs[:, sequence.phonemes] - adj_matrix = to_adj_matrix(neg_log_weights) - dist_matrix, predecessors = dijkstra(csgraph=adj_matrix, directed=True, indices=0, return_predecessors=True) - path = [] - pr_index = predecessors[-1] - while pr_index != 0: - path.append(pr_index) - pr_index = predecessors[pr_index] - path.reverse() - - # append first and last node - path = [0] + path + [dist_matrix.size - 1] - cols = neg_log_weights.shape[1] - mel_text = {} - durations = np.zeros(sequence.phonemes.shape[0], dtype=np.int32) - - # collect indices (mel, text) along the path - for node_index in path: - i, j = from_node_index(node_index, cols) - mel_text[i] = j - - for j in mel_text.values(): - durations[j] += 1 - - return durations - - -def forward_init_hook(run_ctx, **kwargs): - run_ctx.hdf_writer = SimpleHDFWriter("durations.hdf", dim=None, ndim=1) - run_ctx.pool = multiprocessing.Pool(8) - - -def forward_finish_hook(run_ctx, **kwargs): - run_ctx.hdf_writer.close() - - -def forward_step(*, model: Model, data, run_ctx, **kwargs): - tags = data["seq_tag"] - audio_features = data["audio_features"] # [B, T, F] - audio_features_len = data["audio_features:size1"] # [B] - - # perform local length sorting for more efficient packing - audio_features_len, indices = torch.sort(audio_features_len, descending=True) - - audio_features = audio_features[indices, :, :] - phonemes = data["phonemes"][indices, :] # [B, T] (sparse) - phonemes_len = data["phonemes:size1"][indices] # [B, T] - speaker_labels = data["speaker_labels"][indices, :] # [B, 1] (sparse) - - logprobs = model( - audio_features=audio_features, - audio_features_len=audio_features_len, - 
speaker_labels=speaker_labels, - ) - - numpy_logprobs = logprobs.detach().cpu().numpy() - numpy_phonemes = phonemes.detach().cpu().numpy() - - align_sequences = [] - - for single_logprobs, single_phonemes, feat_len, phon_len in zip( - numpy_logprobs, numpy_phonemes, audio_features_len, phonemes_len - ): - align_sequences.append(AlignSequence(single_logprobs[:feat_len], single_phonemes[:phon_len])) - - durations = run_ctx.pool.map(extract_durations_with_dijkstra, align_sequences) - for tag, duration, feat_len, phon_len in zip(tags, durations, audio_features_len, phonemes_len): - total_sum = numpy.sum(duration) - assert total_sum == feat_len - assert len(duration) == phon_len - run_ctx.hdf_writer.insert_batch(numpy.asarray([duration]), [len(duration)], [tag]) diff --git a/users/rilling/experiments/librispeech/tts_architecture_improvement_23/pytorch_networks/ctc_aligner_v1_ctc_sum.py b/users/rilling/experiments/librispeech/tts_architecture_improvement_23/pytorch_networks/ctc_aligner_v1_ctc_sum.py deleted file mode 100644 index 783cb6ddc..000000000 --- a/users/rilling/experiments/librispeech/tts_architecture_improvement_23/pytorch_networks/ctc_aligner_v1_ctc_sum.py +++ /dev/null @@ -1,109 +0,0 @@ -import time -import torch -from torch import nn -from torch.onnx import export - - -class Conv1DBlock(torch.nn.Module): - - def __init__(self, in_size, out_size, filter_size, dropout): - super().__init__() - self.conv = nn.Conv1d(in_size, out_size, filter_size, padding=filter_size//2) - self.bn = nn.BatchNorm1d(num_features=out_size) - self.dropout = dropout - - def forward(self, x): - """ - - :param x: should be [B, C, T] - :return: - """ - x = self.conv(x) - x = nn.functional.relu(x) - # TODO: does not consider masking! 
- x = self.bn(x) - x = nn.functional.dropout1d(x, p=self.dropout) - return x - - -class Model(torch.nn.Module): - - - def __init__(self, - conv_hidden_size: int, - lstm_size: int, - speaker_embedding_size: int, - dropout: float, - target_size: int, - epoch: int, - step: int, - ): - super().__init__() - self.audio_embedding = nn.Linear(80, conv_hidden_size) - self.speaker_embedding = nn.Embedding(251, speaker_embedding_size) - self.convs = nn.Sequential( - Conv1DBlock(conv_hidden_size + speaker_embedding_size, conv_hidden_size, filter_size=5, dropout=dropout), - Conv1DBlock(conv_hidden_size, conv_hidden_size, filter_size=5, dropout=dropout), - Conv1DBlock(conv_hidden_size, conv_hidden_size, filter_size=5, dropout=dropout), - Conv1DBlock(conv_hidden_size, conv_hidden_size, filter_size=5, dropout=dropout), - Conv1DBlock(conv_hidden_size, conv_hidden_size, filter_size=5, dropout=dropout), - ) - self.blstm = nn.LSTM( - input_size=conv_hidden_size, - hidden_size=lstm_size, - bidirectional=True, - batch_first=True) - self.final_linear = nn.Linear(2*lstm_size, target_size) - self.lstm_size = lstm_size - self.target_size = target_size - self.dropout = dropout - - def forward( - self, - audio_features: torch.Tensor, - speaker_labels: torch.Tensor, - audio_features_len: torch.Tensor, - ): - speaker_embeddings: torch.Tensor = self.speaker_embedding(torch.squeeze(speaker_labels, dim=1)) - # manually broadcast speaker embeddings to each time step - speaker_embeddings = torch.repeat_interleave(torch.unsqueeze(speaker_embeddings, 1), audio_features.size()[1], dim=1) # [B, T, F] - audio_embedding = self.audio_embedding(audio_features) # [B, T, F] - - conv_in = torch.concat([speaker_embeddings, audio_embedding], dim=2) # [B, T, F] - conv_in = torch.swapaxes(conv_in, 1, 2) # [B, C, T] - conv_out = self.convs(conv_in) - blstm_in = torch.permute(conv_out, dims=(0, 2, 1)) # [B, C, T] -> [B, T, C] - - blstm_packed_in = nn.utils.rnn.pack_padded_sequence(blstm_in, 
audio_features_len.to("cpu"), batch_first=True) - blstm_packed_out, _ = self.blstm(blstm_packed_in) - blstm_out, _ = nn.utils.rnn.pad_packed_sequence(blstm_packed_out, padding_value=0.0, batch_first=True) # [B, T, C] - blstm_out = nn.functional.dropout1d(blstm_out, p=self.dropout) - logits = self.final_linear(blstm_out) # [B, T, #PHONES] - log_probs = torch.log_softmax(logits, dim=2) # [B, T, #PHONES] - - return log_probs - -def train_step(*, model: Model, data, run_ctx, **kwargs): - audio_features = data["audio_features"] # [B, T, F] - audio_features_len = data["audio_features:size1"] # [B] - - # perform local length sorting for more efficient packing - audio_features_len, indices = torch.sort(audio_features_len, descending=True) - - audio_features = audio_features[indices, :, :] - phonemes = data["phonemes"][indices, :] # [B, T] (sparse) - phonemes_len = data["phonemes:size1"][indices] # [B, T] - speaker_labels = data["speaker_labels"][indices, :] # [B, 1] (sparse) - - logprobs = model( - audio_features=audio_features, - audio_features_len=audio_features_len, - speaker_labels=speaker_labels, - ) - - transposed_logprobs = torch.permute(logprobs, (1, 0, 2)) - ctc_loss = nn.functional.ctc_loss(transposed_logprobs, phonemes, input_lengths=audio_features_len, target_lengths=phonemes_len, - blank=model.target_size-1, reduction="sum") - num_frames = torch.sum(phonemes_len) - run_ctx.mark_as_loss(name="ctc_sum", loss=ctc_loss, inv_norm_factor=None) - run_ctx.mark_as_loss(name="ctc", loss=ctc_loss, scale=0.0, inv_norm_factor=num_frames) diff --git a/users/rilling/experiments/librispeech/tts_architecture_improvement_23/pytorch_networks/ctc_aligner_v1_ctc_sum_nobroad.py b/users/rilling/experiments/librispeech/tts_architecture_improvement_23/pytorch_networks/ctc_aligner_v1_ctc_sum_nobroad.py deleted file mode 100644 index b2b8aaa20..000000000 --- a/users/rilling/experiments/librispeech/tts_architecture_improvement_23/pytorch_networks/ctc_aligner_v1_ctc_sum_nobroad.py +++ 
/dev/null @@ -1,103 +0,0 @@ -import time -import torch -from torch import nn -from torch.onnx import export - - -class Conv1DBlock(torch.nn.Module): - """ - Variant without dropout broadcasting - """ - def __init__(self, in_size, out_size, filter_size, dropout): - super().__init__() - self.conv = nn.Conv1d(in_size, out_size, filter_size, padding=filter_size//2) - self.bn = nn.BatchNorm1d(num_features=out_size) - self.dropout = dropout - - def forward(self, x): - """ - - :param x: should be [B, C, T] - :return: - """ - x = self.conv(x) - x = nn.functional.relu(x) - # TODO: does not consider masking! - x = self.bn(x) - x = nn.functional.dropout(x, p=self.dropout) - return x - - -class Model(torch.nn.Module): - - - def __init__(self, conv_hidden_size: int, lstm_size: int, speaker_embedding_size: int, dropout: float, target_size: int): - super().__init__() - self.audio_embedding = nn.Linear(80, conv_hidden_size) - self.speaker_embedding = nn.Embedding(251, speaker_embedding_size) - self.convs = nn.Sequential( - Conv1DBlock(conv_hidden_size + speaker_embedding_size, conv_hidden_size, filter_size=5, dropout=dropout), - Conv1DBlock(conv_hidden_size, conv_hidden_size, filter_size=5, dropout=dropout), - Conv1DBlock(conv_hidden_size, conv_hidden_size, filter_size=5, dropout=dropout), - Conv1DBlock(conv_hidden_size, conv_hidden_size, filter_size=5, dropout=dropout), - Conv1DBlock(conv_hidden_size, conv_hidden_size, filter_size=5, dropout=dropout), - ) - self.blstm = nn.LSTM( - input_size=conv_hidden_size, - hidden_size=lstm_size, - bidirectional=True, - batch_first=True) - self.final_linear = nn.Linear(2*lstm_size, target_size) - self.lstm_size = lstm_size - self.target_size = target_size - self.dropout = dropout - - def forward( - self, - audio_features: torch.Tensor, - speaker_labels: torch.Tensor, - audio_features_len: torch.Tensor, - ): - speaker_embeddings: torch.Tensor = self.speaker_embedding(torch.squeeze(speaker_labels, dim=1)) - # manually broadcast speaker 
embeddings to each time step - speaker_embeddings = torch.repeat_interleave(torch.unsqueeze(speaker_embeddings, 1), audio_features.size()[1], dim=1) # [B, T, F] - audio_embedding = self.audio_embedding(audio_features) # [B, T, F] - - conv_in = torch.concat([speaker_embeddings, audio_embedding], dim=2) # [B, T, F] - conv_in = torch.swapaxes(conv_in, 1, 2) # [B, C, T] - conv_out = self.convs(conv_in) - blstm_in = torch.permute(conv_out, dims=(0, 2, 1)) # [B, C, T] -> [B, T, C] - - blstm_packed_in = nn.utils.rnn.pack_padded_sequence(blstm_in, audio_features_len.to("cpu"), batch_first=True) - blstm_packed_out, _ = self.blstm(blstm_packed_in) - blstm_out, _ = nn.utils.rnn.pad_packed_sequence(blstm_packed_out, padding_value=0.0, batch_first=True) # [B, T, C] - blstm_out = nn.functional.dropout1d(blstm_out, p=self.dropout) - logits = self.final_linear(blstm_out) # [B, T, #PHONES] - log_probs = torch.log_softmax(logits, dim=2) # [B, T, #PHONES] - - return log_probs - -def train_step(*, model: Model, data, run_ctx, **kwargs): - audio_features = data["audio_features"] # [B, T, F] - audio_features_len = data["audio_features:size1"] # [B] - - # perform local length sorting for more efficient packing - audio_features_len, indices = torch.sort(audio_features_len, descending=True) - - audio_features = audio_features[indices, :, :] - phonemes = data["phonemes"][indices, :] # [B, T] (sparse) - phonemes_len = data["phonemes:size1"][indices] # [B, T] - speaker_labels = data["speaker_labels"][indices, :] # [B, 1] (sparse) - - logprobs = model( - audio_features=audio_features, - audio_features_len=audio_features_len, - speaker_labels=speaker_labels, - ) - - transposed_logprobs = torch.permute(logprobs, (1, 0, 2)) - ctc_loss = nn.functional.ctc_loss(transposed_logprobs, phonemes, input_lengths=audio_features_len, target_lengths=phonemes_len, - blank=model.target_size-1, reduction="sum") - num_frames = torch.sum(phonemes_len) - run_ctx.mark_as_loss(name="ctc_sum", loss=ctc_loss, 
inv_norm_factor=None) - run_ctx.mark_as_loss(name="ctc", loss=ctc_loss, scale=0.0, inv_norm_factor=num_frames) diff --git a/users/rilling/experiments/librispeech/tts_architecture_improvement_23/pytorch_networks/ctc_aligner_v1_gradaccum.py b/users/rilling/experiments/librispeech/tts_architecture_improvement_23/pytorch_networks/ctc_aligner_v1_gradaccum.py deleted file mode 100644 index 10bf6811b..000000000 --- a/users/rilling/experiments/librispeech/tts_architecture_improvement_23/pytorch_networks/ctc_aligner_v1_gradaccum.py +++ /dev/null @@ -1,149 +0,0 @@ -from random import random -import time -import torch -from torch import nn, Tensor -from torch.onnx import export -from typing import Dict, Tuple - -from returnn.torch.engine import Engine as TorchEngine -from returnn.torch.context import RunCtx, Loss - - -class CustomEngine(TorchEngine): - - def run_train_step(self, data: dict[str, torch.Tensor], run_ctx: RunCtx) -> Tuple[Tensor, Dict[str, Loss]]: - """ - :param data: model inputs for the step - :param run_ctx: the current run ctx object - :return: total loss (weighted sum) calculated for the step, and individual losses as a name -> value mapping - """ - assert isinstance(data, dict) and data - # move all data to the target device as default - # note that in some cases, e.g. 
for using rnn.pack_padded_sequence you need to have - # length tensors on CPU - data = {k: v.to(self._device) for (k, v) in data.items()} - - sentinel_kw = {"__fwd_compatible_random_arg_%i" % int(random() * 100): None} - self._train_step_func(model=self._model, data=data, run_ctx=run_ctx, **sentinel_kw) - - losses_dict = run_ctx.losses - total_loss = run_ctx.total_loss() - - total_loss.backward() - if self.grad_clip_value > 0.0: - torch.nn.utils.clip_grad_value_(self._model.parameters(), self.grad_clip_value) - - if self._train_step % 2 == 0: - self._updater.get_optimizer().step() - self._updater.get_optimizer().zero_grad() - - return total_loss, losses_dict - -class Conv1DBlock(torch.nn.Module): - - def __init__(self, in_size, out_size, filter_size, dropout): - super().__init__() - self.conv = nn.Conv1d(in_size, out_size, filter_size, padding=filter_size//2) - self.bn = nn.BatchNorm1d(num_features=out_size) - self.dropout = dropout - - def forward(self, x): - """ - - :param x: should be [B, C, T] - :return: - """ - x = self.conv(x) - x = nn.functional.relu(x) - # TODO: does not consider masking! 
- x = self.bn(x) - x = nn.functional.dropout(x, p=self.dropout) - return x - - -class Model(torch.nn.Module): - """ - """ - - def __init__(self, conv_hidden_size: int, lstm_size: int, speaker_embedding_size: int, dropout: float, target_size: int, **kwargs): - super().__init__() - self.audio_embedding = nn.Linear(80, conv_hidden_size) - self.speaker_embedding = nn.Embedding(251, speaker_embedding_size) - self.convs = nn.Sequential( - Conv1DBlock(conv_hidden_size + speaker_embedding_size, conv_hidden_size, filter_size=5, dropout=dropout), - Conv1DBlock(conv_hidden_size, conv_hidden_size, filter_size=5, dropout=dropout), - Conv1DBlock(conv_hidden_size, conv_hidden_size, filter_size=5, dropout=dropout), - Conv1DBlock(conv_hidden_size, conv_hidden_size, filter_size=5, dropout=dropout), - Conv1DBlock(conv_hidden_size, conv_hidden_size, filter_size=5, dropout=dropout), - ) - self.blstm = nn.LSTM( - input_size=conv_hidden_size, - hidden_size=lstm_size, - bidirectional=True, - batch_first=True) - self.final_linear = nn.Linear(2*lstm_size, target_size) - self.lstm_size = lstm_size - self.target_size = target_size - self.dropout = dropout - - # initialize weights - self.apply(self._weight_init) - - @staticmethod - def _weight_init(module: torch.nn.Module): - if isinstance(module, torch.nn.Conv1d): - nn.init.xavier_normal_(module.weight) - - def forward( - self, - audio_features: torch.Tensor, - speaker_labels: torch.Tensor, - audio_features_len: torch.Tensor, - ): - speaker_embeddings: torch.Tensor = self.speaker_embedding(torch.squeeze(speaker_labels, dim=1)) - # manually broadcast speaker embeddings to each time step - speaker_embeddings = torch.repeat_interleave(torch.unsqueeze(speaker_embeddings, 1), audio_features.size()[1], dim=1) # [B, T, F] - audio_embedding = self.audio_embedding(audio_features) # [B, T, F] - - conv_in = torch.concat([speaker_embeddings, audio_embedding], dim=2) # [B, T, F] - conv_in = torch.swapaxes(conv_in, 1, 2) # [B, C, T] - conv_out = 
self.convs(conv_in) - blstm_in = torch.permute(conv_out, dims=(0, 2, 1)) # [B, C, T] -> [B, T, C] - - blstm_packed_in = nn.utils.rnn.pack_padded_sequence(blstm_in, audio_features_len.to("cpu"), batch_first=True) - blstm_packed_out, _ = self.blstm(blstm_packed_in) - blstm_out, _ = nn.utils.rnn.pad_packed_sequence(blstm_packed_out, padding_value=0.0, batch_first=True) # [B, T, C] - blstm_out = nn.functional.dropout1d(blstm_out, p=self.dropout) - logits = self.final_linear(blstm_out) # [B, T, #PHONES] - log_probs = torch.log_softmax(logits, dim=2) # [B, T, #PHONES] - - return log_probs - - -def train_step(*, model: Model, data, run_ctx, **kwargs): - audio_features = data["audio_features"] # [B, T, F] - audio_features_len = data["audio_features:size1"] # [B] - - # perform local length sorting for more efficient packing - audio_features_len, indices = torch.sort(audio_features_len, descending=True) - - audio_features = audio_features[indices, :, :] - phonemes = data["phonemes"][indices, :] # [B, T] (sparse) - phonemes_len = data["phonemes:size1"][indices] # [B, T] - speaker_labels = data["speaker_labels"][indices, :] # [B, 1] (sparse) - - logprobs = model( - audio_features=audio_features, - audio_features_len=audio_features_len, - speaker_labels=speaker_labels, - ) - - #params = 0 - #for parameter in model.parameters(): - # params += parameter.data.size().numel() - #print(params) - transposed_logprobs = torch.permute(logprobs, (1, 0, 2)) # Needs [T, B, F] - ctc_loss = nn.functional.ctc_loss(transposed_logprobs, phonemes, input_lengths=audio_features_len, target_lengths=phonemes_len, - blank=model.target_size-1, reduction="sum") - num_frames = torch.sum(phonemes_len) - run_ctx.mark_as_loss(name="ctc", loss=ctc_loss, inv_norm_factor=num_frames) diff --git a/users/rilling/experiments/librispeech/tts_architecture_improvement_23/pytorch_networks/ctc_aligner_v2.py b/users/rilling/experiments/librispeech/tts_architecture_improvement_23/pytorch_networks/ctc_aligner_v2.py 
deleted file mode 100644 index 098a772c6..000000000 --- a/users/rilling/experiments/librispeech/tts_architecture_improvement_23/pytorch_networks/ctc_aligner_v2.py +++ /dev/null @@ -1,265 +0,0 @@ -""" - -Aligner version 2 with some updates: - - Separate CTC and Final dropout values - - xavier_uniform instead of xavier_normal -""" -import tempfile -import h5py - -import torch -from torch import nn - - -class Conv1DBlock(torch.nn.Module): - def __init__(self, in_size, out_size, filter_size, dropout): - super().__init__() - self.conv = nn.Conv1d(in_size, out_size, filter_size, padding=filter_size // 2) - self.bn = nn.BatchNorm1d(num_features=out_size) - self.dropout = dropout - - def forward(self, x): - """ - - :param x: should be [B, C, T] - :return: - """ - x = self.conv(x) - x = nn.functional.relu(x) - # TODO: does not consider masking! - x = self.bn(x) - x = nn.functional.dropout(x, p=self.dropout, training=self.training) - return x - - -class Model(torch.nn.Module): - """ """ - - def __init__( - self, - conv_hidden_size: int, - lstm_size: int, - speaker_embedding_size: int, - conv_dropout: float, - final_dropout: float, - target_size: int, - **kwargs, - ): - super().__init__() - self.audio_embedding = nn.Linear(80, conv_hidden_size) - self.speaker_embedding = nn.Embedding(251, speaker_embedding_size) - self.convs = nn.Sequential( - Conv1DBlock( - conv_hidden_size + speaker_embedding_size, conv_hidden_size, filter_size=5, dropout=conv_dropout - ), - Conv1DBlock(conv_hidden_size, conv_hidden_size, filter_size=5, dropout=conv_dropout), - Conv1DBlock(conv_hidden_size, conv_hidden_size, filter_size=5, dropout=conv_dropout), - Conv1DBlock(conv_hidden_size, conv_hidden_size, filter_size=5, dropout=conv_dropout), - Conv1DBlock(conv_hidden_size, conv_hidden_size, filter_size=5, dropout=conv_dropout), - ) - self.blstm = nn.LSTM(input_size=conv_hidden_size, hidden_size=lstm_size, bidirectional=True, batch_first=True) - self.final_linear = nn.Linear(2 * lstm_size, 
target_size) - self.lstm_size = lstm_size - self.target_size = target_size - self.final_dropout = final_dropout - - # initialize weights - self.apply(self._weight_init) - - @staticmethod - def _weight_init(module: torch.nn.Module): - if isinstance(module, torch.nn.Conv1d): - nn.init.xavier_uniform_(module.weight) - - def forward( - self, - audio_features: torch.Tensor, - speaker_labels: torch.Tensor, - audio_features_len: torch.Tensor, - ): - speaker_embeddings: torch.Tensor = self.speaker_embedding(torch.squeeze(speaker_labels, dim=1)) - # manually broadcast speaker embeddings to each time step - speaker_embeddings = torch.repeat_interleave( - torch.unsqueeze(speaker_embeddings, 1), audio_features.size()[1], dim=1 - ) # [B, T, F] - audio_embedding = self.audio_embedding(audio_features) # [B, T, F] - - conv_in = torch.concat([speaker_embeddings, audio_embedding], dim=2) # [B, T, F] - conv_in = torch.swapaxes(conv_in, 1, 2) # [B, C, T] - conv_out = self.convs(conv_in) - blstm_in = torch.permute(conv_out, dims=(0, 2, 1)) # [B, C, T] -> [B, T, C] - - blstm_packed_in = nn.utils.rnn.pack_padded_sequence(blstm_in, audio_features_len.to("cpu"), batch_first=True) - blstm_packed_out, _ = self.blstm(blstm_packed_in) - blstm_out, _ = nn.utils.rnn.pad_packed_sequence( - blstm_packed_out, padding_value=0.0, batch_first=True - ) # [B, T, C] - blstm_out = nn.functional.dropout(blstm_out, p=self.final_dropout, training=self.training) - logits = self.final_linear(blstm_out) # [B, T, #PHONES] - log_probs = torch.log_softmax(logits, dim=2) # [B, T, #PHONES] - - return log_probs - - -def train_step(*, model: Model, data, run_ctx, **kwargs): - audio_features = data["audio_features"] # [B, T, F] - audio_features_len = data["audio_features:size1"] # [B] - - # perform local length sorting for more efficient packing - audio_features_len, indices = torch.sort(audio_features_len, descending=True) - - audio_features = audio_features[indices, :, :] - phonemes = data["phonemes"][indices, :] # 
[B, T] (sparse) - phonemes_len = data["phonemes:size1"][indices] # [B, T] - speaker_labels = data["speaker_labels"][indices, :] # [B, 1] (sparse) - - logprobs = model( - audio_features=audio_features, - audio_features_len=audio_features_len, - speaker_labels=speaker_labels, - ) - - # params = 0 - # for parameter in model.parameters(): - # params += parameter.data.size().numel() - # print(params) - transposed_logprobs = torch.permute(logprobs, (1, 0, 2)) # Needs [T, B, F] - ctc_loss = nn.functional.ctc_loss( - transposed_logprobs, - phonemes, - input_lengths=audio_features_len, - target_lengths=phonemes_len, - blank=model.target_size - 1, - reduction="sum", - ) - num_frames = torch.sum(phonemes_len) - run_ctx.mark_as_loss(name="ctc", loss=ctc_loss, inv_norm_factor=num_frames) - - -############# FORWARD STUFF ################ - -import numpy as np -from scipy.sparse import coo_matrix -from scipy.sparse.csgraph import dijkstra - - -def to_node_index(i, j, cols): - return cols * i + j - - -def from_node_index(node_index, cols): - return node_index // cols, node_index % cols - - -def to_adj_matrix(mat): - rows = mat.shape[0] - cols = mat.shape[1] - - row_ind = [] - col_ind = [] - data = [] - - for i in range(rows): - for j in range(cols): - - node = to_node_index(i, j, cols) - - if j < cols - 1: - right_node = to_node_index(i, j + 1, cols) - weight_right = mat[i, j + 1] - row_ind.append(node) - col_ind.append(right_node) - data.append(weight_right) - - if i < rows - 1 and j < cols: - bottom_node = to_node_index(i + 1, j, cols) - weight_bottom = mat[i + 1, j] - row_ind.append(node) - col_ind.append(bottom_node) - data.append(weight_bottom) - - if i < rows - 1 and j < cols - 1: - bottom_right_node = to_node_index(i + 1, j + 1, cols) - weight_bottom_right = mat[i + 1, j + 1] - row_ind.append(node) - col_ind.append(bottom_right_node) - data.append(weight_bottom_right) - - adj_mat = coo_matrix((data, (row_ind, col_ind)), shape=(rows * cols, rows * cols)) - return 
adj_mat.tocsr() - - -def extract_durations_with_dijkstra(tokens: np.array, log_probs: np.array) -> np.array: - """ - Extracts durations from the attention matrix by finding the shortest monotonic path from - top left to bottom right. - - :param tokens: - - """ - - neg_log_weights = -pred[:, tokens] - adj_matrix = to_adj_matrix(neg_log_weights) - dist_matrix, predecessors = dijkstra(csgraph=adj_matrix, directed=True, indices=0, return_predecessors=True) - path = [] - pr_index = predecessors[-1] - while pr_index != 0: - path.append(pr_index) - pr_index = predecessors[pr_index] - path.reverse() - - # append first and last node - path = [0] + path + [dist_matrix.size - 1] - cols = neg_log_weights.shape[1] - mel_text = {} - durations = np.zeros(tokens.shape[0], dtype=np.int32) - - # collect indices (mel, text) along the path - for node_index in path: - i, j = from_node_index(node_index, cols) - mel_text[i] = j - - for j in mel_text.values(): - durations[j] += 1 - - return durations - - -def forward_init_hook(run_ctx): - fd, fname = tempfile.mkstemp("durations.hdf") - # run_ctx.hdf_writer = - - -def forward_finish_hook(run_ctx): - pass - - -def forward_step(*, model: Model, data, run_ctx, **kwargs): - if not hasattr(run_ctx, "hdf_writer"): - run_ctx.hdf_writer = SimpleHDFWriter() - - audio_features = data["audio_features"] # [B, T, F] - audio_features_len = data["audio_features:size1"] # [B] - - # perform local length sorting for more efficient packing - audio_features_len, indices = torch.sort(audio_features_len, descending=True) - - audio_features = audio_features[indices, :, :] - phonemes = data["phonemes"][indices, :] # [B, T] (sparse) - phonemes_len = data["phonemes:size1"][indices] # [B, T] - speaker_labels = data["speaker_labels"][indices, :] # [B, 1] (sparse) - - logprobs = model( - audio_features=audio_features, - audio_features_len=audio_features_len, - speaker_labels=speaker_labels, - ) - - numpy_logprobs = logprobs.detach().cpu().numpy() - numpy_phonemes = 
phonemes.detach().cpu().numpy() - for single_logprobs, single_phonemes, len in zip(numpy_logprobs, numpy_phonemes, phonemes_len): - durations = extract_durations_with_dijkstra( - single_phonemes, - single_logprobs, - ) - assert numpy.sum(durations) == len.detach().cpu().numpy() diff --git a/users/rilling/experiments/librispeech/tts_architecture_improvement_23/rc_networks/__init__.py b/users/rilling/experiments/librispeech/tts_architecture_improvement_23/rc_networks/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/users/rilling/experiments/librispeech/tts_architecture_improvement_23/rc_networks/ctc_aligner/__init__.py b/users/rilling/experiments/librispeech/tts_architecture_improvement_23/rc_networks/ctc_aligner/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/users/rilling/experiments/librispeech/tts_architecture_improvement_23/rc_networks/ctc_aligner/conv_blstm_rec.py b/users/rilling/experiments/librispeech/tts_architecture_improvement_23/rc_networks/ctc_aligner/conv_blstm_rec.py deleted file mode 100644 index b8b5d895a..000000000 --- a/users/rilling/experiments/librispeech/tts_architecture_improvement_23/rc_networks/ctc_aligner/conv_blstm_rec.py +++ /dev/null @@ -1,211 +0,0 @@ -""" -Implementation of the CTC Aligner, updated for the new non-lazy init -""" -from typing import Tuple, Union -from returnn_common import nn -from ..shared.convolution import Conv1DStack -from .parameters import ConvBlstmRecParams - - -class TTSDecoder(nn.Module): - """ - Decoder for audio reconstruction - """ - - def __init__(self, in_dim: nn.Dim, lstm_dim: int = 512): - """ - :param lstm_dim: LSTM dimension size - """ - super(TTSDecoder, self).__init__() - self.lstm_dim = nn.FeatureDim("dec_lstm_dim", lstm_dim) - self.lstm_1_fw = nn.LSTM(in_dim=in_dim, out_dim=self.lstm_dim) - self.lstm_1_bw = nn.LSTM(in_dim=in_dim, out_dim=self.lstm_dim) - self.lstm_2_fw = nn.LSTM(in_dim=2*self.lstm_dim, out_dim=self.lstm_dim) - self.lstm_2_bw = 
nn.LSTM(in_dim=2*self.lstm_dim, out_dim=self.lstm_dim) - - def __call__( - self, phoneme_probs: nn.Tensor, speaker_embedding: nn.Tensor, audio_time: nn.Dim - ): - """ - :param phoneme_probs: - :param speaker_embedding: - :param audio_time: - :return: - """ - cat, _ = nn.concat( - (phoneme_probs, phoneme_probs.feature_dim), - (speaker_embedding, speaker_embedding.feature_dim), - allow_broadcast=True, - ) - lstm_fw, _ = self.lstm_1_fw(cat, spatial_dim=audio_time, direction=1) - lstm_bw, _ = self.lstm_1_bw(cat, spatial_dim=audio_time, direction=-1) - # TODO maybe dropout? - cat, _ = nn.concat((lstm_fw, lstm_fw.feature_dim), (lstm_bw, lstm_bw.feature_dim)) - lstm_fw, _ = self.lstm_2_fw(cat, spatial_dim=audio_time, direction=1) - lstm_bw, _ = self.lstm_2_bw(cat, spatial_dim=audio_time, direction=-1) - cat, _ = nn.concat((lstm_fw, lstm_fw.feature_dim), (lstm_bw, lstm_bw.feature_dim)) - return cat - - -class CTCAligner(nn.Module): - """ - CTC Aligner from Timur Schümann implemented in returnn common - """ - - def __init__( - self, - audio_feature_dim: nn.Dim, - speaker_label_dim: nn.Dim, - phoneme_dim: nn.Dim, - parameters: ConvBlstmRecParams, - ): - """ - - :param audio_feature_dim: - :param speaker_label_dim: - :param phoneme_dim: - :param parameters: - """ - super(CTCAligner, self).__init__() - - self.audio_hidden_dim = nn.FeatureDim("audio_hidden_dim", parameters.audio_emb_size) - self.speaker_embedding_dim = nn.FeatureDim("speaker_embedding_dim", parameters.speaker_emb_size) - self.hidden_dim = nn.FeatureDim("hidden_size", parameters.conv_hidden_size) - self.enc_lstm_dim = nn.FeatureDim("enc_lstm_dim", parameters.enc_lstm_size) - - self.audio_embedding = nn.Linear(in_dim=audio_feature_dim, out_dim=self.audio_hidden_dim) - self.speaker_embedding = nn.Embedding(in_dim=speaker_label_dim, out_dim=self.speaker_embedding_dim) - self.enc_conv_stack = Conv1DStack(in_dim=self.speaker_embedding_dim + self.audio_hidden_dim, dropout=[parameters.dropout]*5) - self.enc_lstm_fw = 
nn.LSTM(in_dim=self.enc_conv_stack.out_dim, out_dim=self.enc_lstm_dim) - self.enc_lstm_bw = nn.LSTM(in_dim=self.enc_conv_stack.out_dim, out_dim=self.enc_lstm_dim) - - self.softmax_dim = nn.FeatureDim("softmax_linear", phoneme_dim.dimension) - - self.softmax_lin = nn.Linear( - in_dim=2*self.enc_lstm_dim, - out_dim=self.softmax_dim, - ) - self.tts_decoder = TTSDecoder(in_dim=self.softmax_dim + self.speaker_embedding_dim, lstm_dim=parameters.rec_lstm_size) - self.reconstruction_lin = nn.Linear( - in_dim=2*self.tts_decoder.lstm_dim, - out_dim=nn.FeatureDim("reconstruction_dim", 80) - ) - - self.spectrogram_drop = parameters.dropout - self.reconstruction_scale = parameters.reconstruction_scale - self.training = parameters.training - - def __call__( - self, - audio_features: nn.Tensor, - speaker_labels: nn.Tensor, - phonemes: nn.Tensor, - audio_time: nn.Dim, - speaker_label_time: nn.Dim, - phoneme_time: nn.Dim, - ): - """ - :param audio_features: - :param speaker_labels: - :param phonemes: - :param audio_time: - :param speaker_label_time: - :return: - """ - speaker_label_notime = nn.squeeze(speaker_labels, axis=speaker_label_time) - - # embedding - speaker_embedding = self.speaker_embedding(speaker_label_notime) - audio_embedding = self.audio_embedding(audio_features) - # encoder - cat, _ = nn.concat( - (speaker_embedding, speaker_embedding.feature_dim), - (audio_embedding, audio_embedding.feature_dim), - allow_broadcast=True, - ) - enc_conv = self.enc_conv_stack(cat, time_dim=audio_time) - enc_fw, _ = self.enc_lstm_fw(enc_conv, spatial_dim=audio_time, direction=1) - enc_bw, _ = self.enc_lstm_bw(enc_conv, spatial_dim=audio_time, direction=-1) - cat, _ = nn.concat((enc_fw, enc_fw.feature_dim), (enc_bw, enc_bw.feature_dim)) - - # spectogram loss - spectogram_encoder = nn.dropout( - cat, dropout=self.spectrogram_drop, axis=cat.feature_dim - ) - spectogram_encoder = self.softmax_lin(spectogram_encoder) - softmax = nn.softmax(spectogram_encoder, 
axis=spectogram_encoder.feature_dim) - ctc = nn.ctc_loss(logits=spectogram_encoder, targets=phonemes) - ctc.mark_as_loss(name="ctc", custom_inv_norm_factor=nn.length(dim=phoneme_time)) - - if self.training: - # TTS decoder - tts_decoder = self.tts_decoder( - phoneme_probs=softmax, - speaker_embedding=speaker_embedding, - audio_time=audio_time, - ) - reconstruction_lin = self.reconstruction_lin(tts_decoder) - audio_features, _ = nn.replace_dim( - audio_features, - in_dim=audio_features.feature_dim, - out_dim=reconstruction_lin.feature_dim, - ) - reconstruction_loss = nn.mean_squared_difference( - reconstruction_lin, audio_features - ) - reconstruction_loss.mark_as_loss(name="mse", scale=self.reconstruction_scale) - return reconstruction_lin - else: - # replace the CTC blank label probability manually with zero - # within the RETURNN backend this will be replaced via safe_log with 1e-20 - slice_out, slice_dim = nn.slice( - softmax, axis=softmax.feature_dim, slice_start=0, slice_end=self.softmax_dim.dimension - 1, - ) - padding = nn.pad( - slice_out, - axes=slice_out.feature_dim, - mode="constant", - padding=[(0, 1)], - value=0, - ) - extract_alignment = nn.forced_alignment( - padding, align_target=phonemes, topology="ctc", input_type="prob", blank_included=True - ) - dur_dump = nn.hdf_dump(extract_alignment, filename="durations.hdf") - return dur_dump - - -def construct_network( - epoch: int, - audio_features: nn.Data, - phonemes: nn.Data, - speaker_labels: nn.Data, - **kwargs -): - """ - - :param epoch: - :param audio_features - :param phonemes - :param speaker_labels - :param kwargs: - :return: - """ - params = ConvBlstmRecParams(**kwargs) - net = CTCAligner( - audio_feature_dim=audio_features.feature_dim_or_sparse_dim, - speaker_label_dim=speaker_labels.feature_dim_or_sparse_dim, - phoneme_dim=phonemes.feature_dim_or_sparse_dim, - parameters=params, - ) - out = net( - audio_features=nn.get_extern_data(audio_features), - 
speaker_labels=nn.get_extern_data(speaker_labels), - phonemes=nn.get_extern_data(phonemes), - audio_time=audio_features.dim_tags[audio_features.time_dim_axis], - speaker_label_time=speaker_labels.dim_tags[speaker_labels.time_dim_axis], - phoneme_time=phonemes.dim_tags[phonemes.time_dim_axis], - ) - out.mark_as_default_output() - - return net diff --git a/users/rilling/experiments/librispeech/tts_architecture_improvement_23/rc_networks/ctc_aligner/parameters.py b/users/rilling/experiments/librispeech/tts_architecture_improvement_23/rc_networks/ctc_aligner/parameters.py deleted file mode 100644 index 555022984..000000000 --- a/users/rilling/experiments/librispeech/tts_architecture_improvement_23/rc_networks/ctc_aligner/parameters.py +++ /dev/null @@ -1,16 +0,0 @@ -""" -Parameters, to be imported from Sisyphus so make sure there is nothing unnecessary loaded -""" - -from dataclasses import dataclass - -@dataclass -class ConvBlstmRecParams: - audio_emb_size: int - speaker_emb_size: int - conv_hidden_size: int - enc_lstm_size: int - rec_lstm_size: int - dropout: float - reconstruction_scale: float - training: bool \ No newline at end of file diff --git a/users/rilling/experiments/librispeech/tts_architecture_improvement_23/rc_networks/shared/__init__.py b/users/rilling/experiments/librispeech/tts_architecture_improvement_23/rc_networks/shared/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/users/rilling/experiments/librispeech/tts_architecture_improvement_23/rc_networks/shared/convolution.py b/users/rilling/experiments/librispeech/tts_architecture_improvement_23/rc_networks/shared/convolution.py deleted file mode 100644 index b97377f90..000000000 --- a/users/rilling/experiments/librispeech/tts_architecture_improvement_23/rc_networks/shared/convolution.py +++ /dev/null @@ -1,128 +0,0 @@ -from typing import Tuple, Union, Sequence -from returnn_common import nn - - -class Conv1DBlock(nn.Module): - """ - 1D Convolutional Block with batchnorm, l2 
and full axis dropout - """ - - def __init__( - self, - in_dim: nn.Dim, - dim: Union[int, nn.Dim] = 256, - filter_size: int = 5, - bn_epsilon: float = 1e-5, - dropout: float = 0.5, - l2: float = 1e-07, - ): - """ - :param dim: feature dimension of the convolution - :param filter_size: filter size of the conv, int because we are doing 1D here - :param bn_epsilon: batch_normalization epsilon value - :param dropout: dropout value - :param l2: weight decay value - """ - super(Conv1DBlock, self).__init__() - if isinstance(dim, int): - self.conv_dim = nn.FeatureDim("conv_dim_%d" % dim, dim) - elif isinstance(dim, nn.Dim): - self.conv_dim = dim - else: - raise Exception("Wrong Dim given!") - self.conv = nn.Conv1d( - in_dim=in_dim, - out_dim=self.conv_dim, - filter_size=filter_size, - padding="same", - with_bias=False, - ) - self.bn = nn.BatchNorm( - in_dim=self.conv_dim, - epsilon=bn_epsilon, use_mask=False - ) - self.dropout = dropout - self.l2 = l2 - - def __call__(self, inp: nn.Tensor, time_dim: nn.SpatialDim): - conv, _ = self.conv(inp, in_spatial_dim=time_dim) - # set weight decay - for param in self.conv.parameters(): - param.weight_decay = self.l2 - - conv = nn.relu(conv) - bn = self.bn(conv) - drop = nn.dropout( - bn, dropout=self.dropout, axis=[nn.batch_dim, time_dim, bn.feature_dim] - ) - - return drop - - -class Conv1DStack(nn.Module): - """ - Stacks :class:`Conv1DBlock` modules - """ - - def __init__( - self, - in_dim: nn.Dim, - num_layers: int = 5, - dim_sizes: Tuple[int] = (256,), - filter_sizes: Tuple[int] = (5, 5, 5, 5, 5), - bn_epsilon: float = 1e-5, - dropout: Sequence[float] = (0.35, 0.35, 0.35, 0.35, 0.35), - l2: float = 1e-07, - ): - """ - :param num_layers: number of conv block layers - :param dim_sizes: dimensions for the convolutions in the block - :param filter_sizes: sizes for the filters in the block - :param bn_epsilon: batch_normalization epsilon value - :param dropout: dropout values - :param l2: weight decay value - """ - super(Conv1DStack, 
self).__init__() - assert ( - len(dim_sizes) == num_layers or len(dim_sizes) == 1 - ) # mismatch in dim_sizes - assert len(filter_sizes) == num_layers # mismatch in filter_sizes - assert len(dropout) == num_layers # mismatch in dropout - - self.num_layers = num_layers - # simplify tags a bit if possible - if len(set(dim_sizes)) == 1: # all sizes equal - out_dims = [nn.FeatureDim("conv_dim", dim_sizes[0])] * num_layers - else: - out_dims = [ - nn.FeatureDim("conv_dim_%s" % str(x), dim_sizes[x]) - for x in range(num_layers) - ] - - sequential_list = [] - temp_in_dim = in_dim - for x in range(num_layers): - sequential_list.append( - Conv1DBlock( - in_dim=temp_in_dim, - dim=out_dims[x], - filter_size=filter_sizes[x], - bn_epsilon=bn_epsilon, - dropout=dropout[x], - l2=l2, - ) - ) - temp_in_dim = out_dims[x] - - self.stack = nn.Sequential(sequential_list) - self.out_dim = out_dims[-1] - - def __call__(self, inp: nn.Tensor, time_dim: nn.Dim): - """ - Applies all conv blocks in sequence - - :param inp: input tensor - :return: - """ - out = self.stack(inp, time_dim=time_dim) - return out \ No newline at end of file diff --git a/users/rilling/experiments/librispeech/tts_architecture_improvement_23/serializer.py b/users/rilling/experiments/librispeech/tts_architecture_improvement_23/serializer.py deleted file mode 100644 index 473365147..000000000 --- a/users/rilling/experiments/librispeech/tts_architecture_improvement_23/serializer.py +++ /dev/null @@ -1,117 +0,0 @@ -import copy -from sisyphus import tk -from typing import Any, Dict - -from i6_experiments.common.setups.returnn_common.serialization import ( - Collection, - ExternData, - Import, - Network, - PythonEnlargeStackWorkaroundNonhashedCode, - ExplicitHash -) - -from i6_experiments.common.setups.returnn_pytorch.serialization import ( - Collection as TorchCollection, - PyTorchModel, -) - -from i6_experiments.users.rossenbach.common_setups.returnn.datastreams.base import Datastream - -PACKAGE = __package__ - - -def 
get_network_serializer( - training: bool, - returnn_common_root: tk.Path, - datastreams: Dict[str, Datastream], - network_module: str, - net_args: Dict[str, Any], - debug=False, - **kwargs, -) -> Collection: - """ - - :param training - :param returnn_common_root - :param datastreams: - :param net_args: - :param debug: - :return: - """ - extern_data = [ - datastream.as_nnet_constructor_data(key) - for key, datastream in datastreams.items() - ] - - rc_recursionlimit = PythonEnlargeStackWorkaroundNonhashedCode - rc_extern_data = ExternData(extern_data=extern_data) - - rc_package = PACKAGE + ".rc_networks" - rc_construction_code = Import(rc_package + "." + network_module + ".construct_network") - - d = copy.deepcopy(net_args) - if training is False: - d["training"] = False - - rc_network = Network( - net_func_name=rc_construction_code.object_name, - net_func_map={key: key for key in datastreams.keys()}, # names just have to match - net_kwargs=d, - ) - - serializer = Collection( - serializer_objects=[ - rc_recursionlimit, - rc_extern_data, - rc_construction_code, - rc_network, - ], - returnn_common_root=returnn_common_root, - make_local_package_copy=not debug, - packages={ - rc_package, - }, - ) - - return serializer - -def get_pytorch_serializer( - network_module: str, - net_args: Dict[str, Any], - use_custom_engine=False, - debug=False, - **kwargs -) -> TorchCollection: - - package = PACKAGE + ".pytorch_networks" - - pytorch_model_import = Import( - package + ".%s.Model" % network_module - ) - pytorch_train_step = Import( - package + ".%s.train_step" % network_module - ) - pytorch_model = PyTorchModel( - model_class_name=pytorch_model_import.object_name, - model_kwargs=net_args, - ) - serializer_objects = [ - pytorch_model_import, - pytorch_train_step, - pytorch_model, - ] - if use_custom_engine: - pytorch_engine = Import( - package + ".%s.CustomEngine" % network_module - ) - serializer_objects.append(pytorch_engine) - serializer = TorchCollection( - 
serializer_objects=serializer_objects, - make_local_package_copy=not debug, - packages={ - package, - }, - ) - - return serializer diff --git a/users/rilling/experiments/librispeech/tts_architecture_improvement_23/storage.py b/users/rilling/experiments/librispeech/tts_architecture_improvement_23/storage.py deleted file mode 100644 index d3e017c53..000000000 --- a/users/rilling/experiments/librispeech/tts_architecture_improvement_23/storage.py +++ /dev/null @@ -1,13 +0,0 @@ -from sisyphus import tk - -synthetic_ogg_zip_data = {} - -def add_ogg_zip(name: str, ogg_zip: tk.Path): - global synthetic_ogg_zip_data - synthetic_ogg_zip_data[name] = ogg_zip - -duration_alignments = {} - -def add_duration(name: str, duration_hdf: tk.Path): - global duration_alignments - duration_alignments[name] = duration_hdf \ No newline at end of file From 729c2776d9df347fd74ef8f5910439826b7e7ce6 Mon Sep 17 00:00:00 2001 From: Albert Zeyer Date: Thu, 16 May 2024 10:32:51 +0200 Subject: [PATCH 029/227] small fixes --- users/zeyer/datasets/librispeech.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/users/zeyer/datasets/librispeech.py b/users/zeyer/datasets/librispeech.py index 5fdf35cb7..e68134709 100644 --- a/users/zeyer/datasets/librispeech.py +++ b/users/zeyer/datasets/librispeech.py @@ -97,9 +97,9 @@ def _get_spm_vocab( @cache def _get_vocab_by_str(vocab: str) -> Union[SentencePieceModel, Bpe]: if re.match("^spm[0-9]+.*$", vocab): - return _get_spm_vocab(dim=vocab[3:], model_type=SentencePieceType.UNIGRAM) + return _get_spm_vocab(dim=vocab[len("spm") :], model_type=SentencePieceType.UNIGRAM) elif re.match("^spm_bpe[0-9]+.*$", vocab): - return _get_spm_vocab(dim=vocab[3:], model_type=SentencePieceType.BPE) + return _get_spm_vocab(dim=vocab[len("spm_bpe") :], model_type=SentencePieceType.BPE) elif vocab == "bpe10k": # predefined return bpe10k else: From 69b4185a7c0db2c22f775dbe2eb09dd4e9f948ce Mon Sep 17 00:00:00 2001 From: Albert Zeyer Date: Thu, 16 May 2024 
21:27:31 +0200 Subject: [PATCH 030/227] spm20k --- users/zeyer/datasets/librispeech.py | 2 +- users/zeyer/experiments/exp2024_04_23_baselines/ctc.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/users/zeyer/datasets/librispeech.py b/users/zeyer/datasets/librispeech.py index e68134709..020e285fe 100644 --- a/users/zeyer/datasets/librispeech.py +++ b/users/zeyer/datasets/librispeech.py @@ -57,7 +57,7 @@ def _get_spm_vocab( ) -> SentencePieceModel: if isinstance(dim, str): # Not sure if power-of-two or just multiple-of-64, but 10240 has more 2s in it (2048*5) than 10048. - dim = {"10k": 10_240, "5k": 5_120, "4k": 4_096, "1k": 1_024}[dim] + dim = {"20k": 20_480, "10k": 10_240, "5k": 5_120, "4k": 4_096, "1k": 1_024}[dim] assert isinstance(dim, int) and dim >= 10 # https://github.com/google/sentencepiece/blob/master/doc/options.md diff --git a/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py b/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py index 65265722d..25943285b 100644 --- a/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py +++ b/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py @@ -31,6 +31,7 @@ def py(): for vocab in [ + "spm20k", "bpe10k", # 8.23 "spm10k", # 8.12 "spm_bpe10k", From 40000a5180e7b3a7b8fc3acd202011708f175cb2 Mon Sep 17 00:00:00 2001 From: Nick Rossenbach Date: Fri, 17 May 2024 17:16:23 +0200 Subject: [PATCH 031/227] more ctc and rnn-t librispeech experiments --- .../experiments/ctc_bpe/baseline.py | 34 +++++- .../experiments/ctc_bpe/low_vocab_exps.py | 61 ++++++++--- .../experiments/rnnt_bpe/low_vocab_exps.py | 100 ++++++++++++++++++ 3 files changed, 179 insertions(+), 16 deletions(-) diff --git a/users/rossenbach/experiments/librispeech/ctc_rnnt_standalone_2024/experiments/ctc_bpe/baseline.py b/users/rossenbach/experiments/librispeech/ctc_rnnt_standalone_2024/experiments/ctc_bpe/baseline.py index e46ee5583..34693e1cb 100644 --- 
a/users/rossenbach/experiments/librispeech/ctc_rnnt_standalone_2024/experiments/ctc_bpe/baseline.py +++ b/users/rossenbach/experiments/librispeech/ctc_rnnt_standalone_2024/experiments/ctc_bpe/baseline.py @@ -13,7 +13,7 @@ from ...data.bpe import build_bpe_training_datasets, get_text_lexicon from ...default_tools import RETURNN_EXE, MINI_RETURNN_ROOT from ...lm import get_4gram_binary_lm -from ...pipeline import training, prepare_asr_model, search +from ...pipeline import training, prepare_asr_model, search, ASRModel from ...storage import add_ctc_model @@ -60,6 +60,9 @@ def bpe_ls960_1023_base(): "returnn_root": MINI_RETURNN_ROOT, } + from ...pytorch_networks.ctc.decoder.flashlight_ctc_v1 import DecoderConfig + from ...pytorch_networks.ctc.decoder.greedy_bpe_ctc_v3 import DecoderConfig as GreedyDecoderConfig + def tune_and_evaluate_helper(training_name, asr_model, base_decoder_config, lm_scales, prior_scales): tune_parameters = [] tune_values_clean = [] @@ -94,9 +97,26 @@ def tune_and_evaluate_helper(training_name, asr_model, base_decoder_config, lm_s decoder_args={"config": asdict(decoder_config)}, test_dataset_tuples={key: test_dataset_tuples[key]}, **default_returnn ) - - - from ...pytorch_networks.ctc.decoder.flashlight_ctc_v1 import DecoderConfig + + def greedy_search_helper( + training_name: str, + asr_model: ASRModel, + decoder_config: GreedyDecoderConfig + ): + # remove prior if exists + asr_model = copy.deepcopy(asr_model) + asr_model.prior_file = None + + search_name = training_name + "/search_greedy" + search_jobs, wers = search( + search_name, + forward_config={}, + asr_model=asr_model, + decoder_module="ctc.decoder.greedy_bpe_ctc_v3", + decoder_args={"config": asdict(decoder_config)}, + test_dataset_tuples=dev_dataset_tuples, + **default_returnn, + ) default_decoder_config_bpe5000 = DecoderConfig( lexicon=get_text_lexicon(prefix=prefix_name, librispeech_key="train-other-960", bpe_size=5000), @@ -191,6 +211,12 @@ def 
tune_and_evaluate_helper(training_name, asr_model, base_decoder_config, lm_s add_ctc_model("ls960_ctc_bpe_5k." + network_module + ".512dim_sub6_24gbgpu_50eps_ckpt500", asr_model) tune_and_evaluate_helper(training_name, asr_model, default_decoder_config_bpe5000, lm_scales=[1.6, 1.8, 2.0], prior_scales=[0.2, 0.3, 0.4]) + + greedy_decoder_config = GreedyDecoderConfig( + returnn_vocab=label_datastream_bpe5000.vocab, + ) + greedy_search_helper(training_name, asr_model=asr_model, decoder_config=greedy_decoder_config) + for token in [16, 32, 64]: decoder_config = copy.deepcopy(default_decoder_config_bpe5000) decoder_config.lm_weight = 1.8 diff --git a/users/rossenbach/experiments/librispeech/ctc_rnnt_standalone_2024/experiments/ctc_bpe/low_vocab_exps.py b/users/rossenbach/experiments/librispeech/ctc_rnnt_standalone_2024/experiments/ctc_bpe/low_vocab_exps.py index 54e3de754..f78e75ee1 100644 --- a/users/rossenbach/experiments/librispeech/ctc_rnnt_standalone_2024/experiments/ctc_bpe/low_vocab_exps.py +++ b/users/rossenbach/experiments/librispeech/ctc_rnnt_standalone_2024/experiments/ctc_bpe/low_vocab_exps.py @@ -13,7 +13,7 @@ from ...data.bpe import build_bpe_training_datasets, get_text_lexicon from ...default_tools import RETURNN_EXE, MINI_RETURNN_ROOT from ...lm import get_4gram_binary_lm -from ...pipeline import training, prepare_asr_model, search +from ...pipeline import training, prepare_asr_model, search, ASRModel from ...storage import add_ctc_model @@ -37,6 +37,7 @@ def bpe_ls960_1023_low_vocab_test(): } from ...pytorch_networks.ctc.decoder.flashlight_ctc_v1 import DecoderConfig + from ...pytorch_networks.ctc.decoder.greedy_bpe_ctc_v3 import DecoderConfig as GreedyDecoderConfig @@ -133,6 +134,26 @@ def tune_and_evaluate_helper(training_name, dev_dataset_tuples, test_dataset_tup **default_returnn ) + def greedy_search_helper( + training_name: str, + asr_model: ASRModel, + decoder_config: GreedyDecoderConfig + ): + # remove prior if exists + asr_model = 
copy.deepcopy(asr_model) + asr_model.prior_file = None + + search_name = training_name + "/search_greedy" + search_jobs, wers = search( + search_name, + forward_config={}, + asr_model=asr_model, + decoder_module="ctc.decoder.greedy_bpe_ctc_v3", + decoder_args={"config": asdict(decoder_config)}, + test_dataset_tuples={**dev_dataset_tuples, **test_dataset_tuples}, + **default_returnn, + ) + for BPE_SIZE in [128, 256, 512, 1024]: # build the training datasets object containing train, cv, dev-train and the extern_data dict @@ -271,20 +292,36 @@ def tune_and_evaluate_helper(training_name, dev_dataset_tuples, test_dataset_tup for search_job in search_jobs: search_job.rqmt["sbatch_args"] = "-A rescale_speed -p rescale_amd" + if BPE_SIZE == 128 or BPE_SIZE == 512: # Extra long training for the BPE 128 one train_args_conv_first_ep100 = copy.deepcopy(train_args_conv_first) train_args_conv_first_ep100["config"]["learning_rates"] = list(np.linspace(7e-6, 5e-4, 240)) + list( np.linspace(5e-4, 5e-5, 720)) + list(np.linspace(5e-5, 1e-7, 40)) train_args_conv_first_ep100["config"]["gradient_clip"] = 1.0 - training_name = prefix_name + "/" + str( - BPE_SIZE) + "/" + network_module_conv_first + ".512dim_sub4_24gbgpu_100eps" - train_job = training(training_name, train_data_bpe, train_args_conv_first_ep100, num_epochs=1000, **default_returnn) - train_job.rqmt["gpu_mem"] = 24 - asr_model = prepare_asr_model( - training_name, train_job, train_args_conv_first_ep100, with_prior=True, datasets=train_data_bpe, - get_specific_checkpoint=1000 - ) - tune_and_evaluate_helper(training_name, dev_dataset_tuples, test_dataset_tuples, asr_model, - default_decoder_config_bpe, lm_scales=[1.6, 1.8, 2.0], - prior_scales=[0.2, 0.3, 0.4]) \ No newline at end of file + train_args_conv_first_ep100_sp = copy.deepcopy(train_args_conv_first_ep100) + train_args_conv_first_ep100_sp["use_speed_perturbation"] = True + + train_args_pairs = [ + (".512dim_sub4_24gbgpu_100eps", train_args_conv_first_ep100), + 
(".512dim_sub4_24gbgpu_100eps_sp", train_args_conv_first_ep100_sp) + ] + + for name, train_args in train_args_pairs: + training_name = prefix_name + "/" + str( + BPE_SIZE) + "/" + network_module_conv_first + name + train_job = training(training_name, train_data_bpe, train_args, num_epochs=1000, **default_returnn) + train_job.rqmt["gpu_mem"] = 24 + asr_model = prepare_asr_model( + training_name, train_job, train_args, with_prior=True, datasets=train_data_bpe, + get_specific_checkpoint=1000 + ) + add_ctc_model(f"ls960_ctc_bpe_{BPE_SIZE}." + network_module_conv_first + name + "_ckpt1000", + asr_model) + tune_and_evaluate_helper(training_name, dev_dataset_tuples, test_dataset_tuples, asr_model, + default_decoder_config_bpe, lm_scales=[1.6, 1.8, 2.0], + prior_scales=[0.2, 0.3, 0.4]) + greedy_decoder_config = GreedyDecoderConfig( + returnn_vocab=label_datastream_bpe.vocab, + ) + greedy_search_helper(training_name, asr_model=asr_model, decoder_config=greedy_decoder_config) \ No newline at end of file diff --git a/users/rossenbach/experiments/librispeech/ctc_rnnt_standalone_2024/experiments/rnnt_bpe/low_vocab_exps.py b/users/rossenbach/experiments/librispeech/ctc_rnnt_standalone_2024/experiments/rnnt_bpe/low_vocab_exps.py index d1b5ab180..f6e26cb62 100644 --- a/users/rossenbach/experiments/librispeech/ctc_rnnt_standalone_2024/experiments/rnnt_bpe/low_vocab_exps.py +++ b/users/rossenbach/experiments/librispeech/ctc_rnnt_standalone_2024/experiments/rnnt_bpe/low_vocab_exps.py @@ -230,4 +230,104 @@ def evaluate_helper( decoder_config_bpe5000, ) + if BPE_SIZE == 128: + # DO HERE AGAIN IN CORRECT + KEEP = [300, 400, 500, 600, 700, 800, 900, 950, 980] + network_module = "rnnt.conformer_1023.i6modelsV1_VGG4LayerActFrontendV1_v9_i6_native_conv_first" + train_args_warprnnt_fullspec_from_ctc100 = copy.deepcopy(train_args_fullspec) + train_args_warprnnt_fullspec_from_ctc100["network_module"] = network_module + train_args_warprnnt_fullspec_from_ctc100["config"]["preload_from_files"] = 
{ + "encoder": { + "filename": get_ctc_model( + f"ls960_ctc_bpe_{BPE_SIZE}.ctc.conformer_1023.i6modelsV1_VGG4LayerActFrontendV1_v6_conv_first.512dim_sub4_24gbgpu_100eps_ckpt1000" + ).checkpoint, + "init_for_train": True, + "ignore_missing": True, + } + } + train_args_warprnnt_fullspec_from_ctc100["config"]["learning_rates"] = list( + np.linspace(5e-5, 5e-4, 240)) + list( + np.linspace(5e-4, 5e-5, 720)) + list(np.linspace(5e-5, 1e-7, 40)) + train_args_warprnnt_fullspec_from_ctc100["config"]["cleanup_old_models"] = { + "keep_last_n": 4, + "keep_best_n": 4, + "keep": KEEP + } + train_args_warprnnt_fullspec_from_ctc100["config"]["gradient_clip"] = 1.0 + + # small BPE saves a lot of memory, train without grad accum + train_args_warprnnt_fullspec_from_ctc100_noacumm = copy.deepcopy(train_args_warprnnt_fullspec_from_ctc100) + train_args_warprnnt_fullspec_from_ctc100_noacumm["config"]["accum_grad_multiple_step"] = 1 + train_args_warprnnt_fullspec_from_ctc100_noacumm["config"]["batch_size"] = 240 * 16000 + + training_name = prefix_name + "/" + str( + BPE_SIZE) + "/" + network_module + ".512dim_sub4_24gbgpu_100eps_accum1_gradclip_fullspec1_continue_from_ctc100eps" + train_job = training(training_name, train_data_bpe, train_args_warprnnt_fullspec_from_ctc100_noacumm, + num_epochs=1000, **default_returnn) + train_job.rqmt["gpu_mem"] = 24 + train_job.set_env("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True") + for keep in KEEP: + asr_model = prepare_asr_model( + training_name, train_job, train_args_warprnnt_fullspec_from_ctc100_noacumm, with_prior=False, + datasets=train_data_bpe, get_specific_checkpoint=keep + ) + evaluate_helper( + training_name + "/keep_%i" % keep, + asr_model, + decoder_config_bpe5000, + use_gpu=True + ) + asr_model = prepare_asr_model( + training_name, train_job, train_args_warprnnt_fullspec_from_ctc100_noacumm, with_prior=False, + datasets=train_data_bpe, get_specific_checkpoint=1000 + ) + evaluate_helper( + training_name + "/keep_%i" % 1000, + 
asr_model, + decoder_config_bpe5000, + use_gpu=True, + ) + + asr_model = prepare_asr_model( + training_name, train_job, train_args_warprnnt_fullspec_from_ctc100_noacumm, with_prior=False, + datasets=train_data_bpe, get_best_averaged_checkpoint=(1, "dev_loss_rnnt"), + ) + evaluate_helper( + training_name + "/best", + asr_model, + decoder_config_bpe5000, + use_gpu=True, + ) + + # With speed perturbation + train_args_warprnnt_fullspec_from_ctc100_noacumm_sp = copy.deepcopy(train_args_warprnnt_fullspec_from_ctc100_noacumm) + train_args_warprnnt_fullspec_from_ctc100_noacumm["use_speed_perturbation"] = True + training_name = prefix_name + "/" + str( + BPE_SIZE) + "/" + network_module + ".512dim_sub4_24gbgpu_100eps_accum1_gradclip_fullspec1_sp_continue_from_ctc100eps" + train_job = training(training_name, train_data_bpe, train_args_warprnnt_fullspec_from_ctc100_noacumm, + num_epochs=1000, **default_returnn) + train_job.rqmt["gpu_mem"] = 24 + train_job.set_env("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True") + for keep in KEEP: + asr_model = prepare_asr_model( + training_name, train_job, train_args_warprnnt_fullspec_from_ctc100_noacumm, with_prior=False, + datasets=train_data_bpe, get_specific_checkpoint=keep + ) + evaluate_helper( + training_name + "/keep_%i" % keep, + asr_model, + decoder_config_bpe5000, + use_gpu=True + ) + asr_model = prepare_asr_model( + training_name, train_job, train_args_warprnnt_fullspec_from_ctc100_noacumm, with_prior=False, + datasets=train_data_bpe, get_specific_checkpoint=1000 + ) + evaluate_helper( + training_name + "/keep_%i" % 1000, + asr_model, + decoder_config_bpe5000, + use_gpu=True, + ) + From 9f067cbc603ed2f6e32ba9bd63dd297ea318147c Mon Sep 17 00:00:00 2001 From: Nick Rossenbach Date: Fri, 17 May 2024 17:16:47 +0200 Subject: [PATCH 032/227] add greedy decoder --- .../ctc/decoder/greedy_bpe_ctc_v3.py | 86 +++++++++++++++++++ 1 file changed, 86 insertions(+) create mode 100644 
example_setups/librispeech/ctc_rnnt_standalone_2024/pytorch_networks/ctc/decoder/greedy_bpe_ctc_v3.py diff --git a/example_setups/librispeech/ctc_rnnt_standalone_2024/pytorch_networks/ctc/decoder/greedy_bpe_ctc_v3.py b/example_setups/librispeech/ctc_rnnt_standalone_2024/pytorch_networks/ctc/decoder/greedy_bpe_ctc_v3.py new file mode 100644 index 000000000..1738f7442 --- /dev/null +++ b/example_setups/librispeech/ctc_rnnt_standalone_2024/pytorch_networks/ctc/decoder/greedy_bpe_ctc_v3.py @@ -0,0 +1,86 @@ +""" +Greedy CTC decoder without any extras + +v3: add config objects +""" +from dataclasses import dataclass +import time +import torch + + +@dataclass +class DecoderConfig: + returnn_vocab: str + + +@dataclass +class ExtraConfig: + # used for RTF logging + print_rtf: bool = True + sample_rate: int = 16000 + + # Hypothesis logging + print_hypothesis: bool = True + + +def forward_init_hook(run_ctx, **kwargs): + # we are storing durations, but call it output.hdf to match + # the default output of the ReturnnForwardJob + config = DecoderConfig(**kwargs["config"]) + extra_config_dict = kwargs.get("extra_config", {}) + extra_config = ExtraConfig(**extra_config_dict) + + run_ctx.recognition_file = open("search_out.py", "wt") + run_ctx.recognition_file.write("{\n") + + from returnn.datasets.util.vocabulary import Vocabulary + vocab = Vocabulary.create_vocab( + vocab_file=config.returnn_vocab, unknown_label=None) + run_ctx.labels = vocab.labels + + run_ctx.print_rtf = extra_config.print_rtf + if run_ctx.print_rtf: + run_ctx.running_audio_len_s = 0 + run_ctx.total_time = 0 + + run_ctx.print_hypothesis = extra_config.print_hypothesis + +def forward_finish_hook(run_ctx, **kwargs): + run_ctx.recognition_file.write("}\n") + run_ctx.recognition_file.close() + + print("Total-time: %.2f, Batch-RTF: %.3f" % (run_ctx.total_time, run_ctx.total_time / run_ctx.running_audio_len_s)) + + +def forward_step(*, model, data, run_ctx, **kwargs): + raw_audio = data["raw_audio"] # [B, T', F] + 
raw_audio_len = data["raw_audio:size1"] # [B] + + audio_len_batch = torch.sum(raw_audio_len).detach().cpu().numpy() / 16000 + + if run_ctx.print_rtf: + run_ctx.running_audio_len_s += audio_len_batch + am_start = time.time() + + logprobs, audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + batch_indices = [] + for lp, l in zip(logprobs, audio_features_len): + batch_indices.append(torch.unique_consecutive(torch.argmax(lp[:l], dim=-1), dim=0).detach().cpu().numpy()) + + if run_ctx.print_rtf: + am_time = time.time() - am_start + run_ctx.total_time += am_time + print("Batch-time: %.2f, Batch-RTF: %.3f" % (am_time, am_time / audio_len_batch)) + + tags = data["seq_tag"] + + for indices, tag in zip(batch_indices, tags): + sequence = [run_ctx.labels[idx] for idx in indices if idx < len(run_ctx.labels)] + sequence = [s for s in sequence if (not s.startswith("<") and not s.startswith("["))] + text = " ".join(sequence).replace("@@ ", "") + if run_ctx.print_hypothesis: + print(text) + run_ctx.recognition_file.write("%s: %s,\n" % (repr(tag), repr(text))) From 411a162f55b31d190dd0a19c7cf4ad119bb074dd Mon Sep 17 00:00:00 2001 From: Nick Rossenbach Date: Fri, 17 May 2024 18:10:42 +0200 Subject: [PATCH 033/227] black --- .../experiments/ctc_bpe/baseline.py | 23 +++++++++++++++++++ .../ctc/decoder/greedy_bpe_ctc_v3.py | 7 +++--- 2 files changed, 27 insertions(+), 3 deletions(-) diff --git a/example_setups/librispeech/ctc_rnnt_standalone_2024/experiments/ctc_bpe/baseline.py b/example_setups/librispeech/ctc_rnnt_standalone_2024/experiments/ctc_bpe/baseline.py index d423f4a16..e77d46613 100644 --- a/example_setups/librispeech/ctc_rnnt_standalone_2024/experiments/ctc_bpe/baseline.py +++ b/example_setups/librispeech/ctc_rnnt_standalone_2024/experiments/ctc_bpe/baseline.py @@ -61,6 +61,7 @@ def bpe_ls960_1023_base(): } from ...pytorch_networks.ctc.decoder.flashlight_ctc_v1 import DecoderConfig + from ...pytorch_networks.ctc.decoder.greedy_bpe_ctc_v3 
import DecoderConfig as GreedyDecoderConfig def tune_and_evaluate_helper( training_name: str, @@ -121,6 +122,22 @@ def tune_and_evaluate_helper( **default_returnn, ) + def greedy_search_helper(training_name: str, asr_model: ASRModel, decoder_config: GreedyDecoderConfig): + # remove prior if exists + asr_model = copy.deepcopy(asr_model) + asr_model.prior_file = None + + search_name = training_name + "/search_greedy" + search_jobs, wers = search( + search_name, + forward_config={}, + asr_model=asr_model, + decoder_module="ctc.decoder.greedy_bpe_ctc_v3", + decoder_args={"config": asdict(decoder_config)}, + test_dataset_tuples=dev_dataset_tuples, + **default_returnn, + ) + default_decoder_config_bpe5000 = DecoderConfig( lexicon=get_text_lexicon(prefix=prefix_name, librispeech_key="train-other-960", bpe_size=5000), returnn_vocab=label_datastream_bpe5000.vocab, @@ -200,6 +217,7 @@ def tune_and_evaluate_helper( "max_seq_length": {"audio_features": 35 * 16000}, "accum_grad_multiple_step": 1, "torch_amp_options": {"dtype": "bfloat16"}, + "gradient_clip": 1.0, } network_module = "ctc.conformer_1023.i6modelsV1_VGG4LayerActFrontendV1_v6" @@ -224,3 +242,8 @@ def tune_and_evaluate_helper( lm_scales=[1.6, 1.8, 2.0], prior_scales=[0.2, 0.3, 0.4], ) + + greedy_decoder_config = GreedyDecoderConfig( + returnn_vocab=label_datastream_bpe5000.vocab, + ) + greedy_search_helper(training_name=training_name, asr_model=asr_model, decoder_config=greedy_decoder_config) diff --git a/example_setups/librispeech/ctc_rnnt_standalone_2024/pytorch_networks/ctc/decoder/greedy_bpe_ctc_v3.py b/example_setups/librispeech/ctc_rnnt_standalone_2024/pytorch_networks/ctc/decoder/greedy_bpe_ctc_v3.py index 1738f7442..323c68488 100644 --- a/example_setups/librispeech/ctc_rnnt_standalone_2024/pytorch_networks/ctc/decoder/greedy_bpe_ctc_v3.py +++ b/example_setups/librispeech/ctc_rnnt_standalone_2024/pytorch_networks/ctc/decoder/greedy_bpe_ctc_v3.py @@ -29,13 +29,13 @@ def forward_init_hook(run_ctx, **kwargs): 
config = DecoderConfig(**kwargs["config"]) extra_config_dict = kwargs.get("extra_config", {}) extra_config = ExtraConfig(**extra_config_dict) - + run_ctx.recognition_file = open("search_out.py", "wt") run_ctx.recognition_file.write("{\n") from returnn.datasets.util.vocabulary import Vocabulary - vocab = Vocabulary.create_vocab( - vocab_file=config.returnn_vocab, unknown_label=None) + + vocab = Vocabulary.create_vocab(vocab_file=config.returnn_vocab, unknown_label=None) run_ctx.labels = vocab.labels run_ctx.print_rtf = extra_config.print_rtf @@ -45,6 +45,7 @@ def forward_init_hook(run_ctx, **kwargs): run_ctx.print_hypothesis = extra_config.print_hypothesis + def forward_finish_hook(run_ctx, **kwargs): run_ctx.recognition_file.write("}\n") run_ctx.recognition_file.close() From bb9f282aa1d071bd818e794709a0ac01dd0f3901 Mon Sep 17 00:00:00 2001 From: Mohammad Zeineldeen Date: Fri, 17 May 2024 19:58:31 +0200 Subject: [PATCH 034/227] add ebranchformer --- .../asr/encoder/ebranchformer_encoder.py | 463 +++--------------- 1 file changed, 58 insertions(+), 405 deletions(-) diff --git a/users/zeineldeen/models/asr/encoder/ebranchformer_encoder.py b/users/zeineldeen/models/asr/encoder/ebranchformer_encoder.py index 01c65076c..4eb066d42 100644 --- a/users/zeineldeen/models/asr/encoder/ebranchformer_encoder.py +++ b/users/zeineldeen/models/asr/encoder/ebranchformer_encoder.py @@ -1,252 +1,16 @@ -from i6_experiments.users.zeineldeen.modules.network import ReturnnNetwork +from i6_experiments.users.zeineldeen.models.asr.encoder.conformer_encoder_v2 import ConformerEncoderV2 -class EBranchformerEncoder: +class EBranchformerEncoder(ConformerEncoderV2): """ Implement E-branchformer Encoder Architecture * Ref: https://arxiv.org/pdf/2210.00077.pdf """ - def __init__( - self, - input="data", - input_layer="conv-6", - input_layer_conv_act="relu", - num_blocks=16, - conv_kernel_size=32, - specaug=True, - pos_enc="rel", - activation="swish", - ff_dim=512, - ff_bias=True, - 
ctc_loss_scale=None, - dropout=0.1, - att_dropout=0.1, - enc_key_dim=256, - att_num_heads=4, - target="bpe", - l2=0.0, - lstm_dropout=0.1, - rec_weight_dropout=0.0, - with_ctc=False, - native_ctc=False, - ctc_dropout=0.0, - ctc_l2=0.0, - ctc_opts=None, - subsample=None, - start_conv_init=None, - conv_module_init=None, - mhsa_init=None, - mhsa_out_init=None, - ff_init=None, - rel_pos_clipping=16, - dropout_in=0.1, - batch_norm_opts=None, - self_att_l2=0.0, - sandwich_conv=False, - add_to_prefix_name=None, - output_layer_name="encoder", - rezero=False, - ): - """ - :param str input: input layer name - :param str input_layer: type of input layer which does subsampling - :param int num_blocks: number of Conformer blocks - :param int conv_kernel_size: kernel size for conv layers in Convolution module - :param bool|None specaug: If true, then SpecAug is appliedi wi - :param str|None activation: activation used to sandwich modules - :param bool final_norm: if True, apply layer norm to the output of the encoder - :param int|None ff_dim: dimension of the first linear layer in FF module - :param str|None ff_init: FF layers initialization - :param bool|None ff_bias: If true, then bias is used for the FF layers - :param float embed_dropout: dropout applied to the source embedding - :param float dropout: general dropout - :param float att_dropout: dropout applied to attention weights - :param int enc_key_dim: encoder key dimension, also denoted as d_model, or d_key - :param int att_num_heads: the number of attention heads - :param str target: target labels key name - :param float l2: add L2 regularization for trainable weights parameters - :param float lstm_dropout: dropout applied to the input of the LSTMs in case they are used - :param float rec_weight_dropout: dropout applied to the hidden-to-hidden weight matrices of the LSTM in case used - :param bool with_ctc: if true, CTC loss is used - :param bool native_ctc: if true, use returnn native ctc implementation instead of TF 
implementation - :param float ctc_dropout: dropout applied on input to ctc - :param float ctc_l2: L2 applied to the weight matrix of CTC softmax - :param dict[str] ctc_opts: options for CTC - :param bool rezero: rezero initialization, ref: https://arxiv.org/abs/2003.04887 - """ - - self.input = input - self.input_layer = input_layer - self.input_layer_conv_act = input_layer_conv_act - - self.num_blocks = num_blocks - self.conv_kernel_size = conv_kernel_size - - self.pos_enc = pos_enc - self.rel_pos_clipping = rel_pos_clipping - - self.ff_bias = ff_bias - - self.specaug = specaug - - self.activation = activation - - self.dropout = dropout - self.att_dropout = att_dropout - self.lstm_dropout = lstm_dropout - - self.dropout_in = dropout_in - - # key and value dimensions are the same - self.enc_key_dim = enc_key_dim - self.enc_value_dim = enc_key_dim - self.att_num_heads = att_num_heads - self.enc_key_per_head_dim = enc_key_dim // att_num_heads - self.enc_val_per_head_dim = enc_key_dim // att_num_heads - - self.ff_dim = ff_dim - - self.target = target - - self.l2 = l2 - self.self_att_l2 = self_att_l2 - self.rec_weight_dropout = rec_weight_dropout - - if batch_norm_opts is None: - batch_norm_opts = {} - - bn_momentum = batch_norm_opts.pop("momentum", 0.1) - bn_eps = batch_norm_opts.pop("epsilon", 1e-3) - bn_update_sample_only_in_train = batch_norm_opts.pop("update_sample_only_in_training", True) - bn_delay_sample_update = batch_norm_opts.pop("delay_sample_update", True) - self.batch_norm_opts = { - "momentum": bn_momentum, - "epsilon": bn_eps, - "update_sample_only_in_training": bn_update_sample_only_in_train, - "delay_sample_update": bn_delay_sample_update, - } - self.batch_norm_opts.update(**batch_norm_opts) - - self.with_ctc = with_ctc - self.native_ctc = native_ctc - self.ctc_dropout = ctc_dropout - self.ctc_loss_scale = ctc_loss_scale - self.ctc_l2 = ctc_l2 - self.ctc_opts = ctc_opts - if not self.ctc_opts: - self.ctc_opts = {} - - self.start_conv_init = 
start_conv_init - self.conv_module_init = conv_module_init - self.mhsa_init = mhsa_init - self.mhsa_out_init = mhsa_out_init - self.ff_init = ff_init - - self.sandwich_conv = sandwich_conv - - # add maxpooling layers - self.subsample = subsample - self.subsample_list = [1] * num_blocks - if subsample: - for idx, s in enumerate(map(int, subsample.split("_")[:num_blocks])): - self.subsample_list[idx] = s - - self.network = ReturnnNetwork() - - self.add_to_prefix_name = add_to_prefix_name - self.output_layer_name = output_layer_name - - self.rezero = rezero - - def _create_ff_module(self, prefix_name, i, source, block_scale_var): - """ - Add Feed Forward Module: - LN -> FFN -> Swish -> Dropout -> FFN -> Dropout - :param str prefix_name: some prefix name - :param int i: FF module index - :param str source: name of source layer - :return: last layer name of this module - :rtype: str - """ - prefix_name = prefix_name + "_ffmod_{}".format(i) - - ln = self.network.add_layer_norm_layer("{}_ln".format(prefix_name), source) - - ff1 = self.network.add_linear_layer( - "{}_ff1".format(prefix_name), - ln, - n_out=self.ff_dim, - l2=self.l2, - forward_weights_init=self.ff_init, - with_bias=self.ff_bias, - ) - - swish_act = self.network.add_activation_layer("{}_swish".format(prefix_name), ff1, activation="swish") - - drop1 = self.network.add_dropout_layer("{}_drop1".format(prefix_name), swish_act, dropout=self.dropout) - - ff2 = self.network.add_linear_layer( - "{}_ff2".format(prefix_name), - drop1, - n_out=self.enc_key_dim, - l2=self.l2, - forward_weights_init=self.ff_init, - with_bias=self.ff_bias, - ) + def __init__(self, **kwargs): + super().__init__(**kwargs) - drop2 = self.network.add_dropout_layer("{}_drop2".format(prefix_name), ff2, dropout=self.dropout) - - if self.rezero: - drop2 = self.network.add_eval_layer( - "{}_scaled_dropout".format(prefix_name), [block_scale_var, drop2], eval="source(0) * source(1)" - ) - - half_step_ff = 
self.network.add_eval_layer("{}_half_step".format(prefix_name), drop2, eval="0.5 * source(0)") - - ff_module_res = self.network.add_combine_layer( - "{}_res".format(prefix_name), kind="add", source=[half_step_ff, source], n_out=self.enc_key_dim - ) - - return ff_module_res - - def _create_global_extractor(self, prefix_name, source): - prefix_name = "{}_global_extractor".format(prefix_name) - - ln = self.network.add_layer_norm_layer("{}_ln".format(prefix_name), source) - - ln_rel_pos_enc = self.network.add_relative_pos_encoding_layer( - "{}_ln_rel_pos_enc".format(prefix_name), - ln, - n_out=self.enc_key_per_head_dim, - forward_weights_init=self.ff_init, - ) - - mhsa = self.network.add_self_att_layer( - "{}".format(prefix_name), - ln, - n_out=self.enc_value_dim, - num_heads=self.att_num_heads, - total_key_dim=self.enc_key_dim, - att_dropout=self.att_dropout, - forward_weights_init=self.ff_init, - key_shift=ln_rel_pos_enc if ln_rel_pos_enc is not None else None, - ) - - mhsa_linear = self.network.add_linear_layer( - "{}_linear".format(prefix_name), - mhsa, - n_out=self.enc_key_dim, - l2=self.l2, - forward_weights_init=self.ff_init, - with_bias=False, - ) - - dropout = self.network.add_dropout_layer("{}_dropout".format(prefix_name), mhsa_linear, dropout=self.dropout) - - return dropout - - def _create_local_extractor(self, prefix_name, source): + def _create_conv_gating_mlp(self, prefix_name, source, layer_index): prefix_name = "{}_local_extractor".format(prefix_name) ln = self.network.add_layer_norm_layer("{}_ln".format(prefix_name), source) @@ -258,6 +22,9 @@ def _create_local_extractor(self, prefix_name, source): l2=self.l2, forward_weights_init=self.ff_init, with_bias=False, + param_dropout=self.ff_weight_drop, + param_dropout_min_ndim=2, + param_variational_noise=self.ff_weight_noise, ) gelu_act = self.network.add_activation_layer("{}_gelu".format(prefix_name), ff1, activation="gelu") @@ -279,6 +46,9 @@ def _create_local_extractor(self, prefix_name, source): 
filter_size=(self.conv_kernel_size,), groups=self.enc_key_dim * 3, l2=self.l2, + param_dropout=self.conv_weight_drop, + param_dropout_min_ndim=2, + param_variational_noise=self.conv_weight_noise, ) br_merge = self.network.add_eval_layer( @@ -294,18 +64,19 @@ def _create_local_extractor(self, prefix_name, source): l2=self.l2, forward_weights_init=self.ff_init, with_bias=False, + param_dropout=self.ff_weight_drop, + param_dropout_min_ndim=2, + param_variational_noise=self.ff_weight_noise, ) return br_merge_ff - def _create_merge_mod(self, prefix_name, source, block_scale_var): - prefix_name = "{}_merge_mod".format(prefix_name) + def _create_merge_module(self, prefix_name, *, source, global_extracter, local_extracter, layer_index): + prefix_name = "{}_merge_module".format(prefix_name) - glb_ext = self._create_global_extractor(prefix_name, source) - - lcl_ext = self._create_local_extractor(prefix_name, source) - - glb_lcl_merge = self.network.add_copy_layer("{}_global_local_merge".format(prefix_name), [glb_ext, lcl_ext]) + glb_lcl_merge = self.network.add_copy_layer( + "{}_global_local_merge".format(prefix_name), [global_extracter, local_extracter] + ) dpt_conv = self.network.add_conv_layer( "{}_dpt_conv".format(prefix_name), @@ -314,6 +85,9 @@ def _create_merge_mod(self, prefix_name, source, block_scale_var): filter_size=(self.conv_kernel_size,), groups=2 * self.enc_key_dim, l2=self.l2, + param_dropout=self.conv_weight_drop, + param_dropout_min_ndim=2, + param_variational_noise=self.conv_weight_noise, ) dpt_conv_res = self.network.add_combine_layer( @@ -330,179 +104,58 @@ def _create_merge_mod(self, prefix_name, source, block_scale_var): l2=self.l2, forward_weights_init=self.ff_init, with_bias=False, + param_dropout=self.ff_weight_drop, + param_dropout_min_ndim=2, + param_variational_noise=self.ff_weight_noise, ) dropout = self.network.add_dropout_layer("{}_dropout".format(prefix_name), ff, dropout=self.dropout) - if self.rezero: - dropout = 
self.network.add_eval_layer( - "{}_scaled_dropout".format(prefix_name), [block_scale_var, dropout], eval="source(0) * source(1)" - ) - merge_mod_res = self.network.add_combine_layer( "{}_res".format(prefix_name), kind="add", source=[source, dropout], n_out=self.enc_key_dim ) return merge_mod_res - def _create_e_branchformer_block(self, i, source): - prefix_name = "ebranchformer_block_%02i" % i - - if self.rezero: - self.network["mod_%02i_var" % i] = { - "class": "variable", - "init": 1e-8, - "trainable": True, - "add_batch_axis": True, - "shape": (1,), - } + def _block_prefix_name(self, layer_index: int) -> str: + assert layer_index >= 1 + if self.add_to_prefix_name: + prefix_name = "ebranchformer_block_%s_%02i" % (self.add_to_prefix_name, layer_index) + else: + prefix_name = "ebranchformer_block_%02i" % layer_index + return prefix_name - ff_module1 = self._create_ff_module(prefix_name, 1, source, "mod_%02i_var" % i) - - merge_module = self._create_merge_mod(prefix_name, ff_module1, "mod_%02i_var" % i) + def _create_conformer_block(self, i, source): + """ + Create an ebranchformer block: - ff_module2 = self._create_ff_module(prefix_name, 2, merge_module, "mod_%02i_var" % i) + FF -> [MHSA, Conv] -> Merger -> FF -> LN + """ - block_ln = self.network.add_layer_norm_layer("{}_ln".format(prefix_name), ff_module2) + prefix_name = self._block_prefix_name(i) - block_ln = self.network.add_copy_layer(prefix_name, block_ln) + ff_module1 = self._create_ff_module(prefix_name, 1, source, i) - return block_ln + # create branch 1: MHSA + mhsa = self._create_mhsa_module(prefix_name, ff_module1, i) - def _create_all_network_parts(self): - """ - ConvSubsampling/LSTM -> Linear -> Dropout -> [Conformer Blocks] x N - """ + # create branch 2: Convolutional gating MLP + conv_gating_mlp = self._create_conv_gating_mlp(prefix_name, ff_module1, i) - data = self.input - if self.specaug: - data = self.network.add_eval_layer( - "source", - data, - 
eval="self.network.get_config().typed_value('transform')(source(0, as_data=True), network=self.network)", - ) - - subsampled_input = None - if self.input_layer is None: - subsampled_input = data - elif "lstm" in self.input_layer: - sample_factor = int(self.input_layer.split("-")[1]) - pool_sizes = None - if sample_factor == 2: - pool_sizes = [2, 1] - elif sample_factor == 4: - pool_sizes = [2, 2] - elif sample_factor == 6: - pool_sizes = [3, 2] - # add 2 LSTM layers with max pooling to subsample and encode positional information - subsampled_input = self.network.add_lstm_layers( - data, - num_layers=2, - lstm_dim=self.enc_key_dim, - dropout=self.lstm_dropout, - bidirectional=True, - rec_weight_dropout=self.rec_weight_dropout, - l2=self.l2, - pool_sizes=pool_sizes, - ) - elif self.input_layer == "conv-4": - # conv-layer-1: 3x3x32 followed by max pool layer on feature axis (1, 2) - # conv-layer-2: 3x3x64 with striding (2, 1) on time axis - # conv-layer-3: 3x3x64 with striding (2, 1) on time axis - - # TODO: make this more generic - - conv_input = self.network.add_conv_block( - "conv_out", - data, - hwpc_sizes=[((3, 3), (1, 2), 32)], - l2=self.l2, - activation=self.input_layer_conv_act, - init=self.start_conv_init, - merge_out=False, - ) - - subsampled_input = self.network.add_conv_block( - "conv_merged", - conv_input, - hwpc_sizes=[((3, 3), (2, 1), 64), ((3, 3), (2, 1), 64)], - l2=self.l2, - activation=self.input_layer_conv_act, - init=self.start_conv_init, - use_striding=True, - split_input=False, - prefix_name="subsample_", - ) - elif self.input_layer == "conv-6": - conv_input = self.network.add_conv_block( - "conv_out", - data, - hwpc_sizes=[((3, 3), (1, 2), 32)], - l2=self.l2, - activation=self.input_layer_conv_act, - init=self.start_conv_init, - merge_out=False, - ) - - subsampled_input = self.network.add_conv_block( - "conv_merged", - conv_input, - hwpc_sizes=[((3, 3), (3, 1), 64), ((3, 3), (2, 1), 64)], - l2=self.l2, - activation=self.input_layer_conv_act, - 
init=self.start_conv_init, - use_striding=True, - split_input=False, - prefix_name="subsample_", - ) - - assert subsampled_input is not None - - source_linear = self.network.add_linear_layer( - "source_linear", - subsampled_input, - n_out=self.enc_key_dim, - l2=self.l2, - forward_weights_init=self.ff_init, - with_bias=False, + # merge two branches + merge_module = self._create_merge_module( + prefix_name, source=ff_module1, global_extracter=mhsa, local_extracter=conv_gating_mlp, layer_index=i ) - if self.dropout_in: - source_linear = self.network.add_dropout_layer("source_dropout", source_linear, dropout=self.dropout_in) - - conformer_block_src = source_linear - for i in range(1, self.num_blocks + 1): - conformer_block_src = self._create_e_branchformer_block(i, conformer_block_src) - - encoder = self.network.add_copy_layer(self.output_layer_name, conformer_block_src) - - if self.with_ctc: - default_ctc_loss_opts = {"beam_width": 1} - if self.native_ctc: - default_ctc_loss_opts["use_native"] = True - else: - self.ctc_opts.update({"ignore_longer_outputs_than_inputs": True}) # always enable - if self.ctc_opts: - default_ctc_loss_opts["ctc_opts"] = self.ctc_opts - self.network.add_softmax_layer( - "ctc", - encoder, - l2=self.ctc_l2, - target=self.target, - loss="ctc", - dropout=self.ctc_dropout, - loss_opts=default_ctc_loss_opts, - loss_scale=self.ctc_loss_scale, - ) - - return encoder - - def _create_e_branchformer_blocks(self, input): - conformer_block_src = input - for i in range(1, self.num_blocks + 1): - conformer_block_src = self._create_e_branchformer_block(i, conformer_block_src) - encoder = self.network.add_copy_layer(self.output_layer_name, conformer_block_src) - return encoder - - def create_network(self): - return self._create_all_network_parts() + ff_module2 = self._create_ff_module(prefix_name, 2, merge_module, i) + + res = ff_module2 + if self.block_final_norm: + res = self.network.add_layer_norm_layer("{}_ln".format(prefix_name), res) + if 
self.subsample: + assert 0 <= i - 1 < len(self.subsample) + subsample_factor = self.subsample_list[i - 1] + if subsample_factor > 1: + res = self.network.add_pool_layer(res + "_pool{}".format(i), res, pool_size=(subsample_factor,)) + res = self.network.add_copy_layer(prefix_name, res) + return res From 68552aee24ddf3d215103b1e1980ead7a50505ca Mon Sep 17 00:00:00 2001 From: Albert Zeyer Date: Mon, 20 May 2024 22:43:10 +0200 Subject: [PATCH 035/227] more --- users/zeyer/experiments/exp2024_04_23_baselines/aed.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/users/zeyer/experiments/exp2024_04_23_baselines/aed.py b/users/zeyer/experiments/exp2024_04_23_baselines/aed.py index 38f6f4a69..2976cbafa 100644 --- a/users/zeyer/experiments/exp2024_04_23_baselines/aed.py +++ b/users/zeyer/experiments/exp2024_04_23_baselines/aed.py @@ -80,8 +80,14 @@ def py(): vocab="spm10k", ) + # Testing sampling in SPM. Baseline without sampling: 5.24 dev-other. + # The lower the alpha, the more aggressive the sampling. # alpha=0.1 seems too aggressive for AED, bad convergence - for alpha in [0.3, 0.5, 0.7]: + for alpha in [ + 0.3, # 5.26 + 0.5, + 0.7, # 4.98 (!!) + ]: train_exp( f"v6-bhv20-11gb-f32-bs15k-accgrad1-mgpu4-pavg100-wd1e_2-lrlin1e_5_295k-speedpertV2-spm10k" f"-spmSample{str(alpha).replace('.', '')}", From 857fbd6f1a4f6a59c8011a129cc38293c56bd837 Mon Sep 17 00:00:00 2001 From: Albert Zeyer Date: Tue, 21 May 2024 10:33:26 +0200 Subject: [PATCH 036/227] more --- users/zeyer/experiments/exp2024_04_23_baselines/aed.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/users/zeyer/experiments/exp2024_04_23_baselines/aed.py b/users/zeyer/experiments/exp2024_04_23_baselines/aed.py index 2976cbafa..6a4ccc9df 100644 --- a/users/zeyer/experiments/exp2024_04_23_baselines/aed.py +++ b/users/zeyer/experiments/exp2024_04_23_baselines/aed.py @@ -86,7 +86,11 @@ def py(): for alpha in [ 0.3, # 5.26 0.5, + 0.6, 0.7, # 4.98 (!!) 
+ 0.8, + 0.9, + 1.0, # sanity check ]: train_exp( f"v6-bhv20-11gb-f32-bs15k-accgrad1-mgpu4-pavg100-wd1e_2-lrlin1e_5_295k-speedpertV2-spm10k" From 41d336e42cd759f37edf9ebb40d1862283705668 Mon Sep 17 00:00:00 2001 From: Mohammad Zeineldeen Date: Tue, 21 May 2024 14:39:59 +0000 Subject: [PATCH 037/227] better layer names for ebranchformer --- .../asr/encoder/ebranchformer_encoder.py | 80 ++++++++++--------- 1 file changed, 44 insertions(+), 36 deletions(-) diff --git a/users/zeineldeen/models/asr/encoder/ebranchformer_encoder.py b/users/zeineldeen/models/asr/encoder/ebranchformer_encoder.py index 4eb066d42..4870f7f25 100644 --- a/users/zeineldeen/models/asr/encoder/ebranchformer_encoder.py +++ b/users/zeineldeen/models/asr/encoder/ebranchformer_encoder.py @@ -10,38 +10,22 @@ class EBranchformerEncoder(ConformerEncoderV2): def __init__(self, **kwargs): super().__init__(**kwargs) - def _create_conv_gating_mlp(self, prefix_name, source, layer_index): - prefix_name = "{}_local_extractor".format(prefix_name) - - ln = self.network.add_layer_norm_layer("{}_ln".format(prefix_name), source) - - ff1 = self.network.add_linear_layer( - "{}_ff_1".format(prefix_name), - ln, - n_out=6 * self.enc_key_dim, - l2=self.l2, - forward_weights_init=self.ff_init, - with_bias=False, - param_dropout=self.ff_weight_drop, - param_dropout_min_ndim=2, - param_variational_noise=self.ff_weight_noise, - ) + def _create_conv_spatial_gating_unit(self, prefix_name, source, layer_index): + # see also here: https://github.com/espnet/espnet/blob/master/espnet2/asr/layers/cgmlp.py#L15 - gelu_act = self.network.add_activation_layer("{}_gelu".format(prefix_name), ff1, activation="gelu") - - br_part_A = self.network.add_slice_layer( - "{}_branch_a".format(prefix_name), gelu_act, "F", slice_start=0, slice_end=self.enc_key_dim * 3 + branch_a = self.network.add_slice_layer( + "{}_branch_a".format(prefix_name), source, "F", slice_start=0, slice_end=self.enc_key_dim * 3 ) - br_part_B = self.network.add_slice_layer( 
- "{}_branch_b".format(prefix_name), gelu_act, "F", slice_start=self.enc_key_dim * 3 + branch_b = self.network.add_slice_layer( + "{}_branch_b".format(prefix_name), source, "F", slice_start=self.enc_key_dim * 3 ) - br_part_B_ln = self.network.add_layer_norm_layer("{}_branch_b_ln".format(prefix_name), br_part_B) + br_part_b_ln = self.network.add_layer_norm_layer("{}_branch_b_ln".format(prefix_name), branch_b) - br_part_B_dpt_conv = self.network.add_conv_layer( - "{}_branch_b_dpt_conv".format(prefix_name), - br_part_B_ln, + br_part_b_depthwise_conv = self.network.add_conv_layer( + "{}_branch_b_depthwise_conv".format(prefix_name), + br_part_b_ln, n_out=self.enc_key_dim * 3, filter_size=(self.conv_kernel_size,), groups=self.enc_key_dim * 3, @@ -52,14 +36,37 @@ def _create_conv_gating_mlp(self, prefix_name, source, layer_index): ) br_merge = self.network.add_eval_layer( - "{}_branch_merge".format(prefix_name), [br_part_A, br_part_B_dpt_conv], "source(0)*source(1)" + "{}_branch_merge".format(prefix_name), [branch_a, br_part_b_depthwise_conv], "source(0) * source(1)" ) dropout = self.network.add_dropout_layer("{}_dropout".format(prefix_name), br_merge, dropout=self.dropout) + return dropout + + def _create_conv_gating_mlp(self, prefix_name, source, layer_index): + prefix_name = "{}_cgmlp".format(prefix_name) + + ln = self.network.add_layer_norm_layer("{}_ln".format(prefix_name), source) + + ff1 = self.network.add_linear_layer( + "{}_ff_1".format(prefix_name), + ln, + n_out=6 * self.enc_key_dim, # TODO: make it configurable + l2=self.l2, + forward_weights_init=self.ff_init, + with_bias=False, + param_dropout=self.ff_weight_drop, + param_dropout_min_ndim=2, + param_variational_noise=self.ff_weight_noise, + ) + + gelu_act = self.network.add_activation_layer("{}_gelu".format(prefix_name), ff1, activation="gelu") + + csgu = self._create_conv_spatial_gating_unit(prefix_name, gelu_act, layer_index) + br_merge_ff = self.network.add_linear_layer( "{}_ff_2".format(prefix_name), - 
dropout, + csgu, n_out=self.enc_key_dim, l2=self.l2, forward_weights_init=self.ff_init, @@ -74,12 +81,13 @@ def _create_conv_gating_mlp(self, prefix_name, source, layer_index): def _create_merge_module(self, prefix_name, *, source, global_extracter, local_extracter, layer_index): prefix_name = "{}_merge_module".format(prefix_name) + # concat on feature dim glb_lcl_merge = self.network.add_copy_layer( "{}_global_local_merge".format(prefix_name), [global_extracter, local_extracter] ) - dpt_conv = self.network.add_conv_layer( - "{}_dpt_conv".format(prefix_name), + depthwise_conv = self.network.add_conv_layer( + "{}_depthwise_conv".format(prefix_name), glb_lcl_merge, n_out=2 * self.enc_key_dim, filter_size=(self.conv_kernel_size,), @@ -90,16 +98,16 @@ def _create_merge_module(self, prefix_name, *, source, global_extracter, local_e param_variational_noise=self.conv_weight_noise, ) - dpt_conv_res = self.network.add_combine_layer( - "{}_dpt_conv_res".format(prefix_name), + depthwise_conv_res = self.network.add_combine_layer( + "{}_depthwise_conv_res".format(prefix_name), kind="add", - source=[glb_lcl_merge, dpt_conv], + source=[glb_lcl_merge, depthwise_conv], n_out=2 * self.enc_key_dim, ) ff = self.network.add_linear_layer( "{}_ff".format(prefix_name), - dpt_conv_res, + depthwise_conv_res, n_out=self.enc_key_dim, l2=self.l2, forward_weights_init=self.ff_init, @@ -140,11 +148,11 @@ def _create_conformer_block(self, i, source): mhsa = self._create_mhsa_module(prefix_name, ff_module1, i) # create branch 2: Convolutional gating MLP - conv_gating_mlp = self._create_conv_gating_mlp(prefix_name, ff_module1, i) + cgmlp = self._create_conv_gating_mlp(prefix_name, ff_module1, i) # merge two branches merge_module = self._create_merge_module( - prefix_name, source=ff_module1, global_extracter=mhsa, local_extracter=conv_gating_mlp, layer_index=i + prefix_name, source=ff_module1, global_extracter=mhsa, local_extracter=cgmlp, layer_index=i ) ff_module2 = 
self._create_ff_module(prefix_name, 2, merge_module, i) From e52414a30d7252e3aa307bf3064397449b525766 Mon Sep 17 00:00:00 2001 From: Mohammad Zeineldeen Date: Tue, 21 May 2024 14:44:44 +0000 Subject: [PATCH 038/227] better --- users/zeineldeen/models/asr/encoder/ebranchformer_encoder.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/users/zeineldeen/models/asr/encoder/ebranchformer_encoder.py b/users/zeineldeen/models/asr/encoder/ebranchformer_encoder.py index 4870f7f25..f771363c3 100644 --- a/users/zeineldeen/models/asr/encoder/ebranchformer_encoder.py +++ b/users/zeineldeen/models/asr/encoder/ebranchformer_encoder.py @@ -36,7 +36,7 @@ def _create_conv_spatial_gating_unit(self, prefix_name, source, layer_index): ) br_merge = self.network.add_eval_layer( - "{}_branch_merge".format(prefix_name), [branch_a, br_part_b_depthwise_conv], "source(0) * source(1)" + "{}_merge".format(prefix_name), [branch_a, br_part_b_depthwise_conv], "source(0) * source(1)" ) dropout = self.network.add_dropout_layer("{}_dropout".format(prefix_name), br_merge, dropout=self.dropout) @@ -62,7 +62,7 @@ def _create_conv_gating_mlp(self, prefix_name, source, layer_index): gelu_act = self.network.add_activation_layer("{}_gelu".format(prefix_name), ff1, activation="gelu") - csgu = self._create_conv_spatial_gating_unit(prefix_name, gelu_act, layer_index) + csgu = self._create_conv_spatial_gating_unit(f"{prefix_name}_csgu", gelu_act, layer_index) br_merge_ff = self.network.add_linear_layer( "{}_ff_2".format(prefix_name), From 6987cd7c94ae97ba76754c300e16612c592c8cf0 Mon Sep 17 00:00:00 2001 From: Mohammad Zeineldeen Date: Tue, 21 May 2024 14:52:05 +0000 Subject: [PATCH 039/227] better --- .../asr/encoder/ebranchformer_encoder.py | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/users/zeineldeen/models/asr/encoder/ebranchformer_encoder.py b/users/zeineldeen/models/asr/encoder/ebranchformer_encoder.py index f771363c3..09d0e8e32 100644 --- 
a/users/zeineldeen/models/asr/encoder/ebranchformer_encoder.py +++ b/users/zeineldeen/models/asr/encoder/ebranchformer_encoder.py @@ -7,28 +7,31 @@ class EBranchformerEncoder(ConformerEncoderV2): * Ref: https://arxiv.org/pdf/2210.00077.pdf """ - def __init__(self, **kwargs): + def __init__(self, cgmlp_ff_dim, **kwargs): super().__init__(**kwargs) + assert cgmlp_ff_dim % 2 == 0, "cgmlp_dim must be even" + self.cgmlp_ff_dim = cgmlp_ff_dim + def _create_conv_spatial_gating_unit(self, prefix_name, source, layer_index): # see also here: https://github.com/espnet/espnet/blob/master/espnet2/asr/layers/cgmlp.py#L15 + split_size = self.cgmlp_ff_dim // 2 + branch_a = self.network.add_slice_layer( - "{}_branch_a".format(prefix_name), source, "F", slice_start=0, slice_end=self.enc_key_dim * 3 + "{}_branch_a".format(prefix_name), source, "F", slice_start=0, slice_end=split_size ) - branch_b = self.network.add_slice_layer( - "{}_branch_b".format(prefix_name), source, "F", slice_start=self.enc_key_dim * 3 - ) + branch_b = self.network.add_slice_layer("{}_branch_b".format(prefix_name), source, "F", slice_start=split_size) br_part_b_ln = self.network.add_layer_norm_layer("{}_branch_b_ln".format(prefix_name), branch_b) br_part_b_depthwise_conv = self.network.add_conv_layer( "{}_branch_b_depthwise_conv".format(prefix_name), br_part_b_ln, - n_out=self.enc_key_dim * 3, + n_out=split_size, filter_size=(self.conv_kernel_size,), - groups=self.enc_key_dim * 3, + groups=split_size, l2=self.l2, param_dropout=self.conv_weight_drop, param_dropout_min_ndim=2, @@ -51,7 +54,7 @@ def _create_conv_gating_mlp(self, prefix_name, source, layer_index): ff1 = self.network.add_linear_layer( "{}_ff_1".format(prefix_name), ln, - n_out=6 * self.enc_key_dim, # TODO: make it configurable + n_out=self.cgmlp_ff_dim, # TODO: make it configurable l2=self.l2, forward_weights_init=self.ff_init, with_bias=False, From b930a71f9b9e080d6a08b37663a6837b4ea445e0 Mon Sep 17 00:00:00 2001 From: Mohammad Zeineldeen Date: 
Tue, 21 May 2024 15:11:17 +0000 Subject: [PATCH 040/227] decouple mhsa residual --- .../models/asr/encoder/conformer_encoder_v2.py | 14 ++++++++------ .../models/asr/encoder/ebranchformer_encoder.py | 5 +++++ 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/users/zeineldeen/models/asr/encoder/conformer_encoder_v2.py b/users/zeineldeen/models/asr/encoder/conformer_encoder_v2.py index 9831b8e61..fa8e11f26 100644 --- a/users/zeineldeen/models/asr/encoder/conformer_encoder_v2.py +++ b/users/zeineldeen/models/asr/encoder/conformer_encoder_v2.py @@ -903,12 +903,7 @@ def _create_mhsa_module(self, prefix_name, source, layer_index): drop = self.network.add_dropout_layer("{}_dropout".format(prefix_name), mhsa_linear, dropout=self.dropout) - res_inputs = [drop, source] - - mhsa_res = self.network.add_combine_layer( - "{}_res".format(prefix_name), kind="add", source=res_inputs, n_out=self.enc_value_dim - ) - return mhsa_res + return drop def _create_convolution_module(self, prefix_name, source, layer_index, half_step=False): """ @@ -1071,6 +1066,10 @@ def _create_conformer_block(self, i, source): if self.convolution_first: conv_module_ = self._create_convolution_module(prefix_name, ff_module1, i) mhsa_module = self._create_mhsa_module(prefix_name, conv_module_, i) + mhsa_module = self.network.add_combine_layer( + "{}_res".format(prefix_name), kind="add", source=[mhsa_module, conv_module_], n_out=self.enc_value_dim + ) + ff_module2_input = mhsa_module else: if self.no_mhsa_module: @@ -1083,6 +1082,9 @@ def _create_conformer_block(self, i, source): ) mhsa_input = conv_module1 mhsa = self._create_mhsa_module(prefix_name, mhsa_input, i) + mhsa = self.network.add_combine_layer( + "{}_res".format(prefix_name), kind="add", source=[mhsa, mhsa_input], n_out=self.enc_value_dim + ) conv_module = self._create_convolution_module(prefix_name, mhsa, i, half_step=self.sandwich_conv) ff_module2_input = conv_module diff --git 
a/users/zeineldeen/models/asr/encoder/ebranchformer_encoder.py b/users/zeineldeen/models/asr/encoder/ebranchformer_encoder.py index 09d0e8e32..508664380 100644 --- a/users/zeineldeen/models/asr/encoder/ebranchformer_encoder.py +++ b/users/zeineldeen/models/asr/encoder/ebranchformer_encoder.py @@ -14,6 +14,8 @@ def __init__(self, cgmlp_ff_dim, **kwargs): self.cgmlp_ff_dim = cgmlp_ff_dim def _create_conv_spatial_gating_unit(self, prefix_name, source, layer_index): + # Half split input into [A,B] -> A * DwConv(LN(B)) -> dropout + # # see also here: https://github.com/espnet/espnet/blob/master/espnet2/asr/layers/cgmlp.py#L15 split_size = self.cgmlp_ff_dim // 2 @@ -47,6 +49,8 @@ def _create_conv_spatial_gating_unit(self, prefix_name, source, layer_index): return dropout def _create_conv_gating_mlp(self, prefix_name, source, layer_index): + # GeLU(FF(LN(x))) -> Half split input into [A,B] -> A * DwConv(LN(B)) -> dropout -> FF + prefix_name = "{}_cgmlp".format(prefix_name) ln = self.network.add_layer_norm_layer("{}_ln".format(prefix_name), source) @@ -65,6 +69,7 @@ def _create_conv_gating_mlp(self, prefix_name, source, layer_index): gelu_act = self.network.add_activation_layer("{}_gelu".format(prefix_name), ff1, activation="gelu") + # Half split input into [A,B] -> A * DwConv(LN(B)) -> dropout csgu = self._create_conv_spatial_gating_unit(f"{prefix_name}_csgu", gelu_act, layer_index) br_merge_ff = self.network.add_linear_layer( From ba036c34176b2eaa5da5fa531304a54ed2d4c577 Mon Sep 17 00:00:00 2001 From: Mohammad Zeineldeen Date: Tue, 21 May 2024 15:11:46 +0000 Subject: [PATCH 041/227] cleanup --- users/zeineldeen/models/asr/encoder/ebranchformer_encoder.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/users/zeineldeen/models/asr/encoder/ebranchformer_encoder.py b/users/zeineldeen/models/asr/encoder/ebranchformer_encoder.py index 508664380..b513e5af6 100644 --- a/users/zeineldeen/models/asr/encoder/ebranchformer_encoder.py +++ 
b/users/zeineldeen/models/asr/encoder/ebranchformer_encoder.py @@ -58,7 +58,7 @@ def _create_conv_gating_mlp(self, prefix_name, source, layer_index): ff1 = self.network.add_linear_layer( "{}_ff_1".format(prefix_name), ln, - n_out=self.cgmlp_ff_dim, # TODO: make it configurable + n_out=self.cgmlp_ff_dim, l2=self.l2, forward_weights_init=self.ff_init, with_bias=False, From 0ad3b4ad10f26e30e069136db5fee5b9cec1b16f Mon Sep 17 00:00:00 2001 From: Mohammad Zeineldeen Date: Tue, 21 May 2024 16:56:09 +0000 Subject: [PATCH 042/227] refactor args. add ebranch config --- .../librispeech_960/attention_asr_config.py | 237 +--- .../tedlium2/configs/ebranch_baseline.py | 1023 +++++++++++++++++ .../tedlium2/configs/ted2_att_baseline.py | 98 +- users/zeineldeen/models/asr/decoder/args.py | 137 +++ users/zeineldeen/models/asr/encoder/args.py | 87 ++ .../asr/encoder/ebranchformer_encoder.py | 9 +- 6 files changed, 1317 insertions(+), 274 deletions(-) create mode 100644 users/zeineldeen/experiments/conformer_att_2023/tedlium2/configs/ebranch_baseline.py create mode 100644 users/zeineldeen/models/asr/decoder/args.py create mode 100644 users/zeineldeen/models/asr/encoder/args.py diff --git a/users/zeineldeen/experiments/conformer_att_2022/librispeech_960/attention_asr_config.py b/users/zeineldeen/experiments/conformer_att_2022/librispeech_960/attention_asr_config.py index 8e2d62658..0630cbd43 100644 --- a/users/zeineldeen/experiments/conformer_att_2022/librispeech_960/attention_asr_config.py +++ b/users/zeineldeen/experiments/conformer_att_2022/librispeech_960/attention_asr_config.py @@ -1,12 +1,22 @@ -import os.path - import numpy import copy -from typing import Any, Dict, Optional, List -from dataclasses import dataclass, asdict +from dataclasses import asdict +from i6_experiments.users.zeineldeen.models.asr.encoder.args import ( + EncoderArgs, + ConformerEncoderArgs, + ConformerEncoderV2Args, + EBranchformerEncoderArgs, +) from 
i6_experiments.users.zeineldeen.models.asr.encoder.conformer_encoder import ConformerEncoder from i6_experiments.users.zeineldeen.models.asr.encoder.conformer_encoder_v2 import ConformerEncoderV2 +from i6_experiments.users.zeineldeen.models.asr.encoder.ebranchformer_encoder import EBranchformerEncoder +from i6_experiments.users.zeineldeen.models.asr.decoder.args import ( + DecoderArgs, + TransformerDecoderArgs, + RNNDecoderArgs, + ConformerDecoderArgs, +) from i6_experiments.users.zeineldeen.models.asr.decoder.transformer_decoder import TransformerDecoder from i6_experiments.users.zeineldeen.models.asr.decoder.conformer_decoder import ConformerDecoder from i6_experiments.users.zeineldeen.models.asr.decoder.rnn_decoder import RNNDecoder @@ -213,7 +223,6 @@ def pretrain_layers_and_dims( """ InitialDimFactor = initial_dim_factor - encoder_keys = ["ff_dim", "enc_key_dim", "conv_kernel_size"] # TODO: effect of pretraining conv font-end? decoder_keys = ["ff_dim"] encoder_args_copy = copy.deepcopy(encoder_args) @@ -366,218 +375,6 @@ def pretrain_layers_and_dims( # -------------------------------------------------------------------- # -class EncoderArgs: - pass - - -@dataclass -class ConformerEncoderArgs(EncoderArgs): - num_blocks: int = 12 - enc_key_dim: int = 512 - att_num_heads: int = 8 - ff_dim: int = 2048 - conv_kernel_size: int = 32 - input: str = "data" - input_layer: str = "lstm-6" - input_layer_conv_act: str = "relu" - add_abs_pos_enc_to_input: bool = False - pos_enc: str = "rel" - - sandwich_conv: bool = False - subsample: Optional[str] = None - use_causal_layers: bool = False - - # ctc - with_ctc: bool = True - native_ctc: bool = True - ctc_loss_scale: Optional[float] = None - ctc_self_align_delay: Optional[int] = None - ctc_self_align_scale: float = 0.5 - ctc_dropout: float = 0.0 - - # param init - ff_init: Optional[str] = None - mhsa_init: Optional[str] = None - mhsa_out_init: Optional[str] = None - conv_module_init: Optional[str] = None - start_conv_init: 
Optional[str] = None - - # dropout - dropout: float = 0.1 - dropout_in: float = 0.1 - att_dropout: float = 0.1 - lstm_dropout: float = 0.1 - - # weight dropout - ff_weight_dropout: Optional[float] = None - mhsa_weight_dropout: Optional[float] = None - conv_weight_dropout: Optional[float] = None - - # norms - batch_norm_opts: Optional[Dict[str, Any]] = None - use_ln: bool = False - - # other regularization - l2: float = 0.0001 - frontend_conv_l2: float = 0.0001 - self_att_l2: float = 0.0 - rel_pos_clipping: int = 16 - - use_sqrd_relu: bool = False - - # weight noise - weight_noise: Optional[float] = None - weight_noise_layers: Optional[List[str]] = None - - convolution_first: bool = False - - -class ConformerEncoderV2Args(ConformerEncoderArgs): - # weight noise - ff_weight_noise: Optional[float] = None - mhsa_weight_noise: Optional[float] = None - conv_weight_noise: Optional[float] = None - frontend_conv_weight_noise: Optional[float] = None - - # weight dropout - frontend_conv_weight_dropout: Optional[float] = None - - -class DecoderArgs: - pass - - -@dataclass -class TransformerDecoderArgs(DecoderArgs): - num_layers: int = 6 - att_num_heads: int = 8 - ff_dim: int = 2048 - ff_act: str = "relu" - pos_enc: Optional[str] = None - embed_pos_enc: bool = False - - # param init - ff_init: Optional[str] = None - mhsa_init: Optional[str] = None - mhsa_out_init: Optional[str] = None - - # dropout - dropout: float = 0.1 - att_dropout: float = 0.1 - embed_dropout: float = 0.1 - softmax_dropout: float = 0.0 - - ff_weight_noise: Optional[float] = None - mhsa_weight_noise: Optional[float] = None - ff_weight_dropout: Optional[float] = None - mhsa_weight_dropout: Optional[float] = None - - # other regularization - l2: float = 0.0 - self_att_l2: float = 0.0 - rel_pos_clipping: int = 16 - label_smoothing: float = 0.1 - apply_embed_weight: bool = False - - length_normalization: bool = True - - # ILM - replace_cross_att_w_masked_self_att: bool = False - create_ilm_decoder: bool = False 
- ilm_type: bool = None - ilm_args: Optional[dict] = None - - -@dataclass -class ConformerDecoderArgs(DecoderArgs): - num_layers: int = 6 - att_num_heads: int = 8 - ff_dim: int = 2048 - pos_enc: Optional[str] = "rel" - - # conv module - conv_kernel_size: int = 32 - - # param init - ff_init: Optional[str] = None - mhsa_init: Optional[str] = None - mhsa_out_init: Optional[str] = None - conv_module_init: Optional[str] = None - - # dropout - dropout: float = 0.1 - att_dropout: float = 0.1 - embed_dropout: float = 0.1 - softmax_dropout: float = 0.1 - - # other regularization - l2: float = 0.0001 - frontend_conv_l2: float = 0.0001 - rel_pos_clipping: int = 16 - label_smoothing: float = 0.1 - apply_embed_weight: bool = False - - length_normalization: bool = True - - use_sqrd_relu: bool = False - - # ILM - replace_cross_att_w_masked_self_att: bool = False - create_ilm_decoder: bool = False - ilm_type: bool = None - ilm_args: Optional[dict] = None - - -@dataclass -class RNNDecoderArgs(DecoderArgs): - att_num_heads: int = 1 - lstm_num_units: int = 1024 - output_num_units: int = 1024 - embed_dim: int = 640 - enc_key_dim: int = 1024 # also attention dim # also attention dim - - # location feedback - loc_conv_att_filter_size: Optional[int] = None - - # param init - lstm_weights_init: Optional[str] = None - embed_weight_init: Optional[str] = None - - # dropout - dropout: float = 0.0 - softmax_dropout: float = 0.3 - att_dropout: float = 0.0 - embed_dropout: float = 0.1 - rec_weight_dropout: float = 0.0 - - # other regularization - l2: float = 0.0001 - zoneout: bool = True - reduceout: bool = True - - # lstm lm - lstm_lm_dim: int = 1024 - add_lstm_lm: bool = False - - length_normalization: bool = True - length_normalization_exponent: float = 1.0 - - coverage_scale: float = None - coverage_threshold: float = None - coverage_update: str = "sum" - - ce_loss_scale: Optional[float] = 1.0 - - label_smoothing: float = 0.1 - - use_zoneout_output: bool = False - - 
monotonic_att_weights_loss_opts: Optional[dict] = None - use_monotonic_att_weights_loss_in_recog: Optional[bool] = False - - include_eos_in_search_output: bool = False - - def create_config( training_datasets, encoder_args: EncoderArgs, @@ -752,10 +549,12 @@ def create_config( # -------------------------- network -------------------------- # - if isinstance(encoder_args, ConformerEncoderArgs): + if type(encoder_args) is ConformerEncoderArgs: encoder_type = ConformerEncoder - elif isinstance(encoder_args, ConformerEncoderV2Args): + elif type(encoder_args) is ConformerEncoderV2Args: encoder_type = ConformerEncoderV2 + elif type(encoder_args) is EBranchformerEncoderArgs: + encoder_type = EBranchformerEncoder else: raise ValueError("invalid encoder_args type") diff --git a/users/zeineldeen/experiments/conformer_att_2023/tedlium2/configs/ebranch_baseline.py b/users/zeineldeen/experiments/conformer_att_2023/tedlium2/configs/ebranch_baseline.py new file mode 100644 index 000000000..f90326966 --- /dev/null +++ b/users/zeineldeen/experiments/conformer_att_2023/tedlium2/configs/ebranch_baseline.py @@ -0,0 +1,1023 @@ +import copy, os + +import numpy + +import sisyphus.toolkit as tk + +from i6_experiments.users.zeineldeen.experiments.conformer_att_2022.librispeech_960.attention_asr_config import ( + create_config, + ConformerEncoderArgs, + TransformerDecoderArgs, + RNNDecoderArgs, + ConformerDecoderArgs, + EBranchformerEncoderArgs, +) +from i6_experiments.users.zeineldeen.experiments.conformer_att_2022.librispeech_960.additional_config import ( + apply_fairseq_init_to_conformer, + reset_params_init, + apply_fairseq_init_to_transformer_decoder, +) +from i6_experiments.users.zeineldeen.experiments.conformer_att_2023.tedlium2.data import ( + build_training_datasets, + build_test_dataset, +) +from i6_experiments.users.zeineldeen.experiments.conformer_att_2023.tedlium2.default_tools import ( + RETURNN_ROOT, + RETURNN_CPU_EXE, + SCTK_BINARY_PATH, +) +from 
i6_experiments.users.zeineldeen.experiments.conformer_att_2022.librispeech_960.feature_extraction_net import ( + log10_net_10ms, +) +from i6_experiments.users.zeineldeen.experiments.conformer_att_2022.librispeech_960.pipeline import ( + training, + search, + get_average_checkpoint, + get_best_checkpoint, + search_single, +) +from i6_experiments.users.zeineldeen.models.lm import generic_lm +from i6_experiments.users.zeineldeen.models.lm.transformer_lm import TransformerLM +from i6_experiments.users.zeineldeen.experiments.conformer_att_2022.librispeech_960 import ( + ilm_helpers, +) +from i6_experiments.users.rossenbach.experiments.librispeech.kazuki_lm.experiment import ( + get_lm, + ZeineldeenLM, +) + +train_jobs_map = {} # dict[str, ReturnnTrainJob] +train_job_avg_ckpt = {} +train_job_best_epoch = {} + +BPE_10K = 10000 +BPE_5K = 5000 +BPE_1K = 1000 +BPE_500 = 500 + +# train: +# ------ +# Seq-length 'data' Stats: +# 92973 seqs +# Mean: 819.1473868757647 +# Std dev: 434.7168733027807 +# Min/max: 26 / 2049 + +# --------------------------- LM --------------------------- # + +# LM data (runnnig words) +# trans 2250417 ~ 2.25M +# external: 12688261 ~ 12.7M +# Total: 14.9M + +lstm_10k_lm_opts = { + "lm_subnet": generic_lm.libri_lstm_bpe10k_net, + "lm_model": generic_lm.libri_lstm_bpe10k_model, + "name": "lstm", +} + +lstm_lm_opts_map = { + BPE_10K: lstm_10k_lm_opts, +} + +# Trafo LM trained by willi +# - setup dir: /u/michel/setups/language_modelling/tedlium/neurallm/trafo_kazuki19 +# - config: /u/michel/setups/language_modelling/tedlium/neurallm/trafo_kazuki19/tdl1.config +trafo_lm_net = TransformerLM( + source="prev:output", + num_layers=30, + vocab_size=1057, + use_as_ext_lm=True, + ff_dim=4096, + att_num_heads=12, + embed_dim=128, + qk_dim=768, + v_dim=768, + out_dim=768, + emb_cpu_lookup=False, + embed_pos=False, # wo abs pos encoding + conv_act="gelu", +) +trafo_lm_net.create_network() +trafo_1k_lm_opts = { + "lm_subnet": trafo_lm_net.network.get_net(), + 
"load_on_init_opts": { + "filename": "/work/asr4/michel/setups-data/language_modelling/tedlium/neurallm/trafo_kazuki19/net-model/network.020", + "params_prefix": "", + "load_if_prefix": "lm_output/", + }, + "name": "trafo", +} + +trafo_lm_opts_map = {BPE_1K: trafo_1k_lm_opts} + +# ----------------------------------------------------------- # + + +def compute_features_stats( + output_dirname, feat_dim, bpe_size=10000, feature_extraction_net=log10_net_10ms, model_checkpoint=None, **kwargs +): + train_data = build_training_datasets( + bpe_size=bpe_size, + use_raw_features=True, + epoch_wise_filter=None, + link_speed_perturbation=False, + seq_ordering="laplace:.1000", + partition_epoch=1, + ) + # Dump log-mel features into HDFDataset + dump_features_config = {} + dump_features_config["extern_data"] = train_data.extern_data + dump_features_config["network"] = copy.deepcopy(feature_extraction_net) + if model_checkpoint: + dump_features_config["network"]["output"] = { + "class": "hdf_dump", + "from": "log_mel_features", + "filename": "log_mel_features.hdf", + } + else: + dump_features_config["network"]["output"] = { + "class": "copy", + "from": "log_mel_features", + } + dump_features_config["forward_batch_size"] = 20_000 * 80 + dump_features_config["use_tensorflow"] = True + dump_features_config["eval"] = train_data.train.as_returnn_opts() + from i6_core.returnn import ReturnnForwardJob, ReturnnConfig + + hdf_filename = "log_mel_features.hdf" if model_checkpoint else "output.hdf" + + dump_features_job = ReturnnForwardJob( + returnn_config=ReturnnConfig(config=dump_features_config), + returnn_python_exe=RETURNN_CPU_EXE, + returnn_root=kwargs.get("returnn_root", RETURNN_ROOT), + model_checkpoint=model_checkpoint, + hdf_outputs=[hdf_filename] if model_checkpoint else [], + device="cpu", + mem_rqmt=15, + time_rqmt=72, + eval_mode=True if model_checkpoint else False, + ) + dump_features_job.add_alias(f"ted2_stats/{output_dirname}/dump_train_log_mel_features") + 
tk.register_output( + f"ted2_stats/{output_dirname}/log_mel_features.hdf", dump_features_job.out_hdf_files[hdf_filename] + ) + + # Extract features stats from HDFDataset + extract_stats_returnn_config = ReturnnConfig( + { + "extern_data": { + "data": {"dim": feat_dim}, + }, + "train": { + "class": "HDFDataset", + "files": [dump_features_job.out_hdf_files[hdf_filename]], + "use_cache_manager": True, + }, + "batch_size": 20_000 * 80, + "use_tensorflow": True, + } + ) + from i6_core.returnn.dataset import ExtractDatasetMeanStddevJob + + extract_mean_stddev_job = ExtractDatasetMeanStddevJob( + returnn_config=extract_stats_returnn_config, + returnn_python_exe=RETURNN_CPU_EXE, + returnn_root=kwargs.get("returnn_root", RETURNN_ROOT), + ) + extract_mean_stddev_job.add_alias(f"ted2_stats/{output_dirname}/extract_mean_stddev") + + tk.register_output(f"ted2_stats/{output_dirname}/mean_var", extract_mean_stddev_job.out_mean) + tk.register_output(f"ted2_stats/{output_dirname}/std_dev_var", extract_mean_stddev_job.out_std_dev) + tk.register_output(f"ted2_stats/{output_dirname}/mean_file", extract_mean_stddev_job.out_mean_file) + tk.register_output(f"ted2_stats/{output_dirname}/std_dev_file", extract_mean_stddev_job.out_std_dev_file) + + return ( + extract_mean_stddev_job.out_mean, + extract_mean_stddev_job.out_std_dev, + extract_mean_stddev_job.out_mean_file, + extract_mean_stddev_job.out_std_dev_file, + ) + + +def conformer_baseline(): + abs_name = os.path.abspath(__file__) + prefix_name = os.path.basename(abs_name)[: -len(".py")] + + def get_test_dataset_tuples(bpe_size): + test_dataset_tuples = {} + for testset in ["dev", "test"]: + test_dataset_tuples[testset] = build_test_dataset( + testset, + use_raw_features=True, + bpe_size=bpe_size, + ) + return test_dataset_tuples + + def run_train( + exp_name, + train_args, + train_data, + feature_extraction_net, + num_epochs, + recog_epochs, + **kwargs, + ): + exp_prefix = os.path.join(prefix_name, exp_name) + returnn_config = 
create_config( + training_datasets=train_data, + **train_args, + feature_extraction_net=feature_extraction_net, + recog_epochs=recog_epochs, + ) + train_job = training( + exp_prefix, + returnn_config, + RETURNN_CPU_EXE, + RETURNN_ROOT, + num_epochs=num_epochs, + gpu_mem=kwargs.get("gpu_mem", 11), + ) + return train_job + + def run_single_search( + exp_name, + train_data, + search_args, + checkpoint, + feature_extraction_net, + recog_dataset, + recog_ref, + recog_bliss, + mem_rqmt=8, + time_rqmt=4, + **kwargs, + ): + exp_prefix = os.path.join(prefix_name, exp_name) + returnn_search_config = create_config( + training_datasets=train_data, + **search_args, + feature_extraction_net=feature_extraction_net, + is_recog=True, + ) + search_single( + exp_prefix, + returnn_search_config, + checkpoint, + recognition_dataset=recog_dataset, + recognition_reference=recog_ref, + recognition_bliss_corpus=recog_bliss, + returnn_exe=RETURNN_CPU_EXE, + returnn_root=RETURNN_ROOT, + mem_rqmt=mem_rqmt, + time_rqmt=time_rqmt, + use_sclite=True, + ) + + def run_lm_fusion( + lm_type, + exp_name, + epoch, + test_set_names, + lm_scales, + train_job, + train_data, + feature_net, + bpe_size, + args, + beam_size=12, + prior_scales=None, + prior_type=None, + mini_lstm_ckpt=None, + length_norm=True, + length_norm_exponent=1.0, + prior_type_name=None, + coverage_scale=None, + coverage_threshold=None, + coverage_update=None, + monotonic_att_weights_loss_opts=None, + **kwargs, + ): + assert lm_type in ["lstm", "trafo"], "lm type should be lstm or trafo" + + if isinstance(lm_scales, float): + lm_scales = [lm_scales] + if prior_scales and isinstance(prior_scales, float): + prior_scales = [prior_scales] + if isinstance(test_set_names, str): + test_set_names = [test_set_names] + assert isinstance(test_set_names, list) + + if epoch == "avg": + search_checkpoint = train_job_avg_ckpt[exp_name] + elif epoch == "best": + search_checkpoint = train_job_best_epoch[exp_name] + else: + assert isinstance(epoch, 
int), "epoch must be either a defined integer or a string in {avg, best}." + search_checkpoint = train_job.out_checkpoints[epoch] + + ext_lm_opts = lstm_lm_opts_map[bpe_size] if lm_type == "lstm" else trafo_lm_opts_map[bpe_size] + + time_rqmt = 1.0 + + search_args = copy.deepcopy(args) + + if lm_type == "lstm": + if beam_size > 128: + search_args["batch_size"] = 4000 * 160 + + if lm_type == "trafo": + search_args["batch_size"] = 4000 * 160 if beam_size <= 32 else 2000 * 160 + time_rqmt = 2 + if beam_size > 50: + time_rqmt = 3 + + search_args["beam_size"] = beam_size + if kwargs.get("batch_size", None): + search_args["batch_size"] = kwargs["batch_size"] + + if not length_norm: + search_args["decoder_args"].length_normalization = False + else: + search_args["decoder_args"].length_normalization = True + search_args["decoder_args"].length_normalization_exponent = length_norm_exponent + + if "decoder_args" in kwargs: + for k, v in kwargs["decoder_args"].items(): + setattr(search_args["decoder_args"], k, v) + + scales = [(e,) for e in lm_scales] + + for test_set in test_set_names: + if prior_scales: + import itertools + + scales = itertools.product(lm_scales, prior_scales) + + for scale in scales: + lm_scale = scale[0] + prior_scale = scale[1] if len(scale) == 2 else None + if prior_scale and prior_scale > lm_scale: + continue + + # External LM opts + ext_lm_opts["lm_scale"] = lm_scale + search_args["ext_lm_opts"] = ext_lm_opts + + # ILM opts + if prior_scale: + ilm_opts = { + "scale": prior_scale, + "type": prior_type, + "ctx_dim": search_args["encoder_args"].enc_key_dim, # this is needed for mini-lstm + } + # this is needed for mini-self-att + if hasattr(search_args["decoder_args"], "num_layers"): + ilm_opts["num_dec_layers"] = search_args["decoder_args"].num_layers + search_args["decoder_args"].create_ilm_decoder = True + search_args["decoder_args"].ilm_type = prior_type + + ilm_opts.update(kwargs.get("ilm_train_opts", {})) # example for FFN, etc + + 
search_args["prior_lm_opts"] = ilm_opts + search_args["preload_from_files"] = { + "prior_lm": { + "filename": search_checkpoint, # copy ASR decoder to be used as ILM decoder + "prefix": "prior_", + } + } + if prior_type == "mini_lstm" or prior_type == "ffn": + assert mini_lstm_ckpt, "Mini-LSTM checkpoint not set." + search_args["preload_from_files"].update( + { + "mini_lstm": { + "filename": mini_lstm_ckpt, + "prefix": "mini_", + } + } + ) + + if prior_type_name is None: + prior_type_name = prior_type + + lm_desc = "" + if prior_scale: + lm_desc = f"ILM_{prior_type_name}/" + + lm_desc += f"lm-scale-{lm_scale}" + if prior_scale: + lm_desc += f"-prior-{prior_scale}" + lm_desc += f"-beam-{beam_size}" + if length_norm is False: + lm_desc += "-woLenNorm" + + if coverage_scale and coverage_threshold: + assert isinstance(search_args["decoder_args"], RNNDecoderArgs) + search_args["decoder_args"].coverage_scale = coverage_scale + search_args["decoder_args"].coverage_threshold = coverage_threshold + search_args["decoder_args"].coverage_update = coverage_update + lm_desc += f"_coverage-thre{coverage_threshold}-scale{coverage_scale}-{coverage_update}" + + if monotonic_att_weights_loss_opts: + search_args["decoder_args"].monotonic_att_weights_loss_opts = monotonic_att_weights_loss_opts + search_args["decoder_args"].use_monotonic_att_weights_loss_in_recog = True + lm_desc += "_monoAtt" + for k, v in monotonic_att_weights_loss_opts.items(): + lm_desc += f"-{k}{v}" + + if not length_norm: + lm_desc += "_noLenNorm" + elif length_norm_exponent != 1.0: + lm_desc += f"_lenNormExp{length_norm_exponent}" + + name = f"{exp_name}/recog-{lm_type}-lm/ep-{epoch}/{lm_desc}/{test_set}" + + test_dataset_tuples = get_test_dataset_tuples(bpe_size=bpe_size) + + run_single_search( + exp_name=name, + train_data=train_data, + search_args=search_args, + checkpoint=search_checkpoint, + feature_extraction_net=feature_net, + recog_dataset=test_dataset_tuples[test_set][0], + 
recog_ref=test_dataset_tuples[test_set][1], + recog_bliss=test_dataset_tuples[test_set][2], + time_rqmt=kwargs.get("time_rqmt", time_rqmt), + ) + + def run_search( + exp_name, + train_args, + train_data, + train_job, + feature_extraction_net, + num_epochs, + search_args, + recog_epochs, + bpe_size, + **kwargs, + ): + exp_prefix = os.path.join(prefix_name, exp_name) + + search_args = search_args if search_args is not None else train_args + + returnn_search_config = create_config( + training_datasets=train_data, + **search_args, + feature_extraction_net=feature_extraction_net, + is_recog=True, + ) + + num_avg = kwargs.get("num_avg", 4) + averaged_checkpoint = get_average_checkpoint( + train_job, + returnn_exe=RETURNN_CPU_EXE, + returnn_root=RETURNN_ROOT, + num_average=num_avg, + key=kwargs.get("avg_key", "dev_score_output/output_prob"), + ) + if num_avg == 4: # TODO: just for now to not break hashes + train_job_avg_ckpt[exp_name] = averaged_checkpoint + + best_checkpoint = get_best_checkpoint(train_job, key=kwargs.get("avg_key", "dev_score_output/output_prob")) + train_job_best_epoch[exp_name] = best_checkpoint + + if recog_epochs is None: + default_recog_epochs = [40] + default_recog_epochs += [80 * i for i in range(1, int(num_epochs / 80) + 1)] + if num_epochs % 80 != 0: + default_recog_epochs += [num_epochs] + else: + default_recog_epochs = recog_epochs + + test_dataset_tuples = get_test_dataset_tuples(bpe_size=bpe_size) + + run_only_avg = kwargs.get("run_only_avg", False) + + if not run_only_avg: + for ep in default_recog_epochs: + search( + exp_prefix + f"/recogs/ep-{ep}", + returnn_search_config, + train_job.out_checkpoints[ep], + test_dataset_tuples, + RETURNN_CPU_EXE, + RETURNN_ROOT, + ) + + search( + exp_prefix + "/default_last", + returnn_search_config, + train_job.out_checkpoints[num_epochs], + test_dataset_tuples, + RETURNN_CPU_EXE, + RETURNN_ROOT, + ) + + search( + exp_prefix + "/default_best", + returnn_search_config, + best_checkpoint, + 
test_dataset_tuples, + RETURNN_CPU_EXE, + RETURNN_ROOT, + use_sclite=True, + ) + + beam_size = search_args.get("beam_size", 12) + if beam_size != 12: + exp_prefix += f"_beam-{beam_size}" + if search_args["decoder_args"].coverage_scale: + exp_prefix += f"_coverage-thre{search_args['decoder_args'].coverage_threshold}-scale{search_args['decoder_args'].coverage_scale}" + search( + exp_prefix + f"/average_{num_avg}", + returnn_search_config, + averaged_checkpoint, + test_dataset_tuples, + RETURNN_CPU_EXE, + RETURNN_ROOT, + use_sclite=True, + ) + + def run_concat_seq_recog(exp_name, corpus_names, num, train_data, search_args, checkpoint, mem_rqmt=8, time_rqmt=1): + exp_prefix = os.path.join(prefix_name, exp_name) + + from i6_experiments.users.zeineldeen.experiments.chunkwise_att_2023.concat_seqs import ( + ConcatDatasetSeqsJob, + ConcatSeqsDataset, + CreateConcatSeqsCTMAndSTMJob, + ) + from i6_core.corpus.convert import CorpusToStmJob + + if isinstance(corpus_names, str): + corpus_names = [corpus_names] + assert isinstance(corpus_names, list) + + for corpus_name in corpus_names: + test_datasets = get_test_dataset_tuples(bpe_size=BPE_1K) + stm = CorpusToStmJob(bliss_corpus=test_datasets[corpus_name][2]).out_stm_path + tk.register_output(f"concat_seqs/{num}/orig_{corpus_name}_stm", stm) + concat_dataset_seqs = ConcatDatasetSeqsJob( + corpus_name="TED-LIUM-realease2", stm=stm, num=num, overlap_dur=None + ) + tk.register_output(f"concat_seqs/{num}/{corpus_name}_stm", concat_dataset_seqs.out_stm) + tk.register_output(f"concat_seqs/{num}/{corpus_name}_tags", concat_dataset_seqs.out_concat_seq_tags) + tk.register_output(f"concat_seqs/{num}/{corpus_name}_lens", concat_dataset_seqs.out_concat_seq_lens_py) + + returnn_search_config = create_config( + training_datasets=train_data, + **search_args, + feature_extraction_net=log10_net_10ms, + is_recog=True, + ) + + returnn_concat_dataset = ConcatSeqsDataset( + dataset=test_datasets[corpus_name][0].as_returnn_opts(), + 
seq_tags=concat_dataset_seqs.out_concat_seq_tags, + seq_lens_py=concat_dataset_seqs.out_orig_seq_lens_py, + ) + + _, search_words = search_single( + os.path.join(exp_prefix, corpus_name), + returnn_search_config, + checkpoint, + recognition_dataset=returnn_concat_dataset, + recognition_reference=test_datasets[corpus_name][1], + recognition_bliss_corpus=test_datasets[corpus_name][2], + returnn_exe=RETURNN_CPU_EXE, + returnn_root=RETURNN_ROOT, + mem_rqmt=mem_rqmt, + time_rqmt=time_rqmt, + # no scoring + use_sclite=False, + use_returnn_compute_wer=False, + ) + + from i6_core.corpus.convert import CorpusToStmJob + from i6_core.recognition.scoring import ScliteJob + + stm_file = concat_dataset_seqs.out_stm + + concat_ctm_and_stm_job = CreateConcatSeqsCTMAndSTMJob( + recog_words_file=search_words, stm_py_file=concat_dataset_seqs.out_stm_py, stm_file=stm_file + ) + tk.register_output(exp_prefix + f"/{corpus_name}/sclite/stm", concat_ctm_and_stm_job.out_stm_file) + tk.register_output(exp_prefix + f"/{corpus_name}/sclite/ctm", concat_ctm_and_stm_job.out_ctm_file) + + sclite_job = ScliteJob( + ref=concat_ctm_and_stm_job.out_stm_file, + hyp=concat_ctm_and_stm_job.out_ctm_file, + sctk_binary_path=SCTK_BINARY_PATH, + ) + tk.register_output(exp_prefix + f"/{corpus_name}/sclite/wer", sclite_job.out_wer) + tk.register_output(exp_prefix + f"/{corpus_name}/sclite/report", sclite_job.out_report_dir) + + def run_exp( + exp_name, + train_args, + feature_extraction_net=log10_net_10ms, + num_epochs=300, + search_args=None, + recog_epochs=None, + bpe_size=1000, + partition_epoch=4, + **kwargs, + ): + if train_args.get("retrain_checkpoint", None): + assert kwargs.get("epoch_wise_filter", None) is None, "epoch_wise_filter should be disabled for retraining." 
+ train_data = build_training_datasets( + bpe_size=bpe_size, + use_raw_features=True, + epoch_wise_filter=kwargs.get("epoch_wise_filter", [(1, 5, 1000)]), + link_speed_perturbation=train_args.get("speed_pert", True), + seq_ordering=kwargs.get("seq_ordering", "laplace:.1000"), + partition_epoch=partition_epoch, + devtrain_subset=kwargs.get("devtrain_subset", 507), # same as num of dev segments + ) + train_job = run_train( + exp_name, + train_args, + train_data, + feature_extraction_net, + num_epochs, + recog_epochs, + **kwargs, + ) + train_jobs_map[exp_name] = train_job + + run_search( + exp_name, + train_args, + train_data, + train_job, + feature_extraction_net, + num_epochs, + search_args, + recog_epochs, + bpe_size=bpe_size, + **kwargs, + ) + + if kwargs.get("concat_recog_opts", None): + ckpt_ = kwargs["concat_recog_opts"]["checkpoint"] + if isinstance(ckpt_, str): + assert ckpt_ in ["best", "avg"] + if ckpt_ == "best": + concat_recog_ckpt = train_job_best_epoch[exp_name] + else: + concat_recog_ckpt = train_job_avg_ckpt[exp_name] + elif isinstance(ckpt_, int): + concat_recog_ckpt = train_job.out_checkpoints[ckpt_] + else: + raise TypeError(f"concat_recog_opts['checkpoint'] must be str or int, got {type(ckpt_)}") + concat_recog_search_args = kwargs["concat_recog_opts"].get("search_args", None) + search_args_ = copy.deepcopy(train_args) + if concat_recog_search_args: + search_args_.update(concat_recog_search_args) + run_concat_seq_recog( + exp_name=exp_name + f"_concat{kwargs['concat_recog_opts']['num']}", + corpus_names=kwargs["concat_recog_opts"]["corpus_names"], + num=kwargs["concat_recog_opts"]["num"], + train_data=train_data, + search_args=search_args_, + checkpoint=concat_recog_ckpt, + ) + + return train_job, train_data + + def train_mini_lstm( + exp_name, + checkpoint, + args, + epoch_split, + num_epochs=20, + lr=8e-4, + time_rqmt=4, + l2=1e-4, + name="mini_lstm", + w_drop=False, + use_dec_state=False, + use_ffn=False, + ffn_opts=None, + **kwargs, + ): + if 
not w_drop: + params_freeze_str = ilm_helpers.get_mini_lstm_params_freeze_str() + else: + if use_ffn: + params_freeze_str = ilm_helpers.get_ffn_params_freeze_str_w_drop(ffn_opts["num_ffn_layers"]) + else: + params_freeze_str = ilm_helpers.get_mini_lstm_params_freeze_str_w_drop() + + mini_lstm_args = copy.deepcopy(args) + mini_lstm_args["epoch_split"] = epoch_split + mini_lstm_args["batch_size"] = 20000 * 160 + mini_lstm_args["with_pretrain"] = False + mini_lstm_args["lr"] = lr + mini_lstm_args["learning_rates_list"] = None + mini_lstm_args["allow_lr_scheduling"] = False + mini_lstm_args["encoder_args"].with_ctc = False + mini_lstm_args["keep_all_epochs"] = True # keep everything + mini_lstm_args["extra_str"] = params_freeze_str + mini_lstm_args["preload_from_files"] = { + "import": { + "init_for_train": True, + "ignore_missing": True, + "filename": checkpoint, + } + } + mini_lstm_args.update(kwargs) + + exp_prefix = os.path.join(prefix_name, exp_name, name) + mini_lstm_train_data = build_training_datasets( + bpe_size=BPE_1K, + use_raw_features=True, + epoch_wise_filter=None, + link_speed_perturbation=False, # depends only on text + seq_ordering=kwargs.get("seq_ordering", "laplace:.1000"), + ) + returnn_config = create_config( + training_datasets=mini_lstm_train_data, + **mini_lstm_args, + feature_extraction_net=log10_net_10ms, + ) + + inp = "s" if use_dec_state else "prev:target_embed" + + if use_ffn: + x = inp + activations = ffn_opts["activations"] + for l in range(ffn_opts["num_ffn_layers"]): + returnn_config.config["network"]["output"]["unit"]["ffn_%02i" % (l + 1)] = { + "class": "linear", + "n_out": ffn_opts["ffn_dims"][l], + "L2": l2, + "from": inp, + "activation": activations[l] if activations and l < len(activations) else None, + } + x = "ffn_%02i" % (l + 1) + + returnn_config.config["network"]["output"]["unit"]["att"] = { + "class": "linear", + "from": x, + "activation": None, + "n_out": mini_lstm_args["encoder_args"].enc_key_dim, + "L2": l2, + } + else: + 
# Mini-LSTM + FF + + returnn_config.config["network"]["output"]["unit"]["att_lstm"] = { + "class": "rec", + "unit": "nativelstm2", + "from": inp, + "n_out": 50, + } + + returnn_config.config["network"]["output"]["unit"]["att"] = { + "class": "linear", + "from": "att_lstm", + "activation": None, + "n_out": mini_lstm_args["encoder_args"].enc_key_dim, + "L2": l2, + } + + train_job = training( + exp_prefix, + returnn_config, + RETURNN_CPU_EXE, + RETURNN_ROOT, + num_epochs=num_epochs, + time_rqmt=time_rqmt, + ) + return train_job + + def train_mini_self_att( + exp_name, + checkpoint, + args, + num_epochs=20, + lr=8e-4, + time_rqmt=4, + name="mini_self_att", + **kwargs, + ): + """ + Same idea as Mini-LSTM but use masked (mini-)self-attention models instead of cross attention. + Note that each layer has its own (mini-)self-attention. + + In the case of transformer decoder, we want to replace cross-attention layers namely: + transformer_decoder_{idx}_att_linear + with masked self-attention models. + """ + + params_freeze_str = ilm_helpers.get_mini_self_att_params_freeze_str_w_drop(args["decoder_args"].num_layers) + + mini_self_att = copy.deepcopy(args) + mini_self_att["batch_size"] = 20000 * 160 # TODO: does this fit now? 
+ mini_self_att["with_pretrain"] = False + mini_self_att["lr"] = lr + mini_self_att["allow_lr_scheduling"] = False + mini_self_att["encoder_args"].with_ctc = False + # mini_self_att['keep_all_epochs'] = True # keep everything + mini_self_att["extra_str"] = params_freeze_str + mini_self_att["preload_from_files"] = { + "import": { + "init_for_train": True, + "ignore_missing": True, + "filename": checkpoint, + } + } + if "decoder_args" in kwargs: + assert isinstance(kwargs["decoder_args"], dict) + for k, v in kwargs["decoder_args"].items(): + setattr(mini_self_att["decoder_args"], k, v) + kwargs.pop("decoder_args") + mini_self_att.update(kwargs) + + exp_prefix = os.path.join(prefix_name, exp_name, name) + mini_self_att_train_data = build_training_datasets( + bpe_size=10000, + use_raw_features=True, + epoch_wise_filter=None, + link_speed_perturbation=False, # depends only on text + seq_ordering=kwargs.get("seq_ordering", "laplace:.1000"), + ) + + # use masked self-att instead of cross-att with layer names having "ilm_" as prefix + mini_self_att["decoder_args"].replace_cross_att_w_masked_self_att = True + + returnn_config = create_config( + training_datasets=mini_self_att_train_data, + **mini_self_att, + feature_extraction_net=log10_net_10ms, + ) + train_job = training( + exp_prefix, + returnn_config, + RETURNN_CPU_EXE, + RETURNN_ROOT, + num_epochs=num_epochs, + time_rqmt=time_rqmt, + ) + return train_job + + # --------------------------- General Settings --------------------------- # + + ebranch_enc_args = EBranchformerEncoderArgs( + num_blocks=12, + input_layer="conv-6", + att_num_heads=6, + ff_dim=1536, + enc_key_dim=384, + conv_kernel_size=31, + pos_enc="rel", + dropout=0.1, + att_dropout=0.1, + l2=0.0001, + ) + apply_fairseq_init_to_conformer(ebranch_enc_args) + ebranch_enc_args.ctc_loss_scale = 1.0 + + rnn_dec_args = RNNDecoderArgs() + + trafo_dec_args = TransformerDecoderArgs( + num_layers=6, + embed_dropout=0.1, + label_smoothing=0.1, + apply_embed_weight=True, 
+ pos_enc="rel", + ) + apply_fairseq_init_to_transformer_decoder(trafo_dec_args) + + training_args = dict() + training_args["with_staged_network"] = True + training_args["speed_pert"] = True + + trafo_training_args = copy.deepcopy(training_args) + trafo_training_args["pretrain_opts"] = { + "variant": 3, + "initial_batch_size": 20000 * 160, + } + trafo_training_args["pretrain_reps"] = 5 + trafo_training_args["batch_size"] = 12000 * 160 # frames * samples per frame + + trafo_dec_exp_args = copy.deepcopy( + { + **trafo_training_args, + "encoder_args": ebranch_enc_args, + "decoder_args": trafo_dec_args, + } + ) + + lstm_training_args = copy.deepcopy(training_args) + lstm_training_args["pretrain_opts"] = { + "variant": 3, + "initial_batch_size": 22500 * 160, + } + lstm_training_args["pretrain_reps"] = 5 + lstm_training_args["batch_size"] = 15000 * 160 # frames * samples per frame + + lstm_dec_exp_args = copy.deepcopy( + { + **lstm_training_args, + "encoder_args": ebranch_enc_args, + "decoder_args": rnn_dec_args, + } + ) + + # --------------------------- Experiments --------------------------- # + + oclr_args = copy.deepcopy(lstm_dec_exp_args) + oclr_args["oclr_opts"] = { + "peak_lr": 9e-4, + "final_lr": 1e-6, + } + oclr_args["encoder_args"].input_layer = "conv-6" + oclr_args["encoder_args"].use_sqrd_relu = True + oclr_args["max_seq_length"] = None + + _, _, global_mean, global_std = compute_features_stats(output_dirname="logmel_80", feat_dim=80) + + # --------------------- V1 --------------------- + def get_base_v1_args(lr, ep, enc_drop=0.1, pretrain_reps=3, use_legacy_stats=True): + # base_bpe1000_peakLR0.0008_ep200_globalNorm_epochOCLR_pre3_fixZoneout_encDrop0.1_woDepthConvPre + # Average ckpt: 8.19/7.64 (50 epochs) + # - Epoch-based OCLR with peak LR 8e-4 + # - EncDrop 0.1, fixed zoneout + # - Pretrain 3, no depthwise conv pretrain + # - Feature global normalization + + base_v1_args = copy.deepcopy(oclr_args) + base_v1_args.pop("oclr_opts") + cyc_ep = int(0.45 * ep) 
+ # Epoch-based OCLR + base_v1_args["learning_rates_list"] = ( + list(numpy.linspace(lr / 10, lr, cyc_ep)) + + list(numpy.linspace(lr, lr / 10, cyc_ep)) + + list(numpy.linspace(lr / 10, 1e-6, ep - 2 * cyc_ep)) + ) + base_v1_args["global_stats"] = { + "mean": global_mean, + "stddev": global_std, + "use_legacy_version": use_legacy_stats, + } + base_v1_args["pretrain_reps"] = pretrain_reps + base_v1_args["pretrain_opts"]["ignored_keys_for_reduce_dim"] = ["conv_kernel_size"] + base_v1_args["encoder_args"].dropout = enc_drop + base_v1_args["encoder_args"].dropout_in = enc_drop + base_v1_args["encoder_args"].att_dropout = enc_drop + base_v1_args["decoder_args"].use_zoneout_output = True + exp_name = f"ebranch_bpe1000_peakLR{lr}_ep{ep}_globalNorm_epochOCLR_pre{pretrain_reps}_fixZoneout_encDrop{enc_drop}_woDepthConvPre" + return base_v1_args, exp_name + + # base_bpe1000_peakLR0.0008_ep400_globalNorm_epochOCLR_pre3_fixZoneout_encDrop0.15_woDepthConvPre_weightDrop0.1_decAttDrop0.0_embedDim256_numBlocks12 + # 7.4 6.85 avg + # base_bpe1000_peakLR0.0008_ep200_globalNorm_epochOCLR_pre3_fixZoneout_encDrop0.1_woDepthConvPre + # 8.19 7.64 avg + # base_bpe1000_peakLR0.0008_ep200_globalNorm_epochOCLR_pre3_fixZoneout_encDrop0.15_woDepthConvPre_weightDrop0.1_decAttDrop0.0_embedDim256_numBlocks12_ctcScale0.3 + # 8.11 7.52 best + + for num_blocks in [12]: + for ep in [50 * 4]: + for lr in [8e-4]: + for target_embed_dim in [256]: + for dec_att_drop in [0.0]: + for weight_drop in [0.1]: + for enc_drop in [0.15]: + for ctc_scale in [1.0, 0.3]: + base_v1_args, exp_name = get_base_v1_args( + lr, ep, enc_drop=enc_drop, use_legacy_stats=False + ) + args = copy.deepcopy(base_v1_args) + + args["encoder_args"].num_blocks = num_blocks + + args["encoder_args"].frontend_conv_weight_dropout = weight_drop + args["encoder_args"].mhsa_weight_dropout = weight_drop + args["encoder_args"].ff_weight_dropout = weight_drop + args["encoder_args"].conv_weight_dropout = weight_drop + + 
args["decoder_args"].embed_dim = target_embed_dim + args["decoder_args"].att_dropout = dec_att_drop + + exp_name += f"_weightDrop{weight_drop}_decAttDrop{dec_att_drop}_embedDim{target_embed_dim}_numBlocks{num_blocks}" + + if ctc_scale != 1.0: + args["encoder_args"].ctc_loss_scale = ctc_scale + args["decoder_args"].ce_loss_scale = 1.0 - ctc_scale + exp_name += f"_ctcScale{ctc_scale}" + + run_exp( + exp_name, + args, + num_epochs=ep, + epoch_wise_filter=None, + bpe_size=BPE_1K, + partition_epoch=4, + ) + + +def py(): + conformer_baseline() diff --git a/users/zeineldeen/experiments/conformer_att_2023/tedlium2/configs/ted2_att_baseline.py b/users/zeineldeen/experiments/conformer_att_2023/tedlium2/configs/ted2_att_baseline.py index dfbed77d8..b9e6e4dde 100644 --- a/users/zeineldeen/experiments/conformer_att_2023/tedlium2/configs/ted2_att_baseline.py +++ b/users/zeineldeen/experiments/conformer_att_2023/tedlium2/configs/ted2_att_baseline.py @@ -1266,53 +1266,53 @@ def get_base_v1_args(lr, ep, enc_drop=0.1, pretrain_reps=3, use_legacy_stats=Tru partition_epoch=4, ) - # TODO: mixup - for num_blocks in [12]: - for ep in [50 * 4]: - for lr in [8e-4]: - for target_embed_dim in [256]: - for att_drop in [0.0]: - for weight_drop in [0.1]: - for enc_drop in [0.15]: - for ctc_scale in [0.3]: - for mixup_apply_prob in [0.2, 0.3, 0.4]: - base_v1_args, exp_name = get_base_v1_args( - lr, ep, enc_drop=enc_drop, use_legacy_stats=False - ) - - args = copy.deepcopy(base_v1_args) - args["encoder_args"].num_blocks = num_blocks - args["encoder_args"].mhsa_weight_dropout = weight_drop - args["encoder_args"].ff_weight_dropout = weight_drop - args["encoder_args"].conv_weight_dropout = weight_drop - - args["decoder_args"].embed_dim = target_embed_dim - args["decoder_args"].att_dropout = att_drop - - exp_name += f"_weightDrop{weight_drop}_decAttDrop{att_drop}_embedDim{target_embed_dim}_numBlocks{num_blocks}" - - args["mixup_aug_opts"] = { - "use_log10_features": True, - "buffer_size": 1_000_000, 
- "apply_prob": mixup_apply_prob, - "max_num_mix": 4, - "lambda_min": 0.15, - "lambda_max": 0.3, - } - exp_name += f"_mixup_{mixup_apply_prob}_4_0.15_0.3" - - if ctc_scale != 1.0: - args["encoder_args"].ctc_loss_scale = ctc_scale - args["decoder_args"].ce_loss_scale = 1.0 - ctc_scale - exp_name += f"_ctcScale{ctc_scale}" - - run_exp( - exp_name, - args, - num_epochs=ep, - epoch_wise_filter=None, - bpe_size=BPE_1K, - partition_epoch=4, - ) + # TODO: ebranchformer encoder - # TODO: smaller models + # # TODO: mixup + # for num_blocks in [12]: + # for ep in [50 * 4]: + # for lr in [8e-4]: + # for target_embed_dim in [256]: + # for att_drop in [0.0]: + # for weight_drop in [0.1]: + # for enc_drop in [0.15]: + # for ctc_scale in [0.3]: + # for mixup_apply_prob in [0.2, 0.3, 0.4]: + # base_v1_args, exp_name = get_base_v1_args( + # lr, ep, enc_drop=enc_drop, use_legacy_stats=False + # ) + # + # args = copy.deepcopy(base_v1_args) + # args["encoder_args"].num_blocks = num_blocks + # args["encoder_args"].mhsa_weight_dropout = weight_drop + # args["encoder_args"].ff_weight_dropout = weight_drop + # args["encoder_args"].conv_weight_dropout = weight_drop + # + # args["decoder_args"].embed_dim = target_embed_dim + # args["decoder_args"].att_dropout = att_drop + # + # exp_name += f"_weightDrop{weight_drop}_decAttDrop{att_drop}_embedDim{target_embed_dim}_numBlocks{num_blocks}" + # + # args["mixup_aug_opts"] = { + # "use_log10_features": True, + # "buffer_size": 1_000_000, + # "apply_prob": mixup_apply_prob, + # "max_num_mix": 4, + # "lambda_min": 0.15, + # "lambda_max": 0.3, + # } + # exp_name += f"_mixup_{mixup_apply_prob}_4_0.15_0.3" + # + # if ctc_scale != 1.0: + # args["encoder_args"].ctc_loss_scale = ctc_scale + # args["decoder_args"].ce_loss_scale = 1.0 - ctc_scale + # exp_name += f"_ctcScale{ctc_scale}" + # + # run_exp( + # exp_name, + # args, + # num_epochs=ep, + # epoch_wise_filter=None, + # bpe_size=BPE_1K, + # partition_epoch=4, + # ) diff --git 
a/users/zeineldeen/models/asr/decoder/args.py b/users/zeineldeen/models/asr/decoder/args.py new file mode 100644 index 000000000..fb7cf2592 --- /dev/null +++ b/users/zeineldeen/models/asr/decoder/args.py @@ -0,0 +1,137 @@ +from dataclasses import dataclass +from typing import Optional + + +class DecoderArgs: + pass + + +@dataclass +class TransformerDecoderArgs(DecoderArgs): + num_layers: int = 6 + att_num_heads: int = 8 + ff_dim: int = 2048 + ff_act: str = "relu" + pos_enc: Optional[str] = None + embed_pos_enc: bool = False + + # param init + ff_init: Optional[str] = None + mhsa_init: Optional[str] = None + mhsa_out_init: Optional[str] = None + + # dropout + dropout: float = 0.1 + att_dropout: float = 0.1 + embed_dropout: float = 0.1 + softmax_dropout: float = 0.0 + + ff_weight_noise: Optional[float] = None + mhsa_weight_noise: Optional[float] = None + ff_weight_dropout: Optional[float] = None + mhsa_weight_dropout: Optional[float] = None + + # other regularization + l2: float = 0.0 + self_att_l2: float = 0.0 + rel_pos_clipping: int = 16 + label_smoothing: float = 0.1 + apply_embed_weight: bool = False + + length_normalization: bool = True + + # ILM + replace_cross_att_w_masked_self_att: bool = False + create_ilm_decoder: bool = False + ilm_type: bool = None + ilm_args: Optional[dict] = None + + +@dataclass +class ConformerDecoderArgs(DecoderArgs): + num_layers: int = 6 + att_num_heads: int = 8 + ff_dim: int = 2048 + pos_enc: Optional[str] = "rel" + + # conv module + conv_kernel_size: int = 32 + + # param init + ff_init: Optional[str] = None + mhsa_init: Optional[str] = None + mhsa_out_init: Optional[str] = None + conv_module_init: Optional[str] = None + + # dropout + dropout: float = 0.1 + att_dropout: float = 0.1 + embed_dropout: float = 0.1 + softmax_dropout: float = 0.1 + + # other regularization + l2: float = 0.0001 + frontend_conv_l2: float = 0.0001 + rel_pos_clipping: int = 16 + label_smoothing: float = 0.1 + apply_embed_weight: bool = False + + 
length_normalization: bool = True + + use_sqrd_relu: bool = False + + # ILM + replace_cross_att_w_masked_self_att: bool = False + create_ilm_decoder: bool = False + ilm_type: bool = None + ilm_args: Optional[dict] = None + + +@dataclass +class RNNDecoderArgs(DecoderArgs): + att_num_heads: int = 1 + lstm_num_units: int = 1024 + output_num_units: int = 1024 + embed_dim: int = 640 + enc_key_dim: int = 1024 # also attention dim # also attention dim + + # location feedback + loc_conv_att_filter_size: Optional[int] = None + + # param init + lstm_weights_init: Optional[str] = None + embed_weight_init: Optional[str] = None + + # dropout + dropout: float = 0.0 + softmax_dropout: float = 0.3 + att_dropout: float = 0.0 + embed_dropout: float = 0.1 + rec_weight_dropout: float = 0.0 + + # other regularization + l2: float = 0.0001 + zoneout: bool = True + reduceout: bool = True + + # lstm lm + lstm_lm_dim: int = 1024 + add_lstm_lm: bool = False + + length_normalization: bool = True + length_normalization_exponent: float = 1.0 + + coverage_scale: float = None + coverage_threshold: float = None + coverage_update: str = "sum" + + ce_loss_scale: Optional[float] = 1.0 + + label_smoothing: float = 0.1 + + use_zoneout_output: bool = False + + monotonic_att_weights_loss_opts: Optional[dict] = None + use_monotonic_att_weights_loss_in_recog: Optional[bool] = False + + include_eos_in_search_output: bool = False diff --git a/users/zeineldeen/models/asr/encoder/args.py b/users/zeineldeen/models/asr/encoder/args.py new file mode 100644 index 000000000..56353430f --- /dev/null +++ b/users/zeineldeen/models/asr/encoder/args.py @@ -0,0 +1,87 @@ +from dataclasses import dataclass +from typing import Any, Dict, List, Optional + + +class EncoderArgs: + pass + + +@dataclass +class ConformerEncoderCommonArgs(EncoderArgs): + num_blocks: int = 12 + enc_key_dim: int = 512 + att_num_heads: int = 8 + ff_dim: int = 2048 + conv_kernel_size: int = 32 + input: str = "data" + input_layer: str = "lstm-6" + 
input_layer_conv_act: str = "relu" + add_abs_pos_enc_to_input: bool = False + pos_enc: str = "rel" + + sandwich_conv: bool = False + subsample: Optional[str] = None + use_causal_layers: bool = False + + # ctc + with_ctc: bool = True + native_ctc: bool = True + ctc_loss_scale: Optional[float] = None + ctc_self_align_delay: Optional[int] = None + ctc_self_align_scale: float = 0.5 + ctc_dropout: float = 0.0 + + # param init + ff_init: Optional[str] = None + mhsa_init: Optional[str] = None + mhsa_out_init: Optional[str] = None + conv_module_init: Optional[str] = None + start_conv_init: Optional[str] = None + + # dropout + dropout: float = 0.1 + dropout_in: float = 0.1 + att_dropout: float = 0.1 + lstm_dropout: float = 0.1 + + # weight dropout + ff_weight_dropout: Optional[float] = None + mhsa_weight_dropout: Optional[float] = None + conv_weight_dropout: Optional[float] = None + + # norms + batch_norm_opts: Optional[Dict[str, Any]] = None + use_ln: bool = False + + # other regularization + l2: float = 0.0001 + frontend_conv_l2: float = 0.0001 + self_att_l2: float = 0.0 + rel_pos_clipping: int = 16 + + use_sqrd_relu: bool = False + + convolution_first: bool = False + + +@dataclass +class ConformerEncoderArgs(ConformerEncoderCommonArgs): + weight_noise: Optional[float] = None + weight_noise_layers: Optional[List[str]] = None + + +@dataclass +class ConformerEncoderV2Args(ConformerEncoderCommonArgs): + # weight noise + ff_weight_noise: Optional[float] = None + mhsa_weight_noise: Optional[float] = None + conv_weight_noise: Optional[float] = None + frontend_conv_weight_noise: Optional[float] = None + + # weight dropout + frontend_conv_weight_dropout: Optional[float] = None + + +@dataclass +class EBranchformerEncoderArgs(ConformerEncoderV2Args): + pass diff --git a/users/zeineldeen/models/asr/encoder/ebranchformer_encoder.py b/users/zeineldeen/models/asr/encoder/ebranchformer_encoder.py index b513e5af6..6d6fda018 100644 --- 
a/users/zeineldeen/models/asr/encoder/ebranchformer_encoder.py +++ b/users/zeineldeen/models/asr/encoder/ebranchformer_encoder.py @@ -7,18 +7,15 @@ class EBranchformerEncoder(ConformerEncoderV2): * Ref: https://arxiv.org/pdf/2210.00077.pdf """ - def __init__(self, cgmlp_ff_dim, **kwargs): + def __init__(self, **kwargs): super().__init__(**kwargs) - assert cgmlp_ff_dim % 2 == 0, "cgmlp_dim must be even" - self.cgmlp_ff_dim = cgmlp_ff_dim - def _create_conv_spatial_gating_unit(self, prefix_name, source, layer_index): # Half split input into [A,B] -> A * DwConv(LN(B)) -> dropout # # see also here: https://github.com/espnet/espnet/blob/master/espnet2/asr/layers/cgmlp.py#L15 - split_size = self.cgmlp_ff_dim // 2 + split_size = self.enc_key_dim * 3 branch_a = self.network.add_slice_layer( "{}_branch_a".format(prefix_name), source, "F", slice_start=0, slice_end=split_size @@ -58,7 +55,7 @@ def _create_conv_gating_mlp(self, prefix_name, source, layer_index): ff1 = self.network.add_linear_layer( "{}_ff_1".format(prefix_name), ln, - n_out=self.cgmlp_ff_dim, + n_out=6 * self.enc_key_dim, l2=self.l2, forward_weights_init=self.ff_init, with_bias=False, From 502c327303e96bc408917cb8208c3a422c6c30f9 Mon Sep 17 00:00:00 2001 From: Albert Zeyer Date: Tue, 21 May 2024 23:33:42 +0200 Subject: [PATCH 043/227] more --- .../exp2024_04_23_baselines/ctc.py | 53 +++++++++++++------ 1 file changed, 38 insertions(+), 15 deletions(-) diff --git a/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py b/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py index 25943285b..480eefcd1 100644 --- a/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py +++ b/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py @@ -34,10 +34,10 @@ def py(): "spm20k", "bpe10k", # 8.23 "spm10k", # 8.12 - "spm_bpe10k", - "spm4k", + "spm_bpe10k", # 7.97 + "spm4k", # 9.86 "spm1k", - "spm_bpe1k", + "spm_bpe1k", # 11.76 ]: train_exp( # 8.23 
f"v6-bhv20-11gb-f32-bs15k-accgrad1-mgpu4-pavg100-wd1e_2-lrlin1e_5_295k-speedpertV2-{vocab}", @@ -51,18 +51,41 @@ def py(): vocab=vocab, ) - train_exp( - "v6-bhv20-11gb-f32-bs15k-accgrad1-mgpu4-pavg100-wd1e_2-lrlin1e_5_295k-speedpertV2-spm10k-spmSample03", - config_11gb_v6_f32_accgrad1_mgpu4_pavg100_wd1e_4, - config_updates={ - **_get_cfg_lrlin_oclr_by_bs_nep(15_000, 500), - "optimizer.weight_decay": 1e-2, - "__train_audio_preprocess": speed_pert_librosa_config, - "speed_pert_discrete_values": [0.7, 0.8, 0.9, 1.0, 1.1], - }, - vocab="spm10k", - train_vocab_opts={"other_opts": {"enable_sampling": True, "alpha": 0.3}}, - ) + for alpha in [ + 0.3, # 7.88 + 0.5, + 0.7, + ]: + train_exp( + "v6-bhv20-11gb-f32-bs15k-accgrad1-mgpu4-pavg100-wd1e_2-lrlin1e_5_295k-speedpertV2-spm10k" + f"-spmSample{str(alpha).replace('.', '')}", + config_11gb_v6_f32_accgrad1_mgpu4_pavg100_wd1e_4, + config_updates={ + **_get_cfg_lrlin_oclr_by_bs_nep(15_000, 500), + "optimizer.weight_decay": 1e-2, + "__train_audio_preprocess": speed_pert_librosa_config, + "speed_pert_discrete_values": [0.7, 0.8, 0.9, 1.0, 1.1], + }, + vocab="spm10k", + train_vocab_opts={"other_opts": {"enable_sampling": True, "alpha": alpha}}, + ) + + for alpha in [ + 0.3, + ]: + train_exp( + "v6-bhv20-11gb-f32-bs15k-accgrad1-mgpu4-pavg100-wd1e_2-lrlin1e_5_295k-speedpertV2-spm_bpe10k" + f"-spmSample{str(alpha).replace('.', '')}", + config_11gb_v6_f32_accgrad1_mgpu4_pavg100_wd1e_4, + config_updates={ + **_get_cfg_lrlin_oclr_by_bs_nep(15_000, 500), + "optimizer.weight_decay": 1e-2, + "__train_audio_preprocess": speed_pert_librosa_config, + "speed_pert_discrete_values": [0.7, 0.8, 0.9, 1.0, 1.1], + }, + vocab="spm_bpe10k", + train_vocab_opts={"other_opts": {"enable_sampling": True, "alpha": alpha}}, + ) # noinspection PyShadowingNames From 7a7e810af1a4bd0c4c0d2b8c0b69fe01f3e14db2 Mon Sep 17 00:00:00 2001 From: Simon Berger Date: Wed, 22 May 2024 13:12:06 +0200 Subject: [PATCH 044/227] Update users/berger --- 
.../config_01a_ctc_blstm_raw_samples.py | 4 +- .../config_01c_ctc_conformer_raw_samples.py | 4 +- .../config_01d_ctc_conformer_rasr_features.py | 2 + .../config_02b_transducer_rasr_features.py | 41 +- ..._02f_transducer_rasr_features_am_scales.py | 445 ++++++++++++++++++ ...ig_03b_transducer_fullsum_rasr_features.py | 32 +- .../librispeech/viterbi_transducer_data.py | 2 +- users/berger/settings.py | 10 +- users/berger/systems/functors/seq2seq_base.py | 1 + 9 files changed, 528 insertions(+), 13 deletions(-) create mode 100644 users/berger/configs/librispeech/20230210_baselines/config_02f_transducer_rasr_features_am_scales.py diff --git a/users/berger/configs/librispeech/20230210_baselines/config_01a_ctc_blstm_raw_samples.py b/users/berger/configs/librispeech/20230210_baselines/config_01a_ctc_blstm_raw_samples.py index 5ed506441..471df20c7 100644 --- a/users/berger/configs/librispeech/20230210_baselines/config_01a_ctc_blstm_raw_samples.py +++ b/users/berger/configs/librispeech/20230210_baselines/config_01a_ctc_blstm_raw_samples.py @@ -138,8 +138,8 @@ def run_exp() -> Tuple[SummaryReport, Checkpoint, Dict[str, AlignmentData]]: ) recog_args = exp_args.get_ctc_recog_step_args(num_classes) - align_args = exp_args.get_ctc_align_step_args(num_classes) - recog_args["epochs"] = [320, 400, 480, 500, "best"] + align_args = exp_args.get_ctc_align_step_args(num_classes, epoch=500) + recog_args["epochs"] = [320, 500, "best"] recog_args["prior_scales"] = [0.3] recog_args["lm_scales"] = [0.9] diff --git a/users/berger/configs/librispeech/20230210_baselines/config_01c_ctc_conformer_raw_samples.py b/users/berger/configs/librispeech/20230210_baselines/config_01c_ctc_conformer_raw_samples.py index 64f90fd07..2c3536d75 100644 --- a/users/berger/configs/librispeech/20230210_baselines/config_01c_ctc_conformer_raw_samples.py +++ b/users/berger/configs/librispeech/20230210_baselines/config_01c_ctc_conformer_raw_samples.py @@ -142,8 +142,8 @@ def run_exp() -> Tuple[SummaryReport, 
Checkpoint, Dict[str, AlignmentData]]: ) recog_args = exp_args.get_ctc_recog_step_args(num_classes) - align_args = exp_args.get_ctc_align_step_args(num_classes) - recog_args["epochs"] = [320, 400, 480, 500, "best"] + align_args = exp_args.get_ctc_align_step_args(num_classes, epoch=500) + recog_args["epochs"] = [320, 500, "best"] recog_args["prior_scales"] = [0.3] recog_args["lm_scales"] = [0.9] diff --git a/users/berger/configs/librispeech/20230210_baselines/config_01d_ctc_conformer_rasr_features.py b/users/berger/configs/librispeech/20230210_baselines/config_01d_ctc_conformer_rasr_features.py index 133e125d7..9d08d868d 100644 --- a/users/berger/configs/librispeech/20230210_baselines/config_01d_ctc_conformer_rasr_features.py +++ b/users/berger/configs/librispeech/20230210_baselines/config_01d_ctc_conformer_rasr_features.py @@ -133,6 +133,7 @@ def run_exp() -> Tuple[SummaryReport, Checkpoint, Dict[str, AlignmentData]]: recog_args["feature_type"] = FeatureType.GAMMATONE_16K recog_args["prior_scales"] = [0.3] recog_args["lm_scales"] = [0.9] + recog_args["search_stats"] = True align_args["feature_type"] = FeatureType.GAMMATONE_16K # ********** System ********** @@ -146,6 +147,7 @@ def run_exp() -> Tuple[SummaryReport, Checkpoint, Dict[str, AlignmentData]]: SummaryKey.PRIOR, SummaryKey.LM, SummaryKey.WER, + SummaryKey.RTF, SummaryKey.SUB, SummaryKey.INS, SummaryKey.DEL, diff --git a/users/berger/configs/librispeech/20230210_baselines/config_02b_transducer_rasr_features.py b/users/berger/configs/librispeech/20230210_baselines/config_02b_transducer_rasr_features.py index bdddbde1e..f6deeff86 100644 --- a/users/berger/configs/librispeech/20230210_baselines/config_02b_transducer_rasr_features.py +++ b/users/berger/configs/librispeech/20230210_baselines/config_02b_transducer_rasr_features.py @@ -319,17 +319,19 @@ def run_exp( dev_data_config=data.cv_data_config, label_smoothing=None, loss_boost_v2=False, - loss_boost_scale=5.0, + loss_boost_scale=0.0, + specaug_v2=True, 
peak_lr=8e-04, model_preload=None, ), recog_configs={ - "recog_ilm": generate_returnn_config( + f"recog_ilm-{ilm_scale}": generate_returnn_config( train=False, train_data_config=data.train_data_config, dev_data_config=data.cv_data_config, - ilm_scale=0.2, + ilm_scale=ilm_scale, ) + for ilm_scale in [0.1, 0.2, 0.3] }, ), ) @@ -339,6 +341,39 @@ def run_exp( system.run_dev_recog_step(**recog_args) system.run_test_recog_step(**recog_args) + if "blstm" in name_suffix: + recog_args.update( + { + "epochs": [ + 245, + 294, + 320, + 321, + 323, + 369, + 376, + 381, + 382, + 384, + 385, + 386, + 387, + 393, + 394, + 395, + 397, + 399, + 400, + ], + "lm_scales": [0.4, 0.5, 0.6, 0.7, 0.8, 0.9], + } + ) + system.run_recog_step_for_corpora( + exp_names=[f"Conformer_Transducer_Viterbi_specaug-v2_{name_suffix}"], + corpora=["dev-other_4gram"], + **recog_args, + ) + train_job = system.get_train_job(f"Conformer_Transducer_Viterbi_lr-0.0008_{name_suffix}") model = train_job.out_checkpoints[400] assert isinstance(model, Checkpoint) diff --git a/users/berger/configs/librispeech/20230210_baselines/config_02f_transducer_rasr_features_am_scales.py b/users/berger/configs/librispeech/20230210_baselines/config_02f_transducer_rasr_features_am_scales.py new file mode 100644 index 000000000..320f83114 --- /dev/null +++ b/users/berger/configs/librispeech/20230210_baselines/config_02f_transducer_rasr_features_am_scales.py @@ -0,0 +1,445 @@ +import copy +import os +from typing import Dict, Tuple, Optional + +import i6_core.rasr as rasr +from i6_core.returnn import Checkpoint +from i6_core.returnn.config import ReturnnConfig +from i6_experiments.users.berger.args.experiments import transducer as exp_args +from i6_experiments.users.berger.args.returnn.config import get_returnn_config +from i6_experiments.users.berger.args.returnn.learning_rates import ( + LearningRateSchedules, +) +from i6_experiments.users.berger.corpus.librispeech.viterbi_transducer_data import ( + get_librispeech_data, +) +import 
i6_experiments.users.berger.network.models.context_1_transducer as transducer_model +from i6_experiments.users.berger.recipe.summary.report import SummaryReport +from i6_experiments.users.berger.systems.returnn_seq2seq_system import ( + ReturnnSeq2SeqSystem, +) +from i6_experiments.users.berger.systems.dataclasses import ReturnnConfigs, FeatureType, SummaryKey +from i6_experiments.users.berger.util import default_tools +from i6_private.users.vieting.helpers.returnn import serialize_dim_tags +from i6_experiments.users.berger.systems.dataclasses import AlignmentData +from i6_experiments.users.berger.recipe.returnn.hdf import MatchLengthsJob +from sisyphus import gs, tk + +tools = copy.deepcopy(default_tools) + +# ********** Settings ********** + +rasr.flow.FlowNetwork.default_flags = {"cache_mode": "task_dependent"} + + +num_classes = 79 +num_epochs = 600 + + +# ********** Return Config ********** + + +def generate_returnn_config( + train: bool, + *, + train_data_config: dict, + dev_data_config: dict, + **kwargs, +) -> ReturnnConfig: + specaug_v2 = kwargs.get("specaug_v2", False) + if specaug_v2: + specaug_args = { + "min_reps_time": 0, + "max_reps_time": 20, + "max_len_time": 20, + "min_reps_feature": 0, + "max_reps_feature": 1, + "max_len_feature": 15, + } + else: + specaug_args = { + "max_time_num": 1, + "max_time": 15, + "max_feature_num": 5, + "max_feature": 5, + } + + if train: + ( + network_dict, + extra_python, + ) = transducer_model.make_context_1_conformer_transducer( + num_outputs=num_classes, + specaug_args=specaug_args, + conformer_args={ + "num_blocks": 12, + "size": 512, + "dropout": 0.1, + "l2": 5e-06, + }, + decoder_args={ + "dec_mlp_args": { + "num_layers": 2, + "size": 640, + "activation": "tanh", + "dropout": 0.1, + "l2": 5e-06, + }, + "combination_mode": "concat", + "joint_mlp_args": { + "num_layers": 1, + "size": 1024, + "dropout": 0.1, + "l2": 5e-06, + "activation": "tanh", + }, + }, + output_args={ + "label_smoothing": 
kwargs.get("label_smoothing", None), + }, + loss_boost_scale=kwargs.get("loss_boost_scale", 5.0), + loss_boost_v2=kwargs.get("loss_boost_v2", False), + specaug_v2=specaug_v2, + ) + else: + ( + network_dict, + extra_python, + ) = transducer_model.make_context_1_conformer_transducer_recog( + num_outputs=num_classes, + conformer_args={ + "num_blocks": 12, + "size": 512, + }, + decoder_args={ + "dec_mlp_args": { + "num_layers": 2, + "size": 640, + "activation": "tanh", + }, + "combination_mode": "concat", + "joint_mlp_args": { + "num_layers": 1, + "size": 1024, + "activation": "tanh", + }, + "ilm_scale": kwargs.get("ilm_scale", 0.0), + }, + ) + + extra_config = { + "train": train_data_config, + "dev": dev_data_config, + "chunking": ( + { + "data": 256, + "classes": 64, + }, + { + "data": 128, + "classes": 32, + }, + ), + } + + if kwargs.get("model_preload", None) is not None: + extra_config["preload_from_files"] = { + "base": { + "init_for_train": True, + "ignore_missing": True, + "filename": kwargs.get("model_preload", None), + } + } + + returnn_config = get_returnn_config( + network=network_dict, + target="classes", + num_epochs=num_epochs, + python_prolog=[ + "import sys", + "sys.setrecursionlimit(10 ** 6)", + ], + extra_python=extra_python, + num_inputs=50, + num_outputs=num_classes, + extern_target_kwargs={"dtype": "int8" if train else "int32"}, + extern_data_config=True, + grad_noise=0.0, + grad_clip=0.0, + schedule=LearningRateSchedules.OCLR, + initial_lr=1e-05, + peak_lr=kwargs.get("peak_lr", 8e-04), + final_lr=1e-06, + n_steps_per_epoch=2450, + batch_size=12500, + extra_config=extra_config, + ) + returnn_config = serialize_dim_tags(returnn_config) + + return returnn_config + + +def subsample_by_4(orig_len: int) -> int: + return -(-orig_len // 4) + + +def run_exp( + alignments: Dict[str, AlignmentData], + ctc_model_checkpoint: Optional[Checkpoint] = None, + name_suffix: str = "", + data_control_train: bool = False, + data_control_cv: bool = False, + 
match_lengths: bool = False, +) -> Tuple[SummaryReport, Checkpoint]: + assert tools.returnn_root is not None + assert tools.returnn_python_exe is not None + assert tools.rasr_binary_path is not None + + data = get_librispeech_data( + tools.returnn_root, + tools.returnn_python_exe, + rasr_binary_path=tools.rasr_binary_path, + alignments=alignments, + add_unknown_phoneme_and_mapping=False, + # use_augmented_lexicon=False, + # use_wei_lexicon=True, + use_augmented_lexicon=True, + use_wei_lexicon=False, + feature_type=FeatureType.GAMMATONE_16K, + ) + changed_data_configs = [] + if data_control_train: + changed_data_configs.append(data.train_data_config) + if data_control_cv: + changed_data_configs.append(data.cv_data_config) + + data.train_data_config["datasets"]["classes"]["seq_ordering"] = "laplace:.384" + data.train_data_config["datasets"]["classes"]["partition_epoch"] = 40 + + for data_config in changed_data_configs: + data_config["datasets"]["data"].update( + { + "seq_ordering": data_config["datasets"]["classes"]["seq_ordering"], + "partition_epoch": data_config["datasets"]["classes"]["partition_epoch"], + } + ) + del data_config["datasets"]["classes"]["seq_ordering"] + del data_config["datasets"]["classes"]["partition_epoch"] + data_config["seq_order_control_dataset"] = "data" + if match_lengths: + for data_config in [data.train_data_config, data.cv_data_config]: + data_config["datasets"]["classes"]["files"] = [ + MatchLengthsJob(file, data_config["datasets"]["data"]["files"], subsample_by_4).out_hdf + for file in data_config["datasets"]["classes"]["files"] + ] + + # ********** System ********** + + system = ReturnnSeq2SeqSystem( + tools, + summary_keys=[ + SummaryKey.TRAIN_NAME, + SummaryKey.RECOG_NAME, + SummaryKey.CORPUS, + SummaryKey.EPOCH, + SummaryKey.LM, + SummaryKey.WER, + SummaryKey.SUB, + SummaryKey.INS, + SummaryKey.DEL, + SummaryKey.ERR, + ], + summary_sort_keys=[SummaryKey.ERR, SummaryKey.CORPUS], + ) + + system.init_corpora( + 
dev_keys=data.dev_keys, + test_keys=data.test_keys, + align_keys=data.align_keys, + corpus_data=data.data_inputs, + am_args=exp_args.transducer_recog_am_args, + ) + system.setup_scoring() + + # ********** Step args ********** + + train_args = exp_args.get_transducer_train_step_args( + num_epochs=num_epochs, + # gpu_mem_rqmt=24, + ) + + recog_args = exp_args.get_transducer_recog_step_args( + num_classes, + lm_scales=[0.8], + epochs=[num_epochs], + # lookahead_options={"scale": 0.5}, + search_parameters={"label-pruning": 12.0}, + feature_type=FeatureType.GAMMATONE_16K, + reduction_factor=4, + reduction_subtrahend=0, + ) + + # ********** Returnn Configs ********** + + for label_smoothing, loss_boost_v2, peak_lr, loss_boost_scale, ctc_init in [ + # (None, True, 4e-04, 5.0, False), + # (None, True, 8e-04, 5.0, False), + # # (None, False, 4e-04, 5.0, False), + # # (None, False, 8e-04, 5.0, False), + # # (None, False, 4e-04, 5.0, True), + # # (None, False, 8e-04, 5.0, True), + # (0.2, True, 4e-04, 5.0, False), + # (0.2, True, 8e-04, 5.0, False), + # # (0.2, False, 4e-04, 5.0, False), + # # (0.2, False, 8e-04, 5.0, False), + (None, False, 8e-04, 0.0, False), + # (None, False, 8e-04, 0.0, True), + ]: + ctc_init = ctc_init and (ctc_model_checkpoint is not None) + train_config = generate_returnn_config( + train=True, + train_data_config=data.train_data_config, + dev_data_config=data.cv_data_config, + label_smoothing=label_smoothing, + loss_boost_v2=loss_boost_v2, + loss_boost_scale=loss_boost_scale, + peak_lr=peak_lr, + model_preload=ctc_model_checkpoint if ctc_init else None, + specaug_v2=True, + ) + recog_config = generate_returnn_config( + train=False, + train_data_config=data.train_data_config, + dev_data_config=data.cv_data_config, + ) + recog_config_ilm = generate_returnn_config( + train=False, + train_data_config=data.train_data_config, + dev_data_config=data.cv_data_config, + ilm_scale=0.2, + ) + + returnn_configs = ReturnnConfigs( + train_config=train_config, + 
recog_configs={"recog": recog_config, "recog_ilm": recog_config_ilm}, + ) + + suffix = f"lr-{peak_lr}" + if loss_boost_scale: + if loss_boost_v2: + suffix += "_loss-boost-v2" + else: + suffix += "_loss-boost" + if label_smoothing: + suffix += f"_ls-{label_smoothing}" + + if ctc_init: + suffix += "_ctc-init" + + system.add_experiment_configs(f"Conformer_Transducer_Viterbi_{suffix}_{name_suffix}", returnn_configs) + + system.run_train_step(**train_args) + + system.run_dev_recog_step(**recog_args) + system.run_test_recog_step(**recog_args) + + train_job = system.get_train_job(f"Conformer_Transducer_Viterbi_lr-0.0008_{name_suffix}") + model = train_job.out_checkpoints[num_epochs] + assert isinstance(model, Checkpoint) + + assert system.summary_report + return system.summary_report, model + + +def py() -> SummaryReport: + filename_handle = os.path.splitext(os.path.basename(__file__))[0][len("config_") :] + gs.ALIAS_AND_OUTPUT_SUBDIR = f"{filename_handle}/" + + summary_report = SummaryReport() + + alignments_nour = {} + + alignment_paths_nour = { + 0.1: { + "train-other-960_align": tk.Path( + "/work/asr3/raissi/shared_workspaces/bayoumi/sisyphus_work/i6_experiments/users/berger/recipe/mm/alignment/Seq2SeqAlignmentJob.waHWItDFeH4p/output/alignment.cache.bundle" + ), + "dev-clean_align": tk.Path( + "/work/asr3/raissi/shared_workspaces/bayoumi/sisyphus_work/i6_experiments/users/berger/recipe/mm/alignment/Seq2SeqAlignmentJob.39RvKswiwE5X/output/alignment.cache.bundle" + ), + "dev-other_align": tk.Path( + "/work/asr3/raissi/shared_workspaces/bayoumi/sisyphus_work/i6_experiments/users/berger/recipe/mm/alignment/Seq2SeqAlignmentJob.UQcQtRgFJtri/output/alignment.cache.bundle" + ), + }, + 0.3: { + "train-other-960_align": tk.Path( + "/work/asr3/raissi/shared_workspaces/bayoumi/sisyphus_work/i6_experiments/users/berger/recipe/mm/alignment/Seq2SeqAlignmentJob.4bWrFMO9rBP7/output/alignment.cache.bundle" + ), + "dev-clean_align": tk.Path( + 
"/work/asr3/raissi/shared_workspaces/bayoumi/sisyphus_work/i6_experiments/users/berger/recipe/mm/alignment/Seq2SeqAlignmentJob.WAPZqf6YGRqV/output/alignment.cache.bundle" + ), + "dev-other_align": tk.Path( + "/work/asr3/raissi/shared_workspaces/bayoumi/sisyphus_work/i6_experiments/users/berger/recipe/mm/alignment/Seq2SeqAlignmentJob.8e6a0qmzOKPS/output/alignment.cache.bundle" + ), + }, + 0.5: { + "train-other-960_align": tk.Path( + "/work/asr3/raissi/shared_workspaces/bayoumi/sisyphus_work/i6_experiments/users/berger/recipe/mm/alignment/Seq2SeqAlignmentJob.9F7XAOE5SW6a/output/alignment.cache.bundle" + ), + "dev-clean_align": tk.Path( + "/work/asr3/raissi/shared_workspaces/bayoumi/sisyphus_work/i6_experiments/users/berger/recipe/mm/alignment/Seq2SeqAlignmentJob.NZ9KCbM3iaUM/output/alignment.cache.bundle" + ), + "dev-other_align": tk.Path( + "/work/asr3/raissi/shared_workspaces/bayoumi/sisyphus_work/i6_experiments/users/berger/recipe/mm/alignment/Seq2SeqAlignmentJob.NrLiIv3mx2Mi/output/alignment.cache.bundle" + ), + }, + 0.7: { + "train-other-960_align": tk.Path( + "/work/asr3/raissi/shared_workspaces/bayoumi/sisyphus_work/i6_experiments/users/berger/recipe/mm/alignment/Seq2SeqAlignmentJob.nOX1kOQx5Txi/output/alignment.cache.bundle" + ), + "dev-clean_align": tk.Path( + "/work/asr3/raissi/shared_workspaces/bayoumi/sisyphus_work/i6_experiments/users/berger/recipe/mm/alignment/Seq2SeqAlignmentJob.WhaHQ8VtCQWb/output/alignment.cache.bundle" + ), + "dev-other_align": tk.Path( + "/work/asr3/raissi/shared_workspaces/bayoumi/sisyphus_work/i6_experiments/users/berger/recipe/mm/alignment/Seq2SeqAlignmentJob.Z7Yc9kH2BYOc/output/alignment.cache.bundle" + ), + }, + 1.0: { + "train-other-960_align": tk.Path( + "/work/asr3/raissi/shared_workspaces/bayoumi/sisyphus_work/i6_experiments/users/berger/recipe/mm/alignment/Seq2SeqAlignmentJob.0a7MCFFN37Bg/output/alignment.cache.bundle" + ), + "dev-clean_align": tk.Path( + 
"/work/asr3/raissi/shared_workspaces/bayoumi/sisyphus_work/i6_experiments/users/berger/recipe/mm/alignment/Seq2SeqAlignmentJob.HjJgbxdZhWZj/output/alignment.cache.bundle" + ), + "dev-other_align": tk.Path( + "/work/asr3/raissi/shared_workspaces/bayoumi/sisyphus_work/i6_experiments/users/berger/recipe/mm/alignment/Seq2SeqAlignmentJob.UatqVP2YM55f/output/alignment.cache.bundle" + ), + }, + } + + for am_scale, alignment_paths in alignment_paths_nour.items(): + for key, path in alignment_paths.items(): + align_data = AlignmentData( + alignment_cache_bundle=path, + allophone_file=tk.Path( + "/work/asr3/raissi/shared_workspaces/bayoumi/sisyphus_work/i6_core/lexicon/allophones/StoreAllophonesJob.8Nygr67IZfVG/output/allophones" + ), + state_tying_file=tk.Path( + "/work/asr3/raissi/shared_workspaces/bayoumi/sisyphus_work/i6_core/lexicon/allophones/DumpStateTyingJob.6w7HRWTGkgEd/output/state-tying" + ), + silence_phone="", + ) + alignments_nour[key] = align_data + + report, _ = run_exp( + alignments_nour, + name_suffix=f"nour-align-am-{am_scale}", + data_control_train=True, + data_control_cv=False, + match_lengths=True, + ) + summary_report.merge_report(report, update_structure=True) + + tk.register_report(f"{gs.ALIAS_AND_OUTPUT_SUBDIR}/summary.report", summary_report) + + return summary_report diff --git a/users/berger/configs/librispeech/20230210_baselines/config_03b_transducer_fullsum_rasr_features.py b/users/berger/configs/librispeech/20230210_baselines/config_03b_transducer_fullsum_rasr_features.py index 09cbfb429..f45cac63e 100644 --- a/users/berger/configs/librispeech/20230210_baselines/config_03b_transducer_fullsum_rasr_features.py +++ b/users/berger/configs/librispeech/20230210_baselines/config_03b_transducer_fullsum_rasr_features.py @@ -158,6 +158,7 @@ def run_exp(alignments: Dict[str, AlignmentData], viterbi_model_checkpoint: Chec data = get_librispeech_data( tools.returnn_root, tools.returnn_python_exe, + lm_names=["4gram", "kazuki_transformer"], 
alignments=alignments, rasr_binary_path=tools.rasr_binary_path, add_unknown_phoneme_and_mapping=False, @@ -253,18 +254,43 @@ def run_exp(alignments: Dict[str, AlignmentData], viterbi_model_checkpoint: Chec system.run_recog_step_for_corpora(corpora=["test-clean_4gram", "test-other_4gram"], **recog_args) recog_args["lm_scales"] = [0.8] - recog_args["search_parameters"].update({"full-sum-decoding": True, "label-full-sum": True}) + recog_args["search_parameters"].update( + { + "full-sum-decoding": True, + "label-full-sum": True, + "label-recombination-limit": 1, + "separate-recombination-lm": True, + "recombination-lm.type": "simple-history", + } + ) system.run_recog_step_for_corpora( recog_descriptor="fs", recog_exp_names={"Conformer_Transducer_Fullsum_lr-0.0001_bs-9000": ["recog_ilm-0.2"]}, - corpora=["dev-clean_4gram", "dev-other_4gram"], + corpora=["dev-clean_4gram", "dev-other_4gram", "test-clean_4gram", "test-other_4gram"], **recog_args, ) + + recog_args["search_parameters"].update( + { + # "separate-lookahead-lm": True, + "label-full-sum": False, + "label-pruning": 16.2, + } + ) + recog_args["lookahead_options"].update({"lm_lookahead_scale": 0.45}) + recog_args["use_gpu"] = True + recog_args["rtf"] = 50 + system.run_recog_step_for_corpora( recog_descriptor="fs", recog_exp_names={"Conformer_Transducer_Fullsum_lr-0.0001_bs-9000": ["recog_ilm-0.2"]}, - corpora=["test-clean_4gram", "test-other_4gram"], + corpora=[ + # "dev-clean_kazuki_transformer", + "dev-other_kazuki_transformer", + # "test-clean_kazuki_transformer", + # "test-other_kazuki_transformer", + ], **recog_args, ) diff --git a/users/berger/corpus/librispeech/viterbi_transducer_data.py b/users/berger/corpus/librispeech/viterbi_transducer_data.py index f61868612..a3c8946e5 100644 --- a/users/berger/corpus/librispeech/viterbi_transducer_data.py +++ b/users/berger/corpus/librispeech/viterbi_transducer_data.py @@ -32,7 +32,7 @@ def get_librispeech_data( # ********** Data inputs ********** - 
(train_data_inputs, cv_data_inputs, dev_data_inputs, test_data_inputs,) = data.get_data_inputs( + train_data_inputs, cv_data_inputs, dev_data_inputs, test_data_inputs = data.get_data_inputs( train_key=train_key, cv_keys=cv_keys, dev_keys=dev_keys, diff --git a/users/berger/settings.py b/users/berger/settings.py index 04308e1af..9744555d6 100644 --- a/users/berger/settings.py +++ b/users/berger/settings.py @@ -23,7 +23,9 @@ def check_engine_limits(current_rqmt, task): i6 support for gpu_mem """ current_rqmt["time"] = min(168, current_rqmt.get("time", 2)) - if current_rqmt.get("gpu", 0) > 0 and "-p" not in current_rqmt.get("sbatch_args", []): + if current_rqmt.get("gpu", 0) > 0 and "-p" not in current_rqmt.get( + "sbatch_args", [] + ): if current_rqmt.get("gpu_mem", 0) > 11: current_rqmt["sbatch_args"] = ["-p", "gpu_24gb"] else: @@ -52,7 +54,9 @@ def engine(): return EngineSelector( engines={ "short": LocalEngine(cpus=4, mem=16), - "long": SimpleLinuxUtilityForResourceManagementEngine(default_rqmt=default_rqmt), + "long": SimpleLinuxUtilityForResourceManagementEngine( + default_rqmt=default_rqmt + ), }, default_engine="long", ) @@ -75,6 +79,8 @@ def worker_wrapper(job, task_name, call): "AdvancedTreeSearchJob", "AdvancedTreeSearchLmImageAndGlobalCacheJob", "GenericSeq2SeqSearchJob", + "CreateLmImageJob", + "BuildGenericSeq2SeqGlobalCacheJob", "GenericSeq2SeqLmImageAndGlobalCacheJob", "LatticeToCtmJob", "OptimizeAMandLMScaleJob", diff --git a/users/berger/systems/functors/seq2seq_base.py b/users/berger/systems/functors/seq2seq_base.py index ee897cae3..1944e0b56 100644 --- a/users/berger/systems/functors/seq2seq_base.py +++ b/users/berger/systems/functors/seq2seq_base.py @@ -81,6 +81,7 @@ def _get_tf_feature_flow_for_label_scorer( feature_flow = copy.deepcopy(base_feature_flow) feature_flow.config = feature_flow.config or rasr.RasrConfig() feature_flow.config.main_port_name = "samples" if feature_type == FeatureType.SAMPLES else "features" + 
label_scorer.set_input_config() label_scorer.set_loader_config(self._make_tf_model_loader_config(tf_graph, checkpoint)) else: raise NotImplementedError From 8b241f5c066db99f4f91ef4d0a6a5cd316c58301 Mon Sep 17 00:00:00 2001 From: Simon Berger Date: Wed, 22 May 2024 13:13:09 +0200 Subject: [PATCH 045/227] Update users/berger --- users/berger/network/helpers/label_context.py | 15 +- .../recipe/rasr/label_tree_and_scorer.py | 57 ++- .../recognition/generic_seq2seq_search.py | 385 ++++++++++++------ 3 files changed, 333 insertions(+), 124 deletions(-) diff --git a/users/berger/network/helpers/label_context.py b/users/berger/network/helpers/label_context.py index c592c3b76..8214634b7 100644 --- a/users/berger/network/helpers/label_context.py +++ b/users/berger/network/helpers/label_context.py @@ -263,6 +263,7 @@ def add_context_1_decoder( def add_context_1_decoder_recog( network: Dict, num_outputs: int, + blank_idx: int = 0, encoder: str = "encoder", embedding_size: int = 128, dec_mlp_args: Dict = {}, @@ -351,9 +352,21 @@ def add_context_1_decoder_recog( "reuse_params": "output", } + assert blank_idx == 0, "Blank idx != 0 not implemented for ilm" + # Set p(blank) = 1 and re-normalize the non-blank probs + # so we want P'[b, 0] = 1, sum(P'[b, 1:]) = 1, given a normalized tensor P, i.e. 
sum(P[b, :]) = 1 + # in log space logP'[b, 0] = 0, sum(exp(logP'[b, 1:])) = 1 + # so set logP'[b, 1:] <- logP[b, 1:] - log(1 - exp(logP[b, 0])) + # then sum(exp(logP'[b, 1:])) = sum(P[b, 1:] / (1 - P[b, 0])) = sum(P[b, 1:]) / sum(P[b, 1:]) = 1 + output_unit["ilm_renorm"] = { + "class": "eval", + "from": ["ilm"], + "eval": "tf.concat([tf.zeros(tf.shape(source(0)[:, :1])), source(0)[:, 1:] - tf.math.log(1.0 - tf.exp(source(0)[:, :1]))], axis=-1)", + } + output_unit["output_sub_ilm"] = { "class": "eval", - "from": ["output", "ilm"], + "from": ["output", "ilm_renorm"], "eval": f"source(0) - {ilm_scale} * source(1)", } diff --git a/users/berger/recipe/rasr/label_tree_and_scorer.py b/users/berger/recipe/rasr/label_tree_and_scorer.py index cd1451d55..88a6d4fef 100644 --- a/users/berger/recipe/rasr/label_tree_and_scorer.py +++ b/users/berger/recipe/rasr/label_tree_and_scorer.py @@ -78,7 +78,7 @@ def __init__( use_prior: bool = False, prior_scale: float = 0.6, prior_file: Optional[tk.Path] = None, - extra_args: Dict = {}, + extra_args: Optional[Dict] = None, ): self.config = rasr.RasrConfig() self.post_config = rasr.RasrConfig() @@ -102,13 +102,64 @@ def __init__( self.config.priori_scale = prior_scale # sprint key values # - for key, value in extra_args.items(): - self.config[key.replace("_", "-")] = value + if extra_args is not None: + for key, value in extra_args.items(): + self.config[key.replace("_", "-")] = value @property def scorer_type(self): return self.config.label_scorer_type + @property + def scale(self): + return self.config.scale + + @property + def label_file(self): + if self.config._get("label-file") is not None: + return self.config.label_file + return None + + @property + def num_classes(self): + if self.config._get("number-of-classes") is not None: + return self.config.number_of_classes + return None + + @property + def use_prior(self): + if self.config._get("use-prior") is not None: + return self.config["use-prior"] + return False + + @property + def
prior_scale(self): + if self.config._get("priori-scale") is not None: + return self.config["priori-scale"] + return 1.0 + + @property + def prior_file(self): + if self.config._get("prior-file") is not None: + return self.config["prior-file"] + return None + + @property + def extra_args(self): + return { + key: val + for key, val in self.config._items() + if key not in [ + "label-scorer-type", + "scale", + "label-file", + "number-of-classes", + "use-prior", + "priori-scale", + "prior-file", + ] + } + def apply_config( self, path: str, diff --git a/users/berger/recipe/recognition/generic_seq2seq_search.py b/users/berger/recipe/recognition/generic_seq2seq_search.py index 34cc1da5e..4ed1f4177 100644 --- a/users/berger/recipe/recognition/generic_seq2seq_search.py +++ b/users/berger/recipe/recognition/generic_seq2seq_search.py @@ -1,13 +1,17 @@ __all__ = ["GenericSeq2SeqLmImageAndGlobalCacheJob", "GenericSeq2SeqSearchJob"] +from typing import List, Optional, Tuple from sisyphus import * +assert __package__ is not None Path = setup_path(__package__) import shutil import copy from i6_core import rasr, util +from i6_core.lm.lm_image import CreateLmImageJob +from i6_experiments.users.berger.recipe.rasr.label_tree_and_scorer import LabelTree, LabelScorer class GenericSeq2SeqLmImageAndGlobalCacheJob(rasr.RasrCommand, Job): @@ -84,13 +88,15 @@ def find_arpa_lms(cls, config): lookahead_lm_config = config.flf_lattice_tool.network.recognizer.recognizer.lookahead_lm if separate_lookahead_lm: if lookahead_lm_config.type == "ARPA" and lookahead_lm_config._get("image") is None: - result.append(lookahead_lm_config) + pass + # result.append(lookahead_lm_config) # recombination lm # separate_recombination_lm = config.flf_lattice_tool.network.recognizer.recognizer.separate_recombination_lm recombination_lm_config = config.flf_lattice_tool.network.recognizer.recognizer.recombination_lm if separate_recombination_lm: if recombination_lm_config.type == "ARPA" and 
recombination_lm_config._get("image") is None: - result.append(recombination_lm_config) + pass + # result.append(recombination_lm_config) return result @classmethod @@ -168,8 +174,8 @@ def create_config( # lm images # arpa_lms = cls.find_arpa_lms(config) - for i, lm_config in enumerate(arpa_lms): - lm_config.image = "lm-%d.image" % (i + 1) + for i, lm_config in enumerate(arpa_lms, start=1): + lm_config.image = f"lm-{i}.image" # global cache # config.flf_lattice_tool.global_cache.file = "global.cache" @@ -185,42 +191,171 @@ def hash(cls, kwargs): return super().hash({"config": config, "exe": sprint_exe}) +class BuildGenericSeq2SeqGlobalCacheJob(rasr.RasrCommand, Job): + """ + Standalone job to create the global-cache for generic-seq2seq-tree-search + """ + + def __init__( + self, + crp: rasr.CommonRasrParameters, + label_tree: LabelTree, + label_scorer: LabelScorer, + extra_config: Optional[rasr.RasrConfig] = None, + extra_post_config: Optional[rasr.RasrConfig] = None, + ): + """ + :param crp: common RASR params (required: lexicon, acoustic_model, language_model, recognizer) + :param label_tree: label tree object for structuring the search tree + :param label_scorer: label scorer object for score computation + :param extra_config: overlay config that influences the Job's hash + :param extra_post_config: overlay config that does not influence the Job's hash + """ + self.set_vis_name("Build Global Cache") + + (self.config, self.post_config,) = BuildGenericSeq2SeqGlobalCacheJob.create_config( + crp=crp, + label_tree=label_tree, + label_scorer=label_scorer, + extra_config=extra_config, + extra_post_config=extra_post_config, + ) + + self.exe = self.select_exe(crp.speech_recognizer_exe, "speech-recognizer") + + self.out_log_file = self.log_file_output_path("build_global_cache", crp, False) + self.out_global_cache = self.output_path("global.cache", cached=True) + + self.rqmt = {"time": 1, "cpu": 1, "mem": 2} + + def tasks(self): + yield Task("create_files",
mini_task=True) + yield Task("run", resume="run", rqmt=self.rqmt) + + def create_files(self): + self.write_config(self.config, self.post_config, "build_global_cache.config") + self.write_run_script(self.exe, "build_global_cache.config") + + def run(self): + self.run_script(1, self.out_log_file) + shutil.move("global.cache", self.out_global_cache.get_path()) + + @classmethod + def create_config( + cls, + crp: rasr.CommonRasrParameters, + label_tree: LabelTree, + label_scorer: LabelScorer, + extra_config: Optional[rasr.RasrConfig], + extra_post_config: Optional[rasr.RasrConfig], + ): + config, post_config = rasr.build_config_from_mapping( + crp, + { + "lexicon": "speech-recognizer.model-combination.lexicon", + "acoustic_model": "speech-recognizer.model-combination.acoustic-model", + "language_model": "speech-recognizer.model-combination.lm", + "recognizer": "speech-recognizer.recognizer", + }, + ) + + # Apply config from label tree + label_tree.apply_config( + "speech-recognizer.recognizer.label-tree", + config, + post_config, + ) + + # Optional lexicon overwrite + if label_tree.lexicon_config is not None: + config["speech-recognizer.model-combination.lexicon"]._update(label_tree.lexicon_config) + + # Apply config from label scorer and eliminate unnecessary arguments that don't affect the search space (scale, prior) + label_scorer_reduced = LabelScorer( + scorer_type=label_scorer.scorer_type, + scale=1.0, + label_file=label_scorer.label_file, + num_classes=label_scorer.num_classes, + use_prior=False, + extra_args=label_scorer.extra_args, + ) + + label_scorer_reduced.apply_config("speech-recognizer.recognizer.label-scorer", config, post_config) + + # skip conventional AM or load it without GMM # + if crp.acoustic_model_config is None: + config.speech_recognizer.recognizer.use_acoustic_model = False + else: + config.speech_recognizer.recognizer.use_mixture = False + if config.flf_lattice_tool.network.recognizer.acoustic_model._get("length") is not None: + del 
config.flf_lattice_tool.network.recognizer.acoustic_model["length"] + + # disable scaling + if config.flf_lattice_tool.network.recognizer.lm._get("scale") is not None: + del config.flf_lattice_tool.network.recognizer.lm["scale"] + + config.speech_recognizer.recognition_mode = "init-only" + config.speech_recognizer.search_type = "generic-seq2seq-tree-search" + config.speech_recognizer.global_cache.file = "global.cache" + config.speech_recognizer.global_cache.read_only = False + + config._update(extra_config) + post_config._update(extra_post_config) + + return config, post_config + + @classmethod + def hash(cls, kwargs): + config, _ = cls.create_config(**kwargs) + return super().hash({"config": config, "exe": kwargs["crp"].speech_recognizer_exe}) + + class GenericSeq2SeqSearchJob(rasr.RasrCommand, Job): __sis_hash_exclude__ = {"num_threads": None} def __init__( self, - crp, - feature_flow, - label_tree, - label_scorer, - search_parameters=None, - lm_lookahead=True, - lookahead_options=None, - eval_single_best=True, - eval_best_in_lattice=True, - use_gpu=False, - rtf=2, - mem=8, - hard_rqmt=False, - extra_config=None, - extra_post_config=None, - sprint_exe=None, # allow separat executable than default settings - lm_gc_job=None, - lm_gc_job_local=False, - lm_gc_job_mem=16, - lm_gc_job_default_search=False, - num_threads=None, - ): # TODO set this to true later + crp: rasr.CommonRasrParameters, + feature_flow: rasr.FlowNetwork, + label_tree: LabelTree, + label_scorer: LabelScorer, + rasr_exe: Optional[tk.Path] = None, + search_parameters: Optional[dict] = None, + lm_lookahead: bool = True, + lookahead_options: Optional[dict] = None, + eval_single_best: bool = True, + eval_best_in_lattice: bool = True, + use_gpu: bool = False, + global_cache: Optional[tk.Path] = None, + rtf: float = 2, + mem: float = 8, + extra_config: Optional[rasr.RasrConfig] = None, + extra_post_config: Optional[rasr.RasrConfig] = None, + num_threads: int = 2, + ): self.set_vis_name("Generic Seq2Seq 
Search") - kwargs = locals() - del kwargs["self"] - self.config, self.post_config = GenericSeq2SeqSearchJob.create_config(**kwargs) + self.config, self.post_config = GenericSeq2SeqSearchJob.create_config( + crp=crp, + feature_flow=feature_flow, + label_tree=label_tree, + label_scorer=label_scorer, + search_parameters=search_parameters, + lm_lookahead=lm_lookahead, + lookahead_options=lookahead_options, + eval_single_best=eval_single_best, + eval_best_in_lattice=eval_best_in_lattice, + extra_config=extra_config, + extra_post_config=extra_post_config, + global_cache=global_cache, + ) self.feature_flow = feature_flow - if sprint_exe is None: - sprint_exe = crp.flf_tool_exe - self.exe = self.select_exe(sprint_exe, "flf-tool") + if rasr_exe is not None: + self.rasr_exe = rasr_exe + else: + self.rasr_exe = crp.flf_tool_exe + assert self.rasr_exe is not None + self.concurrent = crp.concurrent self.use_gpu = use_gpu self.num_threads = num_threads @@ -237,21 +372,15 @@ def __init__( ) self.rqmt = { - "time": max(crp.corpus_duration * rtf / crp.concurrent, 4.5), - "cpu": 2, + "time": max(crp.corpus_duration * rtf / crp.concurrent, 24), + "cpu": num_threads, "gpu": 1 if self.use_gpu else 0, "mem": mem, } - # no automatic resume with doubled rqmt - self.hard_rqmt = hard_rqmt def tasks(self): yield Task("create_files", mini_task=True) - if self.hard_rqmt: # TODO - resume = None - else: - resume = "run" - yield Task("run", resume=resume, rqmt=self.rqmt, args=range(1, self.concurrent + 1)) + yield Task("run", resume="run", rqmt=self.rqmt, args=range(1, self.concurrent + 1)) def create_files(self): self.write_config(self.config, self.post_config, "recognition.config") @@ -261,16 +390,10 @@ def create_files(self): # sometimes crash without this if not self.use_gpu: extra_code += "\nexport CUDA_VISIBLE_DEVICES=" - if self.num_threads is None: - extra_code += "\nexport OMP_NUM_THREADS=%i" % self.rqmt["cpu"] - else: - extra_code += f"\nexport OMP_NUM_THREADS={self.num_threads}" - 
extra_code += f"\nexport MKL_NUM_THREADS={self.num_threads}" - self.write_run_script(self.exe, "recognition.config", extra_code=extra_code) - # TODO maybe not needed - def stop_run(self, task_id): - print("run job %d exceeds specified rqmt and stoped" % task_id) + extra_code += f"\nexport OMP_NUM_THREADS={self.num_threads}" + extra_code += f"\nexport MKL_NUM_THREADS={self.num_threads}" + self.write_run_script(self.rasr_exe, "recognition.config", extra_code=extra_code) def run(self, task_id): self.run_script(task_id, self.out_log_file[task_id]) @@ -279,45 +402,50 @@ def run(self, task_id): self.out_single_lattice_caches[task_id].get_path(), ) - def cleanup_before_run(self, cmd, retry, task_id, *args): - util.backup_if_exists("recognition.log.%d" % task_id) - util.delete_if_exists("lattice.cache.%d" % task_id) + @classmethod + def find_arpa_lms( + cls, lm_config: rasr.RasrConfig, lm_post_config: Optional[rasr.RasrConfig] = None + ) -> List[Tuple[rasr.RasrConfig, Optional[rasr.RasrConfig]]]: + result = [] + + if lm_config.type == "ARPA": + result.append((lm_config, lm_post_config)) + elif lm_config.type == "combine": + for i in range(1, lm_config.num_lms + 1): + sub_lm_config = lm_config["lm-%d" % i] + sub_lm_post_config = lm_post_config["lm-%d" % i] if lm_post_config is not None else None + result += cls.find_arpa_lms(sub_lm_config, sub_lm_post_config) + + return result + + @classmethod + def find_arpa_lms_without_image( + cls, lm_config: rasr.RasrConfig, lm_post_config: Optional[rasr.RasrConfig] = None + ) -> List[Tuple[rasr.RasrConfig, Optional[rasr.RasrConfig]]]: + def has_image(c, pc): + res = c._get("image") is not None + res = res or (pc is not None and pc._get("image") is not None) + return res + + return [(c, pc) for c, pc in cls.find_arpa_lms(lm_config, lm_post_config) if not has_image(c, pc)] @classmethod def create_config( cls, - crp, - feature_flow, - label_tree, - label_scorer, - search_parameters=None, - lm_lookahead=True, - lookahead_options=None, - 
eval_single_best=True, - eval_best_in_lattice=True, - extra_config=None, - extra_post_config=None, - sprint_exe=None, - lm_gc_job=None, - lm_gc_job_local=True, - lm_gc_job_mem=6, - lm_gc_job_default_search=False, - **kwargs, + crp: rasr.CommonRasrParameters, + feature_flow: rasr.FlowNetwork, + label_tree: LabelTree, + label_scorer: LabelScorer, + search_parameters: Optional[dict] = None, + lm_lookahead: bool = True, + lookahead_options: Optional[dict] = None, + eval_single_best: bool = True, + eval_best_in_lattice: bool = True, + extra_config: Optional[rasr.RasrConfig] = None, + extra_post_config: Optional[rasr.RasrConfig] = None, + global_cache: Optional[tk.Path] = None, + **_, ): - # optional individual lm-image and global-cache job # - if lm_gc_job is None: - lm_gc_job = GenericSeq2SeqLmImageAndGlobalCacheJob( - crp, - label_tree, - label_scorer, - extra_config, - extra_post_config, - mem=lm_gc_job_mem, - local_job=lm_gc_job_local, - sprint_exe=sprint_exe, - default_search=lm_gc_job_default_search, - ) - # get config from csp # config, post_config = rasr.build_config_from_mapping( crp, @@ -331,8 +459,8 @@ def create_config( parallelize=True, ) - # acoustic model maybe used for allophones and state-tying, but no mixture is needed # - # skip conventional AM or load it without GMM # + # acoustic model maybe used for allophones and state-tying, but no mixture is needed + # skip conventional AM or load it without GMM if crp.acoustic_model_config is None: config.flf_lattice_tool.network.recognizer.use_acoustic_model = False else: @@ -342,14 +470,17 @@ def create_config( config.flf_lattice_tool.network.recognizer.feature_extraction.file = "feature.flow" if feature_flow.outputs != {"features"}: assert len(feature_flow.outputs) == 1, "not implemented otherwise" - config.flf_lattice_tool.network.recognizer.feature_extraction.main_port_name = list(feature_flow.outputs)[0] + config.flf_lattice_tool.network.recognizer.feature_extraction.main_port_name = next( + 
iter(feature_flow.outputs) + ) + feature_flow.apply_config( "flf-lattice-tool.network.recognizer.feature-extraction", config, post_config, ) - # label tree and optional lexicon overwrite # + # label tree and optional lexicon overwrite label_tree.apply_config( "flf-lattice-tool.network.recognizer.recognizer.label-tree", config, @@ -358,14 +489,15 @@ def create_config( if label_tree.lexicon_config is not None: config["flf-lattice-tool.lexicon"]._update(label_tree.lexicon_config) - # label scorer # + # label scorer label_scorer.apply_config("flf-lattice-tool.network.recognizer.label-scorer", config, post_config) # search settings # search_config = rasr.RasrConfig() if search_parameters is not None: - for key in search_parameters.keys(): - search_config[key] = search_parameters[key] + for key, val in search_parameters.items(): + search_config[key.replace("_", "-")] = val + config.flf_lattice_tool.network.recognizer.recognizer._update(search_config) # lookahead settings # @@ -377,26 +509,23 @@ def create_config( if lookahead_options is not None: la_opts.update(lookahead_options) - config.flf_lattice_tool.network.recognizer.recognizer.lm_lookahead = rasr.RasrConfig() - config.flf_lattice_tool.network.recognizer.recognizer.lm_lookahead._value = lm_lookahead + config.flf_lattice_tool.network.recognizer.recognizer.optimize_lattice = True + + la_config = rasr.RasrConfig() + la_config._value = lm_lookahead + if "laziness" in la_opts: config.flf_lattice_tool.network.recognizer.recognizer.lm_lookahead_laziness = la_opts["laziness"] - config.flf_lattice_tool.network.recognizer.recognizer.optimize_lattice = True + if lm_lookahead: if "history_limit" in la_opts: - config.flf_lattice_tool.network.recognizer.recognizer.lm_lookahead.history_limit = la_opts[ - "history_limit" - ] + la_config.history_limit = la_opts["history_limit"] if "tree_cutoff" in la_opts: - config.flf_lattice_tool.network.recognizer.recognizer.lm_lookahead.tree_cutoff = la_opts["tree_cutoff"] + 
la_config.tree_cutoff = la_opts["tree_cutoff"] if "minimum_representation" in la_opts: - config.flf_lattice_tool.network.recognizer.recognizer.lm_lookahead.minimum_representation = la_opts[ - "minimum_representation" - ] + la_config.minimum_representation = la_opts["minimum_representation"] if "lm_lookahead_scale" in la_opts: - config.flf_lattice_tool.network.recognizer.recognizer.lm_lookahead.lm_lookahead_scale = la_opts[ - "lm_lookahead_scale" - ] + la_config.lm_lookahead_scale = la_opts["lm_lookahead_scale"] if "cache_low" in la_opts: post_config.flf_lattice_tool.network.recognizer.recognizer.lm_lookahead.cache_size_low = la_opts[ "cache_low" @@ -406,6 +535,8 @@ def create_config( "cache_high" ] + config.flf_lattice_tool.network.recognizer.recognizer.lm_lookahead = la_config + # flf network # config.flf_lattice_tool.network.initial_nodes = "segment" config.flf_lattice_tool.network.segment.type = "speech-segment" @@ -443,32 +574,46 @@ def create_config( post_config.flf_lattice_tool.network.sink.error_on_empty_lattice = False post_config["*"].output_channel.unbuffered = True - # update parameters # - config._update(extra_config) - post_config._update(extra_post_config) - # image and cache # - arpa_lms = GenericSeq2SeqLmImageAndGlobalCacheJob.find_arpa_lms(config) - assert len(arpa_lms) == lm_gc_job.num_images, "mismatch between image-cache config and recognition config" - for i, lm_config in enumerate(arpa_lms): - lm_config.image = lm_gc_job.lm_images[i + 1] + no_image_arpa_lms = GenericSeq2SeqSearchJob.find_arpa_lms_without_image( + lm_config=config.flf_lattice_tool.network.recognizer.lm + ) + if config.flf_lattice_tool.network.recognizer.recognizer._get("lookahead-lm") is not None: + no_image_arpa_lms += GenericSeq2SeqSearchJob.find_arpa_lms_without_image( + lm_config=config.flf_lattice_tool.network.recognizer.recognizer.lookahead_lm + ) + + for lm_config, lm_post_config in no_image_arpa_lms: + rp = rasr.CommonRasrParameters(base=crp) + rp.language_model_config 
= lm_config + rp.language_model_post_config = lm_post_config + lm_config.image = CreateLmImageJob(crp=rp, mem=8).out_image + + if global_cache is None: + global_cache = BuildGenericSeq2SeqGlobalCacheJob( + crp=crp, label_tree=label_tree, label_scorer=label_scorer + ).out_global_cache - if post_config.flf_lattice_tool.global_cache._get("file") is None: - post_config.flf_lattice_tool.global_cache.read_only = True - post_config.flf_lattice_tool.global_cache.file = lm_gc_job.global_cache + post_config.flf_lattice_tool.global_cache.read_only = True + post_config.flf_lattice_tool.global_cache.file = global_cache + # update parameters # + config._update(extra_config) + post_config._update(extra_post_config) + return config, post_config @classmethod def hash(cls, kwargs): - config, post_config = cls.create_config(**kwargs) - sprint_exe = kwargs["sprint_exe"] - if sprint_exe is None: - sprint_exe = kwargs["crp"].flf_tool_exe + config, _ = cls.create_config(**kwargs) + if kwargs["rasr_exe"] is not None: + rasr_exe = kwargs["rasr_exe"] + else: + rasr_exe = kwargs["crp"].flf_tool_exe return super().hash( { "config": config, "feature_flow": kwargs["feature_flow"], - "exe": sprint_exe, + "exe": rasr_exe, } ) From db7a0ca327588f370ed92c9c23a0d52c7eef0014 Mon Sep 17 00:00:00 2001 From: Mohammad Zeineldeen Date: Wed, 22 May 2024 11:13:57 +0000 Subject: [PATCH 046/227] better --- .../tedlium2/configs/ebranch_baseline.py | 20 +++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/users/zeineldeen/experiments/conformer_att_2023/tedlium2/configs/ebranch_baseline.py b/users/zeineldeen/experiments/conformer_att_2023/tedlium2/configs/ebranch_baseline.py index f90326966..b0d1887bf 100644 --- a/users/zeineldeen/experiments/conformer_att_2023/tedlium2/configs/ebranch_baseline.py +++ b/users/zeineldeen/experiments/conformer_att_2023/tedlium2/configs/ebranch_baseline.py @@ -1,6 +1,7 @@ import copy, os import numpy +from typing import Any, Tuple import 
sisyphus.toolkit as tk @@ -891,7 +892,7 @@ def train_mini_self_att( ) apply_fairseq_init_to_transformer_decoder(trafo_dec_args) - training_args = dict() + training_args: dict[str, Any] = dict() training_args["with_staged_network"] = True training_args["speed_pert"] = True @@ -919,7 +920,7 @@ def train_mini_self_att( lstm_training_args["pretrain_reps"] = 5 lstm_training_args["batch_size"] = 15000 * 160 # frames * samples per frame - lstm_dec_exp_args = copy.deepcopy( + lstm_dec_exp_args: dict[str, Any] = copy.deepcopy( { **lstm_training_args, "encoder_args": ebranch_enc_args, @@ -941,7 +942,7 @@ def train_mini_self_att( _, _, global_mean, global_std = compute_features_stats(output_dirname="logmel_80", feat_dim=80) # --------------------- V1 --------------------- - def get_base_v1_args(lr, ep, enc_drop=0.1, pretrain_reps=3, use_legacy_stats=True): + def get_base_v1_args(lr, ep, enc_drop=0.1, pretrain_reps=3, use_legacy_stats=True) -> Tuple[dict[str, Any], str]: # base_bpe1000_peakLR0.0008_ep200_globalNorm_epochOCLR_pre3_fixZoneout_encDrop0.1_woDepthConvPre # Average ckpt: 8.19/7.64 (50 epochs) # - Epoch-based OCLR with peak LR 8e-4 @@ -986,7 +987,7 @@ def get_base_v1_args(lr, ep, enc_drop=0.1, pretrain_reps=3, use_legacy_stats=Tru for dec_att_drop in [0.0]: for weight_drop in [0.1]: for enc_drop in [0.15]: - for ctc_scale in [1.0, 0.3]: + for ctc_scale in [0.3]: base_v1_args, exp_name = get_base_v1_args( lr, ep, enc_drop=enc_drop, use_legacy_stats=False ) @@ -994,6 +995,17 @@ def get_base_v1_args(lr, ep, enc_drop=0.1, pretrain_reps=3, use_legacy_stats=Tru args["encoder_args"].num_blocks = num_blocks + args["with_pretrain"] = False + specaug_steps = {"step0": 6_000, "step1": 12_000, "step2": 18_000} + args["specaug_str_func_opts"] = { + "version": 2, + **specaug_steps, + "max_time_num": 100, + "max_time_dim": 20, + "min_num_add_factor": 0, + "freq_dim_factor": 5, + } + args["encoder_args"].frontend_conv_weight_dropout = weight_drop 
args["encoder_args"].mhsa_weight_dropout = weight_drop args["encoder_args"].ff_weight_dropout = weight_drop From f87d62a5399e24babc6dcb78aa374f567a94d61f Mon Sep 17 00:00:00 2001 From: Nick Rossenbach Date: Wed, 22 May 2024 13:59:33 +0200 Subject: [PATCH 047/227] config enable write cache manager --- .../ctc_rnnt_standalone_2024/config.py | 26 ++++++++++++------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/users/rossenbach/experiments/librispeech/ctc_rnnt_standalone_2024/config.py b/users/rossenbach/experiments/librispeech/ctc_rnnt_standalone_2024/config.py index d12944632..ccd8cf851 100644 --- a/users/rossenbach/experiments/librispeech/ctc_rnnt_standalone_2024/config.py +++ b/users/rossenbach/experiments/librispeech/ctc_rnnt_standalone_2024/config.py @@ -24,6 +24,7 @@ def get_training_config( debug: bool = False, use_speed_perturbation: bool = False, post_config: Optional[Dict[str, Any]] = None, + add_cache_manager: bool = False, ) -> ReturnnConfig: """ Get a generic config for training a model @@ -61,21 +62,28 @@ def get_training_config( include_native_ops=include_native_ops, debug=debug, ) - python_prolog = None + python_prolog_serializer_objects = [] # TODO: maybe make nice (if capability added to RETURNN itself) if use_speed_perturbation: - prolog_serializer = TorchCollection( - serializer_objects=[ - Import( - code_object_path=PACKAGE + ".extra_code.speed_perturbation.legacy_speed_perturbation", - unhashed_package_root=PACKAGE, - ) - ] + python_prolog_serializer_objects.append( + Import( + code_object_path=PACKAGE + ".extra_code.speed_perturbation.legacy_speed_perturbation", + unhashed_package_root=PACKAGE, + ) ) - python_prolog = [prolog_serializer] config["train"]["datasets"]["zip_dataset"]["audio"]["pre_process"] = CodeWrapper("legacy_speed_perturbation") + if add_cache_manager: + from i6_experiments.common.setups.serialization import PythonCacheManagerFunctionNonhashedCode, Collection + 
python_prolog_serializer_objects.append(PythonCacheManagerFunctionNonhashedCode) + + python_prolog = None + if len(python_prolog_serializer_objects) > 0: + python_prolog = [ + TorchCollection(python_prolog_serializer_objects) + ] + returnn_config = ReturnnConfig( config=config, post_config=post_config, python_prolog=python_prolog, python_epilog=[serializer] ) From 8231aad85e2cd1d3141e87ce1678860d41eba7e0 Mon Sep 17 00:00:00 2001 From: Nick Rossenbach Date: Wed, 22 May 2024 14:00:02 +0200 Subject: [PATCH 048/227] standalone 2024 setup add LSTM lm pipeline --- .../ctc_rnnt_standalone_2024/data/bpe_lm.py | 186 ++++++++++++++++++ .../experiments/lm_bpe/__init__.py | 0 .../experiments/lm_bpe/kazuki_lstm.py | 77 ++++++++ .../pytorch_networks/lm/__init__.py | 0 .../pytorch_networks/lm/lstm/__init__.py | 0 .../lm/lstm/kazuki_lstm_zijian_variant_v1.py | 108 ++++++++++ .../lstm/kazuki_lstm_zijian_variant_v1_cfg.py | 15 ++ 7 files changed, 386 insertions(+) create mode 100644 users/rossenbach/experiments/librispeech/ctc_rnnt_standalone_2024/data/bpe_lm.py create mode 100644 users/rossenbach/experiments/librispeech/ctc_rnnt_standalone_2024/experiments/lm_bpe/__init__.py create mode 100644 users/rossenbach/experiments/librispeech/ctc_rnnt_standalone_2024/experiments/lm_bpe/kazuki_lstm.py create mode 100644 users/rossenbach/experiments/librispeech/ctc_rnnt_standalone_2024/pytorch_networks/lm/__init__.py create mode 100644 users/rossenbach/experiments/librispeech/ctc_rnnt_standalone_2024/pytorch_networks/lm/lstm/__init__.py create mode 100644 users/rossenbach/experiments/librispeech/ctc_rnnt_standalone_2024/pytorch_networks/lm/lstm/kazuki_lstm_zijian_variant_v1.py create mode 100644 users/rossenbach/experiments/librispeech/ctc_rnnt_standalone_2024/pytorch_networks/lm/lstm/kazuki_lstm_zijian_variant_v1_cfg.py diff --git a/users/rossenbach/experiments/librispeech/ctc_rnnt_standalone_2024/data/bpe_lm.py 
b/users/rossenbach/experiments/librispeech/ctc_rnnt_standalone_2024/data/bpe_lm.py new file mode 100644 index 000000000..54f92758f --- /dev/null +++ b/users/rossenbach/experiments/librispeech/ctc_rnnt_standalone_2024/data/bpe_lm.py @@ -0,0 +1,186 @@ +from sisyphus import tk +from sisyphus.delayed_ops import DelayedFormat + +from dataclasses import dataclass +import os +from typing import Any, Dict, Optional + +from i6_core.text.label.subword_nmt.apply import ApplyBPEToTextJob +from i6_core.corpus.convert import CorpusToTxtJob +from i6_core.text.processing import ConcatenateJob +from i6_core.returnn.config import CodeWrapper + +from i6_experiments.common.setups.returnn.datasets import MetaDataset, ControlDataset, Dataset +from i6_experiments.common.setups.returnn.datastreams.base import Datastream +from i6_experiments.common.setups.returnn.datastreams.vocabulary import BpeDatastream +from i6_experiments.common.helpers.text_labels.subword_nmt_bpe import get_returnn_subword_nmt + +from i6_experiments.common.datasets.librispeech import get_bliss_corpus_dict +from i6_experiments.common.datasets.librispeech.vocab import get_subword_nmt_bpe_v2 +from i6_experiments.common.datasets.librispeech.language_model import get_librispeech_normalized_lm_data + + + +SOURCE_DATASTREAM_KEY = "data" +TARGET_DATASTREAN_KEY = "delayed" + + +@dataclass(frozen=True) +class TrainingDatasets: + train: Dataset + cv: Dataset + devtrain: Dataset + datastreams: Dict[str, Datastream] + + +class LmDataset(ControlDataset): + + def __init__( + self, + *, + corpus_file: tk.Path, + vocab_file: tk.Path, + # super parameters + partition_epoch: Optional[int] = None, + segment_file: Optional[tk.Path] = None, + seq_ordering: Optional[str] = None, + random_subset: Optional[int] = None, + additional_options: Optional[Dict] = None, + ): + super().__init__( + partition_epoch=partition_epoch, + segment_file=segment_file, + seq_ordering=seq_ordering, + random_subset=random_subset, + 
additional_options=additional_options + ) + + self.corpus_file = corpus_file + self.vocab_file = vocab_file + + def as_returnn_opts(self) -> Dict[str, Any]: + d = { + "class": "LmDataset", + "corpus_file": CodeWrapper(DelayedFormat('lambda: cf("{}")', self.corpus_file)), + "orth_symbols_map_file": self.vocab_file, + "orth_replace_map_file": "", + "word_based": True, + "seq_end_symbol": "", + "auto_replace_unknown_symbol": False, + "unknown_symbol": "", + "add_delayed_seq_data": True, + "delayed_seq_data_start_symbol": "", + } + sd = super().as_returnn_opts() + assert all([k not in sd.keys() for k in d.keys()]), ( + "conflicting keys in %s and %s" + % (str(list(sd.keys())), str(list(d.keys()))), + ) + d.update(sd) + + return d + +@dataclass() +class LMDatasetSettings: + train_partition_epoch: int + train_seq_ordering: str + + +def get_subword_repo(): + """ + This is a for now very ugly helper to get the same subword_nmt repo + as the get_subword_nmt_bpe_v2 is using + :return: + """ + subword_nmt_repo = get_returnn_subword_nmt( + commit_hash="5015a45e28a958f800ef1c50e7880c0c9ef414cf", output_prefix="" + ) + # overwrite hash for future bugfixes, it is unlikely the logic will ever be changed + subword_nmt_repo.hash_overwrite = "I6_SUBWORD_NMT_V2" + return subword_nmt_repo + +def build_lm_training_datasets(prefix, librispeech_key, bpe_size, settings: LMDatasetSettings): + + #data_map = {SOURCE_DATASTREAM_KEY: ("lm_dataset", "data"), TARGET_DATASTREAN_KEY: ("lm_dataset", "delayed")} + #def make_meta(dataset: LmDataset): + # return MetaDataset( + # data_map=data_map, datasets={"lm_dataset": dataset}, seq_order_control_dataset="lm_dataset" + # ) + + bpe_settings = get_subword_nmt_bpe_v2(corpus_key=librispeech_key, bpe_size=bpe_size, unk_label='') + ls_bliss_corpus_dict = get_bliss_corpus_dict() + bpe_datastream = BpeDatastream(available_for_inference=False, bpe_settings=bpe_settings) + + #### Training Data #### + + lm_data = get_librispeech_normalized_lm_data() + 
ls_train_bliss = ls_bliss_corpus_dict["train-other-960"] + ls_train_text = CorpusToTxtJob( + bliss_corpus=ls_train_bliss, + gzip=True, + ).out_txt + full_train_text = ConcatenateJob( + text_files=[lm_data, ls_train_text], + zip_out=True, + ).out + lm_bpe_data_job = ApplyBPEToTextJob( + text_file=full_train_text, + bpe_codes=bpe_settings.bpe_codes, + bpe_vocab=bpe_settings.bpe_count_vocab, + gzip_output=True, + subword_nmt_repo=get_subword_repo(), + mini_task=False, # this is a large file, so run in cluster + ) + lm_bpe_data_job.add_alias(os.path.join(prefix, "apply_bpe_to_train")) + + #### Dev Data #### + + dev_clean_text = CorpusToTxtJob(bliss_corpus=ls_bliss_corpus_dict["dev-clean"], gzip=True).out_txt + dev_other_text = CorpusToTxtJob(bliss_corpus=ls_bliss_corpus_dict["dev-other"], gzip=True).out_txt + cv_text = ConcatenateJob( + text_files=[dev_clean_text, dev_other_text], + zip_out=True, + ).out + cv_bpe_data_job = ApplyBPEToTextJob( + text_file=cv_text, + bpe_codes=bpe_settings.bpe_codes, + bpe_vocab=bpe_settings.bpe_count_vocab, + gzip_output=True, + subword_nmt_repo=get_subword_repo(), + ) + + #### datasets #### + lm_train_dataset = LmDataset( + corpus_file=lm_bpe_data_job.out_bpe_text, + vocab_file=bpe_settings.bpe_vocab, + partition_epoch=settings.train_partition_epoch, + segment_file=None, + seq_ordering=settings.train_seq_ordering + ) + + lm_cv_dataset = LmDataset( + corpus_file=cv_bpe_data_job.out_bpe_text, + vocab_file=bpe_settings.bpe_vocab, + partition_epoch=1, + segment_file=None, + seq_ordering="sorted" + ) + + lm_devtrain_dataset = LmDataset( + corpus_file=lm_bpe_data_job.out_bpe_text, + vocab_file=bpe_settings.bpe_vocab, + partition_epoch=1, + segment_file=None, + seq_ordering="sorted", + random_subset=3000, + ) + + return TrainingDatasets( + train=lm_train_dataset, + cv=lm_cv_dataset, + # devtrain=lm_devtrain_dataset, + # TODO: Ultra hack for now + devtrain=lm_cv_dataset, + datastreams={"data": bpe_datastream, "delayed": bpe_datastream}, + ) + 
diff --git a/users/rossenbach/experiments/librispeech/ctc_rnnt_standalone_2024/experiments/lm_bpe/__init__.py b/users/rossenbach/experiments/librispeech/ctc_rnnt_standalone_2024/experiments/lm_bpe/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/users/rossenbach/experiments/librispeech/ctc_rnnt_standalone_2024/experiments/lm_bpe/kazuki_lstm.py b/users/rossenbach/experiments/librispeech/ctc_rnnt_standalone_2024/experiments/lm_bpe/kazuki_lstm.py new file mode 100644 index 000000000..64dfdfe00 --- /dev/null +++ b/users/rossenbach/experiments/librispeech/ctc_rnnt_standalone_2024/experiments/lm_bpe/kazuki_lstm.py @@ -0,0 +1,77 @@ +from sisyphus import tk + +from dataclasses import asdict +from typing import cast + +from i6_experiments.common.setups.returnn.datastreams.vocabulary import LabelDatastream + +from ...data.bpe_lm import build_lm_training_datasets, LMDatasetSettings +from ...default_tools import RETURNN_EXE, MINI_RETURNN_ROOT +from ...pipeline import training + + +def bpe_kazuki_lstm(): + prefix_name = "experiments/librispeech/ctc_rnnt_standalone_2024/kazuki_lstm/" + + train_settings = LMDatasetSettings( + train_partition_epoch=4, + train_seq_ordering="laplace:.100", + ) + + # build the training datasets object containing train, cv, dev-train and the extern_data dict + train_data_bpe10k = build_lm_training_datasets( + prefix=prefix_name, + librispeech_key="train-other-960", + bpe_size=10000, + settings=train_settings, + ) + label_datastream_bpe5000 = cast(LabelDatastream, train_data_bpe10k.datastreams["data"]) + vocab_size_without_blank = label_datastream_bpe5000.vocab_size + + default_returnn = { + "returnn_exe": RETURNN_EXE, + "returnn_root": MINI_RETURNN_ROOT, + } + + from ...pytorch_networks.lm.lstm.kazuki_lstm_zijian_variant_v1_cfg import ModelConfig + + default_init_args = { + 'init_args_w': {'func': 'normal', 'arg': {'mean': 0.0, 'std': 0.1}}, + 'init_args_b': {'func': 'normal', 'arg': {'mean': 0.0, 'std': 0.1}} + } + + 
lstm_base_config = ModelConfig( + vocab_dim=vocab_size_without_blank, + embed_dim=512, + hidden_dim=2048, + n_lstm_layers=2, + use_bottle_neck=False, + dropout=0.2, + init_args=default_init_args, + ) + + train_config_24gbgpu = { + "optimizer": {"class": "SGD"}, + ############# + "batch_size": 1280, # BPE tokens + "accum_grad_multiple_step": 1, + "learning_rate": 1.0, + "decay": 0.8, + "multi_num_epochs": train_settings.train_partition_epoch, + "relative_error_threshold": 0, + "multi_update_interval": 1, + "error_measure": "dev_ce", + } + + network_module = "lm.lstm.kazuki_lstm_zijian_variant_v1" + train_args = { + "config": train_config_24gbgpu, + "network_module": network_module, + "net_args": {"model_config_dict": asdict(lstm_base_config)}, + "debug": False, + "add_cache_manager": True, + } + + training_name = prefix_name + "/" + network_module + ".512dim_sub6_24gbgpu_50eps" + train_job = training(training_name, train_data_bpe10k, train_args, num_epochs=30, **default_returnn) + train_job.rqmt["gpu_mem"] = 24 diff --git a/users/rossenbach/experiments/librispeech/ctc_rnnt_standalone_2024/pytorch_networks/lm/__init__.py b/users/rossenbach/experiments/librispeech/ctc_rnnt_standalone_2024/pytorch_networks/lm/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/users/rossenbach/experiments/librispeech/ctc_rnnt_standalone_2024/pytorch_networks/lm/lstm/__init__.py b/users/rossenbach/experiments/librispeech/ctc_rnnt_standalone_2024/pytorch_networks/lm/lstm/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/users/rossenbach/experiments/librispeech/ctc_rnnt_standalone_2024/pytorch_networks/lm/lstm/kazuki_lstm_zijian_variant_v1.py b/users/rossenbach/experiments/librispeech/ctc_rnnt_standalone_2024/pytorch_networks/lm/lstm/kazuki_lstm_zijian_variant_v1.py new file mode 100644 index 000000000..2450df5d3 --- /dev/null +++ 
b/users/rossenbach/experiments/librispeech/ctc_rnnt_standalone_2024/pytorch_networks/lm/lstm/kazuki_lstm_zijian_variant_v1.py @@ -0,0 +1,108 @@ +import torch +from torch import nn + +from .kazuki_lstm_zijian_variant_v1_cfg import ModelConfig + +def mask_tensor(tensor: torch.Tensor, seq_len: torch.Tensor) -> torch.Tensor: + """ + mask a tensor with a "positive" mask (boolean true means position is used) + + :param tensor: [B,T,....] + :param seq_len: [B] + :return: [B,T] as boolean + """ + seq_len = seq_len.to(device=tensor.device) + r = torch.arange(tensor.shape[1], device=tensor.device) # [T] + seq_mask = torch.less(r[None, :], seq_len[:, None]) # broadcast to [B,T] + return seq_mask + + +class Model(nn.Module): + """ + Simple LSTM LM with an embedding, an LSTM, and a final linear + """ + def __init__(self, model_config_dict, **kwargs): + super().__init__() + self.cfg = ModelConfig(**model_config_dict) + if self.cfg.dropout > 0: + self.dropout = nn.Dropout(p=self.cfg.dropout) + else: + self.dropout = None + self.use_bottle_neck = self.cfg.use_bottle_neck + self.embed = nn.Embedding(self.cfg.vocab_dim, self.cfg.embed_dim) + self.lstm = nn.LSTM( + input_size=self.cfg.embed_dim, + hidden_size=self.cfg.hidden_dim, + num_layers=self.cfg.n_lstm_layers, + bias=self.cfg.bias, + batch_first=True, + dropout=self.cfg.dropout, + bidirectional=False, + ) + if self.cfg.use_bottle_neck: + self.bottle_neck = nn.Linear(self.cfg.hidden_dim,self.cfg.bottle_neck_dim, bias=True) + self.final_linear = nn.Linear(self.cfg.bottle_neck_dim, self.cfg.vocab_dim, bias=True) + else: + self.final_linear = nn.Linear(self.cfg.hidden_dim, self.cfg.vocab_dim, bias=True) + self._param_init(**self.cfg.init_args) + + + def _param_init(self, init_args_w=None, init_args_b=None): + if init_args_w is None: + init_args_w = {'func': 'normal', 'arg': {'mean': 0.0, 'std': 0.1}} + if init_args_b is None: + init_args_b = {'func': 'normal', 'arg': {'mean': 0.0, 'std': 0.1}} + + for m in self.modules(): + + for 
name, param in m.named_parameters(): + if 'bias' in name: + if init_args_b['func'] == 'normal': + init_func = nn.init.normal_ + else: + NotImplementedError + hyp = init_args_b['arg'] + else: + if init_args_w['func'] == 'normal': + init_func = nn.init.normal_ + else: + NotImplementedError + hyp = init_args_w['arg'] + init_func(param, **hyp) + + def forward(self, x): + """ + Return logits of each batch at each time step + x: (B, S, F) + """ + x = self.embed(x) + if self.dropout: + x = self.dropout(x) + batch_size = x.shape[0] + h0 = torch.zeros((self.cfg.n_lstm_layers, batch_size, self.cfg.hidden_dim), device=x.device).detach() + c0 = torch.zeros_like(h0, device=x.device).detach() + # This is a uni-directional LSTM, so sequence masking is not necessary + x, _ = self.lstm(x, (h0, c0)) + if self.dropout: + x = self.dropout(x) + if self.use_bottle_neck: + x = self.bottle_neck(x) + if self.dropout: + x = self.dropout(x) + x = self.final_linear(x) + return x + + +def train_step(*, model: Model, data, run_ctx, **kwargs): + labels = data["ldata"] + labels_len = data["data:size1"] + delayed_labels = data["delayed"] + + lm_logits = model(delayed_labels) # (B, S, F) + + ce_loss = torch.nn.functional.cross_entropy(lm_logits.transpose(1, 2), labels, reduction='none') + seq_mask = mask_tensor(labels, labels_len) + ce_loss = (ce_loss * seq_mask).sum() + total_length = torch.sum(labels_len) + + run_ctx.mark_as_loss(name="ce", loss=ce_loss, inv_norm_factor=total_length) diff --git a/users/rossenbach/experiments/librispeech/ctc_rnnt_standalone_2024/pytorch_networks/lm/lstm/kazuki_lstm_zijian_variant_v1_cfg.py b/users/rossenbach/experiments/librispeech/ctc_rnnt_standalone_2024/pytorch_networks/lm/lstm/kazuki_lstm_zijian_variant_v1_cfg.py new file mode 100644 index 000000000..8c8926917 --- /dev/null +++ b/users/rossenbach/experiments/librispeech/ctc_rnnt_standalone_2024/pytorch_networks/lm/lstm/kazuki_lstm_zijian_variant_v1_cfg.py @@ -0,0 +1,15 @@ +from dataclasses import dataclass + 
+from i6_models.config import ModelConfiguration + +@dataclass +class ModelConfig(): + vocab_dim: int + embed_dim: int + hidden_dim: int + n_lstm_layers: int + init_args: dict + bias: bool = True + use_bottle_neck: bool = False + bottle_neck_dim: int = 512 + dropout: float = 0.0 From ebbf720b5d0aef0e508007185acddd45bfada95e Mon Sep 17 00:00:00 2001 From: schmitt Date: Thu, 23 May 2024 10:18:49 +0200 Subject: [PATCH 049/227] update --- users/schmitt/augmentation/alignment.py | 605 +++++++++--------- .../att_weights.py | 2 - .../labels/v2/librispeech/label_singletons.py | 4 + .../returnn/config_builder/base.py | 21 +- .../returnn/config_builder/global_.py | 12 +- .../returnn/config_builder/segmental.py | 47 +- .../network_builder/network_builder.py | 73 ++- .../center_window_att/baseline_v2/__init__.py | 56 +- .../center_window_att/recog.py | 6 +- .../pipelines/pipeline_ls_conf/checkpoints.py | 3 +- .../global_att/baseline_v1/__init__.py | 11 + .../global_vs_segmental_2022_23/recog_new.py | 11 +- .../global_vs_segmental_2022_23/train_new.py | 13 +- .../returnn/config_builder_rf/base.py | 28 +- .../returnn/network_builder_rf/base.py | 3 - .../network_builder_rf/global_/model.py | 7 +- .../returnn/network_builder_rf/recog.py | 2 + .../network_builder_rf/segmental/model.py | 59 +- .../segmental/model_new/blank_model/model.py | 195 ++++-- .../segmental/model_new/blank_model/train.py | 113 +++- .../segmental/model_new/label_model/model.py | 80 +-- .../segmental/model_new/label_model/train.py | 41 +- .../network_builder_rf/segmental/recog.py | 271 ++++++-- .../network_builder_rf/segmental/train.py | 153 +++-- .../network_builder_rf/segmental/utils.py | 18 +- .../pipelines/pipeline_ls_conf/__init__.py | 5 +- .../center_window_att/baseline_v1/__init__.py | 24 +- .../center_window_att/baseline_v1/baseline.py | 62 +- .../center_window_att/baseline_v2/__init__.py | 40 -- .../center_window_att/baseline_v3/__init__.py | 56 ++ .../{baseline_v2 => baseline_v3}/alias.py | 2 +- 
.../center_window_att/baseline_v3/baseline.py | 20 + .../center_window_att/baseline_v4/__init__.py | 26 + .../center_window_att/baseline_v4/alias.py | 4 + .../center_window_att/baseline_v4/baseline.py | 18 + .../baseline.py => config_builder.py} | 40 +- .../center_window_att/train.py | 132 ++-- .../global_att/baseline_v1/__init__.py | 2 +- .../pipeline_ls_conf/global_att/train.py | 21 +- 39 files changed, 1432 insertions(+), 854 deletions(-) delete mode 100644 users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/center_window_att/baseline_v2/__init__.py create mode 100644 users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/center_window_att/baseline_v3/__init__.py rename users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/center_window_att/{baseline_v2 => baseline_v3}/alias.py (82%) create mode 100644 users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/center_window_att/baseline_v3/baseline.py create mode 100644 users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/center_window_att/baseline_v4/__init__.py create mode 100644 users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/center_window_att/baseline_v4/alias.py create mode 100644 users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/center_window_att/baseline_v4/baseline.py rename users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/center_window_att/{baseline_v2/baseline.py => config_builder.py} (62%) diff --git a/users/schmitt/augmentation/alignment.py b/users/schmitt/augmentation/alignment.py index 3953fbf19..e0aef7eca 100644 --- a/users/schmitt/augmentation/alignment.py +++ b/users/schmitt/augmentation/alignment.py @@ -1,119 
+1,25 @@ -import tensorflow as tf import numpy as np from matplotlib import pyplot as plt -import contextlib import ast from collections import Counter +import torch +from torch.utils.data import DataLoader +from typing import Sequence from i6_experiments.users.schmitt.hdf import load_hdf_data -from i6_experiments.users.schmitt.visualization.visualization import PlotAlignmentJob +from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.network_builder_rf.segmental import utils + +import returnn.frontend as rf +from returnn.tensor import Dim +from returnn.datasets.hdf import HDFDataset +from returnn.datasets.basic import Dataset +from returnn.torch.data import extern_data as extern_data_util from sisyphus import Path -def shift_alignment_boundaries_v1(data, blank_idx, max_shift, network): - """ - We take as input a batch of alignments (B, T) and output a batch of new alignments by randomly moving the alignment - boundaries with a maximum shift of `max_shift`. - This was my first implementation, which led to errors in RETURNN and which had a bias. 
- :param data: - :param blank_idx: - :param max_shift: - :param network: - :return: - """ +def shift_alignment_boundaries_single_seq(alignment: np.ndarray, blank_idx: int, max_shift: int): import tensorflow as tf - - x = data.get_placeholder_as_batch_major() - seq_lens = data.get_sequence_lengths() - max_shift = tf.constant(max_shift) - - @tf.function - def get_augmented_alignment(): - # the batch with the new alignments - new_data = tf.TensorArray( - tf.int32, - size=tf.shape(x)[0], - dynamic_size=True, - clear_after_read=True - ) - - i = 0 - - # go over each alignment - for alignment in x: - # positions of non-blank labels in the original alignment - non_blank_pos = tf.cast(tf.where(tf.not_equal(alignment, blank_idx)), dtype=tf.dtypes.int32) - nb_labels = alignment[tf.not_equal(alignment, blank_idx)] - # prepend -1 and append seq_len of alignment to the positions - non_blank_pos_ext = tf.concat((tf.constant([[-1]]), non_blank_pos, [[seq_lens[i]]]), axis=0) - # from these extended positions, we get the amount of left and right space of each label - right_spaces = non_blank_pos_ext[2:] - non_blank_pos_ext[1:-1] - 1 - left_spaces = non_blank_pos_ext[1:-1] - non_blank_pos_ext[:-2] - 1 - # the maximum shifting amount is then determined by the amount of space and the max_shift parameter - max_left_shift = tf.where(tf.greater(left_spaces, max_shift), max_shift, left_spaces) - max_right_shift = tf.where(tf.greater(right_spaces, max_shift), max_shift, right_spaces) - - # concat the tensors in order to loop over them - nb_datas = tf.concat( - ( - non_blank_pos, - max_left_shift, - max_right_shift - ), - axis=-1) - - # create an array of new positions - new_non_blank_pos = tf.TensorArray( - tf.int32, - size=tf.shape(non_blank_pos)[0], - dynamic_size=True, - clear_after_read=True - ) - - # loop index - j = 0 - # store prev pos - prev_pos = tf.TensorArray( - tf.int32, - size=1, - dynamic_size=True, - clear_after_read=True - ) - for nb_data in nb_datas: - # the min and max 
value of the new position is given by the current position and the maximum left and right shift - minval = nb_data[0] - nb_data[1] - maxval = nb_data[0] + nb_data[2] - - # if we are past the first element, the left boundary is additionally constrained by the new position of - # the previous label - if j > 0: - minval = tf.maximum(minval, prev_pos.read(0) + 1) - # the new position is uniformly sampled within the possible boundaries - new_pos = tf.random.uniform((1,), minval=minval, maxval=maxval + 1, dtype=tf.dtypes.int32) - prev_pos = prev_pos.write(0, tf.squeeze(new_pos)) - new_non_blank_pos = new_non_blank_pos.write(j, new_pos) - j += 1 - - # stack the new positions into an array - new_non_blank_pos = new_non_blank_pos.stack() - - # scatter the labels into the new alignment according to the new positions - output_shape = tf.expand_dims(tf.shape(alignment)[0], axis=0) - # add 1 to the labels here because a label might have value 0 which is the same as the default value in scatter - new_alignment = tf.scatter_nd(new_non_blank_pos, nb_labels + 1, output_shape) - # subtract 1 again from the labels in the final alignment and replace the 0's with the blank idx - new_alignment = tf.where(tf.equal(new_alignment, 0), blank_idx, new_alignment - 1) - - new_data = new_data.write(i, new_alignment) - i += 1 - - return new_data.stack() - - return get_augmented_alignment() - - -def shift_alignment_boundaries_v2(alignment: np.ndarray, blank_idx: int, max_shift: int): alignment = tf.convert_to_tensor(alignment) labels = alignment[tf.not_equal(alignment, blank_idx)] label_positions = tf.cast(tf.where(tf.not_equal(alignment, blank_idx)), tf.int32)[:, 0] # [S] @@ -152,205 +58,203 @@ def shift_alignment_boundaries_v2(alignment: np.ndarray, blank_idx: int, max_shi return new_alignment.numpy() -shift_alignment_boundaries_func_str = """ -def shift_alignment_boundaries(data, network): - import tensorflow as tf +def shift_alignment_boundaries_batched( + alignment: rf.Tensor, + 
alignment_spatial_dim: Dim, + batch_dims: Sequence[Dim], + blank_idx: int, + max_shift: int, +): + non_blank_mask = rf.logical_and( + alignment != rf.convert_to_tensor(blank_idx), + rf.sequence_mask(alignment.dims) + ) + labels, labels_spatial_dim = utils.get_masked(alignment, non_blank_mask, alignment_spatial_dim, batch_dims) + + label_positions = rf.where( + non_blank_mask, rf.range_over_dim(alignment_spatial_dim), rf.convert_to_tensor(-1)) + label_positions, _ = utils.get_masked( + label_positions, + non_blank_mask, + alignment_spatial_dim, + batch_dims, + labels_spatial_dim + ) + + labels_spatial_dim.dyn_size_ext.raw_tensor = rf.copy_to_device(labels_spatial_dim.dyn_size_ext, alignment.device).raw_tensor + singleton_dim = Dim(description="singleton", dimension=1, kind=Dim.Types.Spatial) + label_positions_ext, label_positions_ext_spatial_dim = rf.concat( + (rf.expand_dim(rf.copy_to_device(alignment_spatial_dim.dyn_size_ext, alignment.device), singleton_dim), singleton_dim), + (rf.reverse_sequence(label_positions, axis=labels_spatial_dim), labels_spatial_dim), + allow_broadcast=True, + ) + label_positions_ext, label_positions_ext_spatial_dim = rf.concat( + (rf.expand_dim(rf.convert_to_tensor(-1, device=alignment.device), singleton_dim), singleton_dim), + (rf.reverse_sequence(label_positions_ext, axis=label_positions_ext_spatial_dim), label_positions_ext_spatial_dim), + allow_broadcast=True, + ) + + # for each label, the distance to the next label on the right and left (or to the sequence boundaries) + distance_right = rf.gather( + label_positions_ext, + indices=rf.range_over_dim(labels_spatial_dim) + 2, + axis=label_positions_ext_spatial_dim + ) - label_positions - 1 # [S] + distance_left = label_positions - rf.gather( + label_positions_ext, + indices=rf.range_over_dim(labels_spatial_dim), + axis=label_positions_ext_spatial_dim + ) - 1 # [S] + + # find the (common) uneven distances + # ignore the last right distance and the first left distance as they are not 
shared + labels_spatial_dim_minus_one = labels_spatial_dim - 1 + labels_spatial_dim_minus_one_range = rf.range_over_dim(labels_spatial_dim_minus_one) + distance_right_uneven = rf.gather( + rf.convert_to_tensor(distance_right % 2 != 0), + indices=labels_spatial_dim_minus_one_range, + axis=labels_spatial_dim + ) # [S-1] + distance_left_uneven = rf.gather( + rf.convert_to_tensor(distance_left % 2 != 0), + indices=labels_spatial_dim_minus_one_range + 1, + axis=labels_spatial_dim + ) # [S-1] + + assert rf.reduce_all( + distance_right_uneven == distance_left_uneven, + axis=labels_spatial_dim_minus_one, + use_mask=False, # not implemented otherwise + ), "Uneven distances are not equal." - blank_idx = {blank_idx} - max_shift = {max_shift} - - x = data.get_placeholder_as_batch_major() - seq_lens = data.get_sequence_lengths() - max_shift = tf.constant(max_shift) - - @tf.function - def get_augmented_alignment(): - # the batch with the new alignments - new_data = tf.TensorArray( - tf.int32, - size=tf.shape(x)[0], - dynamic_size=True, - clear_after_read=True + # randomly choose whether to ceil or floor the uneven distances + random_ceil = rf.random_uniform(dims=distance_right_uneven.dims, dtype="int32", minval=0, maxval=2) + + singleton_zero_tensor = rf.zeros(batch_dims + [singleton_dim], dtype="int32") + random_ceil_right = rf.where(distance_right_uneven, random_ceil, rf.zeros_like(random_ceil)) + random_ceil_right, _ = rf.concat( + (random_ceil_right, labels_spatial_dim_minus_one), + (singleton_zero_tensor, singleton_dim), + out_dim=labels_spatial_dim, + ) + random_ceil_left = rf.where(distance_left_uneven, random_ceil, rf.ones_like(random_ceil)) + random_ceil_left, _ = rf.concat( + (singleton_zero_tensor + 1, singleton_dim), + (random_ceil_left, labels_spatial_dim_minus_one), + out_dim=labels_spatial_dim, + ) + + # can at most use half of the space to the right except for the last (or first) label + # use floor division in both cases and then apply random ceil + 
distance_right, _ = rf.concat( + (rf.gather(distance_right, indices=labels_spatial_dim_minus_one_range, axis=labels_spatial_dim) // 2, labels_spatial_dim_minus_one), + (rf.gather(distance_right, indices=singleton_zero_tensor + labels_spatial_dim.dyn_size_ext - 1, axis=labels_spatial_dim), singleton_dim), + out_dim=labels_spatial_dim, + ) + distance_left, _ = rf.concat( + (rf.gather(distance_left, indices=singleton_zero_tensor, axis=labels_spatial_dim), singleton_dim), + (rf.gather(distance_left, indices=labels_spatial_dim_minus_one_range + 1, axis=labels_spatial_dim) // 2, labels_spatial_dim_minus_one), + out_dim=labels_spatial_dim, + ) + + distance_right += random_ceil_right + distance_left += 1 - random_ceil_left + + # random shift is either max_shift or the available space to the right or left + random_shift = rf.random_uniform( + dims=label_positions.dims, dtype="int32", minval=-max_shift, maxval=max_shift + 1) + random_shift = rf.clip_by_value(random_shift, -distance_left, distance_right) + + # new positions are the old positions plus the random shift + new_positions = label_positions + random_shift + assert rf.reduce_all( + rf.gather( + new_positions, indices=labels_spatial_dim_minus_one_range + 1, axis=labels_spatial_dim + ) > rf.gather( + new_positions, indices=labels_spatial_dim_minus_one_range, axis=labels_spatial_dim + ), + axis=labels_spatial_dim_minus_one, + use_mask=False, # not implemented otherwise + ), "New positions are not sorted anymore." 
+ + # set new positions in the padded area to the (last + 1) position of the new alignment and cut that position off later + alignment_spatial_dim_plus_one = alignment_spatial_dim + 1 + new_positions = rf.cast(new_positions, "int64") + new_positions = rf.where( + rf.sequence_mask(new_positions.dims), + new_positions, + rf.copy_to_device( + rf.reduce_max( + alignment_spatial_dim_plus_one.dyn_size_ext, axis=alignment_spatial_dim_plus_one.dyn_size_ext.dims) - 1, + alignment.device ) + ) - i = 0 - - # go over each alignment - for alignment in x: - # positions of non-blank labels in the original alignment - non_blank_pos = tf.cast(tf.where(tf.not_equal(alignment, blank_idx)), dtype=tf.dtypes.int32) - nb_labels = alignment[tf.not_equal(alignment, blank_idx)] - # prepend -1 and append seq_len of alignment to the positions - non_blank_pos_ext = tf.concat((tf.constant([[-1]]), non_blank_pos, [[seq_lens[i]]]), axis=0) - # from these extended positions, we get the amount of left and right space of each label - right_spaces = non_blank_pos_ext[2:] - non_blank_pos_ext[1:-1] - 1 - left_spaces = non_blank_pos_ext[1:-1] - non_blank_pos_ext[:-2] - 1 - # the maximum shifting amount is then determined by the amount of space and the max_shift parameter - max_left_shift = tf.where(tf.greater(left_spaces, max_shift), max_shift, left_spaces) - max_right_shift = tf.where(tf.greater(right_spaces, max_shift), max_shift, right_spaces) - - # concat the tensors in order to loop over them - nb_datas = tf.concat( - ( - non_blank_pos, - max_left_shift, - max_right_shift - ), - axis=-1) - - # create an array of new positions - new_non_blank_pos = tf.TensorArray( - tf.int32, - size=tf.shape(non_blank_pos)[0], - dynamic_size=True, - clear_after_read=True - ) + # extend the alignment by one in the spatial dimension to store the labels in the padded area + new_alignment_ext = alignment.copy_template_replace_dim_tag( + axis=alignment.get_axis_from_description(alignment_spatial_dim), + 
new_dim_tag=alignment_spatial_dim_plus_one + ) + new_alignment_ext = new_alignment_ext.copy_transpose(batch_dims + [alignment_spatial_dim_plus_one]) - # loop index - j = 0 - # store prev pos - prev_pos = tf.TensorArray( - tf.int32, - size=1, - dynamic_size=True, - clear_after_read=True - ) - for nb_data in nb_datas: - # the min and max value of the new position is given by the current position and the maximum left and right shift - minval = nb_data[0] - nb_data[1] - maxval = nb_data[0] + nb_data[2] - - # if we are past the first element, the left boundary is additionally constrained by the new position of - # the previous label - if j > 0: - minval = tf.maximum(minval, prev_pos.read(0) + 1) - # the new position is uniformly sampled within the possible boundaries - new_pos = tf.random.uniform((1,), minval=minval, maxval=maxval + 1, dtype=tf.dtypes.int32) - prev_pos = prev_pos.write(0, tf.squeeze(new_pos)) - new_non_blank_pos = new_non_blank_pos.write(j, new_pos) - j += 1 - - # stack the new positions into an array - new_non_blank_pos = new_non_blank_pos.stack() - - # scatter the labels into the new alignment according to the new positions - output_shape = tf.expand_dims(tf.shape(alignment)[0], axis=0) - # add 1 to the labels here because a label might have value 0 which is the same as the default value in scatter - new_alignment = tf.scatter_nd(new_non_blank_pos, nb_labels + 1, output_shape) - # subtract 1 again from the labels in the final alignment and replace the 0's with the blank idx - new_alignment = tf.where(tf.equal(new_alignment, 0), blank_idx, new_alignment - 1) - - new_data = new_data.write(i, new_alignment) - i += 1 - - return new_data.stack() - - return get_augmented_alignment() -""" - - -@contextlib.contextmanager -def make_scope(): - """ - :rtype: tf.compat.v1.Session - """ - - import returnn.tf.compat as tf_compat - with tf.Graph().as_default() as graph: - with tf_compat.v1.Session(graph=graph) as session: - yield session - - -def 
test_alignment_augmentation_in_returnn(): - from returnn.config import Config - from returnn.tf.network import TFNetwork - import returnn.tf.compat as tf_compat - - with make_scope() as session: - config = Config() - config.update({ - "shift_alignment_boundaries": shift_alignment_boundaries_v1, - "extern_data": { - "data": { - "dim": 10, - "sparse": True - } - }, - "network": { - "augmented_align": { - "class": "eval", - "from": "data:data", - "eval": "self.network.get_config().typed_value('shift_alignment_boundaries')(source(0, as_data=True), blank_idx=0, max_shift=4, network=self.network)" - }, - "output": {"class": "copy", "from": ["augmented_align"]}, - }}) - network = TFNetwork(config=config, train_flag=True) - network.construct_from_dict(config.typed_value("network")) - - session.run(tf_compat.v1.global_variables_initializer()) - out = network.layers["output"].output.placeholder - n_batch = 2 - seq_len = 21 - input_data = np.array([ - [0, 0, 0, 4, 0, 0, 8, 0, 2, 0, 0, 0, 0, 6, 0, 0, 0, 0, 3, 0, 0], - [0, 7, 0, 6, 2, 0, 0, 7, 0, 6, 0, 0, 5, 0, 0, 0, 0, 0, 5, 0, 0] - ], - dtype="int32") - seq_lens = np.array([seq_len, seq_len], dtype="int32") - assert input_data.shape == (n_batch, seq_lens[0]) - feed = {network.extern_data.data["data"].placeholder: input_data, - network.extern_data.data["data"].size_placeholder[0]: seq_lens, - # network.extern_data.data["seq_tag"].placeholder: input_tags - } - - print("input: ", input_data) - out = session.run([out, network.get_post_control_dependencies()], feed_dict=feed) - - print("output: ", out) - - network.call_graph_reset_callbacks() - - -def calculate_shift_statistics_over_corpus(max_shift, num_iterations, alignment_data, blank_idx): - label_position_diff_counter = Counter() - for i, (seq_tag, alignment_old) in enumerate(alignment_data.items()): - if i % 100 == 0: - print(i) + # scatter the labels into the new alignment according to the new positions + new_alignment_ext.raw_tensor = torch.full( + 
rf.zeros_like(new_alignment_ext).raw_tensor.shape, + blank_idx, + dtype=torch.int32, + device=alignment.device + ).scatter( + dim=1, + index=new_positions.raw_tensor.long(), + src=labels.copy_transpose(batch_dims + [labels_spatial_dim]).raw_tensor, + ) - alignment_new = alignment_old.copy() - for _ in range(num_iterations): - alignment_new = shift_alignment_boundaries_v2(alignment_new, blank_idx=blank_idx, max_shift=max_shift) + new_alignment = alignment.copy() + # cut off the last position in the spatial dimension + new_alignment.raw_tensor = new_alignment_ext.raw_tensor[:, :-1] - label_positions_old = np.argwhere(alignment_old != blank_idx)[:, 0] - label_positions_new = np.argwhere(alignment_new != blank_idx)[:, 0] - label_position_diffs = label_positions_new - label_positions_old - label_position_diff_counter.update(label_position_diffs) + return new_alignment - plt.bar(label_position_diff_counter.keys(), label_position_diff_counter.values()) - plt.title(f"Label position differences for \nmax_shift={max_shift} and {num_iterations} iterations.") - plt.show() - plt.close() +def test_alignment_augmentation_single_seq(hdf_path: str): + def calculate_shift_statistics_over_corpus(max_shift, num_iterations, alignment_data, blank_idx): + label_position_diff_counter = Counter() + for i, (seq_tag, alignment_old) in enumerate(alignment_data.items()): + if i % 100 == 0: + print(i) -def compare_alignments(seq_tag, alignment_data, vocab, blank_idx, max_shift, num_iterations): - alignment_old = alignment_data[seq_tag] - alignment_new = alignment_old.copy() - for _ in range(num_iterations): - alignment_new = shift_alignment_boundaries_v2(alignment_new, blank_idx=blank_idx, max_shift=max_shift) + alignment_new = alignment_old.copy() + for _ in range(num_iterations): + alignment_new = shift_alignment_boundaries_single_seq(alignment_new, blank_idx=blank_idx, max_shift=max_shift) - labels_old = alignment_old[alignment_old != blank_idx] - labels_new = alignment_new[alignment_new != 
blank_idx] + label_positions_old = np.argwhere(alignment_old != blank_idx)[:, 0] + label_positions_new = np.argwhere(alignment_new != blank_idx)[:, 0] + label_position_diffs = label_positions_new - label_positions_old + label_position_diff_counter.update(label_position_diffs) - fig, ax_old = PlotAlignmentJob._get_fig_ax(alignment_old) - PlotAlignmentJob._set_ticks(ax_old, alignment_old, labels_old, vocab, blank_idx, ymin=0.5) - ax_new = ax_old.twiny() - PlotAlignmentJob._set_ticks(ax_new, alignment_new, labels_new, vocab, blank_idx, ymax=0.5, color="g") - plt.title(f"Comparison of alignment for {seq_tag} \nwith max_shift={max_shift} and {num_iterations} iterations.") - plt.show() - plt.close() + plt.bar(label_position_diff_counter.keys(), label_position_diff_counter.values()) + plt.title(f"Label position differences for \nmax_shift={max_shift} and {num_iterations} iterations.") + plt.show() + plt.close() + + def compare_alignments(seq_tag, alignment_data, vocab, blank_idx, max_shift, num_iterations): + from i6_experiments.users.schmitt.visualization.visualization import PlotAlignmentJob + alignment_old = alignment_data[seq_tag] + alignment_new = alignment_old.copy() + for _ in range(num_iterations): + alignment_new = shift_alignment_boundaries_single_seq(alignment_new, blank_idx=blank_idx, max_shift=max_shift) + + labels_old = alignment_old[alignment_old != blank_idx] + labels_new = alignment_new[alignment_new != blank_idx] + fig, ax_old = PlotAlignmentJob._get_fig_ax(alignment_old) + PlotAlignmentJob._set_ticks(ax_old, alignment_old, labels_old, vocab, blank_idx, ymin=0.5) + ax_new = ax_old.twiny() + PlotAlignmentJob._set_ticks(ax_new, alignment_new, labels_new, vocab, blank_idx, ymax=0.5, color="g") + plt.title(f"Comparison of alignment for {seq_tag} \nwith max_shift={max_shift} and {num_iterations} iterations.") + plt.show() + plt.close() -def test_alignment_augmentation(hdf_path: str): # load vocabulary as dictionary with 
open("/u/zeineldeen/setups/librispeech/2022-11-28--conformer-att/work/i6_core/text/label/subword_nmt/train/ReturnnTrainBpeJob.vTq56NZ8STWt/output/bpe.vocab", "r") as f: json_data = f.read() @@ -373,5 +277,122 @@ def test_alignment_augmentation(hdf_path: str): ) +def test_alignment_augmentation_batched(hdf_path: str): + from returnn.torch.data import pipeline as data_pipeline + from returnn.torch.data import returnn_dataset_wrapper + from returnn.tensor import batch_dim + + # from test_torch_dataset.py + def get_dataloader( + dataset: Dataset, mp_manager: torch.multiprocessing.Manager, *, batch_size: int = 5, max_seqs: int = 2 + ) -> DataLoader: + # Follow mostly similar logic as in the PT engine. + + epoch_mp_shared = mp_manager.Value("i", 0) + epoch_mp_shared.value = 1 + reset_callback = returnn_dataset_wrapper.ReturnnDatasetResetMpSharedEpochCallback( + dataset=dataset, epoch_mp_shared=epoch_mp_shared + ) + + wrapped_dataset = returnn_dataset_wrapper.ReturnnDatasetIterDataPipe(dataset, reset_callback=reset_callback) + + batches_dataset = data_pipeline.BatchingIterDataPipe(wrapped_dataset, batch_size=batch_size, max_seqs=max_seqs) + + # Test different ways to deepcopy/serialize the dataset. + # This is what DataLoader2 also would do, although DataLoader2 also uses dill as a fallback, + # if it is available. + # Dill is not always available though, + # so it is important that we make sure that it also works without dill. 
+ + from copy import deepcopy + + deepcopy(batches_dataset) + + import pickle + + pickle.loads(pickle.dumps(batches_dataset)) + + return data_pipeline.create_data_loader_from_batches(batches_dataset, { + "num_workers": 1 + }) + + blank_idx = 10025 + max_shift = 2 + num_iterations = 2 + + hdf_dataset = HDFDataset([hdf_path]) + hdf_dataset.initialize() + hdf_dataset.init_seq_order(epoch=1) + + out_spatial_dim = Dim(description="out_spatial", dimension=None, kind=Dim.Types.Spatial) + vocab_dim = Dim(description="vocab", dimension=10026, kind=Dim.Types.Spatial) + + extern_data_dict = { + "data": { + "dim_tags": [batch_dim, out_spatial_dim], + "sparse_dim": vocab_dim + }, + } + extern_data = extern_data_util.extern_data_template_from_config_opts(extern_data_dict) + + mp_manager = torch.multiprocessing.Manager() + dataloader = get_dataloader(hdf_dataset, mp_manager, batch_size=10_000, max_seqs=10) + data_iter = iter(dataloader) + + label_position_diff_counter = Counter() + rf.select_backend_torch() + + seq_idx = 0 + while True: + try: + extern_data_raw = next(data_iter) + except StopIteration: + break + + extern_data_tensor_dict = extern_data_util.raw_dict_to_extern_data( + extern_data_raw, extern_data_template=extern_data, device="cpu" + ) + alignment_old = extern_data_tensor_dict["data"] + + batch_axis = alignment_old.get_axis_from_description(batch_dim) + seq_idx += alignment_old.dims[batch_axis].dyn_size_ext.raw_tensor.item() + print(f"Processing sequence {seq_idx}/{hdf_dataset.num_seqs}") + + alignment_new = alignment_old.copy() + for _ in range(num_iterations): + alignment_new = shift_alignment_boundaries_batched( + alignment_new, + alignment_spatial_dim=out_spatial_dim, + batch_dims=[batch_dim], + blank_idx=blank_idx, + max_shift=max_shift + ) + + alignment_old_raw = alignment_old.copy_transpose([batch_dim, out_spatial_dim]).raw_tensor + alignment_new_raw = alignment_new.copy_transpose([batch_dim, out_spatial_dim]).raw_tensor + seq_mask_raw = 
rf.sequence_mask(alignment_old.dims).copy_transpose([batch_dim, out_spatial_dim]).raw_tensor + + label_positions_old = torch.where( + torch.logical_and( + alignment_old_raw != blank_idx, + seq_mask_raw + ) + )[1] # [S] + label_positions_new = torch.where( + torch.logical_and( + alignment_new_raw != blank_idx, + seq_mask_raw + ) + )[1] # [S] + + label_position_diffs = label_positions_new - label_positions_old + label_position_diff_counter.update(label_position_diffs.numpy()) + + plt.bar(label_position_diff_counter.keys(), label_position_diff_counter.values()) + plt.title(f"Label position differences for \nmax_shift={max_shift} and {num_iterations} iterations.") + plt.show() + plt.close() + + if __name__ == "__main__": - test_alignment_augmentation("/work/asr3/zeyer/schmitt/sisyphus_work_dirs/segmental_models_2021_22/i6_core/returnn/forward/ReturnnForwardJob.1fohfY7LLczN/output/alignments.hdf") + test_alignment_augmentation_batched("/work/asr3/zeyer/schmitt/sisyphus_work_dirs/segmental_models_2021_22/i6_core/returnn/forward/ReturnnForwardJob.1fohfY7LLczN/output/alignments.hdf") diff --git a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23/att_weights.py b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23/att_weights.py index b83e65f2e..1bef63d9f 100644 --- a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23/att_weights.py +++ b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23/att_weights.py @@ -96,8 +96,6 @@ def dump_att_weights( hdf_filenames["att_energies"] = "att_energies.hdf" if isinstance(config_builder, SegmentalConfigBuilder): - if hdf_alias == "ground_truth": - assert hdf_targets == ref_alignment dump_frame_probs_opts["use_train_net"] = True hdf_filenames.update({ diff --git a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23/dependencies/labels/v2/librispeech/label_singletons.py 
b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23/dependencies/labels/v2/librispeech/label_singletons.py index eea5f52d4..a248e57ae 100644 --- a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23/dependencies/labels/v2/librispeech/label_singletons.py +++ b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23/dependencies/labels/v2/librispeech/label_singletons.py @@ -1,5 +1,9 @@ from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23.dependencies.labels.v2.librispeech.bpe.bpe_labels import LibrispeechBPE10025Labels, LibrispeechBPE10025LabelsWithSilence from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23.dependencies.labels.v2.librispeech.bpe.bpe_alignments import LibrispeechBpe10025CtcAlignment, LibrispeechBpe10025CtcAlignmentEos +from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23.dependencies.corpora.librispeech import LibrispeechCorpora + + +LIBRISPEECH_CORPUS = LibrispeechCorpora() LibrispeechBPE10025_LABELS = LibrispeechBPE10025Labels() LibrispeechBPE10025_LABELS_WITH_SILENCE = LibrispeechBPE10025LabelsWithSilence(LibrispeechBPE10025_LABELS) diff --git a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23/dependencies/returnn/config_builder/base.py b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23/dependencies/returnn/config_builder/base.py index 9f8d733dd..b2d55452b 100644 --- a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23/dependencies/returnn/config_builder/base.py +++ b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23/dependencies/returnn/config_builder/base.py @@ -99,8 +99,6 @@ def get_train_config(self, opts: Dict, python_epilog: Optional[Dict] = None): ) ) net_dict = self.get_net_dict("train", config_dict=config_dict, python_prolog=python_prolog) - if opts.get("align_augment", False): - 
self.add_align_augment(net_dict=net_dict, networks_dict=networks_dict, python_prolog=python_prolog) if opts.get("dataset_opts", {}).get("use_speed_pert"): python_prolog.append(speed_pert_str) @@ -126,8 +124,10 @@ def get_train_config(self, opts: Dict, python_epilog: Optional[Dict] = None): version=decoder_version, net_dict=net_dict, train=True, - target_num_labels=self.dependencies.model_hyperparameters.target_num_labels_wo_blank + target_num_labels=self.dependencies.model_hyperparameters.target_num_labels_wo_blank, + python_prolog=python_prolog, ) + config_dict["network"] = net_dict if opts.get("cleanup_old_models"): @@ -185,7 +185,8 @@ def get_recog_config(self, opts: Dict): net_dict = self.get_net_dict("search", config_dict=config_dict, python_prolog=python_prolog) if net_dict is not None: - net_dict.pop("ctc") # not needed during recognition + if not opts.get("ctc_shallow_fusion_opts"): + net_dict.pop("ctc") # not needed during recognition # set beam size net_dict["output"]["unit"]["output"]["beam_size"] = opts.get("beam_size", self.get_default_beam_size()) @@ -315,15 +316,19 @@ def edit_network_only_train_length_model(self, net_dict: Dict): def edit_network_freeze_encoder(self, net_dict: Dict): pass - def edit_network_modify_decoder(self, version: int, net_dict: Dict, train: bool, target_num_labels: int): + def edit_network_modify_decoder( + self, + version: int, + net_dict: Dict, + train: bool, + target_num_labels: int, + python_prolog: Optional[List] = None + ): raise NotImplementedError def edit_network_use_same_static_padding(self, net_dict: Dict): raise NotImplementedError - def add_align_augment(self, net_dict, networks_dict, python_prolog): - raise NotImplementedError - def edit_network_freeze_layers_excluding( self, net_dict: Dict, diff --git a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23/dependencies/returnn/config_builder/global_.py 
b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23/dependencies/returnn/config_builder/global_.py index e92abd7f5..956f70147 100644 --- a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23/dependencies/returnn/config_builder/global_.py +++ b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23/dependencies/returnn/config_builder/global_.py @@ -255,8 +255,16 @@ def get_dump_scores_config(self, corpus_key: str, opts: Dict): def edit_network_only_train_length_model(self, net_dict: Dict): raise NotImplementedError - def edit_network_modify_decoder(self, version: int, net_dict: Dict, train: bool, target_num_labels: int): - network_builder.modify_decoder(version, net_dict, "output", target_num_labels, False, train) + def edit_network_modify_decoder( + self, + version: int, + net_dict: Dict, + train: bool, + target_num_labels: int, + python_prolog: Optional[List] = None + ): + network_builder.modify_decoder( + version, net_dict, "output", target_num_labels, False, train, python_prolog) class SWBBlstmGlobalAttentionConfigBuilder(GlobalConfigBuilder, SWBBlstmConfigBuilder, ConfigBuilder): diff --git a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23/dependencies/returnn/config_builder/segmental.py b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23/dependencies/returnn/config_builder/segmental.py index 08a1b9744..2d206a4e9 100644 --- a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23/dependencies/returnn/config_builder/segmental.py +++ b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23/dependencies/returnn/config_builder/segmental.py @@ -1,5 +1,4 @@ from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23.dependencies.labels.v2.general import SegmentalLabelDefinition -from i6_experiments.users.schmitt.augmentation.alignment import shift_alignment_boundaries_func_str from 
i6_experiments.users.schmitt.chunking import custom_chunkin_func_str, custom_chunkin_w_reduction_func_str from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23.dependencies.returnn.network_builder import network_builder from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23.dependencies.returnn.network_builder.lm import lm_irie, lstm_bpe_10k @@ -120,6 +119,15 @@ def get_recog_config(self, opts: Dict): lm_embedding_layer_name=lm_embedding_layer_name ) + ctc_shallow_fusion_opts = opts.get("ctc_shallow_fusion_opts", None) + if ctc_shallow_fusion_opts is not None: + network_builder.add_ctc_shallow_fusion( + network=recog_config.config["network"], + rec_layer_name="output", + ctc_scale=ctc_shallow_fusion_opts["ctc_scale"], + target_num_labels_w_blank=self.dependencies.model_hyperparameters.target_num_labels, + ) + return recog_config def get_compile_tf_graph_config(self, opts: Dict): @@ -133,11 +141,20 @@ def edit_network_only_train_length_model(self, net_dict: Dict): if type(net_dict[item]) == dict and item != "output": self.edit_network_only_train_length_model(net_dict[item]) - def edit_network_modify_decoder(self, version: int, net_dict: Dict, train: bool, target_num_labels: int): + def edit_network_modify_decoder( + self, + version: int, + net_dict: Dict, + train: bool, + target_num_labels: int, + python_prolog: Optional[List] = None + ): if train: - network_builder.modify_decoder(version, net_dict, "label_model", target_num_labels, False, train) + network_builder.modify_decoder( + version, net_dict, "label_model", target_num_labels, False, train, python_prolog) else: - network_builder.modify_decoder(version, net_dict, "output", target_num_labels, True, train) + network_builder.modify_decoder( + version, net_dict, "output", target_num_labels, True, train, python_prolog) def get_dump_scores_config(self, corpus_key: str, opts: Dict): returnn_config = 
self.get_eval_config(eval_corpus_key=corpus_key, opts=opts) @@ -327,28 +344,6 @@ def get_recog_config_for_forward_job(self, opts: Dict): return forward_recog_config - def add_align_augment(self, net_dict, networks_dict, python_prolog): - python_prolog.append(shift_alignment_boundaries_func_str.format( - blank_idx=self.dependencies.model_hyperparameters.blank_idx, - max_shift=2 - )) - - def _add_align_augment_layer(network_dict): - network_dict.update({ - "existing_alignment0": copy.deepcopy(network_dict["existing_alignment"]), - "existing_alignment": { - "class": "eval", - "from": "existing_alignment0", - "eval": "self.network.get_config().typed_value('shift_alignment_boundaries')(source(0, as_data=True), network=self.network)" - } - }) - - if net_dict is not None: - _add_align_augment_layer(net_dict) - if networks_dict is not None: - for net_dict in networks_dict: - _add_align_augment_layer(net_dict) - @staticmethod def get_att_t_dim_tag_code_wrapper(): return CodeWrapper( diff --git a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23/dependencies/returnn/network_builder/network_builder.py b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23/dependencies/returnn/network_builder/network_builder.py index a51bcc457..449aa5854 100644 --- a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23/dependencies/returnn/network_builder/network_builder.py +++ b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23/dependencies/returnn/network_builder/network_builder.py @@ -636,39 +636,41 @@ def modify_decoder( target_num_labels: int, masked_computation: bool, train: bool, + python_prolog: Optional[List] = None, ): """ Modify the decoder part of the network. - V1: Replace Zoneout LSTM with a normal LSTM. - V2: Simply remove the att vector from the LSTM input. - V3: Like https://arxiv.org/abs/2404.01716 but without CE loss on non-blank predictor. - V4: Like V3 but with CE loss on non-blank predictor. 
+ V1: Simply remove the att vector from the LSTM input. + V2: Like https://arxiv.org/abs/2404.01716 but without CE loss on non-blank predictor. + V3: Like V3 but with CE loss on non-blank predictor. :param version: :param net_dict: :param rec_layer_name: :param target_num_labels: :param masked_computation: :param train: + :param python_prolog: :return: """ - # otherwise we need to add extra checks for masked layers in recog and layer names (e.g. output prob vs label log prob) - assert "s_length_model" in net_dict["output"][ - "unit"], "This function is only supported for our segmental model for now" if masked_computation: + assert not train, "expected train=False for masked computation" + net_dict[rec_layer_name]["unit"]["s_masked"]["unit"]["subnetwork"]["s"]["name_scope"] = "/output/rec/s_wo_att/rec" s_layer = net_dict[rec_layer_name]["unit"]["s_masked"]["unit"]["subnetwork"]["s"] else: - s_layer = net_dict[rec_layer_name]["unit"]["s"] + if train: + net_dict[rec_layer_name]["unit"]["s_wo_att"] = copy.deepcopy(net_dict[rec_layer_name]["unit"]["s"]) + del net_dict[rec_layer_name]["unit"]["s"] + s_layer = net_dict[rec_layer_name]["unit"]["s_wo_att"] + net_dict[rec_layer_name]["unit"]["s"] = { + "class": "copy", + "from": "s_wo_att" + } + else: + net_dict[rec_layer_name]["unit"]["s"]["name_scope"] = "/output/rec/s_wo_att/rec" + s_layer = net_dict[rec_layer_name]["unit"]["s"] - if version in (1, 2, 3, 4): - # before: Zoneout LSTM - s_layer.update({ - "class": "rec", - "dropout": 0.3, - "unit": "nativelstm2", - "unit_opts": {"rec_weight_dropout": 0.3}, - }) - if version in (2, 3, 4): + if version in (1, 2, 3): if masked_computation: # before: ["data", "prev:att"] s_layer["from"] = ["data"] @@ -677,7 +679,7 @@ def modify_decoder( s_layer["from"] = ["prev:target_embed"] # before: "except_batch" -> does not work since att layer is optimized out of the loop now net_dict[rec_layer_name]["unit"]["att"]["axes"] = "except_time" - if version in (3, 4): + if version in (2, 3): 
####### Label Model ####### # project directly to the target labels s_layer["n_out"] = target_num_labels @@ -728,7 +730,7 @@ def modify_decoder( } }) net_dict["output"]["unit"]["emit_prob0"]["from"] = "s_length_model_plus_encoder" - if version == 4 and train: + if version == 3 and train: net_dict[rec_layer_name]["unit"]["s_softmax"] = { "class": "activation", "from": "s_log_softmax", @@ -736,7 +738,7 @@ def modify_decoder( "loss": "ce", "target": net_dict[rec_layer_name]["unit"]["output_prob"]["target"] } - if version not in (1, 2, 3, 4): + if version not in (1, 2, 3): raise ValueError("Invalid version %d" % version) @@ -1059,6 +1061,7 @@ def add_length_model_pos_probs( } }) + def add_att_weight_interpolation( network: Dict, rec_layer_name: str, interpolation_layer_name: str, interpolation_scale: float): # just to make sure the network looks as we expect @@ -1076,6 +1079,36 @@ def add_att_weight_interpolation( }) +def add_ctc_shallow_fusion(network: Dict, rec_layer_name: str, ctc_scale: float, target_num_labels_w_blank: int): + + assert "s_length_model" in network["output"]["unit"], "This function is only supported for our segmental model for now" + assert network["output"]["unit"]["output_log_prob"]["from"] == [ + "label_log_prob_plus_emit", "blank_log_prob"], "output_log_prob layer does not look as expected" + assert 0 <= ctc_scale <= 1, "CTC scale must be in [0, 1]" + + del network["ctc"]["loss"] + del network["ctc"]["loss_opts"] + del network["ctc"]["loss_scale"] + del network["ctc"]["target"] + network["ctc"]["n_out"] = target_num_labels_w_blank + + network["output"]["unit"]["output_log_prob0"] = copy.deepcopy(network["output"]["unit"]["output_log_prob"]) + + network[rec_layer_name]["unit"].update({ + "gather_ctc_prob": { + "class": "gather", + "from": "base:ctc", + "position": ":i", + "axis": "t" + }, + "output_log_prob": { + "class": "eval", + "from": ["output_log_prob0", "gather_ctc_prob"], + "eval": f"{1 - ctc_scale} * source(0) + {ctc_scale} * 
tf.math.log(source(1))" + } + }) + + def get_segment_starts_and_lengths(segment_center_window_size: Optional[int]): if segment_center_window_size is None: return { diff --git a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23/pipelines/pipeline_ls_conf/center_window_att/baseline_v2/__init__.py b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23/pipelines/pipeline_ls_conf/center_window_att/baseline_v2/__init__.py index 19db5cf73..1f5ab56ed 100644 --- a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23/pipelines/pipeline_ls_conf/center_window_att/baseline_v2/__init__.py +++ b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23/pipelines/pipeline_ls_conf/center_window_att/baseline_v2/__init__.py @@ -49,6 +49,15 @@ def run_exps(): checkpoint_aliases=("last",) ) + for ctc_scale in (0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9): + recog.center_window_returnn_frame_wise_beam_search( + alias=train_alias, + config_builder=config_builder, + checkpoint=checkpoint, + checkpoint_aliases=("last",), + ctc_shallow_fusion_opts={"ctc_scale": ctc_scale} + ) + recog.center_window_returnn_frame_wise_beam_search_use_global_att_ilm( alias=train_alias, config_builder=config_builder, @@ -167,50 +176,3 @@ def run_exps(): lm_scale_list=(0.52, 0.54, 0.56, 0.58, 0.6), ilm_scale_list=(0.3, 0.4, 0.5) ) - - for model_alias, config_builder in baseline.center_window_att_baseline( - win_size_list=(5,), - ): - for train_alias, checkpoint in train.train_center_window_att_import_global_global_ctc_align_only_import_encoder( - alias=model_alias, - config_builder=config_builder, - n_epochs_list=(40, 100), - ): - recog.center_window_returnn_frame_wise_beam_search( - alias=train_alias, - config_builder=config_builder, - checkpoint=checkpoint, - ) - - for decoder_version in (2,): - for model_alias, config_builder in decoder_variations.center_window_att_decoder_variation( - win_size_list=(5,), - decoder_version=decoder_version, 
- ): - for train_alias, checkpoint in train.train_center_window_att_import_global_global_ctc_align_only_import_encoder( - alias=model_alias, - config_builder=config_builder, - n_epochs_list=(40,), - time_rqmt=1 - ): - recog.center_window_returnn_frame_wise_beam_search( - alias=train_alias, - config_builder=config_builder, - checkpoint=checkpoint, - ) - - for model_alias, config_builder in att_weight_interpolation.center_window_att_gaussian_att_weight_interpolation( - win_size_list=(1, 3, 129), - n_epochs_list=(10,), - gauss_scale_list=(1.,) - ): - for train_alias, checkpoint in train.train_center_window_att_import_global_global_ctc_align( - alias=model_alias, - config_builder=config_builder, - n_epochs_list=(10,), - ): - recog.center_window_returnn_frame_wise_beam_search( - alias=train_alias, - config_builder=config_builder, - checkpoint=checkpoint, - ) diff --git a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23/pipelines/pipeline_ls_conf/center_window_att/recog.py b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23/pipelines/pipeline_ls_conf/center_window_att/recog.py index 4f10be904..1cb2cc913 100644 --- a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23/pipelines/pipeline_ls_conf/center_window_att/recog.py +++ b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23/pipelines/pipeline_ls_conf/center_window_att/recog.py @@ -19,7 +19,8 @@ def center_window_returnn_frame_wise_beam_search( beam_size_list: Tuple[int, ...] = (12,), checkpoint_aliases: Tuple[str, ...] 
= ("last", "best", "best-4-avg"), run_analysis: bool = False, - att_weight_seq_tags: Optional[List] = None + att_weight_seq_tags: Optional[List] = None, + ctc_shallow_fusion_opts: Optional[Dict] = None ): ilm_opts = {"type": ilm_type} if ilm_type == "mini_att": @@ -38,7 +39,8 @@ def center_window_returnn_frame_wise_beam_search( ilm_scales=ilm_scale_list, ilm_opts=ilm_opts, run_analysis=run_analysis, - analysis_opts={"att_weight_seq_tags": att_weight_seq_tags} + analysis_opts={"att_weight_seq_tags": att_weight_seq_tags}, + recog_opts={"ctc_shallow_fusion_opts": ctc_shallow_fusion_opts}, ).run() diff --git a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23/pipelines/pipeline_ls_conf/checkpoints.py b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23/pipelines/pipeline_ls_conf/checkpoints.py index 01ba58e7f..1d6058266 100644 --- a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23/pipelines/pipeline_ls_conf/checkpoints.py +++ b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23/pipelines/pipeline_ls_conf/checkpoints.py @@ -2,7 +2,8 @@ from i6_core.returnn.training import Checkpoint external_checkpoints = { - "glob.conformer.mohammad.5.6": Checkpoint(Path("/work/asr4/zeineldeen/setups-data/librispeech/2022-11-28--conformer-att/models-backup/best_att_100/avg_ckpt/epoch.2029.index", cached=True)) + "glob.conformer.mohammad.5.6": Checkpoint(Path("/work/asr4/zeineldeen/setups-data/librispeech/2022-11-28--conformer-att/models-backup/best_att_100/avg_ckpt/epoch.2029.index", cached=True)), + "glob.conformer.mohammad.5.4": Checkpoint(Path("/work/asr4/zeineldeen/setups-data/librispeech/2022-11-28--conformer-att/work/i6_core/returnn/training/AverageTFCheckpointsJob.BxqgICRSGkgb/output/model/average.index", cached=True)) } default_import_model_name = "glob.conformer.mohammad.5.6" diff --git 
a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23/pipelines/pipeline_ls_conf/global_att/baseline_v1/__init__.py b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23/pipelines/pipeline_ls_conf/global_att/baseline_v1/__init__.py index 111041353..55356acbd 100644 --- a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23/pipelines/pipeline_ls_conf/global_att/baseline_v1/__init__.py +++ b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23/pipelines/pipeline_ls_conf/global_att/baseline_v1/__init__.py @@ -62,6 +62,17 @@ def run_exps(): } ) + # this is Mohammad's 5.4 WER model + for train_alias, checkpoint in ( + (f"{model_alias}/no-finetuning", external_checkpoints["glob.conformer.mohammad.5.4"]),): + train_alias = train_alias.replace(default_import_model_name, "glob.conformer.mohammad.5.4") + recog.global_att_returnn_label_sync_beam_search( + alias=train_alias, + config_builder=config_builder, + checkpoint=checkpoint, + checkpoint_aliases=("best-4-avg",), + ) + # continue training for 1 epoch for train_alias, checkpoint in train.train_global_att_import_global( alias=model_alias, diff --git a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23/recog_new.py b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23/recog_new.py index a91ce3104..a301f504c 100644 --- a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23/recog_new.py +++ b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23/recog_new.py @@ -196,6 +196,12 @@ def __init__(self, config_builder: SegmentalConfigBuilder, **kwargs): super().__init__(config_builder=config_builder, **kwargs) self.config_builder = config_builder + ctc_shallow_fusion_opts = self.recog_opts.get("ctc_shallow_fusion_opts") + if ctc_shallow_fusion_opts: + self.alias += "/time-sync-recog-w-ctc_shallow_fusion-%f" % ctc_shallow_fusion_opts["ctc_scale"] + else: + self.alias += 
"/time-sync-recog" + def get_mini_att_checkpoint(self, train_mini_lstm_opts: Dict) -> Checkpoint: train_opts = {} if train_mini_lstm_opts["use_eos"]: @@ -290,6 +296,8 @@ def get_ctm_path(self) -> Path: returnn_root=self.returnn_root, returnn_python_exe=self.returnn_python_exe, output_files=["output.py.gz"], + mem_rqmt=6, + time_rqmt=1, ) search_job.add_alias("%s/search_%s" % (self.alias, self.stm_corpus_key)) search_take_best_job = SearchTakeBestJob(search_py_output=search_job.out_files["output.py.gz"]) @@ -313,9 +321,6 @@ def run_analysis(self, analysis_opts: Optional[Dict] = None): if analysis_opts is not None: _analysis_opts.update(analysis_opts) - if _analysis_opts["ground_truth_hdf"] is not None: - assert _analysis_opts["ground_truth_hdf"] == _analysis_opts["att_weight_ref_alignment_hdf"] - forward_recog_config = self.config_builder.get_recog_config_for_forward_job(opts=self.recog_opts) forward_search_job = ReturnnForwardJob( model_checkpoint=self.checkpoint, diff --git a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23/train_new.py b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23/train_new.py index ffca3cd18..e903aba56 100644 --- a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23/train_new.py +++ b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23/train_new.py @@ -1,6 +1,6 @@ from sisyphus import tk, Path import copy -from typing import Dict, List, Optional, Tuple +from typing import Dict, List, Optional, Tuple, Union from abc import ABC, abstractmethod from i6_core.returnn.training import Checkpoint, ReturnnTrainingJob @@ -8,6 +8,7 @@ from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23.dependencies.returnn.config_builder.base import ConfigBuilder from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23.dependencies.returnn.config_builder.segmental import SegmentalConfigBuilder from 
i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23.dependencies.returnn.config_builder.global_ import GlobalConfigBuilder +from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.config_builder_rf.base import GlobalAttConfigBuilderRF, SegmentalAttConfigBuilderRF, ConfigBuilderRF from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23.dependencies.returnn.config_builder.ctc import CtcConfigBuilder from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23.pipelines.pipeline_ls_conf.checkpoints import external_checkpoints from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23.dependencies.labels.v2.librispeech.label_singletons import LibrispeechBPE10025_LABELS_WITH_SILENCE, LibrispeechBPE10025_CTC_ALIGNMENT @@ -18,7 +19,7 @@ class TrainExperiment(ABC): def __init__( self, - config_builder: ConfigBuilder, + config_builder: Union[ConfigBuilder, ConfigBuilderRF], alias: str, num_epochs: int, train_opts: Optional[Dict] = None, @@ -33,6 +34,10 @@ def __init__( dataset_opts = _train_opts.pop("dataset_opts", {}) self.train_opts.update(_train_opts) self.train_opts["dataset_opts"].update(dataset_opts) + if "cleanup_old_models" not in self.train_opts: + self.train_opts["cleanup_old_models"] = { + "keep_best_n": 4, "keep_last_n": 1, "keep": [num_epochs] + } self.train_rqmt = train_rqmt if train_rqmt is not None else {} self.alias = self.alias + "/train" @@ -76,7 +81,7 @@ def run_train(self) -> Tuple[Dict[int, Checkpoint], Path, Path]: class SegmentalTrainExperiment(TrainExperiment): - def __init__(self, config_builder: SegmentalConfigBuilder, **kwargs): + def __init__(self, config_builder: Union[SegmentalConfigBuilder, SegmentalAttConfigBuilderRF], **kwargs): super().__init__(config_builder=config_builder, **kwargs) @property @@ -99,7 +104,7 @@ def default_train_opts(self) -> Dict: 
class GlobalTrainExperiment(TrainExperiment): - def __init__(self, config_builder: GlobalConfigBuilder, **kwargs): + def __init__(self, config_builder: Union[GlobalConfigBuilder, GlobalAttConfigBuilderRF], **kwargs): super().__init__(config_builder=config_builder, **kwargs) @property diff --git a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/config_builder_rf/base.py b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/config_builder_rf/base.py index 156565816..b94dca43a 100644 --- a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/config_builder_rf/base.py +++ b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/config_builder_rf/base.py @@ -467,8 +467,10 @@ class SegmentalAttConfigBuilderRF(LibrispeechConformerConfigBuilderRF): def __init__( self, center_window_size: int, - decoder_version: Optional[int] = None, - length_model_opts: Optional[Dict] = None, + label_decoder_version: int, + blank_decoder_version: Optional[int] = None, + use_joint_model: bool = False, + use_weight_feedback: bool = True, **kwargs ): super(SegmentalAttConfigBuilderRF, self).__init__(**kwargs) @@ -477,9 +479,25 @@ def __init__( center_window_size=center_window_size, )) - print(decoder_version) - if decoder_version: - self.config_dict["label_decoder_version"] = decoder_version + if use_joint_model: + assert not blank_decoder_version, "Either use joint model or separate label and blank model" + + if label_decoder_version != 1: + self.config_dict["label_decoder_version"] = label_decoder_version + if blank_decoder_version is not None and blank_decoder_version != 1: + self.config_dict["blank_decoder_version"] = blank_decoder_version + if use_joint_model: + self.config_dict["use_joint_model"] = use_joint_model + if not use_weight_feedback: + self.config_dict["use_weight_feedback"] = use_weight_feedback + + def 
get_train_config(self, opts: Dict): + train_config = super(SegmentalAttConfigBuilderRF, self).get_train_config(opts) + + if opts.get("alignment_augmentation_opts"): + train_config.config["alignment_augmentation_opts"] = opts["alignment_augmentation_opts"] + + return train_config def get_recog_config(self, opts: Dict): recog_config = super(SegmentalAttConfigBuilderRF, self).get_recog_config(opts) diff --git a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/base.py b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/base.py index 8f02d397a..c6f58441c 100644 --- a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/base.py +++ b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/base.py @@ -23,7 +23,6 @@ def __init__( att_num_heads: Dim = Dim(name="att_num_heads", dimension=1), att_dropout: float = 0.1, l2: float = 0.0001, - language_model: Optional[RFModelWithMakeLabelScorer] = None, ): super(BaseLabelDecoder, self).__init__() @@ -67,5 +66,3 @@ def __init__( # Instead, it is intended to make a separate label scorer for it. 
self.language_model = None self.language_model_make_label_scorer = None - if language_model: - self.language_model, self.language_model_make_label_scorer = language_model diff --git a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/global_/model.py b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/global_/model.py index e9d4165ab..0d4f54109 100644 --- a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/global_/model.py +++ b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/global_/model.py @@ -59,10 +59,15 @@ def __init__( blank_idx=blank_idx, enc_key_total_dim=enc_key_total_dim, l2=l2, - language_model=language_model, eos_idx=eos_idx, ) + if language_model: + self.language_model, self.language_model_make_label_scorer = language_model + else: + self.language_model = None + self.language_model_make_label_scorer = None + self.blank_idx = blank_idx self.target_dim = target_dim diff --git a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/recog.py b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/recog.py index 6269b3fe0..37a07cc37 100644 --- a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/recog.py +++ b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/recog.py @@ -24,6 +24,8 @@ def _returnn_v2_forward_step(*, model, extern_data: TensorDict, **_kwargs_unused default_target_key = config.typed_value("target") targets = extern_data[default_target_key] extra.update(dict(targets=targets, targets_spatial_dim=targets.get_time_dim_tag())) + if 
config.bool("use_recombination", False): + extra.update(dict(use_recombination=True)) recog_out = recog_def(model=model, data=data, data_spatial_dim=data_spatial_dim, **extra) if len(recog_out) == 5: # recog results including beam {batch, beam, out_spatial}, diff --git a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/model.py b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/model.py index da3bba4c0..e4cd2de4d 100644 --- a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/model.py +++ b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/model.py @@ -6,7 +6,10 @@ from i6_experiments.users.schmitt.returnn_frontend.model_interfaces.model import ModelDef from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.network_builder_rf.base import _batch_size_factor, _log_mel_feature_dim -from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.network_builder_rf.segmental.model_new.blank_model.model import BlankDecoder +from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.network_builder_rf.segmental.model_new.blank_model.model import ( + BlankDecoderV1, + BlankDecoderV3, +) from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.network_builder_rf.segmental.model_new.label_model.model import ( SegmentalAttLabelDecoder, SegmentalAttLabelDecoderWoCtxInState @@ -39,6 +42,9 @@ def __init__( dec_att_num_heads: Dim = Dim(name="att_num_heads", dimension=1), enc_dropout: float = 0.1, label_decoder_version: int = 1, + blank_decoder_version: int = 1, + 
use_joint_model: bool = False, + use_weight_feedback: bool = True, ): super(SegmentalAttentionModel, self).__init__() @@ -59,11 +65,12 @@ def __init__( l2=l2, ) - print("using label_decoder_version", label_decoder_version) + assert label_decoder_version in {1, 2} + assert blank_decoder_version in {1, 3} + if label_decoder_version == 1: label_decoder_class = SegmentalAttLabelDecoder else: - assert label_decoder_version == 2 label_decoder_class = SegmentalAttLabelDecoderWoCtxInState self.label_decoder = label_decoder_class( @@ -75,19 +82,37 @@ def __init__( enc_key_total_dim=enc_key_total_dim, l2=l2, center_window_size=center_window_size, - language_model=language_model, - ) - self.blank_decoder = BlankDecoder( - length_model_state_dim=length_model_state_dim, - length_model_embed_dim=length_model_embed_dim, - align_target_dim=align_target_dim, - encoder_out_dim=self.encoder.out_dim, + use_weight_feedback=use_weight_feedback, ) + if not use_joint_model: + if blank_decoder_version == 1: + self.blank_decoder = BlankDecoderV1( + length_model_state_dim=length_model_state_dim, + length_model_embed_dim=length_model_embed_dim, + align_target_dim=align_target_dim, + encoder_out_dim=self.encoder.out_dim, + ) + else: + self.blank_decoder = BlankDecoderV3( + length_model_state_dim=length_model_state_dim, + label_state_dim=self.label_decoder.get_lstm().out_dim, + encoder_out_dim=self.encoder.out_dim, + ) + else: + self.blank_decoder = None + + if language_model: + self.language_model, self.language_model_make_label_scorer = language_model + else: + self.language_model = None + self.language_model_make_label_scorer = None + self.blank_idx = self.label_decoder.blank_idx self.center_window_size = center_window_size self.target_dim = self.label_decoder.target_dim self.align_target_dim = align_target_dim + self.use_joint_model = use_joint_model class MakeModel: @@ -125,6 +150,9 @@ def make_model( pos_emb_dropout: float = 0.0, language_model: Optional[Dict[str, Any]] = None, 
label_decoder_version: int, + blank_decoder_version: int, + use_joint_model: bool, + use_weight_feedback: bool, **extra, ) -> SegmentalAttentionModel: """make""" @@ -162,12 +190,15 @@ def make_model( ), target_dim=target_dim, align_target_dim=align_target_dim, - blank_idx=target_dim.dimension, + blank_idx=0 if use_joint_model else target_dim.dimension, language_model=lm, length_model_state_dim=Dim(name="length_model_state", dimension=128, kind=Dim.Types.Feature), length_model_embed_dim=Dim(name="length_model_embed", dimension=128, kind=Dim.Types.Feature), center_window_size=center_window_size, label_decoder_version=label_decoder_version, + blank_decoder_version=blank_decoder_version, + use_joint_model=use_joint_model, + use_weight_feedback=use_weight_feedback, **extra, ) @@ -189,6 +220,9 @@ def from_scratch_model_def( raise ValueError("center_window_size is not set!") label_decoder_version = config.int("label_decoder_version", 1) + blank_decoder_version = config.int("blank_decoder_version", 1) + use_joint_model = config.bool("use_joint_model", False) + use_weight_feedback = config.bool("use_weight_feedback", True) return MakeModel.make_model( in_dim, @@ -199,6 +233,9 @@ def from_scratch_model_def( pos_emb_dropout=pos_emb_dropout, language_model=lm_opts, label_decoder_version=label_decoder_version, + blank_decoder_version=blank_decoder_version, + use_joint_model=use_joint_model, + use_weight_feedback=use_weight_feedback, ) diff --git a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/model_new/blank_model/model.py b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/model_new/blank_model/model.py index 8b57923e8..746349237 100644 --- a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/model_new/blank_model/model.py +++ 
b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/model_new/blank_model/model.py @@ -1,4 +1,5 @@ from typing import Optional, Dict, Any, Sequence, Tuple, List +from abc import ABC, abstractmethod import functools from returnn.tensor import Tensor, Dim, single_step_dim @@ -9,30 +10,22 @@ from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.network_builder_rf.base import BaseLabelDecoder -class BlankDecoder(rf.Module): - def __init__( - self, - length_model_state_dim: Dim, - length_model_embed_dim: Dim, - align_target_dim: Dim, - encoder_out_dim: Dim, - ): - super(BlankDecoder, self).__init__() +class BlankDecoderBase(rf.Module, ABC): + def __init__(self, length_model_state_dim: Dim): + super(BlankDecoderBase, self).__init__() self.length_model_state_dim = length_model_state_dim - self.length_model_embed_dim = length_model_embed_dim self.emit_prob_dim = Dim(name="emit_prob", dimension=1) - - self.target_embed = rf.Embedding(align_target_dim, self.length_model_embed_dim) - self.s = rf.LSTM( - encoder_out_dim + self.length_model_embed_dim, - self.length_model_state_dim, - ) self.emit_prob = rf.Linear(self.length_model_state_dim, self.emit_prob_dim) + @property + @abstractmethod + def _s(self) -> rf.LSTM: + pass + def default_initial_state(self, *, batch_dims: Sequence[Dim]) -> rf.State: """Default initial state""" state = rf.State( - s_blank=self.s.default_initial_state(batch_dims=batch_dims), + s_blank=self._s.default_initial_state(batch_dims=batch_dims), i=rf.zeros(batch_dims, dtype="int32"), ) return state @@ -42,29 +35,14 @@ def loop_step_output_templates(self, batch_dims: List[Dim]) -> Dict[str, Tensor] return { "s_blank": Tensor( "s_blank", - dims=batch_dims + [self.s.out_dim], + dims=batch_dims + [self._s.out_dim], dtype=rf.get_default_float_dtype(), feature_dim_axis=-1 ), } - def loop_step( - self, - *, - enc: rf.Tensor, - 
enc_spatial_dim: Dim, - input_embed: rf.Tensor, - state: Optional[rf.State] = None, - spatial_dim=single_step_dim - ) -> Tuple[Dict[str, rf.Tensor], rf.State]: - """step of the inner loop""" - if state is None: - batch_dims = enc.remaining_dims( - remove=(enc.feature_dim, enc_spatial_dim) if enc_spatial_dim != single_step_dim else (enc.feature_dim,) - ) - state = self.default_initial_state(batch_dims=batch_dims) - state_ = rf.State() - + @staticmethod + def _get_am(enc: rf.Tensor, enc_spatial_dim: Dim, state: rf.State, spatial_dim) -> rf.Tensor: if spatial_dim == single_step_dim: i = state.i clip_to_valid = True @@ -75,7 +53,53 @@ def loop_step( i = rf.where(i < seq_lens, i, seq_lens - 1) clip_to_valid = False - am = rf.gather(enc, axis=enc_spatial_dim, indices=i, clip_to_valid=clip_to_valid) + return rf.gather(enc, axis=enc_spatial_dim, indices=i, clip_to_valid=clip_to_valid) + + def decode_logits(self, *, s_blank: Tensor) -> Tensor: + """logits for the decoder""" + logits = self.emit_prob(s_blank) + return logits + + def get_label_decoder_deps(self) -> Optional[List[str]]: + return None + + +class BlankDecoderV1(BlankDecoderBase): + def __init__( + self, + length_model_state_dim: Dim, + length_model_embed_dim: Dim, + align_target_dim: Dim, + encoder_out_dim: Dim, + ): + super(BlankDecoderV1, self).__init__(length_model_state_dim=length_model_state_dim) + self.length_model_state_dim = length_model_state_dim + self.length_model_embed_dim = length_model_embed_dim + self.emit_prob_dim = Dim(name="emit_prob", dimension=1) + + self.target_embed = rf.Embedding(align_target_dim, self.length_model_embed_dim) + self.s = rf.LSTM( + encoder_out_dim + self.length_model_embed_dim, + self.length_model_state_dim, + ) + self.emit_prob = rf.Linear(self.length_model_state_dim, self.emit_prob_dim) + + @property + def _s(self) -> rf.LSTM: + return self.s + + def loop_step( + self, + *, + enc: rf.Tensor, + enc_spatial_dim: Dim, + input_embed: rf.Tensor, + state: rf.State, + 
spatial_dim=single_step_dim + ) -> Tuple[Dict[str, rf.Tensor], rf.State]: + state_ = rf.State() + + am = self._get_am(enc, enc_spatial_dim, state, spatial_dim) s_blank, state_.s_blank = self.s( rf.concat_features(am, input_embed), state=state.s_blank, @@ -90,3 +114,102 @@ def decode_logits(self, *, s_blank: Tensor) -> Tensor: """logits for the decoder""" logits = self.emit_prob(s_blank) return logits + + +class BlankDecoderV2(BlankDecoderBase): + def __init__( + self, + length_model_state_dim: Dim, + length_model_embed_dim: Dim, + target_dim: Dim, + label_state_dim: Dim, + encoder_out_dim: Dim, + ): + super(BlankDecoderV2, self).__init__(length_model_state_dim=length_model_state_dim) + self.length_model_state_dim = length_model_state_dim + self.length_model_embed_dim = length_model_embed_dim + self.emit_prob_dim = Dim(name="emit_prob", dimension=1) + + self.target_embed = rf.Embedding(target_dim, self.length_model_embed_dim) + self.s = rf.LSTM( + encoder_out_dim + length_model_embed_dim + label_state_dim, + self.length_model_state_dim, + ) + self.emit_prob = rf.Linear(self.length_model_state_dim, self.emit_prob_dim) + + @property + def _s(self) -> rf.LSTM: + return self.s + + def loop_step( + self, + *, + enc: rf.Tensor, + enc_spatial_dim: Dim, + input_embed: rf.Tensor, + label_model_state: rf.Tensor, + state: rf.State, + spatial_dim=single_step_dim + ) -> Tuple[Dict[str, rf.Tensor], rf.State]: + state_ = rf.State() + + am = self._get_am(enc, enc_spatial_dim, state, spatial_dim) + s_blank, state_.s_blank = self.s( + rf.concat_features(am, input_embed, label_model_state), + state=state.s_blank, + spatial_dim=spatial_dim + ) + + state_.i = state.i + 1 + + return {"s_blank": s_blank}, state_ + + def get_label_decoder_deps(self) -> Optional[List[str]]: + return ["s"] + + +class BlankDecoderV3(BlankDecoderBase): + def __init__( + self, + length_model_state_dim: Dim, + label_state_dim: Dim, + encoder_out_dim: Dim, + ): + super(BlankDecoderV3, 
self).__init__(length_model_state_dim=length_model_state_dim) + self.length_model_state_dim = length_model_state_dim + self.emit_prob_dim = Dim(name="emit_prob", dimension=1) + + self.s = rf.LSTM( + encoder_out_dim + label_state_dim, + self.length_model_state_dim, + ) + self.emit_prob = rf.Linear(self.length_model_state_dim, self.emit_prob_dim) + + @property + def _s(self) -> rf.LSTM: + return self.s + + def loop_step( + self, + *, + enc: rf.Tensor, + enc_spatial_dim: Dim, + label_model_state: rf.Tensor, + state: rf.State, + spatial_dim=single_step_dim + ) -> Tuple[Dict[str, rf.Tensor], rf.State]: + state_ = rf.State() + + am = self._get_am(enc, enc_spatial_dim, state, spatial_dim) + s_blank, state_.s_blank = self.s( + rf.concat_features(am, label_model_state), + state=state.s_blank, + spatial_dim=spatial_dim + ) + + state_.i = state.i + 1 + + return {"s_blank": s_blank}, state_ + + def get_label_decoder_deps(self) -> Optional[List[str]]: + return ["s"] diff --git a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/model_new/blank_model/train.py b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/model_new/blank_model/train.py index 13daa8de3..1abef2546 100644 --- a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/model_new/blank_model/train.py +++ b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/model_new/blank_model/train.py @@ -1,34 +1,23 @@ from typing import Optional, Dict, Any, Sequence, Tuple, List -from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.network_builder_rf.segmental.model_new.blank_model.model import BlankDecoder +from 
i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.network_builder_rf.segmental.model_new.blank_model.model import ( + BlankDecoderV1, + BlankDecoderV3, + BlankDecoderBase +) from returnn.tensor import Dim import returnn.frontend as rf -def viterbi_training( +def decode_logits( *, - model: BlankDecoder, - enc_args: Dict, - enc_spatial_dim: Dim, - align_targets: rf.Tensor, + model: BlankDecoderBase, + blank_loop_out: Dict, align_targets_spatial_dim: Dim, emit_ground_truth: rf.Tensor, - emit_blank_target_dim: Dim, - batch_dims: List[Dim], + batch_dims: List[Dim] ): - align_input_embeddings = model.target_embed(align_targets) - align_input_embeddings = rf.shift_right( - align_input_embeddings, axis=align_targets_spatial_dim, pad_value=0.0) - - blank_loop_out, _ = model.loop_step( - enc=enc_args["enc"], - enc_spatial_dim=enc_spatial_dim, - input_embed=align_input_embeddings, - state=model.default_initial_state(batch_dims=batch_dims,), - spatial_dim=align_targets_spatial_dim, - ) - blank_logits = model.decode_logits(**blank_loop_out) blank_logits_packed, pack_dim = rf.pack_padded( blank_logits, dims=batch_dims + [align_targets_spatial_dim], enforce_sorted=False) @@ -36,6 +25,16 @@ def viterbi_training( emit_ground_truth, dims=batch_dims + [align_targets_spatial_dim], enforce_sorted=False, out_dim=pack_dim ) + return blank_logits_packed, pack_dim, emit_ground_truth_packed + + +def calc_loss( + *, + blank_logits_packed: rf.Tensor, + emit_ground_truth_packed: rf.Tensor, + emit_blank_target_dim: Dim, + pack_dim: Dim +): # rf.log_sigmoid not implemented for torch backend emit_log_prob = rf.log(rf.sigmoid(blank_logits_packed)) blank_log_prob = rf.log(rf.sigmoid(-blank_logits_packed)) @@ -53,3 +52,77 @@ def viterbi_training( best = rf.reduce_argmax(emit_blank_log_prob, axis=emit_blank_target_dim) frame_error = best != emit_ground_truth_packed frame_error.mark_as_loss(name="emit_blank_fer", as_error=True) + + +def 
viterbi_training( + *, + model: BlankDecoderV1, + enc_args: Dict, + enc_spatial_dim: Dim, + align_targets: rf.Tensor, + align_targets_spatial_dim: Dim, + emit_ground_truth: rf.Tensor, + emit_blank_target_dim: Dim, + batch_dims: List[Dim], +): + align_input_embeddings = model.target_embed(align_targets) + align_input_embeddings = rf.shift_right( + align_input_embeddings, axis=align_targets_spatial_dim, pad_value=0.0) + + blank_loop_out, _ = model.loop_step( + enc=enc_args["enc"], + enc_spatial_dim=enc_spatial_dim, + input_embed=align_input_embeddings, + state=model.default_initial_state(batch_dims=batch_dims,), + spatial_dim=align_targets_spatial_dim, + ) + + blank_logits_packed, pack_dim, emit_ground_truth_packed = decode_logits( + model=model, + blank_loop_out=blank_loop_out, + align_targets_spatial_dim=align_targets_spatial_dim, + emit_ground_truth=emit_ground_truth, + batch_dims=batch_dims + ) + + calc_loss( + blank_logits_packed=blank_logits_packed, + emit_ground_truth_packed=emit_ground_truth_packed, + emit_blank_target_dim=emit_blank_target_dim, + pack_dim=pack_dim + ) + + +def viterbi_training_v3( + *, + model: BlankDecoderV3, + enc_args: Dict, + enc_spatial_dim: Dim, + label_states_unmasked: rf.Tensor, + label_states_unmasked_spatial_dim: Dim, + emit_ground_truth: rf.Tensor, + emit_blank_target_dim: Dim, + batch_dims: List[Dim], +): + blank_loop_out, _ = model.loop_step( + enc=enc_args["enc"], + enc_spatial_dim=enc_spatial_dim, + label_model_state=label_states_unmasked, + state=model.default_initial_state(batch_dims=batch_dims,), + spatial_dim=label_states_unmasked_spatial_dim, + ) + + blank_logits_packed, pack_dim, emit_ground_truth_packed = decode_logits( + model=model, + blank_loop_out=blank_loop_out, + align_targets_spatial_dim=label_states_unmasked_spatial_dim, + emit_ground_truth=emit_ground_truth, + batch_dims=batch_dims + ) + + calc_loss( + blank_logits_packed=blank_logits_packed, + emit_ground_truth_packed=emit_ground_truth_packed, + 
emit_blank_target_dim=emit_blank_target_dim, + pack_dim=pack_dim + ) diff --git a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/model_new/label_model/model.py b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/model_new/label_model/model.py index 2313ced51..03a3a756d 100644 --- a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/model_new/label_model/model.py +++ b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/model_new/label_model/model.py @@ -8,12 +8,16 @@ class SegmentalAttLabelDecoder(BaseLabelDecoder): - def __init__(self, center_window_size: int, **kwargs): + def __init__(self, center_window_size: int, use_weight_feedback: bool, **kwargs): super(SegmentalAttLabelDecoder, self).__init__(**kwargs) self.center_window_size = center_window_size self.accum_att_weights_dim = Dim(name="accum_att_weights", dimension=center_window_size) + self.use_weight_feedback = use_weight_feedback + if not use_weight_feedback: + delattr(self, "weight_feedback") + def default_initial_state( self, *, @@ -23,22 +27,28 @@ def default_initial_state( ) -> rf.State: """Default initial state""" state = rf.State( - s=self._get_lstm().default_initial_state(batch_dims=batch_dims), + s=self.get_lstm().default_initial_state(batch_dims=batch_dims), att=rf.zeros(list(batch_dims) + [self.att_num_heads * self.enc_out_dim]), - accum_att_weights=rf.zeros( - list(batch_dims) + [self.accum_att_weights_dim, self.att_num_heads], feature_dim=self.att_num_heads - ), + # accum_att_weights=rf.zeros( + # list(batch_dims) + [self.accum_att_weights_dim, self.att_num_heads], feature_dim=self.att_num_heads + # ), segment_starts=rf.zeros(batch_dims, sparse_dim=segment_starts_sparse_dim, dtype="int32"), 
segment_lens=rf.zeros(batch_dims, sparse_dim=segment_lens_sparse_dim, dtype="int32"), ) state.att.feature_dim_axis = len(state.att.dims) - 1 + + if self.use_weight_feedback: + state.accum_att_weights = rf.zeros( + list(batch_dims) + [self.accum_att_weights_dim, self.att_num_heads], feature_dim=self.att_num_heads + ) + return state def loop_step_output_templates(self, batch_dims: List[Dim]) -> Dict[str, Tensor]: """loop step out""" return { "s": Tensor( - "s", dims=batch_dims + [self._get_lstm().out_dim], dtype=rf.get_default_float_dtype(), feature_dim_axis=-1 + "s", dims=batch_dims + [self.get_lstm().out_dim], dtype=rf.get_default_float_dtype(), feature_dim_axis=-1 ), "att": Tensor( "att", @@ -133,7 +143,7 @@ def _update_state( ): return self.s(rf.concat_features(input_embed, prev_att), state=prev_s_state, spatial_dim=single_step_dim) - def _get_lstm(self): + def get_lstm(self): return self.s def loop_step( @@ -146,20 +156,13 @@ def loop_step( input_embed: rf.Tensor, segment_starts: rf.Tensor, segment_lens: rf.Tensor, - state: Optional[rf.State] = None, + state: rf.State, ) -> Tuple[Dict[str, rf.Tensor], rf.State]: - """step of the inner loop""" - if state is None: - batch_dims = enc.remaining_dims( - remove=(enc.feature_dim, enc_spatial_dim) if enc_spatial_dim != single_step_dim else (enc.feature_dim,) - ) - state = self.default_initial_state(batch_dims=batch_dims) state_ = rf.State() # during search, these need to be the values from the previous "emit" step (not necessarily the previous time step) prev_att = state.att prev_s_state = state.s - prev_accum_att_weights = state.accum_att_weights prev_segment_starts = state.segment_starts prev_segment_lens = state.segment_lens @@ -173,16 +176,20 @@ def loop_step( enc_ctx_sliced = rf.gather(enc_ctx, axis=enc_spatial_dim, indices=gather_positions, clip_to_valid=True) enc_sliced = rf.gather(enc, axis=enc_spatial_dim, indices=gather_positions, clip_to_valid=True) - prev_accum_att_weights_scattered = 
self._get_prev_accum_att_weights_scattered( - prev_accum_att_weights=prev_accum_att_weights, - segment_starts=segment_starts, - prev_segment_starts=prev_segment_starts, - prev_segment_lens=prev_segment_lens, - ) - weight_feedback = self._get_weight_feedback( - prev_accum_att_weights_scattered=prev_accum_att_weights_scattered, - att_t_dim=slice_dim, - ) + if self.use_weight_feedback: + prev_accum_att_weights = state.accum_att_weights + prev_accum_att_weights_scattered = self._get_prev_accum_att_weights_scattered( + prev_accum_att_weights=prev_accum_att_weights, + segment_starts=segment_starts, + prev_segment_starts=prev_segment_starts, + prev_segment_lens=prev_segment_lens, + ) + weight_feedback = self._get_weight_feedback( + prev_accum_att_weights_scattered=prev_accum_att_weights_scattered, + att_t_dim=slice_dim, + ) + else: + weight_feedback = rf.zeros((self.enc_key_total_dim,)) energy_in = enc_ctx_sliced + weight_feedback + s_transformed energy = self.energy(rf.tanh(energy_in)) @@ -193,16 +200,17 @@ def loop_step( att, _ = rf.merge_dims(att0, dims=(self.att_num_heads, self.enc_out_dim)) state_.att = att - accum_att_weights = self._get_accum_att_weights( - att_t_dim=slice_dim, - enc_spatial_dim=enc_spatial_dim, - inv_fertility=inv_fertility, - att_weights=att_weights, - prev_accum_att_weights_scattered=prev_accum_att_weights_scattered, - gather_positions=gather_positions, - ) - accum_att_weights.feature_dim = self.att_num_heads - state_.accum_att_weights = accum_att_weights + if self.use_weight_feedback: + accum_att_weights = self._get_accum_att_weights( + att_t_dim=slice_dim, + enc_spatial_dim=enc_spatial_dim, + inv_fertility=inv_fertility, + att_weights=att_weights, + prev_accum_att_weights_scattered=prev_accum_att_weights_scattered, + gather_positions=gather_positions, + ) + accum_att_weights.feature_dim = self.att_num_heads + state_.accum_att_weights = accum_att_weights state_.segment_starts = segment_starts state_.segment_lens = segment_lens @@ -242,5 +250,5 
@@ def _update_state( ): return self.s_wo_att(rf.concat_features(input_embed), state=prev_s_state, spatial_dim=single_step_dim) - def _get_lstm(self): + def get_lstm(self): return self.s_wo_att diff --git a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/model_new/label_model/train.py b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/model_new/label_model/train.py index fe2d14b7f..3a547e7bd 100644 --- a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/model_new/label_model/train.py +++ b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/model_new/label_model/train.py @@ -3,7 +3,7 @@ from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.network_builder_rf.segmental.utils import get_non_blank_mask, get_masked from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.network_builder_rf.segmental.model_new.label_model.model import SegmentalAttLabelDecoder -from returnn.tensor import Dim +from returnn.tensor import Dim, single_step_dim import returnn.frontend as rf from i6_experiments.users.schmitt.returnn_frontend.model_interfaces.training import TrainDef @@ -19,9 +19,10 @@ def viterbi_training( segment_starts: rf.Tensor, segment_lens: rf.Tensor, batch_dims: List[Dim], -): + output_tensors: Optional[List[str]] = None +) -> Optional[Dict[str, Tuple[rf.Tensor, Dim]]]: non_blank_input_embeddings = model.target_embed(non_blank_targets) - non_blank_input_embeddings = rf.shift_right( + non_blank_input_embeddings_shifted = rf.shift_right( non_blank_input_embeddings, axis=non_blank_targets_spatial_dim, pad_value=0.0) # ------------------- label loop 
------------------- @@ -38,10 +39,10 @@ def _label_loop_body(xs, state: rf.State): ) return loop_out_, new_state - label_loop_out, _, _ = rf.scan( + label_loop_out, final_state, _ = rf.scan( spatial_dim=non_blank_targets_spatial_dim, xs={ - "input_embed": non_blank_input_embeddings, + "input_embed": non_blank_input_embeddings_shifted, "segment_starts": segment_starts, "segment_lens": segment_lens, }, @@ -57,7 +58,7 @@ def _label_loop_body(xs, state: rf.State): body=_label_loop_body, ) - logits = model.decode_logits(input_embed=non_blank_input_embeddings, **label_loop_out) + logits = model.decode_logits(input_embed=non_blank_input_embeddings_shifted, **label_loop_out) logits_packed, pack_dim = rf.pack_padded(logits, dims=batch_dims + [non_blank_targets_spatial_dim], enforce_sorted=False) non_blank_targets_packed, _ = rf.pack_padded( non_blank_targets, dims=batch_dims + [non_blank_targets_spatial_dim], enforce_sorted=False, out_dim=pack_dim @@ -73,3 +74,31 @@ def _label_loop_body(xs, state: rf.State): best = rf.reduce_argmax(logits_packed, axis=model.target_dim) frame_error = best != non_blank_targets_packed frame_error.mark_as_loss(name="non_blank_fer", as_error=True) + + if output_tensors is not None: + extended_outputs = {} + # need to run the loop one more time to get the last output (which is not needed for the loss computation) + last_embedding = rf.gather( + non_blank_input_embeddings, + axis=non_blank_targets_spatial_dim, + indices=rf.copy_to_device( + non_blank_targets_spatial_dim.get_size_tensor() - 1, non_blank_input_embeddings.device) + ) + last_loop_out, _ = model.loop_step( + **enc_args, + enc_spatial_dim=enc_spatial_dim, + input_embed=last_embedding, + segment_starts=final_state.decoder.segment_starts, + segment_lens=final_state.decoder.segment_lens, + state=final_state.decoder, + ) + for key, val in last_loop_out.items(): + if key not in output_tensors: + continue + extended_outputs[key] = rf.concat( + (label_loop_out[key], 
non_blank_targets_spatial_dim), + (rf.expand_dim(val, single_step_dim), single_step_dim), + ) + return extended_outputs + + return None diff --git a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/recog.py b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/recog.py index 4245362d5..730e67383 100644 --- a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/recog.py +++ b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/recog.py @@ -1,7 +1,7 @@ from typing import Optional, Dict, Any, Tuple import tree -from returnn.tensor import Tensor, Dim +from returnn.tensor import Tensor, Dim, single_step_dim import returnn.frontend as rf from returnn.frontend.tensor_array import TensorArray @@ -9,6 +9,74 @@ from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.network_builder_rf.base import _batch_size_factor from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.network_builder_rf.segmental.model import SegmentalAttentionModel from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.network_builder_rf.segmental.utils import get_masked, get_non_blank_mask +from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.network_builder_rf.beam_search import utils as beam_search_utils +from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.network_builder_rf.segmental.model_new.blank_model.model import ( + BlankDecoderV1, + BlankDecoderV3, +) + + +def recombine_seqs( + seq_targets: list, + seq_log_prob: 
Tensor, + seq_backrefs: list, + seq_hash: Tensor, + beam_dim: Dim, + batch_dim: Dim, + i: int +) -> Tensor: + if len(seq_targets) in (0, 1): + return seq_log_prob + + print("seq_hash: ", seq_hash.raw_tensor) + print("seq_log_prob before: ", seq_log_prob.raw_tensor) + + seq_hash_cpu = rf.copy_to_device(seq_hash.copy_transpose([batch_dim, beam_dim]), device="cpu") + seq_log_prob = rf.copy_to_device(seq_log_prob.copy_transpose([batch_dim, beam_dim]), device="cpu") + + for b in range(batch_dim.get_dim_value()): + seq_sets = {} + for h in range(beam_dim.dimension): + seq_hash_value = seq_hash_cpu.raw_tensor[b, h] + if seq_hash_value not in seq_sets: + seq_sets[seq_hash_value] = [] + seq_sets[seq_hash_value].append(h) + + for seq_set in seq_sets.values(): + if len(seq_set) == 1: + continue + best_score = 0 + best_idx = -1 + for idx in seq_set: + if seq_log_prob.raw_tensor[b, idx] > best_score: + best_score = seq_log_prob.raw_tensor[b, idx] + best_idx = idx + for idx in seq_set: + if idx != best_idx: + seq_log_prob.raw_tensor[b, idx] = -float("inf") + else: + seq_log_prob.raw_tensor[b, idx] = best_score + + print("seq_log_prob after: ", seq_log_prob.raw_tensor) + exit() + + return rf.copy_to_device(seq_log_prob, device="gpu") + + +def update_seq_hash(seq_hash: Tensor, target: Tensor, backrefs: Tensor) -> Tensor: + print("update_seq_hash") + print("old seq_hash", seq_hash.raw_tensor) + print("target", target.raw_tensor) + print("backrefs", backrefs.raw_tensor) + print("\n\n") + + old_seq_hash = rf.gather(seq_hash, indices=backrefs) + seq_hash = rf.where( + target == 10025, + old_seq_hash, + (old_seq_hash * 257 + (target + 1)) % (10 ** 9 + 7) + ) + return seq_hash def model_recog( @@ -16,7 +84,7 @@ def model_recog( model: SegmentalAttentionModel, data: Tensor, data_spatial_dim: Dim, - max_seq_len: Optional[int] = None, + use_recombination: bool = False, ) -> Tuple[Tensor, Tensor, Dim, Dim]: """ Function is run within RETURNN. 
@@ -31,47 +99,79 @@ def model_recog( out_spatial_dim, final beam_dim """ - assert not model.label_decoder.language_model # not implemented here. use the pure PyTorch search instead + # assert not model.language_model # not implemented here. use the pure PyTorch search instead + assert any( + isinstance(model.blank_decoder, cls) for cls in (BlankDecoderV1, BlankDecoderV3) + ) or model.blank_decoder is None, "blank_decoder not supported" + if model.blank_decoder is None: + assert model.use_joint_model, "blank_decoder is None, so use_joint_model must be True" batch_dims = data.remaining_dims((data_spatial_dim, data.feature_dim)) enc_args, enc_spatial_dim = model.encoder.encode(data, in_spatial_dim=data_spatial_dim) beam_size = 12 - if max_seq_len is None: - max_seq_len = enc_spatial_dim.get_size_tensor() - else: - max_seq_len = rf.convert_to_tensor(max_seq_len, dtype="int32") + max_seq_len = enc_spatial_dim.get_size_tensor() print("** max seq len:", max_seq_len.raw_tensor) max_seq_len = rf.reduce_max(max_seq_len, axis=max_seq_len.dims) - # Eager-mode implementation of beam search. # Initial state. 
beam_dim = Dim(1, name="initial-beam") batch_dims_ = [beam_dim] + batch_dims + label_decoder_state = model.label_decoder.default_initial_state(batch_dims=batch_dims_, ) + if model.blank_decoder is not None: + blank_decoder_state = model.blank_decoder.default_initial_state(batch_dims=batch_dims_) + if model.language_model: + lm_state = model.language_model.default_initial_state(batch_dims=batch_dims_) - blank_decoder_state = model.blank_decoder.default_initial_state(batch_dims=batch_dims_) bos_idx = 0 - target = rf.constant(bos_idx, dims=batch_dims_, sparse_dim=model.align_target_dim) - target_non_blank = rf.constant(bos_idx, dims=batch_dims_, sparse_dim=model.target_dim) - # ended = rf.constant(False, dims=batch_dims_) + + if model.use_joint_model: + target = rf.constant(bos_idx, dims=batch_dims_, sparse_dim=model.target_dim) + else: + target = rf.constant(bos_idx, dims=batch_dims_, sparse_dim=model.align_target_dim) + update_state_mask = rf.convert_to_tensor(target != model.blank_idx) + target_non_blank = rf.constant(bos_idx, dims=batch_dims_, sparse_dim=model.target_dim) + seq_log_prob = rf.constant(0.0, dims=batch_dims_) + if use_recombination: + assert len(batch_dims) == 1 + seq_hash = rf.constant(0, dims=batch_dims_, dtype="int64") + + input_embed = rf.zeros( + batch_dims_ + [model.label_decoder.target_embed.out_dim], + feature_dim=model.label_decoder.target_embed.out_dim, + dtype="float32" + ) + + if isinstance(model.blank_decoder, BlankDecoderV1): + input_embed_length_model = rf.zeros( + batch_dims_ + [model.blank_decoder.target_embed.out_dim], feature_dim=model.blank_decoder.target_embed.out_dim) + else: + input_embed_length_model = None + + old_beam_dim = beam_dim.copy() + backrefs = rf.zeros(batch_dims_, dtype="int32") i = 0 seq_targets = [] seq_backrefs = [] while i < max_seq_len.raw_tensor: - if i == 0: - input_embed = rf.zeros( - batch_dims_ + [model.label_decoder.target_embed.out_dim], - feature_dim=model.label_decoder.target_embed.out_dim, - 
dtype="float32" - ) - input_embed_length_model = rf.zeros( - batch_dims_ + [model.blank_decoder.target_embed.out_dim], feature_dim=model.blank_decoder.target_embed.out_dim) - else: - input_embed_length_model = model.blank_decoder.target_embed(target) + if i > 0: + if model.use_joint_model: + input_embed = model.label_decoder.target_embed(target) + else: + target_non_blank = rf.where(update_state_mask, target, rf.gather(target_non_blank, indices=backrefs)) + target_non_blank.sparse_dim = model.label_decoder.target_embed.in_dim + input_embed = rf.where( + update_state_mask, + model.label_decoder.target_embed(target_non_blank), + rf.gather(input_embed, indices=backrefs, axis=old_beam_dim) + ) + if isinstance(model.blank_decoder, BlankDecoderV1): + input_embed_length_model = model.blank_decoder.target_embed(target) # ------------------- label step ------------------- + center_position = rf.minimum( rf.full(dims=[beam_dim] + batch_dims, fill_value=i, dtype="int32"), rf.copy_to_device(enc_spatial_dim.get_size_tensor() - 1, data.device) @@ -95,56 +195,113 @@ def model_recog( label_logits = model.label_decoder.decode_logits(input_embed=input_embed, **label_step_out) label_log_prob = rf.log_softmax(label_logits, axis=model.target_dim) + # ------------------- external LM step ------------------- + + if model.language_model: + lm_logits, lm_state_updated = model.language_model( + target_non_blank, + spatial_dim=single_step_dim, + state=lm_state, + ) + label_log_prob += rf.log_softmax(lm_logits, axis=model.target_dim) + # ------------------- blank step ------------------- - blank_step_out, blank_decoder_state = model.blank_decoder.loop_step( - enc=enc_args["enc"], - enc_spatial_dim=enc_spatial_dim, - input_embed=input_embed_length_model, - state=blank_decoder_state, - ) - blank_logits = model.blank_decoder.decode_logits(**blank_step_out) - emit_log_prob = rf.log(rf.sigmoid(blank_logits)) - emit_log_prob = rf.squeeze(emit_log_prob, axis=emit_log_prob.feature_dim) - 
blank_log_prob = rf.log(rf.sigmoid(-blank_logits)) - - # combine blank and label probs - label_log_prob += emit_log_prob - output_log_prob, _ = rf.concat( - (label_log_prob, model.target_dim), (blank_log_prob, blank_log_prob.feature_dim), - out_dim=model.align_target_dim - ) + if not model.use_joint_model: + blank_loop_step_kwargs = dict( + enc=enc_args["enc"], + enc_spatial_dim=enc_spatial_dim, + state=blank_decoder_state, + ) + if isinstance(model.blank_decoder, BlankDecoderV1): + blank_loop_step_kwargs["input_embed"] = input_embed_length_model + else: + blank_loop_step_kwargs["label_model_state"] = label_step_out["s"] + + blank_step_out, blank_decoder_state = model.blank_decoder.loop_step(**blank_loop_step_kwargs) + blank_logits = model.blank_decoder.decode_logits(**blank_step_out) + emit_log_prob = rf.log(rf.sigmoid(blank_logits)) + emit_log_prob = rf.squeeze(emit_log_prob, axis=emit_log_prob.feature_dim) + blank_log_prob = rf.log(rf.sigmoid(-blank_logits)) + # update blank decoder state + blank_decoder_state = tree.map_structure(lambda s: rf.gather(s, indices=backrefs), blank_decoder_state) + + # ------------------- combination ------------------- + + label_log_prob += emit_log_prob + output_log_prob, _ = rf.concat( + (label_log_prob, model.target_dim), (blank_log_prob, blank_log_prob.feature_dim), + out_dim=model.align_target_dim + ) + else: + output_log_prob = label_log_prob + + # ------------------- top-k ------------------- + + if use_recombination: + seq_log_prob = recombine_seqs(seq_targets, seq_log_prob, seq_backrefs, seq_hash, beam_dim, batch_dims[0], i) + if i== 3: + exit() - # top-k seq_log_prob = seq_log_prob + output_log_prob # Batch, InBeam, Vocab old_beam_dim = beam_dim.copy() seq_log_prob, (backrefs, target), beam_dim = rf.top_k( - seq_log_prob, k_dim=Dim(beam_size, name=f"dec-step{i}-beam"), axis=[beam_dim, model.align_target_dim] + seq_log_prob, + k_dim=Dim(beam_size, name=f"dec-step{i}-beam"), + axis=[beam_dim, model.target_dim if 
model.use_joint_model else model.align_target_dim] ) # seq_log_prob, backrefs, target: Batch, Beam seq_targets.append(target) seq_backrefs.append(backrefs) + if use_recombination: + seq_hash = update_seq_hash(seq_hash, target, backrefs) + + # mask for updating label-sync states update_state_mask = rf.convert_to_tensor(target != model.blank_idx) - def _get_masked_state(old, new, mask): - old = rf.gather(old, indices=backrefs, axis=old_beam_dim) - new = rf.gather(new, indices=backrefs, axis=old_beam_dim) - return rf.where(mask, new, old) + # ------------------- update label decoder state ------------------- - label_decoder_state = tree.map_structure( - lambda old_state, new_state: _get_masked_state(old_state, new_state, update_state_mask), - label_decoder_state, label_decoder_state_updated - ) + if model.use_joint_model: + label_decoder_state = tree.map_structure(lambda s: rf.gather(s, indices=backrefs), label_decoder_state) + else: + def _get_masked_state(old, new, mask): + old = rf.gather(old, indices=backrefs, axis=old_beam_dim) + new = rf.gather(new, indices=backrefs, axis=old_beam_dim) + return rf.where(mask, new, old) + + label_decoder_state = tree.map_structure( + lambda old_state, new_state: _get_masked_state(old_state, new_state, update_state_mask), + label_decoder_state, label_decoder_state_updated + ) - target_non_blank = rf.where(update_state_mask, target, rf.gather(target_non_blank, indices=backrefs)) - target_non_blank.sparse_dim = model.label_decoder.target_embed.in_dim - input_embed = rf.where( - update_state_mask, - model.label_decoder.target_embed(target_non_blank), - rf.gather(input_embed, indices=backrefs, axis=old_beam_dim) - ) + # ------------------- update external LM state ------------------- + + if model.language_model: + def _get_masked_state_lm(old: rf.Tensor, new: rf.Tensor, mask: rf.Tensor): + if isinstance(old, Dim): + return new + + def _update(tensor: rf.Tensor): + tensor = tensor.copy_transpose(batch_dims + [old_beam_dim] + 
tensor.remaining_dims(batch_dims_)) + tensor_raw_tensor = beam_search_utils.batch_gather( + tensor.raw_tensor, + indices=backrefs.copy_transpose(batch_dims + [beam_dim]).raw_tensor + ) + tensor = tensor.copy_template_replace_dim_tag(1, beam_dim) + tensor.raw_tensor = tensor_raw_tensor + return tensor + + old = _update(old) + new = _update(new) + + return rf.where(mask, new, old) + + lm_state = tree.map_structure( + lambda old_state, new_state: _get_masked_state_lm(old_state, new_state, update_state_mask), + lm_state, lm_state_updated + ) - blank_decoder_state = tree.map_structure(lambda s: rf.gather(s, indices=backrefs), blank_decoder_state) + exit() i += 1 diff --git a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/train.py b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/train.py index 599aa6ef8..fe38ba5cc 100644 --- a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/train.py +++ b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/train.py @@ -1,13 +1,11 @@ +import torch + from i6_experiments.users.schmitt.returnn_frontend.model_interfaces.training import FramewiseTrainDef from returnn.tensor import TensorDict -from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.network_builder_rf.segmental.utils import ( - get_non_blank_mask, - get_masked, - get_emit_ground_truth, - get_segment_starts_and_lens -) +from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.network_builder_rf.segmental import utils +from i6_experiments.users.schmitt.augmentation.alignment import shift_alignment_boundaries_batched from 
i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.network_builder_rf.segmental.model import SegmentalAttentionModel from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.network_builder_rf.segmental.model_new.label_model.train import ( viterbi_training as label_model_viterbi_training @@ -15,6 +13,13 @@ from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.network_builder_rf.segmental.model_new.blank_model.train import ( viterbi_training as blank_model_viterbi_training ) +from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.network_builder_rf.segmental.model_new.blank_model.train import ( + viterbi_training_v3 as blank_model_viterbi_training_v3 +) +from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.network_builder_rf.segmental.model_new.blank_model.model import ( + BlankDecoderV1, + BlankDecoderV3, +) from returnn.tensor import Dim import returnn.frontend as rf @@ -63,17 +68,43 @@ def viterbi_training( batch_dims = data.remaining_dims(data_spatial_dim) - non_blank_targets, non_blank_targets_spatial_dim = get_masked( - align_targets, get_non_blank_mask(align_targets, model.blank_idx), align_targets_spatial_dim, batch_dims - ) - non_blank_targets.sparse_dim = model.target_dim - segment_starts, segment_lens = get_segment_starts_and_lens( - align_targets, - align_targets_spatial_dim, - model, - batch_dims, - non_blank_targets_spatial_dim - ) + alignment_augmentation_opts = config.typed_value("alignment_augmentation_opts", None) + if alignment_augmentation_opts is not None: + for _ in range(alignment_augmentation_opts["num_iterations"]): + align_targets = shift_alignment_boundaries_batched( + alignment=align_targets, + alignment_spatial_dim=align_targets_spatial_dim, + 
batch_dims=batch_dims, + blank_idx=model.blank_idx, + max_shift=alignment_augmentation_opts["max_shift"], + ) + + if model.use_joint_model: + non_blank_targets, non_blank_targets_spatial_dim = None, None + segment_starts, segment_lens = utils.get_segment_starts_and_lens( + utils.get_non_blank_mask(align_targets, blank_idx=-1), # this way, every frame is interpreted as non-blank + align_targets, + align_targets_spatial_dim, + model, + batch_dims, + align_targets_spatial_dim + ) + # set blank indices in alignment to 0 (= EOS index of imported global att model which is not used otherwise) + align_targets.raw_tensor[align_targets.raw_tensor == model.target_dim.dimension] = 0 + align_targets.sparse_dim = model.target_dim + else: + non_blank_targets, non_blank_targets_spatial_dim = utils.get_masked( + align_targets, utils.get_non_blank_mask(align_targets, model.blank_idx), align_targets_spatial_dim, batch_dims + ) + non_blank_targets.sparse_dim = model.target_dim + segment_starts, segment_lens = utils.get_segment_starts_and_lens( + utils.get_non_blank_mask(align_targets, model.blank_idx), + align_targets, + align_targets_spatial_dim, + model, + batch_dims, + non_blank_targets_spatial_dim + ) # ------------------- encoder aux loss ------------------- @@ -101,33 +132,67 @@ def viterbi_training( use_normalized_loss=True, ) - # ------------------- label loop ------------------- - - label_model_viterbi_training( - model=model.label_decoder, - enc_args=enc_args, - enc_spatial_dim=enc_spatial_dim, - non_blank_targets=non_blank_targets, - non_blank_targets_spatial_dim=non_blank_targets_spatial_dim, - segment_starts=segment_starts, - segment_lens=segment_lens, - batch_dims=batch_dims, - ) - - # ------------------- blank loop ------------------- - - emit_ground_truth, emit_blank_target_dim = get_emit_ground_truth(align_targets, model.blank_idx) - blank_model_viterbi_training( - model=model.blank_decoder, - enc_args=enc_args, - enc_spatial_dim=enc_spatial_dim, - 
align_targets=align_targets, - align_targets_spatial_dim=align_targets_spatial_dim, - emit_ground_truth=emit_ground_truth, - emit_blank_target_dim=emit_blank_target_dim, - batch_dims=batch_dims, - ) - + if model.use_joint_model: + # ------------------- joint loop ------------------- + label_model_viterbi_training( + model=model.label_decoder, + enc_args=enc_args, + enc_spatial_dim=enc_spatial_dim, + non_blank_targets=align_targets, + non_blank_targets_spatial_dim=align_targets_spatial_dim, + segment_starts=segment_starts, + segment_lens=segment_lens, + batch_dims=batch_dims, + ) + else: + # ------------------- label loop ------------------- + + label_decoder_outputs = label_model_viterbi_training( + model=model.label_decoder, + enc_args=enc_args, + enc_spatial_dim=enc_spatial_dim, + non_blank_targets=non_blank_targets, + non_blank_targets_spatial_dim=non_blank_targets_spatial_dim, + segment_starts=segment_starts, + segment_lens=segment_lens, + batch_dims=batch_dims, + output_tensors=model.blank_decoder.get_label_decoder_deps(), + ) + + # ------------------- blank loop ------------------- + + emit_ground_truth, emit_blank_target_dim = utils.get_emit_ground_truth(align_targets, model.blank_idx) + if isinstance(model.blank_decoder, BlankDecoderV1): + blank_model_viterbi_training( + model=model.blank_decoder, + enc_args=enc_args, + enc_spatial_dim=enc_spatial_dim, + align_targets=align_targets, + align_targets_spatial_dim=align_targets_spatial_dim, + emit_ground_truth=emit_ground_truth, + emit_blank_target_dim=emit_blank_target_dim, + batch_dims=batch_dims, + ) + else: + assert isinstance(model.blank_decoder, BlankDecoderV3) + assert "s" in label_decoder_outputs + + label_states_unmasked = utils.get_unmasked( + input=label_decoder_outputs["s"][0], + input_spatial_dim=label_decoder_outputs["s"][1], + mask=utils.get_non_blank_mask(align_targets, model.blank_idx), + mask_spatial_dim=align_targets_spatial_dim + ) + blank_model_viterbi_training_v3( + 
model=model.blank_decoder, + enc_args=enc_args, + enc_spatial_dim=enc_spatial_dim, + label_states_unmasked=label_states_unmasked, + label_states_unmasked_spatial_dim=align_targets_spatial_dim, + emit_ground_truth=emit_ground_truth, + emit_blank_target_dim=emit_blank_target_dim, + batch_dims=batch_dims, + ) viterbi_training: TrainDef[SegmentalAttentionModel] viterbi_training.learning_rate_control_error_measure = "dev_score_full_sum" diff --git a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/utils.py b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/utils.py index 707be5236..7b8a7281a 100644 --- a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/utils.py +++ b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/utils.py @@ -43,14 +43,30 @@ def get_masked( return result, result_spatial_dim +def get_unmasked( + input: Tensor, input_spatial_dim: Dim, mask: Tensor, mask_spatial_dim: Dim +): + mask_shifted = rf.shift_right(mask, axis=mask_spatial_dim, pad_value=False) + mask_axis = mask.get_axis_from_description(mask_spatial_dim) + cumsum = rf.cast(mask_shifted, "int32").copy_template() + cumsum.raw_tensor = torch.cumsum( + mask_shifted.raw_tensor.to(torch.int32), dim=mask_axis, dtype=torch.int32 + ) + return rf.gather( + input, + indices=cumsum, + axis=input_spatial_dim, + ) + + def get_segment_starts_and_lens( + non_blank_mask: Tensor, align_targets: Tensor, align_targets_spatial_dim: Dim, model: SegmentalAttentionModel, batch_dims: Sequence[Dim], out_spatial_dim: Dim ): - non_blank_mask = get_non_blank_mask(align_targets, model.blank_idx) targets_range = rf.range_over_dim(align_targets_spatial_dim, dtype="int32") targets_range = rf.expand_dim(targets_range, batch_dims[0]) 
non_blank_positions, _ = get_masked( diff --git a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/__init__.py b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/__init__.py index 71d73b64b..c7797b8a9 100644 --- a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/__init__.py +++ b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/__init__.py @@ -1,4 +1,6 @@ from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.pipelines.pipeline_ls_conf.center_window_att import baseline_v1 as center_window_baseline_v1 +from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.pipelines.pipeline_ls_conf.center_window_att import baseline_v3 as center_window_baseline_v3 +from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.pipelines.pipeline_ls_conf.center_window_att import baseline_v4 as center_window_baseline_v4 from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.pipelines.pipeline_ls_conf.global_att import baseline_v1 as global_att_baseline_v1 from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23.pipelines.pipeline_ls_conf.global_att import baseline_v1 as global_att_baseline_v1_no_rf @@ -9,4 +11,5 @@ def run_exps(): global_att_baseline_v1_no_rf.register_ctc_alignments() global_att_baseline_v1.run_exps() - center_window_baseline_v1.run_exps() + center_window_baseline_v3.run_exps() + center_window_baseline_v4.run_exps() diff --git a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/center_window_att/baseline_v1/__init__.py 
b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/center_window_att/baseline_v1/__init__.py index f9266f4f2..b33815fb9 100644 --- a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/center_window_att/baseline_v1/__init__.py +++ b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/center_window_att/baseline_v1/__init__.py @@ -11,50 +11,34 @@ def run_exps(): for model_alias, config_builder in baseline.center_window_att_baseline_rf( win_size_list=(5,), ): - for train_alias, checkpoint in train.train_center_window_att_import_global_tf( + for train_alias, checkpoint in train.train_center_window_att_viterbi_import_global_tf( alias=model_alias, config_builder=config_builder, n_epochs_list=(10,), + const_lr_list=(1e-4, 2e-4, 3e-4), time_rqmt=4, - train_opts=dict( - aux_loss_layers=None, - accum_grad_multiple_step=2, - optimizer={"class": "adam", "epsilon": 1e-8} - ) ): recog.center_window_returnn_frame_wise_beam_search( alias=train_alias, config_builder=config_builder, checkpoint=checkpoint, - checkpoint_aliases=("last",) - ) - recog.center_window_returnn_frame_wise_beam_search( - alias=train_alias, - config_builder=config_builder, - checkpoint=checkpoint, - checkpoint_aliases=("last",), pure_torch=True, ) for model_alias, config_builder in baseline.center_window_att_baseline_rf( win_size_list=(5,), decoder_version=2 ): - for train_alias, checkpoint in train.train_center_window_att_import_global_tf( + for train_alias, checkpoint in train.train_center_window_att_viterbi_import_global_tf( alias=model_alias, config_builder=config_builder, n_epochs_list=(10,), + const_lr_list=(1e-4, 2e-4, 3e-4), time_rqmt=4, - train_opts=dict( - aux_loss_layers=None, - accum_grad_multiple_step=2, - optimizer={"class": "adam", "epsilon": 1e-8} - ), custom_missing_load_func=load_missing_params ): 
recog.center_window_returnn_frame_wise_beam_search( alias=train_alias, config_builder=config_builder, checkpoint=checkpoint, - checkpoint_aliases=("last",), pure_torch=True, ) diff --git a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/center_window_att/baseline_v1/baseline.py b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/center_window_att/baseline_v1/baseline.py index b80ee17cc..496bae2e0 100644 --- a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/center_window_att/baseline_v1/baseline.py +++ b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/center_window_att/baseline_v1/baseline.py @@ -1,66 +1,18 @@ -from typing import Tuple, Optional, List, Dict, Union -import copy +from typing import Tuple, Optional -from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23.pipelines.pipeline_ls_conf.center_window_att.baseline_v1.alias import alias as base_alias -from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.config_builder_rf.base import SegmentalAttConfigBuilderRF -from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.network_builder_rf.segmental.model import from_scratch_model_def, _returnn_v2_get_model -from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.model_variants.model_variants_ls_conf import models - - -def get_center_window_att_config_builder_rf( - win_size: int, - use_weight_feedback: bool = True, - use_positional_embedding: bool = False, - att_weight_recog_penalty_opts: Optional[Dict] = None, - length_model_opts: Optional[Dict] = None, - length_scale: float = 1.0, - blank_penalty: Union[float, str] = 0.0, - gaussian_att_weight_interpolation_opts: 
Optional[Dict] = None, - expected_position_aux_loss_opts: Optional[Dict] = None, - pos_pred_att_weight_interpolation_opts: Optional[Dict] = None, - search_remove_eos: bool = False, - decoder_version: Optional[int] = None, -) -> SegmentalAttConfigBuilderRF: - model_type = "librispeech_conformer_seg_att" - variant_name = "seg.conformer.like-global" - variant_params = copy.deepcopy(models[model_type][variant_name]) - variant_params["network"]["segment_center_window_size"] = win_size - variant_params["network"]["use_weight_feedback"] = use_weight_feedback - variant_params["network"]["use_positional_embedding"] = use_positional_embedding - variant_params["network"]["att_weight_recog_penalty_opts"] = att_weight_recog_penalty_opts - variant_params["network"]["gaussian_att_weight_interpolation_opts"] = gaussian_att_weight_interpolation_opts - variant_params["network"]["pos_pred_att_weight_interpolation_opts"] = pos_pred_att_weight_interpolation_opts - variant_params["network"]["expected_position_aux_loss_opts"] = expected_position_aux_loss_opts - variant_params["network"]["length_scale"] = length_scale - variant_params["network"]["blank_penalty"] = blank_penalty - variant_params["network"]["search_remove_eos"] = search_remove_eos - variant_params["network"]["decoder_version"] = decoder_version - - if length_model_opts: - # make sure that we do not add any params which are not present in the defaults - assert set(length_model_opts.keys()).issubset(set(variant_params["network"]["length_model_opts"].keys())) - variant_params["network"]["length_model_opts"].update(length_model_opts) - - config_builder = SegmentalAttConfigBuilderRF( - variant_params=variant_params, - center_window_size=win_size, - decoder_version=decoder_version, - model_def=from_scratch_model_def, - get_model_func=_returnn_v2_get_model, - ) - - return config_builder +from 
i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.pipelines.pipeline_ls_conf.center_window_att.baseline_v1.alias import alias as base_alias +from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.pipelines.pipeline_ls_conf.center_window_att.config_builder import get_center_window_att_config_builder_rf def center_window_att_baseline_rf( win_size_list: Tuple[int, ...] = (5, 129), - decoder_version: Optional[int] = None, + decoder_version: int = 1, ): for win_size in win_size_list: alias = f"{base_alias}/baseline_rf/win-size-{win_size}/decoder-version-{decoder_version if decoder_version else 1}" yield alias, get_center_window_att_config_builder_rf( win_size=win_size, - use_weight_feedback=True, - length_model_opts={"use_label_model_state": False, "use_alignment_ctx": False}, - decoder_version=decoder_version, + label_decoder_version=decoder_version, + blank_decoder_version=1, + use_joint_model=False, ) diff --git a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/center_window_att/baseline_v2/__init__.py b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/center_window_att/baseline_v2/__init__.py deleted file mode 100644 index 6b74c4bc4..000000000 --- a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/center_window_att/baseline_v2/__init__.py +++ /dev/null @@ -1,40 +0,0 @@ -from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.pipelines.pipeline_ls_conf.center_window_att.baseline_v2 import ( - baseline, -) -from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.pipelines.pipeline_ls_conf.center_window_att import ( - train, recog -) - -from 
i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.network_builder_rf.segmental.model import MakeModel -from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.network_builder_rf.segmental.model_import import map_param_func_v2 - -from i6_experiments.users.zeyer.returnn.convert_ckpt_rf import ConvertTfCheckpointToRfPtJob - -from i6_core.returnn.training import PtCheckpoint, Checkpoint - -from sisyphus import Path - - -def run_exps(): - for model_alias, config_builder in baseline.center_window_att_baseline_rf( - win_size_list=(5,), - ): - for train_alias, checkpoint in train.train_center_window_att_import_global_tf( - alias=model_alias, - config_builder=config_builder, - n_epochs_list=(1,), - time_rqmt=4 - ): - recog.center_window_returnn_frame_wise_beam_search( - alias=train_alias, - config_builder=config_builder, - checkpoint=checkpoint, - checkpoint_aliases=("last",) - ) - recog.center_window_returnn_frame_wise_beam_search( - alias=train_alias, - config_builder=config_builder, - checkpoint=checkpoint, - checkpoint_aliases=("last",), - pure_torch=True, - ) diff --git a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/center_window_att/baseline_v3/__init__.py b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/center_window_att/baseline_v3/__init__.py new file mode 100644 index 000000000..89d9f44d5 --- /dev/null +++ b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/center_window_att/baseline_v3/__init__.py @@ -0,0 +1,56 @@ +from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.pipelines.pipeline_ls_conf.center_window_att.baseline_v3 import ( + baseline, +) +from 
i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.pipelines.pipeline_ls_conf.center_window_att import ( + train, recog +) +from i6_experiments.users.schmitt.custom_load_params import load_missing_params + + +def run_exps(): + for model_alias, config_builder in baseline.center_window_att_baseline_rf( + win_size_list=(5, 129), + ): + for train_alias, checkpoint in train.train_center_window_att_viterbi_import_global_tf( + alias=model_alias, + config_builder=config_builder, + n_epochs_list=(100,), + const_lr_list=(1e-4,), + ): + recog.center_window_returnn_frame_wise_beam_search( + alias=train_alias, + config_builder=config_builder, + checkpoint=checkpoint, + ) + + for model_alias, config_builder in baseline.center_window_att_baseline_rf( + win_size_list=(5, 129), decoder_version=2 + ): + for train_alias, checkpoint in train.train_center_window_att_viterbi_import_global_tf( + alias=model_alias, + config_builder=config_builder, + n_epochs_list=(100,), + const_lr_list=(1e-4,), + ): + recog.center_window_returnn_frame_wise_beam_search( + alias=train_alias, + config_builder=config_builder, + checkpoint=checkpoint, + ) + + for model_alias, config_builder in baseline.center_window_att_baseline_rf( + win_size_list=(5,), decoder_version=2 + ): + for max_shift, num_iterations in [(1, 1), (2, 1), (1, 2), (2, 2)]: + for train_alias, checkpoint in train.train_center_window_att_viterbi_import_global_tf( + alias=model_alias, + config_builder=config_builder, + n_epochs_list=(100,), + const_lr_list=(1e-4,), + alignment_augmentation_opts={"max_shift": max_shift, "num_iterations": num_iterations}, + ): + recog.center_window_returnn_frame_wise_beam_search( + alias=train_alias, + config_builder=config_builder, + checkpoint=checkpoint, + ) diff --git a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/center_window_att/baseline_v2/alias.py 
b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/center_window_att/baseline_v3/alias.py similarity index 82% rename from users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/center_window_att/baseline_v2/alias.py rename to users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/center_window_att/baseline_v3/alias.py index 3b09afcf4..c25647962 100644 --- a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/center_window_att/baseline_v2/alias.py +++ b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/center_window_att/baseline_v3/alias.py @@ -1,4 +1,4 @@ from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23.pipelines.pipeline_ls_conf.center_window_att.alias import alias as base_alias -alias = f"{base_alias}/baseline_v2" +alias = f"{base_alias}/baseline_v3" diff --git a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/center_window_att/baseline_v3/baseline.py b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/center_window_att/baseline_v3/baseline.py new file mode 100644 index 000000000..ef8071026 --- /dev/null +++ b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/center_window_att/baseline_v3/baseline.py @@ -0,0 +1,20 @@ +from typing import Tuple, Optional + +from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.pipelines.pipeline_ls_conf.center_window_att.baseline_v3.alias import alias as base_alias +from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.pipelines.pipeline_ls_conf.center_window_att.config_builder import 
get_center_window_att_config_builder_rf + + +def center_window_att_baseline_rf( + win_size_list: Tuple[int, ...] = (5, 129), + decoder_version: int = 1, + use_weight_feedback: bool = True, +): + for win_size in win_size_list: + alias = f"{base_alias}/baseline_rf/win-size-{win_size}/{'w' if use_weight_feedback else 'wo'}-weight-feedback/decoder-version-{decoder_version if decoder_version else 1}" + yield alias, get_center_window_att_config_builder_rf( + win_size=win_size, + label_decoder_version=decoder_version, + blank_decoder_version=3, + use_joint_model=False, + use_weight_feedback=use_weight_feedback, + ) diff --git a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/center_window_att/baseline_v4/__init__.py b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/center_window_att/baseline_v4/__init__.py new file mode 100644 index 000000000..07a9ffbcc --- /dev/null +++ b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/center_window_att/baseline_v4/__init__.py @@ -0,0 +1,26 @@ +from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.pipelines.pipeline_ls_conf.center_window_att.baseline_v4 import ( + baseline, +) +from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.pipelines.pipeline_ls_conf.center_window_att import ( + train, recog +) +from i6_experiments.users.schmitt.custom_load_params import load_missing_params + + +def run_exps(): + for model_alias, config_builder in baseline.center_window_att_baseline_rf( + win_size_list=(1, 5,), + ): + for train_alias, checkpoint in train.train_center_window_att_viterbi_import_global_tf( + alias=model_alias, + config_builder=config_builder, + n_epochs_list=(10,), + time_rqmt=4, + ): + recog.center_window_returnn_frame_wise_beam_search( + alias=train_alias, + config_builder=config_builder, + 
checkpoint=checkpoint, + checkpoint_aliases=("last",), + pure_torch=False, + ) diff --git a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/center_window_att/baseline_v4/alias.py b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/center_window_att/baseline_v4/alias.py new file mode 100644 index 000000000..dc33bc1e9 --- /dev/null +++ b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/center_window_att/baseline_v4/alias.py @@ -0,0 +1,4 @@ +from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23.pipelines.pipeline_ls_conf.center_window_att.alias import alias as base_alias + + +alias = f"{base_alias}/baseline_v4" diff --git a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/center_window_att/baseline_v4/baseline.py b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/center_window_att/baseline_v4/baseline.py new file mode 100644 index 000000000..069601d20 --- /dev/null +++ b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/center_window_att/baseline_v4/baseline.py @@ -0,0 +1,18 @@ +from typing import Tuple, Optional + +from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.pipelines.pipeline_ls_conf.center_window_att.baseline_v4.alias import alias as base_alias +from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.pipelines.pipeline_ls_conf.center_window_att.config_builder import get_center_window_att_config_builder_rf + + +def center_window_att_baseline_rf( + win_size_list: Tuple[int, ...] 
= (5, 129), + decoder_version: int = 1, +): + for win_size in win_size_list: + alias = f"{base_alias}/baseline_rf/win-size-{win_size}/decoder-version-{decoder_version if decoder_version else 1}" + yield alias, get_center_window_att_config_builder_rf( + win_size=win_size, + label_decoder_version=decoder_version, + blank_decoder_version=None, + use_joint_model=True, + ) diff --git a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/center_window_att/baseline_v2/baseline.py b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/center_window_att/config_builder.py similarity index 62% rename from users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/center_window_att/baseline_v2/baseline.py rename to users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/center_window_att/config_builder.py index ae2e7c93a..1f0687758 100644 --- a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/center_window_att/baseline_v2/baseline.py +++ b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/center_window_att/config_builder.py @@ -1,21 +1,21 @@ from typing import Tuple, Optional, List, Dict, Union -import copy -from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23.pipelines.pipeline_ls_conf.center_window_att.baseline_v2.alias import alias as base_alias -from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.config_builder_rf.base import SegmentalAttConfigBuilderRF -from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.network_builder_rf.segmental.model import from_scratch_model_def, _returnn_v2_get_model -from 
i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23.dependencies.general.returnn.exes import RETURNN_EXE_NEW, RETURNN_CURRENT_ROOT -from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23.dependencies.corpora.librispeech import LibrispeechCorpora from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23.dependencies.labels.v2.librispeech.label_singletons import ( LibrispeechBPE10025_CTC_ALIGNMENT, + LIBRISPEECH_CORPUS ) +from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23.dependencies.general.returnn.exes import RETURNN_EXE_NEW, RETURNN_CURRENT_ROOT -LIBRISPEECH_CORPUS = LibrispeechCorpora() +from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.config_builder_rf.base import SegmentalAttConfigBuilderRF +from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.network_builder_rf.segmental.model import from_scratch_model_def, _returnn_v2_get_model def get_center_window_att_config_builder_rf( win_size: int, - length_model_opts: Dict + label_decoder_version: int, + blank_decoder_version: Optional[int], + use_joint_model: bool, + use_weight_feedback: bool = True, ) -> SegmentalAttConfigBuilderRF: variant_params = { "dependencies": LibrispeechBPE10025_CTC_ALIGNMENT, @@ -23,29 +23,23 @@ def get_center_window_att_config_builder_rf( "feature_type": "raw", "corpus": LIBRISPEECH_CORPUS }, + "config": { + "train_seq_ordering": "laplace:.1000" + }, + "network": {"length_scale": 1.0}, "returnn_python_exe": RETURNN_EXE_NEW, "returnn_root": RETURNN_CURRENT_ROOT - }, + } config_builder = SegmentalAttConfigBuilderRF( variant_params=variant_params, model_def=from_scratch_model_def, get_model_func=_returnn_v2_get_model, center_window_size=win_size, - length_model_opts=length_model_opts, + 
label_decoder_version=label_decoder_version, + blank_decoder_version=blank_decoder_version, + use_joint_model=use_joint_model, + use_weight_feedback=use_weight_feedback, ) return config_builder - - -def center_window_att_baseline_rf( - win_size_list: Tuple[int, ...] = (5, 129), -): - for win_size in win_size_list: - alias = f"{base_alias}/baseline_rf/win-size-%d" % ( - win_size - ) - yield alias, get_center_window_att_config_builder_rf( - win_size=win_size, - length_model_opts={"use_label_model_state": True, "use_alignment_ctx": False}, - ) diff --git a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/center_window_att/train.py b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/center_window_att/train.py index d82fdf89d..c21b470f0 100644 --- a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/center_window_att/train.py +++ b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/center_window_att/train.py @@ -10,7 +10,7 @@ ) -def train_center_window_att_from_scratch( +def train_center_window_att_viterbi_from_scratch( alias: str, config_builder: SegmentalAttConfigBuilderRF, n_epochs_list: Tuple[int, ...], @@ -42,99 +42,71 @@ def train_center_window_att_from_scratch( checkpoint = { "model_dir": model_dir, "learning_rates": learning_rates, - "key": "dev_score_label_model/output_prob", + "key": "dev_loss_non_blank_ce", "checkpoints": checkpoints, "n_epochs": n_epochs } yield alias, checkpoint -def train_center_window_att_import_global_tf( +def train_center_window_att_viterbi_import_global_tf( alias: str, config_builder: SegmentalAttConfigBuilderRF, n_epochs_list: Tuple[int, ...], - time_rqmt: int = 168, - train_opts: Optional[Dict] = None, - custom_missing_load_func: Optional[Callable] = None + const_lr_list: Tuple[float, ...] 
= (1e-4,), + time_rqmt: int = 30, + custom_missing_load_func: Optional[Callable] = None, + alignment_augmentation_opts: Optional[Dict] = None, ): - _train_opts = { - "preload_from_files": { - "pretrained_global_att_params": { - "filename": external_checkpoints[default_import_model_name], - "init_for_train": True, - "ignore_missing": True, # because of length model params - } - }, - "train_def": viterbi_training, - "train_step_func": _returnn_v2_train_step, - "batching": "random", - } - if custom_missing_load_func: - _train_opts["preload_from_files"]["pretrained_global_att_params"]["custom_missing_load_func"] = custom_missing_load_func - if train_opts: - _train_opts.update(train_opts) - for n_epochs in n_epochs_list: - alias += "/train_from_global_att_tf_checkpoint/standard-training/%d-epochs_wo-ctc-loss" % (n_epochs,) - - train_exp = SegmentalTrainExperiment( - config_builder=config_builder, - alias=alias, - num_epochs=n_epochs, - train_rqmt={ - "time": time_rqmt - }, - train_opts=_train_opts - ) - checkpoints, model_dir, learning_rates = train_exp.run_train() + for const_lr in const_lr_list: + train_alias = alias + f"/train_from_global_att_tf_checkpoint/standard-training/{n_epochs}-epochs_{const_lr}-const-lr_wo-ctc-loss" + if alignment_augmentation_opts: + opts = alignment_augmentation_opts + train_alias += f"_align-aug-{opts['num_iterations']}-iters_{opts['max_shift']}-max-shift" - checkpoint = { - "model_dir": model_dir, - "learning_rates": learning_rates, - "key": "dev_score_label_model/output_prob", - "checkpoints": checkpoints, - "n_epochs": n_epochs - } - yield alias, checkpoint - - -def train_center_window_att_import_center_window_tf( - alias: str, - config_builder: SegmentalAttConfigBuilderRF, - n_epochs_list: Tuple[int, ...], - time_rqmt: int = 168, -): - for n_epochs in n_epochs_list: - alias += "/train_from_center_window_baseline_v1_tf_checkpoint/standard-training/%d-epochs_wo-ctc-loss" % (n_epochs,) - - train_exp = SegmentalTrainExperiment( - 
config_builder=config_builder, - alias=alias, - num_epochs=n_epochs, - train_rqmt={ - "time": time_rqmt - }, - train_opts={ - # "preload_from_files": { - # "pretrained_global_att_params": { - # "filename": external_checkpoints["center-window_baseline-v1_tf"], - # "ignore_params_prefixes": ["emit_prob.", "s_length_model.", "target_embed_length_model."] - # } - # }, - "import_model_train_epoch1": get_center_window_baseline_v1_tf_checkpoint(), + train_opts = { + "preload_from_files": { + "pretrained_global_att_params": { + "filename": external_checkpoints[default_import_model_name], + "init_for_train": True, + "ignore_missing": True, # because of length model params + } + }, + "aux_loss_layers": None, + "accum_grad_multiple_step": 2, + "optimizer": {"class": "adam", "epsilon": 1e-8}, "train_def": viterbi_training, "train_step_func": _returnn_v2_train_step, "batching": "random", - "aux_loss_layers": None, + "lr_opts": { + "type": "const_then_linear", + "const_lr": const_lr, + "const_frac": 1 / 3, + "final_lr": 1e-6, + "num_epochs": n_epochs + }, + "alignment_augmentation_opts": alignment_augmentation_opts } - ) - checkpoints, model_dir, learning_rates = train_exp.run_train() + if custom_missing_load_func: + train_opts["preload_from_files"]["pretrained_global_att_params"]["custom_missing_load_func"] = custom_missing_load_func - checkpoint = { - "model_dir": model_dir, - "learning_rates": learning_rates, - "key": "dev_score_label_model/output_prob", - "checkpoints": checkpoints, - "n_epochs": n_epochs - } - yield alias, checkpoint + train_exp = SegmentalTrainExperiment( + config_builder=config_builder, + alias=train_alias, + num_epochs=n_epochs, + train_rqmt={ + "time": time_rqmt + }, + train_opts=train_opts + ) + checkpoints, model_dir, learning_rates = train_exp.run_train() + + checkpoint = { + "model_dir": model_dir, + "learning_rates": learning_rates, + "key": "dev_loss_non_blank_ce", + "checkpoints": checkpoints, + "n_epochs": n_epochs + } + yield train_alias, 
checkpoint diff --git a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/global_att/baseline_v1/__init__.py b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/global_att/baseline_v1/__init__.py index b291006ca..182f46415 100644 --- a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/global_att/baseline_v1/__init__.py +++ b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/global_att/baseline_v1/__init__.py @@ -30,11 +30,11 @@ def run_exps(): alias=model_alias, config_builder=config_builder, n_epochs_list=(10,), + const_lr_list=(1e-4,), time_rqmt=4, ): recog.global_att_returnn_label_sync_beam_search( alias=train_alias, config_builder=config_builder, checkpoint=checkpoint, - checkpoint_aliases=("last",), ) diff --git a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/global_att/train.py b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/global_att/train.py index 9a8c7100d..d5e500543 100644 --- a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/global_att/train.py +++ b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/global_att/train.py @@ -1,4 +1,5 @@ from typing import Tuple, Optional, List +import itertools from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.config_builder_rf.base import GlobalAttConfigBuilderRF from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23.train_new import GlobalTrainExperiment @@ -42,7 +43,7 @@ def train_from_scratch( checkpoint = { "model_dir": model_dir, "learning_rates": learning_rates, - "key": "dev_score_output/output_prob", + 
"key": "dev_loss_ce", "checkpoints": checkpoints, "n_epochs": n_epochs } @@ -54,14 +55,15 @@ def train_import_global_tf( alias: str, config_builder: GlobalAttConfigBuilderRF, n_epochs_list: Tuple[int, ...], + const_lr_list: Tuple[float, ...], time_rqmt: int = 168, ): - for n_epochs in n_epochs_list: - alias += "/train_from_global_att_tf_checkpoint/standard-training/%d-epochs_wo-ctc-loss" % (n_epochs,) + for n_epochs, const_lr in itertools.product(n_epochs_list, const_lr_list): + train_alias = alias + f"/train_from_global_att_tf_checkpoint/standard-training/{n_epochs}-epochs_{const_lr}-const-lr_wo-ctc-loss" train_exp = GlobalTrainExperiment( config_builder=config_builder, - alias=alias, + alias=train_alias, num_epochs=n_epochs, train_rqmt={ "time": time_rqmt @@ -77,6 +79,13 @@ def train_import_global_tf( "train_step_func": _returnn_v2_train_step, "batching": "random", "aux_loss_layers": None, + "lr_opts": { + "type": "const_then_linear", + "const_lr": const_lr, + "const_frac": 1 / 3, + "final_lr": 1e-6, + "num_epochs": n_epochs + }, } ) checkpoints, model_dir, learning_rates = train_exp.run_train() @@ -84,8 +93,8 @@ def train_import_global_tf( checkpoint = { "model_dir": model_dir, "learning_rates": learning_rates, - "key": "dev_score_label_model/output_prob", + "key": "dev_loss_ce", "checkpoints": checkpoints, "n_epochs": n_epochs } - yield alias, checkpoint + yield train_alias, checkpoint From 6a1257f190f66c76e79823dc530feb0d190ec5a0 Mon Sep 17 00:00:00 2001 From: Mohammad Zeineldeen Date: Thu, 23 May 2024 10:16:16 +0000 Subject: [PATCH 050/227] add horovod to libri pipeline --- .../librispeech_960/pipeline.py | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/users/zeineldeen/experiments/conformer_att_2022/librispeech_960/pipeline.py b/users/zeineldeen/experiments/conformer_att_2022/librispeech_960/pipeline.py index aff3a22ca..6d30e979b 100644 --- a/users/zeineldeen/experiments/conformer_att_2022/librispeech_960/pipeline.py +++ 
b/users/zeineldeen/experiments/conformer_att_2022/librispeech_960/pipeline.py @@ -13,7 +13,15 @@ def training( - prefix_name, returnn_config, returnn_exe, returnn_root, num_epochs, mem_rqmt=15, time_rqmt=168, gpu_mem=None + prefix_name, + returnn_config, + returnn_exe, + returnn_root, + num_epochs, + mem_rqmt=15, + time_rqmt=168, + gpu_mem=None, + horovod_num_processes=None, ): """ @@ -31,7 +39,12 @@ def training( "returnn_root": returnn_root, } - train_job = ReturnnTrainingJob(returnn_config=returnn_config, num_epochs=num_epochs, **default_rqmt) + train_job = ReturnnTrainingJob( + returnn_config=returnn_config, + num_epochs=num_epochs, + horovod_num_processes=horovod_num_processes, + **default_rqmt, + ) if gpu_mem: assert gpu_mem in [11, 24] train_job.rqmt["gpu_mem"] = gpu_mem From 09661d7ccab0fba1455f0d78b4e4a0d780f1b005 Mon Sep 17 00:00:00 2001 From: Mohammad Zeineldeen Date: Thu, 23 May 2024 10:22:06 +0000 Subject: [PATCH 051/227] update configs --- .../tedlium2/configs/ebranch_baseline.py | 11 ++-- .../tedlium2/configs/ted2_att_baseline.py | 53 ++++++++++++++++++- 2 files changed, 57 insertions(+), 7 deletions(-) diff --git a/users/zeineldeen/experiments/conformer_att_2023/tedlium2/configs/ebranch_baseline.py b/users/zeineldeen/experiments/conformer_att_2023/tedlium2/configs/ebranch_baseline.py index b0d1887bf..b8c368eeb 100644 --- a/users/zeineldeen/experiments/conformer_att_2023/tedlium2/configs/ebranch_baseline.py +++ b/users/zeineldeen/experiments/conformer_att_2023/tedlium2/configs/ebranch_baseline.py @@ -869,10 +869,10 @@ def train_mini_self_att( ebranch_enc_args = EBranchformerEncoderArgs( num_blocks=12, input_layer="conv-6", - att_num_heads=6, - ff_dim=1536, - enc_key_dim=384, - conv_kernel_size=31, + att_num_heads=8, + ff_dim=2048, + enc_key_dim=512, + conv_kernel_size=32, pos_enc="rel", dropout=0.1, att_dropout=0.1, @@ -996,7 +996,7 @@ def get_base_v1_args(lr, ep, enc_drop=0.1, pretrain_reps=3, use_legacy_stats=Tru args["encoder_args"].num_blocks = 
num_blocks args["with_pretrain"] = False - specaug_steps = {"step0": 6_000, "step1": 12_000, "step2": 18_000} + specaug_steps = {"step0": 12_000, "step1": 18_000, "step2": 24_000} args["specaug_str_func_opts"] = { "version": 2, **specaug_steps, @@ -1027,7 +1027,6 @@ def get_base_v1_args(lr, ep, enc_drop=0.1, pretrain_reps=3, use_legacy_stats=Tru num_epochs=ep, epoch_wise_filter=None, bpe_size=BPE_1K, - partition_epoch=4, ) diff --git a/users/zeineldeen/experiments/conformer_att_2023/tedlium2/configs/ted2_att_baseline.py b/users/zeineldeen/experiments/conformer_att_2023/tedlium2/configs/ted2_att_baseline.py index b9e6e4dde..22c0fce9c 100644 --- a/users/zeineldeen/experiments/conformer_att_2023/tedlium2/configs/ted2_att_baseline.py +++ b/users/zeineldeen/experiments/conformer_att_2023/tedlium2/configs/ted2_att_baseline.py @@ -1266,7 +1266,58 @@ def get_base_v1_args(lr, ep, enc_drop=0.1, pretrain_reps=3, use_legacy_stats=Tru partition_epoch=4, ) - # TODO: ebranchformer encoder + # TODO: multi-gpu + for num_blocks in [12]: + for ep in [50 * 4]: + for lr in [8e-4, 16e-4]: + for target_embed_dim in [256]: + for att_drop in [0.0]: + for weight_drop in [0.1]: + for enc_drop in [0.15]: + for ctc_scale in [0.3]: + for sync_step in [50]: + base_v1_args, exp_name = get_base_v1_args( + lr, ep, enc_drop=enc_drop, use_legacy_stats=False + ) + + args = copy.deepcopy(base_v1_args) + args["encoder_args"].num_blocks = num_blocks + args["encoder_args"].mhsa_weight_dropout = weight_drop + args["encoder_args"].ff_weight_dropout = weight_drop + args["encoder_args"].conv_weight_dropout = weight_drop + + args["decoder_args"].embed_dim = target_embed_dim + args["decoder_args"].att_dropout = att_drop + + args["horovod_params"] = { + "horovod_reduce_type": "param", + "horovod_param_sync_step": sync_step, + "horovod_dataset_distribution": "random_seed_offset", + } + + args["batch_size"] = 15_000 * 160 + args["accum_grad"] = 1 + gradient_clip_global_norm = 1 + args["gradient_clip_global_norm"] 
= gradient_clip_global_norm + + exp_name += f"_weightDrop{weight_drop}_decAttDrop{att_drop}_embedDim{target_embed_dim}_numBlocks{num_blocks}" + exp_name += f"_gradClipNorm{gradient_clip_global_norm}" + exp_name += f"_paramSync_step{sync_step}_accum1" + + if ctc_scale != 1.0: + args["encoder_args"].ctc_loss_scale = ctc_scale + args["decoder_args"].ce_loss_scale = 1.0 - ctc_scale + exp_name += f"_ctcScale{ctc_scale}" + + run_exp( + exp_name + "_gpu4", + args, + num_epochs=ep, + epoch_wise_filter=None, + bpe_size=BPE_1K, + partition_epoch=4 * 4, + horovod_num_processes=4, + ) # # TODO: mixup # for num_blocks in [12]: From c655f06f4e21ff4e1bcbf5fdf65ecd071dbde728 Mon Sep 17 00:00:00 2001 From: Mohammad Zeineldeen Date: Thu, 23 May 2024 10:23:43 +0000 Subject: [PATCH 052/227] fix --- .../conformer_att_2023/tedlium2/configs/ted2_att_baseline.py | 1 + 1 file changed, 1 insertion(+) diff --git a/users/zeineldeen/experiments/conformer_att_2023/tedlium2/configs/ted2_att_baseline.py b/users/zeineldeen/experiments/conformer_att_2023/tedlium2/configs/ted2_att_baseline.py index 22c0fce9c..c8c13e602 100644 --- a/users/zeineldeen/experiments/conformer_att_2023/tedlium2/configs/ted2_att_baseline.py +++ b/users/zeineldeen/experiments/conformer_att_2023/tedlium2/configs/ted2_att_baseline.py @@ -236,6 +236,7 @@ def run_train( RETURNN_ROOT, num_epochs=num_epochs, gpu_mem=kwargs.get("gpu_mem", 11), + horovod_num_processes=kwargs.get("horovod_num_processes", None), ) return train_job From 2b1fe7751203b2dd8d2d61fb2d6ff535ef0ce93d Mon Sep 17 00:00:00 2001 From: Albert Zeyer Date: Thu, 23 May 2024 14:46:06 +0200 Subject: [PATCH 053/227] more --- .../exp2024_04_23_baselines/aed.py | 27 +++++++++++-------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/users/zeyer/experiments/exp2024_04_23_baselines/aed.py b/users/zeyer/experiments/exp2024_04_23_baselines/aed.py index 6a4ccc9df..941f3086a 100644 --- a/users/zeyer/experiments/exp2024_04_23_baselines/aed.py +++ 
b/users/zeyer/experiments/exp2024_04_23_baselines/aed.py @@ -68,17 +68,22 @@ def py(): }, ) - train_exp( # 5.16 - "v6-bhv20-11gb-f32-bs15k-accgrad1-mgpu4-pavg100-wd1e_2-lrlin1e_5_295k-speedpertV2-spm10k", - config_11gb_v6_f32_accgrad1_mgpu4_pavg100_wd1e_4, - config_updates={ - **_get_cfg_lrlin_oclr_by_bs_nep(15_000, 500), - "optimizer.weight_decay": 1e-2, - "__train_audio_preprocess": speed_pert_librosa_config, - "speed_pert_discrete_values": [0.7, 0.8, 0.9, 1.0, 1.1], - }, - vocab="spm10k", - ) + for vocab in [ + "bpe10k", # 5.32 + "spm10k", # 5.16 + "spm_bpe10k", + ]: + train_exp( # 5.16 + f"v6-bhv20-11gb-f32-bs15k-accgrad1-mgpu4-pavg100-wd1e_2-lrlin1e_5_295k-speedpertV2-{vocab}", + config_11gb_v6_f32_accgrad1_mgpu4_pavg100_wd1e_4, + config_updates={ + **_get_cfg_lrlin_oclr_by_bs_nep(15_000, 500), + "optimizer.weight_decay": 1e-2, + "__train_audio_preprocess": speed_pert_librosa_config, + "speed_pert_discrete_values": [0.7, 0.8, 0.9, 1.0, 1.1], + }, + vocab=vocab, + ) # Testing sampling in SPM. Baseline without sampling: 5.24 dev-other. # The lower the alpha, the more aggressive the sampling. 
From 88e03e1c5747fbf40aa1edcb75780f080bdd8de7 Mon Sep 17 00:00:00 2001 From: "luca.gaudino" Date: Thu, 23 May 2024 17:03:47 +0200 Subject: [PATCH 054/227] update --- .../librispeech_960/default_tools.py | 8 +- .../conformer_import_moh_att_2023_06_30.py | 4 +- .../model_recogs/model_recog.py | 4 +- .../tedlium2/_import_model.py | 26 +- .../conformer_import_moh_att_2023_10_19.py | 17 +- .../conformer_import_moh_att_train.py | 97 ++++- .../librispeech_960/__init__.py | 0 .../librispeech_960/conformer_ctc_train.py | 338 ++++++++++++++++++ .../librispeech_960/sis_setup.py | 22 ++ .../tedlium2/conformer_ctc_train.py | 185 ++++++++-- .../tedlium2/conformer_rnnt_train.py | 8 +- .../models/asr/decoder/att_decoder_rf.py | 4 +- 12 files changed, 652 insertions(+), 61 deletions(-) create mode 100644 users/gaudino/experiments/rf_conformer_rnnt_2024/librispeech_960/__init__.py create mode 100644 users/gaudino/experiments/rf_conformer_rnnt_2024/librispeech_960/conformer_ctc_train.py create mode 100644 users/gaudino/experiments/rf_conformer_rnnt_2024/librispeech_960/sis_setup.py diff --git a/users/gaudino/experiments/ctc_rnnt_standalone_2024/librispeech_960/default_tools.py b/users/gaudino/experiments/ctc_rnnt_standalone_2024/librispeech_960/default_tools.py index accfd9035..f26ca58c3 100644 --- a/users/gaudino/experiments/ctc_rnnt_standalone_2024/librispeech_960/default_tools.py +++ b/users/gaudino/experiments/ctc_rnnt_standalone_2024/librispeech_960/default_tools.py @@ -10,10 +10,10 @@ from i6_experiments.common.tools.sctk import compile_sctk # python from apptainer/singularity/docker -RETURNN_EXE = tk.Path( - "/u/luca.gaudino/bin/returnn_launcher_nocuda.sh", hash_overwrite="GENERIC_RETURNN_LAUNCHER" -) -# RETURNN_EXE = tk.Path("/usr/bin/python3", hash_overwrite="GENERIC_RETURNN_LAUNCHER") +# RETURNN_EXE = tk.Path( +# "/u/luca.gaudino/bin/returnn_launcher_nocuda.sh", hash_overwrite="GENERIC_RETURNN_LAUNCHER" +# ) +RETURNN_EXE = tk.Path("/usr/bin/python3", 
hash_overwrite="GENERIC_RETURNN_LAUNCHER") MINI_RETURNN_ROOT = CloneGitRepositoryJob( "https://github.com/JackTemaki/MiniReturnn", commit="0dc69329b21ce0acade4fcb2bf1be0dc8cc0b121" diff --git a/users/gaudino/experiments/rf_conformer_att_2023/librispeech_960/conformer_import_moh_att_2023_06_30.py b/users/gaudino/experiments/rf_conformer_att_2023/librispeech_960/conformer_import_moh_att_2023_06_30.py index 2e9b4cf04..e692580cd 100644 --- a/users/gaudino/experiments/rf_conformer_att_2023/librispeech_960/conformer_import_moh_att_2023_06_30.py +++ b/users/gaudino/experiments/rf_conformer_att_2023/librispeech_960/conformer_import_moh_att_2023_06_30.py @@ -6,7 +6,7 @@ from typing import Optional, Any, Tuple, Dict, Sequence, List from itertools import product -from i6_experiments.users.gaudino.models.asr.rf.trafo_lm.lm_import_2023_11_09 import ( +from i6_experiments.users.gaudino.models.asr.rf.nn_lm.lm_import_2023_11_09 import ( Trafo_LM_Model, ) from sisyphus import tk @@ -19,7 +19,7 @@ LSTM_LM_Model, # MakeModel, ) -from i6_experiments.users.gaudino.experiments.rf_conformer_att_2023.tedlium2.ilm_import_2024_04_17 import ( +from i6_experiments.users.gaudino.models.asr.rf.ilm_import_2024_04_17 import ( MiniAtt_ILM_Model, ) from i6_experiments.users.gaudino.model_interfaces.model_interfaces import ModelDef, TrainDef diff --git a/users/gaudino/experiments/rf_conformer_att_2023/librispeech_960/model_recogs/model_recog.py b/users/gaudino/experiments/rf_conformer_att_2023/librispeech_960/model_recogs/model_recog.py index 92f158239..1c8d16b85 100644 --- a/users/gaudino/experiments/rf_conformer_att_2023/librispeech_960/model_recogs/model_recog.py +++ b/users/gaudino/experiments/rf_conformer_att_2023/librispeech_960/model_recogs/model_recog.py @@ -166,7 +166,9 @@ def model_recog( lm_state = lm_out["state"] lm_log_prob = rf.log_softmax(lm_out["output"], axis=model.target_dim) - if i > 0: + + + if not model.search_args.get("use_lm_first_label", False) and i > 0: label_log_prob = ( 
label_log_prob + model.search_args["lm_scale"] * lm_log_prob ) diff --git a/users/gaudino/experiments/rf_conformer_att_2023/tedlium2/_import_model.py b/users/gaudino/experiments/rf_conformer_att_2023/tedlium2/_import_model.py index 3971eae02..5971097fa 100644 --- a/users/gaudino/experiments/rf_conformer_att_2023/tedlium2/_import_model.py +++ b/users/gaudino/experiments/rf_conformer_att_2023/tedlium2/_import_model.py @@ -18,11 +18,11 @@ MakeModel, ) -from i6_experiments.users.gaudino.models.asr.rf.trafo_lm.lm_import_2023_11_09 import ( +from i6_experiments.users.gaudino.models.asr.rf.nn_lm.lm_import_2023_11_09 import ( MakeModel as MakeModelLM, ) -from i6_experiments.users.gaudino.experiments.rf_conformer_att_2023.tedlium2.ilm_import_2024_04_17 import ( +from i6_experiments.users.gaudino.models.asr.rf.ilm_import_2024_04_17 import ( MakeModel as MakeModelILM, ) @@ -598,9 +598,9 @@ def map_param_func_mini_att_ilm( def import_models(): # for model_name, sep_enc in product(list(models.keys())[-1:], [True, False]): - model_list = ["model_ctc0.5_att0.5"] + model_list = ["model_baseline"] # model_list = ["model_ctc0.9_att0.1", "model_ctc0.8_att0.2", "model_ctc0.7_att0.3", "model_ctc0.6_att0.4", "model_ctc0.5_att0.5", "model_ctc0.4_att0.6"] - for model_name, sep_enc, add_trafo_lm in product(model_list, [False], [True]): + for model_name, sep_enc, add_trafo_lm in product(model_list, [False], [False]): model_args = { "target_embed_dim": 256, "add_trafo_lm": add_trafo_lm, @@ -615,7 +615,7 @@ def import_models(): + " ..." 
) out_dir = "/work/asr3/zeineldeen/hiwis/luca.gaudino/setups-data/2023-08-10--rf-librispeech/work/i6_experiments/users/gaudino/returnn/convert_ckpt_rf/tedlium2/without_lm/" - out_dir_postfix = model_name + ("__ctc_only" if sep_enc else "") + ("__trafo_lm" if add_trafo_lm else "") + out_dir_postfix = model_name + ("__ctc_only" if sep_enc else "") + ("__trafo_lm" if add_trafo_lm else "") + "_24_05_22" ckpt_path = models[model_name]["ckpt"].ckpt_path @@ -647,16 +647,16 @@ def import_models(): if __name__ == "__main__": - # import_models() + import_models() # convert_lm( # _ted2_lm_ckpt_filename, # "/work/asr3/zeineldeen/hiwis/luca.gaudino/setups-data/2023-08-10--rf-librispeech/work/i6_experiments/users/gaudino/returnn/convert_ckpt_rf/tedlium2/trafo_lm_only_24_02_05", # 1057, # ) - convert_mini_att_ilm( - ckpt_path_prior="/u/zeineldeen/setups/ubuntu_22_setups/2023-04-17--conformer-att/work/i6_core/returnn/training/AverageTFCheckpointsJob.yB4JK4GDCxWG/output/model/average", - ckpt_path_mini_att="/u/zeineldeen/setups/ubuntu_22_setups/2023-04-17--conformer-att/work/i6_core/returnn/training/GetBestTFCheckpointJob.70hGEsLQ6ynw/output/model/checkpoint", - model_in_dim=256, - model_target_dim=1057, - out_dir="/work/asr3/zeineldeen/hiwis/luca.gaudino/setups-data/2023-08-10--rf-librispeech/work/i6_experiments/users/gaudino/returnn/convert_ckpt_rf/tedlium2/mini_att_ilm_24_04_21", - ) + # convert_mini_att_ilm( + # ckpt_path_prior="/u/zeineldeen/setups/ubuntu_22_setups/2023-04-17--conformer-att/work/i6_core/returnn/training/AverageTFCheckpointsJob.yB4JK4GDCxWG/output/model/average", + # ckpt_path_mini_att="/u/zeineldeen/setups/ubuntu_22_setups/2023-04-17--conformer-att/work/i6_core/returnn/training/GetBestTFCheckpointJob.70hGEsLQ6ynw/output/model/checkpoint", + # model_in_dim=256, + # model_target_dim=1057, + # 
out_dir="/work/asr3/zeineldeen/hiwis/luca.gaudino/setups-data/2023-08-10--rf-librispeech/work/i6_experiments/users/gaudino/returnn/convert_ckpt_rf/tedlium2/mini_att_ilm_24_04_21", + # ) diff --git a/users/gaudino/experiments/rf_conformer_att_2023/tedlium2/conformer_import_moh_att_2023_10_19.py b/users/gaudino/experiments/rf_conformer_att_2023/tedlium2/conformer_import_moh_att_2023_10_19.py index dd8e5700c..526a7d6a6 100644 --- a/users/gaudino/experiments/rf_conformer_att_2023/tedlium2/conformer_import_moh_att_2023_10_19.py +++ b/users/gaudino/experiments/rf_conformer_att_2023/tedlium2/conformer_import_moh_att_2023_10_19.py @@ -403,7 +403,7 @@ def sis_run_with_prefix(prefix_name: str = None): # ctc beam search espnet for model_name in ctc_beam_search_model_names: for scales, beam_size in product( - ctc_beam_search_model_names[model_name]["scales"], [32] + ctc_beam_search_model_names[model_name]["scales"], [32] # 32 ): att_scale, ctc_scale, prior_scale = scales @@ -463,7 +463,8 @@ def sis_run_with_prefix(prefix_name: str = None): # att + ilm correction for model_name, lm_scale, ilm_scale, beam_size in product( - ["model_baseline", "model_ctc0.5_att0.5"], [0.36] ,[0.28], [12] + # ["model_baseline", "model_ctc0.5_att0.5"], [0.36] ,[0.28], [12] + ["model_baseline"], [0.36] ,[0.28], [12] ): ilm_model_args = copy.deepcopy(models_with_pt_ckpt[model_name]["model_args"]) ilm_model_args["preload_from_files"] = { @@ -526,14 +527,16 @@ def sis_run_with_prefix(prefix_name: str = None): models_with_pt_ckpt[model_name]["model_args"] = model_args # att + trafo lm - for model_name, lm_scale, beam_size in product( - ["model_ctc0.5_att0.5"], [0.15], [12] + for model_name, lm_scale, beam_size, use_first_lm in product( + ["model_baseline"], [0.15], [6, 12, 18], [True, False] ): + lm_model_args = copy.deepcopy(models_with_pt_ckpt[model_name]["model_args"]) name = ( prefix_name + "/" + model_name + f"/att_trafolm{lm_scale}" + + (f"_first_lm" if use_first_lm else "") + f"_beam{beam_size}" 
) search_args = { @@ -542,13 +545,17 @@ def sis_run_with_prefix(prefix_name: str = None): "lm_scale": lm_scale, "bsf": bsf, } + if use_first_lm: + search_args["use_first_lm"] = True + else: + search_args.pop("use_first_lm", None) recog_res, recog_out = recog_model( task, models_with_pt_ckpt[model_name]["ckpt"], model_recog, dev_sets=["dev"], - model_args=models_with_pt_ckpt[model_name]["model_args"], + model_args=lm_model_args, search_args=search_args, prefix_name=name, ) diff --git a/users/gaudino/experiments/rf_conformer_att_2023/tedlium2/conformer_import_moh_att_train.py b/users/gaudino/experiments/rf_conformer_att_2023/tedlium2/conformer_import_moh_att_train.py index 74c413ba6..fe0451e21 100644 --- a/users/gaudino/experiments/rf_conformer_att_2023/tedlium2/conformer_import_moh_att_train.py +++ b/users/gaudino/experiments/rf_conformer_att_2023/tedlium2/conformer_import_moh_att_train.py @@ -105,7 +105,7 @@ def sis_run_with_prefix(prefix_name: Optional[str] = None): # train_exp("base-11gb", config_11gb, gpu_mem=11) # train_exp("base-11gb-v1", my_config_11gb, num_epochs=400, gpu_mem=11) - train_exp( + train_exp( # dev 8.77 test 8.26 "base-11gb-v3-lrlin1e_5_600k_aux4_8", my_config_11gb, config_updates={ @@ -120,6 +120,101 @@ def sis_run_with_prefix(prefix_name: Optional[str] = None): gpu_mem=11, ) + # train_exp( # aux 12: does not converge + # "base-11gb-v3-lrlin1e_5_261k", + # my_config_11gb, + # config_updates={ + # "learning_rate": 1.0, + # "dynamic_learning_rate": dyn_lr_piecewise_linear, + # "learning_rate_piecewise_steps": [261_000, 522_000, 580_000], # 45% 45 % 10% + # "learning_rate_piecewise_values": [1e-5, 1e-3, 1e-5, 1e-6], + # }, + # num_epochs=400, + # gpu_mem=11, + # ) + + train_exp( # dev 7.89 test 7.3 + "base-11gb-v3-lrlin1e_5_261k_aux4_8", + my_config_11gb, + config_updates={ + "learning_rate": 1.0, + "dynamic_learning_rate": dyn_lr_piecewise_linear, + "learning_rate_piecewise_steps": [261_000, 522_000, 580_000], # 45% 45 % 10% + 
"learning_rate_piecewise_values": [1e-5, 1e-3, 1e-5, 1e-6], + "aux_loss_layers": [4, 8], + }, + num_epochs=400, + gpu_mem=11, + ) + + train_exp( # + "base-11gb-v3-lrlin1e_5_261k_aux4_8_12", + my_config_11gb, + config_updates={ + "learning_rate": 1.0, + "dynamic_learning_rate": dyn_lr_piecewise_linear, + "learning_rate_piecewise_steps": [261_000, 522_000, 580_000], # 45% 45 % 10% + "learning_rate_piecewise_values": [1e-5, 1e-3, 1e-5, 1e-6], + "aux_loss_layers": [4, 8, 12], + }, + num_epochs=400, + gpu_mem=11, + ) + + # train_exp( # does not converge + # "base-11gb-v3-lrlin8e_5_261k_aux4_8", + # my_config_11gb, + # config_updates={ + # "learning_rate": 1.0, + # "dynamic_learning_rate": dyn_lr_piecewise_linear, + # "learning_rate_piecewise_steps": [261_000, 522_000, 580_000], # 45% 45 % 10% + # "learning_rate_piecewise_values": [8e-5, 8e-4, 8e-5, 1e-6], + # "aux_loss_layers": [4, 8], + # }, + # num_epochs=400, + # gpu_mem=11, + # ) + + model = train_exp( # with aux 4 8: dev 9.92 test 8.96 - wrong steps!!! 
+ "base-24gb-v6-lrlin1e_5_261k", + config_24gb_v6, + config_updates={ + "learning_rate": 1.0, + "dynamic_learning_rate": dyn_lr_piecewise_linear, + "learning_rate_piecewise_steps": [261_000, 522_000, 580_000], + "learning_rate_piecewise_values": [1e-5, 1e-3, 1e-5, 1e-6], + }, + num_epochs=400, + ) + + + model = train_exp( # + "base-24gb-v6-lrlin1e_5_85k", + config_24gb_v6, + config_updates={ + "learning_rate": 1.0, + "dynamic_learning_rate": dyn_lr_piecewise_linear, + # total steps after 400 epochs: 189.995 + "learning_rate_piecewise_steps": [85_500, 171_000, 190_000], + "learning_rate_piecewise_values": [1e-5, 1e-3, 1e-5, 1e-6], + }, + num_epochs=400, + ) + + + # train_exp( # does not converge + # "base-11gb-v3-lrlin8e_5_lrmax8e_4_261k", + # my_config_11gb, + # config_updates={ + # "learning_rate": 1.0, + # "dynamic_learning_rate": dyn_lr_piecewise_linear, + # "learning_rate_piecewise_steps": [261_000, 522_000, 580_000], # 45% 45 % 10% + # "learning_rate_piecewise_values": [8e-5, 8e-4, 8e-5, 1e-6], + # }, + # num_epochs=400, + # gpu_mem=11, + # ) + # train_exp( # dev-other 7.6 # "base-24gb-bs30k-f32", # config_24gb, diff --git a/users/gaudino/experiments/rf_conformer_rnnt_2024/librispeech_960/__init__.py b/users/gaudino/experiments/rf_conformer_rnnt_2024/librispeech_960/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/users/gaudino/experiments/rf_conformer_rnnt_2024/librispeech_960/conformer_ctc_train.py b/users/gaudino/experiments/rf_conformer_rnnt_2024/librispeech_960/conformer_ctc_train.py new file mode 100644 index 000000000..25d1a4bb4 --- /dev/null +++ b/users/gaudino/experiments/rf_conformer_rnnt_2024/librispeech_960/conformer_ctc_train.py @@ -0,0 +1,338 @@ +"""Copied from Albert Zeyer 25.03.2024, then modified +""" + +from __future__ import annotations + +from typing import TYPE_CHECKING, Optional, Union, Tuple, Sequence, List, Collection +import tree +import math +import numpy as np +import hashlib +import contextlib +import 
functools + +from returnn.tensor import Tensor, Dim, single_step_dim +import returnn.frontend as rf +from returnn.frontend.tensor_array import TensorArray +from returnn.frontend.encoder.conformer import ConformerEncoder, ConformerConvSubsample + +from i6_experiments.users.gaudino.model_interfaces.supports_label_scorer_torch import ( + RFModelWithMakeLabelScorer, +) +from i6_experiments.users.gaudino.experiments.rf_conformer_att_2023.librispeech_960.configs import * +from i6_experiments.users.gaudino.experiments.rf_conformer_att_2023.librispeech_960.configs import ( + _batch_size_factor, + _cfg_lrlin1e_5_295k, + _get_cfg_lrlin_oclr_by_bs_nep, +) +# from .trafo_lm import trafo_lm_kazuki_import + +from i6_experiments.users.gaudino.models.asr.rf.conformer_ctc.model_conformer_ctc import from_scratch_model_def, from_scratch_training +from i6_experiments.users.gaudino.models.asr.rf.conformer_ctc.model_recog_ctc_greedy import model_recog + +if TYPE_CHECKING: + from i6_experiments.users.zeyer.model_interfaces import ModelDef, RecogDef, TrainDef + from i6_experiments.users.zeyer.model_with_checkpoints import ( + ModelWithCheckpoints, + ModelWithCheckpoint, + ) + +# From Mohammad, 2023-06-29 +# dev-clean 2.27 +# dev-other 5.39 +# test-clean 2.41 +# test-other 5.51 +# _returnn_tf_config_filename = ( +# "/work/asr4/zeineldeen/setups-data/librispeech/2022-11-28--conformer-att/work/i6_core/returnn/search/ReturnnSearchJobV2.1oORPHJTAcW0/output/returnn.config") +# E.g. 
via /u/zeineldeen/setups/librispeech/2022-11-28--conformer-att/work +_returnn_tf_ckpt_filename = "i6_core/returnn/training/AverageTFCheckpointsJob.BxqgICRSGkgb/output/model/average.index" +# /u/zeineldeen/setups/librispeech/2022-11-28--conformer-att/work/i6_core/returnn/training/AverageTFCheckpointsJob.BxqgICRSGkgb +# Main train (2035 subepochs): /work/asr4/zeineldeen/setups-data/librispeech/2022-11-28--conformer-att/work/i6_core/returnn/training/ReturnnTrainingJob.SAh74CLCNJQi +# 15k batch size, accum grad 2 (1350 steps per epoch?) +# (With batch size 40k (here default), I have usually 495 steps/epoch. Same accum grad 2.) +# peak_lr = 0.9e-3 (1e-3 should also be fine), with Adam, optimizer_epsilon = 1e-08 +# phase1: peak_lr / 10 -> peak_lr (45%) +# phase2: peak_lr -> peak_lr / 10 (45%) +# phase3: peak_lr / 10 -> 1e-6 (10%) +# all linear decay and step-based +# specaugment like my orig (same here, specaugorig), speed perturb same here. +# weight decay: L2 1e-4 in some layers (not all): FF, depthwise conv, self-att, output, LSTM, readout +# Final from learning_rates file: +# 2035: EpochData(learningRate=, error={ +# 'dev_error_ctc': 0.0520755184693418, +# 'dev_error_output/output_prob': 0.035661241551042944, +# 'dev_score_ctc': 0.2796084385705723, +# 'dev_score_output/output_prob': 0.1718613621694714, +# 'devtrain_error_ctc': 0.005757552549708462, +# 'devtrain_error_output/output_prob': 0.005408351877314902, +# 'devtrain_score_ctc': 0.022935187616968285, +# 'devtrain_score_output/output_prob': 0.05237826015574962, +# 'train_error_ctc': 0.05592114304093772, +# 'train_error_output/output_prob': 0.041970552995693494, +# 'train_score_ctc': 0.21249712733341475, +# 'train_score_output/output_prob': 0.20816428663741796, +# }), +# Retrain RETURNN training job (600 subepochs): /u/zeineldeen/setups/librispeech/2022-11-28--conformer-att/work/i6_core/returnn/training/ReturnnTrainingJob.ZhtaEElHqWlr +# Epoch-wise LR: +# Fixed for 20 subepochs: 1e-4 +# Linear(?) 
decay for remaining (?): 1e-4 to 1e-6 +# Final from learning_rates file: +# 600: EpochData(learningRate=1e-06, error={ +# 'dev_error_ctc': 0.04999311020358747, +# 'dev_error_output/output_prob': 0.03406881170076022, +# 'dev_score_ctc': 0.2881619431223589, +# 'dev_score_output/output_prob': 0.16851828029171323, +# 'devtrain_error_ctc': 0.003611245977923651, +# 'devtrain_error_output/output_prob': 0.004028583366881808, +# 'devtrain_score_ctc': 0.014762402644778178, +# 'devtrain_score_output/output_prob': 0.0458638666428664, +# 'train_error_ctc': 0.051649620746772214, +# 'train_error_output/output_prob': 0.03977601830532325, +# 'train_score_ctc': 0.19722012590584306, +# 'train_score_output/output_prob': 0.19768974342596793, +# }), + + +# The model gets raw features (16khz) and does feature extraction internally. +_log_mel_feature_dim = 80 + + +def sis_run_with_prefix(prefix_name: Optional[str] = None): + """run the exp""" + _sis_setup_global_prefix(prefix_name) + + # Moh: dev-clean 2.27, dev-other 5.39, test-clean 2.41, test-other 5.51 + # RF recog: {"dev-clean": 2.25, "dev-other": 5.34, "test-clean": 2.42, "test-other": 5.56} + # _recog_imported() + + # train_exp("from-scratch-train", config_11gb, gpu_mem=11) + + # model = train_exp( # 5.41 + # "base-24gb-v6-lrlin1e_5_600k", + # config_24gb_v6, + # config_updates={ + # "learning_rate": 1.0, + # "dynamic_learning_rate": dyn_lr_piecewise_linear, + # # total steps after 2000 epochs: 982.312 + # "learning_rate_piecewise_steps": [600_000, 900_000, 982_000], + # "learning_rate_piecewise_values": [1e-5, 1e-3, 1e-5, 1e-6], + # }, + # ) + + train_exp( # + "base-24gb-lrlin1e_5_600k_ctc_only", + config_24gb_v6, + config_updates={ + "learning_rate": 1.0, + "dynamic_learning_rate": dyn_lr_piecewise_linear, + # total steps after 2000 epochs: 982.312 + "learning_rate_piecewise_steps": [600_000, 900_000, 982_000], + "learning_rate_piecewise_values": [1e-5, 1e-3, 1e-5, 1e-6], + "aux_loss_layers":[], + }, + ) + + train_exp( # + 
"base-24gb-lrlin1e_5_600k_ctc_only_aux4_8", + config_24gb_v6, + config_updates={ + "learning_rate": 1.0, + "dynamic_learning_rate": dyn_lr_piecewise_linear, + # total steps after 2000 epochs: 982.312 + "learning_rate_piecewise_steps": [600_000, 900_000, 982_000], + "learning_rate_piecewise_values": [1e-5, 1e-3, 1e-5, 1e-6], + }, + ) + +_sis_prefix: Optional[str] = None + + +def _sis_setup_global_prefix(prefix_name: Optional[str] = None): + if not prefix_name: + from .sis_setup import get_prefix_for_config + + prefix_name = get_prefix_for_config(__file__) + global _sis_prefix + _sis_prefix = prefix_name + + +def _recog( + name: str, + model_with_checkpoint: ModelWithCheckpoint, + recog_def: RecogDef = None, + recog_config: Optional[Dict[str, Any]] = None, + *, + search_rqmt: Optional[Dict[str, Any]] = None, + dev_sets: Optional[Collection[str]] = None, +): + from sisyphus import tk + from i6_experiments.users.zeyer.recog import recog_model + + if recog_def is None: + recog_def = model_recog + + task = _get_ls_task() + + res = recog_model( + task, + model_with_checkpoint, + recog_def=recog_def, + config=recog_config, + search_rqmt=search_rqmt, + dev_sets=dev_sets, + ) + tk.register_output(_sis_prefix + "/" + name, res.output) + + +# noinspection PyShadowingNames +def train_exp( + name: str, + config: Dict[str, Any], + *, + config_updates: Optional[Dict[str, Any]] = None, + config_deletes: Optional[Sequence[str]] = None, + post_config_updates: Optional[Dict[str, Any]] = None, + num_epochs: int = 2000, + gpu_mem: Optional[int] = 24, + num_processes: Optional[int] = None, + fine_tune: Optional[Union[int, List[Tuple[int, Dict[str, Any]]]]] = None, + time_rqmt: Optional[int] = None, + model_avg: bool = False, +) -> ModelWithCheckpoints: + """ + Train experiment + """ + from i6_experiments.users.gaudino.experiments.rf_conformer_att_2023.train import ( + train, + ) + from i6_experiments.users.zeyer.recog import recog_training_exp + + if _sis_prefix is None: + 
_sis_setup_global_prefix() + + prefix = _sis_prefix + "/" + name + task = _get_ls_task() + config = config.copy() + config = dict_update_deep(config, config_updates, config_deletes) + if "__num_epochs" in config: + num_epochs = config.pop("__num_epochs") + if "__gpu_mem" in config: + gpu_mem = config.pop("__gpu_mem") + if "__num_processes" in config: + num_processes = config.pop("__num_processes") + + model_with_checkpoint = train( + prefix, + task=task, + config=config, + post_config=dict_update_deep(post_config, post_config_updates), + model_def=from_scratch_model_def, + train_def=from_scratch_training, + num_epochs=num_epochs, + gpu_mem=gpu_mem, + num_processes=num_processes, + distributed_launch_cmd="torchrun" if num_processes else "mpirun", + time_rqmt=time_rqmt, + ) + recog_training_exp( + prefix, task, model_with_checkpoint, recog_def=model_recog, model_avg=model_avg + ) + + if fine_tune: + if isinstance(fine_tune, int): + fine_tune = [(fine_tune, {})] + for ep, opts in fine_tune: + assert isinstance(ep, int) and isinstance(opts, dict) + suffix = f"/finetune/{ep}" + opts = opts.copy() + if opts: + for k, v in sorted(opts.items()): + k: str + suffix += "-" + k.lstrip("_") + v = str(v).replace("-", "_") + if len(v) > 16 and not k.startswith("_"): + suffix += "_" + hashlib.md5(v.encode("utf8")).hexdigest()[:8] + else: + suffix += v + num_epochs_ = opts.pop("num_epochs", 50) + config_ = config.copy() + config_["import_model_train_epoch1"] = model_with_checkpoint.get_epoch( + ep + ).checkpoint + config_.pop("dynamic_learning_rate") + lrs = opts.pop("learning_rates", None) + if lrs is None: + lr_decay_type = opts.pop( + "lr_decay_type", "geomspace" + ) # geomspace or linspace + lr_decay_func = getattr(np, lr_decay_type) + lr = config_["learning_rate"] + final_lr = opts.pop("final_lr", 1e-7) + lrs = list(lr_decay_func(lr, final_lr, num=num_epochs_)) + else: + assert isinstance(lrs, (list, tuple)) + assert len(lrs) == num_epochs_ + config_["learning_rates"] = lrs + 
config_["learning_rate"] = float(lrs[-1]) + config_["specaugment_steps"] = (0, 0, 0) + config_.update({k: v for k, v in opts.items() if not k.startswith("_")}) + + finetune_model_with_ckpt = train( + prefix + suffix, + task=task, + config=config_, + post_config=post_config, + model_def=from_scratch_model_def, + train_def=from_scratch_training, + num_epochs=num_epochs_, + gpu_mem=gpu_mem, + ) + # _recog(name + suffix + "/recog/last", finetune_model_with_ckpt.get_last_fixed_epoch()) + recog_training_exp( + prefix + suffix, task, finetune_model_with_ckpt, recog_def=model_recog + ) + + return model_with_checkpoint + + +_ls_task = None + + +def _get_ls_task(): + global _ls_task + if _ls_task: + return _ls_task + + from i6_experiments.users.zeyer.datasets.librispeech import ( + get_librispeech_task_bpe10k_raw, + ) + + _ls_task = get_librispeech_task_bpe10k_raw(with_eos_postfix=True) + return _ls_task + + +py = sis_run_with_prefix # if run directly via `sis m ...` + + +def model_warmup(*, model: Model, **_kwargs): + """warmup, for more reliable timing measures""" + import torch + import time + from returnn.config import get_global_config + from returnn.tensor import Dim + import returnn.frontend as rf + + config = get_global_config() + start_time = time.monotonic() + limit = start_time + config.float("model_warmup_time", 10.0) + + print("*** warming up...") + while time.monotonic() < limit: + batch_dim = Dim(10, name="dummy_batch") + time_dim = Dim(rf.full(dims=[batch_dim], fill_value=16_000), name="dummy_time") + feat_dim = Dim(1, name="dummy_feat") + source = rf.zeros([batch_dim, time_dim, feat_dim]) + res = model.encode(source=source, in_spatial_dim=time_dim) + if source.raw_tensor.device.type == "cuda": + torch.cuda.synchronize(source.raw_tensor.device) + res # noqa # keep ref to make sure it is calculated diff --git a/users/gaudino/experiments/rf_conformer_rnnt_2024/librispeech_960/sis_setup.py 
b/users/gaudino/experiments/rf_conformer_rnnt_2024/librispeech_960/sis_setup.py new file mode 100644 index 000000000..e477f6100 --- /dev/null +++ b/users/gaudino/experiments/rf_conformer_rnnt_2024/librispeech_960/sis_setup.py @@ -0,0 +1,22 @@ +""" +Helpers for the Sisyphus setup +""" + +from __future__ import annotations +import os + + +_my_dir = os.path.dirname(os.path.abspath(__file__)) + + +def get_prefix_for_config(src_filename: str): + """ + :param src_filename: pass `__file__` here + :return: some prefix name + """ + assert src_filename.endswith(".py") + assert src_filename.startswith(_my_dir + "/"), f"unexpected prefix in {src_filename}" + src_filename = src_filename[len(_my_dir) + 1 :] + assert "/" not in src_filename, f"unexpected path separator in {src_filename}" + exp_name = src_filename[:-3] + return "librispeech_960_exp2024_05_13_rf/" + exp_name diff --git a/users/gaudino/experiments/rf_conformer_rnnt_2024/tedlium2/conformer_ctc_train.py b/users/gaudino/experiments/rf_conformer_rnnt_2024/tedlium2/conformer_ctc_train.py index 11c06ed43..900926eae 100644 --- a/users/gaudino/experiments/rf_conformer_rnnt_2024/tedlium2/conformer_ctc_train.py +++ b/users/gaudino/experiments/rf_conformer_rnnt_2024/tedlium2/conformer_ctc_train.py @@ -51,50 +51,175 @@ def sis_run_with_prefix(prefix_name: Optional[str] = None): # RF recog: {"dev-clean": 2.25, "dev-other": 5.34, "test-clean": 2.42, "test-other": 5.56} # _recog_imported() - rnnt_train_config = dict( - batching="laplace:.1000", - batch_size=15_000 * _batch_size_factor, - max_seqs=200, - # max_seq_length_default_target=75, - specaugment_steps=(10_000, 20_000, 40_000), - # gradient_clip=0, - # gradient_clip_global_norm = 1.0 - optimizer={ - "class": "adamw", - "epsilon": 1e-8, - "weight_decay": 1e-6, - }, - # accum_grad_multiple_step=4, - # gradient_noise=0.0, - learning_rate=2.5e-3, - dynamic_learning_rate=dyn_lr_lin_warmup_invsqrt_decay, - learning_rate_warmup_steps=40_000, - learning_rate_invsqrt_norm=40_000, - # 
aux_loss_layers=[4, 8], - max_seq_length_default_target=None, - gradient_clip_global_norm=5.0, - accum_grad_multiple_step=2, - aux_loss_layers=[4,8], - # rnnt_loss=False, -) + ctc_train_config = dict( + batching="laplace:.1000", + batch_size=15_000 * _batch_size_factor, + max_seqs=200, + # max_seq_length_default_target=75, + specaugment_steps=(10_000, 20_000, 40_000), + # gradient_clip=0, + # gradient_clip_global_norm = 1.0 + optimizer={ + "class": "adamw", + "epsilon": 1e-8, + "weight_decay": 1e-6, + }, + # accum_grad_multiple_step=4, + # gradient_noise=0.0, + learning_rate=2.5e-3, + dynamic_learning_rate=dyn_lr_lin_warmup_invsqrt_decay, + learning_rate_warmup_steps=40_000, + learning_rate_invsqrt_norm=40_000, + max_seq_length_default_target=None, + gradient_clip_global_norm=5.0, + accum_grad_multiple_step=2, + aux_loss_layers=[4,8], + # rnnt_loss=False, + ) + + ctc_train_24gb_config = ctc_train_config.copy() + ctc_train_24gb_config.update( + dict( + torch_amp="bfloat16", + batch_size=40_000 * _batch_size_factor, + accum_grad_multiple_step=2, + learning_rate=1e-3, + learning_rate_warmup_steps=20_000, + learning_rate_invsqrt_norm=20_000, + specaugment_steps=(5_000, 15_000, 25_000), + grad_scaler=None, + rf_att_dropout_broadcast=False, + ) + ) + ctc_train_24gb_config["optimizer"]["epsilon"] = 1e-16 + ctc_train_24gb_config["optimizer"]["weight_decay_modules_blacklist"]: [ # wdblacklist2 + "rf.Embedding", + "rf.LearnedRelativePositionalEncoding", + ] + # train_exp("base-11gb", config_11gb, gpu_mem=11) # train_exp("base-11gb-v1", my_config_11gb, num_epochs=400, gpu_mem=11) + # train_exp( # does not converge + # "from-scratch-11gb_aux4_8_lrmaxs522k", + # ctc_train_config, + # config_updates={ + # "learning_rate": 1.0, + # "dynamic_learning_rate": dyn_lr_piecewise_linear, + # # total steps after 2000 epochs: 982.312 + # # total steps after 400 epochs: + # "learning_rate_piecewise_steps": [261_000, 522_000, 580_000], # 45% 45 % 10% + # # "learning_rate_piecewise_steps": 
[600_000, 900_000, 982_000], # 45% 45 % 10% + # "learning_rate_piecewise_values": [1e-5, 1e-3, 1e-5, 1e-6], + # }, + # num_epochs=400, + # gpu_mem=11, + # ) + + # train_exp(# does not converge + # "from-scratch-11gb_lrmaxs522k", + # ctc_train_config, + # config_updates={ + # "learning_rate": 1.0, + # "dynamic_learning_rate": dyn_lr_piecewise_linear, + # # total steps after 2000 epochs: 982.312 + # # total steps after 400 epochs: + # "learning_rate_piecewise_steps": [261_000, 522_000, 580_000], # 45% 45 % 10% + # # "learning_rate_piecewise_steps": [600_000, 900_000, 982_000], # 45% 45 % 10% + # "learning_rate_piecewise_values": [1e-5, 1e-3, 1e-5, 1e-6], + # "aux_loss_layers": [], + # }, + # num_epochs=400, + # gpu_mem=11, + # ) + + # train_exp( # does not converge + # "from-scratch-11gb_lrmaxs522k_lrmax5e-4", + # ctc_train_config, + # config_updates={ + # "learning_rate": 1.0, + # "dynamic_learning_rate": dyn_lr_piecewise_linear, + # # total steps after 2000 epochs: 982.312 + # # total steps after 400 epochs: + # "learning_rate_piecewise_steps": [261_000, 522_000, 580_000], # 45% 45 % 10% + # # "learning_rate_piecewise_steps": [600_000, 900_000, 982_000], # 45% 45 % 10% + # "learning_rate_piecewise_values": [1e-5, 3e-4, 1e-5, 1e-6], + # "aux_loss_layers": [], + # }, + # num_epochs=400, + # gpu_mem=11, + # ) + # + # train_exp( # does not converge + # "from-scratch-11gb_lrmaxs522k_lrmin8e-5_lrmax8e-4", + # ctc_train_config, + # config_updates={ + # "learning_rate": 1.0, + # "dynamic_learning_rate": dyn_lr_piecewise_linear, + # # total steps after 2000 epochs: 982.312 + # # total steps after 400 epochs: + # "learning_rate_piecewise_steps": [261_000, 522_000, 580_000], # 45% 45 % 10% + # # "learning_rate_piecewise_steps": [600_000, 900_000, 982_000], # 45% 45 % 10% + # "learning_rate_piecewise_values": [8e-5, 8e-4, 8e-5, 1e-6], + # "aux_loss_layers": [], + # }, + # num_epochs=400, + # gpu_mem=11, + # ) + train_exp( - "from-scratch-11gb_aux4_8", - rnnt_train_config, + 
"from-scratch-24gb_lrmaxs85k_lrmin1e-5_lrmax1e-3", + ctc_train_24gb_config, config_updates={ "learning_rate": 1.0, "dynamic_learning_rate": dyn_lr_piecewise_linear, - # total steps after 2000 epochs: 982.312 - "learning_rate_piecewise_steps": [600_000, 900_000, 982_000], + # total steps after 400 epochs: + "learning_rate_piecewise_steps": [85_500, 171_000, 190_000], # 45% 45 % 10% "learning_rate_piecewise_values": [1e-5, 1e-3, 1e-5, 1e-6], + "aux_loss_layers": [], + }, + num_epochs=400, + ) + + train_exp( + "from-scratch-24gb_lrmaxs85k_lrmin8e-5_lrmax8e-4", + ctc_train_24gb_config, + config_updates={ + "learning_rate": 1.0, + "dynamic_learning_rate": dyn_lr_piecewise_linear, + # total steps after 400 epochs: + "learning_rate_piecewise_steps": [85_500, 171_000, 190_000], # 45% 45 % 10% + "learning_rate_piecewise_values": [8e-5, 8e-4, 8e-5, 1e-6], + "aux_loss_layers": [], }, num_epochs=400, + ) + + # init for tf ctc only model + train_exp( # does not converge + "from-scratch-11gb_lrmaxs522k_lrmin8e-5_lrmax8e-4", + ctc_train_config, + config_updates={ + "learning_rate": 1.0, + "dynamic_learning_rate": dyn_lr_piecewise_linear, + # total steps after 2000 epochs: 982.312 + # total steps after 400 epochs: + "learning_rate_piecewise_steps": [66_000, 132_000, 145_000], # 45% 45 % 10% + # "learning_rate_piecewise_steps": [600_000, 900_000, 982_000], # 45% 45 % 10% + "learning_rate_piecewise_values": [8e-5, 8e-4, 8e-5, 1e-6], + "aux_loss_layers": [], + "preload_from_files": { + "filename": "", + "ignore_missing": True, + "init_for_train": True, + }, + }, + num_epochs=100, gpu_mem=11, ) + _sis_prefix: Optional[str] = None diff --git a/users/gaudino/experiments/rf_conformer_rnnt_2024/tedlium2/conformer_rnnt_train.py b/users/gaudino/experiments/rf_conformer_rnnt_2024/tedlium2/conformer_rnnt_train.py index a7e088a5c..9718aed90 100644 --- a/users/gaudino/experiments/rf_conformer_rnnt_2024/tedlium2/conformer_rnnt_train.py +++ 
b/users/gaudino/experiments/rf_conformer_rnnt_2024/tedlium2/conformer_rnnt_train.py @@ -74,7 +74,7 @@ def sis_run_with_prefix(prefix_name: Optional[str] = None): max_seq_length_default_target=None, gradient_clip_global_norm=5.0, accum_grad_multiple_step=2, - aux_loss_layers=[12], + # aux_loss_layers=[12], ) # train_exp("base-11gb", config_11gb, gpu_mem=11) @@ -86,11 +86,13 @@ def sis_run_with_prefix(prefix_name: Optional[str] = None): "learning_rate": 1.0, "dynamic_learning_rate": dyn_lr_piecewise_linear, # total steps after 2000 epochs: 982.312 - "learning_rate_piecewise_steps": [600_000, 900_000, 982_000], + # "learning_rate_piecewise_steps": [600_000, 900_000, 982_000], + # "learning_rate_piecewise_values": [1e-5, 1e-3, 1e-5, 1e-6], + "learning_rate_piecewise_steps": [261_000, 522_000, 580_000], # 45% 45 % 10% "learning_rate_piecewise_values": [1e-5, 1e-3, 1e-5, 1e-6], }, num_epochs=400, - gpu_mem=11, + gpu_mem=24, ) diff --git a/users/gaudino/models/asr/decoder/att_decoder_rf.py b/users/gaudino/models/asr/decoder/att_decoder_rf.py index 6cd44c2b3..f058f0414 100644 --- a/users/gaudino/models/asr/decoder/att_decoder_rf.py +++ b/users/gaudino/models/asr/decoder/att_decoder_rf.py @@ -4,7 +4,7 @@ import torch import torch.nn.functional as F from typing import Tuple -from typeguard import check_argument_types +from typeguard import typechecked from espnet2.asr.decoder.abs_decoder import AbsDecoder from espnet2.utils.get_default_kwargs import get_default_kwargs @@ -132,6 +132,7 @@ def build_attention_list( class RNNDecoder(AbsDecoder): + @typechecked def __init__( self, vocab_size: int, @@ -147,7 +148,6 @@ def __init__( att_conf: dict = get_default_kwargs(build_attention_list), ): # FIXME(kamo): The parts of num_spk should be refactored more more more - assert check_argument_types() if rnn_type not in {"lstm", "gru"}: raise ValueError(f"Not supported: rnn_type={rnn_type}") From 7a979475861976ce947fa7cde8c4962125047bde Mon Sep 17 00:00:00 2001 From: "luca.gaudino" 
Date: Fri, 24 May 2024 11:59:38 +0200 Subject: [PATCH 055/227] update zoneout fix ted2 --- .../conformer_import_moh_att_2023_06_30.py | 4 +- .../conformer_import_moh_att_train.py | 4 +- .../model_recogs/model_recog.py | 1 - .../conformer_import_moh_att_2023_10_19.py | 79 ++++++++++++++----- .../models/asr/rf/ilm_import_2024_04_17.py | 3 +- 5 files changed, 69 insertions(+), 22 deletions(-) diff --git a/users/gaudino/experiments/rf_conformer_att_2023/librispeech_960/conformer_import_moh_att_2023_06_30.py b/users/gaudino/experiments/rf_conformer_att_2023/librispeech_960/conformer_import_moh_att_2023_06_30.py index e692580cd..8aea3533c 100644 --- a/users/gaudino/experiments/rf_conformer_att_2023/librispeech_960/conformer_import_moh_att_2023_06_30.py +++ b/users/gaudino/experiments/rf_conformer_att_2023/librispeech_960/conformer_import_moh_att_2023_06_30.py @@ -799,6 +799,7 @@ def __init__( self.mel_normalization = model_args.get("mel_normalization", False) self.no_ctc = model_args.get("no_ctc", False) self.enc_layer_w_ctc = model_args.get("enc_layer_w_ctc", None) + self.s_use_zoneout_output = model_args.get("s_use_zoneout_output", True) self.encoder = ConformerEncoder( in_dim, @@ -887,7 +888,8 @@ def __init__( Dim(name="lstm", dimension=1024), zoneout_factor_cell=0.15, zoneout_factor_output=0.05, - use_zoneout_output=False, # like RETURNN/TF ZoneoutLSTM old default + use_zoneout_output=self.s_use_zoneout_output, # like RETURNN/TF ZoneoutLSTM old default + # use_zoneout_output=False, # like RETURNN/TF ZoneoutLSTM old default # this was a bug # parts_order="icfo", # like RETURNN/TF ZoneoutLSTM # parts_order="ifco", parts_order="jifo", # NativeLSTM (the code above converts it...) 
diff --git a/users/gaudino/experiments/rf_conformer_att_2023/librispeech_960/conformer_import_moh_att_train.py b/users/gaudino/experiments/rf_conformer_att_2023/librispeech_960/conformer_import_moh_att_train.py index 79cb8641f..85c0a7435 100644 --- a/users/gaudino/experiments/rf_conformer_att_2023/librispeech_960/conformer_import_moh_att_train.py +++ b/users/gaudino/experiments/rf_conformer_att_2023/librispeech_960/conformer_import_moh_att_train.py @@ -1302,7 +1302,9 @@ def __init__( Dim(name="lstm", dimension=1024), zoneout_factor_cell=0.15, zoneout_factor_output=0.05, - use_zoneout_output=False, # like RETURNN/TF ZoneoutLSTM old default + # TODO: was this a bug? + use_zoneout_output=True, + # use_zoneout_output=False, # like RETURNN/TF ZoneoutLSTM old default # parts_order="icfo", # like RETURNN/TF ZoneoutLSTM # parts_order="ifco", parts_order="jifo", # NativeLSTM (the code above converts it...) diff --git a/users/gaudino/experiments/rf_conformer_att_2023/librispeech_960/model_recogs/model_recog.py b/users/gaudino/experiments/rf_conformer_att_2023/librispeech_960/model_recogs/model_recog.py index 1c8d16b85..2d19622f4 100644 --- a/users/gaudino/experiments/rf_conformer_att_2023/librispeech_960/model_recogs/model_recog.py +++ b/users/gaudino/experiments/rf_conformer_att_2023/librispeech_960/model_recogs/model_recog.py @@ -174,7 +174,6 @@ def model_recog( ) if model.search_args.get("ilm_scale", 0.0) > 0: - breakpoint() ilm_out = model.ilm(input_embed, state=ilm_state, spatial_dim=single_step_dim) ilm_state = ilm_out["state"] ilm_log_prob = rf.log_softmax(ilm_out["output"], axis=model.target_dim) diff --git a/users/gaudino/experiments/rf_conformer_att_2023/tedlium2/conformer_import_moh_att_2023_10_19.py b/users/gaudino/experiments/rf_conformer_att_2023/tedlium2/conformer_import_moh_att_2023_10_19.py index 526a7d6a6..f1ca364cf 100644 --- a/users/gaudino/experiments/rf_conformer_att_2023/tedlium2/conformer_import_moh_att_2023_10_19.py +++ 
b/users/gaudino/experiments/rf_conformer_att_2023/tedlium2/conformer_import_moh_att_2023_10_19.py @@ -108,7 +108,7 @@ def sis_run_with_prefix(prefix_name: str = None): bsf = 10 prefix_name_single_seq = prefix_name + f"/single_seq" - prefix_name = prefix_name + f"/bsf{bsf}" + prefix_name = prefix_name + f"/bsf{bsf}" + "_fix_zoneout_output" ### Single model experiments @@ -287,9 +287,10 @@ def sis_run_with_prefix(prefix_name: str = None): opls_model_names = { # -------- tuning done ---------- - # "model_baseline":{ - # "scales": [(0.7, 0.3, 0.7, 0.4), (0.7, 0.3, 0.7, 0.5)], - # }, + "model_baseline":{ + "scales": [(0.7, 0.3, 0.7, 0.4), (0.7, 0.3, 0.7, 0.5)], + "scales_w_fix": [], + }, # "model_ctc0.43_att1.0": { # "scales": [(0.8,0.2, 0.6), (0.8, 0.2, 0.7), (0.8, 0.2, 0.9)], # }, @@ -355,9 +356,12 @@ def sis_run_with_prefix(prefix_name: str = None): } # opls att + ctc prefix scorer - for model_name in opls_model_names: - for scales, beam_size in product(opls_model_names[model_name]["scales"], []): - att_scale, ctc_scale, prior_scale, _ = scales + for model_name in ["model_baseline"]: + # for model_name in opls_model_names: + # for scales, beam_size in product(opls_model_names[model_name]["scales"], [12]): + for scales, beam_size in product([(0.6, 0.4), (0.65, 0.35), (0.7, 0.3), (0.75, 0.25), (0.8, 0.2)], [12]): + att_scale, ctc_scale, = scales + prior_scale = 0.0 search_args = { "beam_size": beam_size, @@ -369,6 +373,7 @@ def sis_run_with_prefix(prefix_name: str = None): "prior_corr": True if prior_scale > 0 else False, "prior_scale": prior_scale, "ctc_prior_file": models[model_name]["prior"], + "use_zoneout_output": True, } name = ( @@ -383,7 +388,7 @@ def sis_run_with_prefix(prefix_name: str = None): task, models_with_pt_ckpt[model_name]["ckpt"], model_recog, - dev_sets=["dev", "test"], # set to None for all + dev_sets=["dev"], # set to None for all model_args=models_with_pt_ckpt[model_name]["model_args"], search_args=search_args, prefix_name=name, @@ -461,10 
+466,10 @@ def sis_run_with_prefix(prefix_name: str = None): } models_with_pt_ckpt[model_name]["model_args"] = copy.deepcopy(model_args) - # att + ilm correction + # att + trafo lm + ilm correction for model_name, lm_scale, ilm_scale, beam_size in product( # ["model_baseline", "model_ctc0.5_att0.5"], [0.36] ,[0.28], [12] - ["model_baseline"], [0.36] ,[0.28], [12] + ["model_baseline"], [0.3, 0.34, 0.36, 0.4] ,[0.28], [12] ): ilm_model_args = copy.deepcopy(models_with_pt_ckpt[model_name]["model_args"]) ilm_model_args["preload_from_files"] = { @@ -482,7 +487,7 @@ def sis_run_with_prefix(prefix_name: str = None): prefix_name + "/" + model_name - + f"/att_trafolm{lm_scale}_ilm1_{ilm_scale}" + + f"/att_trafolm{lm_scale}_ilm{ilm_scale}" + f"_beam{beam_size}" ) search_args = { @@ -491,6 +496,8 @@ def sis_run_with_prefix(prefix_name: str = None): "ilm_scale": ilm_scale, "lm_scale": lm_scale, "bsf": bsf, + "use_first_lm": True, + "use_zoneout_output": True, } recog_res, recog_out = recog_model( task, @@ -506,6 +513,45 @@ def sis_run_with_prefix(prefix_name: str = None): recog_res.output, ) + # opls att + ctc + trafo lm + ilm + for model_name, beam_size, ilm_scale in product(["model_baseline"], [12], [0.1, 0.2, 0.25, 0.3, 0.4]): + for scales in opls_model_names[model_name]["scales"]: + att_scale, ctc_scale, prior_scale, lm_scale = scales + name = ( + prefix_name + + "/" + + model_name + + f"/opls_att{att_scale}_ctc{ctc_scale}_trafolm{lm_scale}_ilm{ilm_scale}" + + (f"_prior{prior_scale}" if prior_scale > 0 else "") + + f"_beam{beam_size}" + ) + search_args = { + "beam_size": beam_size, + "att_scale": att_scale, + "ctc_scale": ctc_scale, + "use_ctc": True, + "add_trafo_lm": True, + "lm_scale": lm_scale, + "bsf": bsf, + "prior_corr": True if prior_scale > 0 else False, + "prior_scale": prior_scale, + "ctc_prior_file": models[model_name]["prior"], + } + + recog_res, recog_out = recog_model( + task, + models_with_pt_ckpt[model_name]["ckpt"], + model_recog, + dev_sets=["dev", 
"test"], + model_args=models_with_pt_ckpt[model_name]["model_args"], + search_args=search_args, + prefix_name=name, + ) + tk.register_output( + name + f"/recog_results", + recog_res.output, + ) + # ----------------- With Trafo LM ----------------- for model_name in model_names: @@ -527,8 +573,8 @@ def sis_run_with_prefix(prefix_name: str = None): models_with_pt_ckpt[model_name]["model_args"] = model_args # att + trafo lm - for model_name, lm_scale, beam_size, use_first_lm in product( - ["model_baseline"], [0.15], [6, 12, 18], [True, False] + for model_name, lm_scale, beam_size in product( + ["model_baseline"], [0.13, 0.15, 0.18, 0.2], [12] ): lm_model_args = copy.deepcopy(models_with_pt_ckpt[model_name]["model_args"]) name = ( @@ -536,7 +582,6 @@ def sis_run_with_prefix(prefix_name: str = None): + "/" + model_name + f"/att_trafolm{lm_scale}" - + (f"_first_lm" if use_first_lm else "") + f"_beam{beam_size}" ) search_args = { @@ -544,11 +589,9 @@ def sis_run_with_prefix(prefix_name: str = None): "att_scale": 1.0, "lm_scale": lm_scale, "bsf": bsf, + "use_zoneout_output": True, + "use_first_lm": True, } - if use_first_lm: - search_args["use_first_lm"] = True - else: - search_args.pop("use_first_lm", None) recog_res, recog_out = recog_model( task, diff --git a/users/gaudino/models/asr/rf/ilm_import_2024_04_17.py b/users/gaudino/models/asr/rf/ilm_import_2024_04_17.py index 737dbdd54..e7f671c18 100644 --- a/users/gaudino/models/asr/rf/ilm_import_2024_04_17.py +++ b/users/gaudino/models/asr/rf/ilm_import_2024_04_17.py @@ -54,7 +54,8 @@ def __init__( self.prior_dim, zoneout_factor_cell=0.15, zoneout_factor_output=0.05, - use_zoneout_output=False, # like RETURNN/TF ZoneoutLSTM old default + # use_zoneout_output=False, # like RETURNN/TF ZoneoutLSTM old default + use_zoneout_output=True, # parts_order="icfo", # like RETURNN/TF ZoneoutLSTM # parts_order="ifco", parts_order="jifo", # NativeLSTM (the code above converts it...) 
From be654cee7f56a97d878f24e900210c261167c4de Mon Sep 17 00:00:00 2001 From: Simon Berger Date: Fri, 24 May 2024 13:11:56 +0200 Subject: [PATCH 056/227] Update users/berger --- .../config_02a_transducer_raw_samples.py | 10 +- .../config_02b_transducer_rasr_features.py | 145 +++++- ...ig_02c_transducer_rasr_features_wei_lex.py | 10 +- ...nfig_03a_transducer_fullsum_raw_samples.py | 10 +- ...ig_03b_transducer_fullsum_rasr_features.py | 9 +- ...ransducer_fullsum_rasr_features_wei_lex.py | 10 +- ...ucer_fullsum_from_scratch_rasr_features.py | 10 +- .../helpers/returnn.py | 1 + ...config_01_conformer_hybrid_tfgridnet_v2.py | 8 +- .../config_01a_conformer_hybrid_pt.py | 8 +- .../config_02_conformer_hybrid_blstm_v2.py | 6 +- ..._03_conformer_hybrid_tfgridnet_seqtrain.py | 8 +- .../config_03_blstm_transducer.py | 32 +- .../config_03_blstm_transducer.py | 14 +- .../config_03_blstm_transducer.py | 14 +- .../config_03_blstm_transducer.py | 32 +- .../wsj_16kHz/config_03_blstm_transducer.py | 14 +- .../wsj_8kHz/config_02_blstm_ctc.py | 6 +- .../wsj_8kHz/config_03_blstm_transducer.py | 14 +- .../config_02a_transducer.py | 10 +- .../config_02b_transducer_wei_data.py | 10 +- ...config_02c_transducer_wei_data_tinaconf.py | 10 +- .../config_03a_transducer_fullsum.py | 10 +- .../config_03b_transducer_fullsum_wei_data.py | 10 +- users/berger/corpus/librispeech/lm_data.py | 10 +- .../switchboard/viterbi_transducer_data.py | 7 +- users/berger/helpers/rasr.py | 26 +- users/berger/helpers/rasr_lm_config.py | 42 +- users/berger/network/helpers/label_context.py | 216 ++++++++- .../network/models/context_1_transducer.py | 35 +- .../recipe/rasr/label_tree_and_scorer.py | 3 +- users/berger/recipe/recognition/__init__.py | 1 + .../recognition/generic_seq2seq_search_v2.py | 444 ++++++++++++++++++ users/berger/settings.py | 9 +- users/berger/systems/base_system.py | 3 +- .../functors/recognition/seq2seq_search.py | 27 +- users/berger/systems/functors/seq2seq_base.py | 8 +- 37 files changed, 995 
insertions(+), 247 deletions(-) create mode 100644 users/berger/recipe/recognition/generic_seq2seq_search_v2.py diff --git a/users/berger/configs/librispeech/20230210_baselines/config_02a_transducer_raw_samples.py b/users/berger/configs/librispeech/20230210_baselines/config_02a_transducer_raw_samples.py index 4a1100e06..21fdb1c20 100644 --- a/users/berger/configs/librispeech/20230210_baselines/config_02a_transducer_raw_samples.py +++ b/users/berger/configs/librispeech/20230210_baselines/config_02a_transducer_raw_samples.py @@ -47,10 +47,7 @@ def generate_returnn_config( **kwargs, ) -> ReturnnConfig: if train: - ( - network_dict, - extra_python, - ) = transducer_model.make_context_1_conformer_transducer( + (network_dict, extra_python,) = transducer_model.make_context_1_conformer_transducer( num_outputs=num_classes, gt_args={ "sample_rate": 16000, @@ -87,10 +84,7 @@ def generate_returnn_config( }, ) else: - ( - network_dict, - extra_python, - ) = transducer_model.make_context_1_conformer_transducer_recog( + (network_dict, extra_python,) = transducer_model.make_context_1_conformer_transducer_recog( num_outputs=num_classes, gt_args={ "sample_rate": 16000, diff --git a/users/berger/configs/librispeech/20230210_baselines/config_02b_transducer_rasr_features.py b/users/berger/configs/librispeech/20230210_baselines/config_02b_transducer_rasr_features.py index f6deeff86..3953a13fa 100644 --- a/users/berger/configs/librispeech/20230210_baselines/config_02b_transducer_rasr_features.py +++ b/users/berger/configs/librispeech/20230210_baselines/config_02b_transducer_rasr_features.py @@ -44,6 +44,7 @@ def generate_returnn_config( *, train_data_config: dict, dev_data_config: dict, + precompute: bool = False, **kwargs, ) -> ReturnnConfig: specaug_v2 = kwargs.get("specaug_v2", False) @@ -102,30 +103,50 @@ def generate_returnn_config( specaug_v2=specaug_v2, ) else: - ( - network_dict, - extra_python, - ) = transducer_model.make_context_1_conformer_transducer_recog( - 
num_outputs=num_classes, - conformer_args={ - "num_blocks": 12, - "size": 512, - }, - decoder_args={ - "dec_mlp_args": { - "num_layers": 2, - "size": 640, - "activation": "tanh", + if precompute: + network_dict, extra_python = transducer_model.make_context_1_conformer_transducer_precomputed_recog( + num_outputs=num_classes, + conformer_args={ + "num_blocks": 12, + "size": 512, }, - "combination_mode": "concat", - "joint_mlp_args": { - "num_layers": 1, - "size": 1024, - "activation": "tanh", + decoder_args={ + "dec_mlp_args": { + "num_layers": 2, + "size": 640, + "activation": "tanh", + }, + "combination_mode": "concat", + "joint_mlp_args": { + "num_layers": 1, + "size": 1024, + "activation": "tanh", + }, + "ilm_scale": kwargs.get("ilm_scale", 0.0), }, - "ilm_scale": kwargs.get("ilm_scale", 0.0), - }, - ) + ) + else: + network_dict, extra_python = transducer_model.make_context_1_conformer_transducer_recog( + num_outputs=num_classes, + conformer_args={ + "num_blocks": 12, + "size": 512, + }, + decoder_args={ + "dec_mlp_args": { + "num_layers": 2, + "size": 640, + "activation": "tanh", + }, + "combination_mode": "concat", + "joint_mlp_args": { + "num_layers": 1, + "size": 1024, + "activation": "tanh", + }, + "ilm_scale": kwargs.get("ilm_scale", 0.0), + }, + ) extra_config = { "train": train_data_config, @@ -158,7 +179,8 @@ def generate_returnn_config( python_prolog=[ "import sys", "sys.setrecursionlimit(10 ** 6)", - ], + ] + + (["from returnn.tf.util.data import FeatureDim"] if precompute else []), extra_python=extra_python, num_inputs=50, num_outputs=num_classes, @@ -215,6 +237,7 @@ def run_exp( SummaryKey.CORPUS, SummaryKey.EPOCH, SummaryKey.LM, + SummaryKey.RTF, SummaryKey.WER, SummaryKey.SUB, SummaryKey.INS, @@ -249,6 +272,7 @@ def run_exp( feature_type=FeatureType.GAMMATONE_16K, reduction_factor=4, reduction_subtrahend=0, + search_stats=True, ) # ********** Returnn Configs ********** @@ -374,6 +398,81 @@ def run_exp( **recog_args, ) + recog_args.update( + { + 
"epochs": [382], + "lm_scales": [0.8], + } + ) + recog_args["search_parameters"].update( + { + "label-pruning": 11.0, + "label-pruning-limit": 300, + "word-end-pruning": 0.5, + "word-end-pruning-limit": 200, + } + ) + system.run_recog_step_for_corpora( + exp_names=[f"Conformer_Transducer_Viterbi_specaug-v2_{name_suffix}"], + corpora=["dev-other_4gram"], + recog_descriptor="lp-11_lpl-300_wep-0.5_wepl-200", + **recog_args, + ) + + system.add_experiment_configs( + f"Conformer_Transducer_Viterbi_specaug-v2_{name_suffix}", + ReturnnConfigs( + train_config=generate_returnn_config( + train=True, + train_data_config=data.train_data_config, + dev_data_config=data.cv_data_config, + label_smoothing=None, + loss_boost_v2=False, + loss_boost_scale=0.0, + specaug_v2=True, + peak_lr=8e-04, + model_preload=None, + ), + recog_configs={ + f"recog_precompute_ilm-{ilm_scale}": generate_returnn_config( + train=False, + train_data_config=data.train_data_config, + dev_data_config=data.cv_data_config, + ilm_scale=ilm_scale, + precompute=True, + ) + for ilm_scale in [0.3] + }, + ), + ) + + for data_input in data.data_inputs.values(): + data_input.create_lm_images(tools.rasr_binary_path) + system.init_corpora( + dev_keys=data.dev_keys, + test_keys=data.test_keys, + align_keys=data.align_keys, + corpus_data=data.data_inputs, + am_args=exp_args.transducer_recog_am_args, + ) + system.setup_scoring() + + recog_args.update( + { + "seq2seq_v2": True, + "label_scorer_type": "precomputed-log-posterior", + "model_flow_args": {"output_layer_name": "output_precompute"}, + } + ) + recog_args["label_scorer_args"]["extra_args"]["first_order"] = True + recog_args["label_scorer_args"]["extra_args"]["start_label_index"] = 0 + system.run_recog_step_for_corpora( + exp_names=[f"Conformer_Transducer_Viterbi_specaug-v2_{name_suffix}"], + corpora=["dev-other_4gram"], + recog_descriptor="lp-11_lpl-300_wep-0.5_wepl-200", + **recog_args, + ) + train_job = 
system.get_train_job(f"Conformer_Transducer_Viterbi_lr-0.0008_{name_suffix}") model = train_job.out_checkpoints[400] assert isinstance(model, Checkpoint) diff --git a/users/berger/configs/librispeech/20230210_baselines/config_02c_transducer_rasr_features_wei_lex.py b/users/berger/configs/librispeech/20230210_baselines/config_02c_transducer_rasr_features_wei_lex.py index 2d3875947..f492eb5f0 100644 --- a/users/berger/configs/librispeech/20230210_baselines/config_02c_transducer_rasr_features_wei_lex.py +++ b/users/berger/configs/librispeech/20230210_baselines/config_02c_transducer_rasr_features_wei_lex.py @@ -47,10 +47,7 @@ def generate_returnn_config( **kwargs, ) -> ReturnnConfig: if train: - ( - network_dict, - extra_python, - ) = transducer_model.make_context_1_conformer_transducer( + (network_dict, extra_python,) = transducer_model.make_context_1_conformer_transducer( num_outputs=num_classes, specaug_args={ "max_time_num": 1, @@ -88,10 +85,7 @@ def generate_returnn_config( loss_boost_v2=kwargs.get("loss_boost_v2", False), ) else: - ( - network_dict, - extra_python, - ) = transducer_model.make_context_1_conformer_transducer_recog( + (network_dict, extra_python,) = transducer_model.make_context_1_conformer_transducer_recog( num_outputs=num_classes, conformer_args={ "num_blocks": 12, diff --git a/users/berger/configs/librispeech/20230210_baselines/config_03a_transducer_fullsum_raw_samples.py b/users/berger/configs/librispeech/20230210_baselines/config_03a_transducer_fullsum_raw_samples.py index 95dcbd9ad..2ee08d156 100644 --- a/users/berger/configs/librispeech/20230210_baselines/config_03a_transducer_fullsum_raw_samples.py +++ b/users/berger/configs/librispeech/20230210_baselines/config_03a_transducer_fullsum_raw_samples.py @@ -47,10 +47,7 @@ def generate_returnn_config( model_preload: tk.Path, ) -> ReturnnConfig: if train: - ( - network_dict, - extra_python, - ) = transducer_model.make_context_1_conformer_transducer_fullsum( + (network_dict, extra_python,) = 
transducer_model.make_context_1_conformer_transducer_fullsum( num_outputs=num_classes, gt_args={ "sample_rate": 16000, @@ -88,10 +85,7 @@ def generate_returnn_config( }, ) else: - ( - network_dict, - extra_python, - ) = transducer_model.make_context_1_conformer_transducer_recog( + (network_dict, extra_python,) = transducer_model.make_context_1_conformer_transducer_recog( num_outputs=num_classes, gt_args={ "sample_rate": 16000, diff --git a/users/berger/configs/librispeech/20230210_baselines/config_03b_transducer_fullsum_rasr_features.py b/users/berger/configs/librispeech/20230210_baselines/config_03b_transducer_fullsum_rasr_features.py index f45cac63e..496d6fd5b 100644 --- a/users/berger/configs/librispeech/20230210_baselines/config_03b_transducer_fullsum_rasr_features.py +++ b/users/berger/configs/librispeech/20230210_baselines/config_03b_transducer_fullsum_rasr_features.py @@ -22,6 +22,7 @@ from i6_experiments.users.berger.util import default_tools from i6_private.users.vieting.helpers.returnn import serialize_dim_tags from i6_experiments.users.berger.systems.dataclasses import AlignmentData +from i6_experiments.users.berger.network.helpers.label_context import ILMMode from .config_01b_ctc_blstm_rasr_features import py as py_ctc from .config_02b_transducer_rasr_features import py as py_transducer from sisyphus import gs, tk @@ -106,6 +107,7 @@ def generate_returnn_config( "size": 1024, "activation": "tanh", }, + "ilm_mode": ILMMode.ZeroEnc, "ilm_scale": kwargs.get("ilm_scale", 0.0), }, ) @@ -167,6 +169,9 @@ def run_exp(alignments: Dict[str, AlignmentData], viterbi_model_checkpoint: Chec feature_type=FeatureType.GAMMATONE_16K, ) + for data_input in data.data_inputs.values(): + data_input.create_lm_images(tools.rasr_binary_path) + # ********** Step args ********** train_args = exp_args.get_transducer_train_step_args( @@ -254,6 +259,7 @@ def run_exp(alignments: Dict[str, AlignmentData], viterbi_model_checkpoint: Chec 
system.run_recog_step_for_corpora(corpora=["test-clean_4gram", "test-other_4gram"], **recog_args) recog_args["lm_scales"] = [0.8] + recog_args["seq2seq_v2"] = True recog_args["search_parameters"].update( { "full-sum-decoding": True, @@ -280,7 +286,8 @@ def run_exp(alignments: Dict[str, AlignmentData], viterbi_model_checkpoint: Chec ) recog_args["lookahead_options"].update({"lm_lookahead_scale": 0.45}) recog_args["use_gpu"] = True - recog_args["rtf"] = 50 + recog_args["rtf"] = 100 + recog_args["mem"] = 24 system.run_recog_step_for_corpora( recog_descriptor="fs", diff --git a/users/berger/configs/librispeech/20230210_baselines/config_03c_transducer_fullsum_rasr_features_wei_lex.py b/users/berger/configs/librispeech/20230210_baselines/config_03c_transducer_fullsum_rasr_features_wei_lex.py index 8e2aa71b9..e3b4d82bc 100644 --- a/users/berger/configs/librispeech/20230210_baselines/config_03c_transducer_fullsum_rasr_features_wei_lex.py +++ b/users/berger/configs/librispeech/20230210_baselines/config_03c_transducer_fullsum_rasr_features_wei_lex.py @@ -48,10 +48,7 @@ def generate_returnn_config( **kwargs, ) -> ReturnnConfig: if train: - ( - network_dict, - extra_python, - ) = transducer_model.make_context_1_conformer_transducer_fullsum( + (network_dict, extra_python,) = transducer_model.make_context_1_conformer_transducer_fullsum( num_outputs=num_classes, specaug_args={ "max_time_num": 1, @@ -85,10 +82,7 @@ def generate_returnn_config( }, ) else: - ( - network_dict, - extra_python, - ) = transducer_model.make_context_1_conformer_transducer_recog( + (network_dict, extra_python,) = transducer_model.make_context_1_conformer_transducer_recog( num_outputs=num_classes, conformer_args={ "num_blocks": 12, diff --git a/users/berger/configs/librispeech/20230210_baselines/config_04b_transducer_fullsum_from_scratch_rasr_features.py b/users/berger/configs/librispeech/20230210_baselines/config_04b_transducer_fullsum_from_scratch_rasr_features.py index 2f2941734..27b80179f 100644 --- 
a/users/berger/configs/librispeech/20230210_baselines/config_04b_transducer_fullsum_from_scratch_rasr_features.py +++ b/users/berger/configs/librispeech/20230210_baselines/config_04b_transducer_fullsum_from_scratch_rasr_features.py @@ -45,10 +45,7 @@ def generate_returnn_config( **kwargs, ) -> ReturnnConfig: if train: - ( - network_dict, - extra_python, - ) = transducer_model.make_context_1_conformer_transducer_fullsum( + (network_dict, extra_python,) = transducer_model.make_context_1_conformer_transducer_fullsum( num_outputs=num_classes, specaug_args={ "max_time_num": 1, @@ -82,10 +79,7 @@ def generate_returnn_config( }, ) else: - ( - network_dict, - extra_python, - ) = transducer_model.make_context_1_conformer_transducer_recog( + (network_dict, extra_python,) = transducer_model.make_context_1_conformer_transducer_recog( num_outputs=num_classes, conformer_args={ "num_blocks": 12, diff --git a/users/berger/configs/librispeech/20230420_dfg_multi_speaker/helpers/returnn.py b/users/berger/configs/librispeech/20230420_dfg_multi_speaker/helpers/returnn.py index 5d67124a3..51976cbb0 100644 --- a/users/berger/configs/librispeech/20230420_dfg_multi_speaker/helpers/returnn.py +++ b/users/berger/configs/librispeech/20230420_dfg_multi_speaker/helpers/returnn.py @@ -1,6 +1,7 @@ """ RETURNN-related helpers """ + from typing import Any from i6_core.returnn import ReturnnConfig, CodeWrapper diff --git a/users/berger/configs/librispeech/20230804_libri_css/config_01_conformer_hybrid_tfgridnet_v2.py b/users/berger/configs/librispeech/20230804_libri_css/config_01_conformer_hybrid_tfgridnet_v2.py index 473af68da..a61022c2f 100644 --- a/users/berger/configs/librispeech/20230804_libri_css/config_01_conformer_hybrid_tfgridnet_v2.py +++ b/users/berger/configs/librispeech/20230804_libri_css/config_01_conformer_hybrid_tfgridnet_v2.py @@ -173,9 +173,11 @@ def run_exp() -> SummaryReport: for lm_name in ["4gram", "kazuki_transformer"]: data_per_lm[lm_name] = get_hybrid_data( 
train_key="enhanced_tfgridnet_v1", - dev_keys=["segmented_libri_css_tfgridnet_dev_v1", "segmented_libri_css_tfgridnet_eval_v1"] - if lm_name == "4gram" - else [], + dev_keys=( + ["segmented_libri_css_tfgridnet_dev_v1", "segmented_libri_css_tfgridnet_eval_v1"] + if lm_name == "4gram" + else [] + ), test_keys=["segmented_libri_css_tfgridnet_eval_v1"] if lm_name == "kazuki_transformer" else [], gmm_system=gmm_system, returnn_root=tools.returnn_root, diff --git a/users/berger/configs/librispeech/20230804_libri_css/config_01a_conformer_hybrid_pt.py b/users/berger/configs/librispeech/20230804_libri_css/config_01a_conformer_hybrid_pt.py index 549771f3a..13bf26768 100644 --- a/users/berger/configs/librispeech/20230804_libri_css/config_01a_conformer_hybrid_pt.py +++ b/users/berger/configs/librispeech/20230804_libri_css/config_01a_conformer_hybrid_pt.py @@ -197,9 +197,11 @@ def run_exp() -> SummaryReport: for lm_name in ["4gram", "kazuki_transformer"]: data_per_lm[lm_name] = get_hybrid_data( train_key="enhanced_tfgridnet_v1", - dev_keys=["segmented_libri_css_tfgridnet_dev_v1", "segmented_libri_css_tfgridnet_eval_v1"] - if lm_name == "4gram" - else [], + dev_keys=( + ["segmented_libri_css_tfgridnet_dev_v1", "segmented_libri_css_tfgridnet_eval_v1"] + if lm_name == "4gram" + else [] + ), test_keys=["segmented_libri_css_tfgridnet_eval_v1"] if lm_name == "kazuki_transformer" else [], gmm_system=gmm_system, returnn_root=tools.returnn_root, diff --git a/users/berger/configs/librispeech/20230804_libri_css/config_02_conformer_hybrid_blstm_v2.py b/users/berger/configs/librispeech/20230804_libri_css/config_02_conformer_hybrid_blstm_v2.py index b38df7e6a..7bd586710 100644 --- a/users/berger/configs/librispeech/20230804_libri_css/config_02_conformer_hybrid_blstm_v2.py +++ b/users/berger/configs/librispeech/20230804_libri_css/config_02_conformer_hybrid_blstm_v2.py @@ -162,9 +162,9 @@ def run_exp() -> SummaryReport: for lm_name in ["4gram", "kazuki_transformer"]: data_per_lm[lm_name] = 
get_hybrid_data( train_key="enhanced_blstm_v1", - dev_keys=["segmented_libri_css_blstm_dev_v1", "segmented_libri_css_blstm_eval_v1"] - if lm_name == "4gram" - else [], + dev_keys=( + ["segmented_libri_css_blstm_dev_v1", "segmented_libri_css_blstm_eval_v1"] if lm_name == "4gram" else [] + ), test_keys=["segmented_libri_css_blstm_eval_v1"] if lm_name == "kazuki_transformer" else [], gmm_system=gmm_system, returnn_root=tools.returnn_root, diff --git a/users/berger/configs/librispeech/20230804_libri_css/config_03_conformer_hybrid_tfgridnet_seqtrain.py b/users/berger/configs/librispeech/20230804_libri_css/config_03_conformer_hybrid_tfgridnet_seqtrain.py index 368f768b5..02d10ba67 100644 --- a/users/berger/configs/librispeech/20230804_libri_css/config_03_conformer_hybrid_tfgridnet_seqtrain.py +++ b/users/berger/configs/librispeech/20230804_libri_css/config_03_conformer_hybrid_tfgridnet_seqtrain.py @@ -244,9 +244,11 @@ def run_exp() -> SummaryReport: for lm_name in ["4gram", "kazuki_transformer"]: data_per_lm[lm_name] = get_hybrid_data( train_key="enhanced_tfgridnet_v1", - dev_keys=["segmented_libri_css_tfgridnet_dev_v1", "segmented_libri_css_tfgridnet_eval_v1"] - if lm_name == "4gram" - else [], + dev_keys=( + ["segmented_libri_css_tfgridnet_dev_v1", "segmented_libri_css_tfgridnet_eval_v1"] + if lm_name == "4gram" + else [] + ), test_keys=["segmented_libri_css_tfgridnet_eval_v1"] if lm_name == "kazuki_transformer" else [], gmm_system=gmm_system, returnn_root=tools.returnn_root, diff --git a/users/berger/configs/sms_wsj/20220615_dfg_multi_speaker/sms_wsj_16kHz.bak/config_03_blstm_transducer.py b/users/berger/configs/sms_wsj/20220615_dfg_multi_speaker/sms_wsj_16kHz.bak/config_03_blstm_transducer.py index 9067bf3db..f4927a36c 100644 --- a/users/berger/configs/sms_wsj/20220615_dfg_multi_speaker/sms_wsj_16kHz.bak/config_03_blstm_transducer.py +++ b/users/berger/configs/sms_wsj/20220615_dfg_multi_speaker/sms_wsj_16kHz.bak/config_03_blstm_transducer.py @@ -218,21 +218,25 @@ def 
run_exp(**kwargs): "base_chunk_size": 256, "chunking_factors": {"data": 1, "classes": red_fact}, "extra_config": { - "pretrain": { - "repetitions": 6, - "construction_algo": CodeWrapper("pretrain_construction_algo"), - } - if kwargs.get("pretrain", False) - else None, - "preload_from_files": { - "base": { - "init_for_train": True, - "ignore_missing": True, - "filename": ctc_model, + "pretrain": ( + { + "repetitions": 6, + "construction_algo": CodeWrapper("pretrain_construction_algo"), } - } - if kwargs.get("ctc_init", False) - else None, + if kwargs.get("pretrain", False) + else None + ), + "preload_from_files": ( + { + "base": { + "init_for_train": True, + "ignore_missing": True, + "filename": ctc_model, + } + } + if kwargs.get("ctc_init", False) + else None + ), "train": {"reduce_target_factor": red_fact}, "dev": {"reduce_target_factor": red_fact}, }, diff --git a/users/berger/configs/sms_wsj/20220615_dfg_multi_speaker/sms_wsj_16kHz/config_03_blstm_transducer.py b/users/berger/configs/sms_wsj/20220615_dfg_multi_speaker/sms_wsj_16kHz/config_03_blstm_transducer.py index 4f18db9a4..7366a84d4 100644 --- a/users/berger/configs/sms_wsj/20220615_dfg_multi_speaker/sms_wsj_16kHz/config_03_blstm_transducer.py +++ b/users/berger/configs/sms_wsj/20220615_dfg_multi_speaker/sms_wsj_16kHz/config_03_blstm_transducer.py @@ -178,12 +178,14 @@ def run_exp(alignments: Dict[str, Any], **kwargs) -> SummaryReport: "base_chunk_size": 256, "chunking_factors": {"data": 1, "classes": red_fact}, "extra_config": { - "pretrain": { - "repetitions": 6, - "construction_algo": CodeWrapper("pretrain_construction_algo"), - } - if kwargs.get("pretrain", False) - else None, + "pretrain": ( + { + "repetitions": 6, + "construction_algo": CodeWrapper("pretrain_construction_algo"), + } + if kwargs.get("pretrain", False) + else None + ), "train": {"reduce_target_factor": red_fact}, "dev": {"reduce_target_factor": red_fact}, }, diff --git 
a/users/berger/configs/sms_wsj/20220615_dfg_multi_speaker/sms_wsj_8kHz/config_03_blstm_transducer.py b/users/berger/configs/sms_wsj/20220615_dfg_multi_speaker/sms_wsj_8kHz/config_03_blstm_transducer.py index ef429df11..a3340c710 100644 --- a/users/berger/configs/sms_wsj/20220615_dfg_multi_speaker/sms_wsj_8kHz/config_03_blstm_transducer.py +++ b/users/berger/configs/sms_wsj/20220615_dfg_multi_speaker/sms_wsj_8kHz/config_03_blstm_transducer.py @@ -175,12 +175,14 @@ def run_exp(alignments: Dict[str, Any], **kwargs) -> SummaryReport: "base_chunk_size": 256, "chunking_factors": {"data": 1, "classes": red_fact}, "extra_config": { - "pretrain": { - "repetitions": 6, - "construction_algo": CodeWrapper("pretrain_construction_algo"), - } - if kwargs.get("pretrain", False) - else None, + "pretrain": ( + { + "repetitions": 6, + "construction_algo": CodeWrapper("pretrain_construction_algo"), + } + if kwargs.get("pretrain", False) + else None + ), "train": {"reduce_target_factor": red_fact}, "dev": {"reduce_target_factor": red_fact}, }, diff --git a/users/berger/configs/sms_wsj/20220615_dfg_multi_speaker/wsj_16kHz.bak/config_03_blstm_transducer.py b/users/berger/configs/sms_wsj/20220615_dfg_multi_speaker/wsj_16kHz.bak/config_03_blstm_transducer.py index ee261aea8..61ee68a8a 100644 --- a/users/berger/configs/sms_wsj/20220615_dfg_multi_speaker/wsj_16kHz.bak/config_03_blstm_transducer.py +++ b/users/berger/configs/sms_wsj/20220615_dfg_multi_speaker/wsj_16kHz.bak/config_03_blstm_transducer.py @@ -208,21 +208,25 @@ def run_exp(**kwargs): "base_chunk_size": 256, "chunking_factors": {"data": 1, "classes": red_fact}, "extra_config": { - "pretrain": { - "repetitions": 6, - "construction_algo": CodeWrapper("pretrain_construction_algo"), - } - if kwargs.get("pretrain", False) - else None, - "preload_from_files": { - "base": { - "init_for_train": True, - "ignore_missing": True, - "filename": ctc_model, + "pretrain": ( + { + "repetitions": 6, + "construction_algo": 
CodeWrapper("pretrain_construction_algo"), } - } - if kwargs.get("ctc_init", False) - else None, + if kwargs.get("pretrain", False) + else None + ), + "preload_from_files": ( + { + "base": { + "init_for_train": True, + "ignore_missing": True, + "filename": ctc_model, + } + } + if kwargs.get("ctc_init", False) + else None + ), "train": {"reduce_target_factor": red_fact}, "dev": {"reduce_target_factor": red_fact}, }, diff --git a/users/berger/configs/sms_wsj/20220615_dfg_multi_speaker/wsj_16kHz/config_03_blstm_transducer.py b/users/berger/configs/sms_wsj/20220615_dfg_multi_speaker/wsj_16kHz/config_03_blstm_transducer.py index be1a98329..f198b0d87 100644 --- a/users/berger/configs/sms_wsj/20220615_dfg_multi_speaker/wsj_16kHz/config_03_blstm_transducer.py +++ b/users/berger/configs/sms_wsj/20220615_dfg_multi_speaker/wsj_16kHz/config_03_blstm_transducer.py @@ -176,12 +176,14 @@ def run_exp(alignments: Dict[str, Any], **kwargs) -> SummaryReport: "base_chunk_size": 256, "chunking_factors": {"data": 1, "classes": red_fact}, "extra_config": { - "pretrain": { - "repetitions": 6, - "construction_algo": CodeWrapper("pretrain_construction_algo"), - } - if kwargs.get("pretrain", False) - else None, + "pretrain": ( + { + "repetitions": 6, + "construction_algo": CodeWrapper("pretrain_construction_algo"), + } + if kwargs.get("pretrain", False) + else None + ), "train": {"reduce_target_factor": red_fact}, "dev": {"reduce_target_factor": red_fact}, }, diff --git a/users/berger/configs/sms_wsj/20220615_dfg_multi_speaker/wsj_8kHz/config_02_blstm_ctc.py b/users/berger/configs/sms_wsj/20220615_dfg_multi_speaker/wsj_8kHz/config_02_blstm_ctc.py index 32ce0678b..ae8dbeaef 100644 --- a/users/berger/configs/sms_wsj/20220615_dfg_multi_speaker/wsj_8kHz/config_02_blstm_ctc.py +++ b/users/berger/configs/sms_wsj/20220615_dfg_multi_speaker/wsj_8kHz/config_02_blstm_ctc.py @@ -66,9 +66,9 @@ def run_exp(**kwargs) -> Tuple[Dict[str, Any], SummaryReport]: train_keys=[train_key], dev_keys=[dev_key], 
test_keys=[test_key], - align_keys=[train_key, dev_key, speechsource_train_key, speechsource_dev_key] - if kwargs.get("align", False) - else [], + align_keys=( + [train_key, dev_key, speechsource_train_key, speechsource_dev_key] if kwargs.get("align", False) else [] + ), freq=frequency, lm_name="64k_3gram", recog_lex_name="nab-64k", diff --git a/users/berger/configs/sms_wsj/20220615_dfg_multi_speaker/wsj_8kHz/config_03_blstm_transducer.py b/users/berger/configs/sms_wsj/20220615_dfg_multi_speaker/wsj_8kHz/config_03_blstm_transducer.py index b56141405..aaead78bf 100644 --- a/users/berger/configs/sms_wsj/20220615_dfg_multi_speaker/wsj_8kHz/config_03_blstm_transducer.py +++ b/users/berger/configs/sms_wsj/20220615_dfg_multi_speaker/wsj_8kHz/config_03_blstm_transducer.py @@ -176,12 +176,14 @@ def run_exp(alignments: Dict[str, Any], **kwargs) -> SummaryReport: "base_chunk_size": 256, "chunking_factors": {"data": 1, "classes": red_fact}, "extra_config": { - "pretrain": { - "repetitions": 6, - "construction_algo": CodeWrapper("pretrain_construction_algo"), - } - if kwargs.get("pretrain", False) - else None, + "pretrain": ( + { + "repetitions": 6, + "construction_algo": CodeWrapper("pretrain_construction_algo"), + } + if kwargs.get("pretrain", False) + else None + ), "train": {"reduce_target_factor": red_fact}, "dev": {"reduce_target_factor": red_fact}, }, diff --git a/users/berger/configs/switchboard/20240202_phoneme_transducer_pipeline/config_02a_transducer.py b/users/berger/configs/switchboard/20240202_phoneme_transducer_pipeline/config_02a_transducer.py index 8063dd200..6194a422d 100644 --- a/users/berger/configs/switchboard/20240202_phoneme_transducer_pipeline/config_02a_transducer.py +++ b/users/berger/configs/switchboard/20240202_phoneme_transducer_pipeline/config_02a_transducer.py @@ -46,10 +46,7 @@ def generate_returnn_config( **kwargs, ) -> ReturnnConfig: if train: - ( - network_dict, - extra_python, - ) = transducer_model.make_context_1_conformer_transducer( + 
(network_dict, extra_python,) = transducer_model.make_context_1_conformer_transducer( num_outputs=num_classes, specaug_args={ "max_time_num": 1, @@ -87,10 +84,7 @@ def generate_returnn_config( loss_boost_v2=kwargs.get("loss_boost_v2", False), ) else: - ( - network_dict, - extra_python, - ) = transducer_model.make_context_1_conformer_transducer_recog( + (network_dict, extra_python,) = transducer_model.make_context_1_conformer_transducer_recog( num_outputs=num_classes, conformer_args={ "num_blocks": 12, diff --git a/users/berger/configs/switchboard/20240202_phoneme_transducer_pipeline/config_02b_transducer_wei_data.py b/users/berger/configs/switchboard/20240202_phoneme_transducer_pipeline/config_02b_transducer_wei_data.py index 99650ac74..9ec52b39d 100644 --- a/users/berger/configs/switchboard/20240202_phoneme_transducer_pipeline/config_02b_transducer_wei_data.py +++ b/users/berger/configs/switchboard/20240202_phoneme_transducer_pipeline/config_02b_transducer_wei_data.py @@ -70,10 +70,7 @@ def generate_returnn_config( } if train: - ( - network_dict, - extra_python, - ) = transducer_model.make_context_1_conformer_transducer( + (network_dict, extra_python,) = transducer_model.make_context_1_conformer_transducer( num_outputs=num_classes, specaug_args=specaug_args, conformer_args={ @@ -107,10 +104,7 @@ def generate_returnn_config( specaug_v2=specaug_v2, ) else: - ( - network_dict, - extra_python, - ) = transducer_model.make_context_1_conformer_transducer_recog( + (network_dict, extra_python,) = transducer_model.make_context_1_conformer_transducer_recog( num_outputs=num_classes, conformer_args={ "num_blocks": 12, diff --git a/users/berger/configs/switchboard/20240202_phoneme_transducer_pipeline/config_02c_transducer_wei_data_tinaconf.py b/users/berger/configs/switchboard/20240202_phoneme_transducer_pipeline/config_02c_transducer_wei_data_tinaconf.py index 4854e0857..5ad7a0e0e 100644 --- 
a/users/berger/configs/switchboard/20240202_phoneme_transducer_pipeline/config_02c_transducer_wei_data_tinaconf.py +++ b/users/berger/configs/switchboard/20240202_phoneme_transducer_pipeline/config_02c_transducer_wei_data_tinaconf.py @@ -45,10 +45,7 @@ def generate_returnn_config( **kwargs, ) -> ReturnnConfig: if train: - ( - network_dict, - extra_python, - ) = transducer_model.make_context_1_conformer_transducer( + (network_dict, extra_python,) = transducer_model.make_context_1_conformer_transducer( num_inputs=40, num_outputs=num_classes, specaug_args={ @@ -81,10 +78,7 @@ def generate_returnn_config( loss_boost_v2=kwargs.get("loss_boost_v2", False), ) else: - ( - network_dict, - extra_python, - ) = transducer_model.make_context_1_conformer_transducer_recog( + (network_dict, extra_python,) = transducer_model.make_context_1_conformer_transducer_recog( num_inputs=40, num_outputs=num_classes, decoder_args={ diff --git a/users/berger/configs/switchboard/20240202_phoneme_transducer_pipeline/config_03a_transducer_fullsum.py b/users/berger/configs/switchboard/20240202_phoneme_transducer_pipeline/config_03a_transducer_fullsum.py index c7e5bcf73..80e587212 100644 --- a/users/berger/configs/switchboard/20240202_phoneme_transducer_pipeline/config_03a_transducer_fullsum.py +++ b/users/berger/configs/switchboard/20240202_phoneme_transducer_pipeline/config_03a_transducer_fullsum.py @@ -48,10 +48,7 @@ def generate_returnn_config( **kwargs, ) -> ReturnnConfig: if train: - ( - network_dict, - extra_python, - ) = transducer_model.make_context_1_conformer_transducer_fullsum( + (network_dict, extra_python,) = transducer_model.make_context_1_conformer_transducer_fullsum( num_outputs=num_classes, specaug_args={ "max_time_num": 1, @@ -85,10 +82,7 @@ def generate_returnn_config( }, ) else: - ( - network_dict, - extra_python, - ) = transducer_model.make_context_1_conformer_transducer_recog( + (network_dict, extra_python,) = transducer_model.make_context_1_conformer_transducer_recog( 
num_outputs=num_classes, conformer_args={ "num_blocks": 12, diff --git a/users/berger/configs/switchboard/20240202_phoneme_transducer_pipeline/config_03b_transducer_fullsum_wei_data.py b/users/berger/configs/switchboard/20240202_phoneme_transducer_pipeline/config_03b_transducer_fullsum_wei_data.py index 15623ada2..6e103088c 100644 --- a/users/berger/configs/switchboard/20240202_phoneme_transducer_pipeline/config_03b_transducer_fullsum_wei_data.py +++ b/users/berger/configs/switchboard/20240202_phoneme_transducer_pipeline/config_03b_transducer_fullsum_wei_data.py @@ -47,10 +47,7 @@ def generate_returnn_config( **kwargs, ) -> ReturnnConfig: if train: - ( - network_dict, - extra_python, - ) = transducer_model.make_context_1_conformer_transducer_fullsum( + (network_dict, extra_python,) = transducer_model.make_context_1_conformer_transducer_fullsum( num_outputs=num_classes, specaug_args={ "max_time_num": 1, @@ -84,10 +81,7 @@ def generate_returnn_config( }, ) else: - ( - network_dict, - extra_python, - ) = transducer_model.make_context_1_conformer_transducer_recog( + (network_dict, extra_python,) = transducer_model.make_context_1_conformer_transducer_recog( num_outputs=num_classes, conformer_args={ "num_blocks": 12, diff --git a/users/berger/corpus/librispeech/lm_data.py b/users/berger/corpus/librispeech/lm_data.py index d2beef445..35eabaf8e 100644 --- a/users/berger/corpus/librispeech/lm_data.py +++ b/users/berger/corpus/librispeech/lm_data.py @@ -10,7 +10,7 @@ def get_lm(name: str) -> rasr_lm_config.LMData: lm_dict = {} for key, val in lbs_dataset.get_arpa_lm_dict().items(): - lm_dict[key] = rasr_lm_config.ArpaLMData(10, val) + lm_dict[key] = rasr_lm_config.ArpaLMData(scale=10, filename=val, lookahead_lm=None) kazuki_lstm_path = dependency_path.join_right("kazuki_lstmlm_27062019") lm_dict["kazuki_lstm"] = rasr_lm_config.RNNLMData( @@ -18,7 +18,9 @@ def get_lm(name: str) -> rasr_lm_config.LMData: vocab_file=kazuki_lstm_path.join_right("vocabulary"), 
model_file=returnn.Checkpoint(index_path=kazuki_lstm_path.join_right("network.040.index")), graph_file=kazuki_lstm_path.join_right("network.040.meta"), - lookahead_lm=rasr_lm_config.ArpaLMData(scale=1.0, filename=lbs_dataset.get_arpa_lm_dict()["4gram"]), + lookahead_lm=rasr_lm_config.ArpaLMData( + scale=1.0, filename=lbs_dataset.get_arpa_lm_dict()["4gram"], lookahead_lm=None + ), ) kazuki_transformer_path = dependency_path.join_right("kazuki_transformerlm_2019interspeech") @@ -27,7 +29,9 @@ def get_lm(name: str) -> rasr_lm_config.LMData: vocab_file=kazuki_transformer_path.join_right("vocabulary"), model_file=returnn.Checkpoint(index_path=kazuki_transformer_path.join_right("network.030.index")), graph_file=kazuki_transformer_path.join_right("inference.meta"), - lookahead_lm=rasr_lm_config.ArpaLMData(scale=1.0, filename=lbs_dataset.get_arpa_lm_dict()["4gram"]), + lookahead_lm=rasr_lm_config.ArpaLMData( + scale=1.0, filename=lbs_dataset.get_arpa_lm_dict()["4gram"], lookahead_lm=None + ), ) return lm_dict[name] diff --git a/users/berger/corpus/switchboard/viterbi_transducer_data.py b/users/berger/corpus/switchboard/viterbi_transducer_data.py index 1dc86e529..622c2b6b3 100644 --- a/users/berger/corpus/switchboard/viterbi_transducer_data.py +++ b/users/berger/corpus/switchboard/viterbi_transducer_data.py @@ -40,12 +40,7 @@ def get_switchboard_data( # ********** Data inputs ********** - ( - train_data_inputs, - cv_data_inputs, - dev_data_inputs, - test_data_inputs, - ) = data.get_data_inputs( + (train_data_inputs, cv_data_inputs, dev_data_inputs, test_data_inputs,) = data.get_data_inputs( train_key=train_key, cv_keys=cv_keys, dev_keys=dev_keys, diff --git a/users/berger/helpers/rasr.py b/users/berger/helpers/rasr.py index e30ebc8c9..aaaeba234 100644 --- a/users/berger/helpers/rasr.py +++ b/users/berger/helpers/rasr.py @@ -7,7 +7,7 @@ from i6_experiments.common.datasets.util import CorpusObject from sisyphus import tk -from .rasr_lm_config import LMData +from 
i6_experiments.users.berger.helpers.rasr_lm_config import ArpaLMData, LMData @dataclass @@ -99,6 +99,16 @@ class RasrDataInput: lm: Optional[LMData] = None concurrent: int = 10 + def create_lm_images(self, rasr_binary_path: tk.Path) -> None: + if self.lm is None: + return + + if isinstance(self.lm, ArpaLMData): + self.lm.create_image(rasr_binary_path=rasr_binary_path, lexicon_file=self.lexicon.filename) + + if self.lm.lookahead_lm is not None and isinstance(self.lm.lookahead_lm, ArpaLMData): + self.lm.lookahead_lm.create_image(rasr_binary_path=rasr_binary_path, lexicon_file=self.lexicon.filename) + def get_crp_for_data_input( data: RasrDataInput, @@ -114,12 +124,6 @@ def get_crp_for_data_input( data.corpus_object.corpus_file, data.concurrent ).out_segment_path - if data.lm is not None: - crp.language_model_config = data.lm.get_config(tool_paths) # type: ignore - lookahead_config = data.lm.get_lookahead_config(tool_paths) - if lookahead_config is not None: - crp.lookahead_language_model_config = lookahead_config # type: ignore - crp.lexicon_config = rasr.RasrConfig() # type: ignore crp.lexicon_config.file = data.lexicon.filename crp.lexicon_config.normalize_pronunciation = data.lexicon.normalize_pronunciation @@ -128,4 +132,12 @@ def get_crp_for_data_input( crp.acoustic_model_config.allophones.add_all = data.lexicon.add_all_allophones # type: ignore crp.acoustic_model_config.allophones.add_from_lexicon = data.lexicon.add_allophones_from_lexicon # type: ignore + if data.lm is not None: + lm_config = data.lm.get_config(tool_paths=tool_paths) # type: ignore + lookahead_lm_config = data.lm.get_lookahead_config(tool_paths=tool_paths) + + crp.language_model_config = lm_config + if lookahead_lm_config is not None: + crp.lookahead_language_model_config = lookahead_lm_config # type: ignore + return crp diff --git a/users/berger/helpers/rasr_lm_config.py b/users/berger/helpers/rasr_lm_config.py index b7a28a9a4..32f678c89 100644 --- a/users/berger/helpers/rasr_lm_config.py +++ 
b/users/berger/helpers/rasr_lm_config.py @@ -1,7 +1,9 @@ +from __future__ import annotations from abc import ABC, abstractmethod from dataclasses import dataclass from typing import Optional +from i6_core.lm.lm_image import CreateLmImageJob from sisyphus import tk from i6_core import rasr, returnn @@ -13,27 +15,45 @@ @dataclass class LMData(ABC): scale: float + lookahead_lm: Optional[LMData] @abstractmethod - def get_config(self, tool_paths: ToolPaths) -> rasr.RasrConfig: + def get_config(self, **kwargs) -> rasr.RasrConfig: ... - def get_lookahead_config(self, tool_paths: ToolPaths) -> Optional[rasr.RasrConfig]: - return None + def get_lookahead_config(self, **kwargs) -> Optional[rasr.RasrConfig]: + if self.lookahead_lm is None: + return None + return self.lookahead_lm.get_config(**kwargs) @dataclass class ArpaLMData(LMData): filename: tk.Path + image: Optional[tk.Path] = None - def get_config(self, tool_paths: ToolPaths) -> rasr.RasrConfig: + def get_config(self, **_) -> rasr.RasrConfig: config = rasr.RasrConfig() config.type = "ARPA" config.file = self.filename config.scale = self.scale + if self.image is not None: + config.image = self.image return config + def create_image(self, rasr_binary_path: tk.Path, lexicon_file: tk.Path) -> None: + if self.image is not None: + return + + lm_crp = rasr.CommonRasrParameters() + lm_crp.set_executables(rasr_binary_path=rasr_binary_path) + rasr.crp_add_default_output(lm_crp) + lm_crp.lexicon_config = rasr.RasrConfig() + lm_crp.lexicon_config.file = lexicon_file + lm_crp.language_model_config = self.get_config() + self.image = CreateLmImageJob(crp=lm_crp, mem=8).out_image + @dataclass class NNLMData(LMData, ABC): @@ -42,9 +62,8 @@ class NNLMData(LMData, ABC): returnn_config: Optional[returnn.ReturnnConfig] = None graph_file: Optional[tk.Path] = None unknown_word: str = "" - lookahead_lm: Optional[LMData] = None - def _get_graph(self, tool_paths: ToolPaths) -> tk.Path: + def _get_graph(self, tool_paths: ToolPaths, **_) -> 
tk.Path: if self.graph_file is not None: return self.graph_file assert self.returnn_config is not None, "Must specify either a graph .meta file or a returnn config" @@ -55,7 +74,7 @@ def _get_graph(self, tool_paths: ToolPaths) -> tk.Path: blas_lib=tool_paths.blas_lib, ).out_graph - def get_config(self, tool_paths: ToolPaths) -> rasr.RasrConfig: + def get_config(self, tool_paths: ToolPaths, **_) -> rasr.RasrConfig: config = rasr.RasrConfig() config.scale = self.scale config.vocab_file = self.vocab_file @@ -76,11 +95,6 @@ def get_config(self, tool_paths: ToolPaths) -> rasr.RasrConfig: return config - def get_lookahead_config(self, tool_paths: ToolPaths) -> Optional[rasr.RasrConfig]: - if self.lookahead_lm is None: - return None - return self.lookahead_lm.get_config(tool_paths) - @dataclass class RNNLMData(NNLMData): @@ -89,7 +103,7 @@ class RNNLMData(NNLMData): opt_batch_size: int = 64 allow_reduced_history: bool = True - def get_config(self, tool_paths: ToolPaths) -> rasr.RasrConfig: + def get_config(self, tool_paths: ToolPaths, **_) -> rasr.RasrConfig: config = super().get_config(tool_paths) config.type = "tfrnn" @@ -106,7 +120,7 @@ def get_config(self, tool_paths: ToolPaths) -> rasr.RasrConfig: class TransformerLMData(NNLMData): max_batch_size: int = 64 - def get_config(self, tool_paths: ToolPaths) -> rasr.RasrConfig: + def get_config(self, tool_paths: ToolPaths, **_) -> rasr.RasrConfig: config = super().get_config(tool_paths) config.type = "simple-transformer" diff --git a/users/berger/network/helpers/label_context.py b/users/berger/network/helpers/label_context.py index 8214634b7..bb66e332c 100644 --- a/users/berger/network/helpers/label_context.py +++ b/users/berger/network/helpers/label_context.py @@ -1,4 +1,5 @@ from typing import Dict, List, Optional, Tuple +from i6_core.returnn import CodeWrapper from i6_experiments.users.berger.network.helpers.mlp import add_feed_forward_stack from i6_experiments.users.berger.network.helpers.output import 
add_softmax_output from i6_experiments.users.berger.network.helpers.compressed_input import ( @@ -11,6 +12,7 @@ class ILMMode(Enum): ZeroEnc = auto() + ZeroEncInclBlank = auto() def add_context_label_sequence_blank( @@ -18,7 +20,6 @@ def add_context_label_sequence_blank( base_labels: str = "data:classes", blank_index: int = 0, ) -> Tuple[str, str]: - # Example: # Classes = ..AB..C.D. # Then base labels should be ABCD @@ -51,7 +52,6 @@ def add_context_label_sequence_noblank( nonword_labels: List[int], base_labels: str = "data:classes", ): - # Example: # Classes = AAABBCCCDD # Let A be a nonword label, e.g. silence @@ -192,7 +192,6 @@ def add_context_1_decoder( combination_mode: Optional[str] = "add", output_args: Dict = {}, ) -> Tuple[List[str], Dict]: - output_unit = {} decoder_ff = add_dec_ffnn_stack(output_unit, context_labels, embedding_size, dec_mlp_args) @@ -269,10 +268,9 @@ def add_context_1_decoder_recog( dec_mlp_args: Dict = {}, joint_mlp_args: Dict = {}, ilm_scale: float = 0.0, - ilm_mode: ILMMode = ILMMode.ZeroEnc, + ilm_mode: ILMMode = ILMMode.ZeroEncInclBlank, combination_mode: Optional[str] = "add", ): - output_unit = {} output_unit["output_choice"] = { @@ -324,7 +322,7 @@ def add_context_1_decoder_recog( } if ilm_scale: - if ilm_mode == ILMMode.ZeroEnc: + if ilm_mode == ILMMode.ZeroEnc or ilm_mode == ILMMode.ZeroEncInclBlank: output_unit["zero_enc"] = {"class": "eval", "from": "data:source", "eval": "source(0) * 0"} joint_input_ilm = ["zero_enc", "decoder"] else: @@ -352,21 +350,25 @@ def add_context_1_decoder_recog( "reuse_params": "output", } - assert blank_idx == 0, "Blank idx != 0 not implemented for ilm" - # Set p(blank) = 1 and re-normalize the non-blank probs - # so we want P'[b, 0] = 1, sum(P'[b, 1:]) = 1, given a normalized tensor P, i.e. 
sum(P[b, :]) = 1 - # in log space logP'[b, 0] = 0, sum(exp(logP'[b, 1:])) = 1 - # so set logP'[b, 1:] <- logP[b, 1:] - log(1 - exp(P[b, 0])) - # then sum(exp(logP'[b, 1:])) = sum(P[1:] / (1 - exp(P[b, 0]))) = sum(P[b, 1:]) / sum(b, P[1:]) = 1 - output_unit["ilm_renorm"] = { - "class": "eval", - "from": ["ilm"], - "eval": "tf.concat([tf.zeros(tf.shape(source(0)[:, :1])), source(0)[:, 1:] - tf.math.log(1.0 - tf.exp(source(0)[:, :1]))], axis=-1)", - } + if ilm_mode == ILMMode.ZeroEncInclBlank: + ilm_layer = "ilm" + else: + assert blank_idx == 0, "Blank idx != 0 not implemented for ilm" + # Set p(blank) = 1 and re-normalize the non-blank probs + # so we want P'[b, 0] = 1, sum(P'[b, 1:]) = 1, given a normalized tensor P, i.e. sum(P[b, :]) = 1 + # in log space logP'[b, 0] = 0, sum(exp(logP'[b, 1:])) = 1 + # so set logP'[b, 1:] <- logP[b, 1:] - log(1 - exp(P[b, 0])) + # then sum(exp(logP'[b, 1:])) = sum(P[1:] / (1 - exp(P[b, 0]))) = sum(P[b, 1:]) / sum(b, P[1:]) = 1 + output_unit["ilm_renorm"] = { + "class": "eval", + "from": ["ilm"], + "eval": "tf.concat([tf.zeros(tf.shape(source(0)[:, :1])), source(0)[:, 1:] - tf.math.log(1.0 - tf.exp(source(0)[:, :1]))], axis=-1)", + } + ilm_layer = "ilm_renorm" output_unit["output_sub_ilm"] = { "class": "eval", - "from": ["output", "ilm_renorm"], + "from": ["output", ilm_layer], "eval": f"source(0) - {ilm_scale} * source(1)", } @@ -382,6 +384,183 @@ def add_context_1_decoder_recog( return joint_output, output_unit +def add_precomputed_context_1_decoder_recog( + network: Dict, + num_outputs: int, + blank_idx: int = 0, + encoder: str = "encoder", + embedding_size: int = 128, + dec_mlp_args: Dict = {}, + joint_mlp_args: Dict = {}, + ilm_scale: float = 0.0, + ilm_mode: ILMMode = ILMMode.ZeroEncInclBlank, + combination_mode: Optional[str] = "add", +): + output_unit = {} + + # [V-1] + output_unit["all_context"] = { + "class": "constant", + "value": list(range(1, num_outputs)), # first index: out-of-bounds for all-zero-embedding of init 
history + "dtype": "int32", + "with_batch_dim": True, + "as_batch": True, + "sparse_dim": CodeWrapper(f"FeatureDim('label', dimension={num_outputs})"), + } + + # [V-1, F] + output_unit["context_embedding"] = { + "class": "linear", + "from": "all_context", + "n_out": embedding_size, + "with_bias": False, + "initial_output": None, + } + + # [V, F] + output_unit["context_embedding_padded"] = { + "class": "pad", + "from": "context_embedding", + "axes": "B", + "padding": (1, 0), + "value": 0, + "mode": "constant", + } + + # [V, D] + decoder_ff = add_feed_forward_stack( + output_unit, from_list="context_embedding_padded", name="dec_ff", **dec_mlp_args + ) + output_unit["decoder"] = { + "class": "copy", + "from": decoder_ff, + } + + # [V, T, E] + output_unit["tile_encoder"] = { + "class": "eval", + "from": f"base:base:{encoder}", + "eval": f"tf.tile(source(0), [{num_outputs}, 1, 1])", + } + + joint_input = ["tile_encoder", "decoder"] + if combination_mode is None or combination_mode == "concat": + # [V, T, E+D] + output_unit["joint_input"] = { + "class": "copy", + "from": joint_input, + } + else: + # [V, T, E] + output_unit["joint_input"] = { + "class": "combine", + "from": joint_input, + "kind": combination_mode, + } + + # [V, T, J] + joint_output = add_feed_forward_stack(output_unit, from_list="joint_input", name="joint_ff", **joint_mlp_args) + + # [V, T, V] + output_unit["output"] = { + "class": "linear", + "from": joint_output, + "activation": "log_softmax", + "n_out": num_outputs, + } + + if ilm_scale: + if ilm_mode == ILMMode.ZeroEnc or ilm_mode == ILMMode.ZeroEncInclBlank: + # [V, T, E] + output_unit["zero_enc"] = {"class": "eval", "from": "tile_encoder", "eval": "source(0) * 0"} + # [V, T, E+D] + joint_input_ilm = ["zero_enc", "decoder"] + else: + raise NotImplementedError + if combination_mode is None or combination_mode == "concat": + # [V, T, E+D] + output_unit["joint_input_ilm"] = { + "class": "copy", + "from": joint_input_ilm, + } + else: + # [V, T, E] + 
output_unit["joint_input_ilm"] = { + "class": "combine", + "from": joint_input_ilm, + "kind": combination_mode, + } + + # [V, T, J] + joint_output_ilm = add_feed_forward_stack( + output_unit, from_list="joint_input_ilm", name="joint_ff_ilm", reuse_from_name="joint_ff", **joint_mlp_args + ) + + # [V, T, V] + output_unit["ilm"] = { + "class": "linear", + "from": joint_output_ilm, + "activation": "log_softmax", + "n_out": num_outputs, + "reuse_params": "output", + } + + if ilm_mode == ILMMode.ZeroEncInclBlank: + ilm_layer = "ilm" + else: + assert blank_idx == 0, "Blank idx != 0 not implemented for ilm" + # Set p(blank) = 1 and re-normalize the non-blank probs + # so we want P'[b, 0] = 1, sum(P'[b, 1:]) = 1, given a normalized tensor P, i.e. sum(P[b, :]) = 1 + # in log space logP'[b, 0] = 0, sum(exp(logP'[b, 1:])) = 1 + # so set logP'[b, 1:] <- logP[b, 1:] - log(1 - exp(P[b, 0])) + # then sum(exp(logP'[b, 1:])) = sum(P[1:] / (1 - exp(P[b, 0]))) = sum(P[b, 1:]) / sum(b, P[1:]) = 1 + output_unit["ilm_renorm"] = { + "class": "eval", + "from": "ilm", + "eval": "tf.concat([tf.zeros(tf.shape(source(0)[:, :1])), source(0)[:, 1:] - tf.math.log(1.0 - tf.exp(source(0)[:, :1]))], axis=-1)", + } + ilm_layer = "ilm_renorm" + + # [V, T, V] + output_unit["output_sub_ilm"] = { + "class": "eval", + "from": ["output", ilm_layer], + "eval": f"source(0) - {ilm_scale} * source(1)", + "is_output_layer": True, + } + + out_layer = "output/rec/output_sub_ilm" + else: + out_layer = "output" + + network["output"] = { + "class": "subnetwork", + "from": encoder, + "subnetwork": { + "rec": { + "class": "subnetwork", + "from": "data", + "subnetwork": output_unit, + }, + "output": { + "class": "copy", + "from": "rec", + }, + }, + "is_output_layer": False, + } + + network["output_precompute"] = { + "class": "eval", + "from": out_layer, + "eval": f"tf.transpose(tf.reshape(tf.transpose(source(0, auto_convert=False, enforce_batch_major=True), [0, 2, 1]), [1, {num_outputs*num_outputs}, -1]), [0, 2, 1])", 
+ "is_output_layer": True, + "out_type": {"shape": (None, num_outputs * num_outputs), "dim": num_outputs * num_outputs}, + } + + return output_unit + + def add_context_1_decoder_fullsum( network: Dict, context_labels: str, @@ -392,7 +571,6 @@ def add_context_1_decoder_fullsum( combination_mode: Optional[str] = "add", compress_joint_input: bool = True, ) -> Tuple[List[str], Dict, List]: - output_unit = {} extra_python = [] diff --git a/users/berger/network/models/context_1_transducer.py b/users/berger/network/models/context_1_transducer.py index de26d3805..38cd61d42 100644 --- a/users/berger/network/models/context_1_transducer.py +++ b/users/berger/network/models/context_1_transducer.py @@ -155,11 +155,7 @@ def make_context_1_conformer_transducer_fullsum( } context_labels = "pred_labels_int32" - ( - joint_output, - decoder_unit, - decoder_python, - ) = label_context.add_context_1_decoder_fullsum( + (joint_output, decoder_unit, decoder_python,) = label_context.add_context_1_decoder_fullsum( network, context_labels=context_labels, encoder="encoder", @@ -226,6 +222,29 @@ def make_context_1_conformer_transducer_recog( return network, python_code +def make_context_1_conformer_transducer_precomputed_recog( + num_outputs: int, + vgg_args: Dict = {}, + conformer_args: Dict = {}, + decoder_args: Dict = {}, +) -> Tuple[Dict, List]: + network = {} + python_code = [] + + from_list = ["data"] + + from_list = add_initial_conv(network, from_list, **vgg_args) + from_list, _ = add_conformer_stack(network, from_list, **conformer_args) + + network["encoder"] = {"class": "copy", "from": from_list} + + label_context.add_precomputed_context_1_decoder_recog( + network, num_outputs=num_outputs, encoder="encoder", **decoder_args + ) + + return network, python_code + + def make_context_1_blstm_transducer( num_outputs: int, blank_index: int = 0, @@ -342,11 +361,7 @@ def make_context_1_blstm_transducer_fullsum( } context_labels = "pred_labels_int32" - ( - joint_output, - decoder_unit, - 
decoder_python, - ) = label_context.add_context_1_decoder_fullsum( + (joint_output, decoder_unit, decoder_python,) = label_context.add_context_1_decoder_fullsum( network, context_labels=context_labels, encoder="encoder", diff --git a/users/berger/recipe/rasr/label_tree_and_scorer.py b/users/berger/recipe/rasr/label_tree_and_scorer.py index 88a6d4fef..62b7cf942 100644 --- a/users/berger/recipe/rasr/label_tree_and_scorer.py +++ b/users/berger/recipe/rasr/label_tree_and_scorer.py @@ -149,7 +149,8 @@ def extra_args(self): return { key: val for key, val in self.config._items() - if key not in [ + if key + not in [ "label-scorer-type", "scale", "label-file", diff --git a/users/berger/recipe/recognition/__init__.py b/users/berger/recipe/recognition/__init__.py index d115ab2e9..7d55d439b 100644 --- a/users/berger/recipe/recognition/__init__.py +++ b/users/berger/recipe/recognition/__init__.py @@ -1,4 +1,5 @@ from .generic_seq2seq_search import * +from .generic_seq2seq_search_v2 import * from .label_sync_search import * from .scoring import * from .statistics import * diff --git a/users/berger/recipe/recognition/generic_seq2seq_search_v2.py b/users/berger/recipe/recognition/generic_seq2seq_search_v2.py new file mode 100644 index 000000000..c469f615d --- /dev/null +++ b/users/berger/recipe/recognition/generic_seq2seq_search_v2.py @@ -0,0 +1,444 @@ +__all__ = ["BuildGenericSeq2SeqGlobalCacheJob", "GenericSeq2SeqSearchJobV2"] + +from typing import List, Optional, Tuple +from sisyphus import * + +assert __package__ is not None +Path = setup_path(__package__) + +import shutil + +from i6_core import rasr, util +from i6_experiments.users.berger.recipe.rasr.label_tree_and_scorer import LabelTree, LabelScorer + + +class BuildGenericSeq2SeqGlobalCacheJob(rasr.RasrCommand, Job): + """ + Standalone job to create the global-cache for generic-seq2seq-tree-search + """ + + def __init__( + self, + crp: rasr.CommonRasrParameters, + label_tree: LabelTree, + label_scorer: LabelScorer, + 
search_parameters: Optional[dict] = None, + extra_config: Optional[rasr.RasrConfig] = None, + extra_post_config: Optional[rasr.RasrConfig] = None, + ): + """ + :param crp: common RASR params (required: lexicon, acoustic_model, language_model, recognizer) + :param label_tree: label tree object for structuring the search tree + :param label_scorer: label scorer object for score computation + :param extra_config: overlay config that influences the Job's hash + :param extra_post_config: overlay config that does not influences the Job's hash + """ + self.set_vis_name("Build Global Cache") + + self.config, self.post_config = BuildGenericSeq2SeqGlobalCacheJob.create_config( + crp=crp, + label_tree=label_tree, + label_scorer=label_scorer, + search_parameters=search_parameters, + extra_config=extra_config, + extra_post_config=extra_post_config, + ) + + self.exe = self.select_exe(crp.flf_tool_exe, "flf-tool") + + self.out_log_file = self.log_file_output_path("build_global_cache", crp, False) + self.out_global_cache = self.output_path("global.cache", cached=True) + + self.rqmt = {"time": 1, "cpu": 1, "mem": 4} + + def tasks(self): + yield Task("create_files", mini_task=True) + yield Task("run", resume="run", rqmt=self.rqmt) + + def create_files(self): + self.write_config(self.config, self.post_config, "build_global_cache.config") + with open("dummy.corpus", "wt") as f: + f.write('\n') + with open("dummy.flow", "wt") as f: + f.write(f'\n') + extra_code = ( + ":${THEANO_FLAGS:=" + '}\nexport THEANO_FLAGS="$THEANO_FLAGS,device=cpu,force_device=True"\nexport TF_DEVICE="cpu"' + ) + self.write_run_script(self.exe, "build_global_cache.config", extra_code=extra_code) + + def run(self): + self.run_script(1, self.out_log_file) + shutil.move("global.cache", self.out_global_cache.get_path()) + + @classmethod + def create_config( + cls, + crp: rasr.CommonRasrParameters, + label_tree: LabelTree, + label_scorer: LabelScorer, + search_parameters: Optional[dict], + extra_config: 
Optional[rasr.RasrConfig], + extra_post_config: Optional[rasr.RasrConfig], + ): + config, post_config = rasr.build_config_from_mapping( + crp, + { + "lexicon": "flf-lattice-tool.lexicon", + "acoustic_model": "flf-lattice-tool.network.recognizer.acoustic-model", + "language_model": "flf-lattice-tool.network.recognizer.lm", + "lookahead_language_model": "flf-lattice-tool.network.recognizer.recognizer.lookahead-lm", + }, + ) + + # Apply config from label tree + label_tree.apply_config( + "flf-lattice-tool.network.recognizer.recognizer.label-tree", + config, + post_config, + ) + + # Optional lexicon overwrite + if label_tree.lexicon_config is not None: + config["flf-lattice-tool.lexicon"]._update(label_tree.lexicon_config) + + # Apply config from label scorer and eliminate unnecessary arguments that don't affect the search space (scale, prior) + label_scorer_reduced = LabelScorer( + scorer_type=label_scorer.scorer_type, + scale=1.0, + label_file=label_scorer.label_file, + num_classes=label_scorer.num_classes, + use_prior=False, + extra_args={key: val for key, val in label_scorer.extra_args.items() if key != "first-order"}, + ) + + label_scorer_reduced.apply_config("flf-lattice-tool.network.recognizer.label-scorer", config, post_config) + + # search settings # + search_config = rasr.RasrConfig() + if search_parameters is not None: + for key, val in search_parameters.items(): + search_config[key.replace("_", "-")] = val + + config.flf_lattice_tool.network.recognizer.recognizer._update(search_config) + + # flf network # + config.flf_lattice_tool.network.initial_nodes = "segment" + config.flf_lattice_tool.network.segment.type = "speech-segment" + config.flf_lattice_tool.network.segment.links = "1->recognizer:1" + config.flf_lattice_tool.corpus.file = "dummy.corpus" + config.flf_lattice_tool.network.recognizer.type = "recognizer" + config.flf_lattice_tool.network.recognizer.links = "sink" + config.flf_lattice_tool.network.recognizer.apply_non_word_closure_filter = False + 
config.flf_lattice_tool.network.recognizer.add_confidence_score = False + config.flf_lattice_tool.network.recognizer.apply_posterior_pruning = False + config.flf_lattice_tool.network.recognizer.search_type = "generic-seq2seq-tree-search" + config.flf_lattice_tool.network.recognizer.feature_extraction.file = "dummy.flow" + config.flf_lattice_tool.network.sink.type = "sink" + post_config.flf_lattice_tool.network.sink.warn_on_empty_lattice = True + post_config.flf_lattice_tool.network.sink.error_on_empty_lattice = False + + # skip conventional AM or load it without GMM # + if crp.acoustic_model_config is None: + config.flf_lattice_tool.network.recognizer.use_acoustic_model = False + else: + config.flf_lattice_tool.network.recognizer.use_mixture = False + if config.flf_lattice_tool.network.recognizer.acoustic_model._get("length") is not None: + del config.flf_lattice_tool.network.recognizer.acoustic_model["length"] + + # disable scaling + if config.flf_lattice_tool.network.recognizer.lm._get("scale") is not None: + del config.flf_lattice_tool.network.recognizer.lm["scale"] + + if config.flf_lattice_tool.network.recognizer.recognizer._get("lookahead-lm") is not None: + del config.flf_lattice_tool.network.recognizer.recognizer.lookahead_lm["scale"] + + config.flf_lattice_tool.network.recognition_mode = "init-only" + config.flf_lattice_tool.network.search_type = "generic-seq2seq-tree-search" + + config.flf_lattice_tool.global_cache.file = "global.cache" + config.flf_lattice_tool.global_cache.read_only = False + + config._update(extra_config) + post_config._update(extra_post_config) + + return config, post_config + + @classmethod + def hash(cls, kwargs): + config, _ = cls.create_config(**kwargs) + return super().hash({"config": config, "exe": kwargs["crp"].speech_recognizer_exe}) + + +class GenericSeq2SeqSearchJobV2(rasr.RasrCommand, Job): + __sis_hash_exclude__ = {"num_threads": None} + + def __init__( + self, + crp: rasr.CommonRasrParameters, + feature_flow: 
rasr.FlowNetwork, + label_tree: LabelTree, + label_scorer: LabelScorer, + rasr_exe: Optional[tk.Path] = None, + search_parameters: Optional[dict] = None, + lm_lookahead: bool = True, + lookahead_options: Optional[dict] = None, + eval_single_best: bool = True, + eval_best_in_lattice: bool = True, + use_gpu: bool = False, + global_cache: Optional[tk.Path] = None, + rtf: float = 2, + mem: float = 8, + extra_config: Optional[rasr.RasrConfig] = None, + extra_post_config: Optional[rasr.RasrConfig] = None, + num_threads: int = 2, + ): + self.set_vis_name("Generic Seq2Seq Search") + + self.config, self.post_config = GenericSeq2SeqSearchJobV2.create_config( + crp=crp, + feature_flow=feature_flow, + label_tree=label_tree, + label_scorer=label_scorer, + search_parameters=search_parameters, + lm_lookahead=lm_lookahead, + lookahead_options=lookahead_options, + eval_single_best=eval_single_best, + eval_best_in_lattice=eval_best_in_lattice, + extra_config=extra_config, + extra_post_config=extra_post_config, + global_cache=global_cache, + ) + self.feature_flow = feature_flow + if rasr_exe is not None: + self.rasr_exe = rasr_exe + else: + self.rasr_exe = crp.flf_tool_exe + assert self.rasr_exe is not None + + self.concurrent = crp.concurrent + self.use_gpu = use_gpu + self.num_threads = num_threads + + self.out_log_file = self.log_file_output_path("search", crp, True) + + self.out_single_lattice_caches = dict( + (task_id, self.output_path("lattice.cache.%d" % task_id, cached=True)) + for task_id in range(1, crp.concurrent + 1) + ) + self.out_lattice_bundle = self.output_path("lattice.bundle", cached=True) + self.out_lattice_path = util.MultiOutputPath( + self, "lattice.cache.$(TASK)", self.out_single_lattice_caches, cached=True + ) + + self.rqmt = { + "time": max(crp.corpus_duration * rtf / crp.concurrent, 24), + "cpu": num_threads, + "gpu": 1 if self.use_gpu else 0, + "mem": mem, + } + + def tasks(self): + yield Task("create_files", mini_task=True) + yield Task("run", 
resume="run", rqmt=self.rqmt, args=range(1, self.concurrent + 1)) + + def create_files(self): + self.write_config(self.config, self.post_config, "recognition.config") + self.feature_flow.write_to_file("feature.flow") + util.write_paths_to_file(self.out_lattice_bundle, self.out_single_lattice_caches.values()) + extra_code = 'export TF_DEVICE="{0}"'.format("gpu" if self.use_gpu else "cpu") + # sometimes crash without this + if not self.use_gpu: + extra_code += "\nexport CUDA_VISIBLE_DEVICES=" + + extra_code += f"\nexport OMP_NUM_THREADS={self.num_threads}" + extra_code += f"\nexport MKL_NUM_THREADS={self.num_threads}" + self.write_run_script(self.rasr_exe, "recognition.config", extra_code=extra_code) + + def stop_run(self, task_id): + print(f"run job {task_id} exceeds specified rqmt and stopped") + + def run(self, task_id): + self.run_script(task_id, self.out_log_file[task_id]) + shutil.move( + "lattice.cache.%d" % task_id, + self.out_single_lattice_caches[task_id].get_path(), + ) + + def cleanup_before_run(self, cmd, retry, task_id, *args): + util.backup_if_exists(f"recognition.log.{task_id}") + util.delete_if_exists(f"lattice.cache.{task_id}") + + @classmethod + def create_config( + cls, + crp: rasr.CommonRasrParameters, + feature_flow: rasr.FlowNetwork, + label_tree: LabelTree, + label_scorer: LabelScorer, + search_parameters: Optional[dict] = None, + lm_lookahead: bool = True, + lookahead_options: Optional[dict] = None, + eval_single_best: bool = True, + eval_best_in_lattice: bool = True, + extra_config: Optional[rasr.RasrConfig] = None, + extra_post_config: Optional[rasr.RasrConfig] = None, + global_cache: Optional[tk.Path] = None, + **_, + ): + # get config from csp # + config, post_config = rasr.build_config_from_mapping( + crp, + { + "corpus": "flf-lattice-tool.corpus", + "lexicon": "flf-lattice-tool.lexicon", + "acoustic_model": "flf-lattice-tool.network.recognizer.acoustic-model", + "language_model": "flf-lattice-tool.network.recognizer.lm", + 
"lookahead_language_model": "flf-lattice-tool.network.recognizer.recognizer.lookahead-lm", + }, + parallelize=True, + ) + + # acoustic model maybe used for allophones and state-tying, but no mixture is needed + # skip conventional AM or load it without GMM + if crp.acoustic_model_config is None: + config.flf_lattice_tool.network.recognizer.use_acoustic_model = False + else: + config.flf_lattice_tool.network.recognizer.use_mixture = False + + # feature flow # + config.flf_lattice_tool.network.recognizer.feature_extraction.file = "feature.flow" + if feature_flow.outputs != {"features"}: + assert len(feature_flow.outputs) == 1, "not implemented otherwise" + config.flf_lattice_tool.network.recognizer.feature_extraction.main_port_name = next( + iter(feature_flow.outputs) + ) + + feature_flow.apply_config( + "flf-lattice-tool.network.recognizer.feature-extraction", + config, + post_config, + ) + + # label tree and optional lexicon overwrite + label_tree.apply_config( + "flf-lattice-tool.network.recognizer.recognizer.label-tree", + config, + post_config, + ) + if label_tree.lexicon_config is not None: + config["flf-lattice-tool.lexicon"]._update(label_tree.lexicon_config) + + # label scorer + label_scorer.apply_config("flf-lattice-tool.network.recognizer.label-scorer", config, post_config) + + # search settings # + search_config = rasr.RasrConfig() + if search_parameters is not None: + for key, val in search_parameters.items(): + search_config[key.replace("_", "-")] = val + + config.flf_lattice_tool.network.recognizer.recognizer._update(search_config) + + # lookahead settings # + la_opts = { + "history_limit": 1, + "cache_low": 2000, + "cache_high": 3000, + } + if lookahead_options is not None: + la_opts.update(lookahead_options) + + config.flf_lattice_tool.network.recognizer.recognizer.optimize_lattice = True + + la_config = rasr.RasrConfig() + la_config._value = lm_lookahead + + if "laziness" in la_opts: + 
config.flf_lattice_tool.network.recognizer.recognizer.lm_lookahead_laziness = la_opts["laziness"] + + if lm_lookahead: + if "history_limit" in la_opts: + la_config.history_limit = la_opts["history_limit"] + if "tree_cutoff" in la_opts: + la_config.tree_cutoff = la_opts["tree_cutoff"] + if "minimum_representation" in la_opts: + la_config.minimum_representation = la_opts["minimum_representation"] + if "lm_lookahead_scale" in la_opts: + la_config.lm_lookahead_scale = la_opts["lm_lookahead_scale"] + if "cache_low" in la_opts: + post_config.flf_lattice_tool.network.recognizer.recognizer.lm_lookahead.cache_size_low = la_opts[ + "cache_low" + ] + if "cache_high" in la_opts: + post_config.flf_lattice_tool.network.recognizer.recognizer.lm_lookahead.cache_size_high = la_opts[ + "cache_high" + ] + + config.flf_lattice_tool.network.recognizer.recognizer.lm_lookahead = la_config + + # flf network # + config.flf_lattice_tool.network.initial_nodes = "segment" + config.flf_lattice_tool.network.segment.type = "speech-segment" + config.flf_lattice_tool.network.segment.links = "1->recognizer:1 0->archive-writer:1 0->evaluator:1" + + config.flf_lattice_tool.network.recognizer.type = "recognizer" + config.flf_lattice_tool.network.recognizer.search_type = "generic-seq2seq-tree-search" + config.flf_lattice_tool.network.recognizer.apply_non_word_closure_filter = False + config.flf_lattice_tool.network.recognizer.add_confidence_score = False + config.flf_lattice_tool.network.recognizer.apply_posterior_pruning = False + + if label_scorer.config.label_unit == "hmm": + config.flf_lattice_tool.network.recognizer.links = "expand" + config.flf_lattice_tool.network.expand.type = "expand-transits" + config.flf_lattice_tool.network.expand.links = "evaluator archive-writer" + else: + config.flf_lattice_tool.network.recognizer.links = "evaluator archive-writer" + + config.flf_lattice_tool.network.evaluator.type = "evaluator" + config.flf_lattice_tool.network.evaluator.links = "sink:0" + 
config.flf_lattice_tool.network.evaluator.word_errors = True + config.flf_lattice_tool.network.evaluator.single_best = eval_single_best + config.flf_lattice_tool.network.evaluator.best_in_lattice = eval_best_in_lattice + config.flf_lattice_tool.network.evaluator.edit_distance.format = "bliss" + config.flf_lattice_tool.network.evaluator.edit_distance.allow_broken_words = False + + config.flf_lattice_tool.network.archive_writer.type = "archive-writer" + config.flf_lattice_tool.network.archive_writer.links = "sink:1" + config.flf_lattice_tool.network.archive_writer.format = "flf" + config.flf_lattice_tool.network.archive_writer.path = "lattice.cache.$(TASK)" + post_config.flf_lattice_tool.network.archive_writer.info = True + + config.flf_lattice_tool.network.sink.type = "sink" + post_config.flf_lattice_tool.network.sink.warn_on_empty_lattice = True + post_config.flf_lattice_tool.network.sink.error_on_empty_lattice = False + post_config["*"].output_channel.unbuffered = True + + if global_cache is None: + global_cache = BuildGenericSeq2SeqGlobalCacheJob( + crp=crp, label_tree=label_tree, label_scorer=label_scorer + ).out_global_cache + + post_config.flf_lattice_tool.global_cache.read_only = True + post_config.flf_lattice_tool.global_cache.file = global_cache + + # update parameters # + config._update(extra_config) + post_config._update(extra_post_config) + + return config, post_config + + @classmethod + def hash(cls, kwargs): + config, _ = cls.create_config(**kwargs) + if kwargs["rasr_exe"] is not None: + rasr_exe = kwargs["rasr_exe"] + else: + rasr_exe = kwargs["crp"].flf_tool_exe + return super().hash( + { + "config": config, + "feature_flow": kwargs["feature_flow"], + "exe": rasr_exe, + } + ) diff --git a/users/berger/settings.py b/users/berger/settings.py index 9744555d6..1cb0369a1 100644 --- a/users/berger/settings.py +++ b/users/berger/settings.py @@ -23,9 +23,7 @@ def check_engine_limits(current_rqmt, task): i6 support for gpu_mem """ current_rqmt["time"] = 
min(168, current_rqmt.get("time", 2)) - if current_rqmt.get("gpu", 0) > 0 and "-p" not in current_rqmt.get( - "sbatch_args", [] - ): + if current_rqmt.get("gpu", 0) > 0 and "-p" not in current_rqmt.get("sbatch_args", []): if current_rqmt.get("gpu_mem", 0) > 11: current_rqmt["sbatch_args"] = ["-p", "gpu_24gb"] else: @@ -54,9 +52,7 @@ def engine(): return EngineSelector( engines={ "short": LocalEngine(cpus=4, mem=16), - "long": SimpleLinuxUtilityForResourceManagementEngine( - default_rqmt=default_rqmt - ), + "long": SimpleLinuxUtilityForResourceManagementEngine(default_rqmt=default_rqmt), }, default_engine="long", ) @@ -79,6 +75,7 @@ def worker_wrapper(job, task_name, call): "AdvancedTreeSearchJob", "AdvancedTreeSearchLmImageAndGlobalCacheJob", "GenericSeq2SeqSearchJob", + "GenericSeq2SeqSearchJobV2", "CreateLmImageJob", "BuildGenericSeq2SeqGlobalCacheJob", "GenericSeq2SeqLmImageAndGlobalCacheJob", diff --git a/users/berger/systems/base_system.py b/users/berger/systems/base_system.py index 73450cc71..1351a955d 100644 --- a/users/berger/systems/base_system.py +++ b/users/berger/systems/base_system.py @@ -59,7 +59,8 @@ def __init__( self._functors = self._initialize_functors() @abstractmethod - def _initialize_functors(self) -> Functors[types.TrainJobType, types.ConfigType]: ... + def _initialize_functors(self) -> Functors[types.TrainJobType, types.ConfigType]: + ... 
def get_train_job(self, exp_name: Optional[str] = None) -> types.TrainJobType: if exp_name is not None: diff --git a/users/berger/systems/functors/recognition/seq2seq_search.py b/users/berger/systems/functors/recognition/seq2seq_search.py index 7190de29d..0667834c4 100644 --- a/users/berger/systems/functors/recognition/seq2seq_search.py +++ b/users/berger/systems/functors/recognition/seq2seq_search.py @@ -42,6 +42,7 @@ def __call__( recognition_scoring_type=RecognitionScoringType.Lattice, rqmt_update: Optional[dict] = None, search_stats: bool = False, + seq2seq_v2: bool = False, **kwargs, ) -> List[Dict]: assert recog_corpus is not None @@ -135,14 +136,24 @@ def __call__( else: raise NotImplementedError - rec = recognition.GenericSeq2SeqSearchJob( - crp=crp, - feature_flow=feature_flow, - label_scorer=label_scorer, - label_tree=label_tree, - lookahead_options=lookahead_options, - **kwargs, - ) + if seq2seq_v2: + rec = recognition.GenericSeq2SeqSearchJobV2( + crp=crp, + feature_flow=feature_flow, + label_scorer=label_scorer, + label_tree=label_tree, + lookahead_options=lookahead_options, + **kwargs, + ) + else: + rec = recognition.GenericSeq2SeqSearchJob( + crp=crp, + feature_flow=feature_flow, + label_scorer=label_scorer, + label_tree=label_tree, + lookahead_options=lookahead_options, + **kwargs, + ) if rqmt_update is not None: rec.rqmt.update(rqmt_update) diff --git a/users/berger/systems/functors/seq2seq_base.py b/users/berger/systems/functors/seq2seq_base.py index 1944e0b56..46f95b3a2 100644 --- a/users/berger/systems/functors/seq2seq_base.py +++ b/users/berger/systems/functors/seq2seq_base.py @@ -73,10 +73,16 @@ def _get_tf_feature_flow_for_label_scorer( tf_graph: tk.Path, checkpoint: returnn.Checkpoint, feature_type: FeatureType = FeatureType.SAMPLES, + output_layer_name: str = "output", **_, ) -> rasr.FlowNetwork: if label_scorer.scorer_type == "precomputed-log-posterior": - feature_flow = self._make_precomputed_tf_feature_flow(base_feature_flow, tf_graph, 
checkpoint) + feature_flow = self._make_precomputed_tf_feature_flow( + base_flow=base_feature_flow, + tf_graph=tf_graph, + tf_checkpoint=checkpoint, + output_layer_name=output_layer_name, + ) elif label_scorer.scorer_type in ["tf-attention", "tf-rnn-transducer", "tf-ffnn-transducer", "tf-segmental"]: feature_flow = copy.deepcopy(base_feature_flow) feature_flow.config = feature_flow.config or rasr.RasrConfig() From f97194791f3eef9d8aac686c2aee6f2dde43e211 Mon Sep 17 00:00:00 2001 From: Simon Berger Date: Fri, 24 May 2024 13:52:38 +0200 Subject: [PATCH 057/227] Update users/berger --- .../20230602_rescale_baselines/__init__.py | 4 +- .../config_01_conformer_ctc.py | 102 +++-- .../config_01b_conformer_ctc_logmel.py | 197 --------- .../config_04a_conformer_transducer_bpe.py | 2 +- .../config_04b_conformer_transducer_phon.py | 24 +- users/berger/corpus/tedlium2/lm_data.py | 4 +- users/berger/pytorch/models/conformer_ctc.py | 7 +- .../pytorch/models/conformer_transducer_v2.py | 1 - .../recognition/generic_seq2seq_search.py | 385 ++++++------------ 9 files changed, 207 insertions(+), 519 deletions(-) delete mode 100644 users/berger/configs/tedlium2/20230602_rescale_baselines/config_01b_conformer_ctc_logmel.py diff --git a/users/berger/configs/tedlium2/20230602_rescale_baselines/__init__.py b/users/berger/configs/tedlium2/20230602_rescale_baselines/__init__.py index 949f5fa7d..d8ebae699 100644 --- a/users/berger/configs/tedlium2/20230602_rescale_baselines/__init__.py +++ b/users/berger/configs/tedlium2/20230602_rescale_baselines/__init__.py @@ -3,7 +3,6 @@ from i6_experiments.users.berger.systems.dataclasses import SummaryKey from sisyphus import tk, gs -from .config_01b_conformer_ctc_logmel import py as py_01b from .config_01_conformer_ctc import py as py_01 from .config_04a_conformer_transducer_bpe import py as py_04a @@ -21,6 +20,8 @@ def worker_wrapper(job, task_name, call): "FeatureExtractionJob", "GenericSeq2SeqSearchJob", "GenericSeq2SeqLmImageAndGlobalCacheJob", + 
"CreateLmImageJob", + "BuildGenericSeq2SeqGlobalCacheJob", "LatticeToCtmJob", "OptimizeAMandLMScaleJob", "AlignmentJob", @@ -87,7 +88,6 @@ def worker_wrapper(job, task_name, call): for subreport in [ copy.deepcopy(py_01()), - copy.deepcopy(py_01b()), copy.deepcopy(py_04a()), copy.deepcopy(py_04a_rasr()), copy.deepcopy(py_04b()), diff --git a/users/berger/configs/tedlium2/20230602_rescale_baselines/config_01_conformer_ctc.py b/users/berger/configs/tedlium2/20230602_rescale_baselines/config_01_conformer_ctc.py index f2c222a29..c9513483b 100644 --- a/users/berger/configs/tedlium2/20230602_rescale_baselines/config_01_conformer_ctc.py +++ b/users/berger/configs/tedlium2/20230602_rescale_baselines/config_01_conformer_ctc.py @@ -22,8 +22,6 @@ rasr.flow.FlowNetwork.default_flags = {"cache_mode": "task_dependent"} num_outputs = 79 -# num_subepochs = 250 -num_subepochs = 1 tools = copy.deepcopy(default_tools_v2) tools.rasr_binary_path = tk.Path("/u/berger/repositories/rasr_versions/gen_seq2seq_dev/arch/linux-x86_64-standard") @@ -32,48 +30,45 @@ # ********** Return Config generators ********** -def returnn_config_generator(variant: ConfigVariant, train_data_config: dict, dev_data_config: dict) -> ReturnnConfig: - model_config = conformer_ctc.get_default_config_v2(num_inputs=50, num_outputs=num_outputs) +def returnn_config_generator( + variant: ConfigVariant, train_data_config: dict, dev_data_config: dict, num_subepochs: int, **kwargs +) -> ReturnnConfig: + model_config = conformer_ctc.get_default_config_v3(num_outputs=num_outputs) extra_config = { "train": train_data_config, "dev": dev_data_config, + "max_seq_length": {"audio_features": 560000}, + "torch_amp": {"dtype": "bfloat16"}, } if variant == ConfigVariant.RECOG: + extra_config["extern_data"] = { + "sources": {"dim": 80, "dtype": "float32"}, + } extra_config["model_outputs"] = { "log_probs": { "dim": num_outputs, } } - extra_config["preload_from_files"] = { - "base": { - "init_for_train": True, - "filename": tk.Path( - 
"/work/asr4/berger/sisyphus_work_dirs/tedlium2/20230602_rescale_baselines/i6_core/returnn/training/ReturnnTrainingJob.yEnOhxiP8CgO/output/models/epoch.250.pt" - ), - } - } - return get_returnn_config( num_epochs=num_subepochs, - num_inputs=50, + num_inputs=1, num_outputs=num_outputs, target="classes", extra_python=[conformer_ctc.get_serializer(model_config, variant=variant)], extern_data_config=True, backend=Backend.PYTORCH, - grad_noise=0.0, + grad_noise=kwargs.get("grad_noise", 0.0), grad_clip=0.0, optimizer=Optimizers.AdamW, schedule=LearningRateSchedules.OCLR, max_seqs=60, - # initial_lr=2.2e-05, - # peak_lr=2.2e-04, - peak_lr=0.0, - # final_lr=1e-08, - batch_size=12000, - accum_grad=3, + initial_lr=7e-06, + peak_lr=7e-04, + decayed_lr=7e-05, + final_lr=1e-08, + batch_size=36000 * 160, use_chunking=False, extra_config=extra_config, ) @@ -82,16 +77,37 @@ def returnn_config_generator(variant: ConfigVariant, train_data_config: dict, de def get_returnn_config_collection( train_data_config: dict, dev_data_config: dict, + num_subepochs: int, + **kwargs, ) -> ReturnnConfigs[ReturnnConfig]: - generator_kwargs = {"train_data_config": train_data_config, "dev_data_config": dev_data_config} return ReturnnConfigs( - train_config=returnn_config_generator(variant=ConfigVariant.TRAIN, **generator_kwargs), - prior_config=returnn_config_generator(variant=ConfigVariant.PRIOR, **generator_kwargs), - recog_configs={"recog": returnn_config_generator(variant=ConfigVariant.RECOG, **generator_kwargs)}, + train_config=returnn_config_generator( + variant=ConfigVariant.TRAIN, + train_data_config=train_data_config, + dev_data_config=dev_data_config, + num_subepochs=num_subepochs, + **kwargs, + ), + prior_config=returnn_config_generator( + variant=ConfigVariant.PRIOR, + train_data_config=train_data_config, + dev_data_config=dev_data_config, + num_subepochs=num_subepochs, + **kwargs, + ), + recog_configs={ + "recog": returnn_config_generator( + variant=ConfigVariant.RECOG, + 
train_data_config=train_data_config, + dev_data_config=dev_data_config, + num_subepochs=num_subepochs, + **kwargs, + ) + }, ) -def run_exp() -> SummaryReport: +def run_exp(num_subepochs: int = 250) -> SummaryReport: assert tools.returnn_root assert tools.returnn_python_exe assert tools.rasr_binary_path @@ -101,28 +117,27 @@ def run_exp() -> SummaryReport: returnn_python_exe=tools.returnn_python_exe, rasr_binary_path=tools.rasr_binary_path, augmented_lexicon=True, - feature_type=FeatureType.GAMMATONE_16K, + feature_type=FeatureType.SAMPLES, ) + for data_input in data.data_inputs.values(): + data_input.create_lm_images(tools.rasr_binary_path) + # ********** Step args ********** - train_args = exp_args.get_ctc_train_step_args(num_epochs=num_subepochs) + train_args = exp_args.get_ctc_train_step_args(num_epochs=num_subepochs, gpu_mem_rqmt=24) recog_args = exp_args.get_ctc_recog_step_args( num_classes=num_outputs, - # epochs=[160, num_subepochs], - epochs=[1], + epochs=[ep for ep in [80, 160, 320, 640, 1280, num_subepochs] if ep <= num_subepochs], prior_scales=[0.5], lm_scales=[1.1], - feature_type=FeatureType.GAMMATONE_16K, + feature_type=FeatureType.SAMPLES, search_stats=True, + seq2seq_v2=True, ) # ********** System ********** - # tools.returnn_root = tk.Path("/u/berger/repositories/MiniReturnn") - # tools.rasr_binary_path = tk.Path( - # "/u/berger/repositories/rasr_versions/gen_seq2seq_onnx_apptainer/arch/linux-x86_64-standard" - # ) system = ReturnnSeq2SeqSystem(tools) system.init_corpora( @@ -135,9 +150,16 @@ def run_exp() -> SummaryReport: # ********** Returnn Configs ********** - system.add_experiment_configs( - "Conformer_CTC", get_returnn_config_collection(data.train_data_config, data.cv_data_config) - ) + for grad_noise in [0.0, 0.1]: + system.add_experiment_configs( + f"Conformer_CTC_{num_subepochs}-epochs_gn-{grad_noise}", + get_returnn_config_collection( + train_data_config=data.train_data_config, + dev_data_config=data.cv_data_config, + 
num_subepochs=num_subepochs, + grad_noise=grad_noise, + ), + ) system.run_train_step(**train_args) system.run_dev_recog_step(**recog_args) @@ -152,7 +174,9 @@ def py() -> SummaryReport: summary_report = SummaryReport() - summary_report.merge_report(run_exp(), update_structure=True) + summary_report.merge_report(run_exp(num_subepochs=250), update_structure=True) + summary_report.merge_report(run_exp(num_subepochs=500), update_structure=True) + summary_report.merge_report(run_exp(num_subepochs=1000), update_structure=True) tk.register_report(f"{gs.ALIAS_AND_OUTPUT_SUBDIR}/summary.report", summary_report) diff --git a/users/berger/configs/tedlium2/20230602_rescale_baselines/config_01b_conformer_ctc_logmel.py b/users/berger/configs/tedlium2/20230602_rescale_baselines/config_01b_conformer_ctc_logmel.py deleted file mode 100644 index 2e62a39cb..000000000 --- a/users/berger/configs/tedlium2/20230602_rescale_baselines/config_01b_conformer_ctc_logmel.py +++ /dev/null @@ -1,197 +0,0 @@ -import copy -import os -from typing import Any, Dict, Optional -from i6_core.returnn.config import ReturnnConfig - -from sisyphus import gs, tk - -import i6_core.rasr as rasr -from i6_experiments.users.berger.args.experiments import ctc as exp_args -from i6_experiments.users.berger.args.returnn.config import get_returnn_config, Backend -from i6_experiments.users.berger.args.returnn.learning_rates import LearningRateSchedules, Optimizers -from i6_experiments.users.berger.corpus.tedlium2.ctc_data import get_tedlium2_data_dumped_labels -from i6_experiments.users.berger.pytorch.models import conformer_ctc -from i6_experiments.users.berger.recipe.summary.report import SummaryReport -from i6_experiments.users.berger.systems.dataclasses import ConfigVariant, FeatureType, ReturnnConfigs -from i6_experiments.users.berger.systems.returnn_native_system import ReturnnNativeSystem -from i6_experiments.users.berger.util import default_tools_v2 -from 
i6_experiments.users.berger.systems.functors.recognition.returnn_search import LexiconType, LmType, VocabType - -# ********** Settings ********** - -rasr.flow.FlowNetwork.default_flags = {"cache_mode": "task_dependent"} - -num_outputs = 79 -num_subepochs = 250 - -tools = copy.deepcopy(default_tools_v2) - -# ********** Return Config generators ********** - - -def returnn_config_generator( - *, - variant: ConfigVariant, - train_data_config: dict, - dev_data_config: dict, - forward_data_config: Optional[dict] = None, - **kwargs, -) -> ReturnnConfig: - model_config = conformer_ctc.get_default_config_v3(num_outputs=num_outputs) - - extra_config = { - "train": train_data_config, - "dev": dev_data_config, - "torch_amp": {"dtype": "bfloat16"}, - } - - if variant == ConfigVariant.TRAIN: - extra_config["max_seq_length"] = {"audio_features": 560000} - - if variant == ConfigVariant.PRIOR: - extra_config = {"forward_data": train_data_config} - - if variant == ConfigVariant.RECOG: - assert forward_data_config is not None - extra_config = { - "forward_data": forward_data_config, - "model_outputs": { - "tokens": { - "dtype": "string", - "feature_dim_axis": None, - } - }, - } - - return get_returnn_config( - num_epochs=num_subepochs, - num_inputs=1, - num_outputs=num_outputs, - target="classes" if variant != ConfigVariant.RECOG else None, - extra_python=[ - conformer_ctc.get_serializer( - model_config, variant=variant, recog_type=conformer_ctc.RecogType.FLASHLIGHT, **kwargs - ) - ], - extern_data_config=True, - backend=Backend.PYTORCH, - grad_noise=0.0, - grad_clip=0.0, - optimizer=Optimizers.AdamW, - schedule=LearningRateSchedules.OCLR, - max_seqs=60, - initial_lr=7e-06, - peak_lr=7e-04, - decayed_lr=7e-05, - final_lr=1e-08, - batch_size=360 * 16000, - use_chunking=False, - extra_config=extra_config, - ) - - -def get_returnn_config_collection( - *, recog_variations: Optional[Dict[str, Any]] = None, forward_data_configs: dict, **kwargs -) -> ReturnnConfigs[ReturnnConfig]: - if 
recog_variations is None: - return ReturnnConfigs( - train_config=returnn_config_generator(variant=ConfigVariant.TRAIN, **kwargs), - prior_config=returnn_config_generator(variant=ConfigVariant.PRIOR, **kwargs), - recog_configs={ - f"recog_{key}": returnn_config_generator( - variant=ConfigVariant.RECOG, forward_data_config=forward_data_config, **kwargs - ) - for key, forward_data_config in forward_data_configs.items() - }, - ) - else: - return ReturnnConfigs( - train_config=returnn_config_generator(variant=ConfigVariant.TRAIN, **kwargs), - prior_config=returnn_config_generator(variant=ConfigVariant.PRIOR, **kwargs), - recog_configs={ - f"recog_{key}_{variation_name}": returnn_config_generator( - variant=ConfigVariant.RECOG, forward_data_config=forward_data_config, **variation_kwargs, **kwargs - ) - for key, forward_data_config in forward_data_configs.items() - for variation_name, variation_kwargs in recog_variations.items() - }, - ) - - -def run_exp() -> SummaryReport: - assert tools.returnn_root - assert tools.returnn_python_exe - assert tools.rasr_binary_path - data = get_tedlium2_data_dumped_labels( - num_classes=num_outputs, - returnn_root=tools.returnn_root, - returnn_python_exe=tools.returnn_python_exe, - rasr_binary_path=tools.rasr_binary_path, - augmented_lexicon=True, - feature_type=FeatureType.SAMPLES, - ) - - # ********** Step args ********** - - train_args = exp_args.get_ctc_train_step_args(num_epochs=num_subepochs, gpu_mem_rqmt=24) - recog_args = { - "epochs": [num_subepochs], - "lm_scales": [2.0], - "prior_scales": [0.5], - "lexicon_type": LexiconType.FLASHLIGHT, - "vocab_type": VocabType.RETURNN, - "lm_type": LmType.ARPA_FILE, - } - # ********** System ********** - - system = ReturnnNativeSystem(tools) - - system.init_corpora( - dev_keys=data.dev_keys, - test_keys=data.test_keys, - corpus_data=data.data_inputs, - am_args=exp_args.ctc_recog_am_args, - ) - system.setup_scoring() - - # ********** Returnn Configs ********** - - 
system.add_experiment_configs( - "Conformer_CTC_logmel", - get_returnn_config_collection( - train_data_config=data.train_data_config, - dev_data_config=data.cv_data_config, - forward_data_configs=data.forward_data_config, - beam_size=64, - beam_threshold=14.0, - ), - ) - - system.run_train_step(**train_args) - - system.run_dev_recog_step( - recog_exp_names={ - exp_name: [ - recog_exp_name for recog_exp_name in system.get_recog_exp_names()[exp_name] if dev_key in recog_exp_name - ] - for dev_key in data.dev_keys - for exp_name in system.get_exp_names() - }, - **recog_args, - ) - - assert system.summary_report - return system.summary_report - - -def py() -> SummaryReport: - filename_handle = os.path.splitext(os.path.basename(__file__))[0][len("config_") :] - gs.ALIAS_AND_OUTPUT_SUBDIR = f"{filename_handle}/" - - summary_report = SummaryReport() - - summary_report.merge_report(run_exp(), update_structure=True) - - tk.register_report(f"{gs.ALIAS_AND_OUTPUT_SUBDIR}/summary.report", summary_report) - - return summary_report diff --git a/users/berger/configs/tedlium2/20230602_rescale_baselines/config_04a_conformer_transducer_bpe.py b/users/berger/configs/tedlium2/20230602_rescale_baselines/config_04a_conformer_transducer_bpe.py index 301506b5c..d9610a62d 100644 --- a/users/berger/configs/tedlium2/20230602_rescale_baselines/config_04a_conformer_transducer_bpe.py +++ b/users/berger/configs/tedlium2/20230602_rescale_baselines/config_04a_conformer_transducer_bpe.py @@ -133,7 +133,7 @@ def run_exp() -> SummaryReport: train_args = exp_args.get_transducer_train_step_args(num_epochs=num_subepochs, gpu_mem_rqmt=24) recog_args = { - "epochs": [20, 40, 80, 160, 320, 500], + "epochs": [500], "prior_scales": [0.0], "lm_scales": [0.0], "lexicon_type": LexiconType.BLISS, diff --git a/users/berger/configs/tedlium2/20230602_rescale_baselines/config_04b_conformer_transducer_phon.py b/users/berger/configs/tedlium2/20230602_rescale_baselines/config_04b_conformer_transducer_phon.py index 
b6c026453..637357823 100644 --- a/users/berger/configs/tedlium2/20230602_rescale_baselines/config_04b_conformer_transducer_phon.py +++ b/users/berger/configs/tedlium2/20230602_rescale_baselines/config_04b_conformer_transducer_phon.py @@ -24,7 +24,7 @@ rasr.flow.FlowNetwork.default_flags = {"cache_mode": "task_dependent"} num_outputs = 79 -num_subepochs = 500 +num_subepochs = 1000 tools = copy.deepcopy(default_tools_v2) tools.rasr_binary_path = tk.Path("/u/berger/repositories/rasr_versions/gen_seq2seq_dev/arch/linux-x86_64-standard") @@ -61,10 +61,11 @@ def returnn_config_generator( optimizer=Optimizers.AdamW, schedule=LearningRateSchedules.OCLR, initial_lr=1e-06, - peak_lr=8e-05, + peak_lr=kwargs.get("peak_lr", 8e-05), decayed_lr=1e-05, final_lr=1e-08, - batch_size=10000 * 160, + batch_size=15000 * 160, + accum_grad=2, use_chunking=False, extra_config=extra_config, ) @@ -173,6 +174,7 @@ def run_exp() -> SummaryReport: label_scorer_type="onnx-ffnn-transducer", label_scorer_args={"extra_args": {"start_label_index": 0}}, reduction_subtrahend=3, + # reduction_subtrahend=0, reduction_factor=4, feature_type=FeatureType.LOGMEL_16K, ) @@ -206,13 +208,15 @@ def run_exp() -> SummaryReport: # ********** Returnn Configs ********** - system.add_experiment_configs( - "Conformer_Transducer", - get_returnn_config_collection( - data.train_data_config, - data.cv_data_config, - ), - ) + for peak_lr in [8e-06, 1e-05, 4e-05, 8e-05]: + system.add_experiment_configs( + f"Conformer_Transducer_lr-{peak_lr}", + get_returnn_config_collection( + data.train_data_config, + data.cv_data_config, + peak_lr=peak_lr, + ), + ) system.run_train_step(**train_args) diff --git a/users/berger/corpus/tedlium2/lm_data.py b/users/berger/corpus/tedlium2/lm_data.py index 6253f9830..63c54f87e 100644 --- a/users/berger/corpus/tedlium2/lm_data.py +++ b/users/berger/corpus/tedlium2/lm_data.py @@ -8,6 +8,8 @@ def get_lm(name: str) -> rasr_lm_config.LMData: ted_4gram = get_corpus_data_inputs()["dev"]["dev"].lm 
assert ted_4gram is not None - lm_dict["4gram"] = rasr_lm_config.ArpaLMData(filename=ted_4gram["filename"], scale=ted_4gram.get("scale", 1.0)) + lm_dict["4gram"] = rasr_lm_config.ArpaLMData( + filename=ted_4gram["filename"], scale=ted_4gram.get("scale", 1.0), lookahead_lm=None + ) return lm_dict[name] diff --git a/users/berger/pytorch/models/conformer_ctc.py b/users/berger/pytorch/models/conformer_ctc.py index 370bf4ea1..d85b144ee 100644 --- a/users/berger/pytorch/models/conformer_ctc.py +++ b/users/berger/pytorch/models/conformer_ctc.py @@ -322,17 +322,18 @@ def get_default_config_v3(num_outputs: int) -> ConformerCTCConfig: sample_rate=16000, win_size=0.025, hop_size=0.01, - min_amp=1e-10, + min_amp=1.175494e-38, num_filters=80, + alpha=0.97, ), ) specaugment = ModuleFactoryV1( module_class=SpecaugmentByLengthModuleV1, cfg=SpecaugmentByLengthConfigV1( - time_min_num_masks=2, + time_min_num_masks=1, time_max_mask_per_n_frames=25, time_mask_max_size=20, - freq_min_num_masks=2, + freq_min_num_masks=1, freq_max_num_masks=16, freq_mask_max_size=5, ), diff --git a/users/berger/pytorch/models/conformer_transducer_v2.py b/users/berger/pytorch/models/conformer_transducer_v2.py index 32ab9d10f..9df3af6c5 100644 --- a/users/berger/pytorch/models/conformer_transducer_v2.py +++ b/users/berger/pytorch/models/conformer_transducer_v2.py @@ -47,7 +47,6 @@ def forward( with torch.no_grad(): sources = sources.squeeze(-1) x, source_lengths = self.feature_extraction(sources, source_lengths) - print("Features: ", x[0, :3, :5]) sequence_mask = lengths_to_padding_mask(source_lengths) x = self.specaugment(x) # [B, T, F] diff --git a/users/berger/recipe/recognition/generic_seq2seq_search.py b/users/berger/recipe/recognition/generic_seq2seq_search.py index 4ed1f4177..34cc1da5e 100644 --- a/users/berger/recipe/recognition/generic_seq2seq_search.py +++ b/users/berger/recipe/recognition/generic_seq2seq_search.py @@ -1,17 +1,13 @@ __all__ = ["GenericSeq2SeqLmImageAndGlobalCacheJob", 
"GenericSeq2SeqSearchJob"] -from typing import List, Optional, Tuple from sisyphus import * -assert __package__ is not None Path = setup_path(__package__) import shutil import copy from i6_core import rasr, util -from i6_core.lm.lm_image import CreateLmImageJob -from i6_experiments.users.berger.recipe.rasr.label_tree_and_scorer import LabelTree, LabelScorer class GenericSeq2SeqLmImageAndGlobalCacheJob(rasr.RasrCommand, Job): @@ -88,15 +84,13 @@ def find_arpa_lms(cls, config): lookahead_lm_config = config.flf_lattice_tool.network.recognizer.recognizer.lookahead_lm if separate_lookahead_lm: if lookahead_lm_config.type == "ARPA" and lookahead_lm_config._get("image") is None: - pass - # result.append(lookahead_lm_config) + result.append(lookahead_lm_config) # recombination lm # separate_recombination_lm = config.flf_lattice_tool.network.recognizer.recognizer.separate_recombination_lm recombination_lm_config = config.flf_lattice_tool.network.recognizer.recognizer.recombination_lm if separate_recombination_lm: if recombination_lm_config.type == "ARPA" and recombination_lm_config._get("image") is None: - pass - # result.append(recombination_lm_config) + result.append(recombination_lm_config) return result @classmethod @@ -174,8 +168,8 @@ def create_config( # lm images # arpa_lms = cls.find_arpa_lms(config) - for i, lm_config in enumerate(arpa_lms, start=1): - lm_config.image = f"lm-{i}.image" + for i, lm_config in enumerate(arpa_lms): + lm_config.image = "lm-%d.image" % (i + 1) # global cache # config.flf_lattice_tool.global_cache.file = "global.cache" @@ -191,171 +185,42 @@ def hash(cls, kwargs): return super().hash({"config": config, "exe": sprint_exe}) -class BuildGenericSeq2SeqGlobalCacheJob(rasr.RasrCommand, Job): - """ - Standalone job to create the global-cache for generic-seq2seq-tree-search - """ - - def __init__( - self, - crp: rasr.CommonRasrParameters, - label_tree: LabelTree, - label_scorer: LabelScorer, - extra_config: Optional[rasr.RasrConfig] = None, - 
extra_post_config: Optional[rasr.RasrConfig] = None, - ): - """ - :param crp: common RASR params (required: lexicon, acoustic_model, language_model, recognizer) - :param label_tree: label tree object for structuring the search tree - :param label_scorer: label scorer object for score computation - :param extra_config: overlay config that influences the Job's hash - :param extra_post_config: overlay config that does not influences the Job's hash - """ - self.set_vis_name("Build Global Cache") - - (self.config, self.post_config,) = BuildGenericSeq2SeqGlobalCacheJob.create_config( - crp=crp, - label_tree=label_tree, - label_scorer=label_scorer, - extra_config=extra_config, - extra_post_config=extra_post_config, - ) - - self.exe = self.select_exe(crp.speech_recognizer_exe, "speech-recognizer") - - self.out_log_file = self.log_file_output_path("build_global_cache", crp, False) - self.out_global_cache = self.output_path("global.cache", cached=True) - - self.rqmt = {"time": 1, "cpu": 1, "mem": 2} - - def tasks(self): - yield Task("create_files", mini_task=True) - yield Task("run", resume="run", rqmt=self.rqmt) - - def create_files(self): - self.write_config(self.config, self.post_config, "build_global_cache.config") - self.write_run_script(self.exe, "build_global_cache.config") - - def run(self): - self.run_script(1, self.out_log_file) - shutil.move("global.cache", self.out_global_cache.get_path()) - - @classmethod - def create_config( - cls, - crp: rasr.CommonRasrParameters, - label_tree: LabelTree, - label_scorer: LabelScorer, - extra_config: Optional[rasr.RasrConfig], - extra_post_config: Optional[rasr.RasrConfig], - ): - config, post_config = rasr.build_config_from_mapping( - crp, - { - "lexicon": "speech-recognizer.model-combination.lexicon", - "acoustic_model": "speech-recognizer.model-combination.acoustic-model", - "language_model": "speech-recognizer.model-combination.lm", - "recognizer": "speech-recognizer.recognizer", - }, - ) - - # Apply config from label tree 
- label_tree.apply_config( - "speech-recognizer.recognizer.label-tree", - config, - post_config, - ) - - # Optional lexicon overwrite - if label_tree.lexicon_config is not None: - config["speech-recognizer.model-combination.lexicon"]._update(label_tree.lexicon_config) - - # Apply config from label scorer and eliminate unnecessary arguments that don't affect the search space (scale, prior) - label_scorer_reduced = LabelScorer( - scorer_type=label_scorer.scorer_type, - scale=1.0, - label_file=label_scorer.label_file, - num_classes=label_scorer.num_classes, - use_prior=False, - extra_args=label_scorer.extra_args, - ) - - label_scorer_reduced.apply_config("speech-recognizer.recognizer.label-scorer", config, post_config) - - # skip conventional AM or load it without GMM # - if crp.acoustic_model_config is None: - config.speech_recognizer.recognizer.use_acoustic_model = False - else: - config.speech_recognizer.recognizer.use_mixture = False - if config.flf_lattice_tool.network.recognizer.acoustic_model._get("length") is not None: - del config.flf_lattice_tool.network.recognizer.acoustic_model["length"] - - # disable scaling - if config.flf_lattice_tool.network.recognizer.lm._get("scale") is not None: - del config.flf_lattice_tool.network.recognizer.lm["scale"] - - config.speech_recognizer.recognition_mode = "init-only" - config.speech_recognizer.search_type = "generic-seq2seq-tree-search" - config.speech_recognizer.global_cache.file = "global.cache" - config.speech_recognizer.global_cache.read_only = False - - config._update(extra_config) - post_config._update(extra_post_config) - - return config, post_config - - @classmethod - def hash(cls, kwargs): - config, _ = cls.create_config(**kwargs) - return super().hash({"config": config, "exe": kwargs["crp"].speech_recognizer_exe}) - - class GenericSeq2SeqSearchJob(rasr.RasrCommand, Job): __sis_hash_exclude__ = {"num_threads": None} def __init__( self, - crp: rasr.CommonRasrParameters, - feature_flow: rasr.FlowNetwork, - 
label_tree: LabelTree, - label_scorer: LabelScorer, - rasr_exe: Optional[tk.Path] = None, - search_parameters: Optional[dict] = None, - lm_lookahead: bool = True, - lookahead_options: Optional[dict] = None, - eval_single_best: bool = True, - eval_best_in_lattice: bool = True, - use_gpu: bool = False, - global_cache: Optional[tk.Path] = None, - rtf: float = 2, - mem: float = 8, - extra_config: Optional[rasr.RasrConfig] = None, - extra_post_config: Optional[rasr.RasrConfig] = None, - num_threads: int = 2, - ): + crp, + feature_flow, + label_tree, + label_scorer, + search_parameters=None, + lm_lookahead=True, + lookahead_options=None, + eval_single_best=True, + eval_best_in_lattice=True, + use_gpu=False, + rtf=2, + mem=8, + hard_rqmt=False, + extra_config=None, + extra_post_config=None, + sprint_exe=None, # allow separat executable than default settings + lm_gc_job=None, + lm_gc_job_local=False, + lm_gc_job_mem=16, + lm_gc_job_default_search=False, + num_threads=None, + ): # TODO set this to true later self.set_vis_name("Generic Seq2Seq Search") + kwargs = locals() + del kwargs["self"] - self.config, self.post_config = GenericSeq2SeqSearchJob.create_config( - crp=crp, - feature_flow=feature_flow, - label_tree=label_tree, - label_scorer=label_scorer, - search_parameters=search_parameters, - lm_lookahead=lm_lookahead, - lookahead_options=lookahead_options, - eval_single_best=eval_single_best, - eval_best_in_lattice=eval_best_in_lattice, - extra_config=extra_config, - extra_post_config=extra_post_config, - global_cache=global_cache, - ) + self.config, self.post_config = GenericSeq2SeqSearchJob.create_config(**kwargs) self.feature_flow = feature_flow - if rasr_exe is not None: - self.rasr_exe = rasr_exe - else: - self.rasr_exe = crp.flf_tool_exe - assert self.rasr_exe is not None - + if sprint_exe is None: + sprint_exe = crp.flf_tool_exe + self.exe = self.select_exe(sprint_exe, "flf-tool") self.concurrent = crp.concurrent self.use_gpu = use_gpu self.num_threads = 
num_threads @@ -372,15 +237,21 @@ def __init__( ) self.rqmt = { - "time": max(crp.corpus_duration * rtf / crp.concurrent, 24), - "cpu": num_threads, + "time": max(crp.corpus_duration * rtf / crp.concurrent, 4.5), + "cpu": 2, "gpu": 1 if self.use_gpu else 0, "mem": mem, } + # no automatic resume with doubled rqmt + self.hard_rqmt = hard_rqmt def tasks(self): yield Task("create_files", mini_task=True) - yield Task("run", resume="run", rqmt=self.rqmt, args=range(1, self.concurrent + 1)) + if self.hard_rqmt: # TODO + resume = None + else: + resume = "run" + yield Task("run", resume=resume, rqmt=self.rqmt, args=range(1, self.concurrent + 1)) def create_files(self): self.write_config(self.config, self.post_config, "recognition.config") @@ -390,10 +261,16 @@ def create_files(self): # sometimes crash without this if not self.use_gpu: extra_code += "\nexport CUDA_VISIBLE_DEVICES=" + if self.num_threads is None: + extra_code += "\nexport OMP_NUM_THREADS=%i" % self.rqmt["cpu"] + else: + extra_code += f"\nexport OMP_NUM_THREADS={self.num_threads}" + extra_code += f"\nexport MKL_NUM_THREADS={self.num_threads}" + self.write_run_script(self.exe, "recognition.config", extra_code=extra_code) - extra_code += f"\nexport OMP_NUM_THREADS={self.num_threads}" - extra_code += f"\nexport MKL_NUM_THREADS={self.num_threads}" - self.write_run_script(self.rasr_exe, "recognition.config", extra_code=extra_code) + # TODO maybe not needed + def stop_run(self, task_id): + print("run job %d exceeds specified rqmt and stoped" % task_id) def run(self, task_id): self.run_script(task_id, self.out_log_file[task_id]) @@ -402,50 +279,45 @@ def run(self, task_id): self.out_single_lattice_caches[task_id].get_path(), ) - @classmethod - def find_arpa_lms( - cls, lm_config: rasr.RasrConfig, lm_post_config: Optional[rasr.RasrConfig] = None - ) -> List[Tuple[rasr.RasrConfig, Optional[rasr.RasrConfig]]]: - result = [] - - if lm_config.type == "ARPA": - result.append((lm_config, lm_post_config)) - elif 
lm_config.type == "combine": - for i in range(1, lm_config.num_lms + 1): - sub_lm_config = lm_config["lm-%d" % i] - sub_lm_post_config = lm_post_config["lm-%d" % i] if lm_post_config is not None else None - result += cls.find_arpa_lms(sub_lm_config, sub_lm_post_config) - - return result - - @classmethod - def find_arpa_lms_without_image( - cls, lm_config: rasr.RasrConfig, lm_post_config: Optional[rasr.RasrConfig] = None - ) -> List[Tuple[rasr.RasrConfig, Optional[rasr.RasrConfig]]]: - def has_image(c, pc): - res = c._get("image") is not None - res = res or (pc is not None and pc._get("image") is not None) - return res - - return [(c, pc) for c, pc in cls.find_arpa_lms(lm_config, lm_post_config) if not has_image(c, pc)] + def cleanup_before_run(self, cmd, retry, task_id, *args): + util.backup_if_exists("recognition.log.%d" % task_id) + util.delete_if_exists("lattice.cache.%d" % task_id) @classmethod def create_config( cls, - crp: rasr.CommonRasrParameters, - feature_flow: rasr.FlowNetwork, - label_tree: LabelTree, - label_scorer: LabelScorer, - search_parameters: Optional[dict] = None, - lm_lookahead: bool = True, - lookahead_options: Optional[dict] = None, - eval_single_best: bool = True, - eval_best_in_lattice: bool = True, - extra_config: Optional[rasr.RasrConfig] = None, - extra_post_config: Optional[rasr.RasrConfig] = None, - global_cache: Optional[tk.Path] = None, - **_, + crp, + feature_flow, + label_tree, + label_scorer, + search_parameters=None, + lm_lookahead=True, + lookahead_options=None, + eval_single_best=True, + eval_best_in_lattice=True, + extra_config=None, + extra_post_config=None, + sprint_exe=None, + lm_gc_job=None, + lm_gc_job_local=True, + lm_gc_job_mem=6, + lm_gc_job_default_search=False, + **kwargs, ): + # optional individual lm-image and global-cache job # + if lm_gc_job is None: + lm_gc_job = GenericSeq2SeqLmImageAndGlobalCacheJob( + crp, + label_tree, + label_scorer, + extra_config, + extra_post_config, + mem=lm_gc_job_mem, + 
local_job=lm_gc_job_local, + sprint_exe=sprint_exe, + default_search=lm_gc_job_default_search, + ) + # get config from csp # config, post_config = rasr.build_config_from_mapping( crp, @@ -459,8 +331,8 @@ def create_config( parallelize=True, ) - # acoustic model maybe used for allophones and state-tying, but no mixture is needed - # skip conventional AM or load it without GMM + # acoustic model maybe used for allophones and state-tying, but no mixture is needed # + # skip conventional AM or load it without GMM # if crp.acoustic_model_config is None: config.flf_lattice_tool.network.recognizer.use_acoustic_model = False else: @@ -470,17 +342,14 @@ def create_config( config.flf_lattice_tool.network.recognizer.feature_extraction.file = "feature.flow" if feature_flow.outputs != {"features"}: assert len(feature_flow.outputs) == 1, "not implemented otherwise" - config.flf_lattice_tool.network.recognizer.feature_extraction.main_port_name = next( - iter(feature_flow.outputs) - ) - + config.flf_lattice_tool.network.recognizer.feature_extraction.main_port_name = list(feature_flow.outputs)[0] feature_flow.apply_config( "flf-lattice-tool.network.recognizer.feature-extraction", config, post_config, ) - # label tree and optional lexicon overwrite + # label tree and optional lexicon overwrite # label_tree.apply_config( "flf-lattice-tool.network.recognizer.recognizer.label-tree", config, @@ -489,15 +358,14 @@ def create_config( if label_tree.lexicon_config is not None: config["flf-lattice-tool.lexicon"]._update(label_tree.lexicon_config) - # label scorer + # label scorer # label_scorer.apply_config("flf-lattice-tool.network.recognizer.label-scorer", config, post_config) # search settings # search_config = rasr.RasrConfig() if search_parameters is not None: - for key, val in search_parameters.items(): - search_config[key.replace("_", "-")] = val - + for key in search_parameters.keys(): + search_config[key] = search_parameters[key] 
config.flf_lattice_tool.network.recognizer.recognizer._update(search_config) # lookahead settings # @@ -509,23 +377,26 @@ def create_config( if lookahead_options is not None: la_opts.update(lookahead_options) - config.flf_lattice_tool.network.recognizer.recognizer.optimize_lattice = True - - la_config = rasr.RasrConfig() - la_config._value = lm_lookahead - + config.flf_lattice_tool.network.recognizer.recognizer.lm_lookahead = rasr.RasrConfig() + config.flf_lattice_tool.network.recognizer.recognizer.lm_lookahead._value = lm_lookahead if "laziness" in la_opts: config.flf_lattice_tool.network.recognizer.recognizer.lm_lookahead_laziness = la_opts["laziness"] - + config.flf_lattice_tool.network.recognizer.recognizer.optimize_lattice = True if lm_lookahead: if "history_limit" in la_opts: - la_config.history_limit = la_opts["history_limit"] + config.flf_lattice_tool.network.recognizer.recognizer.lm_lookahead.history_limit = la_opts[ + "history_limit" + ] if "tree_cutoff" in la_opts: - la_config.tree_cutoff = la_opts["tree_cutoff"] + config.flf_lattice_tool.network.recognizer.recognizer.lm_lookahead.tree_cutoff = la_opts["tree_cutoff"] if "minimum_representation" in la_opts: - la_config.minimum_representation = la_opts["minimum_representation"] + config.flf_lattice_tool.network.recognizer.recognizer.lm_lookahead.minimum_representation = la_opts[ + "minimum_representation" + ] if "lm_lookahead_scale" in la_opts: - la_config.lm_lookahead_scale = la_opts["lm_lookahead_scale"] + config.flf_lattice_tool.network.recognizer.recognizer.lm_lookahead.lm_lookahead_scale = la_opts[ + "lm_lookahead_scale" + ] if "cache_low" in la_opts: post_config.flf_lattice_tool.network.recognizer.recognizer.lm_lookahead.cache_size_low = la_opts[ "cache_low" @@ -535,8 +406,6 @@ def create_config( "cache_high" ] - config.flf_lattice_tool.network.recognizer.recognizer.lm_lookahead = la_config - # flf network # config.flf_lattice_tool.network.initial_nodes = "segment" 
config.flf_lattice_tool.network.segment.type = "speech-segment" @@ -574,46 +443,32 @@ def create_config( post_config.flf_lattice_tool.network.sink.error_on_empty_lattice = False post_config["*"].output_channel.unbuffered = True - # image and cache # - no_image_arpa_lms = GenericSeq2SeqSearchJob.find_arpa_lms_without_image( - lm_config=config.flf_lattice_tool.network.recognizer.lm - ) - if config.flf_lattice_tool.network.recognizer.recognizer._get("lookahead-lm") is not None: - no_image_arpa_lms += GenericSeq2SeqSearchJob.find_arpa_lms_without_image( - lm_config=config.flf_lattice_tool.network.recognizer.recognizer.lookahead_lm - ) - - for lm_config, lm_post_config in no_image_arpa_lms: - rp = rasr.CommonRasrParameters(base=crp) - rp.language_model_config = lm_config - rp.language_model_post_config = lm_post_config - lm_config.image = CreateLmImageJob(crp=rp, mem=8).out_image - - if global_cache is None: - global_cache = BuildGenericSeq2SeqGlobalCacheJob( - crp=crp, label_tree=label_tree, label_scorer=label_scorer - ).out_global_cache - - post_config.flf_lattice_tool.global_cache.read_only = True - post_config.flf_lattice_tool.global_cache.file = global_cache - # update parameters # config._update(extra_config) post_config._update(extra_post_config) - + + # image and cache # + arpa_lms = GenericSeq2SeqLmImageAndGlobalCacheJob.find_arpa_lms(config) + assert len(arpa_lms) == lm_gc_job.num_images, "mismatch between image-cache config and recognition config" + for i, lm_config in enumerate(arpa_lms): + lm_config.image = lm_gc_job.lm_images[i + 1] + + if post_config.flf_lattice_tool.global_cache._get("file") is None: + post_config.flf_lattice_tool.global_cache.read_only = True + post_config.flf_lattice_tool.global_cache.file = lm_gc_job.global_cache + return config, post_config @classmethod def hash(cls, kwargs): - config, _ = cls.create_config(**kwargs) - if kwargs["rasr_exe"] is not None: - rasr_exe = kwargs["rasr_exe"] - else: - rasr_exe = kwargs["crp"].flf_tool_exe + 
config, post_config = cls.create_config(**kwargs) + sprint_exe = kwargs["sprint_exe"] + if sprint_exe is None: + sprint_exe = kwargs["crp"].flf_tool_exe return super().hash( { "config": config, "feature_flow": kwargs["feature_flow"], - "exe": rasr_exe, + "exe": sprint_exe, } ) From 8a418d2127c33a7670554eac065386e2e18be302 Mon Sep 17 00:00:00 2001 From: Albert Zeyer Date: Fri, 24 May 2024 14:03:40 +0200 Subject: [PATCH 058/227] more --- .../exp2024_04_23_baselines/ctc.py | 27 +++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py b/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py index 480eefcd1..1b07cc885 100644 --- a/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py +++ b/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py @@ -30,6 +30,33 @@ def py(): + train_exp( + f"v6-bhv20-11gb-f32-bs15k-accgrad1-mgpu4-pavg100-wd1e_4-lrlin1e_5_295k-bpe10k", + config_11gb_v6_f32_accgrad1_mgpu4_pavg100_wd1e_4, + config_updates={ + **_get_cfg_lrlin_oclr_by_bs_nep(15_000, 500), + }, + ) + + train_exp( + f"v6-bhv20-11gb-f32-bs15k-accgrad1-mgpu4-pavg100-wd1e_2-lrlin1e_5_295k-bpe10k", + config_11gb_v6_f32_accgrad1_mgpu4_pavg100_wd1e_4, + config_updates={ + **_get_cfg_lrlin_oclr_by_bs_nep(15_000, 500), + "optimizer.weight_decay": 1e-2, + }, + ) + + train_exp( + f"v6-bhv20-11gb-f32-bs15k-accgrad1-mgpu4-pavg100-wd1e_4-lrlin1e_5_295k-speedpertV2-bpe10k", + config_11gb_v6_f32_accgrad1_mgpu4_pavg100_wd1e_4, + config_updates={ + **_get_cfg_lrlin_oclr_by_bs_nep(15_000, 500), + "__train_audio_preprocess": speed_pert_librosa_config, + "speed_pert_discrete_values": [0.7, 0.8, 0.9, 1.0, 1.1], + }, + ) + for vocab in [ "spm20k", "bpe10k", # 8.23 From ce494fe4bc52cb2c0831412e69b37315dc2baab1 Mon Sep 17 00:00:00 2001 From: Mohammad Zeineldeen Date: Fri, 24 May 2024 15:02:50 +0000 Subject: [PATCH 059/227] update --- .../tedlium2/configs/ebranch_baseline.py | 14 +-- .../tedlium2/configs/ted2_att_baseline.py | 94 
++++++++++--------- 2 files changed, 53 insertions(+), 55 deletions(-) diff --git a/users/zeineldeen/experiments/conformer_att_2023/tedlium2/configs/ebranch_baseline.py b/users/zeineldeen/experiments/conformer_att_2023/tedlium2/configs/ebranch_baseline.py index b8c368eeb..c65fa8c26 100644 --- a/users/zeineldeen/experiments/conformer_att_2023/tedlium2/configs/ebranch_baseline.py +++ b/users/zeineldeen/experiments/conformer_att_2023/tedlium2/configs/ebranch_baseline.py @@ -995,17 +995,6 @@ def get_base_v1_args(lr, ep, enc_drop=0.1, pretrain_reps=3, use_legacy_stats=Tru args["encoder_args"].num_blocks = num_blocks - args["with_pretrain"] = False - specaug_steps = {"step0": 12_000, "step1": 18_000, "step2": 24_000} - args["specaug_str_func_opts"] = { - "version": 2, - **specaug_steps, - "max_time_num": 100, - "max_time_dim": 20, - "min_num_add_factor": 0, - "freq_dim_factor": 5, - } - args["encoder_args"].frontend_conv_weight_dropout = weight_drop args["encoder_args"].mhsa_weight_dropout = weight_drop args["encoder_args"].ff_weight_dropout = weight_drop @@ -1014,6 +1003,9 @@ def get_base_v1_args(lr, ep, enc_drop=0.1, pretrain_reps=3, use_legacy_stats=Tru args["decoder_args"].embed_dim = target_embed_dim args["decoder_args"].att_dropout = dec_att_drop + args["batch_size"] = 10_000 * 160 + args["accum_grad"] = 3 + exp_name += f"_weightDrop{weight_drop}_decAttDrop{dec_att_drop}_embedDim{target_embed_dim}_numBlocks{num_blocks}" if ctc_scale != 1.0: diff --git a/users/zeineldeen/experiments/conformer_att_2023/tedlium2/configs/ted2_att_baseline.py b/users/zeineldeen/experiments/conformer_att_2023/tedlium2/configs/ted2_att_baseline.py index c8c13e602..ae8d5256a 100644 --- a/users/zeineldeen/experiments/conformer_att_2023/tedlium2/configs/ted2_att_baseline.py +++ b/users/zeineldeen/experiments/conformer_att_2023/tedlium2/configs/ted2_att_baseline.py @@ -1270,55 +1270,61 @@ def get_base_v1_args(lr, ep, enc_drop=0.1, pretrain_reps=3, use_legacy_stats=Tru # TODO: multi-gpu for 
num_blocks in [12]: for ep in [50 * 4]: - for lr in [8e-4, 16e-4]: + for lr in [8e-4, 13e-4, 16e-4]: for target_embed_dim in [256]: for att_drop in [0.0]: for weight_drop in [0.1]: for enc_drop in [0.15]: for ctc_scale in [0.3]: - for sync_step in [50]: - base_v1_args, exp_name = get_base_v1_args( - lr, ep, enc_drop=enc_drop, use_legacy_stats=False - ) - - args = copy.deepcopy(base_v1_args) - args["encoder_args"].num_blocks = num_blocks - args["encoder_args"].mhsa_weight_dropout = weight_drop - args["encoder_args"].ff_weight_dropout = weight_drop - args["encoder_args"].conv_weight_dropout = weight_drop - - args["decoder_args"].embed_dim = target_embed_dim - args["decoder_args"].att_dropout = att_drop - - args["horovod_params"] = { - "horovod_reduce_type": "param", - "horovod_param_sync_step": sync_step, - "horovod_dataset_distribution": "random_seed_offset", - } - - args["batch_size"] = 15_000 * 160 - args["accum_grad"] = 1 - gradient_clip_global_norm = 1 - args["gradient_clip_global_norm"] = gradient_clip_global_norm - - exp_name += f"_weightDrop{weight_drop}_decAttDrop{att_drop}_embedDim{target_embed_dim}_numBlocks{num_blocks}" - exp_name += f"_gradClipNorm{gradient_clip_global_norm}" - exp_name += f"_paramSync_step{sync_step}_accum1" - - if ctc_scale != 1.0: - args["encoder_args"].ctc_loss_scale = ctc_scale - args["decoder_args"].ce_loss_scale = 1.0 - ctc_scale - exp_name += f"_ctcScale{ctc_scale}" - - run_exp( - exp_name + "_gpu4", - args, - num_epochs=ep, - epoch_wise_filter=None, - bpe_size=BPE_1K, - partition_epoch=4 * 4, - horovod_num_processes=4, - ) + for sync_step in [100]: + for grad_clip in [None, 1.0, 5.0]: + if grad_clip is None and lr != 8e-4: + continue + + base_v1_args, exp_name = get_base_v1_args( + lr, ep, enc_drop=enc_drop, use_legacy_stats=False + ) + + args = copy.deepcopy(base_v1_args) + args["encoder_args"].num_blocks = num_blocks + args["encoder_args"].mhsa_weight_dropout = weight_drop + args["encoder_args"].ff_weight_dropout = weight_drop 
+ args["encoder_args"].conv_weight_dropout = weight_drop + + args["decoder_args"].embed_dim = target_embed_dim + args["decoder_args"].att_dropout = att_drop + + args["horovod_params"] = { + "horovod_reduce_type": "param", + "horovod_param_sync_step": sync_step, + "horovod_dataset_distribution": "random_seed_offset", + } + + args["batch_size"] = 15_000 * 160 + args["pretrain_opts"]["initial_batch_size"] = 15_000 * 160 + args["accum_grad"] = 1 + + exp_name += f"_weightDrop{weight_drop}_decAttDrop{att_drop}_embedDim{target_embed_dim}_numBlocks{num_blocks}" + if grad_clip: + args["gradient_clip_global_norm"] = grad_clip + exp_name += f"_gradClipNorm{grad_clip}" + + exp_name += f"_paramSync_step{sync_step}_accum1" + + if ctc_scale != 1.0: + args["encoder_args"].ctc_loss_scale = ctc_scale + args["decoder_args"].ce_loss_scale = 1.0 - ctc_scale + exp_name += f"_ctcScale{ctc_scale}" + + run_exp( + exp_name + "_gpu4", + args, + num_epochs=ep, + epoch_wise_filter=None, + bpe_size=BPE_1K, + partition_epoch=4 * 4, + horovod_num_processes=4, + ) # # TODO: mixup # for num_blocks in [12]: From 46ad433ee79ed8ea1bf3f2d662e046999a165291 Mon Sep 17 00:00:00 2001 From: Mohammad Zeineldeen Date: Fri, 24 May 2024 15:28:23 +0000 Subject: [PATCH 060/227] update --- .../tedlium2/configs/ted2_att_baseline.py | 33 +++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/users/zeineldeen/experiments/conformer_att_2023/tedlium2/configs/ted2_att_baseline.py b/users/zeineldeen/experiments/conformer_att_2023/tedlium2/configs/ted2_att_baseline.py index ae8d5256a..3c9f21f2c 100644 --- a/users/zeineldeen/experiments/conformer_att_2023/tedlium2/configs/ted2_att_baseline.py +++ b/users/zeineldeen/experiments/conformer_att_2023/tedlium2/configs/ted2_att_baseline.py @@ -1267,7 +1267,40 @@ def get_base_v1_args(lr, ep, enc_drop=0.1, pretrain_reps=3, use_legacy_stats=Tru partition_epoch=4, ) + if ctc_scale == 0.3: + args_ = copy.deepcopy(args) + args_["with_pretrain"] = False + specaug_steps = 
{"step0": 10_000, "step1": 15_000, "step2": 20_000} + args_["specaug_str_func_opts"] = { + "version": 2, + **specaug_steps, + "max_time_num": 100, + "max_time_dim": 20, + "min_num_add_factor": 0, + "freq_dim_factor": 5, + } + run_exp( + exp_name + "_woPretrain", + args_, + num_epochs=ep, + epoch_wise_filter=[(1, 2, 400), (3, 4, 800)], + bpe_size=BPE_1K, + partition_epoch=4, + ) + args_["encoder_args"].with_ctc = False + run_exp( + exp_name + "_woPretrain_noCTC", + args_, + num_epochs=ep, + epoch_wise_filter=[(1, 2, 400), (3, 4, 800)], + bpe_size=BPE_1K, + partition_epoch=4, + ) + # TODO: multi-gpu + # base_bpe1000_peakLR0.0016_ep200_globalNorm_epochOCLR_pre3_fixZoneout_encDrop0.15_woDepthConvPre_weightDrop0.1_decAttDrop0.0_embedDim256_numBlocks12_gradClipNorm5.0_paramSync_step100_accum1_ctcScale0.3_gpu4 + # 8.66 7.97 avg + for num_blocks in [12]: for ep in [50 * 4]: for lr in [8e-4, 13e-4, 16e-4]: From df85a5fd09238d38f43ae7f9f3923ec0e3b3e4b7 Mon Sep 17 00:00:00 2001 From: Simon Berger Date: Sun, 26 May 2024 01:45:14 +0200 Subject: [PATCH 061/227] Update users/berger --- users/berger/args/returnn/learning_rates.py | 56 +++++++++++++++++ .../20230602_rescale_baselines/__init__.py | 1 + .../config_01_conformer_ctc.py | 19 +++--- users/berger/pytorch/models/conformer_ctc.py | 63 ++++++++++++------- users/berger/pytorch/train_steps/ctc.py | 6 +- .../recognition/generic_seq2seq_search_v2.py | 24 +++---- users/berger/recipe/returnn/forward.py | 3 + users/berger/systems/functors/rasr_base.py | 1 + 8 files changed, 129 insertions(+), 44 deletions(-) diff --git a/users/berger/args/returnn/learning_rates.py b/users/berger/args/returnn/learning_rates.py index a03c5aaff..9845dce64 100644 --- a/users/berger/args/returnn/learning_rates.py +++ b/users/berger/args/returnn/learning_rates.py @@ -10,6 +10,7 @@ class LearningRateSchedules(Enum): NewbobAbs = auto() OCLR = auto() OCLR_STEP = auto() + OCLR_STEP_TORCH = auto() CONST_DECAY = auto() CONST_DECAY_STEP = auto() @@ -38,6 +39,8 
@@ def get_learning_rate_config( config.update(get_oclr_config(**kwargs)) elif schedule == LearningRateSchedules.OCLR_STEP: extra_python.append(get_oclr_function(**kwargs)) + elif schedule == LearningRateSchedules.OCLR_STEP_TORCH: + extra_python.append(get_oclr_function_torch(**kwargs)) elif schedule == LearningRateSchedules.CONST_DECAY: config.update(get_const_decay_config(**kwargs)) elif schedule == LearningRateSchedules.CONST_DECAY_STEP: @@ -153,6 +156,7 @@ def get_oclr_function( **kwargs, ) -> str: initial_lr = initial_lr or peak_lr / 10 + decayed_lr = decayed_lr or initial_lr final_lr = final_lr or initial_lr / 5 cycle_epoch = cycle_epoch or (num_epochs * 9) // 20 # 45% of the training @@ -184,6 +188,58 @@ def get_oclr_function( ) +def get_oclr_function_torch( + num_epochs: int, + n_steps_per_epoch: int, + peak_lr: float = 1e-03, + inc_epochs: Optional[int] = None, + dec_epochs: Optional[int] = None, + initial_lr: Optional[float] = None, + decayed_lr: Optional[float] = None, + final_lr: Optional[float] = None, + **kwargs, +) -> str: + initial_lr = initial_lr or peak_lr / 10 + decayed_lr = decayed_lr or initial_lr + final_lr = final_lr or initial_lr / 5 + inc_epochs = inc_epochs or (num_epochs * 9) // 20 + dec_epochs = dec_epochs or inc_epochs + + return dedent( + f"""def dynamic_learning_rate(*, global_train_step: int, **_): + # Increase linearly from initial_lr to peak_lr over the first inc_epoch epochs + # Decrease linearly from peak_lr to decayed_lr over the next dec_epoch epochs + # Decrease linearly from decayed_lr to final_lr over the remaining epochs + initial_lr = {initial_lr} + peak_lr = {peak_lr} + decayed_lr = {decayed_lr} + final_lr = {final_lr} + inc_epochs = {inc_epochs} + dec_epochs = {dec_epochs} + total_epochs = {num_epochs} + n_steps_per_epoch = {n_steps_per_epoch} + + # -- derived -- # + steps_increase = inc_epochs * n_steps_per_epoch + steps_decay = dec_epochs * n_steps_per_epoch + steps_final = (total_epochs - inc_epochs - dec_epochs) * 
n_steps_per_epoch + + step_size_increase = (peak_lr - initial_lr) / steps_increase + step_size_decay = (peak_lr - decayed_lr) / steps_decay + step_size_final = (decayed_lr - final_lr) / steps_final + + if global_train_step <= steps_increase: + return initial_lr + step_size_increase * global_train_step + if global_train_step <= steps_increase + steps_decay: + return peak_lr - step_size_decay * (global_train_step - steps_increase) + + return max( + decayed_lr - step_size_final * (global_train_step - steps_increase - steps_decay), + final_lr + )""" + ) + + def get_const_decay_config( num_epochs: int, const_lr: float = 1e-03, diff --git a/users/berger/configs/tedlium2/20230602_rescale_baselines/__init__.py b/users/berger/configs/tedlium2/20230602_rescale_baselines/__init__.py index d8ebae699..a6c351e5a 100644 --- a/users/berger/configs/tedlium2/20230602_rescale_baselines/__init__.py +++ b/users/berger/configs/tedlium2/20230602_rescale_baselines/__init__.py @@ -19,6 +19,7 @@ def worker_wrapper(job, task_name, call): "AdvancedTreeSearchLmImageAndGlobalCacheJob", "FeatureExtractionJob", "GenericSeq2SeqSearchJob", + "GenericSeq2SeqSearchJobV2", "GenericSeq2SeqLmImageAndGlobalCacheJob", "CreateLmImageJob", "BuildGenericSeq2SeqGlobalCacheJob", diff --git a/users/berger/configs/tedlium2/20230602_rescale_baselines/config_01_conformer_ctc.py b/users/berger/configs/tedlium2/20230602_rescale_baselines/config_01_conformer_ctc.py index c9513483b..7420c202f 100644 --- a/users/berger/configs/tedlium2/20230602_rescale_baselines/config_01_conformer_ctc.py +++ b/users/berger/configs/tedlium2/20230602_rescale_baselines/config_01_conformer_ctc.py @@ -1,5 +1,6 @@ import copy import os +from i6_models.config import ModuleFactoryV1 from i6_core.returnn.config import ReturnnConfig from sisyphus import gs, tk @@ -16,6 +17,7 @@ ReturnnSeq2SeqSystem, ) from i6_experiments.users.berger.util import default_tools_v2 +from i6_experiments.users.berger.pytorch.custom_parts.identity import 
IdentityConfig, IdentityModule # ********** Settings ********** @@ -38,18 +40,20 @@ def returnn_config_generator( extra_config = { "train": train_data_config, "dev": dev_data_config, - "max_seq_length": {"audio_features": 560000}, - "torch_amp": {"dtype": "bfloat16"}, } + if variant == ConfigVariant.TRAIN: + extra_config["max_seq_length"] = {"audio_features": 560000} + extra_config["torch_amp"] = {"dtype": "bfloat16"} if variant == ConfigVariant.RECOG: extra_config["extern_data"] = { - "sources": {"dim": 80, "dtype": "float32"}, + "data": {"dim": 80, "dtype": "float32"}, } extra_config["model_outputs"] = { "log_probs": { "dim": num_outputs, } } + model_config.feature_extraction = ModuleFactoryV1(IdentityModule, IdentityConfig()) return get_returnn_config( num_epochs=num_subepochs, @@ -62,12 +66,13 @@ def returnn_config_generator( grad_noise=kwargs.get("grad_noise", 0.0), grad_clip=0.0, optimizer=Optimizers.AdamW, - schedule=LearningRateSchedules.OCLR, + schedule=LearningRateSchedules.OCLR_STEP_TORCH, max_seqs=60, initial_lr=7e-06, peak_lr=7e-04, decayed_lr=7e-05, final_lr=1e-08, + n_steps_per_epoch=480, batch_size=36000 * 160, use_chunking=False, extra_config=extra_config, @@ -129,9 +134,9 @@ def run_exp(num_subepochs: int = 250) -> SummaryReport: recog_args = exp_args.get_ctc_recog_step_args( num_classes=num_outputs, epochs=[ep for ep in [80, 160, 320, 640, 1280, num_subepochs] if ep <= num_subepochs], - prior_scales=[0.5], - lm_scales=[1.1], - feature_type=FeatureType.SAMPLES, + prior_scales=[0.3, 0.5, 0.7], + lm_scales=[0.7, 0.9, 1.1, 1.3], + feature_type=FeatureType.LOGMEL_16K, search_stats=True, seq2seq_v2=True, ) diff --git a/users/berger/pytorch/models/conformer_ctc.py b/users/berger/pytorch/models/conformer_ctc.py index d85b144ee..7becad467 100644 --- a/users/berger/pytorch/models/conformer_ctc.py +++ b/users/berger/pytorch/models/conformer_ctc.py @@ -13,6 +13,7 @@ RasrCompatibleLogMelFeatureExtractionV1, RasrCompatibleLogMelFeatureExtractionV1Config, ) 
+from i6_models.parts.frontend.vgg_act import VGG4LayerActFrontendV1, VGG4LayerActFrontendV1Config from i6_models.parts.frontend.generic_frontend import ( GenericFrontendV1, GenericFrontendV1Config, @@ -330,36 +331,56 @@ def get_default_config_v3(num_outputs: int) -> ConformerCTCConfig: specaugment = ModuleFactoryV1( module_class=SpecaugmentByLengthModuleV1, cfg=SpecaugmentByLengthConfigV1( - time_min_num_masks=1, + time_min_num_masks=2, time_max_mask_per_n_frames=25, time_mask_max_size=20, - freq_min_num_masks=1, + freq_min_num_masks=2, freq_max_num_masks=16, freq_mask_max_size=5, ), ) + # frontend = ModuleFactoryV1( + # GenericFrontendV1, + # GenericFrontendV1Config( + # in_features=80, + # layer_ordering=[ + # FrontendLayerType.Conv2d, + # FrontendLayerType.Conv2d, + # FrontendLayerType.Pool2d, + # FrontendLayerType.Conv2d, + # FrontendLayerType.Conv2d, + # FrontendLayerType.Pool2d, + # FrontendLayerType.Activation, + # ], + # conv_kernel_sizes=[(3, 3), (3, 3), (3, 3), (3, 3)], + # conv_paddings=None, + # conv_out_dims=[32, 64, 64, 32], + # conv_strides=[(1, 1), (1, 1), (1, 1), (1, 1)], + # pool_kernel_sizes=[(2, 1), (2, 1)], + # pool_strides=None, + # pool_paddings=None, + # activations=[torch.nn.ReLU()], + # out_features=384, + # ), + # ) frontend = ModuleFactoryV1( - GenericFrontendV1, - GenericFrontendV1Config( + VGG4LayerActFrontendV1, + VGG4LayerActFrontendV1Config( in_features=80, - layer_ordering=[ - FrontendLayerType.Conv2d, - FrontendLayerType.Conv2d, - FrontendLayerType.Pool2d, - FrontendLayerType.Conv2d, - FrontendLayerType.Conv2d, - FrontendLayerType.Pool2d, - FrontendLayerType.Activation, - ], - conv_kernel_sizes=[(3, 3), (3, 3), (3, 3), (3, 3)], - conv_paddings=None, - conv_out_dims=[32, 64, 64, 32], - conv_strides=[(1, 1), (1, 1), (1, 1), (1, 1)], - pool_kernel_sizes=[(2, 1), (2, 1)], - pool_strides=None, - pool_paddings=None, - activations=[torch.nn.ReLU()], + conv1_channels=32, + conv2_channels=64, + conv3_channels=64, + conv4_channels=32, + 
conv_kernel_size=(3, 3), + conv_padding=None, + pool1_kernel_size=(2, 1), + pool1_stride=(2, 1), + pool1_padding=None, + pool2_kernel_size=(2, 1), + pool2_stride=(2, 1), + pool2_padding=None, + activation=torch.nn.ReLU(), out_features=384, ), ) diff --git a/users/berger/pytorch/train_steps/ctc.py b/users/berger/pytorch/train_steps/ctc.py index 932fb0276..4e1419607 100644 --- a/users/berger/pytorch/train_steps/ctc.py +++ b/users/berger/pytorch/train_steps/ctc.py @@ -1,12 +1,10 @@ import torch from returnn.tensor.tensor_dict import TensorDict -from ..helper_functions import map_tensor_to_minus1_plus1_interval def train_step(*, model: torch.nn.Module, extern_data: TensorDict, **_): - audio_features = extern_data["data"].raw_tensor + audio_features = extern_data["data"].raw_tensor.float() audio_features = audio_features.squeeze(-1) - audio_features = map_tensor_to_minus1_plus1_interval(audio_features) assert extern_data["data"].dims[1].dyn_size_ext is not None audio_features_len = extern_data["data"].dims[1].dyn_size_ext.raw_tensor @@ -37,8 +35,8 @@ def train_step(*, model: torch.nn.Module, extern_data: TensorDict, **_): zero_infinity=True, ) - from returnn.tensor import batch_dim import returnn.frontend as rf + from returnn.tensor import batch_dim rf.get_run_ctx().mark_as_loss( name="CTC", loss=loss, custom_inv_norm_factor=rf.reduce_sum(targets_len_rf, axis=batch_dim) diff --git a/users/berger/recipe/recognition/generic_seq2seq_search_v2.py b/users/berger/recipe/recognition/generic_seq2seq_search_v2.py index c469f615d..134b9c0d9 100644 --- a/users/berger/recipe/recognition/generic_seq2seq_search_v2.py +++ b/users/berger/recipe/recognition/generic_seq2seq_search_v2.py @@ -102,17 +102,8 @@ def create_config( if label_tree.lexicon_config is not None: config["flf-lattice-tool.lexicon"]._update(label_tree.lexicon_config) - # Apply config from label scorer and eliminate unnecessary arguments that don't affect the search space (scale, prior) - label_scorer_reduced = 
LabelScorer( - scorer_type=label_scorer.scorer_type, - scale=1.0, - label_file=label_scorer.label_file, - num_classes=label_scorer.num_classes, - use_prior=False, - extra_args={key: val for key, val in label_scorer.extra_args.items() if key != "first-order"}, - ) - - label_scorer_reduced.apply_config("flf-lattice-tool.network.recognizer.label-scorer", config, post_config) + # Apply config from label scorer + label_scorer.apply_config("flf-lattice-tool.network.recognizer.label-scorer", config, post_config) # search settings # search_config = rasr.RasrConfig() @@ -415,8 +406,17 @@ def create_config( post_config["*"].output_channel.unbuffered = True if global_cache is None: + # Eliminate unnecessary arguments that don't affect the search space (scale, prior) + label_scorer_reduced = LabelScorer( + scorer_type=label_scorer.scorer_type, + scale=1.0, + label_file=label_scorer.label_file, + num_classes=label_scorer.num_classes, + use_prior=False, + extra_args={key: val for key, val in label_scorer.extra_args.items() if key != "first-order"}, + ) global_cache = BuildGenericSeq2SeqGlobalCacheJob( - crp=crp, label_tree=label_tree, label_scorer=label_scorer + crp=crp, label_tree=label_tree, label_scorer=label_scorer_reduced ).out_global_cache post_config.flf_lattice_tool.global_cache.read_only = True diff --git a/users/berger/recipe/returnn/forward.py b/users/berger/recipe/returnn/forward.py index 03493bba0..69f867f16 100644 --- a/users/berger/recipe/returnn/forward.py +++ b/users/berger/recipe/returnn/forward.py @@ -75,6 +75,9 @@ def run(self): self.out_returnn_config_file.get_path(), ] ) + util.shutil.move("prior.txt", self.out_prior_txt_file.get_path()) + util.shutil.move("prior.xml", self.out_prior_xml_file.get_path()) + util.shutil.move("prior.png", self.out_prior_png_file.get_path()) @classmethod def create_returnn_config( diff --git a/users/berger/systems/functors/rasr_base.py b/users/berger/systems/functors/rasr_base.py index b14fac1dd..7915f0717 100644 --- 
a/users/berger/systems/functors/rasr_base.py +++ b/users/berger/systems/functors/rasr_base.py @@ -266,6 +266,7 @@ def _get_prior_file( returnn_config=prior_config, returnn_root=self.returnn_root, returnn_python_exe=self.returnn_python_exe, + mem_rqmt=8, ) return forward_job.out_prior_xml_file else: From fe595759d1cbe0460ad529510acedf4948517d77 Mon Sep 17 00:00:00 2001 From: Albert Zeyer Date: Sat, 25 May 2024 23:18:11 +0200 Subject: [PATCH 062/227] more --- users/zeyer/experiments/exp2024_04_23_baselines/ctc.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py b/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py index 1b07cc885..41be5f7af 100644 --- a/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py +++ b/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py @@ -98,7 +98,8 @@ def py(): ) for alpha in [ - 0.3, + # 0.3, # very bad? + 0.7, ]: train_exp( "v6-bhv20-11gb-f32-bs15k-accgrad1-mgpu4-pavg100-wd1e_2-lrlin1e_5_295k-speedpertV2-spm_bpe10k" From fde3754e772f5c7d63528ac524405ff45284a3cd Mon Sep 17 00:00:00 2001 From: Albert Zeyer Date: Mon, 27 May 2024 09:27:01 +0200 Subject: [PATCH 063/227] more --- users/zeyer/experiments/exp2024_04_23_baselines/aed.py | 6 +++--- users/zeyer/experiments/exp2024_04_23_baselines/ctc.py | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/users/zeyer/experiments/exp2024_04_23_baselines/aed.py b/users/zeyer/experiments/exp2024_04_23_baselines/aed.py index 941f3086a..115e4f2c9 100644 --- a/users/zeyer/experiments/exp2024_04_23_baselines/aed.py +++ b/users/zeyer/experiments/exp2024_04_23_baselines/aed.py @@ -90,11 +90,11 @@ def py(): # alpha=0.1 seems too aggressive for AED, bad convergence for alpha in [ 0.3, # 5.26 - 0.5, - 0.6, + 0.5, # 5.13 + 0.6, # 5.13 0.7, # 4.98 (!!) 
0.8, - 0.9, + 0.9, # 5.18 1.0, # sanity check ]: train_exp( diff --git a/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py b/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py index 41be5f7af..4403ca6a0 100644 --- a/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py +++ b/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py @@ -58,7 +58,7 @@ def py(): ) for vocab in [ - "spm20k", + "spm20k", # 7.44 "bpe10k", # 8.23 "spm10k", # 8.12 "spm_bpe10k", # 7.97 @@ -81,7 +81,7 @@ def py(): for alpha in [ 0.3, # 7.88 0.5, - 0.7, + 0.7, # 6.99 ]: train_exp( "v6-bhv20-11gb-f32-bs15k-accgrad1-mgpu4-pavg100-wd1e_2-lrlin1e_5_295k-speedpertV2-spm10k" From 1d3a6263fc15a6abe8bac395bd662cd5151e5132 Mon Sep 17 00:00:00 2001 From: schmitt Date: Thu, 23 May 2024 15:51:53 +0200 Subject: [PATCH 064/227] update --- .../returnn/network_builder_rf/segmental/recog.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/recog.py b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/recog.py index 730e67383..dfdf2a8c3 100644 --- a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/recog.py +++ b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/recog.py @@ -223,8 +223,6 @@ def model_recog( emit_log_prob = rf.log(rf.sigmoid(blank_logits)) emit_log_prob = rf.squeeze(emit_log_prob, axis=emit_log_prob.feature_dim) blank_log_prob = rf.log(rf.sigmoid(-blank_logits)) - # update blank decoder state - blank_decoder_state = tree.map_structure(lambda s: rf.gather(s, indices=backrefs), blank_decoder_state) # ------------------- combination ------------------- @@ -259,10 +257,15 @@ def model_recog( # mask for updating label-sync states 
update_state_mask = rf.convert_to_tensor(target != model.blank_idx) + # ------------------- update blank decoder state ------------------- + + if not model.use_joint_model: + blank_decoder_state = tree.map_structure(lambda s: rf.gather(s, indices=backrefs), blank_decoder_state) + # ------------------- update label decoder state ------------------- if model.use_joint_model: - label_decoder_state = tree.map_structure(lambda s: rf.gather(s, indices=backrefs), label_decoder_state) + label_decoder_state = tree.map_structure(lambda s: rf.gather(s, indices=backrefs), label_decoder_state_updated) else: def _get_masked_state(old, new, mask): old = rf.gather(old, indices=backrefs, axis=old_beam_dim) From 29d467ad10cbe4d3658db0be4b66c1ddf9c86d3c Mon Sep 17 00:00:00 2001 From: schmitt Date: Tue, 28 May 2024 10:36:54 +0200 Subject: [PATCH 065/227] update --- .../notes/commands | 2 +- .../global_vs_segmental_2022_23/train_new.py | 5 +- .../returnn/config_builder_rf/base.py | 70 +++--- .../returnn/network_builder_rf/base.py | 44 +++- .../network_builder_rf/global_/decoder.py | 65 ++++- .../network_builder_rf/global_/model.py | 31 ++- .../network_builder_rf/global_/train.py | 101 ++++++-- .../network_builder_rf/segmental/model.py | 51 ++-- .../segmental/model_new/blank_model/train.py | 15 ++ .../segmental/model_new/label_model/model.py | 91 ++++--- .../segmental/model_new/label_model/train.py | 237 +++++++++++++++--- .../network_builder_rf/segmental/recog.py | 25 +- .../network_builder_rf/segmental/train.py | 228 ++++++++++++++--- .../network_builder_rf/segmental/utils.py | 10 +- .../center_window_att/baseline_v1/__init__.py | 4 +- .../center_window_att/baseline_v1/baseline.py | 9 +- .../center_window_att/baseline_v3/__init__.py | 40 ++- .../center_window_att/baseline_v3/baseline.py | 9 +- .../center_window_att/baseline_v4/__init__.py | 22 +- .../center_window_att/baseline_v4/baseline.py | 13 +- .../center_window_att/config_builder.py | 17 +- .../center_window_att/train.py | 46 
+++- .../global_att/baseline_v1/__init__.py | 77 ++++-- .../global_att/baseline_v1/baseline.py | 47 +++- .../pipeline_ls_conf/global_att/train.py | 51 ++-- 25 files changed, 1021 insertions(+), 289 deletions(-) diff --git a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23/notes/commands b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23/notes/commands index 1b134a42f..0e7d139c5 100644 --- a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23/notes/commands +++ b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23/notes/commands @@ -1,4 +1,4 @@ -apptainer shell -B /work/asr4 -B /work/asr3 -B /work/common -B /work/tools -B /u/berger -B /u/zeineldeen -B /u/rossenbach -B /u/beck -B /work/speech/tuske -B /u/zeyer -B /u/schmitt -B /u/atanas.gruev -B /u/zhou +apptainer shell --nv -B /work/asr4 -B /work/asr3 -B /work/common -B /work/tools -B /u/berger -B /u/zeineldeen -B /u/rossenbach -B /u/beck -B /work/speech/tuske -B /u/zeyer -B /u/schmitt -B /u/atanas.gruev -B /u/zhou /work/asr4/berger/apptainer/images/i6_tensorflow-2.8_onnx-1.15.sif /work/asr4/berger/apptainer/images/i6_torch-2.2_onnx-1.16.sif export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/lib/python3.8/dist-packages/scipy/.libs \ No newline at end of file diff --git a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23/train_new.py b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23/train_new.py index e903aba56..4ca77a54c 100644 --- a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23/train_new.py +++ b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23/train_new.py @@ -71,7 +71,10 @@ def run_train(self) -> Tuple[Dict[int, Checkpoint], Path, Path]: returnn_python_exe=config_builder.variant_params["returnn_python_exe"], returnn_root=config_builder.variant_params["returnn_root"], mem_rqmt=self.train_rqmt.get("mem", 24), - 
time_rqmt=self.train_rqmt.get("time", 30) + time_rqmt=self.train_rqmt.get("time", 30), + cpu_rqmt=self.train_rqmt.get("cpu", 4), + horovod_num_processes=self.train_rqmt.get("horovod_num_processes", None), + distributed_launch_cmd=self.train_rqmt.get("distributed_launch_cmd", "mpirun"), ) train_job.add_alias(self.alias) tk.register_output(train_job.get_one_alias() + "/models", train_job.out_model_dir) diff --git a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/config_builder_rf/base.py b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/config_builder_rf/base.py index b94dca43a..20af5a6d8 100644 --- a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/config_builder_rf/base.py +++ b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/config_builder_rf/base.py @@ -31,6 +31,7 @@ def __init__( variant_params: Dict, model_def: ModelDef, get_model_func: Callable, + use_att_ctx_in_state: bool = True, ): self.variant_params = variant_params self.model_def = model_def @@ -45,18 +46,19 @@ def __init__( self.config_dict = dict( backend="torch", log_batch_size=True, - # truncation=-1, torch_log_memory_usage=True, debug_print_layer_output_template=True, max_seqs=200, - # gradient_clip=0.0, - # gradient_noise=0.0, optimizer={"class": "adamw", "epsilon": 1e-8, "weight_decay": 1e-6}, accum_grad_multiple_step=4, default_input="data", target="targets", ) + self.use_att_ctx_in_state = use_att_ctx_in_state + if not use_att_ctx_in_state: + self.config_dict["use_att_ctx_in_state"] = use_att_ctx_in_state + self.python_prolog = [] def get_train_config(self, opts: Dict): @@ -65,27 +67,12 @@ def get_train_config(self, opts: Dict): python_prolog = copy.deepcopy(self.python_prolog) python_epilog = copy.deepcopy(self.python_epilog) - dataset_opts = opts.get("dataset_opts", {}) + dataset_opts = 
opts.pop("dataset_opts", {}) config_dict.update(self.get_train_datasets(dataset_opts=dataset_opts)) extern_data_raw = self.get_extern_data_dict(dataset_opts) extern_data_raw = instanciate_delayed(extern_data_raw) - if opts.get("preload_from_files"): - config_dict["preload_from_files"] = opts["preload_from_files"] - elif opts.get("import_model_train_epoch1"): - config_dict.update({ - "import_model_train_epoch1": opts["import_model_train_epoch1"], - "load_ignore_missing_vars": True, - }) - - config_dict.update(dict( - batching=opts.get("batching", "laplace:.1000"), - aux_loss_layers=opts.get("aux_loss_layers", [4, 8]), - accum_grad_multiple_step=opts.get("accum_grad_multiple_step", config_dict["accum_grad_multiple_step"]), - optimizer=opts.get("optimizer", config_dict["optimizer"]), - )) - - if opts.get("dataset_opts", {}).get("use_speed_pert"): + if dataset_opts.pop("use_speed_pert", None): python_prolog += [ "import sys", 'sys.path.append("/work/asr4/zeineldeen/py_envs/py_3.10_tf_2.9/lib/python3.10/site-packages")' @@ -93,10 +80,22 @@ def get_train_config(self, opts: Dict): config_dict["speed_pert"] = speed_pert if opts.get("cleanup_old_models"): - post_config_dict["cleanup_old_models"] = opts["cleanup_old_models"] + post_config_dict["cleanup_old_models"] = opts.pop("cleanup_old_models") - config_dict.update(self.get_lr_settings(lr_opts=opts["lr_opts"], python_epilog=python_epilog)) - config_dict["batch_size"] = opts.get("batch_size", 15_000) * self.batch_size_factor + config_dict.update(self.get_lr_settings(lr_opts=opts.pop("lr_opts"), python_epilog=python_epilog)) + config_dict["batch_size"] = opts.pop("batch_size", 15_000) * self.batch_size_factor + + train_def = opts.pop("train_def") + train_step_func = opts.pop("train_step_func") + + remaining_opt_keys = [ + "aux_loss_layers", "preload_from_files", "accum_grad_multiple_step", "optimizer", "batching", + "torch_distributed", "pos_emb_dropout", "rf_att_dropout_broadcast", "grad_scaler", 
"gradient_clip_global_norm", + "spec_augment_steps", "torch_amp" + ] + config_dict.update( + {k: opts.pop(k) for k in remaining_opt_keys if k in opts} + ) python_epilog.append( serialization.Collection( @@ -107,8 +106,8 @@ def get_train_config(self, opts: Dict): ), *serialize_model_def(self.model_def), serialization.Import(self.get_model_func, import_as="get_model"), - serialization.Import(opts["train_def"], import_as="_train_def", ignore_import_as_for_hash=True), - serialization.Import(opts["train_step_func"], import_as="train_step"), + serialization.Import(train_def, import_as="_train_def", ignore_import_as_for_hash=True), + serialization.Import(train_step_func, import_as="train_step"), serialization.PythonEnlargeStackWorkaroundNonhashedCode, serialization.PythonCacheManagerFunctionNonhashedCode, serialization.PythonModelineNonhashedCode @@ -241,9 +240,9 @@ def get_lr_settings(self, lr_opts, python_epilog: Optional[List] = None): elif lr_opts["type"] == "dyn_lr_lin_warmup_invsqrt_decay": return dict( dynamic_learning_rate=dynamic_lr.dyn_lr_lin_warmup_invsqrt_decay, - learning_rate_warmup_steps=40_000, - learning_rate_invsqrt_norm=40_000, - learning_rate=2.5e-3, + learning_rate_warmup_steps=lr_opts["learning_rate_warmup_steps"], + learning_rate_invsqrt_norm=lr_opts["learning_rate_invsqrt_norm"], + learning_rate=lr_opts["learning_rate"], ) elif lr_opts["type"] == "const": const_lr = lr_opts["const_lr"] @@ -443,13 +442,20 @@ def batch_size_factor(self): class GlobalAttConfigBuilderRF(LibrispeechConformerConfigBuilderRF): - def __init__(self, **kwargs): + def __init__( + self, + use_weight_feedback: bool = True, + **kwargs + ): super(GlobalAttConfigBuilderRF, self).__init__(**kwargs) self.config_dict.update(dict( max_seq_length_default_target=75, )) + if not use_weight_feedback: + self.config_dict["use_weight_feedback"] = use_weight_feedback + def get_extern_data_dict(self, dataset_opts: Dict): extern_data_dict = super(GlobalAttConfigBuilderRF, 
self).get_extern_data_dict(dataset_opts) extern_data_dict["targets"]["vocab"] = { @@ -467,10 +473,10 @@ class SegmentalAttConfigBuilderRF(LibrispeechConformerConfigBuilderRF): def __init__( self, center_window_size: int, - label_decoder_version: int, blank_decoder_version: Optional[int] = None, use_joint_model: bool = False, use_weight_feedback: bool = True, + label_decoder_state: str = "nb-lstm", **kwargs ): super(SegmentalAttConfigBuilderRF, self).__init__(**kwargs) @@ -482,14 +488,14 @@ def __init__( if use_joint_model: assert not blank_decoder_version, "Either use joint model or separate label and blank model" - if label_decoder_version != 1: - self.config_dict["label_decoder_version"] = label_decoder_version if blank_decoder_version is not None and blank_decoder_version != 1: self.config_dict["blank_decoder_version"] = blank_decoder_version if use_joint_model: self.config_dict["use_joint_model"] = use_joint_model if not use_weight_feedback: self.config_dict["use_weight_feedback"] = use_weight_feedback + if label_decoder_state != "nb-lstm": + self.config_dict["label_decoder_state"] = label_decoder_state def get_train_config(self, opts: Dict): train_config = super(SegmentalAttConfigBuilderRF, self).get_train_config(opts) diff --git a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/base.py b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/base.py index c6f58441c..4151392f2 100644 --- a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/base.py +++ b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/base.py @@ -23,6 +23,8 @@ def __init__( att_num_heads: Dim = Dim(name="att_num_heads", dimension=1), att_dropout: float = 0.1, l2: float = 0.0001, + use_weight_feedback: bool = True, + use_att_ctx_in_state: bool = 
True, ): super(BaseLabelDecoder, self).__init__() @@ -38,9 +40,8 @@ def __init__( self.target_embed = rf.Embedding(target_dim, Dim(name="target_embed", dimension=640)) - self.s = rf.ZoneoutLSTM( - self.target_embed.out_dim + att_num_heads * enc_out_dim, - Dim(name="lstm", dimension=1024), + zoneout_lstm_opts = dict( + out_dim=Dim(name="lstm", dimension=1024), zoneout_factor_cell=0.15, zoneout_factor_output=0.05, use_zoneout_output=False, # like RETURNN/TF ZoneoutLSTM old default @@ -49,12 +50,26 @@ def __init__( parts_order="jifo", # NativeLSTM (the code above converts it...) forget_bias=0.0, # the code above already adds it during conversion ) + self.use_att_ctx_in_state = use_att_ctx_in_state + if use_att_ctx_in_state: + self.s = rf.ZoneoutLSTM( + self.target_embed.out_dim + att_num_heads * enc_out_dim, + **zoneout_lstm_opts, + ) + else: + self.s_wo_att = rf.ZoneoutLSTM( + self.target_embed.out_dim, + **zoneout_lstm_opts, + ) - self.weight_feedback = rf.Linear(att_num_heads, enc_key_total_dim, with_bias=False) - self.s_transformed = rf.Linear(self.s.out_dim, enc_key_total_dim, with_bias=False) + self.use_weight_feedback = use_weight_feedback + if use_weight_feedback: + self.weight_feedback = rf.Linear(att_num_heads, enc_key_total_dim, with_bias=False) + + self.s_transformed = rf.Linear(self.get_lstm().out_dim, enc_key_total_dim, with_bias=False) self.energy = rf.Linear(enc_key_total_dim, att_num_heads, with_bias=False) self.readout_in = rf.Linear( - self.s.out_dim + self.target_embed.out_dim + att_num_heads * enc_out_dim, + self.get_lstm().out_dim + self.target_embed.out_dim + att_num_heads * enc_out_dim, Dim(name="readout", dimension=1024), ) self.output_prob = rf.Linear(self.readout_in.out_dim // 2, target_dim) @@ -66,3 +81,20 @@ def __init__( # Instead, it is intended to make a separate label scorer for it. 
self.language_model = None self.language_model_make_label_scorer = None + + def _update_state( + self, + input_embed: rf.Tensor, + prev_att: rf.Tensor, + prev_s_state: rf.LstmState, + ): + if self.use_att_ctx_in_state: + return self.s(rf.concat_features(input_embed, prev_att), state=prev_s_state, spatial_dim=single_step_dim) + else: + return self.s_wo_att(input_embed, state=prev_s_state, spatial_dim=single_step_dim) + + def get_lstm(self): + if self.use_att_ctx_in_state: + return self.s + else: + return self.s_wo_att diff --git a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/global_/decoder.py b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/global_/decoder.py index d55b163ae..6aed0ba96 100644 --- a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/global_/decoder.py +++ b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/global_/decoder.py @@ -17,20 +17,23 @@ def __init__(self, eos_idx: int, **kwargs): def decoder_default_initial_state(self, *, batch_dims: Sequence[Dim], enc_spatial_dim: Dim) -> rf.State: """Default initial state""" state = rf.State( - s=self.s.default_initial_state(batch_dims=batch_dims), + s=self.get_lstm().default_initial_state(batch_dims=batch_dims), att=rf.zeros(list(batch_dims) + [self.att_num_heads * self.enc_out_dim]), - accum_att_weights=rf.zeros( - list(batch_dims) + [enc_spatial_dim, self.att_num_heads], feature_dim=self.att_num_heads - ), ) state.att.feature_dim_axis = len(state.att.dims) - 1 + + if self.use_weight_feedback: + state.accum_att_weights = rf.zeros( + list(batch_dims) + [enc_spatial_dim, self.att_num_heads], feature_dim=self.att_num_heads + ) + return state def loop_step_output_templates(self, batch_dims: List[Dim]) -> Dict[str, Tensor]: """loop step out""" return 
{ "s": Tensor( - "s", dims=batch_dims + [self.s.out_dim], dtype=rf.get_default_float_dtype(), feature_dim_axis=-1 + "s", dims=batch_dims + [self.get_lstm().out_dim], dtype=rf.get_default_float_dtype(), feature_dim_axis=-1 ), "att": Tensor( "att", @@ -60,15 +63,22 @@ def loop_step( prev_att = state.att - s, state_.s = self.s(rf.concat_features(input_embed, prev_att), state=state.s, spatial_dim=single_step_dim) + # s, state_.s = self.s(rf.concat_features(input_embed, prev_att), state=state.s, spatial_dim=single_step_dim) + s, state_.s = self._update_state(input_embed, prev_att, state.s) + + if self.use_weight_feedback: + weight_feedback = self.weight_feedback(state.accum_att_weights) + else: + weight_feedback = rf.zeros((self.enc_key_total_dim,)) - weight_feedback = self.weight_feedback(state.accum_att_weights) s_transformed = self.s_transformed(s) energy_in = enc_ctx + weight_feedback + s_transformed energy = self.energy(rf.tanh(energy_in)) att_weights = rf.softmax(energy, axis=enc_spatial_dim) - state_.accum_att_weights = state.accum_att_weights + att_weights * inv_fertility * 0.5 + if self.use_weight_feedback: + state_.accum_att_weights = state.accum_att_weights + att_weights * inv_fertility * 0.5 + att0 = rf.dot(att_weights, enc, reduce=enc_spatial_dim, use_mask=False) att0.feature_dim = self.enc_out_dim att, _ = rf.merge_dims(att0, dims=(self.att_num_heads, self.enc_out_dim)) @@ -83,3 +93,42 @@ def decode_logits(self, *, s: Tensor, input_embed: Tensor, att: Tensor) -> Tenso readout = rf.dropout(readout, drop_prob=0.3, axis=self.dropout_broadcast and readout.feature_dim) logits = self.output_prob(readout) return logits + + +class GlobalAttEfficientDecoder(GlobalAttDecoder): + def __init__(self, **kwargs): + super(GlobalAttEfficientDecoder, self).__init__(**kwargs) + + assert not self.use_att_ctx_in_state and not self.use_weight_feedback, ( + "Cannot have alignment dependency for efficient implementation!" 
+ ) + + def __call__( + self, + *, + enc: rf.Tensor, + enc_ctx: rf.Tensor, + enc_spatial_dim: Dim, + s: rf.Tensor, + ) -> rf.Tensor: + s_transformed = self.s_transformed(s) + + weight_feedback = rf.zeros((self.enc_key_total_dim,)) + + energy_in = enc_ctx + weight_feedback + s_transformed + energy = self.energy(rf.tanh(energy_in)) + att_weights = rf.softmax(energy, axis=enc_spatial_dim) + # we do not need use_mask because the softmax output is already padded with zeros + att0 = rf.dot(att_weights, enc, reduce=enc_spatial_dim, use_mask=False) + att0.feature_dim = self.enc_out_dim + att, _ = rf.merge_dims(att0, dims=(self.att_num_heads, self.enc_out_dim)) + + return att + + def decode_logits(self, *, s: Tensor, input_embed: Tensor, att: Tensor) -> Tensor: + """logits for the decoder""" + readout_in = self.readout_in(rf.concat_features(s, input_embed, att, allow_broadcast=True)) + readout = rf.reduce_out(readout_in, mode="max", num_pieces=2, out_dim=self.output_prob.in_dim) + readout = rf.dropout(readout, drop_prob=0.3, axis=self.dropout_broadcast and readout.feature_dim) + logits = self.output_prob(readout) + return logits diff --git a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/global_/model.py b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/global_/model.py index 0d4f54109..ed2961cd8 100644 --- a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/global_/model.py +++ b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/global_/model.py @@ -5,7 +5,10 @@ import returnn.frontend as rf from i6_experiments.users.schmitt.returnn_frontend.model_interfaces.model import ModelDef -from 
i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.network_builder_rf.global_.decoder import GlobalAttDecoder +from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.network_builder_rf.global_.decoder import ( + GlobalAttDecoder, + GlobalAttEfficientDecoder +) from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.network_builder_rf.encoder.global_ import GlobalConformerEncoder from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.network_builder_rf.base import _batch_size_factor, _log_mel_feature_dim from i6_experiments.users.schmitt.returnn_frontend.model_interfaces.supports_label_scorer_torch import RFModelWithMakeLabelScorer @@ -31,6 +34,8 @@ def __init__( dec_att_num_heads: Dim = Dim(name="att_num_heads", dimension=1), enc_dropout: float = 0.1, eos_idx: int, + use_weight_feedback: bool = True, + use_att_ctx_in_state: bool = True, ): super(GlobalAttentionModel, self).__init__() @@ -51,7 +56,12 @@ def __init__( l2=l2, ) - self.label_decoder = GlobalAttDecoder( + if not use_weight_feedback and not use_att_ctx_in_state: + decoder_cls = GlobalAttEfficientDecoder + else: + decoder_cls = GlobalAttDecoder + + self.label_decoder = decoder_cls( enc_out_dim=self.encoder.out_dim, target_dim=target_dim, att_num_heads=dec_att_num_heads, @@ -60,6 +70,8 @@ def __init__( enc_key_total_dim=enc_key_total_dim, l2=l2, eos_idx=eos_idx, + use_weight_feedback=use_weight_feedback, + use_att_ctx_in_state=use_att_ctx_in_state, ) if language_model: @@ -101,6 +113,8 @@ def make_model( num_enc_layers: int = 12, pos_emb_dropout: float = 0.0, language_model: Optional[Dict[str, Any]] = None, + use_weight_feedback: bool = True, + use_att_ctx_in_state: bool = True, **extra, ) -> GlobalAttentionModel: """make""" @@ -140,6 +154,8 @@ def make_model( 
target_dim=target_dim, blank_idx=target_dim.dimension, language_model=lm, + use_weight_feedback=use_weight_feedback, + use_att_ctx_in_state=use_att_ctx_in_state, **extra, ) @@ -179,8 +195,17 @@ def from_scratch_model_def(*, epoch: int, in_dim: Dim, target_dim: Dim) -> Globa # real input is raw audio, internally it does logmel in_dim = Dim(name="logmel", dimension=_log_mel_feature_dim, kind=Dim.Types.Feature) lm_opts = config.typed_value("external_lm") + use_weight_feedback = config.bool("use_weight_feedback", True) + use_att_ctx_in_state = config.bool("use_att_ctx_in_state", True) + return MakeModel.make_model( - in_dim, target_dim, enc_aux_logits=enc_aux_logits or (), pos_emb_dropout=pos_emb_dropout, language_model=lm_opts + in_dim, + target_dim, + enc_aux_logits=enc_aux_logits or (), + pos_emb_dropout=pos_emb_dropout, + language_model=lm_opts, + use_weight_feedback=use_weight_feedback, + use_att_ctx_in_state=use_att_ctx_in_state, ) diff --git a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/global_/train.py b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/global_/train.py index d81959356..a2a1391d3 100644 --- a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/global_/train.py +++ b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/global_/train.py @@ -1,9 +1,15 @@ +from typing import Dict, List, Tuple + from returnn.tensor import TensorDict from returnn.tensor import Tensor, Dim import returnn.frontend as rf from i6_experiments.users.schmitt.returnn_frontend.model_interfaces.training import TrainDef from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.network_builder_rf.global_.model import GlobalAttentionModel +from 
i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.network_builder_rf.global_.decoder import ( + GlobalAttDecoder, + GlobalAttEfficientDecoder +) def _returnn_v2_train_step(*, model, extern_data: TensorDict, **_kwargs_unused): @@ -26,6 +32,63 @@ def _returnn_v2_train_step(*, model, extern_data: TensorDict, **_kwargs_unused): ) +def get_s_and_att( + *, + model: GlobalAttDecoder, + enc_args: Dict[str, Tensor], + input_embeddings: Tensor, + enc_spatial_dim: Dim, + targets_spatial_dim: Dim, + batch_dims: List[Dim] +) -> Tuple[Tensor, Tensor]: + def _body(input_embed: Tensor, state: rf.State): + new_state = rf.State() + loop_out_, new_state.decoder = model.loop_step( + **enc_args, + enc_spatial_dim=enc_spatial_dim, + input_embed=input_embed, + state=state.decoder, + ) + return loop_out_, new_state + + loop_out, _, _ = rf.scan( + spatial_dim=targets_spatial_dim, + xs=input_embeddings, + ys=model.loop_step_output_templates(batch_dims=batch_dims), + initial=rf.State( + decoder=model.decoder_default_initial_state(batch_dims=batch_dims, enc_spatial_dim=enc_spatial_dim), + ), + body=_body, + ) + + return loop_out["s"], loop_out["att"] + + +def get_s_and_att_efficient( + *, + model: GlobalAttEfficientDecoder, + enc_args: Dict[str, Tensor], + input_embeddings: Tensor, + enc_spatial_dim: Dim, + targets_spatial_dim: Dim, + batch_dims: List[Dim] +) -> Tuple[Tensor, Tensor]: + s, _ = model.s_wo_att( + input_embeddings, + state=model.s_wo_att.default_initial_state(batch_dims=batch_dims), + spatial_dim=targets_spatial_dim, + ) + + att = model( + enc=enc_args["enc"], + enc_ctx=enc_args["enc_ctx"], + enc_spatial_dim=enc_spatial_dim, + s=s, + ) + + return s, att + + def from_scratch_training( *, model: GlobalAttentionModel, @@ -43,6 +106,8 @@ def from_scratch_training( aed_loss_scale = config.float("aed_loss_scale", 1.0) use_normalized_loss = config.bool("use_normalized_loss", True) + force_inefficient_loop = 
config.bool("force_inefficient_loop", False) + if data.feature_dim and data.feature_dim.dimension == 1: data = rf.squeeze(data, axis=data.feature_dim) assert not data.feature_dim # raw audio @@ -73,27 +138,27 @@ def from_scratch_training( input_embeddings = model.label_decoder.target_embed(targets) input_embeddings = rf.shift_right(input_embeddings, axis=targets_spatial_dim, pad_value=0.0) - def _body(input_embed: Tensor, state: rf.State): - new_state = rf.State() - loop_out_, new_state.decoder = model.label_decoder.loop_step( - **enc_args, + if type(model.label_decoder) is GlobalAttDecoder or force_inefficient_loop: + s, att = get_s_and_att( + model=model.label_decoder, + enc_args=enc_args, + input_embeddings=input_embeddings, enc_spatial_dim=enc_spatial_dim, - input_embed=input_embed, - state=state.decoder, + targets_spatial_dim=targets_spatial_dim, + batch_dims=batch_dims + ) + else: + assert type(model.label_decoder) is GlobalAttEfficientDecoder + s, att = get_s_and_att_efficient( + model=model.label_decoder, + enc_args=enc_args, + input_embeddings=input_embeddings, + enc_spatial_dim=enc_spatial_dim, + targets_spatial_dim=targets_spatial_dim, + batch_dims=batch_dims ) - return loop_out_, new_state - - loop_out, _, _ = rf.scan( - spatial_dim=targets_spatial_dim, - xs=input_embeddings, - ys=model.label_decoder.loop_step_output_templates(batch_dims=batch_dims), - initial=rf.State( - decoder=model.label_decoder.decoder_default_initial_state(batch_dims=batch_dims, enc_spatial_dim=enc_spatial_dim), - ), - body=_body, - ) - logits = model.label_decoder.decode_logits(input_embed=input_embeddings, **loop_out) + logits = model.label_decoder.decode_logits(input_embed=input_embeddings, s=s, att=att) logits_packed, pack_dim = rf.pack_padded(logits, dims=batch_dims + [targets_spatial_dim], enforce_sorted=False) targets_packed, _ = rf.pack_padded( targets, dims=batch_dims + [targets_spatial_dim], enforce_sorted=False, out_dim=pack_dim diff --git 
a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/model.py b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/model.py index e4cd2de4d..55f63067a 100644 --- a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/model.py +++ b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/model.py @@ -11,8 +11,7 @@ BlankDecoderV3, ) from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.network_builder_rf.segmental.model_new.label_model.model import ( - SegmentalAttLabelDecoder, - SegmentalAttLabelDecoderWoCtxInState + SegmentalAttLabelDecoder, SegmentalAttEfficientLabelDecoder ) from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.network_builder_rf.encoder.global_ import GlobalConformerEncoder from i6_experiments.users.schmitt.returnn_frontend.model_interfaces.supports_label_scorer_torch import RFModelWithMakeLabelScorer @@ -41,10 +40,11 @@ def __init__( encoder_layer_opts: Optional[Dict[str, Any]] = None, dec_att_num_heads: Dim = Dim(name="att_num_heads", dimension=1), enc_dropout: float = 0.1, - label_decoder_version: int = 1, + use_att_ctx_in_state: bool = True, blank_decoder_version: int = 1, use_joint_model: bool = False, use_weight_feedback: bool = True, + label_decoder_state: str = "nb-lstm", ): super(SegmentalAttentionModel, self).__init__() @@ -65,15 +65,17 @@ def __init__( l2=l2, ) - assert label_decoder_version in {1, 2} - assert blank_decoder_version in {1, 3} + assert blank_decoder_version in {1, 3, 4} + assert label_decoder_state in {"nb-lstm", "joint-lstm"} + if not use_joint_model: + assert label_decoder_state == "nb-lstm" - if label_decoder_version 
== 1: - label_decoder_class = SegmentalAttLabelDecoder + if not use_weight_feedback and not use_att_ctx_in_state: + label_decoder_cls = SegmentalAttEfficientLabelDecoder else: - label_decoder_class = SegmentalAttLabelDecoderWoCtxInState + label_decoder_cls = SegmentalAttLabelDecoder - self.label_decoder = label_decoder_class( + self.label_decoder = label_decoder_cls( enc_out_dim=self.encoder.out_dim, target_dim=target_dim, att_num_heads=dec_att_num_heads, @@ -83,6 +85,7 @@ def __init__( l2=l2, center_window_size=center_window_size, use_weight_feedback=use_weight_feedback, + use_att_ctx_in_state=use_att_ctx_in_state, ) if not use_joint_model: @@ -94,6 +97,7 @@ def __init__( encoder_out_dim=self.encoder.out_dim, ) else: + # the logic for blank_decoder_version == 4 is in the train/recog code self.blank_decoder = BlankDecoderV3( length_model_state_dim=length_model_state_dim, label_state_dim=self.label_decoder.get_lstm().out_dim, @@ -113,6 +117,8 @@ def __init__( self.target_dim = self.label_decoder.target_dim self.align_target_dim = align_target_dim self.use_joint_model = use_joint_model + self.blank_decoder_version = blank_decoder_version + self.label_decoder_state = label_decoder_state class MakeModel: @@ -149,10 +155,14 @@ def make_model( num_enc_layers: int = 12, pos_emb_dropout: float = 0.0, language_model: Optional[Dict[str, Any]] = None, - label_decoder_version: int, + use_att_ctx_in_state: bool, blank_decoder_version: int, use_joint_model: bool, use_weight_feedback: bool, + label_decoder_state: str, + enc_out_dim: int, + enc_key_total_dim: int, + enc_ff_dim: int, **extra, ) -> SegmentalAttentionModel: """make""" @@ -172,8 +182,9 @@ def make_model( return SegmentalAttentionModel( enc_in_dim=in_dim, enc_num_layers=num_enc_layers, - enc_out_dim=Dim(name="enc", dimension=512, kind=Dim.Types.Feature), - enc_ff_dim=Dim(name="enc-ff", dimension=2048, kind=Dim.Types.Feature), + enc_out_dim=Dim(name="enc", dimension=enc_out_dim, kind=Dim.Types.Feature), + 
enc_ff_dim=Dim(name="enc-ff", dimension=enc_ff_dim, kind=Dim.Types.Feature), + enc_key_total_dim=Dim(name="enc_key_total_dim", dimension=enc_key_total_dim), enc_num_heads=8, encoder_layer_opts=dict( conv_norm_opts=dict(use_mask=True), @@ -195,10 +206,11 @@ def make_model( length_model_state_dim=Dim(name="length_model_state", dimension=128, kind=Dim.Types.Feature), length_model_embed_dim=Dim(name="length_model_embed", dimension=128, kind=Dim.Types.Feature), center_window_size=center_window_size, - label_decoder_version=label_decoder_version, + use_att_ctx_in_state=use_att_ctx_in_state, blank_decoder_version=blank_decoder_version, use_joint_model=use_joint_model, use_weight_feedback=use_weight_feedback, + label_decoder_state=label_decoder_state, **extra, ) @@ -219,11 +231,16 @@ def from_scratch_model_def( if center_window_size is None: raise ValueError("center_window_size is not set!") - label_decoder_version = config.int("label_decoder_version", 1) + use_att_ctx_in_state = config.bool("use_att_ctx_in_state", True) + label_decoder_state = config.typed_value("label_decoder_state", "nb-lstm") blank_decoder_version = config.int("blank_decoder_version", 1) use_joint_model = config.bool("use_joint_model", False) use_weight_feedback = config.bool("use_weight_feedback", True) + enc_out_dim = config.int("enc_out_dim", 512) + enc_key_total_dim = config.int("enc_key_total_dim", 1024) + enc_ff_dim = config.int("enc_ff_dim", 2048) + return MakeModel.make_model( in_dim, align_target_dim, @@ -232,10 +249,14 @@ def from_scratch_model_def( enc_aux_logits=enc_aux_logits or (), pos_emb_dropout=pos_emb_dropout, language_model=lm_opts, - label_decoder_version=label_decoder_version, + use_att_ctx_in_state=use_att_ctx_in_state, blank_decoder_version=blank_decoder_version, use_joint_model=use_joint_model, use_weight_feedback=use_weight_feedback, + label_decoder_state=label_decoder_state, + enc_out_dim=enc_out_dim, + enc_key_total_dim=enc_key_total_dim, + enc_ff_dim=enc_ff_dim, ) diff --git 
a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/model_new/blank_model/train.py b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/model_new/blank_model/train.py index 1abef2546..101330b4e 100644 --- a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/model_new/blank_model/train.py +++ b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/model_new/blank_model/train.py @@ -126,3 +126,18 @@ def viterbi_training_v3( emit_blank_target_dim=emit_blank_target_dim, pack_dim=pack_dim ) + + +# TODO: implement viterbi_training_v4 +def viterbi_training_v4( + *, + model: BlankDecoderV3, + enc_args: Dict, + enc_spatial_dim: Dim, + label_states_unmasked: rf.Tensor, + label_states_unmasked_spatial_dim: Dim, + emit_ground_truth: rf.Tensor, + emit_blank_target_dim: Dim, + batch_dims: List[Dim], +): + pass diff --git a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/model_new/label_model/model.py b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/model_new/label_model/model.py index 03a3a756d..be2c24c36 100644 --- a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/model_new/label_model/model.py +++ b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/model_new/label_model/model.py @@ -8,16 +8,12 @@ class SegmentalAttLabelDecoder(BaseLabelDecoder): - def __init__(self, center_window_size: int, use_weight_feedback: bool, **kwargs): + def __init__(self, center_window_size: int, **kwargs): 
super(SegmentalAttLabelDecoder, self).__init__(**kwargs) self.center_window_size = center_window_size self.accum_att_weights_dim = Dim(name="accum_att_weights", dimension=center_window_size) - self.use_weight_feedback = use_weight_feedback - if not use_weight_feedback: - delattr(self, "weight_feedback") - def default_initial_state( self, *, @@ -29,9 +25,6 @@ def default_initial_state( state = rf.State( s=self.get_lstm().default_initial_state(batch_dims=batch_dims), att=rf.zeros(list(batch_dims) + [self.att_num_heads * self.enc_out_dim]), - # accum_att_weights=rf.zeros( - # list(batch_dims) + [self.accum_att_weights_dim, self.att_num_heads], feature_dim=self.att_num_heads - # ), segment_starts=rf.zeros(batch_dims, sparse_dim=segment_starts_sparse_dim, dtype="int32"), segment_lens=rf.zeros(batch_dims, sparse_dim=segment_lens_sparse_dim, dtype="int32"), ) @@ -135,17 +128,6 @@ def _get_weight_feedback( return self.weight_feedback(prev_accum_att_weights_sliced) - def _update_state( - self, - input_embed: rf.Tensor, - prev_att: rf.Tensor, - prev_s_state: rf.LstmState, - ): - return self.s(rf.concat_features(input_embed, prev_att), state=prev_s_state, spatial_dim=single_step_dim) - - def get_lstm(self): - return self.s - def loop_step( self, *, @@ -226,29 +208,58 @@ def decode_logits(self, *, s: Tensor, input_embed: Tensor, att: Tensor) -> Tenso return logits -class SegmentalAttLabelDecoderWoCtxInState(SegmentalAttLabelDecoder): +class SegmentalAttEfficientLabelDecoder(SegmentalAttLabelDecoder): def __init__(self, **kwargs): - super(SegmentalAttLabelDecoderWoCtxInState, self).__init__(**kwargs) - - # replace old state with new one - self.s_wo_att = rf.ZoneoutLSTM( - self.target_embed.out_dim, - self.s.out_dim, - zoneout_factor_cell=0.15, - zoneout_factor_output=0.05, - use_zoneout_output=False, - parts_order="jifo", - forget_bias=0.0, + super(SegmentalAttEfficientLabelDecoder, self).__init__(**kwargs) + + assert not self.use_att_ctx_in_state and not 
self.use_weight_feedback, ( + "Cannot have alignment dependency for efficient implementation!" ) - delattr(self, "s") - def _update_state( + def __call__( self, - input_embed: rf.Tensor, - prev_att: rf.Tensor, - prev_s_state: rf.LstmState, - ): - return self.s_wo_att(rf.concat_features(input_embed), state=prev_s_state, spatial_dim=single_step_dim) + *, + enc: rf.Tensor, + enc_ctx: rf.Tensor, + enc_spatial_dim: Dim, + s: rf.Tensor, + segment_starts: rf.Tensor, + segment_lens: rf.Tensor, + ) -> rf.Tensor: + s_transformed = self.s_transformed(s) + + slice_dim = Dim(name="slice", dimension=segment_lens) + gather_positions = rf.range_over_dim(slice_dim) + gather_positions += segment_starts + + # need to move size tensor to GPU since otherwise there is an error in some merge_dims call inside rf.gather + # because two tensors have different devices + # TODO: fix properly in the gather implementation + enc_spatial_dim.dyn_size_ext = rf.copy_to_device(enc_spatial_dim.dyn_size_ext, gather_positions.device) + + enc_ctx_sliced = rf.gather(enc_ctx, axis=enc_spatial_dim, indices=gather_positions, clip_to_valid=True) + enc_sliced = rf.gather(enc, axis=enc_spatial_dim, indices=gather_positions, clip_to_valid=True) + + # move size tensor back to CPU + enc_spatial_dim.dyn_size_ext = rf.copy_to_device(enc_spatial_dim.dyn_size_ext, "cpu") + + weight_feedback = rf.zeros((self.enc_key_total_dim,)) + + energy_in = enc_ctx_sliced + weight_feedback + s_transformed + + energy = self.energy(rf.tanh(energy_in)) + att_weights = rf.softmax(energy, axis=slice_dim) + # we do not need use_mask because the softmax output is already padded with zeros + att0 = rf.dot(att_weights, enc_sliced, reduce=slice_dim, use_mask=False) + att0.feature_dim = self.enc_out_dim + att, _ = rf.merge_dims(att0, dims=(self.att_num_heads, self.enc_out_dim)) - def get_lstm(self): - return self.s_wo_att + return att + + def decode_logits(self, *, s: Tensor, input_embed: Tensor, att: Tensor) -> Tensor: + """logits for the 
decoder""" + readout_in = self.readout_in(rf.concat_features(s, input_embed, att, allow_broadcast=True)) + readout = rf.reduce_out(readout_in, mode="max", num_pieces=2, out_dim=self.output_prob.in_dim) + readout = rf.dropout(readout, drop_prob=0.3, axis=self.dropout_broadcast and readout.feature_dim) + logits = self.output_prob(readout) + return logits diff --git a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/model_new/label_model/train.py b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/model_new/label_model/train.py index 3a547e7bd..94468db9a 100644 --- a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/model_new/label_model/train.py +++ b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/model_new/label_model/train.py @@ -1,7 +1,10 @@ from typing import Optional, Dict, Any, Sequence, Tuple, List -from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.network_builder_rf.segmental.utils import get_non_blank_mask, get_masked -from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.network_builder_rf.segmental.model_new.label_model.model import SegmentalAttLabelDecoder +from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.network_builder_rf.segmental import utils +from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.network_builder_rf.segmental.model_new.label_model.model import ( + SegmentalAttLabelDecoder, + SegmentalAttEfficientLabelDecoder +) from returnn.tensor import Dim, single_step_dim import returnn.frontend as rf @@ 
-9,6 +12,30 @@ from i6_experiments.users.schmitt.returnn_frontend.model_interfaces.training import TrainDef +def _calc_ce_loss_and_fer( + logits: rf.Tensor, + targets: rf.Tensor, + batch_dims: List[Dim], + targets_spatial_dim: Dim, + target_dim: Dim, +): + logits_packed, pack_dim = rf.pack_padded(logits, dims=batch_dims + [targets_spatial_dim], enforce_sorted=False) + non_blank_targets_packed, _ = rf.pack_padded( + targets, dims=batch_dims + [targets_spatial_dim], enforce_sorted=False, out_dim=pack_dim + ) + + log_prob = rf.log_softmax(logits_packed, axis=target_dim) + log_prob = rf.label_smoothed_log_prob_gradient(log_prob, 0.1, axis=target_dim) + loss = rf.cross_entropy( + target=non_blank_targets_packed, estimated=log_prob, estimated_type="log-probs", axis=target_dim + ) + loss.mark_as_loss("non_blank_ce", scale=1.0, use_normalized_loss=True) + + best = rf.reduce_argmax(logits_packed, axis=target_dim) + frame_error = best != non_blank_targets_packed + frame_error.mark_as_loss(name="non_blank_fer", as_error=True) + + def viterbi_training( *, model: SegmentalAttLabelDecoder, @@ -19,8 +46,8 @@ def viterbi_training( segment_starts: rf.Tensor, segment_lens: rf.Tensor, batch_dims: List[Dim], - output_tensors: Optional[List[str]] = None -) -> Optional[Dict[str, Tuple[rf.Tensor, Dim]]]: + return_label_model_states: bool = False, +) -> Optional[Tuple[rf.Tensor, Dim]]: non_blank_input_embeddings = model.target_embed(non_blank_targets) non_blank_input_embeddings_shifted = rf.shift_right( non_blank_input_embeddings, axis=non_blank_targets_spatial_dim, pad_value=0.0) @@ -59,24 +86,9 @@ def _label_loop_body(xs, state: rf.State): ) logits = model.decode_logits(input_embed=non_blank_input_embeddings_shifted, **label_loop_out) - logits_packed, pack_dim = rf.pack_padded(logits, dims=batch_dims + [non_blank_targets_spatial_dim], enforce_sorted=False) - non_blank_targets_packed, _ = rf.pack_padded( - non_blank_targets, dims=batch_dims + [non_blank_targets_spatial_dim], 
enforce_sorted=False, out_dim=pack_dim - ) - - log_prob = rf.log_softmax(logits_packed, axis=model.target_dim) - log_prob = rf.label_smoothed_log_prob_gradient(log_prob, 0.1, axis=model.target_dim) - loss = rf.cross_entropy( - target=non_blank_targets_packed, estimated=log_prob, estimated_type="log-probs", axis=model.target_dim - ) - loss.mark_as_loss("non_blank_ce", scale=1.0, use_normalized_loss=True) - - best = rf.reduce_argmax(logits_packed, axis=model.target_dim) - frame_error = best != non_blank_targets_packed - frame_error.mark_as_loss(name="non_blank_fer", as_error=True) + _calc_ce_loss_and_fer(logits, non_blank_targets, batch_dims, non_blank_targets_spatial_dim, model.target_dim) - if output_tensors is not None: - extended_outputs = {} + if return_label_model_states: # need to run the loop one more time to get the last output (which is not needed for the loss computation) last_embedding = rf.gather( non_blank_input_embeddings, @@ -92,13 +104,180 @@ def _label_loop_body(xs, state: rf.State): segment_lens=final_state.decoder.segment_lens, state=final_state.decoder, ) - for key, val in last_loop_out.items(): - if key not in output_tensors: - continue - extended_outputs[key] = rf.concat( - (label_loop_out[key], non_blank_targets_spatial_dim), - (rf.expand_dim(val, single_step_dim), single_step_dim), - ) - return extended_outputs + return rf.concat( + (label_loop_out["s"], non_blank_targets_spatial_dim), + (rf.expand_dim(last_loop_out["s"], single_step_dim), single_step_dim), + ) + + return None + + +def viterbi_training_efficient( + *, + model: SegmentalAttEfficientLabelDecoder, + enc_args: Dict, + enc_spatial_dim: Dim, + targets: rf.Tensor, + targets_spatial_dim: Dim, + segment_starts: rf.Tensor, + segment_lens: rf.Tensor, + batch_dims: List[Dim], + ce_targets: rf.Tensor, + ce_spatial_dim: Dim, + non_blank_mask: Optional[rf.Tensor] = None, + non_blank_mask_spatial_dim: Optional[Dim] = None, + return_label_model_states: bool = False, +) -> 
Optional[Tuple[rf.Tensor, Dim]]: + input_embeddings = model.target_embed(targets) + input_embeddings_shifted = rf.shift_right( + input_embeddings, axis=targets_spatial_dim, pad_value=0.0) + + label_lstm_out, final_state = model.s_wo_att( + input_embeddings_shifted, + state=model.s_wo_att.default_initial_state(batch_dims=batch_dims), + spatial_dim=targets_spatial_dim, + ) + + if non_blank_mask is not None: + label_lstm_out = utils.get_unmasked( + input=label_lstm_out, + input_spatial_dim=targets_spatial_dim, + mask=non_blank_mask, + mask_spatial_dim=non_blank_mask_spatial_dim, + ) + input_embeddings_shifted = utils.get_unmasked( + input=input_embeddings_shifted, + input_spatial_dim=targets_spatial_dim, + mask=non_blank_mask, + mask_spatial_dim=non_blank_mask_spatial_dim, + ) + + # need to move size tensor to GPU since otherwise there is an error in some merge_dims call inside rf.gather + # because two tensors have different devices + # TODO: fix properly in the gather implementation + targets_spatial_dim.dyn_size_ext = rf.copy_to_device(targets_spatial_dim.dyn_size_ext, label_lstm_out.device) + if non_blank_mask_spatial_dim is not None: + non_blank_mask_spatial_dim.dyn_size_ext = rf.copy_to_device(non_blank_mask_spatial_dim.dyn_size_ext, label_lstm_out.device) + att = model( + enc=enc_args["enc"], + enc_ctx=enc_args["enc_ctx"], + enc_spatial_dim=enc_spatial_dim, + s=label_lstm_out, + segment_starts=segment_starts, + segment_lens=segment_lens, + ) + targets_spatial_dim.dyn_size_ext = rf.copy_to_device(targets_spatial_dim.dyn_size_ext, "cpu") + if non_blank_mask_spatial_dim is not None: + non_blank_mask_spatial_dim.dyn_size_ext = rf.copy_to_device(non_blank_mask_spatial_dim.dyn_size_ext, "cpu") + + logits = model.decode_logits( + input_embed=input_embeddings_shifted, + att=att, + s=label_lstm_out, + ) + + _calc_ce_loss_and_fer(logits, ce_targets, batch_dims, ce_spatial_dim, model.target_dim) + + if return_label_model_states: + # need to run the lstm one more time to 
get the last output (which is not needed for the loss computation) + last_embedding = rf.gather( + input_embeddings, + axis=targets_spatial_dim, + indices=rf.copy_to_device( + targets_spatial_dim.get_size_tensor() - 1, input_embeddings.device), + clip_to_valid=True, + ) + last_lstm_out, _ = model.s_wo_att( + last_embedding, + state=final_state, + spatial_dim=single_step_dim, + ) + return rf.concat( + (label_lstm_out, targets_spatial_dim), + (rf.expand_dim(last_lstm_out, single_step_dim), single_step_dim), + ) + + return None + + +def full_sum_training( + *, + model: SegmentalAttEfficientLabelDecoder, + enc_args: Dict, + enc_spatial_dim: Dim, + non_blank_targets: rf.Tensor, + non_blank_targets_spatial_dim: Dim, + segment_starts: rf.Tensor, + segment_lens: rf.Tensor, + batch_dims: List[Dim], +) -> Optional[Dict[str, Tuple[rf.Tensor, Dim]]]: + # print("full_sum_training") + # print("model", model) + + import torch + from torch.profiler import profile, record_function, ProfilerActivity + + with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], record_shapes=True, profile_memory=True) as prof: + + non_blank_input_embeddings = model.target_embed(non_blank_targets) # [B, S, D] + singleton_dim = Dim(name="singleton", dimension=1) + singleton_zeros = rf.zeros(batch_dims + [singleton_dim, model.target_embed.out_dim]) + non_blank_input_embeddings_shifted, non_blank_targets_spatial_dim_ext = rf.concat( + (singleton_zeros, singleton_dim), + (non_blank_input_embeddings, non_blank_targets_spatial_dim), + allow_broadcast=True + ) # [B, S+1, D] + non_blank_input_embeddings_shifted.feature_dim = non_blank_input_embeddings.feature_dim + + label_lstm_out, _ = model.s_wo_att( + non_blank_input_embeddings_shifted, + state=model.s_wo_att.default_initial_state(batch_dims=batch_dims), + spatial_dim=non_blank_targets_spatial_dim_ext, + ) # [B, S+1, D] + + att = model( + enc=enc_args["enc"], + enc_ctx=enc_args["enc_ctx"], + enc_spatial_dim=enc_spatial_dim, + s=label_lstm_out, 
+ segment_starts=segment_starts, + segment_lens=segment_lens, + ) # [B, S+1, T, D] + + logits = model.decode_logits( + input_embed=non_blank_input_embeddings_shifted, + att=att, + s=label_lstm_out, + ) # [B, S+1, T, D] + + print("logits", logits.raw_tensor.shape) + + logits_packed, pack_dim = rf.pack_padded( + logits, + dims=batch_dims + [enc_spatial_dim, non_blank_targets_spatial_dim_ext], + enforce_sorted=False + ) # [B * T * (S+1), D] + + print("logits_packed", logits_packed.raw_tensor.shape) + + print(prof.key_averages(group_by_input_shape=True).table(sort_by="self_cuda_memory_usage", row_limit=10)) + exit() + + from returnn.extern_private.BergerMonotonicRNNT.monotonic_rnnt.pytorch_binding import monotonic_rnnt_loss + + loss = monotonic_rnnt_loss( + acts=logits_packed.raw_tensor, + labels=non_blank_targets.copy_transpose(batch_dims + [non_blank_targets_spatial_dim]).raw_tensor, + input_lengths=rf.copy_to_device(enc_spatial_dim.dyn_size_ext, logits.device).raw_tensor, + label_lengths=rf.copy_to_device(non_blank_targets_spatial_dim.dyn_size_ext, logits.device).raw_tensor.int(), + blank_label=model.blank_idx, + ) + + # print("loss", loss.shape) + + exit() + + loss = rf.convert_to_tensor(loss, name="full_sum_loss") + loss.mark_as_loss("full_sum_loss", scale=1.0, use_normalized_loss=True) return None diff --git a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/recog.py b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/recog.py index dfdf2a8c3..cd7886330 100644 --- a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/recog.py +++ b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/recog.py @@ -32,31 +32,38 @@ def recombine_seqs( print("seq_log_prob before: ", 
seq_log_prob.raw_tensor) seq_hash_cpu = rf.copy_to_device(seq_hash.copy_transpose([batch_dim, beam_dim]), device="cpu") + # convert from neg log prob to log prob seq_log_prob = rf.copy_to_device(seq_log_prob.copy_transpose([batch_dim, beam_dim]), device="cpu") - for b in range(batch_dim.get_dim_value()): + for b in range(batch_dim.dyn_size_ext.raw_tensor.item()): + # for each batch dim, we need to find the seqs that have the same hash value seq_sets = {} for h in range(beam_dim.dimension): - seq_hash_value = seq_hash_cpu.raw_tensor[b, h] + # hash value of current hypothesis + seq_hash_value = seq_hash_cpu.raw_tensor[b, h].item() if seq_hash_value not in seq_sets: seq_sets[seq_hash_value] = [] + # insert hypothesis index into the list of hypotheses with the same hash value seq_sets[seq_hash_value].append(h) - + # for each set of hypotheses with the same hash value, we keep the one with the highest log prob for seq_set in seq_sets.values(): if len(seq_set) == 1: continue - best_score = 0 + best_score = float("-inf") best_idx = -1 for idx in seq_set: if seq_log_prob.raw_tensor[b, idx] > best_score: best_score = seq_log_prob.raw_tensor[b, idx] best_idx = idx + # print("batch: ", b, "seq_set: ", seq_set, "best_idx: ", best_idx, "best_score: ", best_score) + # exit() for idx in seq_set: if idx != best_idx: - seq_log_prob.raw_tensor[b, idx] = -float("inf") + seq_log_prob.raw_tensor[b, idx] = float("-inf") else: seq_log_prob.raw_tensor[b, idx] = best_score + seq_log_prob = seq_log_prob print("seq_log_prob after: ", seq_log_prob.raw_tensor) exit() @@ -127,6 +134,8 @@ def model_recog( if model.use_joint_model: target = rf.constant(bos_idx, dims=batch_dims_, sparse_dim=model.target_dim) + if model.label_decoder_state == "nb-lstm": + target_non_blank = target.copy() else: target = rf.constant(bos_idx, dims=batch_dims_, sparse_dim=model.align_target_dim) update_state_mask = rf.convert_to_tensor(target != model.blank_idx) @@ -157,7 +166,7 @@ def model_recog( seq_backrefs = [] 
while i < max_seq_len.raw_tensor: if i > 0: - if model.use_joint_model: + if model.label_decoder_state == "joint-lstm": input_embed = model.label_decoder.target_embed(target) else: target_non_blank = rf.where(update_state_mask, target, rf.gather(target_non_blank, indices=backrefs)) @@ -238,7 +247,7 @@ def model_recog( if use_recombination: seq_log_prob = recombine_seqs(seq_targets, seq_log_prob, seq_backrefs, seq_hash, beam_dim, batch_dims[0], i) - if i== 3: + if i == 3: exit() seq_log_prob = seq_log_prob + output_log_prob # Batch, InBeam, Vocab @@ -264,7 +273,7 @@ def model_recog( # ------------------- update label decoder state ------------------- - if model.use_joint_model: + if model.label_decoder_state == "joint-lstm": label_decoder_state = tree.map_structure(lambda s: rf.gather(s, indices=backrefs), label_decoder_state_updated) else: def _get_masked_state(old, new, mask): diff --git a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/train.py b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/train.py index fe38ba5cc..082d8525d 100644 --- a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/train.py +++ b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/train.py @@ -10,6 +10,12 @@ from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.network_builder_rf.segmental.model_new.label_model.train import ( viterbi_training as label_model_viterbi_training ) +from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.network_builder_rf.segmental.model_new.label_model.train import ( + viterbi_training_efficient as label_model_viterbi_training_efficient +) +from 
i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.network_builder_rf.segmental.model_new.label_model.train import ( + full_sum_training as label_model_full_sum_training +) from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.network_builder_rf.segmental.model_new.blank_model.train import ( viterbi_training as blank_model_viterbi_training ) @@ -20,6 +26,10 @@ BlankDecoderV1, BlankDecoderV3, ) +from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.network_builder_rf.segmental.model_new.label_model.model import ( + SegmentalAttLabelDecoder, + SegmentalAttEfficientLabelDecoder +) from returnn.tensor import Dim import returnn.frontend as rf @@ -61,6 +71,7 @@ def viterbi_training( config = get_global_config() # noqa aux_loss_layers = config.typed_value("aux_loss_layers") aux_loss_scales = config.typed_value("aux_loss_scales", ([1.0] * len(aux_loss_layers)) if aux_loss_layers else None) + force_inefficient_loop = config.typed_value("force_inefficient_loop", False) if data.feature_dim and data.feature_dim.dimension == 1: data = rf.squeeze(data, axis=data.feature_dim) @@ -80,25 +91,33 @@ def viterbi_training( ) if model.use_joint_model: - non_blank_targets, non_blank_targets_spatial_dim = None, None + # TODO: use rf.window() instead segment_starts, segment_lens = utils.get_segment_starts_and_lens( - utils.get_non_blank_mask(align_targets, blank_idx=-1), # this way, every frame is interpreted as non-blank - align_targets, - align_targets_spatial_dim, - model, - batch_dims, - align_targets_spatial_dim + non_blank_mask=rf.sequence_mask(align_targets.dims), # this way, every frame is interpreted as non-blank + align_targets=align_targets, + align_targets_spatial_dim=align_targets_spatial_dim, + model=model, + batch_dims=batch_dims, + out_spatial_dim=align_targets_spatial_dim ) # set blank indices 
in alignment to 0 (= EOS index of imported global att model which is not used otherwise) align_targets.raw_tensor[align_targets.raw_tensor == model.target_dim.dimension] = 0 align_targets.sparse_dim = model.target_dim + + non_blank_mask = utils.get_non_blank_mask(align_targets, model.blank_idx) + non_blank_targets, non_blank_targets_spatial_dim = utils.get_masked( + align_targets, non_blank_mask, align_targets_spatial_dim, batch_dims + ) + non_blank_targets.sparse_dim = model.target_dim else: + non_blank_mask = utils.get_non_blank_mask(align_targets, model.blank_idx) non_blank_targets, non_blank_targets_spatial_dim = utils.get_masked( - align_targets, utils.get_non_blank_mask(align_targets, model.blank_idx), align_targets_spatial_dim, batch_dims + align_targets, non_blank_mask, align_targets_spatial_dim, batch_dims ) non_blank_targets.sparse_dim = model.target_dim + segment_starts, segment_lens = utils.get_segment_starts_and_lens( - utils.get_non_blank_mask(align_targets, model.blank_idx), + non_blank_mask, align_targets, align_targets_spatial_dim, model, @@ -133,31 +152,80 @@ def viterbi_training( ) if model.use_joint_model: + # ------------------- joint loop ------------------- - label_model_viterbi_training( - model=model.label_decoder, - enc_args=enc_args, - enc_spatial_dim=enc_spatial_dim, - non_blank_targets=align_targets, - non_blank_targets_spatial_dim=align_targets_spatial_dim, - segment_starts=segment_starts, - segment_lens=segment_lens, - batch_dims=batch_dims, - ) + + # isinstance() does not work here, since SegmentalAttEfficientJointLabelDecoder inherits from SegmentalAttLabelDecoder + if type(model.label_decoder) is SegmentalAttLabelDecoder: + assert model.label_decoder_state == "joint-lstm", "not implemented yet, simple to extend" + label_model_viterbi_training( + model=model.label_decoder, + enc_args=enc_args, + enc_spatial_dim=enc_spatial_dim, + non_blank_targets=align_targets, + non_blank_targets_spatial_dim=align_targets_spatial_dim, + 
segment_starts=segment_starts, + segment_lens=segment_lens, + batch_dims=batch_dims, + ) + else: + assert type(model.label_decoder) is SegmentalAttEfficientLabelDecoder + if model.label_decoder_state == "joint-lstm": + targets = align_targets + targets_spatial_dim = align_targets_spatial_dim + non_blank_mask_ = None + non_blank_mask_spatial_dim = None + else: + targets = non_blank_targets + targets_spatial_dim = non_blank_targets_spatial_dim + non_blank_mask_ = non_blank_mask + non_blank_mask_spatial_dim = align_targets_spatial_dim + + label_model_viterbi_training_efficient( + model=model.label_decoder, + enc_args=enc_args, + enc_spatial_dim=enc_spatial_dim, + targets=targets, + targets_spatial_dim=targets_spatial_dim, + segment_starts=segment_starts, + segment_lens=segment_lens, + non_blank_mask=non_blank_mask_, + non_blank_mask_spatial_dim=non_blank_mask_spatial_dim, + ce_targets=align_targets, + ce_spatial_dim=align_targets_spatial_dim, + batch_dims=batch_dims, + ) else: + # ------------------- label loop ------------------- - label_decoder_outputs = label_model_viterbi_training( - model=model.label_decoder, - enc_args=enc_args, - enc_spatial_dim=enc_spatial_dim, - non_blank_targets=non_blank_targets, - non_blank_targets_spatial_dim=non_blank_targets_spatial_dim, - segment_starts=segment_starts, - segment_lens=segment_lens, - batch_dims=batch_dims, - output_tensors=model.blank_decoder.get_label_decoder_deps(), - ) + if type(model.label_decoder) is SegmentalAttLabelDecoder or force_inefficient_loop: + label_decoder_outputs = label_model_viterbi_training( + model=model.label_decoder, + enc_args=enc_args, + enc_spatial_dim=enc_spatial_dim, + non_blank_targets=non_blank_targets, + non_blank_targets_spatial_dim=non_blank_targets_spatial_dim, + segment_starts=segment_starts, + segment_lens=segment_lens, + batch_dims=batch_dims, + return_label_model_states=model.blank_decoder.get_label_decoder_deps() is not None, + ) + else: + assert type(model.label_decoder) is 
SegmentalAttEfficientLabelDecoder + label_decoder_outputs = label_model_viterbi_training_efficient( + model=model.label_decoder, + enc_args=enc_args, + enc_spatial_dim=enc_spatial_dim, + targets=non_blank_targets, + targets_spatial_dim=non_blank_targets_spatial_dim, + segment_starts=segment_starts, + segment_lens=segment_lens, + batch_dims=batch_dims, + ce_targets=non_blank_targets, + ce_spatial_dim=non_blank_targets_spatial_dim, + return_label_model_states=model.blank_decoder.get_label_decoder_deps() is not None, + ) # ------------------- blank loop ------------------- @@ -174,13 +242,12 @@ def viterbi_training( batch_dims=batch_dims, ) else: - assert isinstance(model.blank_decoder, BlankDecoderV3) - assert "s" in label_decoder_outputs + assert isinstance(model.blank_decoder, BlankDecoderV3) and model.blank_decoder_version in (3,) label_states_unmasked = utils.get_unmasked( - input=label_decoder_outputs["s"][0], - input_spatial_dim=label_decoder_outputs["s"][1], - mask=utils.get_non_blank_mask(align_targets, model.blank_idx), + input=label_decoder_outputs[0], + input_spatial_dim=label_decoder_outputs[1], + mask=non_blank_mask, mask_spatial_dim=align_targets_spatial_dim ) blank_model_viterbi_training_v3( @@ -196,3 +263,94 @@ def viterbi_training( viterbi_training: TrainDef[SegmentalAttentionModel] viterbi_training.learning_rate_control_error_measure = "dev_score_full_sum" + + +def full_sum_training( + *, + model: SegmentalAttentionModel, + data: rf.Tensor, + data_spatial_dim: Dim, + align_targets: rf.Tensor, + align_targets_spatial_dim: Dim +): + assert model.use_joint_model + assert isinstance(model.label_decoder, SegmentalAttEfficientLabelDecoder) + assert model.label_decoder_state == "nb-lstm" + + from returnn.config import get_global_config + + # torch.cuda.memory._record_memory_history(enabled=True) + + config = get_global_config() # noqa + aux_loss_layers = config.typed_value("aux_loss_layers") + aux_loss_scales = config.typed_value("aux_loss_scales", ([1.0] 
* len(aux_loss_layers)) if aux_loss_layers else None) + + if data.feature_dim and data.feature_dim.dimension == 1: + data = rf.squeeze(data, axis=data.feature_dim) + assert not data.feature_dim # raw audio + + batch_dims = data.remaining_dims(data_spatial_dim) + + # set blank indices in alignment to 0 (= EOS index of imported global att model which is not used otherwise) + align_targets.raw_tensor[align_targets.raw_tensor == model.target_dim.dimension] = 0 + align_targets.sparse_dim = model.target_dim + + # get non-blank targets ([B,S]) + non_blank_mask = utils.get_non_blank_mask(align_targets, model.blank_idx) + non_blank_targets, non_blank_targets_spatial_dim = utils.get_masked( + align_targets, non_blank_mask, align_targets_spatial_dim, batch_dims + ) + + # ------------------- encoder aux loss ------------------- + + collected_outputs = {} + enc_args, enc_spatial_dim = model.encoder.encode( + data, in_spatial_dim=data_spatial_dim, collected_outputs=collected_outputs) + + if aux_loss_layers: + for i, layer_idx in enumerate(aux_loss_layers): + if layer_idx > len(model.encoder.layers): + continue + linear = getattr(model, f"enc_aux_logits_{layer_idx}") + aux_logits = linear(collected_outputs[str(layer_idx - 1)]) + aux_loss = rf.ctc_loss( + logits=aux_logits, + targets=non_blank_targets, + input_spatial_dim=enc_spatial_dim, + targets_spatial_dim=non_blank_targets_spatial_dim, + blank_index=model.blank_idx, + ) + aux_loss.mark_as_loss( + f"ctc_{layer_idx}", + scale=aux_loss_scales[i], + custom_inv_norm_factor=align_targets_spatial_dim.get_size_tensor(), + use_normalized_loss=True, + ) + + # for every frame position, get the corresponding window around it ([B,T,W]) + # TODO: use rf.window() instead + segment_starts, segment_lens = utils.get_segment_starts_and_lens( + utils.get_non_blank_mask(align_targets, blank_idx=-1), # this way, every frame is interpreted as non-blank + align_targets, + align_targets_spatial_dim, + model, + batch_dims, + enc_spatial_dim + ) + + # 
------------------- joint loop ------------------- + + label_model_full_sum_training( + model=model.label_decoder, + enc_args=enc_args, + enc_spatial_dim=enc_spatial_dim, + non_blank_targets=non_blank_targets, + non_blank_targets_spatial_dim=non_blank_targets_spatial_dim, + segment_starts=segment_starts, + segment_lens=segment_lens, + batch_dims=batch_dims, + ) + + +full_sum_training: TrainDef[SegmentalAttentionModel] +full_sum_training.learning_rate_control_error_measure = "dev_score_full_sum" diff --git a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/utils.py b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/utils.py index 7b8a7281a..18074b34b 100644 --- a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/utils.py +++ b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/utils.py @@ -48,14 +48,20 @@ def get_unmasked( ): mask_shifted = rf.shift_right(mask, axis=mask_spatial_dim, pad_value=False) mask_axis = mask.get_axis_from_description(mask_spatial_dim) - cumsum = rf.cast(mask_shifted, "int32").copy_template() + + # changelog + # 24.05.24: changed int32->int64 and added 'clip_to_valid=True' since i got an CUDA idx out of bounds error + # when testing a new feature. weirdly, i did not see this error in the log.run.1 file of existing trainings + # using this function. 
+ cumsum = rf.cast(mask_shifted, "int64").copy_template() cumsum.raw_tensor = torch.cumsum( - mask_shifted.raw_tensor.to(torch.int32), dim=mask_axis, dtype=torch.int32 + mask_shifted.raw_tensor.to(torch.int64), dim=mask_axis, dtype=torch.int64 ) return rf.gather( input, indices=cumsum, axis=input_spatial_dim, + clip_to_valid=True ) diff --git a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/center_window_att/baseline_v1/__init__.py b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/center_window_att/baseline_v1/__init__.py index b33815fb9..05080ed11 100644 --- a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/center_window_att/baseline_v1/__init__.py +++ b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/center_window_att/baseline_v1/__init__.py @@ -4,7 +4,6 @@ from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.pipelines.pipeline_ls_conf.center_window_att import ( train, recog ) -from i6_experiments.users.schmitt.custom_load_params import load_missing_params def run_exps(): @@ -26,7 +25,7 @@ def run_exps(): ) for model_alias, config_builder in baseline.center_window_att_baseline_rf( - win_size_list=(5,), decoder_version=2 + win_size_list=(5,), use_att_ctx_in_state=False ): for train_alias, checkpoint in train.train_center_window_att_viterbi_import_global_tf( alias=model_alias, @@ -34,7 +33,6 @@ def run_exps(): n_epochs_list=(10,), const_lr_list=(1e-4, 2e-4, 3e-4), time_rqmt=4, - custom_missing_load_func=load_missing_params ): recog.center_window_returnn_frame_wise_beam_search( alias=train_alias, diff --git a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/center_window_att/baseline_v1/baseline.py 
b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/center_window_att/baseline_v1/baseline.py index 496bae2e0..4034fdd07 100644 --- a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/center_window_att/baseline_v1/baseline.py +++ b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/center_window_att/baseline_v1/baseline.py @@ -6,13 +6,14 @@ def center_window_att_baseline_rf( win_size_list: Tuple[int, ...] = (5, 129), - decoder_version: int = 1, + use_att_ctx_in_state: bool = True, ): for win_size in win_size_list: - alias = f"{base_alias}/baseline_rf/win-size-{win_size}/decoder-version-{decoder_version if decoder_version else 1}" - yield alias, get_center_window_att_config_builder_rf( + alias, config_builder = get_center_window_att_config_builder_rf( win_size=win_size, - label_decoder_version=decoder_version, + use_att_ctx_in_state=use_att_ctx_in_state, blank_decoder_version=1, use_joint_model=False, ) + alias = f"{base_alias}/baseline_rf/{alias}" + yield alias, config_builder diff --git a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/center_window_att/baseline_v3/__init__.py b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/center_window_att/baseline_v3/__init__.py index 89d9f44d5..21f3ce8fc 100644 --- a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/center_window_att/baseline_v3/__init__.py +++ b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/center_window_att/baseline_v3/__init__.py @@ -4,10 +4,28 @@ from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.pipelines.pipeline_ls_conf.center_window_att import ( train, recog ) -from 
i6_experiments.users.schmitt.custom_load_params import load_missing_params + +from i6_core.returnn.training import PtCheckpoint +from sisyphus import Path def run_exps(): + # baseline model for checking consistency of train and recog implementations + for model_alias, config_builder in baseline.center_window_att_baseline_rf( + win_size_list=(5,), + ): + for train_alias, checkpoint in train.train_center_window_att_viterbi_import_global_tf( + alias=model_alias, + config_builder=config_builder, + n_epochs_list=(10,), + const_lr_list=(1e-4,), + ): + recog.center_window_returnn_frame_wise_beam_search( + alias=train_alias, + config_builder=config_builder, + checkpoint=checkpoint, + ) + for model_alias, config_builder in baseline.center_window_att_baseline_rf( win_size_list=(5, 129), ): @@ -24,7 +42,7 @@ def run_exps(): ) for model_alias, config_builder in baseline.center_window_att_baseline_rf( - win_size_list=(5, 129), decoder_version=2 + win_size_list=(5, 129), use_att_ctx_in_state=False, use_weight_feedback=False, ): for train_alias, checkpoint in train.train_center_window_att_viterbi_import_global_tf( alias=model_alias, @@ -39,9 +57,9 @@ def run_exps(): ) for model_alias, config_builder in baseline.center_window_att_baseline_rf( - win_size_list=(5,), decoder_version=2 + win_size_list=(5,) ): - for max_shift, num_iterations in [(1, 1), (2, 1), (1, 2), (2, 2)]: + for max_shift, num_iterations in [(1, 1), (2, 1), (1, 2)]: for train_alias, checkpoint in train.train_center_window_att_viterbi_import_global_tf( alias=model_alias, config_builder=config_builder, @@ -54,3 +72,17 @@ def run_exps(): config_builder=config_builder, checkpoint=checkpoint, ) + + for model_alias, config_builder in baseline.center_window_att_baseline_rf( + win_size_list=(5,), + ): + for train_alias, checkpoint in train.train_center_window_att_viterbi_from_scratch( + alias=model_alias, + config_builder=config_builder, + n_epochs_list=(500,), + ): + recog.center_window_returnn_frame_wise_beam_search( + 
alias=train_alias, + config_builder=config_builder, + checkpoint=checkpoint, + ) diff --git a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/center_window_att/baseline_v3/baseline.py b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/center_window_att/baseline_v3/baseline.py index ef8071026..d6fff6356 100644 --- a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/center_window_att/baseline_v3/baseline.py +++ b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/center_window_att/baseline_v3/baseline.py @@ -6,15 +6,16 @@ def center_window_att_baseline_rf( win_size_list: Tuple[int, ...] = (5, 129), - decoder_version: int = 1, + use_att_ctx_in_state: bool = True, use_weight_feedback: bool = True, ): for win_size in win_size_list: - alias = f"{base_alias}/baseline_rf/win-size-{win_size}/{'w' if use_weight_feedback else 'wo'}-weight-feedback/decoder-version-{decoder_version if decoder_version else 1}" - yield alias, get_center_window_att_config_builder_rf( + alias, config_builder = get_center_window_att_config_builder_rf( win_size=win_size, - label_decoder_version=decoder_version, + use_att_ctx_in_state=use_att_ctx_in_state, blank_decoder_version=3, use_joint_model=False, use_weight_feedback=use_weight_feedback, ) + alias = f"{base_alias}/baseline_rf/{alias}" + yield alias, config_builder diff --git a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/center_window_att/baseline_v4/__init__.py b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/center_window_att/baseline_v4/__init__.py index 07a9ffbcc..d9f80c03e 100644 --- a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/center_window_att/baseline_v4/__init__.py +++ 
b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/center_window_att/baseline_v4/__init__.py @@ -4,23 +4,33 @@ from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.pipelines.pipeline_ls_conf.center_window_att import ( train, recog ) -from i6_experiments.users.schmitt.custom_load_params import load_missing_params def run_exps(): for model_alias, config_builder in baseline.center_window_att_baseline_rf( - win_size_list=(1, 5,), + win_size_list=(5,), label_decoder_state="nb-lstm", use_att_ctx_in_state=False, use_weight_feedback=False, ): for train_alias, checkpoint in train.train_center_window_att_viterbi_import_global_tf( alias=model_alias, config_builder=config_builder, - n_epochs_list=(10,), - time_rqmt=4, + n_epochs_list=(100,), + ): + recog.center_window_returnn_frame_wise_beam_search( + alias=train_alias, + config_builder=config_builder, + checkpoint=checkpoint, + ) + + for model_alias, config_builder in baseline.center_window_att_baseline_rf( + win_size_list=(5,), label_decoder_state="joint-lstm", use_att_ctx_in_state=False, use_weight_feedback=False, + ): + for train_alias, checkpoint in train.train_center_window_att_viterbi_import_global_tf( + alias=model_alias, + config_builder=config_builder, + n_epochs_list=(100,), ): recog.center_window_returnn_frame_wise_beam_search( alias=train_alias, config_builder=config_builder, checkpoint=checkpoint, - checkpoint_aliases=("last",), - pure_torch=False, ) diff --git a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/center_window_att/baseline_v4/baseline.py b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/center_window_att/baseline_v4/baseline.py index 069601d20..8e5566d4f 100644 --- 
a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/center_window_att/baseline_v4/baseline.py +++ b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/center_window_att/baseline_v4/baseline.py @@ -6,13 +6,18 @@ def center_window_att_baseline_rf( win_size_list: Tuple[int, ...] = (5, 129), - decoder_version: int = 1, + use_att_ctx_in_state: bool = True, + label_decoder_state: str = "nb-lstm", + use_weight_feedback: bool = True, ): for win_size in win_size_list: - alias = f"{base_alias}/baseline_rf/win-size-{win_size}/decoder-version-{decoder_version if decoder_version else 1}" - yield alias, get_center_window_att_config_builder_rf( + alias, config_builder = get_center_window_att_config_builder_rf( win_size=win_size, - label_decoder_version=decoder_version, + use_att_ctx_in_state=use_att_ctx_in_state, blank_decoder_version=None, use_joint_model=True, + label_decoder_state=label_decoder_state, + use_weight_feedback=use_weight_feedback, ) + alias = f"{base_alias}/baseline_rf/{alias}" + yield alias, config_builder diff --git a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/center_window_att/config_builder.py b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/center_window_att/config_builder.py index 1f0687758..a4f9b7d68 100644 --- a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/center_window_att/config_builder.py +++ b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/center_window_att/config_builder.py @@ -12,11 +12,12 @@ def get_center_window_att_config_builder_rf( win_size: int, - label_decoder_version: int, + use_att_ctx_in_state: bool, blank_decoder_version: Optional[int], use_joint_model: bool, use_weight_feedback: bool = True, -) -> 
SegmentalAttConfigBuilderRF: + label_decoder_state: str = "nb-lstm", +) -> Tuple[str, SegmentalAttConfigBuilderRF]: variant_params = { "dependencies": LibrispeechBPE10025_CTC_ALIGNMENT, "dataset": { @@ -36,10 +37,18 @@ def get_center_window_att_config_builder_rf( model_def=from_scratch_model_def, get_model_func=_returnn_v2_get_model, center_window_size=win_size, - label_decoder_version=label_decoder_version, + use_att_ctx_in_state=use_att_ctx_in_state, blank_decoder_version=blank_decoder_version, use_joint_model=use_joint_model, use_weight_feedback=use_weight_feedback, + label_decoder_state=label_decoder_state ) - return config_builder + alias = ( + f"win-size-{win_size}/" + f"{'w' if use_weight_feedback else 'wo'}-weight-feedback/" + f"{'w' if use_att_ctx_in_state else 'wo'}-att-ctx-in-state/" + f"{label_decoder_state}" + ) + + return alias, config_builder diff --git a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/center_window_att/train.py b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/center_window_att/train.py index c21b470f0..468a525d2 100644 --- a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/center_window_att/train.py +++ b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/center_window_att/train.py @@ -6,16 +6,17 @@ from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.pipelines.pipeline_ls_conf.checkpoints import ( external_checkpoints, default_import_model_name, - get_center_window_baseline_v1_tf_checkpoint ) +from i6_experiments.users.schmitt.custom_load_params import load_missing_params def train_center_window_att_viterbi_from_scratch( alias: str, config_builder: SegmentalAttConfigBuilderRF, n_epochs_list: Tuple[int, ...], - time_rqmt: int = 168, + time_rqmt: int = 80, ): + batch_size = 15_000 for 
n_epochs in n_epochs_list: alias += "/train_from_scratch/%d-epochs_w-ctc-loss" % (n_epochs,) @@ -24,15 +25,41 @@ def train_center_window_att_viterbi_from_scratch( alias=alias, num_epochs=n_epochs, train_rqmt={ - "time": time_rqmt + "time": time_rqmt, + "horovod_num_processes": 4, + "distributed_launch_cmd": "torchrun" }, train_opts={ "dataset_opts": { "use_speed_pert": False, "epoch_wise_filter": {(1, 5): {"max_mean_len": 1000}} }, - "import_model_train_epoch1": None, - "lr_opts": {"type": "dyn_lr_lin_warmup_invsqrt_decay"}, + # "import_model_train_epoch1": None, + "accum_grad_multiple_step": 4, + "torch_distributed": {}, + "pos_emb_dropout": 0.1, + "rf_att_dropout_broadcast": False, + "batch_size": batch_size, + "batching": "laplace:.1000", + "lr_opts": { + "type": "dyn_lr_piecewise_linear", + "batch_size": batch_size, + "num_epochs": n_epochs, + "learning_rate": 1e-3, + }, + "aux_loss_layers": None, + "specaugment_steps": (5_000, 15_000, 25_000), + "grad_scaler": None, + "gradient_clip_global_norm": 5.0, + "optimizer": { + "class": "adamw", + "weight_decay_modules_blacklist": [ + "rf.Embedding", + "rf.LearnedRelativePositionalEncoding", + ], + "epsilon": 1e-16, + "weight_decay": 1e-6, + }, "train_def": viterbi_training, "train_step_func": _returnn_v2_train_step, } @@ -54,10 +81,15 @@ def train_center_window_att_viterbi_import_global_tf( config_builder: SegmentalAttConfigBuilderRF, n_epochs_list: Tuple[int, ...], const_lr_list: Tuple[float, ...] 
= (1e-4,), - time_rqmt: int = 30, - custom_missing_load_func: Optional[Callable] = None, + time_rqmt: int = 80, alignment_augmentation_opts: Optional[Dict] = None, ): + if not config_builder.use_att_ctx_in_state: + # only randomly init FF weights, since only the input dim of the lstm layer is different + custom_missing_load_func = load_missing_params + else: + custom_missing_load_func = None + for n_epochs in n_epochs_list: for const_lr in const_lr_list: train_alias = alias + f"/train_from_global_att_tf_checkpoint/standard-training/{n_epochs}-epochs_{const_lr}-const-lr_wo-ctc-loss" diff --git a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/global_att/baseline_v1/__init__.py b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/global_att/baseline_v1/__init__.py index 182f46415..31c6c5bab 100644 --- a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/global_att/baseline_v1/__init__.py +++ b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/global_att/baseline_v1/__init__.py @@ -8,33 +8,60 @@ def run_exps(): - for model_alias, config_builder in baseline.global_att_baseline_rf(): - for train_alias, checkpoint in ( - (f"{model_alias}/import-global-tf_no-finetuning", external_checkpoints[default_import_model_name]), - ): - recog.global_att_returnn_label_sync_beam_search( - alias=train_alias, + for use_weight_feedback in (True,): + for model_alias, config_builder in baseline.global_att_baseline_rf(use_weight_feedback=use_weight_feedback): + for train_alias, checkpoint in ( + (f"{model_alias}/import-global-tf_no-finetuning", external_checkpoints[default_import_model_name]), + ): + recog.global_att_returnn_label_sync_beam_search( + alias=train_alias, + config_builder=config_builder, + checkpoint=checkpoint, + checkpoint_aliases=("best-4-avg",), + ) + + for train_alias, 
checkpoint in train.train_import_global_tf( + alias=model_alias, config_builder=config_builder, - checkpoint=checkpoint, - checkpoint_aliases=("best-4-avg",), - ) - recog.global_att_returnn_label_sync_beam_search( - alias=train_alias, + n_epochs_list=(10, 100), + const_lr_list=(1e-4,), + ): + recog.global_att_returnn_label_sync_beam_search( + alias=train_alias, + config_builder=config_builder, + checkpoint=checkpoint, + ) + + for use_weight_feedback in (True,): + for model_alias, config_builder in baseline.global_att_baseline_rf( + use_weight_feedback=use_weight_feedback, + use_att_ctx_in_state=False, + ): + for train_alias, checkpoint in train.train_import_global_tf( + alias=model_alias, config_builder=config_builder, - checkpoint=checkpoint, - checkpoint_aliases=("best-4-avg",), - pure_torch=True, - ) + n_epochs_list=(100,), + const_lr_list=(1e-4,), + ): + recog.global_att_returnn_label_sync_beam_search( + alias=train_alias, + config_builder=config_builder, + checkpoint=checkpoint, + ) - for train_alias, checkpoint in train.train_import_global_tf( - alias=model_alias, - config_builder=config_builder, - n_epochs_list=(10,), - const_lr_list=(1e-4,), - time_rqmt=4, + for use_weight_feedback in (False,): + for model_alias, config_builder in baseline.global_att_baseline_rf( + use_weight_feedback=use_weight_feedback, + use_att_ctx_in_state=False, ): - recog.global_att_returnn_label_sync_beam_search( - alias=train_alias, + for train_alias, checkpoint in train.train_import_global_tf( + alias=model_alias, config_builder=config_builder, - checkpoint=checkpoint, - ) + n_epochs_list=(300,), + const_lr_list=(1e-4,), + ): + recog.global_att_returnn_label_sync_beam_search( + alias=train_alias, + config_builder=config_builder, + checkpoint=checkpoint, + ) diff --git a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/global_att/baseline_v1/baseline.py 
b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/global_att/baseline_v1/baseline.py index f136850e4..c9a3a3c95 100644 --- a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/global_att/baseline_v1/baseline.py +++ b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/global_att/baseline_v1/baseline.py @@ -5,24 +5,51 @@ from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.network_builder_rf.global_.model import from_scratch_model_def, _returnn_v2_get_model from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23.model_variants.model_variants_ls_conf import models from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23.pipelines.pipeline_ls_conf.global_att.baseline_v1.alias import alias as base_alias +from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23.dependencies.labels.v2.librispeech.label_singletons import ( + LibrispeechBPE10025_LABELS, +LIBRISPEECH_CORPUS +) +from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23.dependencies.general.returnn.exes import RETURNN_EXE_NEW, RETURNN_CURRENT_ROOT -def get_global_att_config_builder_rf(use_weight_feedback: bool = True, decoder_version: Optional[int] = None): - model_type = "librispeech_conformer_glob_att" - variant_name = "glob.conformer.mohammad.5.6" - variant_params = copy.deepcopy(models[model_type][variant_name]) - variant_params["network"]["use_weight_feedback"] = use_weight_feedback - variant_params["network"]["decoder_version"] = decoder_version +def get_global_att_config_builder_rf( + use_weight_feedback: bool = True, + use_att_ctx_in_state: bool = True, +): + variant_params = { + "dependencies": LibrispeechBPE10025_LABELS, + "dataset": { + "feature_type": 
"raw", + "corpus": LIBRISPEECH_CORPUS + }, + "config": { + "train_seq_ordering": "laplace:.1000" + }, + "network": {"length_scale": 1.0}, + "returnn_python_exe": RETURNN_EXE_NEW, + "returnn_root": RETURNN_CURRENT_ROOT + } config_builder = GlobalAttConfigBuilderRF( variant_params=variant_params, model_def=from_scratch_model_def, get_model_func=_returnn_v2_get_model, + use_weight_feedback=use_weight_feedback, + use_att_ctx_in_state=use_att_ctx_in_state, ) - return config_builder + alias = ( + f"{'w' if use_weight_feedback else 'wo'}-weight-feedback/" + f"{'w' if use_att_ctx_in_state else 'wo'}-att-ctx-in-state" + ) + + return alias, config_builder -def global_att_baseline_rf(): - alias = f"{base_alias}/baseline_rf" - yield alias, get_global_att_config_builder_rf(use_weight_feedback=True) +def global_att_baseline_rf(use_weight_feedback: bool = True, use_att_ctx_in_state: bool = True): + alias, config_builder = get_global_att_config_builder_rf( + use_weight_feedback=use_weight_feedback, + use_att_ctx_in_state=use_att_ctx_in_state, + ) + alias = f"{base_alias}/baseline_rf/{alias}" + yield alias, config_builder diff --git a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/global_att/train.py b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/global_att/train.py index d5e500543..6d02cab56 100644 --- a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/global_att/train.py +++ b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/global_att/train.py @@ -5,6 +5,7 @@ from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23.train_new import GlobalTrainExperiment from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.pipelines.pipeline_ls_conf.checkpoints import external_checkpoints, 
default_import_model_name from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.network_builder_rf.global_.train import _returnn_v2_train_step, from_scratch_training +from i6_experiments.users.schmitt.custom_load_params import load_missing_params def train_from_scratch( @@ -56,11 +57,39 @@ def train_import_global_tf( config_builder: GlobalAttConfigBuilderRF, n_epochs_list: Tuple[int, ...], const_lr_list: Tuple[float, ...], - time_rqmt: int = 168, + time_rqmt: int = 80, ): + if not config_builder.use_att_ctx_in_state: + # only randomly init FF weights, since only the input dim of the lstm layer is different + custom_missing_load_func = load_missing_params + else: + custom_missing_load_func = None + for n_epochs, const_lr in itertools.product(n_epochs_list, const_lr_list): train_alias = alias + f"/train_from_global_att_tf_checkpoint/standard-training/{n_epochs}-epochs_{const_lr}-const-lr_wo-ctc-loss" + train_opts = { + "preload_from_files": { + "pretrained_global_att_params": { + "filename": external_checkpoints[default_import_model_name], + "init_for_train": True, + } + }, + "train_def": from_scratch_training, + "train_step_func": _returnn_v2_train_step, + "batching": "random", + "aux_loss_layers": None, + "lr_opts": { + "type": "const_then_linear", + "const_lr": const_lr, + "const_frac": 1 / 3, + "final_lr": 1e-6, + "num_epochs": n_epochs + }, + } + if custom_missing_load_func: + train_opts["preload_from_files"]["pretrained_global_att_params"]["custom_missing_load_func"] = custom_missing_load_func + train_exp = GlobalTrainExperiment( config_builder=config_builder, alias=train_alias, @@ -68,25 +97,7 @@ def train_import_global_tf( train_rqmt={ "time": time_rqmt }, - train_opts={ - "preload_from_files": { - "pretrained_global_att_params": { - "filename": external_checkpoints[default_import_model_name], - "init_for_train": True, - } - }, - "train_def": from_scratch_training, - "train_step_func": 
_returnn_v2_train_step, - "batching": "random", - "aux_loss_layers": None, - "lr_opts": { - "type": "const_then_linear", - "const_lr": const_lr, - "const_frac": 1 / 3, - "final_lr": 1e-6, - "num_epochs": n_epochs - }, - } + train_opts=train_opts ) checkpoints, model_dir, learning_rates = train_exp.run_train() From fb8b0b1264380cd79526cd167c4d7bc931898b27 Mon Sep 17 00:00:00 2001 From: Nick Rossenbach Date: Tue, 28 May 2024 11:51:35 +0200 Subject: [PATCH 066/227] ConformerV2 setup --- .../ctc_rnnt_standalone_2024/default_tools.py | 4 +- .../ctc_bpe/low_vocab_exps_confv2.py | 229 ++++++++++++++++++ .../ctc/conformer_0524/__init__.py | 0 ...elsV2_VGG4LayerActFrontendV1_auxloss_v1.py | 197 +++++++++++++++ ...2_VGG4LayerActFrontendV1_auxloss_v1_cfg.py | 90 +++++++ 5 files changed, 518 insertions(+), 2 deletions(-) create mode 100644 users/rossenbach/experiments/librispeech/ctc_rnnt_standalone_2024/experiments/ctc_bpe/low_vocab_exps_confv2.py create mode 100644 users/rossenbach/experiments/librispeech/ctc_rnnt_standalone_2024/pytorch_networks/ctc/conformer_0524/__init__.py create mode 100644 users/rossenbach/experiments/librispeech/ctc_rnnt_standalone_2024/pytorch_networks/ctc/conformer_0524/i6modelsV2_VGG4LayerActFrontendV1_auxloss_v1.py create mode 100644 users/rossenbach/experiments/librispeech/ctc_rnnt_standalone_2024/pytorch_networks/ctc/conformer_0524/i6modelsV2_VGG4LayerActFrontendV1_auxloss_v1_cfg.py diff --git a/users/rossenbach/experiments/librispeech/ctc_rnnt_standalone_2024/default_tools.py b/users/rossenbach/experiments/librispeech/ctc_rnnt_standalone_2024/default_tools.py index 69fbddcc4..85883940d 100644 --- a/users/rossenbach/experiments/librispeech/ctc_rnnt_standalone_2024/default_tools.py +++ b/users/rossenbach/experiments/librispeech/ctc_rnnt_standalone_2024/default_tools.py @@ -13,13 +13,13 @@ RETURNN_EXE = tk.Path("/usr/bin/python3", hash_overwrite="GENERIC_RETURNN_LAUNCHER") MINI_RETURNN_ROOT = CloneGitRepositoryJob( - 
"https://github.com/JackTemaki/MiniReturnn", commit="3f47cb87f298254d86d9faf37916067dd2c74674" + "https://github.com/JackTemaki/MiniReturnn", commit="a8b6c2551d72d68b9173654c0254a8944e62b293" ).out_repository.copy() MINI_RETURNN_ROOT.hash_overwrite = "LIBRISPEECH_DEFAULT_RETURNN_ROOT" I6_MODELS_REPO_PATH = CloneGitRepositoryJob( url="https://github.com/rwth-i6/i6_models", - commit="933c6c13f7d6c74e5a59af0257e17c208dae9da3", + commit="918143c1011fe5a19c5fcfb61fe05050a8d58a2b", checkout_folder_name="i6_models", ).out_repository.copy() I6_MODELS_REPO_PATH.hash_overwrite = "LIBRISPEECH_DEFAULT_I6_MODELS" diff --git a/users/rossenbach/experiments/librispeech/ctc_rnnt_standalone_2024/experiments/ctc_bpe/low_vocab_exps_confv2.py b/users/rossenbach/experiments/librispeech/ctc_rnnt_standalone_2024/experiments/ctc_bpe/low_vocab_exps_confv2.py new file mode 100644 index 000000000..817aef167 --- /dev/null +++ b/users/rossenbach/experiments/librispeech/ctc_rnnt_standalone_2024/experiments/ctc_bpe/low_vocab_exps_confv2.py @@ -0,0 +1,229 @@ +from sisyphus import tk + +import copy +from dataclasses import asdict +import numpy as np +from typing import cast + +from i6_core.tools.parameter_tuning import GetOptimalParametersAsVariableJob + +from i6_experiments.common.setups.returnn.datastreams.vocabulary import LabelDatastream + +from ...data.common import DatasetSettings, build_test_dataset +from ...data.bpe import build_bpe_training_datasets, get_text_lexicon +from ...default_tools import RETURNN_EXE, MINI_RETURNN_ROOT +from ...lm import get_4gram_binary_lm +from ...pipeline import training, prepare_asr_model, search, ASRModel +from ...storage import add_ctc_model + + + +def bpe_ls960_1023_low_vocab_test_confv2(): + prefix_name = "experiments/librispeech/ctc_rnnt_standalone_2024/ls960_ctc_bpe_low_vocab" + + train_settings = DatasetSettings( + preemphasis=0.97, # TODO: Check if this is really useful + peak_normalization=True, # TODO: Also check if really useful, older Attention 
setups did not have that + # training + train_partition_epoch=10, + train_seq_ordering="laplace:.1000", + ) + + arpa_4gram_lm = get_4gram_binary_lm(prefix_name=prefix_name) + + default_returnn = { + "returnn_exe": RETURNN_EXE, + "returnn_root": MINI_RETURNN_ROOT, + } + + from ...pytorch_networks.ctc.decoder.flashlight_ctc_v1 import DecoderConfig + from ...pytorch_networks.ctc.decoder.greedy_bpe_ctc_v3 import DecoderConfig as GreedyDecoderConfig + + + + from ...pytorch_networks.ctc.conformer_0524.i6modelsV2_VGG4LayerActFrontendV1_auxloss_v1_cfg import \ + SpecaugConfig, VGG4LayerActFrontendV1Config_mod, ModelConfig, LogMelFeatureExtractionV1Config + + fe_config = LogMelFeatureExtractionV1Config( + sample_rate=16000, + win_size=0.025, + hop_size=0.01, + f_min=60, + f_max=7600, + min_amp=1e-10, + num_filters=80, + center=False, + ) + specaug_config = SpecaugConfig( + repeat_per_n_frames=25, + max_dim_time=20, + max_dim_feat=8, # Jingjing style + num_repeat_feat=5, + ) + frontend_config = VGG4LayerActFrontendV1Config_mod( + in_features=80, + conv1_channels=32, + conv2_channels=64, + conv3_channels=64, + conv4_channels=32, + conv_kernel_size=(3, 3), + conv_padding=None, + pool1_kernel_size=(2, 1), + pool1_stride=(2, 1), + pool1_padding=None, + pool2_kernel_size=(2, 1), + pool2_stride=(2, 1), + pool2_padding=None, + activation_str="ReLU", + out_features=512, + activation=None, + ) + + + + train_config_24gbgpu_amp = { + "optimizer": {"class": "adamw", "epsilon": 1e-16, "weight_decay": 1e-3}, + "learning_rates": list(np.linspace(7e-6, 5e-4, 240)) + list( + np.linspace(5e-4, 5e-5, 240)) + list(np.linspace(5e-5, 1e-7, 20)), + ############# + "batch_size": 360 * 16000, + "max_seq_length": {"audio_features": 35 * 16000}, + "accum_grad_multiple_step": 1, + "torch_amp_options": {"dtype": "bfloat16"}, + } + + network_module = "ctc.conformer_0524.i6modelsV2_VGG4LayerActFrontendV1_auxloss_v1" + global_train_args = { + "config": train_config_24gbgpu_amp, + "network_module": 
network_module, + "debug": True, + } + + def tune_and_evaluate_helper(training_name, dev_dataset_tuples, test_dataset_tuples, asr_model, base_decoder_config, lm_scales, prior_scales): + tune_parameters = [] + tune_values_clean = [] + tune_values_other = [] + for lm_weight in lm_scales: + for prior_scale in prior_scales: + decoder_config = copy.deepcopy(base_decoder_config) + decoder_config.lm_weight = lm_weight + decoder_config.prior_scale = prior_scale + search_name = training_name + "/search_lm%.1f_prior%.1f" % (lm_weight, prior_scale) + search_jobs, wers = search( + search_name, + forward_config={}, + asr_model=asr_model, + decoder_module="ctc.decoder.flashlight_ctc_v1", + decoder_args={"config": asdict(decoder_config)}, + test_dataset_tuples=dev_dataset_tuples, + **default_returnn + ) + tune_parameters.append((lm_weight, prior_scale)) + tune_values_clean.append((wers[search_name + "/dev-clean"])) + tune_values_other.append((wers[search_name + "/dev-other"])) + + for key, tune_values in [("test-clean", tune_values_clean), ("test-other", tune_values_other)]: + pick_optimal_params_job = GetOptimalParametersAsVariableJob(parameters=tune_parameters, values=tune_values, mode="minimize") + pick_optimal_params_job.add_alias(training_name + f"/pick_best_{key}") + decoder_config = copy.deepcopy(base_decoder_config) + decoder_config.lm_weight = pick_optimal_params_job.out_optimal_parameters[0] + decoder_config.prior_scale = pick_optimal_params_job.out_optimal_parameters[1] + search_jobs, wers = search( + training_name, forward_config={}, asr_model=asr_model, decoder_module="ctc.decoder.flashlight_ctc_v1", + decoder_args={"config": asdict(decoder_config)}, test_dataset_tuples={key: test_dataset_tuples[key]}, + **default_returnn + ) + + def greedy_search_helper( + training_name: str, + asr_model: ASRModel, + decoder_config: GreedyDecoderConfig + ): + # remove prior if exists + asr_model = copy.deepcopy(asr_model) + asr_model.prior_file = None + + search_name = training_name 
+ "/search_greedy" + search_jobs, wers = search( + search_name, + forward_config={}, + asr_model=asr_model, + decoder_module="ctc.decoder.greedy_bpe_ctc_v3", + decoder_args={"config": asdict(decoder_config)}, + test_dataset_tuples={**dev_dataset_tuples, **test_dataset_tuples}, + **default_returnn, + ) + + for BPE_SIZE in [128, 512]: + + # build the training datasets object containing train, cv, dev-train and the extern_data dict + train_data_bpe = build_bpe_training_datasets( + prefix=prefix_name, + librispeech_key="train-other-960", + bpe_size=BPE_SIZE, + settings=train_settings, + use_postfix=False, + ) + label_datastream_bpe = cast(LabelDatastream, train_data_bpe.datastreams["labels"]) + vocab_size_without_blank = label_datastream_bpe.vocab_size + + dev_dataset_tuples = {} + for testset in ["dev-clean", "dev-other"]: + dev_dataset_tuples[testset] = build_test_dataset( + dataset_key=testset, + settings=train_settings, + ) + + test_dataset_tuples = {} + for testset in ["test-clean", "test-other"]: + test_dataset_tuples[testset] = build_test_dataset( + dataset_key=testset, + settings=train_settings, + ) + + model_config = ModelConfig( + feature_extraction_config=fe_config, + frontend_config=frontend_config, + specaug_config=specaug_config, + label_target_size=vocab_size_without_blank, + conformer_size=512, + num_layers=12, + num_heads=8, + ff_dim=2048, + att_weights_dropout=0.1, + conv_dropout=0.1, + ff_dropout=0.1, + mhsa_dropout=0.1, + conv_kernel_size=31, + final_dropout=0.1, + specauc_start_epoch=1, + module_list=["ff", "conv", "mhsa", "ff"], + module_scales=[0.5, 1.0, 1.0, 0.5], + aux_ctc_loss_layers=[3, 7, 11], # 4, 8, 12 when counting from 1 + aux_ctc_loss_scales=[0.3, 0.3, 1.0], + ) + model_config_decoding = copy.deepcopy(model_config) + model_config_decoding.aux_ctc_loss_scales = [0.0, 0.0, 1.0] # for decoding use result only of last layer + + default_decoder_config_bpe = DecoderConfig( + lexicon=get_text_lexicon(prefix=prefix_name, 
librispeech_key="train-other-960", bpe_size=BPE_SIZE), + returnn_vocab=label_datastream_bpe.vocab, + beam_size=1024, + beam_size_token=16, # makes it much faster + arpa_lm=arpa_4gram_lm, + beam_threshold=14, + ) + + train_args = copy.deepcopy(global_train_args) + train_args["net_args"] = {"model_config_dict": asdict(model_config)} + + train_args_decoding = copy.deepcopy(train_args) + train_args_decoding["net_args"] = {"model_config_dict": asdict(model_config_decoding)} + + training_name = prefix_name + "/" + str(BPE_SIZE) + "/" + network_module + ".512dim_convfirst_sub4_24gbgpu_50eps" + train_job = training(training_name, train_data_bpe, train_args, num_epochs=500, **default_returnn) + train_job.rqmt["gpu_mem"] = 24 + asr_model = prepare_asr_model( + training_name, train_job, train_args_decoding, with_prior=True, datasets=train_data_bpe, get_specific_checkpoint=500 + ) + tune_and_evaluate_helper(training_name, dev_dataset_tuples, test_dataset_tuples, asr_model, default_decoder_config_bpe, lm_scales=[1.6, 1.8, 2.0], prior_scales=[0.2, 0.3, 0.4]) diff --git a/users/rossenbach/experiments/librispeech/ctc_rnnt_standalone_2024/pytorch_networks/ctc/conformer_0524/__init__.py b/users/rossenbach/experiments/librispeech/ctc_rnnt_standalone_2024/pytorch_networks/ctc/conformer_0524/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/users/rossenbach/experiments/librispeech/ctc_rnnt_standalone_2024/pytorch_networks/ctc/conformer_0524/i6modelsV2_VGG4LayerActFrontendV1_auxloss_v1.py b/users/rossenbach/experiments/librispeech/ctc_rnnt_standalone_2024/pytorch_networks/ctc/conformer_0524/i6modelsV2_VGG4LayerActFrontendV1_auxloss_v1.py new file mode 100644 index 000000000..31c7f48ad --- /dev/null +++ b/users/rossenbach/experiments/librispeech/ctc_rnnt_standalone_2024/pytorch_networks/ctc/conformer_0524/i6modelsV2_VGG4LayerActFrontendV1_auxloss_v1.py @@ -0,0 +1,197 @@ +""" +Like v2, but with i6_models specaugment (v3) +and now controllable start time for when 
specaugment is applied (v4) +and with the proper feature extraction from i6-models +""" + +import numpy as np +import torch +from torch import nn + +from i6_models.parts.conformer.norm import LayerNormNC +from i6_models.assemblies.conformer.conformer_v2 import ConformerEncoderV2, ConformerEncoderV2Config, ConformerBlockV2Config +from i6_models.config import ModuleFactoryV1 +from i6_models.parts.frontend.vgg_act import VGG4LayerActFrontendV1 + +from i6_models.parts.conformer.convolution import ConformerConvolutionV1Config +from i6_models.parts.conformer.feedforward import ConformerPositionwiseFeedForwardV1Config +from i6_models.parts.conformer.mhsa import ConformerMHSAV1Config +from i6_models.primitives.specaugment import specaugment_v1_by_length +from i6_models.primitives.feature_extraction import LogMelFeatureExtractionV1 + +from returnn.torch.context import get_run_ctx + +from .i6modelsV2_VGG4LayerActFrontendV1_auxloss_v1_cfg import ModelConfig + + +def mask_tensor(tensor: torch.Tensor, seq_len: torch.Tensor) -> torch.Tensor: + """ + mask a tensor with a "positive" mask (boolean true means position is used) + + This function is traceable. + + :param tensor: [B,T,....] 
+ :param seq_len: [B] + :return: [B,T] + """ + seq_len = seq_len.to(device=tensor.device) + r = torch.arange(tensor.shape[1], device=tensor.device) # [T] + seq_mask = torch.less(r[None, :], seq_len[:, None]) # broadcast to [B,T] + return seq_mask + + +class Model(torch.nn.Module): + def __init__(self, model_config_dict, **kwargs): + super().__init__() + self.cfg = ModelConfig.from_dict(model_config_dict) + frontend_config = self.cfg.frontend_config + conformer_size = self.cfg.conformer_size + conformer_config = ConformerEncoderV2Config( + num_layers=self.cfg.num_layers, + frontend=ModuleFactoryV1(module_class=VGG4LayerActFrontendV1, cfg=frontend_config), + block_cfg=ConformerBlockV2Config( + ff_cfg=ConformerPositionwiseFeedForwardV1Config( + input_dim=conformer_size, + hidden_dim=self.cfg.ff_dim, + dropout=self.cfg.ff_dropout, + activation=nn.functional.silu, + ), + mhsa_cfg=ConformerMHSAV1Config( + input_dim=conformer_size, + num_att_heads=self.cfg.num_heads, + att_weights_dropout=self.cfg.att_weights_dropout, + dropout=self.cfg.mhsa_dropout, + ), + conv_cfg=ConformerConvolutionV1Config( + channels=conformer_size, kernel_size=self.cfg.conv_kernel_size, dropout=self.cfg.conv_dropout, activation=nn.functional.silu, + norm=LayerNormNC(conformer_size) + ), + modules=self.cfg.module_list, + scales=self.cfg.module_scales, + ), + ) + + self.feature_extraction = LogMelFeatureExtractionV1(cfg=self.cfg.feature_extraction_config) + self.conformer = ConformerEncoderV2(cfg=conformer_config) + self.num_output_linears = 1 if self.cfg.aux_ctc_loss_layers is None else len(self.cfg.aux_ctc_loss_layers) + self.output_linears = nn.ModuleList([ + nn.Linear(conformer_size, self.cfg.label_target_size + 1) # + CTC blank + for _ in range(self.num_output_linears) + ]) + self.output_dropout = nn.Dropout(p=self.cfg.final_dropout) + self.specaug_start_epoch = self.cfg.specauc_start_epoch + + # No particular weight init! 
+ + def forward( + self, + raw_audio: torch.Tensor, + raw_audio_len: torch.Tensor, + ): + """ + :param raw_audio: Audio samples as [B, T, 1] + :param raw_audio_len: length of T as [B] + :return: list of logprobs [B, T, #labels + blank], mask [B, T] + """ + + squeezed_features = torch.squeeze(raw_audio, dim=-1) + with torch.no_grad(): + audio_features, audio_features_len = self.feature_extraction(squeezed_features, raw_audio_len) + + run_ctx = get_run_ctx() + if self.training and run_ctx.epoch >= self.specaug_start_epoch: + audio_features_masked_2 = specaugment_v1_by_length( + audio_features, + time_min_num_masks=2, # TODO: make configurable + time_max_mask_per_n_frames=self.cfg.specaug_config.repeat_per_n_frames, + time_mask_max_size=self.cfg.specaug_config.max_dim_time, + freq_min_num_masks=2, + freq_mask_max_size=self.cfg.specaug_config.max_dim_feat, + freq_max_num_masks=self.cfg.specaug_config.num_repeat_feat, + ) + else: + audio_features_masked_2 = audio_features + + conformer_in = audio_features_masked_2 + # create the mask for the conformer input + mask = mask_tensor(conformer_in, audio_features_len) + + + return_layers = self.cfg.aux_ctc_loss_layers or [self.cfg.num_layers - 1] + print(return_layers) + conformer_out_layers, out_mask = self.conformer(conformer_in, mask, return_layers=return_layers) + log_probs_list = [] + for i, (out_layer, scale) in enumerate(zip(conformer_out_layers, self.cfg.aux_ctc_loss_scales)): + if scale == 0.0: + continue + conformer_out = self.output_dropout(out_layer) + logits = self.output_linears[i](conformer_out) + log_probs = torch.log_softmax(logits, dim=2) + log_probs_list.append(log_probs) + + if len(log_probs_list) == 1: + log_probs_list = log_probs_list[0] + + return log_probs_list, torch.sum(out_mask, dim=1) + + +def train_step(*, model: Model, data, run_ctx, **kwargs): + + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"].to("cpu") # [B] + + labels = data["labels"] # [B, N] (sparse) + 
labels_len = data["labels:size1"] # [B, N] + + logprobs_list, audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + for logprobs, layer_index, scale in zip(logprobs_list, model.cfg.aux_ctc_loss_layers, model.cfg.aux_ctc_loss_scales): + transposed_logprobs = torch.permute(logprobs, (1, 0, 2)) # CTC needs [T, B, F] + ctc_loss = nn.functional.ctc_loss( + transposed_logprobs, + labels, + input_lengths=audio_features_len, + target_lengths=labels_len, + blank=model.cfg.label_target_size, + reduction="sum", + zero_infinity=True, + ) + num_phonemes = torch.sum(labels_len) + run_ctx.mark_as_loss(name=f"ctc_loss_layer{layer_index + 1}", loss=ctc_loss, scale=scale, inv_norm_factor=num_phonemes) + + +def prior_init_hook(run_ctx, **kwargs): + # we are storing durations, but call it output.hdf to match + # the default output of the ReturnnForwardJob + run_ctx.sum_probs = None + run_ctx.sum_frames = 0 + + +def prior_finish_hook(run_ctx, **kwargs): + all_frames = run_ctx.sum_frames.detach().cpu().numpy() + all_probs = run_ctx.sum_probs.detach().cpu().numpy() + average_probs = all_probs / all_frames + log_average_probs = np.log(average_probs) + print("Prior sum in std-space (should be close to 1.0):", np.sum(average_probs)) + with open("prior.txt", 'w') as f: + np.savetxt(f, log_average_probs, delimiter=' ') + print("Saved prior in prior.txt in +log space.") + + +def prior_step(*, model: Model, data, run_ctx, **kwargs): + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"] # [B] + + logprobs, audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + + probs = torch.exp(logprobs) + run_ctx.sum_frames = run_ctx.sum_frames + torch.sum(audio_features_len) + if run_ctx.sum_probs is None: + run_ctx.sum_probs = torch.sum(probs, dim=(0, 1)) + else: + run_ctx.sum_probs += torch.sum(probs, dim=(0, 1)) diff --git 
a/users/rossenbach/experiments/librispeech/ctc_rnnt_standalone_2024/pytorch_networks/ctc/conformer_0524/i6modelsV2_VGG4LayerActFrontendV1_auxloss_v1_cfg.py b/users/rossenbach/experiments/librispeech/ctc_rnnt_standalone_2024/pytorch_networks/ctc/conformer_0524/i6modelsV2_VGG4LayerActFrontendV1_auxloss_v1_cfg.py new file mode 100644 index 000000000..b2e7bbd93 --- /dev/null +++ b/users/rossenbach/experiments/librispeech/ctc_rnnt_standalone_2024/pytorch_networks/ctc/conformer_0524/i6modelsV2_VGG4LayerActFrontendV1_auxloss_v1_cfg.py @@ -0,0 +1,90 @@ +""" +Config for the base CTC models v4, including specaug start time +""" + +from dataclasses import dataclass + +import torch +from torch import nn +from typing import Callable, List, Optional, Type, Union + +from i6_models.assemblies.conformer.conformer_v2 import ConformerBlockV2Config, ConformerBlockV2 +from i6_models.parts.frontend.vgg_act import VGG4LayerActFrontendV1Config +from i6_models.config import ModuleFactoryV1, ModelConfiguration +from i6_models.primitives.feature_extraction import LogMelFeatureExtractionV1Config + + +@dataclass(kw_only=True) +class VGG4LayerActFrontendV1Config_mod(VGG4LayerActFrontendV1Config): + activation_str: str = "" + activation: Optional[Union[nn.Module, Callable[[torch.Tensor], torch.Tensor]]] = None + + @classmethod + def from_dict(cls, d): + d = d.copy() + activation_str = d.pop("activation_str") + if activation_str == "ReLU": + from torch.nn import ReLU + activation = ReLU() + else: + assert False, "Unsupported activation %s" % d["activation_str"] + d["activation"] = activation + return VGG4LayerActFrontendV1Config(**d) + + +@dataclass +class ConformerEncoderV2Config(ModelConfiguration): + """ + Attributes: + num_layers: Number of conformer layers in the conformer encoder + frontend: A pair of ConformerFrontend and corresponding config + block_cfg: Configuration for ConformerBlockV1 + """ + + num_layers: int + + # nested configurations + frontend: ModuleFactoryV1 + block_cfg: 
ConformerBlockV2Config + + +@dataclass +class SpecaugConfig(ModelConfiguration): + repeat_per_n_frames: int + max_dim_time: int + num_repeat_feat: int + max_dim_feat: int + + + +@dataclass +class ModelConfig(): + feature_extraction_config: LogMelFeatureExtractionV1Config + frontend_config: VGG4LayerActFrontendV1Config + specaug_config: SpecaugConfig + specauc_start_epoch: int + label_target_size: int + conformer_size: int + num_layers: int + num_heads: int + ff_dim: int + att_weights_dropout: float + conv_dropout: float + ff_dropout: float + mhsa_dropout: float + conv_kernel_size: int + final_dropout: float + module_list: List[str] + module_scales: List[float] + aux_ctc_loss_layers: Optional[List[int]] + aux_ctc_loss_scales: Optional[List[float]] + + @classmethod + def from_dict(cls, d): + d = d.copy() + d["feature_extraction_config"] = LogMelFeatureExtractionV1Config(**d["feature_extraction_config"]) + d["frontend_config"] = VGG4LayerActFrontendV1Config_mod.from_dict(d["frontend_config"]) + d["specaug_config"] = SpecaugConfig(**d["specaug_config"]) + return ModelConfig(**d) + + From 5d5852c4620761f8bc835f5451c0b1b0d7ab36cf Mon Sep 17 00:00:00 2001 From: Benedikt Hilmes Date: Tue, 28 May 2024 13:26:27 +0200 Subject: [PATCH 067/227] updates --- .../common/setups/rasr/hybrid_system.py | 229 +++++-- users/hilmes/common/setups/rasr/nn_system.py | 2 +- .../hybrid/distil_hubert/corpus_data.py | 40 +- .../distil_hubert/distill_hubert_args.py | 129 +--- .../pytorch_networks/distill_hubert_v1.py | 70 +-- .../pytorch_networks/prior/basic.py | 12 +- .../pytorch_networks/prior/basic.py | 24 + .../pytorch_networks/prior/prior_callback.py | 16 + .../hybrid/torch_baselines/torch_args.py | 350 ++++++++--- .../experiments/tedlium2/standalone/config.py | 5 +- .../tedlium2/standalone/default_tools.py | 6 +- .../experiments/ctc_bpe/baseline.py | 2 +- .../experiments/ctc_phon/baseline.py | 567 ++++++++---------- .../experiments/ctc_phon/tune_eval.py | 135 +++++ 
.../conformer_v1_uni_aggr_cfg_v1.py | 85 +++ .../conformer_v1_uni_aggr_v1.py | 278 +++++++++ .../conformer_1023/quant/baseline_quant_v1.py | 5 +- .../modules/onnx_precomputed_hybrid_system.py | 11 +- .../modules/pytorch_onnx_hybrid_system.py | 139 ++++- users/hilmes/tools/onnx.py | 100 ++- 20 files changed, 1535 insertions(+), 670 deletions(-) create mode 100644 users/hilmes/experiments/tedlium2/standalone/experiments/ctc_phon/tune_eval.py create mode 100644 users/hilmes/experiments/tedlium2/standalone/pytorch_networks/ctc/conformer_1023/conformer_v1_uni_aggr_cfg_v1.py create mode 100644 users/hilmes/experiments/tedlium2/standalone/pytorch_networks/ctc/conformer_1023/conformer_v1_uni_aggr_v1.py diff --git a/users/hilmes/common/setups/rasr/hybrid_system.py b/users/hilmes/common/setups/rasr/hybrid_system.py index bfd1211d9..9bf853c9f 100644 --- a/users/hilmes/common/setups/rasr/hybrid_system.py +++ b/users/hilmes/common/setups/rasr/hybrid_system.py @@ -57,21 +57,31 @@ def calc_stat(ls): def hybrid_report_format(report: _Report_Type) -> str: quants = report.pop("quant") - extra_ls = ["iter", "filter", "quant_min_max", "quant_entropy", "quant_percentile", "rtf-intel"] + loss_tables = report.pop("loss_tables") + loss_values = {} + if len(loss_tables) > 0: + for job in loss_tables: + with open(loss_tables[job].out_files["loss_table"]) as f: + for line in f: + loss_values[line.split(" ")[0]] = float(line.split(" ")[1].strip()) + extra_ls = ["iter", "filter", "quant_min_max", "quant_entropy", "quant_percentile", "rtf-intel", "loss_table"] out = [(recog, str(report[recog])) for recog in report if not any(extra in recog for extra in extra_ls)] out = sorted(out, key=lambda x: float(x[1])) - best_ls = [out[0]] + best_ls = [] + if len(out) > 0: + best_ls.append(out[0]) for extra in extra_ls: if extra == "iter": - for quant, count in itertools.product(["min_max", "entropy", "percentile"], ["10", "500", "1000"]): + for quant, count in itertools.product(["min_max", "entropy", 
"percentile"], ["1", "2", "3", "4", "5", "10", "100", "500", "1000", "10000"]): others = ["seed", "avrg", "filter", "rtf"] out2 = [(recog, str(report[recog])) for recog in report if "iter" in recog and quant in recog and (recog.endswith(count) or recog.endswith(count + "-optlm")) and not any(x in report for x in others)] out2 = sorted(out2, key=lambda x: float(x[1])) if len(out2) > 0: ex_str = calc_stat(out2) + out.append(('', '')) out.append((extra + "_" + quant + "_" + count, ex_str)) - out.extend(out2[:3]) - out.extend(out2[-3:]) + out.extend(out2[:5]) + out.extend(out2[-5:]) best_ls.append(out2[0]) # avg list out2 = [(recog, str(report[recog])) for recog in report if "iter" in recog and quant in recog and ( @@ -80,9 +90,10 @@ def hybrid_report_format(report: _Report_Type) -> str: out2 = sorted(out2, key=lambda x: float(x[1])) if len(out2) > 0: ex_str = calc_stat(out2) + out.append(('', '')) out.append((extra + "_avrg" + "_" + quant + "_" + count, ex_str)) - out.extend(out2[:3]) - out.extend(out2[-3:]) + out.extend(out2[:5]) + out.extend(out2[-5:]) best_ls.append(out2[0]) # different seeds for seed in ["24, 2005, 5"]: @@ -93,20 +104,21 @@ def hybrid_report_format(report: _Report_Type) -> str: if len(out2) > 0: ex_str = calc_stat(out2) out.append((quant + "_seed_" + seed + "_" + extra + "-" + count, ex_str)) - out.extend(out2[:3]) - out.extend(out2[-3:]) + out.extend(out2[:5]) + out.extend(out2[-5:]) best_ls.append(out2[0]) elif extra == "filter": # max and min len filter methods - for quant, count, mode, thresh in itertools.product(["min_max", "entropy", "percentile"], ["10", "500", "1000"], ["max_calib_len_", "min_calib_len_"], ["500", "1000" "1500"]): + for quant, count, mode, thresh in itertools.product(["min_max", "entropy", "percentile"], ["10", "500", "1000"], ["max_calib_len_", "min_calib_len_"], ["50", "100", "250", "400", "500", "1000" "1500"]): out2 = [(recog, str(report[recog])) for recog in report if "filter" in recog and quant in recog and ( - 
recog.endswith(count) or recog.endswith(count + "-optlm")) and mode+thresh in recog] + recog.endswith(count) or recog.endswith(count + "-optlm")) and mode+thresh+"-" in recog] out2 = sorted(out2, key=lambda x: float(x[1])) if len(out2) > 0: ex_str = calc_stat(out2) + out.append(('', '')) out.append((extra + "_" + mode + thresh + "_" + quant + "_" + count, ex_str)) - out.extend(out2[:3]) - out.extend(out2[-3:]) + out.extend(out2[:5]) + out.extend(out2[-5:]) best_ls.append(out2[0]) # partition filter methods partitions = set() @@ -123,9 +135,10 @@ def hybrid_report_format(report: _Report_Type) -> str: out2 = sorted(out2, key=lambda x: float(x[1])) if len(out2) > 0: ex_str = calc_stat(out2) + out.append(('', '')) out.append((extra + "_" + mode + thresh + "_" + quant + "_" + count, ex_str)) - out.extend(out2[:3]) - out.extend(out2[-3:]) + out.extend(out2[:5]) + out.extend(out2[-5:]) best_ls.append(out2[0]) # budget filter methods budgets = set() @@ -135,21 +148,73 @@ def hybrid_report_format(report: _Report_Type) -> str: for i, x in enumerate(spl): if x == "budget": budgets.add(spl[i+1].split("-")[0]) # add the number after the partition which gives identification - for quant, thresh in itertools.product(["min_max", "entropy", "percentile"], budgets): + for quant, thresh, remaind in itertools.product(["min_max", "entropy", "percentile"], budgets, ["0.01", "0.005"]): mode = "budget_" - out2 = [(recog, str(report[recog])) for recog in report if "filter" in recog and quant in recog and mode+thresh in recog] + out2 = [(recog, str(report[recog])) for recog in report if "filter" in recog and quant in recog and mode+thresh+"_"+remaind in recog] out2 = sorted(out2, key=lambda x: float(x[1])) tmp = [] + sequences = [] + losses = [] + wers = [] for name, value in out2: job_ls = [quants[x] for x in quants if name.split("/")[1].split("-")[0] == x.split("/")[-2] and quant in x and mode+thresh in x] assert len(job_ls) == 1, (job_ls, out2, quants) + with open(job_ls[0].out_seq_info, 
"rt") as f: + loss_tmp = [] + sequences_tmp = [] + for line in f: + sequences_tmp.append(line.split(" ")[0].strip()[:-1]) + loss_tmp.append(loss_values[line.split(" ")[0].strip()[:-1]]) + losses.append(sum(loss_tmp)) + sequences.append(sequences_tmp) + wers.append(float(value)) tmp.append((name, value, str(job_ls[0].out_num_seqs))) out2 = tmp if len(out2) > 0: ex_str = calc_stat(out2) - out.append((extra + "_" + mode + thresh + "_" + quant, ex_str)) - out.extend(out2[:3]) - out.extend(out2[-3:]) + out.append(('', '')) + out.append((extra + "_" + mode + thresh + "_" + remaind + "_" + quant, ex_str)) + out.append(("Correlation between loss and drawn sequence", str(np.corrcoef(wers, losses)[0, 1]))) + out.extend(out2[:5]) + out.extend(out2[-5:]) + best_ls.append(out2[0]) + # range filter methods + ranges = set() + for recog in report: + if "range_len" in recog: + spl = recog.split("_") + for i, x in enumerate(spl): + if x == "range": + ranges.add(f"{spl[i+2]}_{spl[i+3].split('-')[0]}") # add the number after the partition which gives identification + for quant, ran in itertools.product(["min_max", "entropy", "percentile"], ranges): + mode = "range_len_" + out2 = [(recog, str(report[recog])) for recog in report if "filter" in recog and quant in recog and mode+ran in recog] + out2 = sorted(out2, key=lambda x: float(x[1])) + tmp = [] + sequences = [] + losses = [] + wers = [] + for name, value in out2: + job_ls = [quants[x] for x in quants if name.split("/")[1].split("-")[0] == x.split("/")[-2] and quant in x and mode+ran in x] + assert len(job_ls) == 1, (name, job_ls, out2, quants) + with open(job_ls[0].out_seq_info, "rt") as f: + sequence = [] + loss = [] + for line in f: + sequences.append(line.split(" ")[0].strip()) + wers.append(float(value)) + loss.append(loss_values[line.split(" ")[0].strip()[:-1]]) + losses.append(sum(loss)) + sequences.append(sequence) + tmp.append((name.split("/")[1], value, " ".join(f.readlines()).strip())) + out2 = tmp + if len(out2) > 0: + 
ex_str = calc_stat(out2) + out.append(('', '')) + out.append((extra + "_" + mode + ran + "_" + quant, ex_str)) + out.append(("Correlation between loss and drawn sequence", str(np.corrcoef(wers, losses)[0, 1]))) + out.extend(out2[:5]) + out.extend(out2[-5:]) best_ls.append(out2[0]) # single and unique tag filter methods for quant, count, mode in itertools.product(["min_max", "entropy", "percentile"], ["10", "20", "30", "50"], ["single_tag", "unique_tags"]): @@ -158,19 +223,75 @@ def hybrid_report_format(report: _Report_Type) -> str: out2 = sorted(out2, key=lambda x: float(x[1])) if len(out2) > 0: ex_str = calc_stat(out2) + out.append(('', '')) out.append((extra + "_" + mode + "_" + quant + "_" + count, ex_str)) - out.extend(out2[:3]) - out.extend(out2[-3:]) + out.extend(out2[:5]) + out.extend(out2[-5:]) + best_ls.append(out2[0]) + elif extra == "loss_table": + # loss table + for quant in ["min_max", "entropy", "percentile", "reverse"]: + out2 = [(recog, str(report[recog])) for recog in report if extra in recog and quant in recog] + out2 = sorted(out2, key=lambda x: float(x[1])) + tmp = [] + sequences = [] + losses = [] + wers = [] + for name, value in out2: + job_ls = [quants[x] for x in quants if + name.split("/")[1].split("-")[0] == x.split("/")[-2] and extra in x and x.endswith(name.split("/")[1].split("-")[1]) and name.split("/")[0].split("-")[-1] in x] + assert len(job_ls) == 1, (name, job_ls, len(out2), quants) + with open(job_ls[0].out_seq_info, "rt") as f: + for line in f: + sequences.append(line.split(" ")[0].strip()) + wers.append(float(value)) + losses.append(loss_values[line.split(" ")[0].strip()[:-1]]) + tmp.append((name.split("/")[0].split("-")[-1]+"/"+name.split("/")[1], value, " ".join(f.readlines()).strip())) + out2 = tmp + if len(out2) > 0: + ex_str = calc_stat(out2) + out.append(('', '')) + out.append((extra + " " + quant, ex_str)) + out.extend(out2) + out.extend(out2) best_ls.append(out2[0]) + quant = "reverse" + out2 = [(recog, 
str(report[recog])) for recog in report if extra in recog and quant not in recog] + out2 = sorted(out2, key=lambda x: float(x[1])) + tmp = [] + sequences = [] + losses = [] + wers = [] + for name, value in out2: + job_ls = [quants[x] for x in quants if + name.split("/")[1].split("-")[0] == x.split("/")[-2] and extra in x and x.endswith( + name.split("/")[1].split("-")[1]) and name.split("/")[0].split("-")[-1] in x] + assert len(job_ls) == 1, (name, job_ls, len(out2), quants) + with open(job_ls[0].out_seq_info, "rt") as f: + for line in f: + sequences.append(line.split(" ")[0].strip()) + wers.append(float(value)) + losses.append(loss_values[line.split(" ")[0].strip()[:-1]]) + tmp.append((name.split("/")[0].split("-")[-1] + "/" + name.split("/")[1], value, + " ".join(f.readlines()).strip())) + out2 = tmp + if len(out2) > 0: + ex_str = calc_stat(out2) + out.append(('', '')) + out.append((extra + " no " + quant, ex_str)) + out.extend(out2) + out.extend(out2) + best_ls.append(out2[0]) else: # mixed out2 = [(recog, str(report[recog])) for recog in report if extra in recog] out2 = sorted(out2, key=lambda x: float(x[1])) if len(out2) > 0: ex_str = calc_stat(out2) + out.append(('', '')) out.append((extra, ex_str)) - out.extend(out2[:3]) - out.extend(out2[-3:]) + out.extend(out2[:5]) + out.extend(out2[-5:]) best_ls.append(out2[0]) best_ls = sorted(best_ls, key=lambda x: float(x[1])) @@ -714,32 +835,40 @@ def run_nn_step(self, step_name: str, step_args: HybridArgs): train_job=returnn_train_job, ) print(f"NN Recog Iteration {time.time() - start}") - start = time.time() - from i6_core.report import GenerateReportStringJob, MailJob - - results = {} - for c in self.dev_corpora + self.test_corpora: - for job_name in self.jobs[c]: - if "scorer" not in job_name: - continue - if not name == job_name.split("-")[1]: - continue - scorer = self.jobs[c][job_name] - if scorer.out_wer: - results[job_name] = scorer.out_wer - tk.register_report(f"reports/{name.replace('/', '_')}", 
values=results) - quants = {} - for c in self.dev_corpora + self.test_corpora: - for job_name in self.jobs[c]: - if "quantize_static" in job_name and "budget" in job_name: - quants[job_name] = self.jobs[c][job_name] - results["quant"] = quants - report = GenerateReportStringJob(report_values=results, report_template=hybrid_report_format) - report.add_alias(f"report/report_{name}") - mail = MailJob(report.out_report, send_contents=True, subject=name) - mail.add_alias(f"report/mail_{name}") - tk.register_output("mail/" + name, mail.out_status) - print(f"NN Report Iteration {time.time() - start}") + for recog_name, _ in step_args.recognition_args.items(): + results = {} + from i6_core.report import GenerateReportStringJob, MailJob + for c in self.dev_corpora + self.test_corpora: + for job_name in self.jobs[c]: + if "scorer" not in job_name: + continue + if not name == job_name.split("-")[1]: + continue + if not f"{recog_name}-{c}" in job_name: # e.g. "quant_multile-dev + continue + scorer = self.jobs[c][job_name] + if scorer.out_wer: + results[job_name] = scorer.out_wer + tk.register_report(f"reports/{name.replace('/', '_')}/{recog_name}", values=results) + quants = {} + for c in self.dev_corpora + self.test_corpora: + for job_name in self.jobs[c]: + #if "quantize_static" in job_name and "budget" in job_name and f"{recog_name}" in job_name: + if "quantize_static" in job_name and f"{recog_name}" in job_name: + quants[job_name] = self.jobs[c][job_name] + results["quant"] = quants + loss_tables = {} + for c in self.dev_corpora + self.test_corpora: + for job_name in self.jobs[c]: + if "calculate_loss" in job_name and f"{recog_name}" in job_name: + loss_tables[job_name] = self.jobs[c][job_name] + results["loss_tables"] = loss_tables + report = GenerateReportStringJob(report_values=results, report_template=hybrid_report_format) + report.add_alias(f"report/report_{name}_{recog_name}") + mail = MailJob(report.out_report, send_contents=True, subject=name + " " + recog_name) + 
mail.add_alias(f"report/mail_{name}_{recog_name}") + tk.register_output("mail/" + name + "_" + recog_name, mail.out_status) + def run_nn_recog_step(self, step_args: NnRecogArgs): for eval_c in self.dev_corpora + self.test_corpora: diff --git a/users/hilmes/common/setups/rasr/nn_system.py b/users/hilmes/common/setups/rasr/nn_system.py index 10dcf88e2..72bc423ec 100644 --- a/users/hilmes/common/setups/rasr/nn_system.py +++ b/users/hilmes/common/setups/rasr/nn_system.py @@ -112,7 +112,7 @@ def returnn_training( ) #if any(sub in name for sub in ["larger", "whisper_medium", "whisper_large"]): #print(name, any(f"keepuntil_{x}_" in name for x in range(9))) - if any(sub in name for sub in ["larger", "medium", "large", "parakeet", "old_spec_baseline", "new_spec_baseline"]) and not any(f"keepuntil_{x}_" in name for x in range(9)): + if any(sub in name for sub in ["larger", "medium", "large", "parakeet", "hubert_fe_test"]) and not any(f"keepuntil_{x}_" in name for x in range(9)): returnn_training_job.rqmt["gpu_mem"] = 24 if any(sub in name for sub in ["whisper_large", "whisper_v2_large", "parakeet_1.1"]): returnn_training_job.rqmt["mem"] = 12 diff --git a/users/hilmes/experiments/tedlium2/asr_2023/hybrid/distil_hubert/corpus_data.py b/users/hilmes/experiments/tedlium2/asr_2023/hybrid/distil_hubert/corpus_data.py index b9e959e2b..999dd32d7 100644 --- a/users/hilmes/experiments/tedlium2/asr_2023/hybrid/distil_hubert/corpus_data.py +++ b/users/hilmes/experiments/tedlium2/asr_2023/hybrid/distil_hubert/corpus_data.py @@ -87,34 +87,34 @@ def build_data_input( return { "class": "MetaDataset", - #"data_map": {"classes": ("hdf_align", "data"), "data_raw": ("ogg", "data"), "data": ("hdf_feat", "data")}, - "data_map": {"classes": ("hdf_align", "data"), "data": ("hdf_feat", "data")}, + "data_map": {"classes": ("hdf_align", "data"), "data_raw": ("ogg", "data")}, + #"data_map": {"classes": ("hdf_align", "data"), "data": ("hdf_feat", "data")}, "datasets": { "hdf_align": { "class": 
"HDFDataset", "files": [align_hdf], "use_cache_manager": True, }, - # "ogg": { - # "class": "OggZipDataset", - # "audio": {"features": "raw", "peak_normalization": True, "sample_rate": 16000}, - # "partition_epoch": partition_epoch, - # "path": [raw_features], - # "seq_ordering": seq_ordering, - # "use_cache_manager": True, - # "segment_file": segment_list, - # "targets": None, - # }, - "hdf_feat": { - "class": "HDFDataset", - "files": [feat_hdf], - "use_cache_manager": True, - "seq_ordering": seq_ordering, + "ogg": { + "class": "OggZipDataset", + "audio": {"features": "raw", "peak_normalization": True, "sample_rate": 16000}, "partition_epoch": partition_epoch, - "seq_list_filter_file": segment_list, + "path": [raw_features], + "seq_ordering": seq_ordering, + "use_cache_manager": True, + "segment_file": segment_list, + "targets": None, }, + # "hdf_feat": { + # "class": "HDFDataset", + # "files": [feat_hdf], + # "use_cache_manager": True, + # "seq_ordering": seq_ordering, + # "partition_epoch": partition_epoch, + # "seq_list_filter_file": segment_list, + # }, }, - "seq_order_control_dataset": "hdf_feat", + "seq_order_control_dataset": "ogg", } @@ -255,7 +255,7 @@ def get_corpus_data_inputs( allophone_labeling=allophone_labeling, alias_prefix=alias_prefix + "/nn_train_data", partition_epoch=5, - acoustic_mixtures=gmm_system.outputs["train"]["final"].acoustic_mixtures, # TODO: NN Mixtures + #acoustic_mixtures=gmm_system.outputs["train"]["final"].acoustic_mixtures, # TODO: NN Mixtures seq_ordering="laplace:.1000", raw_features=train_features_raw, ) diff --git a/users/hilmes/experiments/tedlium2/asr_2023/hybrid/distil_hubert/distill_hubert_args.py b/users/hilmes/experiments/tedlium2/asr_2023/hybrid/distil_hubert/distill_hubert_args.py index 0abe0c9f0..3283213cb 100644 --- a/users/hilmes/experiments/tedlium2/asr_2023/hybrid/distil_hubert/distill_hubert_args.py +++ b/users/hilmes/experiments/tedlium2/asr_2023/hybrid/distil_hubert/distill_hubert_args.py @@ -131,9 +131,9 @@ 
def get_pytorch_returnn_configs( base_config = { "extern_data": { - "data": { - "dim": 80, - "shape": (None, 80), + "data_raw": { + "dim": 1, + "shape": (None, 1), "available_for_inference": True, }, # input: 1-dimensional waveforms "classes": { @@ -153,13 +153,7 @@ def get_pytorch_returnn_configs( "behavior_version": 16, "torch_log_memory_usage": True, } - if not recognition and False: - # add teacher features - base_config["extern_data"]["data_raw"] = { - "dim": 1, - "shape": (None, 1), - "available_for_inference": True, - } # input: 1-dimensional waveforms + base_post_config = { "backend": "torch", "debug_print_layer_output_template": True, @@ -188,6 +182,12 @@ def get_pytorch_returnn_configs( chunk_400_200_config = copy.deepcopy(hubert_config) chunk_400_200_config["chunking"] = "400:200" + + chunk_raw_config = copy.deepcopy(hubert_config) + chunk_400 = 400 * 16000 + chunk_200 = 200 * 16000 + chunk_raw_config["chunking"] = f"{chunk_400}:{chunk_200}" + #if not recognition: # del chunk_400_200_config['extern_data']['data_raw'] @@ -236,6 +236,11 @@ def construct_from_net_kwargs( base_config["max_seqs"] = 1 base_config["forward_data"] = "train" base_config["model_outputs"] = {"log_probs": {"dim": num_outputs, "shape": (None, num_outputs)}} + base_config["extern_data"]["data_raw"] = { + "dim": 80, + "shape": (None, 80), + "available_for_inference": True, + } if "min_seq_length" in base_config: del base_config["min_seq_length"] @@ -274,8 +279,8 @@ def construct_from_net_kwargs( return returnn_config return { - **{f"torch_distill_hubert_large_test_{x}": construct_from_net_kwargs( - hubert_config, + **{f"torch_distill_hubert_fe_test": construct_from_net_kwargs( + chunk_raw_config, { "model_type": "distill_hubert_v1", "hubert_dict": { @@ -301,107 +306,13 @@ def construct_from_net_kwargs( "upsample_kernel": 3, "upsample_stride": 3, "upsample_padding": 0, - "upsample_out_padding": 0, + "upsample_out_padding": 1, "dropout": 0.2, + "feat_extr": True }, }, 
models_commit="3c9173691521778b1e8b4070c172cbe929e4826b", - #max_seqs=2, - #grad_acc=14, - ) for x in [0.00]}, # 6.4 - **{f"torch_old_spec_baseline": construct_from_net_kwargs( - chunk_400_200_config, - { - "model_type": "conformer_baseline", - "conformer_size": 384, - "conv_kernel_size": 7, - "att_heads": 6, - "ff_dim": 1536, - "spec_num_time": 20, - "spec_max_time": 20, - "spec_num_feat": 5, - "spec_max_feat": 16, - "pool_1_stride": (3, 1), - "pool_1_kernel_size": (1, 2), - "pool_1_padding": None, - "pool_2_stride": None, - "pool_2_kernel_size": (1, 2), - "pool_2_padding": None, - "num_layers": 12, - "upsample_kernel": 3, - "upsample_stride": 3, - "upsample_padding": 0, - "upsample_out_padding": 0, - "dropout": 0.2, - "old_spec": True - }, - models_commit="3c9173691521778b1e8b4070c172cbe929e4826b", - # max_seqs=2, - # grad_acc=14, - ) for x in [0.00]}, # 6.2 - **{f"torch_new_spec_baseline": construct_from_net_kwargs( - chunk_400_200_config, - { - "model_type": "conformer_baseline", - "conformer_size": 384, - "conv_kernel_size": 7, - "att_heads": 6, - "ff_dim": 1536, - "spec_num_time": 20, - "spec_max_time": 20, - "spec_num_feat": 5, - "spec_max_feat": 16, - "pool_1_stride": (3, 1), - #"pool_1_kernel_size": (1, 2), - #"pool_1_padding": None, - #"pool_2_stride": None, - #"pool_2_kernel_size": (1, 2), - #"pool_2_padding": None, - "num_layers": 12, - "upsample_kernel": 3, - "upsample_stride": 3, - "upsample_padding": 0, - #"upsample_out_padding": 0, - #"dropout": 0.2, - "old_spec": False, - }, - models_commit="3c9173691521778b1e8b4070c172cbe929e4826b", # max_seqs=2, # grad_acc=14, - ) for x in [0.00]}, # 6.2 - # **{f"torch_distill_hubert_large_test_chunk_{x}": construct_from_net_kwargs( - # chunk_400_200_config, - # { - # "model_type": "distill_hubert_v1", - # "hubert_dict": { - # "model_name": "base-ls960", - # "distill_scale": x - # }, - # "conformer_dict": { - # "hidden_d": 384, - # "conv_kernel_size": 7, - # "att_heads": 6, - # "ff_dim": 1536, - # 
"spec_num_time": 20, - # "spec_max_time": 20, - # "spec_num_feat": 5, - # "spec_max_feat": 16, - # "pool_1_stride": (3, 1), - # "pool_1_kernel_size": (1, 2), - # "pool_1_padding": None, - # "pool_2_stride": None, - # "pool_2_kernel_size": (1, 2), - # "pool_2_padding": None, - # "num_layers": 12, - # "upsample_kernel": 3, - # "upsample_stride": 3, - # "upsample_padding": 0, - # "upsample_out_padding": 0, - # "dropout": 0.2, - # }, - # }, - # models_commit="3c9173691521778b1e8b4070c172cbe929e4826b", - # max_seqs=2, - # grad_acc=14, - # ) for x in [0.00]}, + ) for x in [0.00]}, } diff --git a/users/hilmes/experiments/tedlium2/asr_2023/hybrid/distil_hubert/pytorch_networks/distill_hubert_v1.py b/users/hilmes/experiments/tedlium2/asr_2023/hybrid/distil_hubert/pytorch_networks/distill_hubert_v1.py index 5c5eba78a..706f805d9 100644 --- a/users/hilmes/experiments/tedlium2/asr_2023/hybrid/distil_hubert/pytorch_networks/distill_hubert_v1.py +++ b/users/hilmes/experiments/tedlium2/asr_2023/hybrid/distil_hubert/pytorch_networks/distill_hubert_v1.py @@ -17,6 +17,7 @@ from i6_models.parts.conformer.norm import LayerNormNC from i6_models.config import ModelConfiguration, ModuleFactoryV1 from i6_models.parts.frontend.vgg_act import VGG4LayerActFrontendV1, VGG4LayerActFrontendV1Config +from i6_models.primitives.feature_extraction import LogMelFeatureExtractionV1, LogMelFeatureExtractionV1Config, RasrCompatibleLogMelFeatureExtractionV1Config, RasrCompatibleLogMelFeatureExtractionV1 def _lengths_to_padding_mask(lengths: torch.Tensor, x: torch.Tensor) -> torch.Tensor: """ @@ -55,6 +56,7 @@ class ConformerStudentConfig: upsample_padding: int upsample_out_padding: int dropout: float + feat_extr: bool class Model(nn.Module): @@ -65,7 +67,7 @@ def __init__(self, epoch, step, hubert_dict, conformer_dict, **kwargs): self.conformer_cfg = ConformerStudentConfig(**conformer_dict) self.distill_scale = self.hubert_cfg.distill_scale self.config = 
HubertConfig.from_pretrained(f"facebook/hubert-{self.hubert_cfg.model_name}", cache_dir='/work/asr4/hilmes/debug/whisper/hubert_for_ctc') - if not run_ctx.stage == "train_step" and not run_ctx.stage == "init": + if not run_ctx.stage == "train_step" and not run_ctx.stage == "init" or True: import logging logging.warning("Hubert not loaded") self.hubert = None @@ -86,8 +88,6 @@ def __init__(self, epoch, step, hubert_dict, conformer_dict, **kwargs): for param in self.hubert.parameters(): param.requires_grad_(False) - self.upsample_conv = torch.nn.ConvTranspose1d( - in_channels=self.config.hidden_size, out_channels=self.config.hidden_size, kernel_size=5, stride=2, padding=1) self.final_linear = nn.Linear(self.conformer_cfg.hidden_d, 9001) if len(kwargs) >= 2: assert False, f"You did not use all kwargs: {kwargs}" @@ -95,6 +95,16 @@ def __init__(self, epoch, step, hubert_dict, conformer_dict, **kwargs): assert "random" in list(kwargs.keys())[0], "This must only be RETURNN random arg" # else len == 0 + if self.conformer_cfg.feat_extr is True: + self.fe_cfg = RasrCompatibleLogMelFeatureExtractionV1Config( + sample_rate=16000, + win_size=0.025, + hop_size=0.01, + min_amp=1e-10, + num_filters=80, + ) + self.feature_extraction = RasrCompatibleLogMelFeatureExtractionV1(self.fe_cfg) + conv_cfg = ConformerConvolutionV1Config( channels=self.conformer_cfg.hidden_d, kernel_size=self.conformer_cfg.conv_kernel_size, @@ -148,16 +158,20 @@ def __init__(self, epoch, step, hubert_dict, conformer_dict, **kwargs): ) # self.initial_linear = nn.Linear(80, conformer_size) - def forward(self, - audio_features: torch.Tensor, audio_features_len: torch.Tensor, - raw_audio: torch.Tensor = None, - raw_audio_len: torch.Tensor = None): + def forward(self, raw_audio: torch.Tensor, raw_audio_len: torch.Tensor): run_ctx = rf.get_run_ctx() + if self.conformer_cfg.feat_extr is True and (self.training or run_ctx.stage == "train_step"): + squeezed_features = torch.squeeze(raw_audio, dim=-1) + 
audio_features, audio_features_len = self.feature_extraction(raw_audio=squeezed_features, length=raw_audio_len) + else: + # for export / forward we take external features + audio_features = raw_audio + audio_features_len = raw_audio_len + # Hubert teacher: if (self.training or run_ctx.stage == "train_step") and raw_audio is not None and raw_audio_len is not None and False: assert False, "Since this is just testing this should not happen, if entering here is correct please just delete this line" - squeezed_features = torch.squeeze(raw_audio, dim=-1) hubert_outputs = self.hubert(input_values=squeezed_features) audio_features_size = self.hubert._get_feat_extract_output_lengths(raw_audio_len).to(dtype=torch.int64) encoder_output = hubert_outputs.last_hidden_state @@ -182,7 +196,8 @@ def forward(self, conformer_out, _ = self.conformer(audio_features_masked, mask) upsampled = self.upsample_conv(conformer_out.transpose(1, 2)).transpose(1, 2) # final upsampled [B, T, F] - upsampled = upsampled[:, 0: audio_features.size()[1], :] + print(upsampled.shape, audio_features.shape, audio_features_len, conformer_out.shape) + upsampled = upsampled[:, 0: audio_features.size()[1]+1, :] student_features = upsampled upsampled_dropped = nn.functional.dropout(student_features, p=self.conformer_cfg.dropout, training=self.training) @@ -191,27 +206,22 @@ def forward(self, log_probs = torch.log_softmax(final_out, dim=2) if teacher_features is not None: - return log_probs, logits_ce_order, teacher_features, student_features + return log_probs, logits_ce_order, teacher_features, student_features, audio_features_len elif self.training or run_ctx.stage == "train_step": - return log_probs, logits_ce_order, None, None + return log_probs, logits_ce_order, None, None, audio_features_len else: - return log_probs, logits_ce_order + return log_probs, logits_ce_order, audio_features_len def train_step(*, model: Model, extern_data, **_kwargs): - audio_features = extern_data["data"].raw_tensor - 
audio_features_len = extern_data["data"].dims[1].dyn_size_ext.raw_tensor - audio_raw = extern_data["data_raw"].raw_tensor audio_raw_len = extern_data["data_raw"].dims[1].dyn_size_ext.raw_tensor phonemes = extern_data["classes"].raw_tensor phonemes_len = extern_data["classes"].dims[1].dyn_size_ext.raw_tensor - log_probs, logits_ce_order, teacher_features, student_features = model( + log_probs, logits_ce_order, teacher_features, student_features, audio_features_len = model( raw_audio=audio_raw, raw_audio_len=audio_raw_len.to("cuda"), - audio_features=audio_features, - audio_features_len=audio_features_len.to("cuda") ) targets_packed = nn.utils.rnn.pack_padded_sequence( @@ -219,7 +229,7 @@ def train_step(*, model: Model, extern_data, **_kwargs): ) targets_masked, _ = nn.utils.rnn.pad_packed_sequence(targets_packed, batch_first=True, padding_value=-100) targets_masked = targets_masked.long() - + print(f"{logits_ce_order.shape=}, {targets_masked.shape=}") loss_ce = nn.functional.cross_entropy(logits_ce_order, targets_masked) #loss_features = nn.functional.l1_loss(student_features, teacher_features, reduction="mean") @@ -227,25 +237,3 @@ def train_step(*, model: Model, extern_data, **_kwargs): # TODO: KL div loss # TODO: look if Hubert model has a softmax somewhere #rf.get_run_ctx().mark_as_loss(name="L1 Dist", loss=loss_features, scale=model.distill_scale) - - -def export2(*, model: Model, f: str): - model.export_mode = True - dummy_data = torch.randn(1, 45000, 1, device="cpu") - dummy_data_len = torch.IntTensor([45000]) - scripted_model = torch.jit.trace(model.eval(), example_inputs=(dummy_data, dummy_data_len)) - onnx_export( - scripted_model, - (dummy_data, dummy_data_len), - f=f, - verbose=True, - input_names=["data", "data_len"], - output_names=["classes"], - opset_version=17, - dynamic_axes={ - # dict value: manually named axes - "data": {0: "batch", 1: "time"}, - "data_len": {0: "batch"}, - "classes": {0: "batch", 1: "time"}, - }, - ) diff --git 
a/users/hilmes/experiments/tedlium2/asr_2023/hybrid/distil_hubert/pytorch_networks/prior/basic.py b/users/hilmes/experiments/tedlium2/asr_2023/hybrid/distil_hubert/pytorch_networks/prior/basic.py index 4ad5fdada..7c51bf69e 100644 --- a/users/hilmes/experiments/tedlium2/asr_2023/hybrid/distil_hubert/pytorch_networks/prior/basic.py +++ b/users/hilmes/experiments/tedlium2/asr_2023/hybrid/distil_hubert/pytorch_networks/prior/basic.py @@ -4,13 +4,13 @@ def forward_step(*, model: torch.nn.Module, extern_data: TensorDict, **kwargs): - audio_features = extern_data["data"].raw_tensor - audio_features_len = extern_data["data"].dims[1].dyn_size_ext.raw_tensor + audio_features = extern_data["data_raw"].raw_tensor + audio_features_len = extern_data["data_raw"].dims[1].dyn_size_ext.raw_tensor - log_probs, logits_ce_order = model( - audio_features=audio_features, - audio_features_len=audio_features_len.to(audio_features.device) + log_probs, logits_ce_order, features_len = model( + raw_audio=audio_features, + raw_audio_len=audio_features_len.to(audio_features.device) ) # [B, T, F] - features_len = audio_features_len.to(dtype=torch.int32) + features_len = features_len.to(dtype=torch.int32) rf.get_run_ctx().expected_outputs["log_probs"].dims[1].dyn_size_ext.raw_tensor = features_len rf.get_run_ctx().mark_as_output(log_probs, name="log_probs") diff --git a/users/hilmes/experiments/tedlium2/asr_2023/hybrid/torch_baselines/pytorch_networks/prior/basic.py b/users/hilmes/experiments/tedlium2/asr_2023/hybrid/torch_baselines/pytorch_networks/prior/basic.py index f49fe12b9..0f4cd722d 100644 --- a/users/hilmes/experiments/tedlium2/asr_2023/hybrid/torch_baselines/pytorch_networks/prior/basic.py +++ b/users/hilmes/experiments/tedlium2/asr_2023/hybrid/torch_baselines/pytorch_networks/prior/basic.py @@ -1,6 +1,7 @@ from returnn.tensor.tensor_dict import TensorDict import returnn.frontend as rf import torch +from torch import nn def forward_step(*, model: torch.nn.Module, extern_data: TensorDict, 
**kwargs): @@ -16,3 +17,26 @@ def forward_step(*, model: torch.nn.Module, extern_data: TensorDict, **kwargs): ) # [B, T, F] rf.get_run_ctx().mark_as_output(log_probs, name="log_probs") + + +def loss_forward_step(*, model: torch.nn.Module, extern_data: TensorDict, **kwargs): + audio_features = extern_data["data"].raw_tensor + audio_features_len = extern_data["data"].dims[1].dyn_size_ext.raw_tensor + + audio_features_len, indices = torch.sort(audio_features_len, descending=True) + audio_features = audio_features[indices, :, :] + + log_probs, logits = model( + audio_features=audio_features, + audio_features_len=audio_features_len.to("cuda"), + ) # [B, T, F] + phonemes = extern_data["classes"].raw_tensor[indices, :].long() + phonemes_len = extern_data["classes"].dims[1].dyn_size_ext.raw_tensor[indices] + targets_packed = nn.utils.rnn.pack_padded_sequence( + phonemes, phonemes_len.to("cpu"), batch_first=True, enforce_sorted=False + ) + targets_masked, _ = nn.utils.rnn.pad_packed_sequence(targets_packed, batch_first=True, padding_value=-100) + + loss = nn.functional.cross_entropy(logits, targets_masked) + loss = torch.unsqueeze(torch.unsqueeze(loss, dim=0), dim=0) + rf.get_run_ctx().mark_as_output(loss, "ce_score") diff --git a/users/hilmes/experiments/tedlium2/asr_2023/hybrid/torch_baselines/pytorch_networks/prior/prior_callback.py b/users/hilmes/experiments/tedlium2/asr_2023/hybrid/torch_baselines/pytorch_networks/prior/prior_callback.py index b6c87c599..f24eb451e 100644 --- a/users/hilmes/experiments/tedlium2/asr_2023/hybrid/torch_baselines/pytorch_networks/prior/prior_callback.py +++ b/users/hilmes/experiments/tedlium2/asr_2023/hybrid/torch_baselines/pytorch_networks/prior/prior_callback.py @@ -58,3 +58,19 @@ def finish(self): plt.ylabel("prior") plt.grid(True) plt.savefig("../output/prior.png") + + +class PrintLossCallback(ForwardCallbackIface): + def init(self, *, model: torch.nn.Module): + self.seq_loss_pairs = [] + + def process_seq(self, *, seq_tag: str, 
outputs: TensorDict): + loss: torch.Tensor = outputs["ce_score"].raw_tensor + self.seq_loss_pairs.append((seq_tag, loss.cpu().numpy()[0])) + + def finish(self): + # Write txt file + sorted_pairs = sorted(self.seq_loss_pairs, key=lambda x: x[1]) + with open("loss_table", "wt") as f: + for x in sorted_pairs: + f.write(f"{x[0]} {x[1]}\n") diff --git a/users/hilmes/experiments/tedlium2/asr_2023/hybrid/torch_baselines/torch_args.py b/users/hilmes/experiments/tedlium2/asr_2023/hybrid/torch_baselines/torch_args.py index 5ffd16a5c..fcb8a5f70 100644 --- a/users/hilmes/experiments/tedlium2/asr_2023/hybrid/torch_baselines/torch_args.py +++ b/users/hilmes/experiments/tedlium2/asr_2023/hybrid/torch_baselines/torch_args.py @@ -106,12 +106,52 @@ def get_nn_args(num_outputs: int = 9001, num_epochs: int = 250, debug=False, **n # "needs_features_size": True, # "training_whitelist": ["torch_jj_config2", "torch_jj_config2_large"], # }, - # "quant": { + "quant": { + "epochs": evaluation_epochs, + "feature_flow_key": "fb", + "prior_scales": [0.9], + "pronunciation_scales": [0.0], + "lm_scales": [10.0], + "lm_lookahead": True, + "lookahead_options": None, + "create_lattice": True, + "eval_single_best": True, + "eval_best_in_lattice": True, + "search_parameters": { + "beam-pruning": 15.0, + "beam-pruning-limit": 10000, + "word-end-pruning": 0.5, + "word-end-pruning-limit": 15000, + }, + "lattice_to_ctm_kwargs": { + "fill_empty_segments": True, + "best_path_algo": "bellman-ford", + }, + "optimize_am_lm_scale": True, + "rtf": 50, + "mem": 7, + "lmgc_mem": 16, + "cpu": 2, + "parallelize_conversion": True, + "needs_features_size": True, + #"quantize": [10, 15, 25, 100, 250, 500, 750, 1000, 2500, 5000], + "quantize": [5, 10, 25, 100, 500, 1000, 5000, 10000, 50000, 100000, 100001, 100002, 100003], + "quant_modes": [CalibrationMethod.MinMax, CalibrationMethod.Percentile], + "quant_ops": [["Conv", "MatMul"]], + "quant_sym_modes": [False], + "quant_avg_modes": [False], + "quant_percentiles": 
[99.999], + "quant_num_bin_ls": [2048], + "training_whitelist": [ + "torch_jj_config2", + ], + }, + # "quant-base": { # "epochs": evaluation_epochs, # "feature_flow_key": "fb", # "prior_scales": [0.7, 0.9], # "pronunciation_scales": [0.0], - # "lm_scales": [10.0, 7.5], + # "lm_scales": [10.0], # "lm_lookahead": True, # "lookahead_options": None, # "create_lattice": True, @@ -129,26 +169,26 @@ def get_nn_args(num_outputs: int = 9001, num_epochs: int = 250, debug=False, **n # }, # "optimize_am_lm_scale": True, # "rtf": 50, - # "mem": 7, + # "mem": 6, # "lmgc_mem": 16, # "cpu": 2, # "parallelize_conversion": True, # "needs_features_size": True, - # #"quantize": [10, 15, 25, 100, 250, 500, 750, 1000, 2500, 5000], - # "quantize": [5, 11, 10, 25, 100, 500, 1000, 5000, 10000, 50000, 100000], + # "quant_ops": [["Conv", "MatMul"]], + # "quantize": [5, 10, 25, 100, 500, 1000, 5000, 10000, 50000],#, 100000], # "quant_modes": [CalibrationMethod.MinMax, CalibrationMethod.Percentile], # "quant_sym_modes": [False], # "quant_avg_modes": [False], - # "quant_percentiles": [90.0, 95.0, 99.0, 99.999, 99.9], - # "quant_num_bin_ls": [512, 1024, 2048, 4096], + # "quant_percentiles": [99.999], + # "quant_num_bin_ls": [2048], # "training_whitelist": [ # "torch_jj_config2", # ], # }, - # "quant-base": { + # "quant-ops": { # "epochs": evaluation_epochs, # "feature_flow_key": "fb", - # "prior_scales": [0.7, 0.9], + # "prior_scales": [0.7], # "pronunciation_scales": [0.0], # "lm_scales": [10.0], # "lm_lookahead": True, @@ -173,21 +213,20 @@ def get_nn_args(num_outputs: int = 9001, num_epochs: int = 250, debug=False, **n # "cpu": 2, # "parallelize_conversion": True, # "needs_features_size": True, - # "quant_ops": [["Conv", "MatMul"]], - # "quantize": [5, 10, 25, 100, 500, 1000, 5000, 10000, 50000],#, 100000], - # "quant_modes": [CalibrationMethod.MinMax, CalibrationMethod.Percentile], + # # "quantize": [10, 15, 25, 100, 250, 500, 750, 1000, 2500, 5000], + # "quantize": [10, 25, 100, 500], + # 
"quant_modes": [CalibrationMethod.MinMax], # "quant_sym_modes": [False], # "quant_avg_modes": [False], - # "quant_percentiles": [99.999], - # "quant_num_bin_ls": [2048], + # "quant_ops": [None, ["Conv"], ["MatMul"], ["Conv", "MatMul"], ["Conv", "MatMul", "Mul", "Add"]], # "training_whitelist": [ # "torch_jj_config2", # ], # }, - # "quant-ops": { + # "quant-rtf-intel-speed": { # "epochs": evaluation_epochs, # "feature_flow_key": "fb", - # "prior_scales": [0.7], + # "prior_scales": [0.9], # "pronunciation_scales": [0.0], # "lm_scales": [10.0], # "lm_lookahead": True, @@ -196,8 +235,8 @@ def get_nn_args(num_outputs: int = 9001, num_epochs: int = 250, debug=False, **n # "eval_single_best": True, # "eval_best_in_lattice": True, # "search_parameters": { - # "beam-pruning": 15.0, - # "beam-pruning-limit": 10000, + # "beam-pruning": 12.0, + # "beam-pruning-limit": 2000, # "word-end-pruning": 0.5, # "word-end-pruning-limit": 15000, # }, @@ -209,20 +248,20 @@ def get_nn_args(num_outputs: int = 9001, num_epochs: int = 250, debug=False, **n # "rtf": 50, # "mem": 6, # "lmgc_mem": 16, - # "cpu": 2, + # "cpu": 1, # "parallelize_conversion": True, # "needs_features_size": True, # # "quantize": [10, 15, 25, 100, 250, 500, 750, 1000, 2500, 5000], - # "quantize": [10, 25, 100, 500], + # "quantize": [10], # "quant_modes": [CalibrationMethod.MinMax], # "quant_sym_modes": [False], # "quant_avg_modes": [False], - # "quant_ops": [None, ["Conv"], ["MatMul"], ["Conv", "MatMul"], ["Conv", "MatMul", "Mul", "Add"]], + # "quant_ops": [None, ["Conv"], ["MatMul"], ["Conv", "MatMul"], ["Conv", "MatMul", "Mul", "Add"], ["ConvTranspose"], ["Conv", "MatMul", "ConvTranspose"]], # "training_whitelist": [ # "torch_jj_config2", # ], # }, - # "quant-rtf-intel-speed": { + # "quant-multiple": { # "epochs": evaluation_epochs, # "feature_flow_key": "fb", # "prior_scales": [0.9], @@ -234,8 +273,8 @@ def get_nn_args(num_outputs: int = 9001, num_epochs: int = 250, debug=False, **n # "eval_single_best": True, # 
"eval_best_in_lattice": True, # "search_parameters": { - # "beam-pruning": 12.0, - # "beam-pruning-limit": 2000, + # "beam-pruning": 15.0, + # "beam-pruning-limit": 10000, # "word-end-pruning": 0.5, # "word-end-pruning-limit": 15000, # }, @@ -247,23 +286,24 @@ def get_nn_args(num_outputs: int = 9001, num_epochs: int = 250, debug=False, **n # "rtf": 50, # "mem": 6, # "lmgc_mem": 16, - # "cpu": 1, + # "cpu": 2, # "parallelize_conversion": True, # "needs_features_size": True, - # # "quantize": [10, 15, 25, 100, 250, 500, 750, 1000, 2500, 5000], - # "quantize": [10], - # "quant_modes": [CalibrationMethod.MinMax], + # "quantize": [1, 2, 3, 5, 10, 100, 500, 1000, 10000], + # "quant_modes": [CalibrationMethod.MinMax, CalibrationMethod.Percentile], + # #"quant_modes": [CalibrationMethod.MinMax], + # "random_seed_draws": 100, # "quant_sym_modes": [False], # "quant_avg_modes": [False], - # "quant_ops": [None, ["Conv"], ["MatMul"], ["Conv", "MatMul"], ["Conv", "MatMul", "Mul", "Add"], ["ConvTranspose"], ["Conv", "MatMul", "ConvTranspose"]], + # "quant_percentiles": [99.999], + # "quant_num_bin_ls": [2048], # "training_whitelist": [ - # "torch_jj_config2", # ], # }, - "quant-multiple": { + "quant-multiple-convmatmul": { "epochs": evaluation_epochs, "feature_flow_key": "fb", - "prior_scales": [0.7], + "prior_scales": [0.9], "pronunciation_scales": [0.0], "lm_scales": [10.0], "lm_lookahead": True, @@ -288,22 +328,20 @@ def get_nn_args(num_outputs: int = 9001, num_epochs: int = 250, debug=False, **n "cpu": 2, "parallelize_conversion": True, "needs_features_size": True, - "quantize": [10, 500, 1000, 10000], - "quant_modes": [CalibrationMethod.MinMax, CalibrationMethod.Percentile, CalibrationMethod.Entropy], - #"quant_modes": [CalibrationMethod.MinMax], + "quantize": [1, 2, 10, 100, 500, 1000, 10000], + "quant_modes": [CalibrationMethod.MinMax, CalibrationMethod.Percentile], + # "quant_modes": [CalibrationMethod.MinMax], "random_seed_draws": 100, "quant_sym_modes": [False], - 
"quant_avg_modes": [False, True], + "quant_avg_modes": [False], "quant_percentiles": [99.999], "quant_num_bin_ls": [2048], + "quant_ops": [["Conv", "MatMul"]], "training_whitelist": [ "torch_jj_config2", - "torch_jj_seed_24", - "torch_jj_seed_5", - "torch_jj_seed_2005", ], }, - "quant-filter": { + "quant-loss": { "epochs": evaluation_epochs, "feature_flow_key": "fb", "prior_scales": [0.9], @@ -331,18 +369,148 @@ def get_nn_args(num_outputs: int = 9001, num_epochs: int = 250, debug=False, **n "cpu": 2, "parallelize_conversion": True, "needs_features_size": True, - "quantize": [1000], - "quant_filter_opts": [{"max_seq_len": 500}, {"min_seq_len": 1500}], + "quantize": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 50, 100, 200, 300, 400, 500, 1000, 2000, 3000, 4000, 5000, 10000], "quant_modes": [CalibrationMethod.MinMax, CalibrationMethod.Percentile], - "random_seed_draws": 50, + # "quant_modes": [CalibrationMethod.MinMax], + "random_seed_draws": 1, "quant_sym_modes": [False], "quant_avg_modes": [False], "quant_percentiles": [99.999], "quant_num_bin_ls": [2048], + "quant_ops": [["Conv", "MatMul"]], + "loss_table_args": [{}, {"reverse"}], "training_whitelist": [ "torch_jj_config2", ], }, + # "quant-filter": { + # "epochs": evaluation_epochs, + # "feature_flow_key": "fb", + # "prior_scales": [0.9], + # "pronunciation_scales": [0.0], + # "lm_scales": [10.0], + # "lm_lookahead": True, + # "lookahead_options": None, + # "create_lattice": True, + # "eval_single_best": True, + # "eval_best_in_lattice": True, + # "search_parameters": { + # "beam-pruning": 15.0, + # "beam-pruning-limit": 10000, + # "word-end-pruning": 0.5, + # "word-end-pruning-limit": 15000, + # }, + # "lattice_to_ctm_kwargs": { + # "fill_empty_segments": True, + # "best_path_algo": "bellman-ford", + # }, + # "optimize_am_lm_scale": True, + # "rtf": 50, + # "mem": 6, + # "lmgc_mem": 16, + # "cpu": 2, + # "parallelize_conversion": True, + # "needs_features_size": True, + # "quantize": [10], + # "quant_filter_opts": 
[{"max_seq_len": 500}, {"min_seq_len": 1500}], + # "quant_modes": [CalibrationMethod.MinMax, CalibrationMethod.Percentile], + # "random_seed_draws": 50, + # "quant_sym_modes": [False], + # "quant_avg_modes": [False], + # "quant_percentiles": [99.999], + # "quant_num_bin_ls": [2048], + # "training_whitelist": [ + # "torch_jj_config2", + # ], + # }, + # "quant-filter-small": { + # "epochs": evaluation_epochs, + # "feature_flow_key": "fb", + # "prior_scales": [0.9], + # "pronunciation_scales": [0.0], + # "lm_scales": [10.0], + # "lm_lookahead": True, + # "lookahead_options": None, + # "create_lattice": True, + # "eval_single_best": True, + # "eval_best_in_lattice": True, + # "search_parameters": { + # "beam-pruning": 15.0, + # "beam-pruning-limit": 10000, + # "word-end-pruning": 0.5, + # "word-end-pruning-limit": 15000, + # }, + # "lattice_to_ctm_kwargs": { + # "fill_empty_segments": True, + # "best_path_algo": "bellman-ford", + # }, + # "optimize_am_lm_scale": True, + # "rtf": 50, + # "mem": 6, + # "lmgc_mem": 16, + # "cpu": 2, + # "parallelize_conversion": True, + # "needs_features_size": True, + # "quantize": [10], + # "quant_filter_opts": [{"min_seq_len": 100}, {"min_seq_len": 50}, {"min_seq_len": 250}, {"min_seq_len": 400}], + # "quant_modes": [CalibrationMethod.MinMax], + # "random_seed_draws": 100, + # "quant_sym_modes": [False], + # "quant_avg_modes": [False], + # "quant_percentiles": [99.999], + # "quant_num_bin_ls": [2048], + # "training_whitelist": [ + # "torch_jj_config2", + # ], + # }, + # "quant-filter-hun": { + # "epochs": evaluation_epochs, + # "feature_flow_key": "fb", + # "prior_scales": [0.9], + # "pronunciation_scales": [0.0], + # "lm_scales": [10.0], + # "lm_lookahead": True, + # "lookahead_options": None, + # "create_lattice": True, + # "eval_single_best": True, + # "eval_best_in_lattice": True, + # "search_parameters": { + # "beam-pruning": 15.0, + # "beam-pruning-limit": 10000, + # "word-end-pruning": 0.5, + # "word-end-pruning-limit": 15000, + 
# }, + # "lattice_to_ctm_kwargs": { + # "fill_empty_segments": True, + # "best_path_algo": "bellman-ford", + # }, + # "optimize_am_lm_scale": True, + # "rtf": 50, + # "mem": 6, + # "lmgc_mem": 16, + # "cpu": 2, + # "parallelize_conversion": True, + # "needs_features_size": True, + # "quantize": [1], + # "quant_filter_opts": [ + # {"range_len": (95, 105)}, + # {"range_len": (195, 205)}, + # {"range_len": (495, 505)}, + # {"range_len": (595, 605)}, + # {"range_len": (1000, 1100)}, + # {"range_len": (1900, 2000)}, + # {"range_len": (2000, 3000)}, + # ], + # "quant_modes": [CalibrationMethod.MinMax], + # "random_seed_draws": 100, + # "quant_sym_modes": [False], + # "quant_avg_modes": [False], + # "quant_percentiles": [99.999], + # "quant_num_bin_ls": [2048], + # "training_whitelist": [ + # "torch_jj_config2", + # ], + # }, # "quant-filter-partition": { # "epochs": evaluation_epochs, # "feature_flow_key": "fb", @@ -393,51 +561,57 @@ def get_nn_args(num_outputs: int = 9001, num_epochs: int = 250, debug=False, **n # "torch_jj_config2", # ], # }, - "quant-filter-budget": { - "epochs": evaluation_epochs, - "feature_flow_key": "fb", - "prior_scales": [0.9], - "pronunciation_scales": [0.0], - "lm_scales": [10.0], - "lm_lookahead": True, - "lookahead_options": None, - "create_lattice": True, - "eval_single_best": True, - "eval_best_in_lattice": True, - "search_parameters": { - "beam-pruning": 15.0, - "beam-pruning-limit": 10000, - "word-end-pruning": 0.5, - "word-end-pruning-limit": 15000, - }, - "lattice_to_ctm_kwargs": { - "fill_empty_segments": True, - "best_path_algo": "bellman-ford", - }, - "optimize_am_lm_scale": True, - "rtf": 50, - "mem": 6, - "lmgc_mem": 16, - "cpu": 2, - "parallelize_conversion": True, - "needs_features_size": True, - "quantize": [None], - # "quant_filter_opts": [None, {"min_seq_len": 1000}, {"max_seq_len": 1000}], - "quant_filter_opts": [ - {"budget": (30000, 0.01)}, {"budget": (10000, 0.01)}, {"budget": (30000, 0.005)}, {"budget": (80000, 0.01)}, - 
], - # "quant_modes": [CalibrationMethod.MinMax, CalibrationMethod.Percentile], - "quant_modes": [CalibrationMethod.MinMax, CalibrationMethod.Percentile], - "random_seed_draws": 100, - "quant_sym_modes": [False], - "quant_avg_modes": [False], - "quant_percentiles": [99.999], - "quant_num_bin_ls": [2048], - "quant_ops": [["Conv", "MatMul"]], - "training_whitelist": [ - "torch_jj_config2", - ], - }, + # "quant-filter-budget": { + # "epochs": evaluation_epochs, + # "feature_flow_key": "fb", + # "prior_scales": [0.9], + # "pronunciation_scales": [0.0], + # "lm_scales": [10.0], + # "lm_lookahead": True, + # "lookahead_options": None, + # "create_lattice": True, + # "eval_single_best": True, + # "eval_best_in_lattice": True, + # "search_parameters": { + # "beam-pruning": 15.0, + # "beam-pruning-limit": 10000, + # "word-end-pruning": 0.5, + # "word-end-pruning-limit": 15000, + # }, + # "lattice_to_ctm_kwargs": { + # "fill_empty_segments": True, + # "best_path_algo": "bellman-ford", + # }, + # "optimize_am_lm_scale": True, + # "rtf": 50, + # "mem": 6, + # "lmgc_mem": 16, + # "cpu": 2, + # "parallelize_conversion": True, + # "needs_features_size": True, + # "quantize": [None], + # # "quant_filter_opts": [None, {"min_seq_len": 1000}, {"max_seq_len": 1000}], + # "quant_filter_opts": [ + # {"budget": (30000, 0.01)}, + # {"budget": (10000, 0.01)}, + # {"budget": (30000, 0.005)}, + # {"budget": (80000, 0.01)}, + # {"budget": (2500, 0.01)}, + # {"budget": (4000, 0.01)}, + # {"budget": (1000, 0.01)}, + # ], + # # "quant_modes": [CalibrationMethod.MinMax, CalibrationMethod.Percentile], + # "quant_modes": [CalibrationMethod.MinMax, CalibrationMethod.Percentile], + # "random_seed_draws": 100, + # "quant_sym_modes": [False], + # "quant_avg_modes": [False], + # "quant_percentiles": [99.999], + # "quant_num_bin_ls": [2048], + # "quant_ops": [["Conv", "MatMul"]], + # "training_whitelist": [ + # "torch_jj_config2", + # ], + # }, # "quant-filter-seqtags": { # "epochs": evaluation_epochs, # 
"feature_flow_key": "fb", diff --git a/users/hilmes/experiments/tedlium2/standalone/config.py b/users/hilmes/experiments/tedlium2/standalone/config.py index 7f8c5d738..5cb3bd1ee 100644 --- a/users/hilmes/experiments/tedlium2/standalone/config.py +++ b/users/hilmes/experiments/tedlium2/standalone/config.py @@ -2,7 +2,7 @@ Universal helpers to create configuration objects (i6_core ReturnnConfig) for RETURNN training/forwarding """ import copy -from typing import Any, Dict, Optional +from typing import Any, Dict, Optional, List from i6_core.returnn.config import ReturnnConfig, CodeWrapper @@ -183,6 +183,7 @@ def get_static_quant_config( config: Dict[str, Any], num_samples: int, dataset_seed: int, + dataset_filter_args: Optional[Dict[str, Any]], debug: bool = False, ): """ @@ -206,6 +207,8 @@ def get_static_quant_config( base_config['forward']['seq_ordering'] = 'random' base_config['forward']['datasets']['zip_dataset']['fixed_random_subset'] = num_samples base_config['forward']['datasets']['zip_dataset']['fixed_random_subset_seed'] = dataset_seed + if dataset_filter_args is not None: + base_config['forward']['datasets']['zip_dataset']['random_subset_filter_args'] = dataset_filter_args config = {**base_config, **copy.deepcopy(config)} post_config["backend"] = "torch" assert net_args.keys().isdisjoint(quant_args.keys()) diff --git a/users/hilmes/experiments/tedlium2/standalone/default_tools.py b/users/hilmes/experiments/tedlium2/standalone/default_tools.py index 2cd71d3d6..8c1b48070 100644 --- a/users/hilmes/experiments/tedlium2/standalone/default_tools.py +++ b/users/hilmes/experiments/tedlium2/standalone/default_tools.py @@ -19,7 +19,7 @@ I6_MODELS_REPO_PATH = CloneGitRepositoryJob( url="https://github.com/rwth-i6/i6_models", - commit="933c6c13f7d6c74e5a59af0257e17c208dae9da3", + commit="3c9173691521778b1e8b4070c172cbe929e4826b", checkout_folder_name="i6_models", ).out_repository.copy() I6_MODELS_REPO_PATH.hash_overwrite = "TEDLIUM_STANDALONE_DEFAULT_I6_MODELS" @@ 
-35,3 +35,7 @@ commit_hash="5015a45e28a958f800ef1c50e7880c0c9ef414cf", ).copy() SUBWORD_NMT_REPO.hash_overwrite = "I6_SUBWORD_NMT_V2" + +QUANT_RETURNN = CloneGitRepositoryJob( + "https://github.com/JackTemaki/MiniReturnn", commit="f31614f2a071aa75588eff6f2231b54751fb962c" +).out_repository.copy() diff --git a/users/hilmes/experiments/tedlium2/standalone/experiments/ctc_bpe/baseline.py b/users/hilmes/experiments/tedlium2/standalone/experiments/ctc_bpe/baseline.py index d37553098..8ca22093c 100644 --- a/users/hilmes/experiments/tedlium2/standalone/experiments/ctc_bpe/baseline.py +++ b/users/hilmes/experiments/tedlium2/standalone/experiments/ctc_bpe/baseline.py @@ -281,7 +281,7 @@ def tune_and_evaluate_helper( activation=None, ) - for bpe in [0, 32, 64, 100, 128, 256, 512, 1024]: + for bpe in [32, 64, 100, 128, 256, 512, 1024]: prefix_name_bpe = f"experiments/tedlium2/ctc_rnnt_standalone_2024/bpe_ctc_bpe_{bpe}" train_data_bpe = build_bpe_training_datasets( prefix=prefix_name_bpe, diff --git a/users/hilmes/experiments/tedlium2/standalone/experiments/ctc_phon/baseline.py b/users/hilmes/experiments/tedlium2/standalone/experiments/ctc_phon/baseline.py index 48e054f81..77e80e7b9 100644 --- a/users/hilmes/experiments/tedlium2/standalone/experiments/ctc_phon/baseline.py +++ b/users/hilmes/experiments/tedlium2/standalone/experiments/ctc_phon/baseline.py @@ -1,36 +1,19 @@ -import copy -from dataclasses import asdict, dataclass +from dataclasses import asdict import numpy as np -from typing import cast, List, Dict, Any, Optional - -from i6_core.tools.parameter_tuning import GetOptimalParametersAsVariableJob -from i6_core.tools.git import CloneGitRepositoryJob +from typing import cast from i6_experiments.common.setups.returnn.datastreams.vocabulary import LabelDatastream - -from ...data.common import DatasetSettings, build_test_dataset, TrainingDatasets +from .tune_eval import QuantArgs +from ...data.common import DatasetSettings, build_test_dataset from ...data.phon import 
build_eow_phon_training_datasets, get_text_lexicon from ...default_tools import RETURNN_EXE, MINI_RETURNN_ROOT from ...lm import get_4gram_binary_lm -from ...pipeline import training, prepare_asr_model, search, ASRModel, quantize_static +from ...pipeline import training, prepare_asr_model from ...report import generate_report -from ...config import get_static_quant_config - -@dataclass -class QuantArgs: - sample_ls: List[int] - quant_config_dict: Dict[str, Any] - decoder: str - num_iterations: int - datasets: TrainingDatasets - network_module: str - -QUANT_RETURNN = CloneGitRepositoryJob( - "https://github.com/JackTemaki/MiniReturnn", commit="f31614f2a071aa75588eff6f2231b54751fb962c" -).out_repository.copy() +from .tune_eval import tune_and_evaluate_helper -def eow_phon_ted_1023_base(): +def eow_phon_ted_1023_base(quant=False): prefix_name = "experiments/tedlium2/ctc_rnnt_standalone_2024/ctc_eow_phon" train_settings = DatasetSettings( @@ -72,115 +55,6 @@ def eow_phon_ted_1023_base(): from ...pytorch_networks.ctc.decoder.flashlight_ctc_v1 import DecoderConfig - def tune_and_evaluate_helper( - training_name: str, - asr_model: ASRModel, - base_decoder_config: DecoderConfig, - lm_scales: List[float], - prior_scales: List[float], - quant_str: Optional[str] = None, - eval_test: bool = False, - quant_args: Optional[QuantArgs] = None, - ): - """ - Example helper to execute tuning over lm_scales and prior scales. - With the best values runs test-clean and test-other. - - This is just a reference helper and can (should) be freely changed, copied, modified etc... 
- - :param training_name: for alias and output names - :param asr_model: ASR model to use - :param base_decoder_config: any decoder config dataclass - :param lm_scales: lm scales for tuning - :param prior_scales: prior scales for tuning, same length as lm scales - """ - tune_parameters = [] - tune_values = [] - results = {} - for lm_weight in lm_scales: - for prior_scale in prior_scales: - decoder_config = copy.deepcopy(base_decoder_config) - decoder_config.lm_weight = lm_weight - decoder_config.prior_scale = prior_scale - search_name = training_name + "/search_lm%.1f_prior%.1f" % (lm_weight, prior_scale) - search_jobs, wers = search( - search_name, - forward_config={}, - asr_model=asr_model, - decoder_module="ctc.decoder.flashlight_ctc_v1", - decoder_args={"config": asdict(decoder_config)}, - test_dataset_tuples=dev_dataset_tuples, - **default_returnn, - ) - tune_parameters.append((lm_weight, prior_scale)) - tune_values.append((wers[search_name + "/dev"])) - results.update(wers) - if quant_args is not None: - assert quant_str is not None, "You want your quant to have a name" - for num_samples in quant_args.sample_ls: - for seed in range(quant_args.num_iterations): - it_name = training_name + quant_str + f"/quantize_static/samples_{num_samples}/seed_{seed}" - quant_config = get_static_quant_config( - training_datasets=quant_args.datasets, - network_module=quant_args.network_module, - net_args=asr_model.net_args, - quant_args=quant_args.quant_config_dict, - config={}, - num_samples=num_samples, - dataset_seed=seed, - debug=False, - ) - quant_chkpt = quantize_static( - prefix_name=it_name, - returnn_config=quant_config, - checkpoint=asr_model.checkpoint, - returnn_exe=RETURNN_EXE, - returnn_root=QUANT_RETURNN, - ) - quant_model = ASRModel( - checkpoint=quant_chkpt, - net_args=asr_model.net_args | quant_args.quant_config_dict, - network_module=quant_args.network_module, - prior_file=asr_model.prior_file, - prefix_name=it_name - ) - for lm_weight in lm_scales: - for 
prior_scale in prior_scales: - decoder_config = copy.deepcopy(base_decoder_config) - decoder_config.lm_weight = lm_weight - decoder_config.prior_scale = prior_scale - search_name = it_name + "/search_lm%.1f_prior%.1f" % (lm_weight, prior_scale) - search_jobs, wers = search( - search_name, - forward_config={}, - asr_model=quant_model, - decoder_module=quant_args.decoder, - decoder_args={"config": asdict(decoder_config)}, - test_dataset_tuples=dev_dataset_tuples, - **default_returnn, - ) - results.update(wers) - pick_optimal_params_job = GetOptimalParametersAsVariableJob( - parameters=tune_parameters, values=tune_values, mode="minimize" - ) - pick_optimal_params_job.add_alias(training_name + f"/pick_best_dev") - if eval_test: - for key, tune_values in [("test", tune_values)]: - decoder_config = copy.deepcopy(base_decoder_config) - decoder_config.lm_weight = pick_optimal_params_job.out_optimal_parameters[0] - decoder_config.prior_scale = pick_optimal_params_job.out_optimal_parameters[1] - search_jobs, wers = search( - training_name, - forward_config={}, - asr_model=asr_model, - decoder_module="ctc.decoder.flashlight_ctc_v1", - decoder_args={"config": asdict(decoder_config)}, - test_dataset_tuples={key: test_dataset_tuples[key]}, - **default_returnn, - ) - results.update(wers) - return results, pick_optimal_params_job - default_decoder_config = DecoderConfig( lexicon=get_text_lexicon(), returnn_vocab=label_datastream.vocab, @@ -277,7 +151,9 @@ def tune_and_evaluate_helper( training_name, train_job, train_args, with_prior=True, datasets=train_data, get_specific_checkpoint=250 ) res, _ = tune_and_evaluate_helper( - training_name, asr_model, default_decoder_config, lm_scales=[1.4, 1.6, 1.8, 2.0, 2.2, 2.4], prior_scales=[0.0, 0.3, 0.5, 0.7, 1.0] + training_name, asr_model, default_decoder_config, + lm_scales=[1.4, 1.6, 1.8, 2.0, 2.2, 2.4], prior_scales=[0.0, 0.3, 0.5, 0.7, 1.0], + dev_dataset_tuples=dev_dataset_tuples ) results.update(res) generate_report(results=results, 
exp_name=training_name) @@ -309,7 +185,7 @@ def tune_and_evaluate_helper( prior_scales = [0.7, 0.9] res, _ = tune_and_evaluate_helper( training_name, asr_model, default_decoder_config, lm_scales=lm_scales, - prior_scales=prior_scales + prior_scales=prior_scales, dev_dataset_tuples=dev_dataset_tuples ) results.update(res) asr_model_best4 = prepare_asr_model( @@ -317,228 +193,260 @@ def tune_and_evaluate_helper( get_best_averaged_checkpoint=(4, "dev_loss_ctc") ) res, _ = tune_and_evaluate_helper(training_name + "/best4", asr_model_best4, default_decoder_config, - lm_scales=lm_scales, prior_scales=prior_scales) + lm_scales=lm_scales, prior_scales=prior_scales, dev_dataset_tuples=dev_dataset_tuples) results.update(res) asr_model_best = prepare_asr_model( training_name + "/best", train_job, train_args, with_prior=True, datasets=train_data, get_best_averaged_checkpoint=(1, "dev_loss_ctc") ) res, _ = tune_and_evaluate_helper(training_name + "/best", asr_model_best, default_decoder_config, - lm_scales=lm_scales, prior_scales=prior_scales) + lm_scales=lm_scales, prior_scales=prior_scales, dev_dataset_tuples=dev_dataset_tuples) results.update(res) generate_report(results=results, exp_name=training_name) # TODO current best with 7.083 del results - from ...pytorch_networks.ctc.conformer_1023.quant.baseline_quant_v1_cfg import QuantModelConfigV1 - num_iterations = 100 - # what if we give more information to the activation instead? 
- for activation_bit in [8, 7, 6, 5, 4, 3, 2, 1]: - for weight_bit in [8, 7, 6, 5, 4, 3, 2, 1]: - results = {} - model_config_quant_v1 = QuantModelConfigV1( - weight_quant_dtype="qint8", - weight_quant_method="per_tensor", - activation_quant_dtype="qint8", - activation_quant_method="per_tensor", - dot_quant_dtype="qint8", - dot_quant_method="per_tensor", - Av_quant_dtype="qint8", - Av_quant_method="per_tensor", - moving_average=0.01, - weight_bit_prec=weight_bit, - activation_bit_prec=activation_bit, - linear_quant_output=False, - ) - quant_args = QuantArgs( - sample_ls=[10] if weight_bit < 8 or activation_bit < 8 else [10, 100, 1000, 10000], - quant_config_dict={"quant_config_dict": asdict(model_config_quant_v1)}, - decoder="ctc.decoder.flashlight_quant_stat_phoneme_ctc", - num_iterations=num_iterations, - datasets=train_data, - network_module="ctc.conformer_1023.quant.baseline_quant_v1", - ) - quant_str = f"_weight_{weight_bit}_act_{activation_bit}" - asr_model = prepare_asr_model( - training_name+quant_str, - train_job, - train_args, - with_prior=True, - datasets=train_data, - get_specific_checkpoint=250, - ) - res, _ = tune_and_evaluate_helper( # only take best for now, since otherwise too many searches - training_name, asr_model, default_decoder_config, lm_scales=[2.8], - prior_scales=[0.7], quant_args=quant_args, quant_str=quant_str, - ) - results.update(res) - generate_report(results=results, exp_name=training_name + quant_str) - del results - - num_iterations = 100 - for activation_bit in [8]: - for weight_bit in [8, 7, 6, 5, 4, 3, 2, 1]: - results = {} - model_config_quant_v1 = QuantModelConfigV1( - weight_quant_dtype="qint8", - weight_quant_method="per_tensor", - activation_quant_dtype="qint8", - activation_quant_method="per_tensor", - dot_quant_dtype="qint8", - dot_quant_method="per_tensor", - Av_quant_dtype="qint8", - Av_quant_method="per_tensor", - moving_average=0.01, - weight_bit_prec=weight_bit, - activation_bit_prec=activation_bit, - 
linear_quant_output=True, - ) - quant_args = QuantArgs( - sample_ls=[10] if weight_bit < 8 or activation_bit < 8 else [10, 100, 1000, 10000], - quant_config_dict={"quant_config_dict": asdict(model_config_quant_v1)}, - decoder="ctc.decoder.flashlight_quant_stat_phoneme_ctc", - num_iterations=num_iterations, - datasets=train_data, - network_module="ctc.conformer_1023.quant.baseline_quant_v1", - ) - quant_str = f"_weight_{weight_bit}_act_{activation_bit}_qlin" - asr_model = prepare_asr_model( - training_name+quant_str, train_job, train_args, with_prior=True, datasets=train_data, get_specific_checkpoint=250 - ) - res, _ = tune_and_evaluate_helper( # only take best for now, since otherwise too many searches - training_name, asr_model, default_decoder_config, lm_scales=[2.8], - prior_scales=[0.7], quant_args=quant_args, quant_str=quant_str - ) - results.update(res) - generate_report(results=results, exp_name=training_name+quant_str) - del results + if quant is True: + from ...pytorch_networks.ctc.conformer_1023.quant.baseline_quant_v1_cfg import QuantModelConfigV1 + num_iterations = 100 + # what if we give more information to the activation instead? 
+ for activation_bit in [8, 7, 6, 5, 4, 3, 2, 1]: + for weight_bit in [8, 7, 6, 5, 4, 3, 2, 1]: + results = {} + model_config_quant_v1 = QuantModelConfigV1( + weight_quant_dtype="qint8", + weight_quant_method="per_tensor", + activation_quant_dtype="qint8", + activation_quant_method="per_tensor", + dot_quant_dtype="qint8", + dot_quant_method="per_tensor", + Av_quant_dtype="qint8", + Av_quant_method="per_tensor", + moving_average=0.01, + weight_bit_prec=weight_bit, + activation_bit_prec=activation_bit, + linear_quant_output=False, + ) + quant_args = QuantArgs( + sample_ls=[10] if weight_bit < 8 or activation_bit < 8 else [10, 100, 1000, 10000], + quant_config_dict={"quant_config_dict": asdict(model_config_quant_v1)}, + decoder="ctc.decoder.flashlight_quant_stat_phoneme_ctc", + num_iterations=num_iterations, + datasets=train_data, + network_module="ctc.conformer_1023.quant.baseline_quant_v1", + ) + quant_str = f"_weight_{weight_bit}_act_{activation_bit}" + asr_model = prepare_asr_model( + training_name+quant_str, + train_job, + train_args, + with_prior=True, + datasets=train_data, + get_specific_checkpoint=250, + ) + res, _ = tune_and_evaluate_helper( # only take best for now, since otherwise too many searches + training_name, asr_model, default_decoder_config, lm_scales=[2.8], + prior_scales=[0.7], quant_args=quant_args, quant_str=quant_str, + dev_dataset_tuples=dev_dataset_tuples, + ) + results.update(res) + generate_report(results=results, exp_name=training_name + quant_str) + del results + num_iterations = 250 + for filter in [ + ({"unique_tags": 0.0}, "unique"), + ({"single_tag": 0.0}, "single"), + ({"max_dur": 1.0}, "max_dur_1"), + ({"min_dur": 15.0}, "min_dur_15") + ]: + for activation_bit in [8]: + for weight_bit in [8]: + results = {} + model_config_quant_v1 = QuantModelConfigV1( + weight_quant_dtype="qint8", + weight_quant_method="per_tensor", + activation_quant_dtype="qint8", + activation_quant_method="per_tensor", + dot_quant_dtype="qint8", + 
dot_quant_method="per_tensor", + Av_quant_dtype="qint8", + Av_quant_method="per_tensor", + moving_average=0.01, + weight_bit_prec=weight_bit, + activation_bit_prec=activation_bit, + linear_quant_output=False, + ) + quant_args = QuantArgs( + sample_ls=[1], #§, 10, 25, 5], + quant_config_dict={"quant_config_dict": asdict(model_config_quant_v1)}, + decoder="ctc.decoder.flashlight_quant_stat_phoneme_ctc", + num_iterations=num_iterations, + datasets=train_data, + network_module="ctc.conformer_1023.quant.baseline_quant_v1", + filter_args=filter[0], + ) + quant_str = f"_weight_{weight_bit}_act_{activation_bit}_{filter[1]}" + asr_model = prepare_asr_model( + training_name+quant_str, + train_job, + train_args, + with_prior=True, + datasets=train_data, + get_specific_checkpoint=250, + ) + res, _ = tune_and_evaluate_helper( # only take best for now, since otherwise too many searches + training_name, asr_model, default_decoder_config, lm_scales=[2.8], + prior_scales=[0.7], quant_args=quant_args, quant_str=quant_str, dev_dataset_tuples=dev_dataset_tuples + ) + results.update(res) + generate_report(results=results, exp_name=training_name + quant_str) + del results - for activation_bit in [8]: - for weight_bit in [8]: - results = {} - model_config_quant_v1 = QuantModelConfigV1( - weight_quant_dtype="qint8", - weight_quant_method="per_tensor", - activation_quant_dtype="qint8", - activation_quant_method="per_tensor", - dot_quant_dtype="qint8", - dot_quant_method="per_tensor", - Av_quant_dtype="qint8", - Av_quant_method="per_tensor", - moving_average=None, - weight_bit_prec=weight_bit, - activation_bit_prec=activation_bit, - linear_quant_output=False, - ) - quant_args = QuantArgs( - sample_ls=[10] if weight_bit < 8 or activation_bit < 8 else [10, 100, 1000, 10000], - quant_config_dict={"quant_config_dict": asdict(model_config_quant_v1)}, - decoder="ctc.decoder.flashlight_quant_stat_phoneme_ctc", - num_iterations=num_iterations, - datasets=train_data, - 
network_module="ctc.conformer_1023.quant.baseline_quant_v1", - ) - quant_str = f"_weight_{weight_bit}_act_{activation_bit}_no_avg" - asr_model = prepare_asr_model( - training_name+quant_str, - train_job, - train_args, - with_prior=True, - datasets=train_data, - get_specific_checkpoint=250, - ) - res, _ = tune_and_evaluate_helper( # only take best for now, since otherwise too many searches - training_name, asr_model, default_decoder_config, lm_scales=[2.8], - prior_scales=[0.7], quant_args=quant_args, quant_str=quant_str, - ) - results.update(res) - generate_report(results=results, exp_name=training_name + quant_str) - del results + num_iterations = 100 + for activation_bit in [8]: + for weight_bit in [8, 7, 6, 5, 4, 3, 2, 1]: + results = {} + model_config_quant_v1 = QuantModelConfigV1( + weight_quant_dtype="qint8", + weight_quant_method="per_tensor", + activation_quant_dtype="qint8", + activation_quant_method="per_tensor", + dot_quant_dtype="qint8", + dot_quant_method="per_tensor", + Av_quant_dtype="qint8", + Av_quant_method="per_tensor", + moving_average=0.01, + weight_bit_prec=weight_bit, + activation_bit_prec=activation_bit, + linear_quant_output=True, + ) + quant_args = QuantArgs( + sample_ls=[10] if weight_bit < 8 or activation_bit < 8 else [10, 100, 1000, 10000], + quant_config_dict={"quant_config_dict": asdict(model_config_quant_v1)}, + decoder="ctc.decoder.flashlight_quant_stat_phoneme_ctc", + num_iterations=num_iterations, + datasets=train_data, + network_module="ctc.conformer_1023.quant.baseline_quant_v1", + ) + quant_str = f"_weight_{weight_bit}_act_{activation_bit}_qlin" + asr_model = prepare_asr_model( + training_name+quant_str, train_job, train_args, with_prior=True, datasets=train_data, get_specific_checkpoint=250 + ) + res, _ = tune_and_evaluate_helper( # only take best for now, since otherwise too many searches + training_name, asr_model, default_decoder_config, lm_scales=[2.8], + prior_scales=[0.7], quant_args=quant_args, quant_str=quant_str, 
dev_dataset_tuples=dev_dataset_tuples + ) + results.update(res) + generate_report(results=results, exp_name=training_name+quant_str) + del results - model_config_drop_03 = ModelConfig( - feature_extraction_config=fe_config, - frontend_config=frontend_config, - specaug_config=specaug_config, - label_target_size=vocab_size_without_blank, - conformer_size=384, - num_layers=12, - num_heads=4, - ff_dim=1536, - att_weights_dropout=0.3, - conv_dropout=0.3, - ff_dropout=0.3, - mhsa_dropout=0.3, - conv_kernel_size=31, - final_dropout=0.3, - specauc_start_epoch=1, - ) + for activation_bit in [8]: + for weight_bit in [8]: + results = {} + model_config_quant_v1 = QuantModelConfigV1( + weight_quant_dtype="qint8", + weight_quant_method="per_tensor", + activation_quant_dtype="qint8", + activation_quant_method="per_tensor", + dot_quant_dtype="qint8", + dot_quant_method="per_tensor", + Av_quant_dtype="qint8", + Av_quant_method="per_tensor", + moving_average=None, + weight_bit_prec=weight_bit, + activation_bit_prec=activation_bit, + linear_quant_output=False, + ) + quant_args = QuantArgs( + sample_ls=[10] if weight_bit < 8 or activation_bit < 8 else [10, 100, 1000, 10000], + quant_config_dict={"quant_config_dict": asdict(model_config_quant_v1)}, + decoder="ctc.decoder.flashlight_quant_stat_phoneme_ctc", + num_iterations=num_iterations, + datasets=train_data, + network_module="ctc.conformer_1023.quant.baseline_quant_v1", + ) + quant_str = f"_weight_{weight_bit}_act_{activation_bit}_no_avg" + asr_model = prepare_asr_model( + training_name+quant_str, + train_job, + train_args, + with_prior=True, + datasets=train_data, + get_specific_checkpoint=250, + ) + res, _ = tune_and_evaluate_helper( # only take best for now, since otherwise too many searches + training_name, asr_model, default_decoder_config, lm_scales=[2.8], + prior_scales=[0.7], quant_args=quant_args, quant_str=quant_str, dev_dataset_tuples=dev_dataset_tuples, + ) + results.update(res) + generate_report(results=results, 
exp_name=training_name + quant_str) + del results - train_config_24gbgpu_amp = { + # E-Branchformer + branchformer_module = "ctc.conformer_1023.i6models_ebranchformer_v1" + train_config = { "optimizer": {"class": "adamw", "epsilon": 1e-16, "weight_decay": 1e-3}, - "learning_rates": list(np.linspace(7e-6, 5e-4, 210)) - + list(np.linspace(5e-4, 5e-5, 210)) - + list(np.linspace(5e-5, 1e-7, 30)), + "learning_rates": list(np.linspace(7e-6, 5e-4, 110)) + + list(np.linspace(5e-4, 5e-5, 110)) + + list(np.linspace(5e-5, 1e-7, 30)), ############# - "batch_size": 360 * 16000, + "batch_size": 180 * 16000, "max_seq_length": {"audio_features": 35 * 16000}, "accum_grad_multiple_step": 1, - "torch_amp_options": {"dtype": "bfloat16"}, } train_args = { - "config": train_config_24gbgpu_amp, - "network_module": network_module, - "net_args": {"model_config_dict": asdict(model_config_drop_03)}, + "config": train_config, + "network_module": branchformer_module, + "net_args": {"model_config_dict": asdict(model_config)}, "debug": False, - "use_speed_perturbation": True } results = {} - training_name = prefix_name + "/" + network_module + "_384dim_sub4_24gbgpu_50eps_amp_longer" - train_job = training(training_name, train_data, train_args, num_epochs=450, **default_returnn) - train_job.rqmt["gpu_mem"] = 24 + training_name = prefix_name + "/" + branchformer_module + "_384dim_sub4_50eps" + train_job = training(training_name, train_data, train_args, num_epochs=250, **default_returnn) asr_model = prepare_asr_model( - training_name, train_job, train_args, with_prior=True, datasets=train_data, get_specific_checkpoint=450 + training_name, train_job, train_args, with_prior=True, datasets=train_data, get_specific_checkpoint=250 ) + lm_scales = [2.0, 2.2, 2.4, 2.6, 2.8] + prior_scales = [0.7, 0.9] res, _ = tune_and_evaluate_helper( - training_name, asr_model, default_decoder_config, lm_scales=[2.0, 2.2, 2.4, 2.6, 2.8], prior_scales=[0.5, 0.7] + training_name, asr_model, default_decoder_config, 
lm_scales=lm_scales, + prior_scales=prior_scales, dev_dataset_tuples=dev_dataset_tuples ) results.update(res) asr_model_best4 = prepare_asr_model( - training_name + "/best4", train_job, train_args, with_prior=True, datasets=train_data, get_best_averaged_checkpoint=(4, "dev_loss_ctc") + training_name + "/best4", train_job, train_args, with_prior=True, datasets=train_data, + get_best_averaged_checkpoint=(4, "dev_loss_ctc") ) - res, _ = tune_and_evaluate_helper(training_name + "/best4", asr_model_best4, default_decoder_config, lm_scales=[2.3, 2.5, 2.7], prior_scales=[0.5, 0.7]) + res, _ = tune_and_evaluate_helper(training_name + "/best4", asr_model_best4, default_decoder_config, + lm_scales=lm_scales, prior_scales=prior_scales, dev_dataset_tuples=dev_dataset_tuples) results.update(res) asr_model_best = prepare_asr_model( training_name + "/best", train_job, train_args, with_prior=True, datasets=train_data, get_best_averaged_checkpoint=(1, "dev_loss_ctc") ) res, _ = tune_and_evaluate_helper(training_name + "/best", asr_model_best, default_decoder_config, - lm_scales=[2.3, 2.5, 2.7], prior_scales=[0.5, 0.7]) + lm_scales=lm_scales, prior_scales=prior_scales, dev_dataset_tuples=dev_dataset_tuples) results.update(res) - generate_report(results=results, exp_name=training_name) + generate_report(results=results, exp_name=training_name) # TODO current best with 6.99 del results - - network_module_conv_first = "ctc.conformer_1023.i6modelsV1_VGG4LayerActFrontendV1_v6_conv_first" - train_args = { - "config": train_config_24gbgpu_amp, - "network_module": network_module_conv_first, - "net_args": {"model_config_dict": asdict(model_config)}, - "debug": False, - } - results = {} - training_name = prefix_name + "/" + network_module_conv_first + "_384dim_sub4_24gbgpu_50eps_conv_first_amp" - train_job = training(training_name, train_data, train_args, num_epochs=250, **default_returnn) - train_job.rqmt["gpu_mem"] = 24 - asr_model = prepare_asr_model( - training_name, train_job, 
train_args, with_prior=True, datasets=train_data, get_specific_checkpoint=250 - ) - res, _ = tune_and_evaluate_helper( - training_name, asr_model, default_decoder_config, lm_scales=[1.8, 2.0, 2.2, 2.4, 2.6, 2.8], - prior_scales=[0.5, 0.7] + unimod_module = "ctc.conformer_1023.conformer_v1_uni_aggr_v1" + from ...pytorch_networks.ctc.conformer_1023.conformer_v1_uni_aggr_cfg_v1 import ModelConfig as UniAggrConfig + uni_aggr_model_config = UniAggrConfig( + feature_extraction_config=fe_config, + frontend_config=frontend_config, + specaug_config=specaug_config, + label_target_size=vocab_size_without_blank, + conformer_size=384, + num_layers=12, + num_heads=4, + ff_dim=1536, + att_weights_dropout=0.2, + conv_dropout=0.2, + ff_dropout=0.2, + mhsa_dropout=0.2, + conv_kernel_size=31, + final_dropout=0.2, + specauc_start_epoch=1, + aggr_layer=9, ) - results.update(res) - generate_report(results=results, exp_name=training_name) - del results - - # E-Branchformer - branchformer_module = "ctc.conformer_1023.i6models_ebranchformer_v1" train_config = { "optimizer": {"class": "adamw", "epsilon": 1e-16, "weight_decay": 1e-3}, "learning_rates": list(np.linspace(7e-6, 5e-4, 110)) @@ -549,23 +457,24 @@ def tune_and_evaluate_helper( "max_seq_length": {"audio_features": 35 * 16000}, "accum_grad_multiple_step": 1, } + # Unimodal Aggregation train_args = { "config": train_config, - "network_module": branchformer_module, - "net_args": {"model_config_dict": asdict(model_config)}, + "network_module": unimod_module, + "net_args": {"model_config_dict": asdict(uni_aggr_model_config)}, "debug": False, } results = {} - training_name = prefix_name + "/" + branchformer_module + "_384dim_sub4_50eps" + training_name = prefix_name + "/" + unimod_module + "_384dim_sub4_50eps" train_job = training(training_name, train_data, train_args, num_epochs=250, **default_returnn) asr_model = prepare_asr_model( - training_name, train_job, train_args, with_prior=True, datasets=train_data, get_specific_checkpoint=250 
+ training_name, train_job, train_args, with_prior=True, datasets=train_data, get_specific_checkpoint=111 ) lm_scales = [2.0, 2.2, 2.4, 2.6, 2.8] prior_scales = [0.7, 0.9] res, _ = tune_and_evaluate_helper( training_name, asr_model, default_decoder_config, lm_scales=lm_scales, - prior_scales=prior_scales + prior_scales=prior_scales, dev_dataset_tuples=dev_dataset_tuples ) results.update(res) asr_model_best4 = prepare_asr_model( @@ -573,14 +482,14 @@ def tune_and_evaluate_helper( get_best_averaged_checkpoint=(4, "dev_loss_ctc") ) res, _ = tune_and_evaluate_helper(training_name + "/best4", asr_model_best4, default_decoder_config, - lm_scales=lm_scales, prior_scales=prior_scales) + lm_scales=lm_scales, prior_scales=prior_scales, dev_dataset_tuples=dev_dataset_tuples) results.update(res) asr_model_best = prepare_asr_model( training_name + "/best", train_job, train_args, with_prior=True, datasets=train_data, get_best_averaged_checkpoint=(1, "dev_loss_ctc") ) res, _ = tune_and_evaluate_helper(training_name + "/best", asr_model_best, default_decoder_config, - lm_scales=lm_scales, prior_scales=prior_scales) + lm_scales=lm_scales, prior_scales=prior_scales, dev_dataset_tuples=dev_dataset_tuples) results.update(res) - generate_report(results=results, exp_name=training_name) # TODO current best with 7.083 - del results \ No newline at end of file + generate_report(results=results, exp_name=training_name) + del results diff --git a/users/hilmes/experiments/tedlium2/standalone/experiments/ctc_phon/tune_eval.py b/users/hilmes/experiments/tedlium2/standalone/experiments/ctc_phon/tune_eval.py new file mode 100644 index 000000000..520ca392c --- /dev/null +++ b/users/hilmes/experiments/tedlium2/standalone/experiments/ctc_phon/tune_eval.py @@ -0,0 +1,135 @@ +from ...default_tools import RETURNN_EXE, QUANT_RETURNN, MINI_RETURNN_ROOT +from ...pipeline import search, ASRModel, quantize_static +from ...pytorch_networks.ctc.decoder.flashlight_ctc_v1 import DecoderConfig +from typing 
import List, Optional, Dict, Any +from ...data.common import TrainingDatasets +from dataclasses import dataclass, asdict +from ...config import get_static_quant_config +import copy +from i6_core.tools.parameter_tuning import GetOptimalParametersAsVariableJob + +@dataclass +class QuantArgs: + sample_ls: List[int] + quant_config_dict: Dict[str, Any] + decoder: str + num_iterations: int + datasets: TrainingDatasets + network_module: str + filter_args: Optional[Dict[str, Any]] = None + +default_returnn = { + "returnn_exe": RETURNN_EXE, + "returnn_root": MINI_RETURNN_ROOT, +} + +def tune_and_evaluate_helper( + training_name: str, + asr_model: ASRModel, + base_decoder_config: DecoderConfig, + lm_scales: List[float], + prior_scales: List[float], + dev_dataset_tuples: Dict[str, Any], + quant_str: Optional[str] = None, + test_dataset_tuples: Optional[Dict[str, Any]] = None, + quant_args: Optional[QuantArgs] = None, +): + """ + Example helper to execute tuning over lm_scales and prior scales. + With the best values runs test-clean and test-other. + + This is just a reference helper and can (should) be freely changed, copied, modified etc... 
+ + :param training_name: for alias and output names + :param asr_model: ASR model to use + :param base_decoder_config: any decoder config dataclass + :param lm_scales: lm scales for tuning + :param prior_scales: prior scales for tuning, same length as lm scales + """ + tune_parameters = [] + tune_values = [] + results = {} + for lm_weight in lm_scales: + for prior_scale in prior_scales: + decoder_config = copy.deepcopy(base_decoder_config) + decoder_config.lm_weight = lm_weight + decoder_config.prior_scale = prior_scale + search_name = training_name + "/search_lm%.1f_prior%.1f" % (lm_weight, prior_scale) + search_jobs, wers = search( + search_name, + forward_config={}, + asr_model=asr_model, + decoder_module="ctc.decoder.flashlight_ctc_v1", + decoder_args={"config": asdict(decoder_config)}, + test_dataset_tuples=dev_dataset_tuples, + **default_returnn, + ) + tune_parameters.append((lm_weight, prior_scale)) + tune_values.append((wers[search_name + "/dev"])) + results.update(wers) + if quant_args is not None: + assert quant_str is not None, "You want your quant to have a name" + for num_samples in quant_args.sample_ls: + for seed in range(quant_args.num_iterations): + it_name = training_name + quant_str + f"/quantize_static/samples_{num_samples}/seed_{seed}" + quant_config = get_static_quant_config( + training_datasets=quant_args.datasets, + network_module=quant_args.network_module, + net_args=asr_model.net_args, + quant_args=quant_args.quant_config_dict, + config={}, + num_samples=num_samples, + dataset_seed=seed, + debug=False, + dataset_filter_args=quant_args.filter_args + ) + quant_chkpt = quantize_static( + prefix_name=it_name, + returnn_config=quant_config, + checkpoint=asr_model.checkpoint, + returnn_exe=RETURNN_EXE, + returnn_root=QUANT_RETURNN, + ) + quant_model = ASRModel( + checkpoint=quant_chkpt, + net_args=asr_model.net_args | quant_args.quant_config_dict, + network_module=quant_args.network_module, + prior_file=asr_model.prior_file, + 
prefix_name=it_name + ) + for lm_weight in lm_scales: + for prior_scale in prior_scales: + decoder_config = copy.deepcopy(base_decoder_config) + decoder_config.lm_weight = lm_weight + decoder_config.prior_scale = prior_scale + search_name = it_name + "/search_lm%.1f_prior%.1f" % (lm_weight, prior_scale) + search_jobs, wers = search( + search_name, + forward_config={}, + asr_model=quant_model, + decoder_module=quant_args.decoder, + decoder_args={"config": asdict(decoder_config)}, + test_dataset_tuples=dev_dataset_tuples, + **default_returnn, + ) + results.update(wers) + pick_optimal_params_job = GetOptimalParametersAsVariableJob( + parameters=tune_parameters, values=tune_values, mode="minimize" + ) + pick_optimal_params_job.add_alias(training_name + f"/pick_best_dev") + if test_dataset_tuples is not None: + for key, tune_values in [("test", tune_values)]: + decoder_config = copy.deepcopy(base_decoder_config) + decoder_config.lm_weight = pick_optimal_params_job.out_optimal_parameters[0] + decoder_config.prior_scale = pick_optimal_params_job.out_optimal_parameters[1] + search_jobs, wers = search( + training_name, + forward_config={}, + asr_model=asr_model, + decoder_module="ctc.decoder.flashlight_ctc_v1", + decoder_args={"config": asdict(decoder_config)}, + test_dataset_tuples={key: test_dataset_tuples[key]}, + **default_returnn, + ) + results.update(wers) + return results, pick_optimal_params_job diff --git a/users/hilmes/experiments/tedlium2/standalone/pytorch_networks/ctc/conformer_1023/conformer_v1_uni_aggr_cfg_v1.py b/users/hilmes/experiments/tedlium2/standalone/pytorch_networks/ctc/conformer_1023/conformer_v1_uni_aggr_cfg_v1.py new file mode 100644 index 000000000..56071ed65 --- /dev/null +++ b/users/hilmes/experiments/tedlium2/standalone/pytorch_networks/ctc/conformer_1023/conformer_v1_uni_aggr_cfg_v1.py @@ -0,0 +1,85 @@ +""" +Config for the base CTC models v4, including specaug start time +""" + +from dataclasses import dataclass + +import torch +from torch 
import nn +from typing import Callable, Optional, Type, Union + +from i6_models.assemblies.conformer.conformer_v1 import ConformerBlockV1Config, ConformerBlockV1 +from i6_models.parts.frontend.vgg_act import VGG4LayerActFrontendV1Config +from i6_models.config import ModuleFactoryV1, ModelConfiguration +from i6_models.primitives.feature_extraction import LogMelFeatureExtractionV1Config + + +@dataclass(kw_only=True) +class VGG4LayerActFrontendV1Config_mod(VGG4LayerActFrontendV1Config): + activation_str: str = "" + activation: Optional[Union[nn.Module, Callable[[torch.Tensor], torch.Tensor]]] = None + + @classmethod + def from_dict(cls, d): + d = d.copy() + activation_str = d.pop("activation_str") + if activation_str == "ReLU": + from torch.nn import ReLU + + activation = ReLU() + else: + assert False, "Unsupported activation %s" % d["activation_str"] + d["activation"] = activation + return VGG4LayerActFrontendV1Config(**d) + + +@dataclass +class ConformerEncoderV1Config(ModelConfiguration): + """ + Attributes: + num_layers: Number of conformer layers in the conformer encoder + frontend: A pair of ConformerFrontend and corresponding config + block_cfg: Configuration for ConformerBlockV1 + """ + + num_layers: int + + # nested configurations + frontend: ModuleFactoryV1 + block_cfg: ConformerBlockV1Config + + +@dataclass +class SpecaugConfig(ModelConfiguration): + repeat_per_n_frames: int + max_dim_time: int + num_repeat_feat: int + max_dim_feat: int + + +@dataclass +class ModelConfig: + feature_extraction_config: LogMelFeatureExtractionV1Config + frontend_config: VGG4LayerActFrontendV1Config + specaug_config: SpecaugConfig + specauc_start_epoch: int + label_target_size: int + conformer_size: int + num_layers: int + num_heads: int + ff_dim: int + att_weights_dropout: float + conv_dropout: float + ff_dropout: float + mhsa_dropout: float + conv_kernel_size: int + final_dropout: float + aggr_layer: int + + @classmethod + def from_dict(cls, d): + d = d.copy() + 
d["feature_extraction_config"] = LogMelFeatureExtractionV1Config(**d["feature_extraction_config"]) + d["frontend_config"] = VGG4LayerActFrontendV1Config_mod.from_dict(d["frontend_config"]) + d["specaug_config"] = SpecaugConfig(**d["specaug_config"]) + return ModelConfig(**d) diff --git a/users/hilmes/experiments/tedlium2/standalone/pytorch_networks/ctc/conformer_1023/conformer_v1_uni_aggr_v1.py b/users/hilmes/experiments/tedlium2/standalone/pytorch_networks/ctc/conformer_1023/conformer_v1_uni_aggr_v1.py new file mode 100644 index 000000000..a73f38b71 --- /dev/null +++ b/users/hilmes/experiments/tedlium2/standalone/pytorch_networks/ctc/conformer_1023/conformer_v1_uni_aggr_v1.py @@ -0,0 +1,278 @@ +""" +Like v2, but with i6_models specaugment (v3) +and now controllable start time for when specaugment is applied (v4) +and with the proper feature extraction from i6-models +""" + +import numpy as np +import torch +from typing import Tuple, Union, Callable +import torch.functional as F +from torch import nn +from dataclasses import dataclass + +from i6_models.parts.conformer.norm import LayerNormNC +from i6_models.assemblies.conformer.conformer_v1 import ConformerBlockV1Config, ConformerBlockV1 +from i6_models.config import ModuleFactoryV1, ModelConfiguration +from i6_models.parts.frontend.vgg_act import VGG4LayerActFrontendV1 + +from i6_models.parts.conformer.convolution import ConformerConvolutionV1Config +from i6_models.parts.conformer.feedforward import ConformerPositionwiseFeedForwardV1Config +from i6_models.parts.conformer.mhsa import ConformerMHSAV1Config +from i6_models.primitives.specaugment import specaugment_v1_by_length +from i6_models.primitives.feature_extraction import LogMelFeatureExtractionV1 + +from returnn.torch.context import get_run_ctx + +from .conformer_v1_uni_aggr_cfg_v1 import ModelConfig + + +def mask_tensor(tensor: torch.Tensor, seq_len: torch.Tensor) -> torch.Tensor: + """ + mask a tensor with a "positive" mask (boolean true means position is 
used) + + This function is traceable. + + :param tensor: [B,T,....] + :param seq_len: [B] + :return: [B,T] + """ + seq_len = seq_len.to(device=tensor.device) + r = torch.arange(tensor.shape[1], device=tensor.device) # [T] + seq_mask = torch.less(r[None, :], seq_len[:, None]) # broadcast to [B,T] + return seq_mask + + +@dataclass +class ConformerAggrEncoderV1Config(ModelConfiguration): + """ + Attributes: + num_layers: Number of conformer layers in the conformer encoder + frontend: A pair of ConformerFrontend and corresponding config + block_cfg: Configuration for ConformerBlockV1 + """ + + num_layers: int + aggr_layer: int + + # nested configurations + frontend: ModuleFactoryV1 + block_cfg: ConformerBlockV1Config + +class ConformerAggrEncoderV1(nn.Module): + """ + Implementation of the convolution-augmented Transformer (short Conformer), as in the original publication. + The model consists of a frontend and a stack of N conformer blocks. + C.f. https://arxiv.org/pdf/2005.08100.pdf + """ + + def __init__(self, cfg: ConformerAggrEncoderV1Config): + """ + :param cfg: conformer encoder configuration with subunits for frontend and conformer blocks + """ + super().__init__() + + self.frontend = cfg.frontend() + self.module_list = torch.nn.ModuleList([ConformerBlockV1(cfg.block_cfg) for _ in range(cfg.num_layers)]) + self.aggr_layer = cfg.aggr_layer + self.aggr_lin = nn.Linear(cfg.block_cfg.ff_cfg.input_dim, 1) + + def forward(self, data_tensor: torch.Tensor, sequence_mask: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + """ + :param data_tensor: input tensor of shape [B, T', F] + :param sequence_mask: mask tensor where 1 defines positions within the sequence and 0 outside, shape: [B, T'] + :return: (output, out_seq_mask) + where output is torch.Tensor of shape [B, T, F'], + out_seq_mask is a torch.Tensor of shape [B, T] + + F: input feature dim, F': internal and output feature dim + T': data time dim, T: down-sampled time dim (internal time dim) + """ + x, 
sequence_mask = self.frontend(data_tensor, sequence_mask) # [B, T, F'] + for i, module in enumerate(self.module_list): + if i == self.aggr_layer: + lengths = torch.sum(sequence_mask, dim=1) + # from https://github.com/Audio-WestlakeU/UMA-ASR/blob/main/espnet2/asr/uma.py + batch, length, _ = x.size() + weights = self.aggr_lin(x) # [B, T, 1] + weights = torch.sigmoid(weights) + scalar_before = weights[:, :-1, :].detach() # (#batch, L-1, 1) + scalar_after = weights[:, 1:, :].detach() # (#batch, L-1, 1) + scalar_before = torch.nn.functional.pad(scalar_before, (0, 0, 1, 0)) # (#batch, L, 1) + scalar_after = torch.nn.functional.pad(scalar_after, (0, 0, 0, 1)) # (#batch, L, 1) + mask = (weights.lt(scalar_before)) & (weights.lt(scalar_after)) # bool tensor (#batch, L, 1) + mask = mask.reshape(weights.shape[0], -1) # bool tensor (#batch, L) + mask[:, 0] = True + batch_index = mask.nonzero()[:, 0] # (k,1); [0,0,0,...,1,1,...,2,2,...,#batch-1,...] + valley_index_start = mask.nonzero()[:, 1] # (k,1); [0,3,7,...,0,2,...,0,4,...,0,...] + mask[:, 0] = False + mask[:, -1] = True + valley_index_end = mask.nonzero()[:, 1] + 2 + # (k,1); [5,9,...,4,...,6,...] 
+ valley_index_end = torch.where(valley_index_end > (length) * torch.ones_like(valley_index_end), + (length) * torch.ones_like(valley_index_end), valley_index_end) + _, counts = torch.unique(batch_index, + return_counts=True) # (#batch, 1); the number of valleys in each sample + max_counts = (torch.max(counts)).item() + utri_mat1 = torch.tril(torch.ones(max_counts + 1, max_counts), -1).to(x.device) + batch_index_mask = utri_mat1[counts] + batch_index_mask = batch_index_mask.reshape(-1, 1) + batch_index_mask = batch_index_mask.nonzero()[:, 0] + valleys = torch.zeros(batch * max_counts, 2).type_as(valley_index_start) + valleys[batch_index_mask] = torch.cat((valley_index_start.unsqueeze(1), valley_index_end.unsqueeze(1)), + 1) + utri_mat = torch.tril(torch.ones(length + 1, length), -1).to(x.device) + output_mask = (utri_mat[valleys[:, 1]] - utri_mat[valleys[:, 0]]).reshape(batch, max_counts, length) + output_mask = output_mask.detach() + alpha_h = torch.mul(weights, x) + x = torch.bmm(output_mask, alpha_h) / torch.bmm(output_mask, weights).clamp_(1e-6) + new_lengths = (lengths / lengths[0] * x.shape[1]).type_as(lengths) + sequence_mask = mask_tensor(x, new_lengths) + + x = module(x, sequence_mask) # [B, T, F'] + + return x, sequence_mask + + +class Model(torch.nn.Module): + def __init__(self, model_config_dict, **kwargs): + super().__init__() + self.cfg = ModelConfig.from_dict(model_config_dict) + frontend_config = self.cfg.frontend_config + conformer_size = self.cfg.conformer_size + conformer_config = ConformerAggrEncoderV1Config( + num_layers=self.cfg.num_layers, + frontend=ModuleFactoryV1(module_class=VGG4LayerActFrontendV1, cfg=frontend_config), + block_cfg=ConformerBlockV1Config( + ff_cfg=ConformerPositionwiseFeedForwardV1Config( + input_dim=conformer_size, + hidden_dim=self.cfg.ff_dim, + dropout=self.cfg.ff_dropout, + activation=nn.functional.silu, + ), + mhsa_cfg=ConformerMHSAV1Config( + input_dim=conformer_size, + num_att_heads=self.cfg.num_heads, + 
att_weights_dropout=self.cfg.att_weights_dropout, + dropout=self.cfg.mhsa_dropout, + ), + conv_cfg=ConformerConvolutionV1Config( + channels=conformer_size, + kernel_size=self.cfg.conv_kernel_size, + dropout=self.cfg.conv_dropout, + activation=nn.functional.silu, + norm=LayerNormNC(conformer_size), + ), + ), + aggr_layer=self.cfg.aggr_layer, + ) + + self.feature_extraction = LogMelFeatureExtractionV1(cfg=self.cfg.feature_extraction_config) + self.conformer = ConformerAggrEncoderV1(cfg=conformer_config) + self.final_linear = nn.Linear(conformer_size, self.cfg.label_target_size + 1) # + CTC blank + self.final_dropout = nn.Dropout(p=self.cfg.final_dropout) + self.specaug_start_epoch = self.cfg.specauc_start_epoch + + # No particular weight init! + + def forward( + self, + raw_audio: torch.Tensor, + raw_audio_len: torch.Tensor, + ): + """ + :param raw_audio: Audio samples as [B, T, 1] + :param raw_audio_len: length of T as [B] + :return: logprobs [B, T, #labels + blank] + """ + + squeezed_features = torch.squeeze(raw_audio, dim=-1) + with torch.no_grad(): + audio_features, audio_features_len = self.feature_extraction(squeezed_features, raw_audio_len) + + run_ctx = get_run_ctx() + if self.training and run_ctx.epoch >= self.specaug_start_epoch: + audio_features_masked_2 = specaugment_v1_by_length( + audio_features, + time_min_num_masks=2, # TODO: make configurable + time_max_mask_per_n_frames=self.cfg.specaug_config.repeat_per_n_frames, + time_mask_max_size=self.cfg.specaug_config.max_dim_time, + freq_min_num_masks=2, + freq_mask_max_size=self.cfg.specaug_config.max_dim_feat, + freq_max_num_masks=self.cfg.specaug_config.num_repeat_feat, + ) + else: + audio_features_masked_2 = audio_features + + conformer_in = audio_features_masked_2 + # create the mask for the conformer input + mask = mask_tensor(conformer_in, audio_features_len) + + conformer_out, out_mask = self.conformer(conformer_in, mask) + conformer_out = self.final_dropout(conformer_out) + logits = 
self.final_linear(conformer_out) + + log_probs = torch.log_softmax(logits, dim=2) + + return log_probs, torch.sum(out_mask, dim=1) + + +def train_step(*, model: Model, data, run_ctx, **kwargs): + + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"].to("cpu") # [B] + + labels = data["labels"] # [B, N] (sparse) + labels_len = data["labels:size1"] # [B, N] + + logprobs, audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + transposed_logprobs = torch.permute(logprobs, (1, 0, 2)) # CTC needs [T, B, F] + ctc_loss = nn.functional.ctc_loss( + transposed_logprobs, + labels, + input_lengths=audio_features_len, + target_lengths=labels_len, + blank=model.cfg.label_target_size, + reduction="sum", + zero_infinity=True, + ) + num_phonemes = torch.sum(labels_len) + run_ctx.mark_as_loss(name="ctc", loss=ctc_loss, inv_norm_factor=num_phonemes) + + +def prior_init_hook(run_ctx, **kwargs): + # we are storing durations, but call it output.hdf to match + # the default output of the ReturnnForwardJob + run_ctx.sum_probs = None + run_ctx.sum_frames = 0 + + +def prior_finish_hook(run_ctx, **kwargs): + all_frames = run_ctx.sum_frames.detach().cpu().numpy() + all_probs = run_ctx.sum_probs.detach().cpu().numpy() + average_probs = all_probs / all_frames + log_average_probs = np.log(average_probs) + print("Prior sum in std-space (should be close to 1.0):", np.sum(average_probs)) + with open("prior.txt", "w") as f: + np.savetxt(f, log_average_probs, delimiter=" ") + print("Saved prior in prior.txt in +log space.") + + +def prior_step(*, model: Model, data, run_ctx, **kwargs): + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"] # [B] + + logprobs, audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + + probs = torch.exp(logprobs) + run_ctx.sum_frames = run_ctx.sum_frames + torch.sum(audio_features_len) + if run_ctx.sum_probs is None: + run_ctx.sum_probs = 
torch.sum(probs, dim=(0, 1)) + else: + run_ctx.sum_probs += torch.sum(probs, dim=(0, 1)) diff --git a/users/hilmes/experiments/tedlium2/standalone/pytorch_networks/ctc/conformer_1023/quant/baseline_quant_v1.py b/users/hilmes/experiments/tedlium2/standalone/pytorch_networks/ctc/conformer_1023/quant/baseline_quant_v1.py index f5086205f..c723c51e3 100644 --- a/users/hilmes/experiments/tedlium2/standalone/pytorch_networks/ctc/conformer_1023/quant/baseline_quant_v1.py +++ b/users/hilmes/experiments/tedlium2/standalone/pytorch_networks/ctc/conformer_1023/quant/baseline_quant_v1.py @@ -490,17 +490,18 @@ def static_quant_init_hook(run_ctx, **kwargs): run_ctx.apply_quant = False run_ctx.tag_file = open("seq_tags.txt", "wt") - def static_quant_step(*, model: Model, data, run_ctx, **kwargs): raw_audio = data["raw_audio"] # [B, T', F] raw_audio_len = data["raw_audio:size1"] # [B] assert not model.training + assert model.eval() + model.eval() _, audio_features_len = model( raw_audio=raw_audio, raw_audio_len=raw_audio_len, ) - for tag, feat_len, raw_len in zip(data["seq_tag"], audio_features_len, raw_audio_len): + for tag, feat_len, raw_len in zip(data["seq_tag"], audio_features_len, raw_audio_len): run_ctx.tag_file.write(tag + f" len: {feat_len} raw_len: {raw_len}\n") def static_quant_finish_hook(run_ctx, **kwargs): diff --git a/users/hilmes/modules/onnx_precomputed_hybrid_system.py b/users/hilmes/modules/onnx_precomputed_hybrid_system.py index 0ecdfbb95..5edba5167 100644 --- a/users/hilmes/modules/onnx_precomputed_hybrid_system.py +++ b/users/hilmes/modules/onnx_precomputed_hybrid_system.py @@ -42,6 +42,7 @@ def calcluate_nn_prior(self, returnn_config, epoch, epoch_num, name, checkpoint, del prior_config.config["chunking"] if "torch_amp" in prior_config.config.keys(): del prior_config.config['torch_amp'] + prior_config.config["extern_data"]["data_raw"] = {"dim": 1, "shape": (None, 1), "available_for_inference": True} from i6_core.tools.git import CloneGitRepositoryJob # if 
"align" not in prior_config.config["train"]["datasets"]: returnn_root = CloneGitRepositoryJob( @@ -163,9 +164,9 @@ def nn_recognition( onnx_job.add_alias(f"export_onnx/{name}/epoch_{epoch_str}") onnx_job.set_keep_value(5) onnx_model = onnx_job.out_onnx_model - io_map = {"features": "data", "output": "log_probs"} + io_map = {"features": "data_raw", "output": "log_probs"} if needs_features_size: - io_map["features-size"] = "data:size1" + io_map["features-size"] = "data_raw:size1" if nn_prior or acoustic_mixture_path is None: prior_file, prior_config = self.calcluate_nn_prior( @@ -178,9 +179,13 @@ def nn_recognition( ) # This can't be acoustic_mixture_path because python hands in the object itself, not a copy thus # one would override the old mixture_path (if there is any) for all other exps + if "data_raw" in returnn_config.config['extern_data']: + features = returnn_config.config['extern_data']['data_raw']['dim'] + else: + features = returnn_config.config['extern_data']['data']['dim'] tmp_acoustic_mixture_path = CreateDummyMixturesJob( num_mixtures=returnn_config.config['extern_data']['classes']['dim'], - num_features=returnn_config.config['extern_data']['data']['dim']).out_mixtures + num_features=features).out_mixtures lmgc_scorer = rasr.GMMFeatureScorer(acoustic_mixture_path) scorer = rasr.PrecomputedHybridFeatureScorer( prior_mixtures=tmp_acoustic_mixture_path, diff --git a/users/hilmes/modules/pytorch_onnx_hybrid_system.py b/users/hilmes/modules/pytorch_onnx_hybrid_system.py index 20e211255..2bfee277e 100644 --- a/users/hilmes/modules/pytorch_onnx_hybrid_system.py +++ b/users/hilmes/modules/pytorch_onnx_hybrid_system.py @@ -29,7 +29,7 @@ from i6_experiments.users.hilmes.experiments.tedlium2.asr_2023.hybrid.torch_baselines.pytorch_networks.prior.forward import \ ReturnnForwardComputePriorJob from i6_experiments.users.hilmes.tools.onnx import ExportPyTorchModelToOnnxJob - +from i6_core.tools.git import CloneGitRepositoryJob def get_quant_str( quant_mode: 
CalibrationMethod, @@ -88,6 +88,8 @@ def get_quant_str( mode_str_tmp += f"_single_tag" if "unique_tags" in filter_opts: mode_str_tmp += f"_unique_tags" + if "range_len" in filter_opts: + mode_str_tmp += f"_range_len_{filter_opts['range_len'][0]}_{filter_opts['range_len'][1]}" return mode_str_tmp @@ -144,7 +146,6 @@ def calcluate_nn_prior(self, returnn_config, epoch, epoch_num, name, checkpoint, prior_config.config["load_epoch"] = epoch_num if "chunking" in prior_config.config.keys(): del prior_config.config["chunking"] - from i6_core.tools.git import CloneGitRepositoryJob if any(x in name for x in ["distill_whisper"]): returnn_root = CloneGitRepositoryJob( "https://github.com/rwth-i6/returnn", @@ -227,7 +228,6 @@ def nn_recognition( epoch_num = best_checkpoint_job.out_epoch elif epoch == "avrg": assert train_job is not None, "train_job needed to average checkpoints" - from i6_core.tools.git import CloneGitRepositoryJob chkpts = [] for x in [0, 1, 2, 3]: best_job = GetBestPtCheckpointJob(train_job.out_model_dir, train_job.out_learning_rates, @@ -269,7 +269,6 @@ def nn_recognition( num_mixtures=returnn_config.config['extern_data']['classes']['dim'], num_features=returnn_config.config['extern_data']['data']['dim']).out_mixtures lmgc_scorer = rasr.GMMFeatureScorer(acoustic_mixture_path) - onnx_job = ExportPyTorchModelToOnnxJob( pytorch_checkpoint=checkpoint, returnn_config=returnn_config, @@ -299,6 +298,7 @@ def nn_recognition( final_skip_ls: Optional[Tuple[List[int], List[int]]] = tmp_kwargs.pop("final_skip_ls", None) smooth_ls = tmp_kwargs.pop("smooth_ls", []) quant_filter_opts = tmp_kwargs.pop("quant_filter_opts", [None]) + loss_table_args = tmp_kwargs.pop("loss_table_args", None) for data_num in data_num_ls: for quant_mode, average, sym, activation_type, weight_type, quant_format, quant_ops, percentile, num_bins, filter_opts in itertools.product(quant_modes, avg_modes, sym_modes, activation_type_ls, weight_type_ls, quant_format_ls, quant_ops_ls, percentile_ls, 
num_bin_ls, quant_filter_opts): if not quant_mode == CalibrationMethod.MinMax and "speed" in name: @@ -351,7 +351,7 @@ def nn_recognition( quant_job.add_alias("quantize_static/" + name + "/" + mode_str + "/epoch" + epoch_str + data_str) quant_job.set_keep_value(5) quant_model = quant_job.out_model - if data_num is None: + if data_num is None or "range_len_" in mode_str: self.jobs[recognition_corpus_key][f"quantize_static/" + name + "/" + mode_str + "/epoch" + epoch_str] = quant_job scorer = OnnxFeatureScorer( mixtures=acoustic_mixture_path, @@ -484,9 +484,138 @@ def nn_recognition( lmgc_scorer=lmgc_scorer, **tmp_kwargs, ) + # calculate based on loss + if epoch_str == "250": + from i6_core.returnn.forward import ReturnnForwardJobV2 + from i6_core.returnn.config import ReturnnConfig + loss_config: ReturnnConfig = copy.deepcopy(prior_config) + serializer_objects = copy.deepcopy(loss_config.python_epilog[0].serializer_objects)[:-3] + from i6_experiments.common.setups.serialization import Import, ExplicitHash, ExternalImport + package = "i6_experiments.users.hilmes.experiments.tedlium2.asr_2023.hybrid.torch_baselines" + prior_computation = Import(package + ".pytorch_networks.prior.basic.loss_forward_step", + import_as="forward_step") + serializer_objects.append(prior_computation) + prior_computation = Import( + package + ".pytorch_networks.prior.prior_callback.PrintLossCallback", + import_as="forward_callback" + ) + serializer_objects.append(prior_computation) + models_commit = "75e03f37ac74d3d0c7358d29bb9b71dcec1bf120" + i6_models_repo = CloneGitRepositoryJob( + url="https://github.com/rwth-i6/i6_models", + commit=models_commit, + checkout_folder_name="i6_models", + branch="bene_conf_enc" if models_commit == "75e03f37ac74d3d0c7358d29bb9b71dcec1bf120" else None, + ).out_repository + if models_commit == "75e03f37ac74d3d0c7358d29bb9b71dcec1bf120": + i6_models_repo.hash_overwrite = "TEDLIUM2_DEFAULT_I6_MODELS" + i6_models = ExternalImport(import_path=i6_models_repo) + 
serializer_objects.insert(0, i6_models) + loss_config.python_epilog[0].serializer_objects = serializer_objects + loss_config.config["max_seqs"] = 1 + loss_config.config["forward_data"] = "train" + loss_config.config["model_outputs"] = {"ce_score": {"dim": 1, "shape": (1,)}} + loss_config.config["train"]["datasets"]["align"]["partition_epoch"] = 1 + #if "min_seq_length" in loss_config.config: + # del loss_config.config["min_seq_length"] + calculate_loss_job = ReturnnForwardJobV2( + model_checkpoint=checkpoint, + returnn_config=loss_config, + log_verbosity=5, + mem_rqmt=16, + time_rqmt=2 if not "whisper" in name else 4, + device="gpu", + cpu_rqmt=2, + returnn_python_exe=self.returnn_python_exe, + returnn_root=self.returnn_root, + output_files=["loss_table"] + ) + calculate_loss_job.add_alias("calulate_loss/" + name + "/epoch_" + epoch_str) + tk.register_output("calculate_loss/" + name + "/epoch_" + epoch_str, + calculate_loss_job.out_files["loss_table"]) + self.jobs[recognition_corpus_key][ + f"calculate_loss/" + name + "/epoch_" + epoch_str] = calculate_loss_job + if loss_table_args is not None: + for args in loss_table_args: + mode_str = get_quant_str( + quant_mode=quant_mode, + average=average, + sym=sym, + activation_type=activation_type, + weight_type=weight_type, + quant_format=quant_format, + quant_ops=quant_ops, + percentile=percentile, + num_bins=num_bins, + random_seed=0, + filter_opts=filter_opts, + ) + string = "" + for arg in args: + string += f"_{arg}" + mode_str += f"_loss_table{string}" + quant_job = ModelQuantizeStaticJob( + model=onnx_model, + dataset=prior_config.config["train"]["datasets"]["feat"], + num_seqs=data_num, + num_parallel_seqs=num_parallel_seqs, + calibrate_method=quant_mode, + moving_average=average, + symmetric=sym, + weight_type=weight_type, + activation_type=activation_type, + quant_format=quant_format, + ops_to_quant=quant_ops, + num_bins=num_bins, + percentile=percentile, + random_seed=random_seed, + filter_opts=filter_opts, + 
loss_table=(calculate_loss_job.out_files["loss_table"], args) + ) + if data_num is None: + data_str = "" + else: + data_str = "-" + str(data_num) + quant_job.add_alias("quantize_static/" + name + "/" + mode_str + "/epoch" + epoch_str + data_str) + quant_job.set_keep_value(5) + quant_model = quant_job.out_model + self.jobs[recognition_corpus_key][f"quantize_static/" + name + "/" + mode_str + "/epoch" + epoch_str + data_str] = quant_job + scorer = OnnxFeatureScorer( + mixtures=acoustic_mixture_path, + model=quant_model, + priori_scale=prior, + io_map=io_map, + inter_op_threads=tmp_kwargs.get("cpu", 1), + intra_op_threads=tmp_kwargs.get("cpu", 1), + prior_file=prior_file + ) + + self.feature_scorers[recognition_corpus_key][f"pre-nn-{name}-{prior:02.2f}-{mode_str}{data_str}"] = scorer + self.feature_flows[recognition_corpus_key][f"{feature_flow_key}-onnx-{epoch_str}-{mode_str}{data_str}"] = feature_flow + + recog_name = f"e{epoch_str}-prior{prior:02.2f}-ps{pron:02.2f}-lm{lm:02.2f}-{mode_str}{data_str}" + recog_func( + name=f"{name}-{recognition_corpus_key}-{recog_name}", + prefix=f"nn_recog/{name}/", + corpus=recognition_corpus_key, + flow=feature_flow, + feature_scorer=scorer, + pronunciation_scale=pron, + lm_scale=lm, + search_parameters=search_parameters, + lattice_to_ctm_kwargs=lattice_to_ctm_kwargs, + parallelize_conversion=parallelize_conversion, + rtf=rtf, + mem=mem, + lmgc_alias=f"lmgc/{name}/{recognition_corpus_key}-{recog_name}", + lmgc_scorer=lmgc_scorer, + **tmp_kwargs, + ) + if "quant" in name and not "rtf" in name: continue + scorer = OnnxFeatureScorer( mixtures=acoustic_mixture_path, model=onnx_model, diff --git a/users/hilmes/tools/onnx.py b/users/hilmes/tools/onnx.py index acabfcfc0..2916cba00 100644 --- a/users/hilmes/tools/onnx.py +++ b/users/hilmes/tools/onnx.py @@ -90,6 +90,7 @@ class ModelQuantizeStaticJob(Job): "num_bins": None, "random_seed": 0, "filter_opts": None, + "loss_table": None } def __init__(self, @@ -111,6 +112,7 @@ def __init__(self, 
num_bins: Optional[int] = None, random_seed: int = 0, filter_opts: Optional[Dict[str, Any]] = None, + loss_table: Optional[Tuple[tk.Path, Any]] = None, # Path to loss table + args ): """ :param model: @@ -128,6 +130,7 @@ def __init__(self, self.quant_format = quant_format self.weight_type = weight_type self.filter_opts = filter_opts + self.loss_table = loss_table self.out_model = self.output_path("model.onnx") if num_seqs is None: @@ -150,12 +153,16 @@ def __init__(self, time = 4 else: time = 1 + # cpu slow bandaid + if time >= 1: + time += 5 if not calibrate_method == CalibrationMethod.MinMax: time *= 2 if self.filter_opts is not None and "single_tag" in self.filter_opts: time += 1 - self.rqmt = {"cpu": 8 if num_seqs is not None and num_seqs > 100 else 4, "mem": 16.0 if calibrate_method == CalibrationMethod.MinMax else 64, "time": time} + + self.rqmt = {"cpu": 8 if num_seqs is not None and num_seqs > 100 else 4, "mem": 16.0 if calibrate_method == CalibrationMethod.MinMax else 100, "time": time} self.calibration_method = calibrate_method self.percentile = percentile self.num_bins = num_bins @@ -197,6 +204,15 @@ def run(self): seed = self.random_seed import random random.seed(seed) + loss_table = None + if self.loss_table is not None: + loss_table = [] + with open(self.loss_table[0], "rt") as f: + for line in f: + loss_table.append(line.split(" ")) # (Tag, loss) + if "reverse" in self.loss_table[1]: + loss_table.reverse() + class DummyDataReader(CalibrationDataReader): def __init__(self, @@ -204,15 +220,17 @@ def __init__(self, data: Union[Dataset, MetaDataset], max_seqs: int, final_skip: Optional[Tuple[int, int]] = (None, None), filter_opts: Optional[Dict[str, Any]] = None, - open_budget: Optional[Tuple[int, float]] = (None, None), + open_budget: Tuple[Optional[int], Optional[float]] = (None, None), + loss_table: Optional[List[Tuple[str, str]]] = None + ): self.max_seqs = max_seqs self.data = data self.counter: int = 0 sess_option = SessionOptions() - 
logging.info(f"Data Loading {os.getenv('SLURM_CPUS_PER_TASK')}") - sess_option.intra_op_num_threads = int(os.getenv('SLURM_CPUS_PER_TASK')) + logging.info(f"Data Loading {os.getenv('SLURM_CPUS_PER_TASK', 4)}") + sess_option.intra_op_num_threads = int(os.getenv('SLURM_CPUS_PER_TASK', 4)) session = InferenceSession(model_str, sess_option) self.input_name_1 = session.get_inputs()[0].name inputs = [] @@ -228,6 +246,7 @@ def __init__(self, self.visited_seqs = set() self.open_budget = open_budget[0] self.budget_thresh = None if open_budget[0] is None else open_budget[0] * open_budget[1] + self.loss_table = loss_table def compare_budget(self): if self.budget_thresh is None or self.open_budget is None: @@ -239,7 +258,7 @@ def get_next(self): key = "data" if "data" in self.data.get_data_keys() else "raw_audio" # hack to make it compatible with both setups for now seq_number = None if not self.data.is_less_than_num_seqs(self.counter) or self.counter >= self.max_seqs or self.compare_budget(): - if not self.data.is_less_than_num_seqs(self.counter): + if self.data.is_less_than_num_seqs(self.counter) and self.final_skip_step is None: logging.info(f"Finished after {self.counter} sequences") return None elif self.final_skip_step is not None and self.counter < self.max_seqs + self.final_skip_step * self.final_skip_count: @@ -254,15 +273,46 @@ def get_next(self): else: logging.info("Seen all sequences in dataset") return None + if self.loss_table is not None: + seq_number = self.counter + name = self.loss_table[seq_number][0] + real_seq_number = self.data.get_all_tags().index(name) + while real_seq_number in self.seen_seqs or not self.check_filter(real_seq_number): + seq_number += 1 + name = self.loss_table[seq_number][0] + real_seq_number = self.data.get_all_tags().index(name) + assert self.loss_table[seq_number][0] == self.data.get_tag(real_seq_number), (self.loss_table[seq_number][0], self.data.get_tag(real_seq_number)) + logging.info(f"Position {seq_number} is real 
{real_seq_number} with Tag {self.data.get_tag(real_seq_number)} matching {self.loss_table[seq_number]}") + self.visited_seqs.add(real_seq_number) + if self.open_budget is not None and self.open_budget == 0: + assert False, "This path should not be reached" + logging.info("Budget Full") + return None + if len(self.visited_seqs) == self.data.num_seqs: + self.visited_seqs = set() + self.open_budget += 1 + seq_number = seq_number + seq_number = real_seq_number if seq_number is None: while not seq_number or seq_number in self.seen_seqs or not self.check_filter(seq_number): seq_number = random.randint(0, self.data.num_seqs - 1) - self.visited_seqs.add(seq_number) - assert len(self.visited_seqs) <= self.data.num_seqs, "Visited all sequences" - if len(self.visited_seqs) == self.data.num_seqs and any(x in self.filter_opts for x in ["single_tag", "unique_tags"]): - logging.warning("All seqs seen, still not all num seqs reached") + self.visited_seqs.add(seq_number) # +2 because seen seqs has not been updated + logging.info(f"{len(self.visited_seqs)} {self.data.num_seqs} {len(self.seen_seqs)}") + logging.info(f"{seq_number}, {seq_number in self.seen_seqs} {not self.check_filter(seq_number)}") + logging.info(f"{len(self.visited_seqs) == self.data.num_seqs} {len(self.seen_seqs)+2 == self.data.num_seqs} {len(self.seen_seqs)+1 == self.data.num_seqs}") + if len(self.visited_seqs) == self.data.num_seqs and (len(self.seen_seqs)+2 == self.data.num_seqs or len(self.seen_seqs)+1 == self.data.num_seqs): return None + if len(self.visited_seqs) == self.data.num_seqs and not (len(self.seen_seqs)+2 == self.data.num_seqs or len(self.seen_seqs)+1 == self.data.num_seqs) and any(x in self.filter_opts for x in ["single_tag", "unique_tags"]): + return None + if self.open_budget is not None and self.open_budget == 0: + logging.info("Budget Full") + return None + if len(self.visited_seqs) == self.data.num_seqs and not (len(self.seen_seqs)+2 == self.data.num_seqs or len(self.seen_seqs)+1 == 
self.data.num_seqs): + self.visited_seqs = set() + self.open_budget += 1 + #assert len(self.visited_seqs) < self.data.num_seqs, "Visited all sequences" self.seen_seqs.append(seq_number) + logging.info(len(self.seen_seqs)) self.data.load_seqs(seq_number, seq_number+1) data: np.ndarray = self.data.get_data(seq_number, key) seq_len: np.ndarray = self.data.get_seq_length(seq_number)[key] @@ -286,15 +336,26 @@ def check_filter(self, seq_number) -> bool: if name == "max_seq_len": seq_len = self.data.get_seq_length(seq_number)["data" if "data" in self.data.get_data_keys() else "raw_audio"] if seq_len > value: - logging.info( + logging.warning( f"FILTER: Seq {self.data.get_tag(seq_number)} has length {seq_len} longer than {value}") return False elif name == "min_seq_len": seq_len = self.data.get_seq_length(seq_number)[ "data" if "data" in self.data.get_data_keys() else "raw_audio"] if seq_len < value: + logging.warning( + f"FILTER: Seq {self.data.get_tag(seq_number)}has length {seq_len} shorter than {value}") + return False + elif name == "range_len": + seq_len = self.data.get_seq_length(seq_number)[ + "data" if "data" in self.data.get_data_keys() else "raw_audio"] + if seq_len > value[0] and seq_len < value[1]: logging.info( f"FILTER: Seq {self.data.get_tag(seq_number)}has length {seq_len} shorter than {value}") + return True + else: + logging.warning( + f"FILTER: Seq {self.data.get_tag(seq_number)}has length {seq_len} not in range {value}") return False elif name == "partition": seq_len = self.data.get_seq_length(seq_number)[ @@ -306,16 +367,20 @@ def check_filter(self, seq_number) -> bool: value.remove((lower, upper)) logging.info(value) return True - logging.info( + logging.warning( f"FILTER: {self.data.get_tag(seq_number)} of length {seq_len} not matching {value}") return False elif name == "budget": seq_len = self.data.get_seq_length(seq_number)[ "data" if "data" in self.data.get_data_keys() else "raw_audio"] - if seq_len < self.open_budget: + if seq_len <= 
self.open_budget: logging.info(f"FILTER: Seq with len {seq_len} within budget {self.open_budget}") self.open_budget -= seq_len + self.visited_seqs = set() return True + else: + pass + #logging.warning(f"FILTER: Seq with len {seq_len} NOT in budget {self.open_budget}") return False elif name == "unique_tags": seq_tag = self.data.get_tag(seq_number) @@ -334,16 +399,24 @@ def check_filter(self, seq_number) -> bool: else: raise NotImplementedError if self.open_budget is not None: + assert False seq_len = self.data.get_seq_length(seq_number)[ "data" if "data" in self.data.get_data_keys() else "raw_audio"] if seq_len < self.open_budget: logging.info(f"FILTER: Seq with len {seq_len} within budget {self.open_budget}") self.open_budget -= seq_len return True + else: + logging.warning(f"FILTER: Seq with len {seq_len} NOT in budget {self.open_budget}") return False return True self.dataset = self.convert_to_str(self.dataset) + if loss_table is not None: + with open("segments", "wt") as f: + for tag, loss in loss_table[:self.num_seqs]: + f.write(f"{tag}\n") + #self.dataset["seq_list_filter_file"] = "segments" dataset: Dataset = init_dataset(self.dataset) dataset.init_seq_order(1) @@ -353,7 +426,8 @@ def check_filter(self, seq_number) -> bool: max_seqs=self.num_seqs, final_skip=self.final_skip, filter_opts=self.filter_opts, - open_budget=self.budget + open_budget=self.budget, + loss_table=loss_table, ) quant_options = { "CalibMaxIntermediateOutputs": self.num_parallel_seqs, From 739d46ddd464bb032164c7b3522d7e44f3670424 Mon Sep 17 00:00:00 2001 From: Albert Zeyer Date: Tue, 28 May 2024 09:11:29 +0200 Subject: [PATCH 068/227] cleanup --- .../exp2024_04_23_baselines/ctc.py | 19 ++----------------- 1 file changed, 2 insertions(+), 17 deletions(-) diff --git a/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py b/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py index 4403ca6a0..d295ba6e4 100644 --- a/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py +++ 
b/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py @@ -97,23 +97,8 @@ def py(): train_vocab_opts={"other_opts": {"enable_sampling": True, "alpha": alpha}}, ) - for alpha in [ - # 0.3, # very bad? - 0.7, - ]: - train_exp( - "v6-bhv20-11gb-f32-bs15k-accgrad1-mgpu4-pavg100-wd1e_2-lrlin1e_5_295k-speedpertV2-spm_bpe10k" - f"-spmSample{str(alpha).replace('.', '')}", - config_11gb_v6_f32_accgrad1_mgpu4_pavg100_wd1e_4, - config_updates={ - **_get_cfg_lrlin_oclr_by_bs_nep(15_000, 500), - "optimizer.weight_decay": 1e-2, - "__train_audio_preprocess": speed_pert_librosa_config, - "speed_pert_discrete_values": [0.7, 0.8, 0.9, 1.0, 1.1], - }, - vocab="spm_bpe10k", - train_vocab_opts={"other_opts": {"enable_sampling": True, "alpha": alpha}}, - ) + # v6-bhv20-11gb-f32-bs15k-accgrad1-mgpu4-pavg100-wd1e_2-lrlin1e_5_295k-speedpertV2 + # with spm_bpe10k and enable_sampling, alpha in {0.3, 0.7} was both very bad (90% WER). # noinspection PyShadowingNames From 0f6e5948a894ac1dd89f57057b48964cfe8cbcf7 Mon Sep 17 00:00:00 2001 From: "luca.gaudino" Date: Tue, 28 May 2024 16:51:00 +0200 Subject: [PATCH 069/227] updates and fix mel norm + zoneout --- .../conformer_import_moh_att_2023_06_30.py | 178 +++++++++-------- .../conformer_import_moh_att_train.py | 4 +- .../model_recogs/model_recog.py | 1 + .../tedlium2/_import_model.py | 24 ++- .../conformer_import_moh_att_2023_10_19.py | 58 +++--- .../conformer_import_moh_att_train.py | 80 +++++--- .../librispeech_960/conformer_ctc_train.py | 33 ++- .../tedlium2/conformer_ctc_train.py | 188 ++++++++++++++---- .../rf/conformer_ctc/model_conformer_ctc.py | 3 +- users/gaudino/recog.py | 2 + 10 files changed, 384 insertions(+), 187 deletions(-) diff --git a/users/gaudino/experiments/rf_conformer_att_2023/librispeech_960/conformer_import_moh_att_2023_06_30.py b/users/gaudino/experiments/rf_conformer_att_2023/librispeech_960/conformer_import_moh_att_2023_06_30.py index 8aea3533c..611d6c876 100644 --- 
a/users/gaudino/experiments/rf_conformer_att_2023/librispeech_960/conformer_import_moh_att_2023_06_30.py +++ b/users/gaudino/experiments/rf_conformer_att_2023/librispeech_960/conformer_import_moh_att_2023_06_30.py @@ -84,6 +84,9 @@ def sis_run_with_prefix(prefix_name: str = None): task = get_librispeech_task_bpe10k_raw(with_eos_postfix=True) + bsf = 10 + prefix_name = prefix_name + f"/bsf{bsf}" + ### Experiments without LM and with LSTM LM new_chkpt_path = tk.Path( @@ -100,11 +103,11 @@ def sis_run_with_prefix(prefix_name: str = None): # att only for beam_size in [12, 18]: - recog_name = f"/bsf10/att_beam{beam_size}" + recog_name = f"/att_beam{beam_size}" name = prefix_name + recog_name search_args = { "beam_size": beam_size, - "bsf": 10, + "bsf": bsf, } res, _ = recog_model( @@ -122,14 +125,15 @@ def sis_run_with_prefix(prefix_name: str = None): ) # att + lstm lm TODO: debug difference - for scales, beam_size in product([(1.0, 0.3), (1.0, 0.33), (1.0, 0.27)], [12, 32]): + for scales, beam_size in product([(1.0, 0.3), (1.0, 0.33), (1.0, 0.27)], []): att_scale, lm_scale = scales - recog_name = f"/bsf10/opls_att{att_scale}_lstm_lm{lm_scale}_beam{beam_size}" + recog_name = f"/opls_att{att_scale}_lstm_lm{lm_scale}_beam{beam_size}" name = prefix_name + recog_name search_args = { "beam_size": beam_size, "add_lstm_lm": True, "lm_scale": lm_scale, + "bsf": bsf, } res, _ = recog_model( task, @@ -150,7 +154,7 @@ def sis_run_with_prefix(prefix_name: str = None): for prior_scale, beam_size in product([0.0], []): name = ( prefix_name - + f"/bsf10/ctc_prefix_fix" + + f"/ctc_prefix_fix" + (f"_prior{prior_scale}" if prior_scale != 0.0 else "") + f"_beam{beam_size}" ) @@ -160,7 +164,7 @@ def sis_run_with_prefix(prefix_name: str = None): "use_ctc": True, "ctc_scale": 1.0, "ctc_state_fix": True, - "bsf": 10, + "bsf": bsf, "prior_corr": prior_scale != 0.0, "ctc_prior_file": 
"/work/asr3/zeineldeen/hiwis/luca.gaudino/setups-data/2023-02-22--conformer-swb/work/i6_core/returnn/extract_prior/ReturnnComputePriorJobV2.ZeflcEHlQTjn/output/prior.txt", "prior_scale": prior_scale, @@ -179,38 +183,33 @@ def sis_run_with_prefix(prefix_name: str = None): res.output, ) - # att + espnet ctc prefix scorer + lstm lm - for scales, prior_scale, lm_scale, beam_size in product( - [(0.8, 0.2), (0.85, 0.15)], - [0.0], - [0.4, 0.45, 0.5, 0.55, 0.6, 0.65], - [12, 32], - ): + # att + espnet ctc prefix + # beam 32: {"dev-clean": 2.14, "dev-other": 5.21, "test-clean": 2.43, "test-other": 5.57} + for scales, prior_scale, beam_size in product([(0.7, 0.3)], [0.1], []): att_scale, ctc_scale = scales - recog_name = ( - f"/bsf10/opls_att{att_scale}_ctc{ctc_scale}_fix" - + (f"_prior{prior_scale}" if prior_scale > 0.0 else "") - + f"_lstm_lm{lm_scale}_beam{beam_size}" + + name = ( + prefix_name + + f"/opls_att{att_scale}_ctc{ctc_scale}_fix" + + (f"_prior{prior_scale}" if prior_scale != 0.0 else "") + + f"_beam{beam_size}" ) - name = prefix_name + recog_name search_args = { "beam_size": beam_size, - "add_lstm_lm": True, - "lm_scale": lm_scale, "att_scale": att_scale, - "ctc_scale": ctc_scale, "use_ctc": True, - "bsf": 10, - "prior_corr": prior_scale > 0.0, + "ctc_scale": ctc_scale, + "ctc_state_fix": True, + "bsf": bsf, + "prior_corr": prior_scale != 0.0, "ctc_prior_file": "/work/asr3/zeineldeen/hiwis/luca.gaudino/setups-data/2023-02-22--conformer-swb/work/i6_core/returnn/extract_prior/ReturnnComputePriorJobV2.ZeflcEHlQTjn/output/prior.txt", "prior_scale": prior_scale, - "ctc_state_fix": True, } res, _ = recog_model( task, model_with_checkpoint, model_recog, - dev_sets=["dev-other"], + dev_sets=None, model_args=model_args, search_args=search_args, prefix_name=name, @@ -220,33 +219,68 @@ def sis_run_with_prefix(prefix_name: str = None): res.output, ) - # att + espnet ctc prefix - # beam 32: {"dev-clean": 2.14, "dev-other": 5.21, "test-clean": 2.43, "test-other": 5.57} - 
for scales, prior_scale, beam_size in product([(0.7, 0.3)], [0.1], []): - att_scale, ctc_scale = scales - + # ctc only decoding + # prior 0.0: {"dev-clean": 2.85, "dev-other": 6.68, "test-clean": 3.09, "test-other": 7.0} + for prior_scale in []: + search_args = { + "bsf": bsf, + "prior_corr": prior_scale > 0.0, + "ctc_prior_file": "/work/asr3/zeineldeen/hiwis/luca.gaudino/setups-data/2023-02-22--conformer-swb/work/i6_core/returnn/extract_prior/ReturnnComputePriorJobV2.ZeflcEHlQTjn/output/prior.txt", + "prior_scale": prior_scale, + } name = ( prefix_name - + f"/bsf10/opls_att{att_scale}_ctc{ctc_scale}_fix" - + (f"_prior{prior_scale}" if prior_scale != 0.0 else "") - + f"_beam{beam_size}" + + f"/ctc_greedy" + + (f"_prior{prior_scale}" if prior_scale > 0.0 else "") + ) + res, _ = recog_model( + task, + model_with_checkpoint, + model_recog_ctc, + dev_sets=None, + model_args=model_args, + search_args=search_args, + prefix_name=name, ) + tk.register_output( + name + f"/recog_results", + res.output, + ) + + # ------------------ with LSTM LM ------------------------ + + # att + espnet ctc prefix scorer + lstm lm + for scales, prior_scale, lm_scale, beam_size in product( + [(0.8, 0.2), (0.85, 0.15)], + [0.0], + [0.4, 0.45, 0.5, 0.55, 0.6, 0.65], + [], + ): + att_scale, ctc_scale = scales + recog_name = ( + f"/opls_att{att_scale}_ctc{ctc_scale}_fix" + + (f"_prior{prior_scale}" if prior_scale > 0.0 else "") + + f"_lstm_lm{lm_scale}_beam{beam_size}" + ) + name = prefix_name + recog_name search_args = { "beam_size": beam_size, + "add_lstm_lm": True, + "lm_scale": lm_scale, "att_scale": att_scale, - "use_ctc": True, "ctc_scale": ctc_scale, - "ctc_state_fix": True, - "bsf": 10, - "prior_corr": prior_scale != 0.0, + "use_ctc": True, + "bsf": bsf, + "prior_corr": prior_scale > 0.0, "ctc_prior_file": "/work/asr3/zeineldeen/hiwis/luca.gaudino/setups-data/2023-02-22--conformer-swb/work/i6_core/returnn/extract_prior/ReturnnComputePriorJobV2.ZeflcEHlQTjn/output/prior.txt", 
"prior_scale": prior_scale, + "ctc_state_fix": True, } res, _ = recog_model( task, model_with_checkpoint, model_recog, - dev_sets=None, + dev_sets=["dev-other"], model_args=model_args, search_args=search_args, prefix_name=name, @@ -256,6 +290,8 @@ def sis_run_with_prefix(prefix_name: str = None): res.output, ) + # ------------ Search Errors ------------ + # check for search errors for scales in [(0.7, 0.3)]: for beam_size in []: @@ -309,40 +345,12 @@ def sis_run_with_prefix(prefix_name: str = None): res, ) - # ctc only decoding - # prior 0.0: {"dev-clean": 2.85, "dev-other": 6.68, "test-clean": 3.09, "test-other": 7.0} - for prior_scale in []: - search_args = { - "bsf": 10, - "prior_corr": prior_scale > 0.0, - "ctc_prior_file": "/work/asr3/zeineldeen/hiwis/luca.gaudino/setups-data/2023-02-22--conformer-swb/work/i6_core/returnn/extract_prior/ReturnnComputePriorJobV2.ZeflcEHlQTjn/output/prior.txt", - "prior_scale": prior_scale, - } - name = ( - prefix_name - + f"/bsf10/ctc_greedy" - + (f"_prior{prior_scale}" if prior_scale > 0.0 else "") - ) - res, _ = recog_model( - task, - model_with_checkpoint, - model_recog_ctc, - dev_sets=None, - model_args=model_args, - search_args=search_args, - prefix_name=name, - ) - tk.register_output( - name + f"/recog_results", - res.output, - ) - # opts att + ctc TODO: fix bugs for scales, blank_scale, beam_size in product([(0.65, 0.35)], [2.0], []): att_scale, ctc_scale = scales name = ( prefix_name - + f"/bsf20/opts_att{att_scale}_ctc{ctc_scale}" + + f"/opts_att{att_scale}_ctc{ctc_scale}" + (f"_blank{blank_scale}" if blank_scale != 0.0 else "") + f"_beam{beam_size}" ) @@ -351,7 +359,7 @@ def sis_run_with_prefix(prefix_name: str = None): "att_scale": att_scale, "ctc_scale": ctc_scale, "blank_scale": blank_scale, - "bsf": 20, + "bsf": bsf, } # @@ -400,7 +408,7 @@ def sis_run_with_prefix(prefix_name: str = None): res, ) - ### Experiments with transformer LM + # ------------------ with Trafo LM ------------------------ 
model_w_trafo_lm_ckpt_path = tk.Path( _torch_ckpt_filename_w_trafo_lm, hash_overwrite="torch_ckpt_w_trafo_lm" @@ -411,14 +419,20 @@ def sis_run_with_prefix(prefix_name: str = None): ) model_args = { - "add_trafo_lm": True, - "trafo_lm_args": { + "external_language_model": { + "class": "Trafo_LM_Model", "num_layers": 24, "layer_out_dim": 1024, "att_num_heads": 8, "use_pos_enc": True, "ff_activation": "relu", }, + "preload_from_files": { + "01_trafo_lm": { + "prefix": "language_model.", + "filename": "/work/asr3/zeineldeen/hiwis/luca.gaudino/setups-data/2023-08-10--rf-librispeech/work/i6_experiments/users/gaudino/returnn/convert_ckpt_rf/librispeech/trafo_lm_only_24_02_06/network.023.pt", + } + }, } # opts ctc + trafo lm TODO: fix bugs @@ -426,7 +440,7 @@ def sis_run_with_prefix(prefix_name: str = None): ctc_scale, lm_scale = scales name = ( prefix_name - + f"/bsf20/opts_ctc{ctc_scale}_trafo_lm{lm_scale}" + + f"/opts_ctc{ctc_scale}_trafo_lm{lm_scale}" + f"_beam{beam_size}" ) search_args = { @@ -437,7 +451,7 @@ def sis_run_with_prefix(prefix_name: str = None): # "remove_trafo_lm_eos": True, # "add_eos_to_end": True, "lm_scale": lm_scale, - "bsf": 20, + "bsf": bsf, } recog_res, recog_out = recog_model( @@ -456,14 +470,14 @@ def sis_run_with_prefix(prefix_name: str = None): # att + trafo lm # beam 32: {"dev-clean": 1.91, "dev-other": 4.14, "test-clean": 2.2, "test-other": 4.6} - for lm_scale, beam_size in product([0.42], [40, 48 ,60 ,64, 70]): - recog_name = f"/bsf10/att_trafo_lm{lm_scale}_beam{beam_size}" + for lm_scale, beam_size in product([0.42], [32, 40]): + recog_name = f"/att_trafo_lm{lm_scale}_beam{beam_size}" name = prefix_name + recog_name search_args = { "beam_size": beam_size, "add_trafo_lm": True, "lm_scale": lm_scale, - "bsf": 10, + "bsf": bsf, } res, _ = recog_model( task, @@ -485,7 +499,7 @@ def sis_run_with_prefix(prefix_name: str = None): [1.0], [0.0], [0.65], [] ): recog_name = ( - f"/bsf10/opls_ctc{ctc_scale}_fix" + f"/opls_ctc{ctc_scale}_fix" + 
(f"_prior{prior_scale}" if prior_scale > 0.0 else "") + f"_trafo_lm{lm_scale}_beam{beam_size}" ) @@ -497,11 +511,10 @@ def sis_run_with_prefix(prefix_name: str = None): "att_scale": 0.0, "ctc_scale": ctc_scale, "use_ctc": True, - "bsf": 10, + "bsf": bsf, "prior_corr": prior_scale > 0.0, "ctc_prior_file": "/work/asr3/zeineldeen/hiwis/luca.gaudino/setups-data/2023-02-22--conformer-swb/work/i6_core/returnn/extract_prior/ReturnnComputePriorJobV2.ZeflcEHlQTjn/output/prior.txt", "prior_scale": prior_scale, - "ctc_state_fix": True, } res, _ = recog_model( task, @@ -524,7 +537,7 @@ def sis_run_with_prefix(prefix_name: str = None): ): att_scale, ctc_scale = scales recog_name = ( - f"/bsf10/opls_att{att_scale}_ctc{ctc_scale}_fix" + f"/opls_att{att_scale}_ctc{ctc_scale}_fix" + (f"_prior{prior_scale}" if prior_scale > 0.0 else "") + f"_trafo_lm{lm_scale}_beam{beam_size}_cpu" ) @@ -536,11 +549,10 @@ def sis_run_with_prefix(prefix_name: str = None): "att_scale": att_scale, "ctc_scale": ctc_scale, "use_ctc": True, - "bsf": 10, + "bsf": bsf, "prior_corr": prior_scale > 0.0, "ctc_prior_file": "/work/asr3/zeineldeen/hiwis/luca.gaudino/setups-data/2023-02-22--conformer-swb/work/i6_core/returnn/extract_prior/ReturnnComputePriorJobV2.ZeflcEHlQTjn/output/prior.txt", "prior_scale": prior_scale, - "ctc_state_fix": True, } res, _ = recog_model( task, @@ -799,7 +811,7 @@ def __init__( self.mel_normalization = model_args.get("mel_normalization", False) self.no_ctc = model_args.get("no_ctc", False) self.enc_layer_w_ctc = model_args.get("enc_layer_w_ctc", None) - self.s_use_zoneout_output = model_args.get("s_use_zoneout_output", True) + self.s_use_zoneout_output = model_args.get("s_use_zoneout_output", False) self.encoder = ConformerEncoder( in_dim, diff --git a/users/gaudino/experiments/rf_conformer_att_2023/librispeech_960/conformer_import_moh_att_train.py b/users/gaudino/experiments/rf_conformer_att_2023/librispeech_960/conformer_import_moh_att_train.py index 85c0a7435..79cb8641f 100644 --- 
a/users/gaudino/experiments/rf_conformer_att_2023/librispeech_960/conformer_import_moh_att_train.py +++ b/users/gaudino/experiments/rf_conformer_att_2023/librispeech_960/conformer_import_moh_att_train.py @@ -1302,9 +1302,7 @@ def __init__( Dim(name="lstm", dimension=1024), zoneout_factor_cell=0.15, zoneout_factor_output=0.05, - # TODO: was this a bug? - use_zoneout_output=True, - # use_zoneout_output=False, # like RETURNN/TF ZoneoutLSTM old default + use_zoneout_output=False, # like RETURNN/TF ZoneoutLSTM old default # parts_order="icfo", # like RETURNN/TF ZoneoutLSTM # parts_order="ifco", parts_order="jifo", # NativeLSTM (the code above converts it...) diff --git a/users/gaudino/experiments/rf_conformer_att_2023/librispeech_960/model_recogs/model_recog.py b/users/gaudino/experiments/rf_conformer_att_2023/librispeech_960/model_recogs/model_recog.py index 2d19622f4..5c16781ef 100644 --- a/users/gaudino/experiments/rf_conformer_att_2023/librispeech_960/model_recogs/model_recog.py +++ b/users/gaudino/experiments/rf_conformer_att_2023/librispeech_960/model_recogs/model_recog.py @@ -174,6 +174,7 @@ def model_recog( ) if model.search_args.get("ilm_scale", 0.0) > 0: + # breakpoint() ilm_out = model.ilm(input_embed, state=ilm_state, spatial_dim=single_step_dim) ilm_state = ilm_out["state"] ilm_log_prob = rf.log_softmax(ilm_out["output"], axis=model.target_dim) diff --git a/users/gaudino/experiments/rf_conformer_att_2023/tedlium2/_import_model.py b/users/gaudino/experiments/rf_conformer_att_2023/tedlium2/_import_model.py index 5971097fa..c2a7de778 100644 --- a/users/gaudino/experiments/rf_conformer_att_2023/tedlium2/_import_model.py +++ b/users/gaudino/experiments/rf_conformer_att_2023/tedlium2/_import_model.py @@ -18,6 +18,8 @@ MakeModel, ) +from i6_experiments.users.gaudino.models.asr.rf.conformer_ctc.model_conformer_ctc import MakeModel as MakeModelCTC + from i6_experiments.users.gaudino.models.asr.rf.nn_lm.lm_import_2023_11_09 import ( MakeModel as MakeModelLM, ) @@ 
-88,9 +90,14 @@ def convert_checkpoint( print() + ctc_only = model_args.get("ctc_only", False) + print("Creating model...") rf.select_backend_torch() - model = MakeModel(80, 1_057, model_args=model_args)() + if ctc_only: + model = MakeModelCTC(80, 1_057)() + else: + model = MakeModel(80, 1_057, model_args=model_args)() print("Created model:", model) print("Model parameters:") for name, param in model.named_parameters(): @@ -103,7 +110,8 @@ def convert_checkpoint( print("Create ParamMapping...") param_mapping = {} _add_params_conformer(param_mapping, prefix="") - _add_params_att_decoder(param_mapping) + if not ctc_only: + _add_params_att_decoder(param_mapping) _add_params_trafo_lm(param_mapping) # if model_args.get("encoder_ctc", False): # _add_params_conformer(param_mapping, prefix="sep_enc_ctc_") @@ -161,7 +169,6 @@ def convert_checkpoint( os.symlink(os.path.basename(meta_filename), symlink_filename_2) # assert os.path.exists(self.out_checkpoint.get_path()) - def convert_lm(ckpt_path_lm, out_dir, model_target_dim, model_args): from tensorflow.python.training.py_checkpoint_reader import CheckpointReader from returnn.torch.frontend.bridge import rf_module_to_pt_module @@ -326,8 +333,10 @@ def _add_params_conformer(param_mapping: Dict[str, str], prefix: str): param_mapping.update( { prefix + "encoder.input_projection.weight": "source_linear/W", - prefix + "ctc.weight": "ctc/W", - prefix + "ctc.bias": "ctc/b", + # prefix + "ctc.weight": "ctc/W", + # prefix + "ctc.bias": "ctc/b", + prefix + "enc_aux_logits_12.weight": "ctc/W", + prefix + "enc_aux_logits_12.bias": "ctc/b", } ) # conformer @@ -598,7 +607,7 @@ def map_param_func_mini_att_ilm( def import_models(): # for model_name, sep_enc in product(list(models.keys())[-1:], [True, False]): - model_list = ["model_baseline"] + model_list = ["model_ctc_only"] # model_list = ["model_ctc0.9_att0.1", "model_ctc0.8_att0.2", "model_ctc0.7_att0.3", "model_ctc0.6_att0.4", "model_ctc0.5_att0.5", "model_ctc0.4_att0.6"] for 
model_name, sep_enc, add_trafo_lm in product(model_list, [False], [False]): model_args = { @@ -606,6 +615,7 @@ def import_models(): "add_trafo_lm": add_trafo_lm, "encoder_ctc": sep_enc, "no_ctc": models[model_name].get("no_ctc", False), + "ctc_only": models[model_name].get("ctc_only", False), } print( @@ -615,7 +625,7 @@ def import_models(): + " ..." ) out_dir = "/work/asr3/zeineldeen/hiwis/luca.gaudino/setups-data/2023-08-10--rf-librispeech/work/i6_experiments/users/gaudino/returnn/convert_ckpt_rf/tedlium2/without_lm/" - out_dir_postfix = model_name + ("__ctc_only" if sep_enc else "") + ("__trafo_lm" if add_trafo_lm else "") + "_24_05_22" + out_dir_postfix = model_name + ("__ctc_only" if sep_enc else "") + ("__trafo_lm" if add_trafo_lm else "") + "_rf_compatible" ckpt_path = models[model_name]["ckpt"].ckpt_path diff --git a/users/gaudino/experiments/rf_conformer_att_2023/tedlium2/conformer_import_moh_att_2023_10_19.py b/users/gaudino/experiments/rf_conformer_att_2023/tedlium2/conformer_import_moh_att_2023_10_19.py index f1ca364cf..e653183c9 100644 --- a/users/gaudino/experiments/rf_conformer_att_2023/tedlium2/conformer_import_moh_att_2023_10_19.py +++ b/users/gaudino/experiments/rf_conformer_att_2023/tedlium2/conformer_import_moh_att_2023_10_19.py @@ -92,6 +92,7 @@ def sis_run_with_prefix(prefix_name: str = None): model_args = { "target_embed_dim": 256, "mel_normalization": True, + "s_use_zoneout_output": True, "no_ctc": models[model_name].get("no_ctc", False), "enc_layer_w_ctc": models[model_name].get("enc_layer_w_ctc", None), } @@ -108,6 +109,7 @@ def sis_run_with_prefix(prefix_name: str = None): bsf = 10 prefix_name_single_seq = prefix_name + f"/single_seq" + prefix_name_bsf32 = prefix_name + f"/bsf32" prefix_name = prefix_name + f"/bsf{bsf}" + "_fix_zoneout_output" ### Single model experiments @@ -288,8 +290,8 @@ def sis_run_with_prefix(prefix_name: str = None): opls_model_names = { # -------- tuning done ---------- "model_baseline":{ - "scales": [(0.7, 0.3, 
0.7, 0.4), (0.7, 0.3, 0.7, 0.5)], - "scales_w_fix": [], + "scales": [(0.7, 0.3, 0.7, 0.4), (0.7, 0.3, 0.7, 0.5), (0.8, 0.2, 0.75, 0.4), (0.8, 0.2, 0.75, 0.5)], + "scales_w_fix": [(0.8, 0.2, 0.75, 0.4)], }, # "model_ctc0.43_att1.0": { # "scales": [(0.8,0.2, 0.6), (0.8, 0.2, 0.7), (0.8, 0.2, 0.9)], @@ -359,9 +361,8 @@ def sis_run_with_prefix(prefix_name: str = None): for model_name in ["model_baseline"]: # for model_name in opls_model_names: # for scales, beam_size in product(opls_model_names[model_name]["scales"], [12]): - for scales, beam_size in product([(0.6, 0.4), (0.65, 0.35), (0.7, 0.3), (0.75, 0.25), (0.8, 0.2)], [12]): + for scales, prior_scale, beam_size in product([(0.8, 0.2)], [0.75], []): #12 att_scale, ctc_scale, = scales - prior_scale = 0.0 search_args = { "beam_size": beam_size, @@ -388,7 +389,7 @@ def sis_run_with_prefix(prefix_name: str = None): task, models_with_pt_ckpt[model_name]["ckpt"], model_recog, - dev_sets=["dev"], # set to None for all + dev_sets=["dev", "test"], # set to None for all model_args=models_with_pt_ckpt[model_name]["model_args"], search_args=search_args, prefix_name=name, @@ -408,7 +409,7 @@ def sis_run_with_prefix(prefix_name: str = None): # ctc beam search espnet for model_name in ctc_beam_search_model_names: for scales, beam_size in product( - ctc_beam_search_model_names[model_name]["scales"], [32] # 32 + ctc_beam_search_model_names[model_name]["scales"], [] # 32 ): att_scale, ctc_scale, prior_scale = scales @@ -461,18 +462,13 @@ def sis_run_with_prefix(prefix_name: str = None): "class": "Trafo_LM_Model", }, "mel_normalization": True, + "s_use_zoneout_output": True, "no_ctc": models[model_name].get("no_ctc", False), "enc_layer_w_ctc": models[model_name].get("enc_layer_w_ctc", None), } models_with_pt_ckpt[model_name]["model_args"] = copy.deepcopy(model_args) - # att + trafo lm + ilm correction - for model_name, lm_scale, ilm_scale, beam_size in product( - # ["model_baseline", "model_ctc0.5_att0.5"], [0.36] ,[0.28], [12] - 
["model_baseline"], [0.3, 0.34, 0.36, 0.4] ,[0.28], [12] - ): - ilm_model_args = copy.deepcopy(models_with_pt_ckpt[model_name]["model_args"]) - ilm_model_args["preload_from_files"] = { + preload_from_files_ilm = { "01_mini_att_ilm": { "prefix": "ilm.", "filename": "/work/asr3/zeineldeen/hiwis/luca.gaudino/setups-data/2023-08-10--rf-librispeech/work/i6_experiments/users/gaudino/returnn/convert_ckpt_rf/tedlium2/mini_att_ilm_24_04_21/average.pt", @@ -483,8 +479,16 @@ def sis_run_with_prefix(prefix_name: str = None): } } + # att + trafo lm + ilm correction + for model_name, lm_scale, ilm_scale, beam_size in product( + # ["model_baseline", "model_ctc0.5_att0.5"], [0.36] ,[0.28], [12] + ["model_baseline"], [0.36], [0.28], [] #12 + ): + ilm_model_args = copy.deepcopy(models_with_pt_ckpt[model_name]["model_args"]) + ilm_model_args["preload_from_files"] = preload_from_files_ilm + name = ( - prefix_name + prefix_name_bsf32 + "/" + model_name + f"/att_trafolm{lm_scale}_ilm{ilm_scale}" @@ -495,7 +499,7 @@ def sis_run_with_prefix(prefix_name: str = None): "att_scale": 1.0, "ilm_scale": ilm_scale, "lm_scale": lm_scale, - "bsf": bsf, + "bsf": 32, "use_first_lm": True, "use_zoneout_output": True, } @@ -514,9 +518,14 @@ def sis_run_with_prefix(prefix_name: str = None): ) # opls att + ctc + trafo lm + ilm - for model_name, beam_size, ilm_scale in product(["model_baseline"], [12], [0.1, 0.2, 0.25, 0.3, 0.4]): - for scales in opls_model_names[model_name]["scales"]: - att_scale, ctc_scale, prior_scale, lm_scale = scales + # 5.78 with att 0.7, ctc 0.3, prior 0.7, trafo 0.6, ilm 0.45 + for model_name, beam_size, lm_scale in product(["model_baseline"], [12], [0.6, 0.62, 0.64, 0.66, 0.68, 0.7]): + for scales in [(0.7, 0.3, 0.7, 0.45)]: + att_scale, ctc_scale, prior_scale, ilm_scale = scales + + ilm_model_args = copy.deepcopy(models_with_pt_ckpt[model_name]["model_args"]) + ilm_model_args["preload_from_files"] = preload_from_files_ilm + name = ( prefix_name + "/" @@ -531,6 +540,7 @@ def 
sis_run_with_prefix(prefix_name: str = None): "ctc_scale": ctc_scale, "use_ctc": True, "add_trafo_lm": True, + "ilm_scale": ilm_scale, "lm_scale": lm_scale, "bsf": bsf, "prior_corr": True if prior_scale > 0 else False, @@ -542,8 +552,8 @@ def sis_run_with_prefix(prefix_name: str = None): task, models_with_pt_ckpt[model_name]["ckpt"], model_recog, - dev_sets=["dev", "test"], - model_args=models_with_pt_ckpt[model_name]["model_args"], + dev_sets=["dev"], + model_args=ilm_model_args, search_args=search_args, prefix_name=name, ) @@ -567,6 +577,7 @@ def sis_run_with_prefix(prefix_name: str = None): } }, "mel_normalization": True, + "s_use_zoneout_output": True, "no_ctc": models[model_name].get("no_ctc", False), "enc_layer_w_ctc": models[model_name].get("enc_layer_w_ctc", None), } @@ -574,7 +585,7 @@ def sis_run_with_prefix(prefix_name: str = None): # att + trafo lm for model_name, lm_scale, beam_size in product( - ["model_baseline"], [0.13, 0.15, 0.18, 0.2], [12] + ["model_baseline"], [0.18], [] #12 ): lm_model_args = copy.deepcopy(models_with_pt_ckpt[model_name]["model_args"]) name = ( @@ -608,7 +619,8 @@ def sis_run_with_prefix(prefix_name: str = None): ) # att + ctc + trafo lm opls - for model_name, beam_size in product(opls_model_names.keys(), []): + for model_name, beam_size in product(["model_baseline"], []): # 12 + # for model_name, beam_size in product(opls_model_names.keys(), []): for scales in opls_model_names[model_name]["scales"]: att_scale, ctc_scale, prior_scale, lm_scale = scales name = ( @@ -636,7 +648,7 @@ def sis_run_with_prefix(prefix_name: str = None): task, models_with_pt_ckpt[model_name]["ckpt"], model_recog, - dev_sets=["dev", "test"], + dev_sets=["dev"], model_args=models_with_pt_ckpt[model_name]["model_args"], search_args=search_args, prefix_name=name, diff --git a/users/gaudino/experiments/rf_conformer_att_2023/tedlium2/conformer_import_moh_att_train.py b/users/gaudino/experiments/rf_conformer_att_2023/tedlium2/conformer_import_moh_att_train.py 
index fe0451e21..f405cd911 100644 --- a/users/gaudino/experiments/rf_conformer_att_2023/tedlium2/conformer_import_moh_att_train.py +++ b/users/gaudino/experiments/rf_conformer_att_2023/tedlium2/conformer_import_moh_att_train.py @@ -105,20 +105,20 @@ def sis_run_with_prefix(prefix_name: Optional[str] = None): # train_exp("base-11gb", config_11gb, gpu_mem=11) # train_exp("base-11gb-v1", my_config_11gb, num_epochs=400, gpu_mem=11) - train_exp( # dev 8.77 test 8.26 - "base-11gb-v3-lrlin1e_5_600k_aux4_8", - my_config_11gb, - config_updates={ - "learning_rate": 1.0, - "dynamic_learning_rate": dyn_lr_piecewise_linear, - # total steps after 2000 epochs: 982.312 - "learning_rate_piecewise_steps": [600_000, 900_000, 982_000], - "learning_rate_piecewise_values": [1e-5, 1e-3, 1e-5, 1e-6], - "aux_loss_layers": [4,8], - }, - num_epochs=400, - gpu_mem=11, - ) + # train_exp( # dev 8.77 test 8.26 + # "base-11gb-v3-lrlin1e_5_600k_aux4_8", + # my_config_11gb, + # config_updates={ + # "learning_rate": 1.0, + # "dynamic_learning_rate": dyn_lr_piecewise_linear, + # # total steps after 2000 epochs: 982.312 + # "learning_rate_piecewise_steps": [600_000, 900_000, 982_000], + # "learning_rate_piecewise_values": [1e-5, 1e-3, 1e-5, 1e-6], + # "aux_loss_layers": [4,8], + # }, + # num_epochs=400, + # gpu_mem=11, + # ) # train_exp( # aux 12: does not converge # "base-11gb-v3-lrlin1e_5_261k", @@ -133,8 +133,22 @@ def sis_run_with_prefix(prefix_name: Optional[str] = None): # gpu_mem=11, # ) - train_exp( # dev 7.89 test 7.3 - "base-11gb-v3-lrlin1e_5_261k_aux4_8", + # train_exp( # dev 7.89 test 7.3 + # "base-11gb-v3-lrlin1e_5_261k_aux4_8", + # my_config_11gb, + # config_updates={ + # "learning_rate": 1.0, + # "dynamic_learning_rate": dyn_lr_piecewise_linear, + # "learning_rate_piecewise_steps": [261_000, 522_000, 580_000], # 45% 45 % 10% + # "learning_rate_piecewise_values": [1e-5, 1e-3, 1e-5, 1e-6], + # "aux_loss_layers": [4, 8], + # }, + # num_epochs=400, + # gpu_mem=11, + # ) + + train_exp( # + 
"base-11gb-v3-lrlin1e_5_261k_aux4_8_zoneout_fix", my_config_11gb, config_updates={ "learning_rate": 1.0, @@ -142,12 +156,13 @@ def sis_run_with_prefix(prefix_name: Optional[str] = None): "learning_rate_piecewise_steps": [261_000, 522_000, 580_000], # 45% 45 % 10% "learning_rate_piecewise_values": [1e-5, 1e-3, 1e-5, 1e-6], "aux_loss_layers": [4, 8], + "s_use_zoneout_output": True, }, num_epochs=400, gpu_mem=11, ) - train_exp( # + train_exp( # dev 7.59 test 7.13 "base-11gb-v3-lrlin1e_5_261k_aux4_8_12", my_config_11gb, config_updates={ @@ -175,21 +190,21 @@ def sis_run_with_prefix(prefix_name: Optional[str] = None): # gpu_mem=11, # ) - model = train_exp( # with aux 4 8: dev 9.92 test 8.96 - wrong steps!!! - "base-24gb-v6-lrlin1e_5_261k", - config_24gb_v6, - config_updates={ - "learning_rate": 1.0, - "dynamic_learning_rate": dyn_lr_piecewise_linear, - "learning_rate_piecewise_steps": [261_000, 522_000, 580_000], - "learning_rate_piecewise_values": [1e-5, 1e-3, 1e-5, 1e-6], - }, - num_epochs=400, - ) + # model = train_exp( # with aux 4 8: dev 9.92 test 8.96 - wrong steps!!! 
+ # "base-24gb-v6-lrlin1e_5_261k", + # config_24gb_v6, + # config_updates={ + # "learning_rate": 1.0, + # "dynamic_learning_rate": dyn_lr_piecewise_linear, + # "learning_rate_piecewise_steps": [261_000, 522_000, 580_000], + # "learning_rate_piecewise_values": [1e-5, 1e-3, 1e-5, 1e-6], + # }, + # num_epochs=400, + # ) model = train_exp( # - "base-24gb-v6-lrlin1e_5_85k", + "base-24gb-v6-lrlin1e_5_85k_zoneout_fix", config_24gb_v6, config_updates={ "learning_rate": 1.0, @@ -197,6 +212,7 @@ def sis_run_with_prefix(prefix_name: Optional[str] = None): # total steps after 400 epochs: 189.995 "learning_rate_piecewise_steps": [85_500, 171_000, 190_000], "learning_rate_piecewise_values": [1e-5, 1e-3, 1e-5, 1e-6], + "s_use_zoneout_output": True, }, num_epochs=400, ) @@ -1365,7 +1381,6 @@ def __init__( enc_att_dropout: float = 0.1, l2: float = 0.0001, language_model: Optional[RFModelWithMakeLabelScorer] = None, - mel_normalization: bool = True, ): super(Model, self).__init__() @@ -1373,7 +1388,7 @@ def __init__( config = get_global_config(return_empty_if_none=True) - self.mel_normalization = mel_normalization + self.mel_normalization = config.typed_value("mel_normalization", True) self.in_dim = in_dim self.encoder = ConformerEncoder( @@ -1432,7 +1447,8 @@ def __init__( Dim(name="lstm", dimension=1024), zoneout_factor_cell=0.15, zoneout_factor_output=0.05, - use_zoneout_output=False, # like RETURNN/TF ZoneoutLSTM old default + use_zoneout_output=config.typed_value("s_use_zoneout_output", False), # TODO: run exps with this fixed + # use_zoneout_output=False, # like RETURNN/TF ZoneoutLSTM old default # parts_order="icfo", # like RETURNN/TF ZoneoutLSTM # parts_order="ifco", parts_order="jifo", # NativeLSTM (the code above converts it...) 
diff --git a/users/gaudino/experiments/rf_conformer_rnnt_2024/librispeech_960/conformer_ctc_train.py b/users/gaudino/experiments/rf_conformer_rnnt_2024/librispeech_960/conformer_ctc_train.py index 25d1a4bb4..4b08a9e96 100644 --- a/users/gaudino/experiments/rf_conformer_rnnt_2024/librispeech_960/conformer_ctc_train.py +++ b/users/gaudino/experiments/rf_conformer_rnnt_2024/librispeech_960/conformer_ctc_train.py @@ -119,7 +119,7 @@ def sis_run_with_prefix(prefix_name: Optional[str] = None): # }, # ) - train_exp( # + train_exp( # dev-other 9.01 "base-24gb-lrlin1e_5_600k_ctc_only", config_24gb_v6, config_updates={ @@ -132,7 +132,7 @@ def sis_run_with_prefix(prefix_name: Optional[str] = None): }, ) - train_exp( # + train_exp( # dev-other 6.93 "base-24gb-lrlin1e_5_600k_ctc_only_aux4_8", config_24gb_v6, config_updates={ @@ -144,6 +144,35 @@ def sis_run_with_prefix(prefix_name: Optional[str] = None): }, ) + + # without mel normalization + train_exp( # dev-other + "base-24gb-lrlin1e_5_600k_ctc_only_no_mel_norm", + config_24gb_v6, + config_updates={ + "learning_rate": 1.0, + "dynamic_learning_rate": dyn_lr_piecewise_linear, + # total steps after 2000 epochs: 982.312 + "learning_rate_piecewise_steps": [600_000, 900_000, 982_000], + "learning_rate_piecewise_values": [1e-5, 1e-3, 1e-5, 1e-6], + "aux_loss_layers":[], + "mel_normalization_ted2": False, + }, + ) + + train_exp( # dev-other + "base-24gb-lrlin1e_5_600k_ctc_only_aux4_8_no_mel_norm", + config_24gb_v6, + config_updates={ + "learning_rate": 1.0, + "dynamic_learning_rate": dyn_lr_piecewise_linear, + # total steps after 2000 epochs: 982.312 + "learning_rate_piecewise_steps": [600_000, 900_000, 982_000], + "learning_rate_piecewise_values": [1e-5, 1e-3, 1e-5, 1e-6], + "mel_normalization_ted2": False, + }, + ) + _sis_prefix: Optional[str] = None diff --git a/users/gaudino/experiments/rf_conformer_rnnt_2024/tedlium2/conformer_ctc_train.py b/users/gaudino/experiments/rf_conformer_rnnt_2024/tedlium2/conformer_ctc_train.py index 
900926eae..74628b703 100644 --- a/users/gaudino/experiments/rf_conformer_rnnt_2024/tedlium2/conformer_ctc_train.py +++ b/users/gaudino/experiments/rf_conformer_rnnt_2024/tedlium2/conformer_ctc_train.py @@ -11,6 +11,7 @@ import hashlib import contextlib import functools +from sisyphus import tk from returnn.tensor import Tensor, Dim, single_step_dim import returnn.frontend as rf @@ -30,10 +31,11 @@ if TYPE_CHECKING: from i6_experiments.users.gaudino.model_interfaces import ModelDef, RecogDef, TrainDef - from i6_experiments.users.gaudino.model_with_checkpoints import ( - ModelWithCheckpoints, - ModelWithCheckpoint, - ) + +from i6_experiments.users.gaudino.model_with_checkpoints import ( + ModelWithCheckpoints, + ModelWithCheckpoint, +) from i6_experiments.users.gaudino.models.asr.rf.conformer_ctc.model_conformer_ctc import from_scratch_model_def, from_scratch_training from i6_experiments.users.gaudino.models.asr.rf.conformer_ctc.model_recog_ctc_greedy import model_recog @@ -45,6 +47,9 @@ def sis_run_with_prefix(prefix_name: Optional[str] = None): """run the exp""" + + from i6_core.returnn.training import PtCheckpoint + _sis_setup_global_prefix(prefix_name) # Moh: dev-clean 2.27, dev-other 5.39, test-clean 2.41, test-other 5.51 @@ -167,54 +172,167 @@ def sis_run_with_prefix(prefix_name: Optional[str] = None): # gpu_mem=11, # ) - train_exp( - "from-scratch-24gb_lrmaxs85k_lrmin1e-5_lrmax1e-3", - ctc_train_24gb_config, + # train_exp( # does not converge + # "from-scratch-24gb_lrmaxs85k_lrmin1e-5_lrmax1e-3", + # ctc_train_24gb_config, + # config_updates={ + # "learning_rate": 1.0, + # "dynamic_learning_rate": dyn_lr_piecewise_linear, + # # total steps after 400 epochs: + # "learning_rate_piecewise_steps": [85_500, 171_000, 190_000], # 45% 45 % 10% + # "learning_rate_piecewise_values": [1e-5, 1e-3, 1e-5, 1e-6], + # "aux_loss_layers": [], + # }, + # num_epochs=400, + # ) + # + # train_exp( # does not converge + # "from-scratch-24gb_lrmaxs85k_lrmin8e-5_lrmax8e-4", + # 
ctc_train_24gb_config, + # config_updates={ + # "learning_rate": 1.0, + # "dynamic_learning_rate": dyn_lr_piecewise_linear, + # # total steps after 400 epochs: + # "learning_rate_piecewise_steps": [85_500, 171_000, 190_000], # 45% 45 % 10% + # "learning_rate_piecewise_values": [8e-5, 8e-4, 8e-5, 1e-6], + # "aux_loss_layers": [], + # }, + # num_epochs=400, + # ) + + # init from tf ctc only model + _torch_ckpt_dir_path = "/work/asr3/zeineldeen/hiwis/luca.gaudino/setups-data/2023-08-10--rf-librispeech/work/i6_experiments/users/gaudino/returnn/convert_ckpt_rf/tedlium2/without_lm/" + + model_args = { + "mel_normalization": True, + } + new_ckpt_path = tk.Path( + _torch_ckpt_dir_path + "model_ctc_only_rf_compatible" + "/average.pt", + hash_overwrite= "model_ctc_only_rf_compatible" + "_torch_ckpt", + ) + new_ckpt = PtCheckpoint(new_ckpt_path) + + + # recog ctc only model + _recog( + "model_recogs/model_ctc_only_rf_compatible/ctc_greedy/recog_results", + ModelWithCheckpoint( + definition=from_scratch_model_def, checkpoint=new_ckpt + ), + model_recog, + ) + + # train_exp( # does not improve, different wer from the beginning + # "init_from_tf_lin132k_lrmax8e-4_25eps", + # ctc_train_config, + # config_updates={ + # "learning_rate": 1.0, + # "dynamic_learning_rate": dyn_lr_piecewise_linear, + # # total steps after 2000 epochs: 982.312 + # # total steps after 400 epochs: + # "learning_rate_piecewise_steps": [66_000, 132_000, 145_000], # 45% 45 % 10% + # # "learning_rate_piecewise_steps": [600_000, 900_000, 982_000], # 45% 45 % 10% + # "learning_rate_piecewise_values": [8e-5, 8e-4, 8e-5, 1e-6], + # "aux_loss_layers": [], + # "preload_from_files": { + # "encoder": { + # "filename": _torch_ckpt_dir_path + "model_ctc_only_rf_compatible" + "/average.pt", + # "ignore_missing": True, + # "init_for_train": True, + # }, + # }, + # }, + # num_epochs=100, + # gpu_mem=11, + # ) + + train_exp( # + "from-scratch-11gb_lrmaxs522k_lrmin8e-5_lrmax8e-4_aux4_8_adjSpec", + ctc_train_config, 
config_updates={ "learning_rate": 1.0, "dynamic_learning_rate": dyn_lr_piecewise_linear, - # total steps after 400 epochs: - "learning_rate_piecewise_steps": [85_500, 171_000, 190_000], # 45% 45 % 10% - "learning_rate_piecewise_values": [1e-5, 1e-3, 1e-5, 1e-6], - "aux_loss_layers": [], - }, + # total steps after 2000 epochs: 982.312 + "learning_rate_piecewise_steps": [261_000, 522_000, 580_000], # 45% 45 % 10% + "learning_rate_piecewise_values": [8e-5, 8e-4, 8e-5, 1e-6], + "aux_loss_layers": [4, 8], + "specaugment_steps": (5_900, 18_000, 36_000), + }, num_epochs=400, + gpu_mem=11, ) - train_exp( - "from-scratch-24gb_lrmaxs85k_lrmin8e-5_lrmax8e-4", - ctc_train_24gb_config, + train_exp( # + "from-scratch-11gb_lrmaxs522k_lrmin1e-5_lrmax1e-3_aux4_8_adjSpec", + ctc_train_config, config_updates={ "learning_rate": 1.0, "dynamic_learning_rate": dyn_lr_piecewise_linear, - # total steps after 400 epochs: - "learning_rate_piecewise_steps": [85_500, 171_000, 190_000], # 45% 45 % 10% - "learning_rate_piecewise_values": [8e-5, 8e-4, 8e-5, 1e-6], - "aux_loss_layers": [], - }, + # total steps after 2000 epochs: 982.312 + "learning_rate_piecewise_steps": [261_000, 522_000, 580_000], # 45% 45 % 10% + "learning_rate_piecewise_values": [1e-5, 1e-3, 1e-5, 1e-6], + "aux_loss_layers": [4, 8], + "specaugment_steps": (5_900, 18_000, 36_000), + }, num_epochs=400, + gpu_mem=11, ) - # init for tf ctc only model - train_exp( # does not converge - "from-scratch-11gb_lrmaxs522k_lrmin8e-5_lrmax8e-4", + # TODO: try with pretrain + + train_exp( # + "from-scratch-11gb_pre3_lrmaxs522k_lrmin1e-5_lrmax1e-3_aux4_8_adjSpec", ctc_train_config, config_updates={ "learning_rate": 1.0, "dynamic_learning_rate": dyn_lr_piecewise_linear, # total steps after 2000 epochs: 982.312 - # total steps after 400 epochs: - "learning_rate_piecewise_steps": [66_000, 132_000, 145_000], # 45% 45 % 10% - # "learning_rate_piecewise_steps": [600_000, 900_000, 982_000], # 45% 45 % 10% - "learning_rate_piecewise_values": [8e-5, 
8e-4, 8e-5, 1e-6], - "aux_loss_layers": [], - "preload_from_files": { - "filename": "", - "ignore_missing": True, - "init_for_train": True, + "learning_rate_piecewise_steps": [261_000, 522_000, 580_000], # 45% 45 % 10% + "learning_rate_piecewise_values": [1e-5, 1e-3, 1e-5, 1e-6], + "aux_loss_layers": [4, 8], + "specaugment_steps": (5_900, 18_000, 36_000), + "pretrain_opts": { # pretrain + "steps": [ + (8 * 500, {"num_layers": 2}), + (4 * 500, {"num_layers": 4}), + (4 * 500, {"num_layers": 8}), + ] }, - }, - num_epochs=100, + }, + num_epochs=400, + gpu_mem=11, + ) + + # TODO: try with epoch base lr schedule + ep = 400 + lr = 8e-4 + cyc_ep = int(0.45 * ep) + + train_exp( # + "from-scratch-11gb_ep_based_lr_aux4_8", + ctc_train_config, + config_updates={ + "learning_rate": 1.0, + # "dynamic_learning_rate": dyn_lr_piecewise_linear, + # total steps after 2000 epochs: 982.312 + # "learning_rate_piecewise_steps": [261_000, 522_000, 580_000], # 45% 45 % 10% + # "learning_rate_piecewise_values": [1e-5, 1e-3, 1e-5, 1e-6], + "aux_loss_layers": [4, 8], + # "specaugment_steps": (5_900, 18_000, 36_000), + # "pretrain_opts": { # pretrain + # "steps": [ + # (8 * 500, {"num_layers": 2}), + # (4 * 500, {"num_layers": 4}), + # (4 * 500, {"num_layers": 8}), + # ] + # }, + "learning_rates": ( + list(numpy.linspace(lr / 10, lr, cyc_ep)) + + list(numpy.linspace(lr, lr / 10, cyc_ep)) + + list(numpy.linspace(lr / 10, 1e-6, ep - 2 * cyc_ep)) + ) + }, + num_epochs=400, gpu_mem=11, ) @@ -242,7 +360,7 @@ def _recog( dev_sets: Optional[Collection[str]] = None, ): from sisyphus import tk - from i6_experiments.users.zeyer.recog import recog_model + from i6_experiments.users.gaudino.recog_2 import recog_model if recog_def is None: recog_def = model_recog diff --git a/users/gaudino/models/asr/rf/conformer_ctc/model_conformer_ctc.py b/users/gaudino/models/asr/rf/conformer_ctc/model_conformer_ctc.py index 463d65487..21b5fac95 100644 --- 
a/users/gaudino/models/asr/rf/conformer_ctc/model_conformer_ctc.py +++ b/users/gaudino/models/asr/rf/conformer_ctc/model_conformer_ctc.py @@ -141,7 +141,6 @@ def __init__( enc_att_dropout: float = 0.1, l2: float = 0.0001, language_model: Optional[RFModelWithMakeLabelScorer] = None, - mel_normalization: bool = True, joiner_dim: int = 640, ): super(Model, self).__init__() @@ -150,7 +149,7 @@ def __init__( config = get_global_config(return_empty_if_none=True) - self.mel_normalization = mel_normalization + self.mel_normalization = config.typed_value("mel_normalization_ted2", True) self.in_dim = in_dim self.encoder = ConformerEncoder( diff --git a/users/gaudino/recog.py b/users/gaudino/recog.py index ecd02a31e..9dd10c3a4 100644 --- a/users/gaudino/recog.py +++ b/users/gaudino/recog.py @@ -4,6 +4,7 @@ from __future__ import annotations +import copy import os from typing import TYPE_CHECKING, Optional, Union, Any, Dict, Sequence, Collection, Iterator, Callable @@ -314,6 +315,7 @@ def search_config_v2( extern_data_raw = instanciate_delayed(extern_data_raw) if model_args.get("preload_from_files", None): + model_args = copy.deepcopy(model_args) preload_from_files = model_args.pop("preload_from_files") returnn_recog_config_dict["preload_from_files"] = preload_from_files From 1c4de76e104f602108512fd4ab58b56af8a79f7e Mon Sep 17 00:00:00 2001 From: Mohammad Zeineldeen Date: Tue, 28 May 2024 12:01:14 -0400 Subject: [PATCH 070/227] update conf v2 --- users/zeineldeen/models/asr/encoder/conformer_encoder_v2.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/users/zeineldeen/models/asr/encoder/conformer_encoder_v2.py b/users/zeineldeen/models/asr/encoder/conformer_encoder_v2.py index fa8e11f26..10240b0a4 100644 --- a/users/zeineldeen/models/asr/encoder/conformer_encoder_v2.py +++ b/users/zeineldeen/models/asr/encoder/conformer_encoder_v2.py @@ -87,6 +87,7 @@ def __init__( mhsa_weight_dropout=None, conv_weight_dropout=None, frontend_conv_weight_dropout=None, + 
ctc_weight_dropout=None, memory_variant_opts: Optional[ConformerMemoryVariantOpts] = None, ): """ @@ -232,6 +233,7 @@ def __init__( self.conv_weight_drop = conv_weight_dropout self.mhsa_weight_drop = mhsa_weight_dropout self.frontend_conv_weight_drop = frontend_conv_weight_dropout + self.ctc_weight_drop = ctc_weight_dropout self.memory_variant_opts = memory_variant_opts if self.memory_variant_opts: @@ -1083,7 +1085,7 @@ def _create_conformer_block(self, i, source): mhsa_input = conv_module1 mhsa = self._create_mhsa_module(prefix_name, mhsa_input, i) mhsa = self.network.add_combine_layer( - "{}_res".format(prefix_name), kind="add", source=[mhsa, mhsa_input], n_out=self.enc_value_dim + "{}_self_att_res".format(prefix_name), kind="add", source=[mhsa, mhsa_input], n_out=self.enc_value_dim ) conv_module = self._create_convolution_module(prefix_name, mhsa, i, half_step=self.sandwich_conv) @@ -1254,7 +1256,7 @@ def _create_all_network_parts(self): loss="ctc", dropout=self.ctc_dropout, loss_opts=default_ctc_loss_opts, - param_dropout=self.ff_weight_drop, + param_dropout=self.ctc_weight_drop, param_dropout_min_ndim=2, param_variational_noise=self.ff_weight_noise, ) From ef244fffa290301fb0cf046a5fc8b987a84ffadb Mon Sep 17 00:00:00 2001 From: Mohammad Zeineldeen Date: Tue, 28 May 2024 12:01:45 -0400 Subject: [PATCH 071/227] more --- users/zeineldeen/data_aug/speed_perturbation_generic.py | 8 +++----- .../librispeech_960/attention_asr_config.py | 1 + 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/users/zeineldeen/data_aug/speed_perturbation_generic.py b/users/zeineldeen/data_aug/speed_perturbation_generic.py index 7b5260942..7670427ae 100644 --- a/users/zeineldeen/data_aug/speed_perturbation_generic.py +++ b/users/zeineldeen/data_aug/speed_perturbation_generic.py @@ -8,13 +8,11 @@ def speed_pert(audio, sample_rate, random_state, min_factor={min_factor}, max_fa """ speed_pert_v2 = """ -def speed_pert(audio, sample_rate={sample_rate}, min_factor={min_factor}, 
max_factor={max_factor}, step={step}): +def speed_pert(audio, sample_rate={sample_rate}, min_factor={min_factor}, max_factor={max_factor}, step={step}, random_state=numpy.random.RandomState(1)): import librosa - import numpy - - random_state = np.random.RandomState(1) + new_sample_rate = int(sample_rate * (1 + random_state.randint(min_factor, max_factor) * step)) if new_sample_rate != sample_rate: - audio = librosa.core.resample(audio, sample_rate, new_sample_rate, res_type="kaiser_fast") + audio = librosa.resample(audio, orig_sr=sample_rate, target_sr=new_sample_rate, res_type="kaiser_fast", axis=0) return audio """ diff --git a/users/zeineldeen/experiments/conformer_att_2022/librispeech_960/attention_asr_config.py b/users/zeineldeen/experiments/conformer_att_2022/librispeech_960/attention_asr_config.py index 0630cbd43..d16556709 100644 --- a/users/zeineldeen/experiments/conformer_att_2022/librispeech_960/attention_asr_config.py +++ b/users/zeineldeen/experiments/conformer_att_2022/librispeech_960/attention_asr_config.py @@ -686,6 +686,7 @@ def create_config( assert "sample_rate" in speed_pert_version speed_pert_generic_str = data_aug.speed_pert_generic_v2 assert isinstance(speed_pert_generic_str, str) + python_prolog += ["import numpy\n\n"] python_prolog += [speed_pert_generic_str.format(**speed_pert_version)] else: raise ValueError("Invalid speed_pert_version") From 755dc073347f652e1cd78236d1458175e2cf59c3 Mon Sep 17 00:00:00 2001 From: Mohammad Zeineldeen Date: Wed, 29 May 2024 17:41:21 +0000 Subject: [PATCH 072/227] update --- .../librispeech_960/additional_config.py | 25 ++++++++++--------- 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/users/zeineldeen/experiments/conformer_att_2022/librispeech_960/additional_config.py b/users/zeineldeen/experiments/conformer_att_2022/librispeech_960/additional_config.py index c3fc2ecdb..1dbe0a6a7 100644 --- a/users/zeineldeen/experiments/conformer_att_2022/librispeech_960/additional_config.py +++ 
b/users/zeineldeen/experiments/conformer_att_2022/librispeech_960/additional_config.py @@ -4,19 +4,20 @@ def get_lm_opts(): transf_lm_net = TransformerLM( - source='prev:output', num_layers=24, vocab_size=2051, use_as_ext_lm=True, prefix_name='lm_') + source="prev:output", num_layers=24, vocab_size=2051, use_as_ext_lm=True, prefix_name="lm_" + ) transf_lm_net.create_network() transf_lm_opts = { - 'lm_subnet': transf_lm_net.network.get_net(), - 'lm_output_prob_name': 'lm_output', - 'is_recurrent': True, - 'preload_from_files': { - 'lm_model': { - 'filename': '/work/asr4/zeineldeen/setups-data/librispeech/2021-02-21--lm-bpe/dependencies/lm_models/transf/epoch.017', - 'prefix': 'lm_' + "lm_subnet": transf_lm_net.network.get_net(), + "lm_output_prob_name": "lm_output", + "is_recurrent": True, + "preload_from_files": { + "lm_model": { + "filename": "/work/asr4/zeineldeen/setups-data/librispeech/2021-02-21--lm-bpe/dependencies/lm_models/transf/epoch.017", + "prefix": "lm_", } }, - 'name': 'trafo', + "name": "trafo", } return transf_lm_opts @@ -25,7 +26,7 @@ def get_lm_opts(): fairseq_mhsa_init = "variance_scaling_initializer(mode='fan_avg', distribution='uniform', scale=0.5)" # limit = sqrt(6 * 0.5 / (fan_in + fan_out)) = sqrt(3 / (fan_in + fan_out)) -def apply_fairseq_init_to_conformer(conformer_args: [ConformerEncoderArgs,ConformerDecoderArgs]): +def apply_fairseq_init_to_conformer(conformer_args: [ConformerEncoderArgs, ConformerDecoderArgs]): # fairseq init conformer_args.ff_init = fairseq_ff_init conformer_args.mhsa_init = fairseq_mhsa_init @@ -40,10 +41,10 @@ def apply_fairseq_init_to_transformer_decoder(transformer_dec_args: TransformerD transformer_dec_args.mhsa_out_init = fairseq_ff_init -def reset_params_init(args: [ConformerEncoderArgs,TransformerDecoderArgs]): +def reset_params_init(args: [ConformerEncoderArgs, TransformerDecoderArgs]): # reset parameters init args.ff_init = None args.mhsa_init = None args.mhsa_out_init = None if isinstance(args, 
ConformerEncoderArgs): - args.conv_module_init = None \ No newline at end of file + args.conv_module_init = None From dfe142f2a38fcc45a9cddf4dab0a9351f5f886aa Mon Sep 17 00:00:00 2001 From: Simon Berger Date: Thu, 30 May 2024 10:54:51 +0200 Subject: [PATCH 073/227] Update users/berger --- .../20230210_baselines/__init__.py | 2 + .../config_02b_transducer_rasr_features.py | 18 +++++--- ..._02f_transducer_rasr_features_am_scales.py | 10 +---- ...nfig_03a_transducer_fullsum_raw_samples.py | 1 + ...ig_03b_transducer_fullsum_rasr_features.py | 42 ++++++++--------- users/berger/network/helpers/rnnt_loss.py | 45 ++++++++++++++++++- .../network/models/context_1_transducer.py | 2 + .../context_1_transducer_raw_samples.py | 2 + 8 files changed, 86 insertions(+), 36 deletions(-) diff --git a/users/berger/configs/librispeech/20230210_baselines/__init__.py b/users/berger/configs/librispeech/20230210_baselines/__init__.py index 1bfefd2ef..bcb63868b 100644 --- a/users/berger/configs/librispeech/20230210_baselines/__init__.py +++ b/users/berger/configs/librispeech/20230210_baselines/__init__.py @@ -15,6 +15,7 @@ from .config_02c_transducer_rasr_features_wei_lex import py as py_02c from .config_02e_transducer_rasr_features_tinaconf import py as py_02e from .config_02e_transducer_rasr_features_tinaconf_rtf import py as py_02e_rtf +from .config_02f_transducer_rasr_features_am_scales import py as py_02f from .config_03a_transducer_fullsum_raw_samples import py as py_03a from .config_03b_transducer_fullsum_rasr_features import py as py_03b from .config_03c_transducer_fullsum_rasr_features_wei_lex import py as py_03c @@ -37,6 +38,7 @@ def main() -> SummaryReport: sub_reports.append(copy.deepcopy(py_02c()[0])) sub_reports.append(copy.deepcopy(py_02e())) sub_reports.append(copy.deepcopy(py_02e_rtf())) + sub_reports.append(copy.deepcopy(py_02f())) sub_reports.append(copy.deepcopy(py_03a())) sub_reports.append(copy.deepcopy(py_03b())) sub_reports.append(copy.deepcopy(py_03c())) diff --git 
a/users/berger/configs/librispeech/20230210_baselines/config_02b_transducer_rasr_features.py b/users/berger/configs/librispeech/20230210_baselines/config_02b_transducer_rasr_features.py index 3953a13fa..9970ae508 100644 --- a/users/berger/configs/librispeech/20230210_baselines/config_02b_transducer_rasr_features.py +++ b/users/berger/configs/librispeech/20230210_baselines/config_02b_transducer_rasr_features.py @@ -66,10 +66,7 @@ def generate_returnn_config( } if train: - ( - network_dict, - extra_python, - ) = transducer_model.make_context_1_conformer_transducer( + (network_dict, extra_python,) = transducer_model.make_context_1_conformer_transducer( num_outputs=num_classes, specaug_args=specaug_args, conformer_args={ @@ -402,8 +399,15 @@ def run_exp( { "epochs": [382], "lm_scales": [0.8], + "mem": 8, } ) + system.run_recog_step_for_corpora( + exp_names=[f"Conformer_Transducer_Viterbi_specaug-v2_{name_suffix}"], + corpora=["dev-clean_4gram", "dev-other_4gram", "test-clean_4gram", "test-other_4gram"], + recog_exp_names=["recog_ilm-0.3"], + **recog_args, + ) recog_args["search_parameters"].update( { "label-pruning": 11.0, @@ -473,7 +477,11 @@ def run_exp( **recog_args, ) - train_job = system.get_train_job(f"Conformer_Transducer_Viterbi_lr-0.0008_{name_suffix}") + if "blstm" in name_suffix: + train_job = system.get_train_job(f"Conformer_Transducer_Viterbi_specaug-v2_{name_suffix}") + else: + train_job = system.get_train_job(f"Conformer_Transducer_Viterbi_lr-0.0008_{name_suffix}") + model = train_job.out_checkpoints[400] assert isinstance(model, Checkpoint) diff --git a/users/berger/configs/librispeech/20230210_baselines/config_02f_transducer_rasr_features_am_scales.py b/users/berger/configs/librispeech/20230210_baselines/config_02f_transducer_rasr_features_am_scales.py index 320f83114..4f66ded44 100644 --- a/users/berger/configs/librispeech/20230210_baselines/config_02f_transducer_rasr_features_am_scales.py +++ 
b/users/berger/configs/librispeech/20230210_baselines/config_02f_transducer_rasr_features_am_scales.py @@ -65,10 +65,7 @@ def generate_returnn_config( } if train: - ( - network_dict, - extra_python, - ) = transducer_model.make_context_1_conformer_transducer( + (network_dict, extra_python,) = transducer_model.make_context_1_conformer_transducer( num_outputs=num_classes, specaug_args=specaug_args, conformer_args={ @@ -102,10 +99,7 @@ def generate_returnn_config( specaug_v2=specaug_v2, ) else: - ( - network_dict, - extra_python, - ) = transducer_model.make_context_1_conformer_transducer_recog( + (network_dict, extra_python,) = transducer_model.make_context_1_conformer_transducer_recog( num_outputs=num_classes, conformer_args={ "num_blocks": 12, diff --git a/users/berger/configs/librispeech/20230210_baselines/config_03a_transducer_fullsum_raw_samples.py b/users/berger/configs/librispeech/20230210_baselines/config_03a_transducer_fullsum_raw_samples.py index 2ee08d156..8a89118c7 100644 --- a/users/berger/configs/librispeech/20230210_baselines/config_03a_transducer_fullsum_raw_samples.py +++ b/users/berger/configs/librispeech/20230210_baselines/config_03a_transducer_fullsum_raw_samples.py @@ -83,6 +83,7 @@ def generate_returnn_config( "activation": "tanh", }, }, + fullsum_v2=True, ) else: (network_dict, extra_python,) = transducer_model.make_context_1_conformer_transducer_recog( diff --git a/users/berger/configs/librispeech/20230210_baselines/config_03b_transducer_fullsum_rasr_features.py b/users/berger/configs/librispeech/20230210_baselines/config_03b_transducer_fullsum_rasr_features.py index 496d6fd5b..f0cf7ca0d 100644 --- a/users/berger/configs/librispeech/20230210_baselines/config_03b_transducer_fullsum_rasr_features.py +++ b/users/berger/configs/librispeech/20230210_baselines/config_03b_transducer_fullsum_rasr_features.py @@ -49,10 +49,7 @@ def generate_returnn_config( **kwargs, ) -> ReturnnConfig: if train: - ( - network_dict, - extra_python, - ) = 
transducer_model.make_context_1_conformer_transducer_fullsum( + (network_dict, extra_python,) = transducer_model.make_context_1_conformer_transducer_fullsum( num_outputs=num_classes, specaug_args={ "max_time_num": 1, @@ -84,12 +81,10 @@ def generate_returnn_config( "activation": "tanh", }, }, + fullsum_v2=True, ) else: - ( - network_dict, - extra_python, - ) = transducer_model.make_context_1_conformer_transducer_recog( + (network_dict, extra_python,) = transducer_model.make_context_1_conformer_transducer_recog( num_outputs=num_classes, conformer_args={ "num_blocks": 12, @@ -279,27 +274,32 @@ def run_exp(alignments: Dict[str, AlignmentData], viterbi_model_checkpoint: Chec recog_args["search_parameters"].update( { - # "separate-lookahead-lm": True, + "separate-lookahead-lm": True, "label-full-sum": False, "label-pruning": 16.2, } ) - recog_args["lookahead_options"].update({"lm_lookahead_scale": 0.45}) recog_args["use_gpu"] = True recog_args["rtf"] = 100 recog_args["mem"] = 24 - system.run_recog_step_for_corpora( - recog_descriptor="fs", - recog_exp_names={"Conformer_Transducer_Fullsum_lr-0.0001_bs-9000": ["recog_ilm-0.2"]}, - corpora=[ - # "dev-clean_kazuki_transformer", - "dev-other_kazuki_transformer", - # "test-clean_kazuki_transformer", - # "test-other_kazuki_transformer", - ], - **recog_args, - ) + # recog_args["lm_scales"] = [0.8, 0.9] + # for lm_lookahead_scale in [0.3, 0.4, 0.45, 0.5, 0.6]: + recog_args["lm_scales"] = [0.9] + for lm_lookahead_scale in [0.3, 0.4, 0.45, 0.5, 0.6]: + recog_args["lookahead_options"].update({"lm_lookahead_scale": lm_lookahead_scale}) + + system.run_recog_step_for_corpora( + recog_descriptor=f"fs_lookahead-{lm_lookahead_scale}", + recog_exp_names={"Conformer_Transducer_Fullsum_lr-0.0001_bs-9000": ["recog_ilm-0.2", "recog_ilm-0.3"]}, + corpora=[ + "dev-clean_kazuki_transformer", + "dev-other_kazuki_transformer", + "test-clean_kazuki_transformer", + "test-other_kazuki_transformer", + ], + **recog_args, + ) assert 
system.summary_report return system.summary_report diff --git a/users/berger/network/helpers/rnnt_loss.py b/users/berger/network/helpers/rnnt_loss.py index 0edb8f719..613041026 100644 --- a/users/berger/network/helpers/rnnt_loss.py +++ b/users/berger/network/helpers/rnnt_loss.py @@ -1,3 +1,7 @@ +from sisyphus.delayed_ops import DelayedFormat +from i6_core.tools.git import CloneGitRepositoryJob + + def rnnt_loss(sources, blank_label=0): from returnn.extern.WarpRna import rna_loss @@ -61,6 +65,24 @@ def rnnt_loss_compressed(sources, blank_label=0): return loss +def rnnt_loss_compressed_v2(sources, blank_label: int = 0): + from tensorflow_binding.returnn_tf_op import monotonic_rnnt_loss + + logits = sources(0, as_data=True, auto_convert=False) + targets = sources(1, as_data=True, auto_convert=False) + encoder = sources(2, as_data=True, auto_convert=False) + + loss = monotonic_rnnt_loss( + logits.placeholder, + targets.get_placeholder_as_batch_major(), + encoder.get_sequence_lengths(), + targets.get_sequence_lengths(), + blank_label=blank_label, + ) + loss.set_shape((None,)) + return loss + + def add_rnnt_loss_compressed( network: dict, encoder: str, @@ -68,6 +90,7 @@ def add_rnnt_loss_compressed( targets: str, num_classes: int, blank_index: int = 0, + loss_v2: bool = False, ): network["output"] = { "class": "linear", @@ -86,7 +109,25 @@ def add_rnnt_loss_compressed( "from": ["output", targets, encoder], "loss": "as_is", "out_type": {"batch_dim_axis": 0, "time_dim_axis": None, "shape": ()}, - "eval": f'self.network.get_config().typed_value("rnnt_loss_compressed")(source, {blank_index})', } - return [rnnt_loss_compressed] + if loss_v2: + network["rnnt_loss"][ + "eval" + ] = f'self.network.get_config().typed_value("rnnt_loss_compressed_v2")(source, {blank_index})' + else: + network["rnnt_loss"][ + "eval" + ] = f'self.network.get_config().typed_value("rnnt_loss_compressed")(source, {blank_index})' + + if loss_v2: + repo = CloneGitRepositoryJob( + 
"https://github.com/SimBe195/monotonic-rnnt.git", checkout_folder_name="monotonic-rnnt" + ).out_repository + return [ + "import sys", + DelayedFormat('sys.path.append("{}")', repo), + rnnt_loss_compressed_v2, + ] + else: + return [rnnt_loss_compressed] diff --git a/users/berger/network/models/context_1_transducer.py b/users/berger/network/models/context_1_transducer.py index 38cd61d42..603b0be38 100644 --- a/users/berger/network/models/context_1_transducer.py +++ b/users/berger/network/models/context_1_transducer.py @@ -119,6 +119,7 @@ def make_context_1_conformer_transducer_fullsum( conformer_args: Dict = {}, decoder_args: Dict = {}, specaug_v2: bool = False, + fullsum_v2: bool = False, ) -> Tuple[Dict, List]: network = {} python_code = [] @@ -172,6 +173,7 @@ def make_context_1_conformer_transducer_fullsum( targets=f"base:base:{context_labels}", num_classes=num_outputs, blank_index=blank_index, + loss_v2=fullsum_v2, ) else: python_code += add_rnnt_loss( diff --git a/users/berger/network/models/context_1_transducer_raw_samples.py b/users/berger/network/models/context_1_transducer_raw_samples.py index 289cc1881..56ccea00a 100644 --- a/users/berger/network/models/context_1_transducer_raw_samples.py +++ b/users/berger/network/models/context_1_transducer_raw_samples.py @@ -112,6 +112,7 @@ def make_context_1_conformer_transducer_fullsum( vgg_args: Dict = {}, conformer_args: Dict = {}, decoder_args: Dict = {}, + fullsum_v2: bool = False, ) -> Tuple[Dict, List]: network = {} python_code = [] @@ -159,6 +160,7 @@ def make_context_1_conformer_transducer_fullsum( targets=f"base:base:{context_labels}", num_classes=num_outputs, blank_index=blank_index, + loss_v2=fullsum_v2, ) else: python_code += add_rnnt_loss( From 99647dff0ea1f952f6169ab8881da376f2379a8f Mon Sep 17 00:00:00 2001 From: "luca.gaudino" Date: Fri, 31 May 2024 11:16:26 +0200 Subject: [PATCH 074/227] fixes and update --- .../conformer_import_moh_att_2023_06_30.py | 171 ++++++++++++--- 
.../model_recogs/model_recog.py | 5 +- .../tedlium2/_import_model.py | 12 +- .../conformer_import_moh_att_2023_10_19.py | 10 +- .../tedlium2/conformer_ctc_train.py | 205 +++++++++++++----- .../tedlium2/conformer_rnnt_train.py | 20 +- .../models/asr/rf/ilm_import_2024_04_17.py | 6 +- 7 files changed, 333 insertions(+), 96 deletions(-) diff --git a/users/gaudino/experiments/rf_conformer_att_2023/librispeech_960/conformer_import_moh_att_2023_06_30.py b/users/gaudino/experiments/rf_conformer_att_2023/librispeech_960/conformer_import_moh_att_2023_06_30.py index 611d6c876..ab9778259 100644 --- a/users/gaudino/experiments/rf_conformer_att_2023/librispeech_960/conformer_import_moh_att_2023_06_30.py +++ b/users/gaudino/experiments/rf_conformer_att_2023/librispeech_960/conformer_import_moh_att_2023_06_30.py @@ -60,6 +60,7 @@ _returnn_tf_ckpt_filename = "i6_core/returnn/training/AverageTFCheckpointsJob.BxqgICRSGkgb/output/model/average.index" _torch_ckpt_filename_w_lstm_lm = "/work/asr3/zeineldeen/hiwis/luca.gaudino/setups-data/2023-08-10--rf-librispeech/work/i6_experiments/users/gaudino/returnn/convert_ckpt_rf/full_w_lm_import_2023_10_18/average.pt" _torch_ckpt_filename_w_trafo_lm = "/work/asr3/zeineldeen/hiwis/luca.gaudino/setups-data/2023-08-10--rf-librispeech/work/i6_experiments/users/gaudino/returnn/convert_ckpt_rf/librispeech/full_w_trafo_lm_import_2024_02_05/average.pt" +_torch_ckpt_filename_base_model = "/work/asr3/zeineldeen/hiwis/luca.gaudino/setups-data/2023-08-10--rf-librispeech/work/i6_experiments/users/gaudino/returnn/convert_ckpt_rf/librispeech/base_model/average.pt" # The model gets raw features (16khz) and does feature extraction internally. 
_log_mel_feature_dim = 80 @@ -124,31 +125,6 @@ def sis_run_with_prefix(prefix_name: str = None): res.output, ) - # att + lstm lm TODO: debug difference - for scales, beam_size in product([(1.0, 0.3), (1.0, 0.33), (1.0, 0.27)], []): - att_scale, lm_scale = scales - recog_name = f"/opls_att{att_scale}_lstm_lm{lm_scale}_beam{beam_size}" - name = prefix_name + recog_name - search_args = { - "beam_size": beam_size, - "add_lstm_lm": True, - "lm_scale": lm_scale, - "bsf": bsf, - } - res, _ = recog_model( - task, - model_with_checkpoint, - model_recog, - dev_sets=["dev-other"], - model_args=model_args, - search_args=search_args, - prefix_name=name, - ) - tk.register_output( - name + f"/recog_results", - res.output, - ) - # espnet ctc prefix decoder # beam 12/32: {"dev-clean": 2.83, "dev-other": 6.69, "test-clean": 3.07, "test-other": 7.02} for prior_scale, beam_size in product([0.0], []): @@ -249,6 +225,31 @@ def sis_run_with_prefix(prefix_name: str = None): # ------------------ with LSTM LM ------------------------ + # att + lstm lm TODO: debug difference + for scales, beam_size in product([(1.0, 0.3), (1.0, 0.33), (1.0, 0.27)], []): + att_scale, lm_scale = scales + recog_name = f"/opls_att{att_scale}_lstm_lm{lm_scale}_beam{beam_size}" + name = prefix_name + recog_name + search_args = { + "beam_size": beam_size, + "add_lstm_lm": True, + "lm_scale": lm_scale, + "bsf": bsf, + } + res, _ = recog_model( + task, + model_with_checkpoint, + model_recog, + dev_sets=["dev-other"], + model_args=model_args, + search_args=search_args, + prefix_name=name, + ) + tk.register_output( + name + f"/recog_results", + res.output, + ) + # att + espnet ctc prefix scorer + lstm lm for scales, prior_scale, lm_scale, beam_size in product( [(0.8, 0.2), (0.85, 0.15)], @@ -410,12 +411,20 @@ def sis_run_with_prefix(prefix_name: str = None): # ------------------ with Trafo LM ------------------------ - model_w_trafo_lm_ckpt_path = tk.Path( - _torch_ckpt_filename_w_trafo_lm, 
hash_overwrite="torch_ckpt_w_trafo_lm" + # model_w_trafo_lm_ckpt_path = tk.Path( + # _torch_ckpt_filename_w_trafo_lm, hash_overwrite="torch_ckpt_w_trafo_lm" + # ) + # model_w_trafo_lm_ckpt = PtCheckpoint(model_w_trafo_lm_ckpt_path) + # model_with_checkpoint = ModelWithCheckpoint( + # definition=from_scratch_model_def, checkpoint=model_w_trafo_lm_ckpt + # ) + # + model_ckpt_path = tk.Path( + _torch_ckpt_filename_base_model, hash_overwrite="torch_ckpt_base_model" ) - model_w_trafo_lm_ckpt = PtCheckpoint(model_w_trafo_lm_ckpt_path) + model_ckpt = PtCheckpoint(model_ckpt_path) model_with_checkpoint = ModelWithCheckpoint( - definition=from_scratch_model_def, checkpoint=model_w_trafo_lm_ckpt + definition=from_scratch_model_def, checkpoint=model_ckpt ) model_args = { @@ -435,6 +444,8 @@ def sis_run_with_prefix(prefix_name: str = None): }, } + # ilm ckpt torch: /work/asr3/zeineldeen/hiwis/luca.gaudino/setups-data/2023-08-10--rf-librispeech/work/i6_experiments/users/gaudino/returnn/convert_ckpt_rf/librispeech/mini_att_ilm_24_05_28/average.pt + # opts ctc + trafo lm TODO: fix bugs for scales, beam_size in product([(1.0, 0.5)], []): ctc_scale, lm_scale = scales @@ -470,7 +481,7 @@ def sis_run_with_prefix(prefix_name: str = None): # att + trafo lm # beam 32: {"dev-clean": 1.91, "dev-other": 4.14, "test-clean": 2.2, "test-other": 4.6} - for lm_scale, beam_size in product([0.42], [32, 40]): + for lm_scale, beam_size in product([0.42], [32]): recog_name = f"/att_trafo_lm{lm_scale}_beam{beam_size}" name = prefix_name + recog_name search_args = { @@ -570,6 +581,106 @@ def sis_run_with_prefix(prefix_name: str = None): res.output, ) + # ------------------ with MiniAtt ILM ------------------------ + + model_args = { + "external_language_model": { + "class": "Trafo_LM_Model", + "num_layers": 24, + "layer_out_dim": 1024, + "att_num_heads": 8, + "use_pos_enc": True, + "ff_activation": "relu", + }, + "internal_language_model": { + "class": "MiniAtt_ILM_Model", + "s_use_zoneout_output": 
False, + }, + "preload_from_files": { + "01_trafo_lm": { + "prefix": "language_model.", + "filename": "/work/asr3/zeineldeen/hiwis/luca.gaudino/setups-data/2023-08-10--rf-librispeech/work/i6_experiments/users/gaudino/returnn/convert_ckpt_rf/librispeech/trafo_lm_only_24_02_06/network.023.pt", + }, + "01_mini_att_ilm": { + "prefix": "ilm.", + "filename": "/work/asr3/zeineldeen/hiwis/luca.gaudino/setups-data/2023-08-10--rf-librispeech/work/i6_experiments/users/gaudino/returnn/convert_ckpt_rf/librispeech/mini_att_ilm_24_05_28/average.pt", + }, + }, + } + + # ilm ckpt torch: /work/asr3/zeineldeen/hiwis/luca.gaudino/setups-data/2023-08-10--rf-librispeech/work/i6_experiments/users/gaudino/returnn/convert_ckpt_rf/librispeech/mini_att_ilm_24_05_28/average.pt + + # att + trafo lm + ilm + # + for lm_scale, ilm_scale, beam_size in product([0.54], [0.4], [32, 64]): + recog_name = f"/att_trafolm{lm_scale}_ilm{ilm_scale}_beam{beam_size}_ffix" + name = prefix_name + recog_name + search_args = { + "beam_size": beam_size, + "add_trafo_lm": True, + "lm_scale": lm_scale, + "ilm_scale": ilm_scale, + "bsf": bsf, + "use_lm_first_label": True, + } + res, _ = recog_model( + task, + model_with_checkpoint, + model_recog, + dev_sets=["dev-other"], + model_args=model_args, + search_args=search_args, + prefix_name=name, + ) + tk.register_output( + name + f"/recog_results", + res.output, + ) + + # opls att + ctc + trafo lm + ilm + for scales, prior_scale, lm_scale, ilm_scale, beam_size in product( + [(0.85, 0.15)], [0.0], [0.5], [0.3, 0.35, 0.4, 0.45], [12, 32] + ): + att_scale, ctc_scale = scales + recog_name = ( + f"/opls_att{att_scale}_ctc{ctc_scale}" + + (f"_prior{prior_scale}" if prior_scale > 0.0 else "") + + f"_trafo_lm{lm_scale}" + + f"_ilm{ilm_scale}" + + f"_beam{beam_size}_ffix" + ) + name = prefix_name + recog_name + search_args = { + "beam_size": beam_size, + "add_trafo_lm": True, + "lm_scale": lm_scale, + "att_scale": att_scale, + "ctc_scale": ctc_scale, + "ilm_scale": ilm_scale, + 
"use_ctc": True, + "bsf": bsf, + "prior_corr": prior_scale > 0.0, + "ctc_prior_file": "/work/asr3/zeineldeen/hiwis/luca.gaudino/setups-data/2023-02-22--conformer-swb/work/i6_core/returnn/extract_prior/ReturnnComputePriorJobV2.ZeflcEHlQTjn/output/prior.txt", + "prior_scale": prior_scale, + "use_lm_first_label": True, + } + res, _ = recog_model( + task, + model_with_checkpoint, + model_recog, + dev_sets=["dev-other"], + model_args=model_args, + search_args=search_args, + prefix_name=name, + # device="cpu", + # search_mem_rqmt=15, + ) + tk.register_output( + name + f"/recog_results", + res.output, + ) + + py = sis_run_with_prefix # if run directly via `sis m ...` diff --git a/users/gaudino/experiments/rf_conformer_att_2023/librispeech_960/model_recogs/model_recog.py b/users/gaudino/experiments/rf_conformer_att_2023/librispeech_960/model_recogs/model_recog.py index 5c16781ef..ed496388c 100644 --- a/users/gaudino/experiments/rf_conformer_att_2023/librispeech_960/model_recogs/model_recog.py +++ b/users/gaudino/experiments/rf_conformer_att_2023/librispeech_960/model_recogs/model_recog.py @@ -166,15 +166,12 @@ def model_recog( lm_state = lm_out["state"] lm_log_prob = rf.log_softmax(lm_out["output"], axis=model.target_dim) - - - if not model.search_args.get("use_lm_first_label", False) and i > 0: + if model.search_args.get("use_lm_first_label", True) or i > 0: label_log_prob = ( label_log_prob + model.search_args["lm_scale"] * lm_log_prob ) if model.search_args.get("ilm_scale", 0.0) > 0: - # breakpoint() ilm_out = model.ilm(input_embed, state=ilm_state, spatial_dim=single_step_dim) ilm_state = ilm_out["state"] ilm_log_prob = rf.log_softmax(ilm_out["output"], axis=model.target_dim) diff --git a/users/gaudino/experiments/rf_conformer_att_2023/tedlium2/_import_model.py b/users/gaudino/experiments/rf_conformer_att_2023/tedlium2/_import_model.py index c2a7de778..2d63bc745 100644 --- a/users/gaudino/experiments/rf_conformer_att_2023/tedlium2/_import_model.py +++ 
b/users/gaudino/experiments/rf_conformer_att_2023/tedlium2/_import_model.py @@ -657,12 +657,13 @@ def import_models(): if __name__ == "__main__": - import_models() + # import_models() # convert_lm( # _ted2_lm_ckpt_filename, # "/work/asr3/zeineldeen/hiwis/luca.gaudino/setups-data/2023-08-10--rf-librispeech/work/i6_experiments/users/gaudino/returnn/convert_ckpt_rf/tedlium2/trafo_lm_only_24_02_05", # 1057, # ) + # Ted2 ILM # convert_mini_att_ilm( # ckpt_path_prior="/u/zeineldeen/setups/ubuntu_22_setups/2023-04-17--conformer-att/work/i6_core/returnn/training/AverageTFCheckpointsJob.yB4JK4GDCxWG/output/model/average", # ckpt_path_mini_att="/u/zeineldeen/setups/ubuntu_22_setups/2023-04-17--conformer-att/work/i6_core/returnn/training/GetBestTFCheckpointJob.70hGEsLQ6ynw/output/model/checkpoint", @@ -670,3 +671,12 @@ def import_models(): # model_target_dim=1057, # out_dir="/work/asr3/zeineldeen/hiwis/luca.gaudino/setups-data/2023-08-10--rf-librispeech/work/i6_experiments/users/gaudino/returnn/convert_ckpt_rf/tedlium2/mini_att_ilm_24_04_21", # ) + + # ls960 ILM + convert_mini_att_ilm( + ckpt_path_prior="/u/zeineldeen/setups/ubuntu_22_setups/2023-04-17--conformer-att/work/i6_core/returnn/training/AverageTFCheckpointsJob.BxqgICRSGkgb/output/model/average", + ckpt_path_mini_att="/u/zeineldeen/setups/ubuntu_22_setups/2023-04-17--conformer-att/work/i6_core/returnn/training/GetBestTFCheckpointJob.JLwxrydala1K/output/model/checkpoint", + model_in_dim=640, + model_target_dim=10025, + out_dir="/work/asr3/zeineldeen/hiwis/luca.gaudino/setups-data/2023-08-10--rf-librispeech/work/i6_experiments/users/gaudino/returnn/convert_ckpt_rf/librispeech/mini_att_ilm_24_05_28", + ) diff --git a/users/gaudino/experiments/rf_conformer_att_2023/tedlium2/conformer_import_moh_att_2023_10_19.py b/users/gaudino/experiments/rf_conformer_att_2023/tedlium2/conformer_import_moh_att_2023_10_19.py index e653183c9..d89d6fbf1 100644 --- 
a/users/gaudino/experiments/rf_conformer_att_2023/tedlium2/conformer_import_moh_att_2023_10_19.py +++ b/users/gaudino/experiments/rf_conformer_att_2023/tedlium2/conformer_import_moh_att_2023_10_19.py @@ -518,10 +518,10 @@ def sis_run_with_prefix(prefix_name: str = None): ) # opls att + ctc + trafo lm + ilm - # 5.78 with att 0.7, ctc 0.3, prior 0.7, trafo 0.6, ilm 0.45 - for model_name, beam_size, lm_scale in product(["model_baseline"], [12], [0.6, 0.62, 0.64, 0.66, 0.68, 0.7]): - for scales in [(0.7, 0.3, 0.7, 0.45)]: - att_scale, ctc_scale, prior_scale, ilm_scale = scales + # 5.74 with att 0.7, ctc 0.3, prior 0.7, trafo 0.6, ilm 0.45 + for model_name, beam_size in product(["model_baseline"], [12, 24]): + for scales in [(0.7, 0.3, 0.7, 0.6, 0.45)]: + att_scale, ctc_scale, prior_scale, lm_scale, ilm_scale = scales ilm_model_args = copy.deepcopy(models_with_pt_ckpt[model_name]["model_args"]) ilm_model_args["preload_from_files"] = preload_from_files_ilm @@ -552,7 +552,7 @@ def sis_run_with_prefix(prefix_name: str = None): task, models_with_pt_ckpt[model_name]["ckpt"], model_recog, - dev_sets=["dev"], + dev_sets=["dev", "test"], model_args=ilm_model_args, search_args=search_args, prefix_name=name, diff --git a/users/gaudino/experiments/rf_conformer_rnnt_2024/tedlium2/conformer_ctc_train.py b/users/gaudino/experiments/rf_conformer_rnnt_2024/tedlium2/conformer_ctc_train.py index 74628b703..3fd2c0362 100644 --- a/users/gaudino/experiments/rf_conformer_rnnt_2024/tedlium2/conformer_ctc_train.py +++ b/users/gaudino/experiments/rf_conformer_rnnt_2024/tedlium2/conformer_ctc_train.py @@ -70,6 +70,7 @@ def sis_run_with_prefix(prefix_name: Optional[str] = None): "weight_decay": 1e-6, }, # accum_grad_multiple_step=4, + accum_grad_multiple_step=2, # gradient_noise=0.0, learning_rate=2.5e-3, dynamic_learning_rate=dyn_lr_lin_warmup_invsqrt_decay, @@ -77,7 +78,6 @@ def sis_run_with_prefix(prefix_name: Optional[str] = None): learning_rate_invsqrt_norm=40_000, 
max_seq_length_default_target=None, gradient_clip_global_norm=5.0, - accum_grad_multiple_step=2, aux_loss_layers=[4,8], # rnnt_loss=False, ) @@ -246,24 +246,154 @@ def sis_run_with_prefix(prefix_name: Optional[str] = None): # gpu_mem=11, # ) - train_exp( # - "from-scratch-11gb_lrmaxs522k_lrmin8e-5_lrmax8e-4_aux4_8_adjSpec", + # train_exp( # does not converge + # "from-scratch-11gb_lrmaxs522k_lrmin8e-5_lrmax8e-4_aux4_8_adjSpec", + # ctc_train_config, + # config_updates={ + # "learning_rate": 1.0, + # "dynamic_learning_rate": dyn_lr_piecewise_linear, + # # total steps after 2000 epochs: 982.312 + # "learning_rate_piecewise_steps": [261_000, 522_000, 580_000], # 45% 45 % 10% + # "learning_rate_piecewise_values": [8e-5, 8e-4, 8e-5, 1e-6], + # "aux_loss_layers": [4, 8], + # "specaugment_steps": (5_900, 18_000, 36_000), + # }, + # num_epochs=400, + # gpu_mem=11, + # ) + # + # train_exp( # does not converge + # "from-scratch-11gb_lrmaxs522k_lrmin1e-5_lrmax1e-3_aux4_8_adjSpec", + # ctc_train_config, + # config_updates={ + # "learning_rate": 1.0, + # "dynamic_learning_rate": dyn_lr_piecewise_linear, + # # total steps after 2000 epochs: 982.312 + # "learning_rate_piecewise_steps": [261_000, 522_000, 580_000], # 45% 45 % 10% + # "learning_rate_piecewise_values": [1e-5, 1e-3, 1e-5, 1e-6], + # "aux_loss_layers": [4, 8], + # "specaugment_steps": (5_900, 18_000, 36_000), + # }, + # num_epochs=400, + # gpu_mem=11, + # ) + + # train_exp( # does not converge + # "from-scratch-11gb_lrmaxs522k_lrmin8e-5_lrmax8e-4_aux4_8_adjSpec2", + # ctc_train_config, + # config_updates={ + # "learning_rate": 1.0, + # "dynamic_learning_rate": dyn_lr_piecewise_linear, + # # total steps after 2000 epochs: 982.312 + # "learning_rate_piecewise_steps": [261_000, 522_000, 580_000], # 45% 45 % 10% + # "learning_rate_piecewise_values": [8e-5, 8e-4, 8e-5, 1e-6], + # "aux_loss_layers": [4, 8], + # "specaugment_steps": (25_000, 50_000, 100_000), + # }, + # config_deletes=["learning_rate_warmup_steps", 
"learning_rate_invsqrt_norm"], + # num_epochs=400, + # gpu_mem=11, + # ) + # + # train_exp( # does not converge + # "from-scratch-11gb_lrmaxs522k_lrmin1e-5_lrmax1e-3_aux4_8_adjSpec2", + # ctc_train_config, + # config_updates={ + # "learning_rate": 1.0, + # "dynamic_learning_rate": dyn_lr_piecewise_linear, + # # total steps after 2000 epochs: 982.312 + # "learning_rate_piecewise_steps": [261_000, 522_000, 580_000], # 45% 45 % 10% + # "learning_rate_piecewise_values": [1e-5, 1e-3, 1e-5, 1e-6], + # "aux_loss_layers": [4, 8], + # "specaugment_steps": (25_000, 50_000, 100_000), + # }, + # config_deletes=["learning_rate_warmup_steps", "learning_rate_invsqrt_norm"], + # num_epochs=400, + # gpu_mem=11, + # ) + + # TODO: try with pretrain + + # train_exp( # does not converge + # "from-scratch-11gb_pre3_lrmaxs522k_lrmin1e-5_lrmax1e-3_aux4_8_adjSpec", + # ctc_train_config, + # config_updates={ + # "learning_rate": 1.0, + # "dynamic_learning_rate": dyn_lr_piecewise_linear, + # # total steps after 2000 epochs: 982.312 + # "learning_rate_piecewise_steps": [261_000, 522_000, 580_000], # 45% 45 % 10% + # "learning_rate_piecewise_values": [1e-5, 1e-3, 1e-5, 1e-6], + # "aux_loss_layers": [4, 8], + # "specaugment_steps": (5_900, 18_000, 36_000), + # "pretrain_opts": { # pretrain + # "steps": [ + # (8 * 500, {"num_layers": 2}), + # (4 * 500, {"num_layers": 4}), + # (4 * 500, {"num_layers": 8}), + # ] + # }, + # }, + # num_epochs=400, + # gpu_mem=11, + # ) + + # TODO: try with epoch base lr schedule + ep = 400 + lr = 8e-4 + cyc_ep = int(0.45 * ep) + + # train_exp( # does not converge + # "from-scratch-11gb_ep_based_lr_aux4_8", + # ctc_train_config, + # config_updates={ + # "learning_rate": 1.0, + # "dynamic_learning_rate": None, + # # total steps after 2000 epochs: 982.312 + # # "learning_rate_piecewise_steps": [261_000, 522_000, 580_000], # 45% 45 % 10% + # # "learning_rate_piecewise_values": [1e-5, 1e-3, 1e-5, 1e-6], + # "aux_loss_layers": [4, 8], + # # "specaugment_steps": (5_900, 
18_000, 36_000), + # # "pretrain_opts": { # pretrain + # # "steps": [ + # # (8 * 500, {"num_layers": 2}), + # # (4 * 500, {"num_layers": 4}), + # # (4 * 500, {"num_layers": 8}), + # # ] + # # }, + # "learning_rates": ( + # list(np.linspace(lr / 10, lr, cyc_ep)) + # + list(np.linspace(lr, lr / 10, cyc_ep)) + # + list(np.linspace(lr / 10, 1e-6, ep - 2 * cyc_ep)) + # ) + # }, + # num_epochs=400, + # gpu_mem=11, + # ) + + train_exp( + "from-scratch-11gb_lrmin1e-5_lrmax1e-3_aux4_8_adjSpec6k_noCurrL", ctc_train_config, config_updates={ "learning_rate": 1.0, "dynamic_learning_rate": dyn_lr_piecewise_linear, # total steps after 2000 epochs: 982.312 "learning_rate_piecewise_steps": [261_000, 522_000, 580_000], # 45% 45 % 10% - "learning_rate_piecewise_values": [8e-5, 8e-4, 8e-5, 1e-6], + "learning_rate_piecewise_values": [1e-5, 1e-3, 1e-5, 1e-6], "aux_loss_layers": [4, 8], "specaugment_steps": (5_900, 18_000, 36_000), + "epoch_wise_filter": { + # (1, 5): {"max_mean_len": 1000}, # better? + # older settings: + # (1, 5): {"max_mean_len": 200}, + # (6, 10): {"max_mean_len": 500}, + }, }, num_epochs=400, gpu_mem=11, ) - train_exp( # - "from-scratch-11gb_lrmaxs522k_lrmin1e-5_lrmax1e-3_aux4_8_adjSpec", + train_exp( + "from-scratch-11gb_lrmin1e-5_lrmax1e-3_aux4_8_adjSpec6k_adjCurrL", ctc_train_config, config_updates={ "learning_rate": 1.0, @@ -273,15 +403,21 @@ def sis_run_with_prefix(prefix_name: Optional[str] = None): "learning_rate_piecewise_values": [1e-5, 1e-3, 1e-5, 1e-6], "aux_loss_layers": [4, 8], "specaugment_steps": (5_900, 18_000, 36_000), + "epoch_wise_filter": { + # (1, 5): {"max_mean_len": 1000}, # better? 
+ # older settings: + # (1, 5): {"max_mean_len": 200}, + # (6, 10): {"max_mean_len": 500}, + (1,2): {"max_mean_len": 400}, + (2,4): {"max_mean_len": 800}, + }, }, num_epochs=400, gpu_mem=11, ) - # TODO: try with pretrain - - train_exp( # - "from-scratch-11gb_pre3_lrmaxs522k_lrmin1e-5_lrmax1e-3_aux4_8_adjSpec", + train_exp( + "from-scratch-11gb_lrmin1e-5_lrmax1e-3_aux4_8_adjSpec6k_accumGrad4", ctc_train_config, config_updates={ "learning_rate": 1.0, @@ -291,47 +427,9 @@ def sis_run_with_prefix(prefix_name: Optional[str] = None): "learning_rate_piecewise_values": [1e-5, 1e-3, 1e-5, 1e-6], "aux_loss_layers": [4, 8], "specaugment_steps": (5_900, 18_000, 36_000), - "pretrain_opts": { # pretrain - "steps": [ - (8 * 500, {"num_layers": 2}), - (4 * 500, {"num_layers": 4}), - (4 * 500, {"num_layers": 8}), - ] - }, - }, - num_epochs=400, - gpu_mem=11, - ) - - # TODO: try with epoch base lr schedule - ep = 400 - lr = 8e-4 - cyc_ep = int(0.45 * ep) - - train_exp( # - "from-scratch-11gb_ep_based_lr_aux4_8", - ctc_train_config, - config_updates={ - "learning_rate": 1.0, - # "dynamic_learning_rate": dyn_lr_piecewise_linear, - # total steps after 2000 epochs: 982.312 - # "learning_rate_piecewise_steps": [261_000, 522_000, 580_000], # 45% 45 % 10% - # "learning_rate_piecewise_values": [1e-5, 1e-3, 1e-5, 1e-6], - "aux_loss_layers": [4, 8], - # "specaugment_steps": (5_900, 18_000, 36_000), - # "pretrain_opts": { # pretrain - # "steps": [ - # (8 * 500, {"num_layers": 2}), - # (4 * 500, {"num_layers": 4}), - # (4 * 500, {"num_layers": 8}), - # ] - # }, - "learning_rates": ( - list(numpy.linspace(lr / 10, lr, cyc_ep)) - + list(numpy.linspace(lr, lr / 10, cyc_ep)) - + list(numpy.linspace(lr / 10, 1e-6, ep - 2 * cyc_ep)) - ) + "accum_grad_multiple_step": 4, }, + config_deletes=["learning_rate_warmup_steps", "learning_rate_invsqrt_norm"], num_epochs=400, gpu_mem=11, ) @@ -405,7 +503,8 @@ def train_exp( _sis_setup_global_prefix() prefix = _sis_prefix + "/" + name - task = _get_ted2_task() + 
epoch_wise_filter = config.pop("epoch_wise_filter", None) + task = _get_ted2_task(epoch_wise_filter=epoch_wise_filter) config = config.copy() config = dict_update_deep(config, config_updates, config_deletes) if "__num_epochs" in config: @@ -506,7 +605,7 @@ def _get_ls_task(): return _ls_task -def _get_ted2_task(): +def _get_ted2_task(epoch_wise_filter=None): global _ted2_task if _ted2_task: return _ted2_task @@ -515,7 +614,7 @@ def _get_ted2_task(): get_tedlium2_task_bpe1k_raw, ) - _ted2_task = get_tedlium2_task_bpe1k_raw(with_eos_postfix=True, train_epoch_wise_filter=None) + _ted2_task = get_tedlium2_task_bpe1k_raw(with_eos_postfix=True, train_epoch_wise_filter=epoch_wise_filter) return _ted2_task diff --git a/users/gaudino/experiments/rf_conformer_rnnt_2024/tedlium2/conformer_rnnt_train.py b/users/gaudino/experiments/rf_conformer_rnnt_2024/tedlium2/conformer_rnnt_train.py index 9718aed90..beae49311 100644 --- a/users/gaudino/experiments/rf_conformer_rnnt_2024/tedlium2/conformer_rnnt_train.py +++ b/users/gaudino/experiments/rf_conformer_rnnt_2024/tedlium2/conformer_rnnt_train.py @@ -79,7 +79,7 @@ def sis_run_with_prefix(prefix_name: Optional[str] = None): # train_exp("base-11gb", config_11gb, gpu_mem=11) # train_exp("base-11gb-v1", my_config_11gb, num_epochs=400, gpu_mem=11) - train_exp( + train_exp( # "from-scratch-11gb", rnnt_train_config, config_updates={ @@ -91,6 +91,24 @@ def sis_run_with_prefix(prefix_name: Optional[str] = None): "learning_rate_piecewise_steps": [261_000, 522_000, 580_000], # 45% 45 % 10% "learning_rate_piecewise_values": [1e-5, 1e-3, 1e-5, 1e-6], }, + config_deletes=["learning_rate_warmup_steps", "learning_rate_invsqrt_norm"], + num_epochs=400, + gpu_mem=11, + ) + + train_exp( # does not converge + "from-scratch-11gb", + rnnt_train_config, + config_updates={ + "learning_rate": 1.0, + "dynamic_learning_rate": dyn_lr_piecewise_linear, + # total steps after 2000 epochs: 982.312 + # "learning_rate_piecewise_steps": [600_000, 900_000, 982_000], 
+ # "learning_rate_piecewise_values": [1e-5, 1e-3, 1e-5, 1e-6], + "learning_rate_piecewise_steps": [261_000, 522_000, 580_000], # 45% 45 % 10% + "learning_rate_piecewise_values": [1e-5, 1e-3, 1e-5, 1e-6], + }, + config_deletes=["learning_rate_warmup_steps", "learning_rate_invsqrt_norm"], num_epochs=400, gpu_mem=24, ) diff --git a/users/gaudino/models/asr/rf/ilm_import_2024_04_17.py b/users/gaudino/models/asr/rf/ilm_import_2024_04_17.py index e7f671c18..adc3c60c2 100644 --- a/users/gaudino/models/asr/rf/ilm_import_2024_04_17.py +++ b/users/gaudino/models/asr/rf/ilm_import_2024_04_17.py @@ -25,6 +25,7 @@ def __init__( mini_att_lstm_dim: int = 50, mini_att_out_dim: int = 512, prior_dim: int = 1024, + s_use_zoneout_output: bool = True, # for ted2, for ls960 set to false # layer_out_dim: int = 768, # default values for ted2 trafo lm # layer_ff_dim: int = 4096, # embed_dim: int = 128, @@ -49,13 +50,14 @@ def __init__( self.mini_att_lstm_dim, self.mini_att_out_dim, with_bias=True ) + self.s_use_zoneout_output = s_use_zoneout_output + self.prior_s = rf.ZoneoutLSTM( in_dim + self.mini_att_out_dim, self.prior_dim, zoneout_factor_cell=0.15, zoneout_factor_output=0.05, - # use_zoneout_output=False, # like RETURNN/TF ZoneoutLSTM old default - use_zoneout_output=True, + use_zoneout_output=self.s_use_zoneout_output, # parts_order="icfo", # like RETURNN/TF ZoneoutLSTM # parts_order="ifco", parts_order="jifo", # NativeLSTM (the code above converts it...) 
From 6bd01c34aea62eb44113a061afa268284aee9692 Mon Sep 17 00:00:00 2001 From: "luca.gaudino" Date: Fri, 31 May 2024 11:17:59 +0200 Subject: [PATCH 075/227] add CTC gauss weights --- .../tedlium2/attention_asr_config.py | 17 +- .../tedlium2/configs/ted2_att_baseline.py | 92 ++++- .../gaudino/models/asr/decoder/ctc_decoder.py | 12 + .../models/asr/encoder/conformer_encoder.py | 385 ++++++++++++++---- 4 files changed, 423 insertions(+), 83 deletions(-) diff --git a/users/gaudino/experiments/conformer_att_2023/tedlium2/attention_asr_config.py b/users/gaudino/experiments/conformer_att_2023/tedlium2/attention_asr_config.py index 1321f7cc5..5abb616b6 100644 --- a/users/gaudino/experiments/conformer_att_2023/tedlium2/attention_asr_config.py +++ b/users/gaudino/experiments/conformer_att_2023/tedlium2/attention_asr_config.py @@ -412,6 +412,10 @@ class ConformerEncoderArgs(EncoderArgs): ctc_self_align_scale: float = 0.5 ctc_dropout: float = 0.0 enc_layer_w_ctc: Optional[int] = None + ctc_att_weights_gauss: bool = False + ctc_att_weights_gauss_stddev: float = 1.0 + ctc_att_weights_gauss_window: int = 5 + ctc_att_weights_use_enc: bool = True # param init ff_init: Optional[str] = None @@ -615,6 +619,10 @@ class CTCDecoderArgs(DecoderArgs): renorm_after_remove_blank: bool = True recombine: bool = False max_approx: bool = False + train: bool = False + + # not used + coverage_scale: bool = False def create_config( @@ -813,10 +821,11 @@ def create_config( elif isinstance(decoder_args, CTCDecoderArgs): decoder_type = CTCDecoder dec_type = "ctc" - exp_config["extern_data"]["bpe_labels_w_blank"] = copy.deepcopy( - exp_config["extern_data"]["bpe_labels"] - ) - exp_config["extern_data"]["bpe_labels_w_blank"]["dim"] += 1 + if not decoder_args.train: + exp_config["extern_data"]["bpe_labels_w_blank"] = copy.deepcopy( + exp_config["extern_data"]["bpe_labels"] + ) + exp_config["extern_data"]["bpe_labels_w_blank"]["dim"] += 1 else: assert False, "invalid decoder_args type" diff --git 
a/users/gaudino/experiments/conformer_att_2023/tedlium2/configs/ted2_att_baseline.py b/users/gaudino/experiments/conformer_att_2023/tedlium2/configs/ted2_att_baseline.py index eecc1e543..60c2d1423 100644 --- a/users/gaudino/experiments/conformer_att_2023/tedlium2/configs/ted2_att_baseline.py +++ b/users/gaudino/experiments/conformer_att_2023/tedlium2/configs/ted2_att_baseline.py @@ -1104,10 +1104,100 @@ def get_base_v1_args(lr, ep, enc_drop=0.1, pretrain_reps=3, use_legacy_stats=Tru ) ) prior_file_ctc_only = compute_ctc_prior( - only_ctc_name + "default_last", prior_args, last_checkpoint, bpe_size=BPE_1K + only_ctc_name + "/default_last", prior_args, last_checkpoint, bpe_size=BPE_1K ) # best checkpoint path "/u/zeineldeen/setups/ubuntu_22_setups/2023-04-17--conformer-att/work/i6_core/returnn/training/ReturnnTrainingJob.9o6iL7eblZwa/output/models/epoch.400" + # train only CTC no pretrain + no_pre_args = copy.deepcopy(args) + no_pre_args["decoder_args"] = CTCDecoderArgs(train=True) + no_pre_args["with_pretrain"] = False + only_ctc_name = f"base_bpe1000_peakLR{lr}_ep{ep}_globalNorm_epochOCLR_fixZoneout_encDrop{enc_drop}_woDepthConvPre_weightDrop0.1_decAttDrop0.0_embedDim256_numBlocks12_onlyCTC" + + # _, train_data = run_exp( + # only_ctc_name, + # no_pre_args, + # num_epochs=ep, + # epoch_wise_filter=None, + # bpe_size=BPE_1K, + # partition_epoch=4, + # search_args={ "decoder_args": CTCDecoderArgs(), **no_pre_args}, + # ) + + + # train only CTC with guassian "att weights" + + only_ctc_args = copy.deepcopy(args) + only_ctc_args["decoder_args"].ce_loss_scale = 0.0 + only_ctc_args["encoder_args"].ctc_att_weights_gauss = True + only_ctc_args["encoder_args"].ctc_att_weights_gauss_stddev = 1.0 + only_ctc_args["encoder_args"].ctc_att_weights_gauss_window = 5 + + only_ctc_args2 = copy.deepcopy(only_ctc_args) + + for use_enc in [False, True]: + only_ctc_args["encoder_args"].ctc_att_weights_use_enc = use_enc + only_ctc_name = name + "_onlyCTC_gaussWeights" + ("_no_enc" if not 
use_enc else "") + + if(use_enc): + only_ctc_args["gradient_clip_global_norm"] = 5.0 + only_ctc_name += "_gradClip5.0" + + _, train_data = run_exp( + only_ctc_name, + only_ctc_args, + num_epochs=ep, + epoch_wise_filter=None, + bpe_size=BPE_1K, + partition_epoch=4, + search_args={"ctc_decode": True, "ctc_blank_idx": 1057, **only_ctc_args}, + ) + + only_ctc_args["decoder_args"] = CTCDecoderArgs() + + last_checkpoint = Checkpoint( + tk.Path( + "/work/asr3/zeineldeen/hiwis/luca.gaudino/setups-data/2023-10-15--conformer-no-app/work/i6_core/returnn/training/ReturnnTrainingJob.4dFO6QJQ4h7x/output/models/epoch.400.index" + ) + ) + prior_file_ctc_only = compute_ctc_prior( + name + "_onlyCTC_gaussWeights" + "_gradClip5.0" + "/default_last", only_ctc_args, last_checkpoint, bpe_size=BPE_1K + ) + + only_ctc_args.pop("gradient_clip_global_norm") + only_ctc_args["encoder_args"].ctc_att_weights_use_enc = False + + last_checkpoint = Checkpoint( + tk.Path( + "/work/asr3/zeineldeen/hiwis/luca.gaudino/setups-data/2023-10-15--conformer-no-app/work/i6_core/returnn/training/ReturnnTrainingJob.4v01A22bWufz/output/models/epoch.400.index" + ) + ) + prior_file_ctc_only = compute_ctc_prior( + name + "_onlyCTC_gaussWeights" + "_no_enc" + "/default_last", only_ctc_args, last_checkpoint, bpe_size=BPE_1K + ) + + + + + for use_enc, std, window in product([], [0.5, 1.0, 2.0], [3, 5, 10]): + only_ctc_args = copy.deepcopy(only_ctc_args2) + only_ctc_args["encoder_args"].ctc_att_weights_gauss_stddev = std + only_ctc_args["encoder_args"].ctc_att_weights_gauss_window = window + only_ctc_args["encoder_args"].ctc_att_weights_use_enc = use_enc + only_ctc_args["gradient_clip_global_norm"] = 5.0 + + only_ctc_name = name + f"_onlyCTC_gaussWeights_std{std}_window{window}" + ("_no_enc" if not use_enc else "") + + _, train_data = run_exp( + only_ctc_name, + only_ctc_args, + num_epochs=ep, + epoch_wise_filter=None, + bpe_size=BPE_1K, + partition_epoch=4, + search_args={"ctc_decode": True, "ctc_blank_idx": 1057, 
**only_ctc_args}, + ) + # train scale CTC scale_ctc_name = name + "_ctcScale0.3" scale_ctc_args = copy.deepcopy(args) diff --git a/users/gaudino/models/asr/decoder/ctc_decoder.py b/users/gaudino/models/asr/decoder/ctc_decoder.py index ef64002e7..58caa11c3 100644 --- a/users/gaudino/models/asr/decoder/ctc_decoder.py +++ b/users/gaudino/models/asr/decoder/ctc_decoder.py @@ -303,6 +303,11 @@ def __init__( renorm_after_remove_blank=True, recombine=False, max_approx=False, + train = False, + + # not used + coverage_scale=False, + ): """ :param base_model: base/encoder model instance @@ -430,6 +435,8 @@ def __init__( self.dec_output = None self.output_prob = None + self.train = train + def get_python_prolog(self): """Called in attention_asr_config to add ctc decoder specific python code to the config.""" python_prolog = [] @@ -1570,6 +1577,11 @@ def add_blank_collapse(self): self.ctc_source = "blank_collapse_apply" def create_network(self): + + if self.train: + self.decision_layer_name = "dummy" + return + self.decision_layer_name = "out_best_wo_blank" # modify ctc source diff --git a/users/gaudino/models/asr/encoder/conformer_encoder.py b/users/gaudino/models/asr/encoder/conformer_encoder.py index 4cc13678f..c9617a093 100644 --- a/users/gaudino/models/asr/encoder/conformer_encoder.py +++ b/users/gaudino/models/asr/encoder/conformer_encoder.py @@ -16,6 +16,8 @@ from i6_core.returnn.config import CodeWrapper +import numpy as np + class ConformerEncoder: """ @@ -56,9 +58,11 @@ def __init__( ctc_opts=None, ctc_self_align_delay: Optional[int] = None, ctc_self_align_scale: float = 0.5, - enc_layer_w_ctc: Optional[int] = None, - + ctc_att_weights_gauss=False, + ctc_att_weights_gauss_stddev=1.0, + ctc_att_weights_gauss_window=5, + ctc_att_weights_use_enc=True, subsample=None, start_conv_init=None, conv_module_init=None, @@ -89,7 +93,6 @@ def __init__( mhsa_weight_dropout=None, conv_weight_dropout=None, memory_variant_opts: Optional[ConformerMemoryVariantOpts] = None, - 
conv_use_time_mask=False, ): """ @@ -168,7 +171,9 @@ def __init__( bn_momentum = batch_norm_opts.pop("momentum", 0.1) bn_eps = batch_norm_opts.pop("epsilon", 1e-3) - bn_update_sample_only_in_train = batch_norm_opts.pop("update_sample_only_in_training", True) + bn_update_sample_only_in_train = batch_norm_opts.pop( + "update_sample_only_in_training", True + ) bn_delay_sample_update = batch_norm_opts.pop("delay_sample_update", True) self.batch_norm_opts = { "momentum": bn_momentum, @@ -190,6 +195,10 @@ def __init__( self.ctc_self_align_scale = ctc_self_align_scale self.enc_layer_w_ctc = enc_layer_w_ctc + self.ctc_att_weights_gauss = ctc_att_weights_gauss + self.ctc_att_weights_gauss_stddev = ctc_att_weights_gauss_stddev + self.ctc_att_weights_gauss_window = ctc_att_weights_gauss_window + self.ctc_att_weights_use_enc = ctc_att_weights_use_enc self.start_conv_init = start_conv_init self.conv_module_init = conv_module_init @@ -223,7 +232,9 @@ def __init__( self.use_sqrd_relu = use_sqrd_relu self.use_causal_layers = use_causal_layers - self.use_causal_conv = use_causal_conv if use_causal_conv is not None else self.use_causal_layers + self.use_causal_conv = ( + use_causal_conv if use_causal_conv is not None else self.use_causal_layers + ) self.conv_alternative_name = conv_alternative_name self.fix_merge_dims = fix_merge_dims @@ -245,13 +256,19 @@ def __init__( if self.memory_variant_opts: self.concat_window_dim = SpatialDim("concat-window") # W*N self.enc_att_num_heads_dim = SpatialDim("enc-att-num-heads", att_num_heads) - self.enc_per_head_dim = FeatureDim("enc-dim-per-head", self.enc_key_per_head_dim) + self.enc_per_head_dim = FeatureDim( + "enc-dim-per-head", self.enc_key_per_head_dim + ) if self.memory_variant_opts.conv_cache_size: self.conv_cache_concat_dim = SpatialDim("conv-cache-concat") if self.memory_variant_opts.use_emformer_mem: - self.emformer_mem_bank_dim = SpatialDim("emformer-mem-bank") # M, the same as C but different tag + self.emformer_mem_bank_dim = 
SpatialDim( + "emformer-mem-bank" + ) # M, the same as C but different tag self.emformer_ext_query_dim = SpatialDim("emformer-ext-query") # W+1 - self.concat_window_with_mem_dim = SpatialDim("concat-window-with-mem") # W*N+M + self.concat_window_with_mem_dim = SpatialDim( + "concat-window-with-mem" + ) # W*N+M self.conv_use_time_mask = conv_use_time_mask @@ -282,7 +299,9 @@ def _create_ff_module(self, prefix_name, i, source, layer_index): ) if self.use_sqrd_relu: - swish_act = self.network.add_activation_layer("{}_relu".format(prefix_name), ff1, activation="relu") + swish_act = self.network.add_activation_layer( + "{}_relu".format(prefix_name), ff1, activation="relu" + ) swish_act = self.network.add_eval_layer( "{}_square_relu".format(prefix_name), swish_act, eval="source(0) ** 2" ) @@ -291,7 +310,9 @@ def _create_ff_module(self, prefix_name, i, source, layer_index): "{}_swish".format(prefix_name), ff1, activation=self.activation ) - drop1 = self.network.add_dropout_layer("{}_drop1".format(prefix_name), swish_act, dropout=self.dropout) + drop1 = self.network.add_dropout_layer( + "{}_drop1".format(prefix_name), swish_act, dropout=self.dropout + ) ff2 = self.network.add_linear_layer( "{}_ff2".format(prefix_name), @@ -303,19 +324,28 @@ def _create_ff_module(self, prefix_name, i, source, layer_index): param_dropout=self.ff_weight_drop, ) - drop2 = self.network.add_dropout_layer("{}_drop2".format(prefix_name), ff2, dropout=self.dropout) + drop2 = self.network.add_dropout_layer( + "{}_drop2".format(prefix_name), ff2, dropout=self.dropout + ) - half_step_ff = self.network.add_eval_layer("{}_half_step".format(prefix_name), drop2, eval="0.5 * source(0)") + half_step_ff = self.network.add_eval_layer( + "{}_half_step".format(prefix_name), drop2, eval="0.5 * source(0)" + ) res_inputs = [half_step_ff, source] ff_module_res = self.network.add_combine_layer( - "{}_res".format(prefix_name), kind="add", source=res_inputs, n_out=self.enc_key_dim + "{}_res".format(prefix_name), + 
kind="add", + source=res_inputs, + n_out=self.enc_key_dim, ) return ff_module_res - def _get_mem_chunks(self, prefix_name: str, input_layer: str, mem_size: int) -> List[Tuple[str, Union[str, Dim]]]: + def _get_mem_chunks( + self, prefix_name: str, input_layer: str, mem_size: int + ) -> List[Tuple[str, Union[str, Dim]]]: """ :param name: layer prefix name :param input_layer: name of input layer to shift of shape [B*C, W, D] @@ -340,14 +370,16 @@ def _get_mem_chunks(self, prefix_name: str, input_layer: str, mem_size: int) -> ) # [B, C, W, D] # Merge batch and chunk dim again. chunk_shifted = self.network.add_generic_layer( - f"{prefix_name}_chunk_shifted_" + (f"_{mem_idx}" if mem_idx > 0 else ""), + f"{prefix_name}_chunk_shifted_" + + (f"_{mem_idx}" if mem_idx > 0 else ""), cls="merge_dims", source=chunk_shifted, axes=("B", self.memory_variant_opts.chunked_time_dim), ) # [B*C, W, D] # Make sure the time_dim_axis (T) is set to the correct dim (W). chunk_shifted = self.network.add_generic_layer( - f"{prefix_name}_chunk_shifted__" + (f"_{mem_idx}" if mem_idx > 0 else ""), + f"{prefix_name}_chunk_shifted__" + + (f"_{mem_idx}" if mem_idx > 0 else ""), cls="reinterpret_data", source=chunk_shifted, set_axes={"T": "spatial"}, @@ -356,12 +388,15 @@ def _get_mem_chunks(self, prefix_name: str, input_layer: str, mem_size: int) -> if self.memory_variant_opts.mem_slice_start is not None: assert self.memory_variant_opts.mem_slice_size is not None chunk_shifted = self.network.add_generic_layer( - f"{prefix_name}_chunk_shifted__" + (f"_{mem_idx}" if mem_idx > 0 else "") + "_sliced", + f"{prefix_name}_chunk_shifted__" + + (f"_{mem_idx}" if mem_idx > 0 else "") + + "_sliced", cls="slice", source=chunk_shifted, axis="T", slice_start=self.memory_variant_opts.mem_slice_start, - slice_end=self.memory_variant_opts.mem_slice_start + self.memory_variant_opts.mem_slice_size, + slice_end=self.memory_variant_opts.mem_slice_start + + self.memory_variant_opts.mem_slice_size, ) 
mem_chunks.append((chunk_shifted, "T")) @@ -371,7 +406,12 @@ def _get_mem_chunks(self, prefix_name: str, input_layer: str, mem_size: int) -> return mem_chunks def _self_att_v2( - self, prefix_name: str, *, input_layer: str, concat_prev_chunks_inputs: str, layer_index: int + self, + prefix_name: str, + *, + input_layer: str, + concat_prev_chunks_inputs: str, + layer_index: int, ) -> str: """ Self-Attention implementation via RETURNN layers instead of using RETURNN SelfAttentionLayer @@ -382,12 +422,16 @@ def _self_att_v2( """ if self.memory_variant_opts.use_cached_prev_kv: - assert concat_prev_chunks_inputs is None, "Should use cached keys and values instead." + assert ( + concat_prev_chunks_inputs is None + ), "Should use cached keys and values instead." K = self.network.add_generic_layer( f"{prefix_name}_ln_K", cls="linear", - source=input_layer if self.memory_variant_opts.use_cached_prev_kv else concat_prev_chunks_inputs, + source=input_layer + if self.memory_variant_opts.use_cached_prev_kv + else concat_prev_chunks_inputs, n_out=self.enc_key_dim, forward_weights_init=self.mhsa_init, with_bias=False, @@ -397,7 +441,9 @@ def _self_att_v2( V = self.network.add_generic_layer( f"{prefix_name}_ln_V", cls="linear", - source=input_layer if self.memory_variant_opts.use_cached_prev_kv else concat_prev_chunks_inputs, + source=input_layer + if self.memory_variant_opts.use_cached_prev_kv + else concat_prev_chunks_inputs, n_out=self.enc_value_dim, forward_weights_init=self.mhsa_init, with_bias=False, @@ -415,7 +461,9 @@ def _self_att_v2( # C is approx 15-20. # Then we can concat it to K and V. # Note on prefix_name: The outer _create_mhsa_module adds the additional "_self_att" prefix. - mem_bank = self._block_prefix_name(layer_index - 1) + "_self_att_emformer_mem" # [B*C, D] + mem_bank = ( + self._block_prefix_name(layer_index - 1) + "_self_att_emformer_mem" + ) # [B*C, D] # Same projection which is usually applied to get back to the residual stream. 
mem_bank = self.network.add_generic_layer( @@ -424,7 +472,8 @@ def _self_att_v2( source=mem_bank, n_out=self.enc_key_dim, with_bias=False, - reuse_params=self._block_prefix_name(layer_index - 1) + "_self_att_linear", + reuse_params=self._block_prefix_name(layer_index - 1) + + "_self_att_linear", param_dropout=self.mhsa_weight_drop, ) # [B*C, D] mem_bank = self.network.add_dropout_layer( @@ -433,7 +482,10 @@ def _self_att_v2( if self.memory_variant_opts.apply_tanh_on_emformer_mem: mem_bank = self.network.add_generic_layer( - f"{prefix_name}_emformer_mem_tanh", cls="activation", source=mem_bank, activation="tanh" + f"{prefix_name}_emformer_mem_tanh", + cls="activation", + source=mem_bank, + activation="tanh", ) else: mem_bank = self.network.add_generic_layer( @@ -453,7 +505,12 @@ def _self_att_v2( f"{prefix_name}_emformer_mem_set_new_dim", cls="reinterpret_data", source=mem_bank, - set_dim_tags=[(self.memory_variant_opts.chunked_time_dim, self.emformer_mem_bank_dim)], + set_dim_tags=[ + ( + self.memory_variant_opts.chunked_time_dim, + self.emformer_mem_bank_dim, + ) + ], ) # [B, M, D] mem_bank_K = self.network.add_generic_layer( @@ -503,11 +560,15 @@ def _self_att_v2( else: mem_bank_K, mem_bank_V = None, None - kv_dim = self.concat_window_with_mem_dim if mem_bank_K else self.concat_window_dim # W*N [+M] + kv_dim = ( + self.concat_window_with_mem_dim if mem_bank_K else self.concat_window_dim + ) # W*N [+M] if self.memory_variant_opts.use_cached_prev_kv or mem_bank_K: # concat previous cached keys and values - concat_keys = self._get_mem_chunks(f"{prefix_name}_ln_K_", K, self.memory_variant_opts.mem_size) + concat_keys = self._get_mem_chunks( + f"{prefix_name}_ln_K_", K, self.memory_variant_opts.mem_size + ) concat_keys.append((K, "T")) if mem_bank_K: concat_keys.append((mem_bank_K, self.emformer_mem_bank_dim)) @@ -528,7 +589,9 @@ def _self_att_v2( ) # [B*C, W*N, H, D/H] if self.memory_variant_opts.use_cached_prev_kv or mem_bank_V: - concat_values = 
self._get_mem_chunks(f"{prefix_name}_ln_V_", V, self.memory_variant_opts.mem_size) + concat_values = self._get_mem_chunks( + f"{prefix_name}_ln_V_", V, self.memory_variant_opts.mem_size + ) concat_values.append((V, "T")) if mem_bank_V: concat_values.append((mem_bank_V, self.emformer_mem_bank_dim)) @@ -608,7 +671,10 @@ def _self_att_v2( dtype="float32", ) # [1] Q_energy_factor = self.network.add_generic_layer( - f"{prefix_name}_Q_energy_factor", cls="eval", source=dim_per_head_const, eval="source(0) ** -0.5" + f"{prefix_name}_Q_energy_factor", + cls="eval", + source=dim_per_head_const, + eval="source(0) ** -0.5", ) # [1] Q_H = self.network.add_generic_layer( f"{prefix_name}_ln_Q_H", @@ -642,14 +708,18 @@ def _self_att_v2( clipping=self.rel_pos_clipping, query_spatial_dim=query_dim, # W+1 or W key_value_spatial_dim=kv_dim, # W*N [+M] - query_offset=self.memory_variant_opts.chunk_size * self.memory_variant_opts.mem_size, + query_offset=self.memory_variant_opts.chunk_size + * self.memory_variant_opts.mem_size, ) # [queries (W [+1]), kvs (W*N [+M]), D/H] if self.memory_variant_opts.use_emformer_mem: # -> have summary, i.e. 
[W+1] mask_query = "emformer_mask_query_dim" if mask_query not in self.network: range_in_query_dim = self.network.add_generic_layer( - "emformer_range_query_dim", cls="range_in_axis", source=Q, axis=self.emformer_ext_query_dim + "emformer_range_query_dim", + cls="range_in_axis", + source=Q, + axis=self.emformer_ext_query_dim, ) # [W+1] mask_query = self.network.add_eval_layer( mask_query, @@ -669,9 +739,14 @@ def _self_att_v2( range_in_kv_dim = self.network.add_generic_layer( "emformer_range_kv_dim", cls="range_in_axis", source=K, axis=kv_dim ) # [W*N + M] - kv_dim_len = self.network.add_generic_layer("kv_dim_len", cls="length", source=K, axis=kv_dim) + kv_dim_len = self.network.add_generic_layer( + "kv_dim_len", cls="length", source=K, axis=kv_dim + ) mem_len = self.network.add_generic_layer( - "mem_len", cls="length", source=mem_bank_K, axis=self.emformer_mem_bank_dim + "mem_len", + cls="length", + source=mem_bank_K, + axis=self.emformer_mem_bank_dim, ) mask_kv = self.network.add_eval_layer( mask_kv, @@ -751,7 +826,10 @@ def _self_att_v2( # TODO: is this safe? 
find a better way set_axes={ "T": f"dim:" - + str(self.memory_variant_opts.chunk_size + (1 if self.memory_variant_opts.use_emformer_mem else 0)) + + str( + self.memory_variant_opts.chunk_size + + (1 if self.memory_variant_opts.use_emformer_mem else 0) + ) }, ) # [B*C, W [+1], D] @@ -805,7 +883,9 @@ def _create_mhsa_module(self, prefix_name, source, layer_index): if self.memory_variant_opts.use_cached_prev_kv is False: # shifted inputs + current chunk ln_concat_chunks = self._get_mem_chunks( - prefix_name=f"{prefix_name}_ln", input_layer=ln, mem_size=self.memory_variant_opts.mem_size + prefix_name=f"{prefix_name}_ln", + input_layer=ln, + mem_size=self.memory_variant_opts.mem_size, ) ln_concat_chunks += [(ln, "T")] ln_ = self.network.add_generic_layer( @@ -818,7 +898,9 @@ def _create_mhsa_module(self, prefix_name, source, layer_index): ln_ = None if self.memory_variant_opts.self_att_version == 0: - assert self.memory_variant_opts.use_cached_prev_kv is False, "Not implemented." + assert ( + self.memory_variant_opts.use_cached_prev_kv is False + ), "Not implemented." # this implementation is not efficient. 
ln_rel_pos_enc = self.network.add_relative_pos_encoding_layer( f"{prefix_name}_ln_rel_pos_enc", @@ -839,7 +921,9 @@ def _create_mhsa_module(self, prefix_name, source, layer_index): key_shift=ln_rel_pos_enc if ln_rel_pos_enc is not None else None, l2=self.self_att_l2, attention_left_only=self.use_causal_layers, - param_variational_noise=self.weight_noise if "mhsa" in self.weight_noise_layers else None, + param_variational_noise=self.weight_noise + if "mhsa" in self.weight_noise_layers + else None, param_dropout=self.mhsa_weight_drop, ) # [B*C, W*N, D] mhsa_splits = self.network.add_generic_layer( @@ -865,7 +949,10 @@ def _create_mhsa_module(self, prefix_name, source, layer_index): # - ln_ contains the cached keys and values only # - project only current chunk mhsa = self._self_att_v2( - prefix_name, input_layer=ln, concat_prev_chunks_inputs=ln_, layer_index=layer_index + prefix_name, + input_layer=ln, + concat_prev_chunks_inputs=ln_, + layer_index=layer_index, ) else: mhsa = self.network.add_self_att_layer( @@ -879,7 +966,9 @@ def _create_mhsa_module(self, prefix_name, source, layer_index): key_shift=ln_rel_pos_enc if ln_rel_pos_enc is not None else None, l2=self.self_att_l2, attention_left_only=self.use_causal_layers, - param_variational_noise=self.weight_noise if "mhsa" in self.weight_noise_layers else None, + param_variational_noise=self.weight_noise + if "mhsa" in self.weight_noise_layers + else None, param_dropout=self.mhsa_weight_drop, ) @@ -893,16 +982,23 @@ def _create_mhsa_module(self, prefix_name, source, layer_index): param_dropout=self.mhsa_weight_drop, ) - drop = self.network.add_dropout_layer("{}_dropout".format(prefix_name), mhsa_linear, dropout=self.dropout) + drop = self.network.add_dropout_layer( + "{}_dropout".format(prefix_name), mhsa_linear, dropout=self.dropout + ) res_inputs = [drop, source] mhsa_res = self.network.add_combine_layer( - "{}_res".format(prefix_name), kind="add", source=res_inputs, n_out=self.enc_value_dim + 
"{}_res".format(prefix_name), + kind="add", + source=res_inputs, + n_out=self.enc_value_dim, ) return mhsa_res - def _create_convolution_module(self, prefix_name, source, layer_index, half_step=False): + def _create_convolution_module( + self, prefix_name, source, layer_index, half_step=False + ): """ Add Convolution Module: LN + point-wise-conv + GLU + depth-wise-conv + BN + Swish + point-wise-conv + Dropout @@ -928,9 +1024,14 @@ def _create_convolution_module(self, prefix_name, source, layer_index, half_step param_dropout=self.conv_weight_drop, ) - glu_act = self.network.add_gating_layer("{}_glu".format(prefix_name), pointwise_conv1) + glu_act = self.network.add_gating_layer( + "{}_glu".format(prefix_name), pointwise_conv1 + ) - if self.memory_variant_opts is not None and self.memory_variant_opts.conv_cache_size: + if ( + self.memory_variant_opts is not None + and self.memory_variant_opts.conv_cache_size + ): mem_chunks = self._get_mem_chunks( prefix_name=f"{prefix_name}_glu_act", input_layer=glu_act, @@ -957,7 +1058,6 @@ def _create_convolution_module(self, prefix_name, source, layer_index, half_step padding=(self.conv_kernel_size - 1, 0), ) - depthwise_conv = self.network.add_conv_layer( prefix_name + "_" + (self.conv_alternative_name or "depthwise_conv2"), depthwise_conv_input_padded, @@ -990,26 +1090,36 @@ def _create_convolution_module(self, prefix_name, source, layer_index, half_step **conv_extra_kwargs, ) - if self.memory_variant_opts is not None and self.memory_variant_opts.conv_cache_size: + if ( + self.memory_variant_opts is not None + and self.memory_variant_opts.conv_cache_size + ): # we apply convolution over the concatenated chunks but we only need the output of the current # chunk, thus, we need to slice from [B*C, W*N, D] to [B*C, W, D] - assert self.memory_variant_opts.mem_slice_size, "mem_slice_size must be set." + assert ( + self.memory_variant_opts.mem_slice_size + ), "mem_slice_size must be set." 
depthwise_conv = self.network.add_generic_layer( f"{prefix_name}_depthwise_conv_slice", cls="slice", source=depthwise_conv, axis="T", - slice_start=self.memory_variant_opts.mem_slice_size * self.memory_variant_opts.conv_cache_size, + slice_start=self.memory_variant_opts.mem_slice_size + * self.memory_variant_opts.conv_cache_size, ) if self.use_ln: - bn = self.network.add_layer_norm_layer("{}_layer_norm".format(prefix_name), depthwise_conv) + bn = self.network.add_layer_norm_layer( + "{}_layer_norm".format(prefix_name), depthwise_conv + ) else: bn = self.network.add_batch_norm_layer( "{}_bn".format(prefix_name), depthwise_conv, opts=self.batch_norm_opts ) - swish_act = self.network.add_activation_layer("{}_swish".format(prefix_name), bn, activation="swish") + swish_act = self.network.add_activation_layer( + "{}_swish".format(prefix_name), bn, activation="swish" + ) pointwise_conv2 = self.network.add_linear_layer( "{}_pointwise_conv2".format(prefix_name), @@ -1022,22 +1132,32 @@ def _create_convolution_module(self, prefix_name, source, layer_index, half_step param_dropout=self.conv_weight_drop, ) - drop = self.network.add_dropout_layer("{}_drop".format(prefix_name), pointwise_conv2, dropout=self.dropout) + drop = self.network.add_dropout_layer( + "{}_drop".format(prefix_name), pointwise_conv2, dropout=self.dropout + ) if half_step: - drop = self.network.add_eval_layer("{}_half_step".format(prefix_name), drop, eval="0.5 * source(0)") + drop = self.network.add_eval_layer( + "{}_half_step".format(prefix_name), drop, eval="0.5 * source(0)" + ) res_inputs = [drop, source] res = self.network.add_combine_layer( - "{}_res".format(prefix_name), kind="add", source=res_inputs, n_out=self.enc_key_dim + "{}_res".format(prefix_name), + kind="add", + source=res_inputs, + n_out=self.enc_key_dim, ) return res def _block_prefix_name(self, layer_index: int) -> str: assert layer_index >= 1 if self.add_to_prefix_name: - prefix_name = "conformer_block_%s_%02i" % (self.add_to_prefix_name, 
layer_index) + prefix_name = "conformer_block_%s_%02i" % ( + self.add_to_prefix_name, + layer_index, + ) else: prefix_name = "conformer_block_%02i" % layer_index return prefix_name @@ -1074,7 +1194,9 @@ def _create_conformer_block(self, i, source): mhsa_input = conv_module1 mhsa = self._create_mhsa_module(prefix_name, mhsa_input, i) - conv_module = self._create_convolution_module(prefix_name, mhsa, i, half_step=self.sandwich_conv) + conv_module = self._create_convolution_module( + prefix_name, mhsa, i, half_step=self.sandwich_conv + ) ff_module2_input = conv_module ff_module2 = self._create_ff_module(prefix_name, 2, ff_module2_input, i) @@ -1085,7 +1207,9 @@ def _create_conformer_block(self, i, source): assert 0 <= i - 1 < len(self.subsample) subsample_factor = self.subsample_list[i - 1] if subsample_factor > 1: - res = self.network.add_pool_layer(res + "_pool{}".format(i), res, pool_size=(subsample_factor,)) + res = self.network.add_pool_layer( + res + "_pool{}".format(i), res, pool_size=(subsample_factor,) + ) res = self.network.add_copy_layer(prefix_name, res) return res @@ -1150,7 +1274,9 @@ def _create_all_network_parts(self): activation=self.input_layer_conv_act, init=self.start_conv_init, merge_out=False, - param_variational_noise=self.weight_noise if "frontend_conv" in self.weight_noise_layers else None, + param_variational_noise=self.weight_noise + if "frontend_conv" in self.weight_noise_layers + else None, ) subsampled_input = self.network.add_conv_block( @@ -1164,7 +1290,9 @@ def _create_all_network_parts(self): split_input=False, prefix_name="subsample_", merge_out_fixed=self.fix_merge_dims, - param_variational_noise=self.weight_noise if "frontend_conv" in self.weight_noise_layers else None, + param_variational_noise=self.weight_noise + if "frontend_conv" in self.weight_noise_layers + else None, ) elif self.input_layer == "conv-6": extra_conv_opts = {} @@ -1178,7 +1306,9 @@ def _create_all_network_parts(self): activation=self.input_layer_conv_act, 
init=self.start_conv_init, merge_out=False, - param_variational_noise=self.weight_noise if "frontend_conv" in self.weight_noise_layers else None, + param_variational_noise=self.weight_noise + if "frontend_conv" in self.weight_noise_layers + else None, extra_conv_opts=extra_conv_opts, ) @@ -1193,7 +1323,9 @@ def _create_all_network_parts(self): split_input=False, prefix_name="subsample_", merge_out_fixed=self.fix_merge_dims, - param_variational_noise=self.weight_noise if "frontend_conv" in self.weight_noise_layers else None, + param_variational_noise=self.weight_noise + if "frontend_conv" in self.weight_noise_layers + else None, extra_conv_opts=extra_conv_opts, ) @@ -1209,29 +1341,95 @@ def _create_all_network_parts(self): ) if self.add_abs_pos_enc_to_input: - source_linear = self.network.add_pos_encoding_layer("input_abs_pos_enc", source_linear, add_to_input=True) + source_linear = self.network.add_pos_encoding_layer( + "input_abs_pos_enc", source_linear, add_to_input=True + ) if self.dropout_in: - source_linear = self.network.add_dropout_layer("source_dropout", source_linear, dropout=self.dropout_in) + source_linear = self.network.add_dropout_layer( + "source_dropout", source_linear, dropout=self.dropout_in + ) conformer_block_src = source_linear for i in range(1, self.num_blocks + 1): conformer_block_src = self._create_conformer_block(i, conformer_block_src) - encoder = self.network.add_copy_layer(self.output_layer_name, conformer_block_src) + encoder = self.network.add_copy_layer( + self.output_layer_name, conformer_block_src + ) if self.with_ctc: default_ctc_loss_opts = {"beam_width": 1} if self.native_ctc: default_ctc_loss_opts["use_native"] = True else: - self.ctc_opts.update({"ignore_longer_outputs_than_inputs": True}) # always enable + self.ctc_opts.update( + {"ignore_longer_outputs_than_inputs": True} + ) # always enable if self.ctc_opts: default_ctc_loss_opts["ctc_opts"] = self.ctc_opts - if self.enc_layer_w_ctc is not None and self.enc_layer_w_ctc <= 
self.num_blocks: + if ( + self.enc_layer_w_ctc is not None + and self.enc_layer_w_ctc <= self.num_blocks + ): encoder = f"conformer_block_{self.enc_layer_w_ctc:02d}" + if self.ctc_att_weights_gauss: + + window_size = self.ctc_att_weights_gauss_window + gaussian_weights = [np.exp(-0.5 * ((i - window_size // 2) / self.ctc_att_weights_gauss_stddev) ** 2) for i in range(window_size)] + # normalize + gaussian_weights = list(np.array(gaussian_weights) / np.sum(gaussian_weights)) + + # gaussian_weights_code = "np.array(%r, dtype=np.float32)" % gaussian_weights + # gaussian_weights_value = CodeWrapper(gaussian_weights_code) + + gaussian_weights_value = CodeWrapper( + f"eval(\"exec('import numpy') or numpy.array({gaussian_weights}, dtype=numpy.float32)\")" + ) + + self.network.update( + { + # encoder [B,T,D] + "enc_win": {"class": "window", "from": encoder, "window_size": window_size}, # [B,T,W,D] + "gaussian_weights_1": { + "class": "constant", + "value": gaussian_weights_value, + "dtype": "float32", + }, + "enc_weighted_with_gaussian": { + "class": "combine", + "kind": "mul", + "from": ["enc_win", "gaussian_weights_1"], + }, + "enc_attention": { + "class": "eval", + "eval": "tf.reduce_sum(source(0, auto_convert=False), axis=-2)", + "from": "enc_weighted_with_gaussian", + "out_type": {"shape": (None, self.enc_key_dim), "dtype": "float32"}, + }, + # "enc_attention": { + # "class": "reduce", + # "mode": "sum", + # "from": "enc_weighted_with_gaussian", + # "axes": ["enc_win:window"], + # }, + "enc_with_attention": { + "class": "concat", + "from": [ + (encoder, "f"), + ("enc_attention", "f"), + ], + }, + } + ) + + if self.ctc_att_weights_use_enc: + encoder = "enc_with_attention" + else: + encoder = "enc_attention" + self.network.add_softmax_layer( "ctc", encoder, @@ -1243,13 +1441,21 @@ def _create_all_network_parts(self): ) if self.ctc_loss_scale or self.ctc_self_align_delay: self.network["ctc"]["loss_scale"] = (self.ctc_loss_scale or 1.0) * ( - (1.0 - 
self.ctc_self_align_scale) if self.ctc_self_align_delay else 1.0 + (1.0 - self.ctc_self_align_scale) + if self.ctc_self_align_delay + else 1.0 ) if self.ctc_self_align_delay: # http://arxiv.org/abs/2105.05005 - assert self.ctc_self_align_delay > 0 # not implemented otherwise, but also not sure if meaningful - self.network["ctc_log_prob"] = {"class": "activation", "from": "ctc", "activation": "safe_log"} + assert ( + self.ctc_self_align_delay > 0 + ) # not implemented otherwise, but also not sure if meaningful + self.network["ctc_log_prob"] = { + "class": "activation", + "from": "ctc", + "activation": "safe_log", + } # Cut off first N frames. self.network[f"ctc_log_prob_slice{self.ctc_self_align_delay}"] = { "class": "slice", @@ -1258,7 +1464,9 @@ def _create_all_network_parts(self): "slice_start": self.ctc_self_align_delay, } # Forced alignment using that. - self.network[f"ctc_forced_alignment_slice{self.ctc_self_align_delay}"] = { + self.network[ + f"ctc_forced_alignment_slice{self.ctc_self_align_delay}" + ] = { "class": "forced_align", "align_target": f"data:{self.target}", "topology": "ctc", @@ -1272,7 +1480,9 @@ def _create_all_network_parts(self): "axis": "sparse_dim", "sparse": True, } - self.network[f"ctc_forced_alignment_shift{self.ctc_self_align_delay}"] = { + self.network[ + f"ctc_forced_alignment_shift{self.ctc_self_align_delay}" + ] = { "class": "postfix_in_time", "from": f"ctc_forced_alignment_slice{self.ctc_self_align_delay}", "postfix": "_blank_idx", @@ -1283,7 +1493,8 @@ def _create_all_network_parts(self): "class": "copy", "from": "ctc", "loss": "ce", - "loss_scale": (self.ctc_loss_scale or 1.0) * self.ctc_self_align_scale, + "loss_scale": (self.ctc_loss_scale or 1.0) + * self.ctc_self_align_scale, "target": f"layer:ctc_forced_alignment_shift{self.ctc_self_align_delay}", } @@ -1292,13 +1503,19 @@ def _create_all_network_parts(self): def _create_conformer_blocks(self, input): if self.proj_input: conformer_block_src = self.network.add_linear_layer( - 
"encoder_proj", input, n_out=self.enc_key_dim, activation=None, with_bias=False + "encoder_proj", + input, + n_out=self.enc_key_dim, + activation=None, + with_bias=False, ) else: conformer_block_src = input for i in range(1, self.num_blocks + 1): conformer_block_src = self._create_conformer_block(i, conformer_block_src) - encoder = self.network.add_copy_layer(self.output_layer_name, conformer_block_src) + encoder = self.network.add_copy_layer( + self.output_layer_name, conformer_block_src + ) return encoder def create_network(self): @@ -1344,7 +1561,9 @@ def _energy_mask_emformer_mem( chunk_size_dim # unused # noqa - energy_data: Tensor = source(0, as_data=True) # [B*C, H, W*N+M, W+1], M=C, dims not necessarily that order + energy_data: Tensor = source( + 0, as_data=True + ) # [B*C, H, W*N+M, W+1], M=C, dims not necessarily that order assert len(energy_data.batch.virtual_dims) == 2 batch_virtual_dim0, batch_virtual_dim1 = energy_data.batch.virtual_dims @@ -1361,8 +1580,16 @@ def _energy_mask_emformer_mem( continue energy_dims.append(d) energy_shape.append(d.get_dim_value()) - energy: tf.Tensor = tf.reshape(energy_data.raw_tensor, energy_shape) # [B, C, H, W*N [+M], W+1] - assert set(energy_dims) == {batch_dim, chunked_time_dim, att_num_heads_dim, query_dim, kv_dim} + energy: tf.Tensor = tf.reshape( + energy_data.raw_tensor, energy_shape + ) # [B, C, H, W*N [+M], W+1] + assert set(energy_dims) == { + batch_dim, + chunked_time_dim, + att_num_heads_dim, + query_dim, + kv_dim, + } assert len(energy_dims) == len(energy_shape) == energy.shape.rank == 5 def _bc_shape(d_: Dim): @@ -1391,7 +1618,9 @@ def _bc_shape(d_: Dim): mask = mask0 & mask1 # [..C.., ..W+1.., ..W*N+M..] 
energy = tf.where(mask, energy, neg_inf) - energy = tf.reshape(energy, [d.get_dim_value() for d in energy_data.dims]) # [B*C, H, W*N+M, W+1] + energy = tf.reshape( + energy, [d.get_dim_value() for d in energy_data.dims] + ) # [B*C, H, W*N+M, W+1] if numpy.isinf(neg_inf): self.allow_inf_in_output = True return energy From c4ebf74cdb711e183378635c7c10d21e61c1fb72 Mon Sep 17 00:00:00 2001 From: "luca.gaudino" Date: Fri, 31 May 2024 12:11:20 +0200 Subject: [PATCH 076/227] convert ls960 LSTM LM to rf --- .../tedlium2/_import_model.py | 143 +++++++++++++++++- 1 file changed, 136 insertions(+), 7 deletions(-) diff --git a/users/gaudino/experiments/rf_conformer_att_2023/tedlium2/_import_model.py b/users/gaudino/experiments/rf_conformer_att_2023/tedlium2/_import_model.py index 2d63bc745..8bace5db2 100644 --- a/users/gaudino/experiments/rf_conformer_att_2023/tedlium2/_import_model.py +++ b/users/gaudino/experiments/rf_conformer_att_2023/tedlium2/_import_model.py @@ -24,6 +24,10 @@ MakeModel as MakeModelLM, ) +from i6_experiments.users.gaudino.models.asr.rf.nn_lm.lm_import_2023_09_03 import ( + MakeModel as MakeModelLSTMLM, +) + from i6_experiments.users.gaudino.models.asr.rf.ilm_import_2024_04_17 import ( MakeModel as MakeModelILM, ) @@ -32,7 +36,7 @@ from itertools import product - +_lstm_lm_path = "/work/asr3/irie/experiments/lm/librispeech/2018-03-05--lmbpe-zeyer/data-train/re_i128_m2048_m2048_m2048_m2048.sgd_b32_lr0_cl2.newbobabs.d0.0.1350/bk-net-model/network.035" _returnn_tf_ckpt_filename = "/work/asr4/zeineldeen/setups-data/ubuntu_22_setups/2023-04-17--conformer-att/work/i6_core/returnn/training/AverageTFCheckpointsJob.yB4JK4GDCxWG/output/model/average" _ted2_lm_ckpt_filename = "/work/asr4/michel/setups-data/language_modelling/tedlium/neurallm/trafo_kazuki19/net-model/network.020" @@ -210,6 +214,47 @@ def convert_lm(ckpt_path_lm, out_dir, model_target_dim, model_args): {"model": pt_model.state_dict(), "epoch": epoch, "step": step}, filename ) +def 
convert_lstm_lm(ckpt_path_lm, out_dir, model_target_dim): + from tensorflow.python.training.py_checkpoint_reader import CheckpointReader + from returnn.torch.frontend.bridge import rf_module_to_pt_module + + print("Loading checkpoint...") + reader_lm = CheckpointReader(ckpt_path_lm) + + print("Creating model...") + rf.select_backend_torch() + model = MakeModelLSTMLM(model_target_dim, model_target_dim)() + + print("Create ParamMapping...") + param_mapping = {} + _add_params_lstm_lm(param_mapping) + + print("Mapping parameters...") + for name, param in model.named_parameters(): + assert isinstance(name, str) + assert isinstance(param, rf.Parameter) + value = map_param_func_lstm(reader_lm, name, param, param_mapping) + + assert isinstance(value, numpy.ndarray) + # noinspection PyProtectedMember + param._raw_backend.set_parameter_initial_value(param, value) + + epoch = 1 + step = 0 + + print("Converting rf module to pt module...") + ckpt_name = os.path.basename(ckpt_path_lm) + pt_model = rf_module_to_pt_module(model) + + save_model = True + if save_model: + os.makedirs(out_dir, exist_ok=True) + filename = out_dir + "/" + ckpt_name + ".pt" + print(f"Saving PyTorch model checkpoint: {filename}") + torch.save( + {"model": pt_model.state_dict(), "epoch": epoch, "step": step}, filename + ) + def convert_mini_att_ilm(ckpt_path_mini_att, ckpt_path_prior, model_in_dim, model_target_dim, out_dir): from tensorflow.python.training.py_checkpoint_reader import CheckpointReader from returnn.torch.frontend.bridge import rf_module_to_pt_module @@ -298,6 +343,26 @@ def _add_params_trafo_lm(param_mapping: Dict[str, str]): } ) +def _add_params_lstm_lm(param_mapping: Dict[str, str]): + # add params of lstm lm + for layer_idx in range(4): + param_mapping.update( + { + f"lstm_{layer_idx}.ff_weight": f"lstm{layer_idx}/rec/W", + f"lstm_{layer_idx}.rec_weight": f"lstm{layer_idx}/rec/W_re", + f"lstm_{layer_idx}.bias": f"lstm{layer_idx}/rec/b", + } + ) + + param_mapping.update( + { + 
"input.weight": "input/W", + "input_bias": "input/b", + "output.weight": "output/W", + "output.bias": "output/b", + } + ) + def _add_params_mini_att_ilm(param_mapping: Dict[str, str]): # rf -> tf param_mapping.update( @@ -532,6 +597,62 @@ def map_param_func_trafo_lm( raise NotImplementedError(f"cannot map {name!r} {var}") +def map_param_func_lstm(reader, name: str, var: rf.Parameter, param_mapping: Dict[str, str]) -> numpy.ndarray: + """map params, TF to RF""" + from tensorflow.python.training.py_checkpoint_reader import CheckpointReader + from i6_experiments.users.gaudino.convert import ( + convert_params, + ) + from i6_experiments.users.zeyer.returnn.convert.params import ( + tf_to_rf_np as convert_params_tf_to_rf_np, + ) + + assert isinstance(reader, CheckpointReader) + assert isinstance(var, rf.Parameter) + + tf_var_name = name.replace(".", "/") + if reader.has_tensor(tf_var_name): + return reader.get_tensor(tf_var_name) + + if name in param_mapping: + var_name = param_mapping[name] + assert reader.has_tensor(var_name) + value = reader.get_tensor(var_name) + assert isinstance(value, numpy.ndarray) + + if name.endswith(".ff_weight"): + print("Old ff:", value[0][0], value[0][2048], value[0][4096], value[0][6144]) + value = convert_params.convert_tf_lstm_to_torch_lstm_ff(value) + print("Convert ff:", value[0][0], value[2048][0], value[4096][0], value[6144][0]) + + if name.endswith(".rec_weight"): + print("Old rec:", value[0][0], value[0][2048], value[0][4096], value[0][6144]) + value = convert_params.convert_tf_lstm_to_torch_lstm_rec(value) + print("Convert rec:", value[0][0], value[2048][0], value[4096][0], value[6144][0]) + + + if "lstm" in name and name.endswith(".bias"): + print("Old bias:", value[0], value[2048], value[4096], value[6144]) + value = convert_params.convert_tf_lstm_to_torch_lstm_bias( + value + ) + print("Convert bias:", value[0], value[2048], value[4096], value[6144]) + + + if (name == "output.weight"): + # value = 
convert_params_np.convert_tf_lstm_to_native_lstm_ff(value) + value = value.transpose() + + assert ( + value.shape == var.batch_shape + ), f"new param {name} {var.batch_shape} vs ckpt param {var_name} {value.shape}" + assert ( + value.dtype.name == var.dtype + ), f"new param {name} {var.dtype} vs ckpt param {var_name} {value.dtype}" + return value + + raise NotImplementedError(f"cannot map {name!r} {var}") + def map_param_func_mini_att_ilm( reader, name: str, var: rf.Parameter, param_mapping: Dict[str, str] ) -> numpy.ndarray: @@ -673,10 +794,18 @@ def import_models(): # ) # ls960 ILM - convert_mini_att_ilm( - ckpt_path_prior="/u/zeineldeen/setups/ubuntu_22_setups/2023-04-17--conformer-att/work/i6_core/returnn/training/AverageTFCheckpointsJob.BxqgICRSGkgb/output/model/average", - ckpt_path_mini_att="/u/zeineldeen/setups/ubuntu_22_setups/2023-04-17--conformer-att/work/i6_core/returnn/training/GetBestTFCheckpointJob.JLwxrydala1K/output/model/checkpoint", - model_in_dim=640, - model_target_dim=10025, - out_dir="/work/asr3/zeineldeen/hiwis/luca.gaudino/setups-data/2023-08-10--rf-librispeech/work/i6_experiments/users/gaudino/returnn/convert_ckpt_rf/librispeech/mini_att_ilm_24_05_28", + # convert_mini_att_ilm( + # ckpt_path_prior="/u/zeineldeen/setups/ubuntu_22_setups/2023-04-17--conformer-att/work/i6_core/returnn/training/AverageTFCheckpointsJob.BxqgICRSGkgb/output/model/average", + # ckpt_path_mini_att="/u/zeineldeen/setups/ubuntu_22_setups/2023-04-17--conformer-att/work/i6_core/returnn/training/GetBestTFCheckpointJob.JLwxrydala1K/output/model/checkpoint", + # model_in_dim=640, + # model_target_dim=10025, + # out_dir="/work/asr3/zeineldeen/hiwis/luca.gaudino/setups-data/2023-08-10--rf-librispeech/work/i6_experiments/users/gaudino/returnn/convert_ckpt_rf/librispeech/mini_att_ilm_24_05_28", + # ) + + # ls960 LSTM LM + convert_lstm_lm( + _lstm_lm_path, + 
"/work/asr3/zeineldeen/hiwis/luca.gaudino/setups-data/2023-08-10--rf-librispeech/work/i6_experiments/users/gaudino/returnn/convert_ckpt_rf/librispeech/lstm_lm_only_24_05_31", + 10025, ) + From 3f38be97cf4682c38d482f819f29344de4be1107 Mon Sep 17 00:00:00 2001 From: Mohammad Zeineldeen Date: Fri, 31 May 2024 11:03:58 -0400 Subject: [PATCH 077/227] fix --- users/zeineldeen/data_aug/speed_perturbation_generic.py | 1 + 1 file changed, 1 insertion(+) diff --git a/users/zeineldeen/data_aug/speed_perturbation_generic.py b/users/zeineldeen/data_aug/speed_perturbation_generic.py index 7670427ae..97dcb3c50 100644 --- a/users/zeineldeen/data_aug/speed_perturbation_generic.py +++ b/users/zeineldeen/data_aug/speed_perturbation_generic.py @@ -8,6 +8,7 @@ def speed_pert(audio, sample_rate, random_state, min_factor={min_factor}, max_fa """ speed_pert_v2 = """ +import numpy def speed_pert(audio, sample_rate={sample_rate}, min_factor={min_factor}, max_factor={max_factor}, step={step}, random_state=numpy.random.RandomState(1)): import librosa From 2e333edb12a1cb2bdcb80bead30097dabba27a06 Mon Sep 17 00:00:00 2001 From: Mohammad Zeineldeen Date: Fri, 31 May 2024 11:04:30 -0400 Subject: [PATCH 078/227] fix --- .../conformer_att_2022/librispeech_960/attention_asr_config.py | 1 - 1 file changed, 1 deletion(-) diff --git a/users/zeineldeen/experiments/conformer_att_2022/librispeech_960/attention_asr_config.py b/users/zeineldeen/experiments/conformer_att_2022/librispeech_960/attention_asr_config.py index d16556709..0630cbd43 100644 --- a/users/zeineldeen/experiments/conformer_att_2022/librispeech_960/attention_asr_config.py +++ b/users/zeineldeen/experiments/conformer_att_2022/librispeech_960/attention_asr_config.py @@ -686,7 +686,6 @@ def create_config( assert "sample_rate" in speed_pert_version speed_pert_generic_str = data_aug.speed_pert_generic_v2 assert isinstance(speed_pert_generic_str, str) - python_prolog += ["import numpy\n\n"] python_prolog += 
[speed_pert_generic_str.format(**speed_pert_version)] else: raise ValueError("Invalid speed_pert_version") From 455fc74d6c7fa317965446ea1a8573dcecbaccca Mon Sep 17 00:00:00 2001 From: mmz33 Date: Fri, 31 May 2024 11:17:11 -0400 Subject: [PATCH 079/227] fix --- users/zeineldeen/data_aug/speed_perturbation_generic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/users/zeineldeen/data_aug/speed_perturbation_generic.py b/users/zeineldeen/data_aug/speed_perturbation_generic.py index 97dcb3c50..452df93ee 100644 --- a/users/zeineldeen/data_aug/speed_perturbation_generic.py +++ b/users/zeineldeen/data_aug/speed_perturbation_generic.py @@ -11,7 +11,7 @@ def speed_pert(audio, sample_rate, random_state, min_factor={min_factor}, max_fa import numpy def speed_pert(audio, sample_rate={sample_rate}, min_factor={min_factor}, max_factor={max_factor}, step={step}, random_state=numpy.random.RandomState(1)): import librosa - + new_sample_rate = int(sample_rate * (1 + random_state.randint(min_factor, max_factor) * step)) if new_sample_rate != sample_rate: audio = librosa.resample(audio, orig_sr=sample_rate, target_sr=new_sample_rate, res_type="kaiser_fast", axis=0) From e8460cfa9a72585802cfcbec10c0282edda217aa Mon Sep 17 00:00:00 2001 From: Albert Zeyer Date: Fri, 31 May 2024 10:40:56 +0200 Subject: [PATCH 080/227] more --- users/zeyer/experiments/exp2024_04_23_baselines/aed.py | 6 +++--- users/zeyer/experiments/exp2024_04_23_baselines/ctc.py | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/users/zeyer/experiments/exp2024_04_23_baselines/aed.py b/users/zeyer/experiments/exp2024_04_23_baselines/aed.py index 115e4f2c9..59572b25a 100644 --- a/users/zeyer/experiments/exp2024_04_23_baselines/aed.py +++ b/users/zeyer/experiments/exp2024_04_23_baselines/aed.py @@ -71,7 +71,7 @@ def py(): for vocab in [ "bpe10k", # 5.32 "spm10k", # 5.16 - "spm_bpe10k", + "spm_bpe10k", # 5.21 ]: train_exp( # 5.16 
f"v6-bhv20-11gb-f32-bs15k-accgrad1-mgpu4-pavg100-wd1e_2-lrlin1e_5_295k-speedpertV2-{vocab}", @@ -93,9 +93,9 @@ def py(): 0.5, # 5.13 0.6, # 5.13 0.7, # 4.98 (!!) - 0.8, + 0.8, # 5.14 0.9, # 5.18 - 1.0, # sanity check + 1.0, # 5.35. sanity check, should be like baseline (5.16), could be attributed to randomness? ]: train_exp( f"v6-bhv20-11gb-f32-bs15k-accgrad1-mgpu4-pavg100-wd1e_2-lrlin1e_5_295k-speedpertV2-spm10k" diff --git a/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py b/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py index d295ba6e4..c2b4f3faa 100644 --- a/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py +++ b/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py @@ -47,7 +47,7 @@ def py(): }, ) - train_exp( + train_exp( # 8.79 f"v6-bhv20-11gb-f32-bs15k-accgrad1-mgpu4-pavg100-wd1e_4-lrlin1e_5_295k-speedpertV2-bpe10k", config_11gb_v6_f32_accgrad1_mgpu4_pavg100_wd1e_4, config_updates={ @@ -63,7 +63,7 @@ def py(): "spm10k", # 8.12 "spm_bpe10k", # 7.97 "spm4k", # 9.86 - "spm1k", + "spm1k", # 12.72 "spm_bpe1k", # 11.76 ]: train_exp( # 8.23 @@ -80,7 +80,7 @@ def py(): for alpha in [ 0.3, # 7.88 - 0.5, + 0.5, # 7.13 0.7, # 6.99 ]: train_exp( From ba8163bf89a9c94c1cd8bd1af3777c12ac6d4f26 Mon Sep 17 00:00:00 2001 From: Albert Zeyer Date: Sat, 1 Jun 2024 12:14:11 +0200 Subject: [PATCH 081/227] more --- .../exp2024_04_23_baselines/ctc.py | 34 +++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py b/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py index c2b4f3faa..50f4a4148 100644 --- a/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py +++ b/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py @@ -30,7 +30,41 @@ def py(): + """Sisyphus entry point""" + """ + Luca: + + CTC, greedy decoding ohne lm. + Habe eigentlich nicht wirklich was gemacht. Ist genau dein setup ohne die attention. 
+ Model definition: i6_experiments/users/gaudino/models/asr/rf/conformer_ctc/model_conformer_ctc.py + Decoding: i6_experiments/users/gaudino/models/asr/rf/conformer_ctc/model_recog_ctc_greedy.py + Sis config: i6_experiments/users/gaudino/experiments/rf_conformer_rnnt_2024/librispeech_960/conformer_ctc_train.py + Experiment name: base-24gb-lrlin1e_5_600k_ctc_only_aux4_8 + Sis work dir: /u/luca.gaudino/setups/2023-08-10--rf-librispeech/alias/librispeech_960_exp2024_05_13_rf/conformer_ctc_train/base-24gb-lrlin1e_5_600k_ctc_only_aux4_8/train + Bekomme: {"dev-clean": 3.08, "dev-other": 6.93, "test-clean": 3.24, "test-other": 7.18} + + No diffs: + - Same oggzip files + - Same BPE vocab + Diffs: + - Luca has "seq_postfix": [0]? + - Luca uses single 24GB GPU, bfloat16 AMP + - Luca uses larger batch 2_400_000 -> 6_400_000, grad accum 1 -> 2 + - Luca uses wd 1e-06 + - Luca uses older behavior_version 21 -> 16. + """ + train_exp( + f"v6-bhv20-11gb-f32-bs15k-accgrad5-mgpu4-pavg100-wd1e_5-lrlin1e_5_295k-bpe10k", + config_11gb_v6_f32_accgrad1_mgpu4_pavg100_wd1e_4, + config_updates={ + **_get_cfg_lrlin_oclr_by_bs_nep(15_000, 500), + "accum_grad_multiple_step": 5, + "optimizer.weight_decay": 1e-5, + }, + ) + + train_exp( # 9.24 f"v6-bhv20-11gb-f32-bs15k-accgrad1-mgpu4-pavg100-wd1e_4-lrlin1e_5_295k-bpe10k", config_11gb_v6_f32_accgrad1_mgpu4_pavg100_wd1e_4, config_updates={ From c2c44ab3214d5cfa01d694e30feeef5cf801ae1f Mon Sep 17 00:00:00 2001 From: Albert Zeyer Date: Sat, 1 Jun 2024 12:26:02 +0200 Subject: [PATCH 082/227] use_eos_postfix --- .../exp2024_04_23_baselines/ctc.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py b/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py index 50f4a4148..b75c73519 100644 --- a/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py +++ b/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py @@ -134,6 +134,20 @@ def py(): # 
v6-bhv20-11gb-f32-bs15k-accgrad1-mgpu4-pavg100-wd1e_2-lrlin1e_5_295k-speedpertV2 # with spm_bpe10k and enable_sampling, alpha in {0.3, 0.7} was both very bad (90% WER). + train_exp( + "v6-bhv20-11gb-f32-bs15k-accgrad1-mgpu4-pavg100-wd1e_2-lrlin1e_5_295k-speedpertV2-spm10k-eos-spmSample07", + config_11gb_v6_f32_accgrad1_mgpu4_pavg100_wd1e_4, + config_updates={ + **_get_cfg_lrlin_oclr_by_bs_nep(15_000, 500), + "optimizer.weight_decay": 1e-2, + "__train_audio_preprocess": speed_pert_librosa_config, + "speed_pert_discrete_values": [0.7, 0.8, 0.9, 1.0, 1.1], + "use_eos_postfix": True, + }, + vocab="spm10k", + train_vocab_opts={"other_opts": {"enable_sampling": True, "alpha": 0.7}}, + ) + # noinspection PyShadowingNames def train_exp( @@ -296,6 +310,11 @@ def ctc_training(*, model: Model, data: rf.Tensor, data_spatial_dim: Dim, target data = rf.squeeze(data, axis=data.feature_dim) assert not data.feature_dim # raw audio + if config.bool("use_eos_postfix", False): + targets, (targets_spatial_dim,) = rf.pad( + targets, axes=[targets_spatial_dim], padding=[(0, 1)], value=model.eos_idx + ) + collected_outputs = {} logits, enc_spatial_dim = model(data, in_spatial_dim=data_spatial_dim, collected_outputs=collected_outputs) if aux_loss_layers: From 28e052c60c8802ebbafe26729d0dd7b8ff67c262 Mon Sep 17 00:00:00 2001 From: Albert Zeyer Date: Sun, 2 Jun 2024 01:57:21 +0200 Subject: [PATCH 083/227] fix CTC with EOS recog scoring --- .../experiments/exp2024_04_23_baselines/ctc.py | 16 +++++++++++++++- users/zeyer/recog.py | 10 +++++++++- 2 files changed, 24 insertions(+), 2 deletions(-) diff --git a/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py b/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py index b75c73519..d187ba7bf 100644 --- a/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py +++ b/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py @@ -23,6 +23,7 @@ if TYPE_CHECKING: from i6_experiments.users.zeyer.model_with_checkpoints import ModelWithCheckpoints, 
ModelWithCheckpoint from i6_experiments.users.zeyer.datasets.task import Task + from i6_experiments.users.zeyer.datasets.score_results import RecogOutput # The model gets raw features (16khz) and does feature extraction internally. @@ -211,11 +212,24 @@ def train_exp( distributed_launch_cmd="torchrun" if num_processes else None, time_rqmt=time_rqmt, ) - recog_training_exp(prefix, task, model_with_checkpoint, recog_def=model_recog) + + recog_post_proc_funcs = [] + if config.get("use_eos_postfix", False): + recog_post_proc_funcs.append(_remove_eos_label) + recog_training_exp( + prefix, task, model_with_checkpoint, recog_def=model_recog, recog_post_proc_funcs=recog_post_proc_funcs + ) return model_with_checkpoint +def _remove_eos_label(res: RecogOutput) -> RecogOutput: + from i6_experiments.users.zeyer.datasets.score_results import RecogOutput + from i6_core.returnn.search import SearchRemoveLabelJob + + return RecogOutput(SearchRemoveLabelJob(res.output, remove_label="", output_gzip=True).out_search_results) + + _sis_prefix: Optional[str] = None diff --git a/users/zeyer/recog.py b/users/zeyer/recog.py index 6f4ee8d81..ae83407ee 100644 --- a/users/zeyer/recog.py +++ b/users/zeyer/recog.py @@ -39,6 +39,7 @@ def recog_training_exp( *, search_config: Dict[str, Any] = None, search_post_config: Optional[Dict[str, Any]] = None, + recog_post_proc_funcs: Sequence[Callable[[RecogOutput], RecogOutput]] = (), search_mem_rqmt: Union[int, float] = 6, exclude_epochs: Collection[int] = (), model_avg: bool = False, @@ -51,6 +52,7 @@ def recog_training_exp( recog_def, search_config=search_config, search_post_config=search_post_config, + recog_post_proc_funcs=recog_post_proc_funcs, search_mem_rqmt=search_mem_rqmt, ) summarize_job = GetBestRecogTrainExp( @@ -79,6 +81,7 @@ def __init__( *, search_config: Optional[Dict[str, Any]] = None, search_post_config: Optional[Dict[str, Any]] = None, + recog_post_proc_funcs: Sequence[Callable[[RecogOutput], RecogOutput]] = (), search_mem_rqmt: 
Union[int, float] = 6, ): # Note: When something is added here, remember to handle it in _sis_hash. @@ -88,6 +91,7 @@ def __init__( self.recog_def = recog_def self.search_config = search_config self.search_post_config = search_post_config + self.recog_post_proc_funcs = recog_post_proc_funcs self.search_mem_rqmt = search_mem_rqmt def __call__(self, epoch_or_ckpt: Union[int, PtCheckpoint]) -> ScoreResultCollection: @@ -103,6 +107,7 @@ def __call__(self, epoch_or_ckpt: Union[int, PtCheckpoint]) -> ScoreResultCollec self.recog_def, config=self.search_config, search_post_config=self.search_post_config, + recog_post_proc_funcs=self.recog_post_proc_funcs, search_mem_rqmt=self.search_mem_rqmt, ) if isinstance(epoch_or_ckpt, int): @@ -119,6 +124,8 @@ def _sis_hash(self) -> bytes: del d["search_mem_rqmt"] if not self.search_config: del d["search_config"] # compat + if not self.recog_post_proc_funcs: + del d["recog_post_proc_funcs"] # compat # Not the whole task object is relevant but only some minimal parts. 
task = d.pop("task") assert isinstance(task, Task) @@ -135,6 +142,7 @@ def recog_model( *, config: Optional[Dict[str, Any]] = None, search_post_config: Optional[Dict[str, Any]] = None, + recog_post_proc_funcs: Sequence[Callable[[RecogOutput], RecogOutput]] = (), search_mem_rqmt: Union[int, float] = 6, search_rqmt: Optional[Dict[str, Any]] = None, dev_sets: Optional[Collection[str]] = None, @@ -157,7 +165,7 @@ def recog_model( search_mem_rqmt=search_mem_rqmt, search_rqmt=search_rqmt, search_alias_name=f"{name}/search/{dataset_name}" if name else None, - recog_post_proc_funcs=task.recog_post_proc_funcs, + recog_post_proc_funcs=list(recog_post_proc_funcs) + list(task.recog_post_proc_funcs), ) score_out = task.score_recog_output_func(dataset, recog_out) outputs[dataset_name] = score_out From b1077e9f967cfab36712cfd7e2f1a93fa253cce8 Mon Sep 17 00:00:00 2001 From: Albert Zeyer Date: Sun, 2 Jun 2024 02:31:59 +0200 Subject: [PATCH 084/227] ctc eos fix more --- users/zeyer/experiments/exp2024_04_23_baselines/ctc.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py b/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py index d187ba7bf..265ed9c17 100644 --- a/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py +++ b/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py @@ -215,7 +215,7 @@ def train_exp( recog_post_proc_funcs = [] if config.get("use_eos_postfix", False): - recog_post_proc_funcs.append(_remove_eos_label) + recog_post_proc_funcs.append(_remove_eos_label_v2) recog_training_exp( prefix, task, model_with_checkpoint, recog_def=model_recog, recog_post_proc_funcs=recog_post_proc_funcs ) @@ -223,11 +223,11 @@ def train_exp( return model_with_checkpoint -def _remove_eos_label(res: RecogOutput) -> RecogOutput: +def _remove_eos_label_v2(res: RecogOutput) -> RecogOutput: from i6_experiments.users.zeyer.datasets.score_results import RecogOutput from i6_core.returnn.search import 
SearchRemoveLabelJob - return RecogOutput(SearchRemoveLabelJob(res.output, remove_label="
", output_gzip=True).out_search_results) + return RecogOutput(SearchRemoveLabelJob(res.output, remove_label="", output_gzip=True).out_search_results) _sis_prefix: Optional[str] = None From b2349a2238d5f891340c698093c71130b645868a Mon Sep 17 00:00:00 2001 From: "luca.gaudino" Date: Mon, 3 Jun 2024 12:11:43 +0200 Subject: [PATCH 085/227] update --- .../librispeech_960/configs/ctc_att_search.py | 52 +++++++++++++++---- .../configs/ctc_att_search_w_recombine.py | 14 +++-- 2 files changed, 48 insertions(+), 18 deletions(-) diff --git a/users/gaudino/experiments/conformer_att_2023/librispeech_960/configs/ctc_att_search.py b/users/gaudino/experiments/conformer_att_2023/librispeech_960/configs/ctc_att_search.py index ac85a50ee..6279e386b 100644 --- a/users/gaudino/experiments/conformer_att_2023/librispeech_960/configs/ctc_att_search.py +++ b/users/gaudino/experiments/conformer_att_2023/librispeech_960/configs/ctc_att_search.py @@ -366,6 +366,8 @@ def run_lm_fusion( test_dataset_tuples = get_test_dataset_tuples(bpe_size=bpe_size) + use_sclite = kwargs.get("use_sclite", False) + run_single_search( exp_name=name, train_data=train_data, @@ -380,6 +382,8 @@ def run_lm_fusion( att_scale=kwargs.get("att_scale", 1.0), ctc_scale=kwargs.get("ctc_scale", 1.0), ctc_prior_scale=kwargs.get("ctc_prior_scale", None), + use_sclite=use_sclite, + ) def run_decoding( @@ -883,7 +887,8 @@ def train_mini_self_att( ) prior_args = copy.deepcopy(retrain_args) - prior_args["decoder_args"] = CTCDecoderArgs(hash_override_version=1) + # prior_args["decoder_args"] = CTCDecoderArgs(hash_override_version=1) + prior_args["decoder_args"] = CTCDecoderArgs() prior_file = compute_ctc_prior( name + f"_retrain1_const20_linDecay580_{1e-4}", prior_args, @@ -969,7 +974,7 @@ def train_mini_self_att( ) # two pass rescoring att,ctc - for beam_size in [12]: + for beam_size in []: #12 for ctc_scale in [0.01]: att_scale = 1.0 run_decoding( @@ -991,7 +996,7 @@ def train_mini_self_att( ) # optsnr att + ctc - for beam_size, 
scales, prior_scale in product([32], [(0.65, 0.35)], [0.0, 0.3]): + for beam_size, scales, prior_scale in product([], [(0.65, 0.35)], [0.0, 0.3]): # 32 search_args = copy.deepcopy(oclr_args) search_args["beam_size"] = beam_size search_args["ctc_log_prior_file"] = new_prior_file @@ -1062,7 +1067,7 @@ def train_mini_self_att( # att + lstm lm - for beam_size in [24]: + for beam_size in []: # 24 for lm_scale in [0.33]: att_scale = 1.0 run_lm_fusion( @@ -1086,7 +1091,7 @@ def train_mini_self_att( # ctc + lstm lm for beam_size, scales, prior_scale in product( - [55], [(1.0, 0.5)], [0.0, 0.3] + [], [(1.0, 0.5)], [0.0, 0.3] # 55 ): search_args = copy.deepcopy(oclr_args) search_args["beam_size"] = beam_size @@ -1125,7 +1130,7 @@ def train_mini_self_att( ) # two pass rescoring att + lstm lm, ctc - for beam_size in [12]: + for beam_size in []: # 12 for ctc_scale in [0.003]: for lm_scale in [0.4]: # for lm_scale in [0.28, 0.3, 0.32, 0.35, 0.38, 0.4, 0.42]: @@ -1155,7 +1160,7 @@ def train_mini_self_att( ) # optsnr att + ctc w prior + lstm lm - for beam_size, scales in product([48], [(0.7, 0.3, 0.6, 0.3)]): # 48 + for beam_size, scales in product([], [(0.7, 0.3, 0.6, 0.3)]): # 48 search_args = copy.deepcopy(oclr_args) search_args["ctc_log_prior_file"] = new_prior_file # ] = "/u/luca.gaudino/debug/ctc/prior.txt" @@ -1196,7 +1201,7 @@ def train_mini_self_att( # --------------------------- Trafo LM --------------------------- # # att + trafo lm - for beam_size in [12, 32]: + for beam_size in []: # 12, 32 lm_scale = 0.42 run_lm_fusion( args=oclr_args, @@ -1219,7 +1224,7 @@ def train_mini_self_att( # ctc + trafo lm for beam_size, scales, prior_scale in product( - [32], [(1, 0.55)], [0.4] + [], [(1, 0.55)], [0.4] # 32 ): search_args = copy.deepcopy(oclr_args) search_args["beam_size"] = beam_size @@ -1262,7 +1267,7 @@ def train_mini_self_att( # optsnr att + ctc + trafo lm for beam_size, scales, lm_scale, prior_scale in product( - [12, 32], + [], # 12 , 32 [ (0.8, 0.2) ], @@ -1316,3 
+1321,30 @@ def train_mini_self_att( use_sclite=True, time_rqmt=time_rqmt, ) + + # --------- With ILM -------------------------------- + + # att + trafo lm + for beam_size, ilm_scale in product([32], [0.4]): + lm_scale = 0.54 + run_lm_fusion( + args=oclr_args, + lm_type="trafo", + exp_name=(f"bsf{bsf}/" if bsf > 0 else "") + f"att_trafolm{lm_scale}_ilm{ilm_scale}_beam{beam_size}", + train_data=train_data, + train_job=train_j, + feature_net=log10_net_10ms, + epoch=train_job_avg_ckpt[ + f"base_conf_12l_lstm_1l_conv6_OCLR_sqrdReLU_cyc915_ep2035_peak0.0009_retrain1_const20_linDecay580_{1e-4}" + ], + ckpt_name="avg", + lm_scales=[lm_scale], + beam_size=beam_size, + bpe_size=BPE_10K, + test_set_names=["dev-clean", "dev-other", "test-clean", "test-other"], + use_sclite=True, + bsf=bsf, + prior_type="mini_lstm", + prior_scales=[ilm_scale], + mini_lstm_ckpt="/u/zeineldeen/setups/ubuntu_22_setups/2023-04-17--conformer-att/work/i6_core/returnn/training/GetBestTFCheckpointJob.JLwxrydala1K/output/model/checkpoint", + ) diff --git a/users/gaudino/experiments/conformer_att_2023/librispeech_960/configs/ctc_att_search_w_recombine.py b/users/gaudino/experiments/conformer_att_2023/librispeech_960/configs/ctc_att_search_w_recombine.py index 3ed600a57..4e997847f 100644 --- a/users/gaudino/experiments/conformer_att_2023/librispeech_960/configs/ctc_att_search_w_recombine.py +++ b/users/gaudino/experiments/conformer_att_2023/librispeech_960/configs/ctc_att_search_w_recombine.py @@ -600,13 +600,12 @@ def run_exp( # --------------------------- With Lstm LM --------------------------- # # optsr max ctc w prior + lstm lm - for lm_scale, beam_size in product([0.3, 0.4, 0.5, 0.6], [12]): + for lm_scale, prior_scale, beam_size in product([0.6], [0.3], [32]): search_args = copy.deepcopy(oclr_args) search_args["beam_size"] = beam_size search_args["ctc_log_prior_file"] = new_prior_file ctc_scale = 1.0 label_scale = 1.0 - prior_scale = 0.0 ext_lm_opts = lstm_lm_opts_map[BPE_10K] @@ -651,7 +650,7 @@ 
def run_exp( search_args=search_args, feature_extraction_net=log10_net_10ms, bpe_size=BPE_10K, - test_sets=["dev-other"], + test_sets=["dev-clean", "dev-other", "test-clean", "test-other"], # test_sets=["dev-other"], remove_label={ "", @@ -664,13 +663,12 @@ def run_exp( # --------------------------- With Trafo LM --------------------------- # # optsr max ctc w prior + trafo lm - for lm_scale, beam_size in product([0.5, 0.6, 0.65, 0.7], [12]): + for lm_scale, prior_scale, beam_size in product([0.65], [0.3, 0.35], [32]): search_args = copy.deepcopy(oclr_args) search_args["beam_size"] = beam_size search_args["ctc_log_prior_file"] = new_prior_file ctc_scale = 1.0 label_scale = 1.0 - prior_scale = 0.0 ext_lm_opts = trafo_lm_opts_map[BPE_10K] @@ -715,7 +713,7 @@ def run_exp( search_args=search_args, feature_extraction_net=log10_net_10ms, bpe_size=BPE_10K, - test_sets=["dev-other"], + test_sets=["dev-clean", "dev-other", "test-clean", "test-other"], # test_sets=["dev-other"], remove_label={ "", @@ -726,11 +724,11 @@ def run_exp( ) # optsr max att + ctc w prior + trafo lm - for scales, lm_scale, beam_size in product([(0.65, 0.35, 0.0)], [0.45, 0.5, 0.55], [12]): + for scales, lm_scale, prior_scale, beam_size in product([(0.6, 0.4)], [0.6, 0.64, 0.68, 0.7], [0.15, 0.3, 0.45], [32]): search_args = copy.deepcopy(oclr_args) search_args["beam_size"] = beam_size search_args["ctc_log_prior_file"] = new_prior_file - att_scale, ctc_scale, prior_scale = scales + att_scale, ctc_scale = scales label_scale = 1.0 ext_lm_opts = trafo_lm_opts_map[BPE_10K] From f0f8b91aaad595e82368051f1631b88a3bcc3395 Mon Sep 17 00:00:00 2001 From: Lukas Rilling Date: Mon, 3 Jun 2024 13:53:41 +0200 Subject: [PATCH 086/227] Update Glow-TTS-ASR --- .../evaluation/forward_comparison.ipynb | 308 +++- users/rilling/evaluation/swer_eval.ipynb | 1479 +++++++++++------ users/rilling/evaluation/wer_eval.ipynb | 11 +- .../librispeech_glow_asr/experiments.py | 2 +- .../glowTTS/experiments.py | 2 +- 
.../glowTTS/gt_extraction.py | 2 +- .../librispeech_joint_training/experiments.py | 98 ++ .../glowTTS_ASR_conformer_two_forward_pass.py | 1 - .../shared/eval_invertibility.py | 17 +- .../training_comparison.ipynb | 78 +- .../exp_tts/experiments.py | 2 +- 11 files changed, 1349 insertions(+), 651 deletions(-) diff --git a/users/rilling/evaluation/forward_comparison.ipynb b/users/rilling/evaluation/forward_comparison.ipynb index c22bf1a9e..1a9e6570f 100644 --- a/users/rilling/evaluation/forward_comparison.ipynb +++ b/users/rilling/evaluation/forward_comparison.ipynb @@ -3,13 +3,15 @@ { "cell_type": "code", "execution_count": 2, - "metadata": {}, + "metadata": { + "metadata": {} + }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "/tmp/ipykernel_503671/2049839218.py:8: DeprecationWarning: Importing display from IPython.core.display is deprecated since IPython 7.14, please import from IPython display\n", + "/var/tmp/ipykernel_4059881/2049839218.py:8: DeprecationWarning: Importing display from IPython.core.display is deprecated since IPython 7.14, please import from IPython display\n", " from IPython.core.display import display\n" ] } @@ -33,22 +35,24 @@ }, { "cell_type": "code", - "execution_count": 8, - "metadata": {}, + "execution_count": 3, + "metadata": { + "metadata": {} + }, "outputs": [ { "data": { "text/plain": [ - "{'/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2/enc768/400ep/dec_drop_0.05/tts_eval_gl/test-clean/forward/output/audio_files/': '/glowTTS_x_vector_v2/enc768/400ep/',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2/enc768/200ep/dec_drop_0.05/tts_eval_gl/test-clean/forward/output/audio_files/': '/glowTTS_x_vector_v2/enc768/200ep/',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2/enc768/100ep/dec_drop_0.05/tts_eval_gl/test-clean/forward/output/audio_files/': '/glowTTS_x_vector_v2/enc768/100ep/',\n", + "{'/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2/enc768/100ep/dec_drop_0.05/tts_eval_gl/test-clean/forward/output/audio_files/': '/glowTTS_x_vector_v2/enc768/100ep/',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2/enc768/200ep_long_cooldown/dec_drop_0.05/tts_eval_gl/test-clean/forward/output/audio_files/': '/glowTTS_x_vector_v2/enc768/200ep_long_cooldown/',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc768/100ep/dec_drop_0.05/tts_eval_gl/test-clean/forward/output/audio_files/': '/glowTTS/enc768/100ep/',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2/enc768/400ep/dec_drop_0.05/tts_eval_gl/test-clean/forward/output/audio_files/': '/glowTTS_x_vector_v2/enc768/400ep/',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2/enc768/200ep/dec_drop_0.05/tts_eval_gl/test-clean/forward/output/audio_files/': '/glowTTS_x_vector_v2/enc768/200ep/',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc768/200ep/dec_drop_0.05/tts_eval_gl/test-clean/forward/output/audio_files/': '/glowTTS/enc768/200ep/',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc768/100ep/dec_drop_0.05/tts_eval_gl/test-clean/forward/output/audio_files/': '/glowTTS/enc768/100ep/',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc768/400ep/grad_clip_10/dec_drop_0.05/tts_eval_gl/test-clean/forward/output/audio_files/': '/glowTTS/enc768/400ep/grad_clip_10/'}" ] }, - "execution_count": 8, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } @@ -86,16 +90,18 @@ }, { "cell_type": "code", - "execution_count": 11, - "metadata": {}, + "execution_count": 141, + "metadata": { + "metadata": {} + }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2/enc768/400ep/dec_drop_0.05/tts_eval_gl/test-clean/forward/output/audio_files/test-clean_121-127105-0010_121-127105-0010.ogg\n", - "/glowTTS_x_vector_v2/enc768/400ep/\n", - "AutoMOS: 3.3250440482638384\n", + "/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2/enc768/100ep/dec_drop_0.05/tts_eval_gl/test-clean/forward/output/audio_files/test-clean_1188-133604-0023_1188-133604-0023.ogg\n", + "/glowTTS_x_vector_v2/enc768/100ep/\n", + "AutoMOS: 3.181718836146358\n", "\n" ] }, @@ -104,7 +110,7 @@ "text/html": [ "\n", " \n", " " @@ -120,9 +126,9 @@ "name": "stdout", "output_type": "stream", "text": [ - "/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2/enc768/200ep/dec_drop_0.05/tts_eval_gl/test-clean/forward/output/audio_files/test-clean_121-127105-0010_121-127105-0010.ogg\n", - 
"/glowTTS_x_vector_v2/enc768/200ep/\n", - "AutoMOS: 3.2574123507855752\n", + "/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2/enc768/200ep_long_cooldown/dec_drop_0.05/tts_eval_gl/test-clean/forward/output/audio_files/test-clean_1188-133604-0023_1188-133604-0023.ogg\n", + "/glowTTS_x_vector_v2/enc768/200ep_long_cooldown/\n", + "AutoMOS: 3.212764627153152\n", "\n" ] }, @@ -131,7 +137,7 @@ "text/html": [ "\n", " \n", " " @@ -147,9 +153,9 @@ "name": "stdout", "output_type": "stream", "text": [ - "/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2/enc768/100ep/dec_drop_0.05/tts_eval_gl/test-clean/forward/output/audio_files/test-clean_121-127105-0010_121-127105-0010.ogg\n", - "/glowTTS_x_vector_v2/enc768/100ep/\n", - "AutoMOS: 3.181718836146358\n", + "/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2/enc768/400ep/dec_drop_0.05/tts_eval_gl/test-clean/forward/output/audio_files/test-clean_1188-133604-0023_1188-133604-0023.ogg\n", + "/glowTTS_x_vector_v2/enc768/400ep/\n", + "AutoMOS: 3.300797877637179\n", "\n" ] }, @@ -158,7 +164,7 @@ "text/html": [ "\n", " \n", " " @@ -174,9 +180,9 @@ "name": "stdout", "output_type": "stream", "text": [ - "/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2/enc768/200ep_long_cooldown/dec_drop_0.05/tts_eval_gl/test-clean/forward/output/audio_files/test-clean_121-127105-0010_121-127105-0010.ogg\n", - "/glowTTS_x_vector_v2/enc768/200ep_long_cooldown/\n", - "AutoMOS: 3.212764627153152\n", + 
"/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2/enc768/200ep/dec_drop_0.05/tts_eval_gl/test-clean/forward/output/audio_files/test-clean_1188-133604-0023_1188-133604-0023.ogg\n", + "/glowTTS_x_vector_v2/enc768/200ep/\n", + "AutoMOS: 3.2574123507855752\n", "\n" ] }, @@ -185,7 +191,7 @@ "text/html": [ "\n", " \n", " " @@ -201,9 +207,9 @@ "name": "stdout", "output_type": "stream", "text": [ - "/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc768/100ep/dec_drop_0.05/tts_eval_gl/test-clean/forward/output/audio_files/test-clean_121-127105-0010_121-127105-0010.ogg\n", - "/glowTTS/enc768/100ep/\n", - "AutoMOS: 3.302589650658008\n", + "/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc768/200ep/dec_drop_0.05/tts_eval_gl/test-clean/forward/output/audio_files/test-clean_1188-133604-0023_1188-133604-0023.ogg\n", + "/glowTTS/enc768/200ep/\n", + "AutoMOS: 3.3596264232244146\n", "\n" ] }, @@ -212,7 +218,7 @@ "text/html": [ "\n", " \n", " " @@ -228,9 +234,9 @@ "name": "stdout", "output_type": "stream", "text": [ - "/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc768/200ep/dec_drop_0.05/tts_eval_gl/test-clean/forward/output/audio_files/test-clean_121-127105-0010_121-127105-0010.ogg\n", - "/glowTTS/enc768/200ep/\n", - "AutoMOS: 3.3596264232244146\n", + "/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc768/100ep/dec_drop_0.05/tts_eval_gl/test-clean/forward/output/audio_files/test-clean_1188-133604-0023_1188-133604-0023.ogg\n", + "/glowTTS/enc768/100ep/\n", + "AutoMOS: 3.302589650658008\n", "\n" ] }, @@ -239,7 +245,7 @@ "text/html": 
[ "\n", " \n", " " @@ -255,9 +261,9 @@ "name": "stdout", "output_type": "stream", "text": [ - "/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc768/400ep/grad_clip_10/dec_drop_0.05/tts_eval_gl/test-clean/forward/output/audio_files/test-clean_121-127105-0010_121-127105-0010.ogg\n", + "/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc768/400ep/grad_clip_10/dec_drop_0.05/tts_eval_gl/test-clean/forward/output/audio_files/test-clean_1188-133604-0023_1188-133604-0023.ogg\n", "/glowTTS/enc768/400ep/grad_clip_10/\n", - "AutoMOS: 3.2932603021495437\n", + "AutoMOS: 3.351169210956677\n", "\n" ] }, @@ -266,7 +272,7 @@ "text/html": [ "\n", " \n", " " @@ -280,7 +286,7 @@ } ], "source": [ - "sequence_index = 10\n", + "sequence_index = 102\n", "sequence_name = sequence_names[sequence_index]\n", "\n", "for folder, name in files.items():\n", @@ -296,10 +302,228 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, + "execution_count": 150, + "metadata": { + "metadata": {} + }, + "outputs": [], + "source": [ + "from i6_experiments.users.rilling.experiments.librispeech.librispeech_glowtts.pytorch_networks.feature_extraction import DbMelFeatureExtraction, DbMelFeatureExtractionConfig\n", + "\n", + "fe_config = DbMelFeatureExtractionConfig.from_dict({\n", + " \"sample_rate\": 16000,\n", + " \"win_size\": 0.05,\n", + " \"hop_size\": 0.0125,\n", + " \"f_min\": 60,\n", + " \"f_max\": 7600,\n", + " \"min_amp\": 1e-10,\n", + " \"num_filters\": 80,\n", + " \"center\": True,\n", + " \"norm\": (-83.20164937973021, 34.14062855722116),\n", + " })\n", + "\n", + "fe = DbMelFeatureExtraction(fe_config)" + ] + }, + { + "cell_type": "code", + "execution_count": 151, + "metadata": { + "metadata": {} + }, + "outputs": [], + "source": [ + "import torchaudio" + ] + }, + { + "cell_type": "code", + 
"execution_count": 152, + "metadata": { + "metadata": {} + }, + "outputs": [ + { + "data": { + "text/plain": [ + "(tensor([[-0.0029, -0.0012, -0.0003, ..., 0.0098, -0.0023, -0.0016]]), 16000)" + ] + }, + "execution_count": 152, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "for folder, name in files.items():\n", + " path = folder + sequence_name\n", + " break\n", + "test = torchaudio.load(path)\n", + "test" + ] + }, + { + "cell_type": "code", + "execution_count": 234, + "metadata": { + "metadata": {} + }, + "outputs": [ + { + "data": { + "text/plain": [ + "tensor(1, dtype=torch.int32)" + ] + }, + "execution_count": 234, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "spec, len = fe(test[0], test[0].shape[0])\n", + "len" + ] + }, + { + "cell_type": "code", + "execution_count": 247, + "metadata": { + "metadata": {} + }, + "outputs": [ + { + "data": { + "text/plain": [ + "torch.Size([1376, 80])" + ] + }, + "execution_count": 247, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "spec[0].shape" + ] + }, + { + "cell_type": "code", + "execution_count": 298, + "metadata": { + "metadata": {} + }, + "outputs": [], + "source": [ + "from torchaudio.functional import mask_along_axis\n", + "import copy\n", + "\n", + "def apply_spec_aug(input, num_repeat_time, max_dim_time, num_repeat_feat, max_dim_feat):\n", + " \"\"\"\n", + " :param Tensor input: the input audio features (B,T,F)\n", + " :param int num_repeat_time: number of repetitions to apply time mask\n", + " :param int max_dim_time: number of columns to be masked on time dimension will be uniformly sampled from [0, mask_param]\n", + " :param int num_repeat_feat: number of repetitions to apply feature mask\n", + " :param int max_dim_feat: number of columns to be masked on feature dimension will be uniformly sampled from [0, mask_param]\n", + " \"\"\"\n", + " for _ in range(num_repeat_time):\n", + " input = mask_along_axis(input, 
mask_param=max_dim_time, mask_value=0.0, axis=1)\n", + "\n", + " out1 = copy.deepcopy(input)\n", + " for _ in range(num_repeat_feat):\n", + " input = mask_along_axis(input, mask_param=max_dim_feat, mask_value=0.0, axis=2)\n", + " return input, out1" + ] + }, + { + "cell_type": "code", + "execution_count": 299, + "metadata": { + "metadata": {} + }, + "outputs": [], + "source": [ + "def plot_spec(ax, s):\n", + " shape = s.shape\n", + " x = np.arange(0, shape[0]+1)\n", + " y = np.arange(0, shape[1]+1)\n", + "\n", + " s2 = s.transpose(0,1)\n", + " ax.pcolormesh(s2)" + ] + }, + { + "cell_type": "code", + "execution_count": 316, + "metadata": { + "metadata": {} + }, + "outputs": [], + "source": [ + "augment_spec, midstep = apply_spec_aug(spec, 20, 20, 5, 8)" + ] + }, + { + "cell_type": "code", + "execution_count": 317, + "metadata": { + "metadata": {} + }, "outputs": [], - "source": [] + "source": [ + "import matplotlib.pyplot as plt\n", + "import librosa\n", + "\n", + "%matplotlib widget\n" + ] + }, + { + "cell_type": "code", + "execution_count": 320, + "metadata": { + "metadata": {} + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/var/tmp/ipykernel_4059881/4065018720.py:1: RuntimeWarning: More than 20 figures have been opened. Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`). Consider using `matplotlib.pyplot.close()`.\n", + " fig, ax = plt.subplots(1, 3)\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "397f69e5a0c74ecc9ee95ca648376b45", + "version_major": 2, + "version_minor": 0 + }, + "image/png": "", + "text/html": [ + "\n", + "
\n", + "
\n", + " Figure\n", + "
\n", + " \n", + "
\n", + " " + ], + "text/plain": [ + "Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "fig, ax = plt.subplots(1, 3)\n", + "plot_spec(ax[0], spec[0, :400])\n", + "plot_spec(ax[1], midstep[0, :400])\n", + "plot_spec(ax[2], augment_spec[0, :400])" + ] } ], "metadata": { diff --git a/users/rilling/evaluation/swer_eval.ipynb b/users/rilling/evaluation/swer_eval.ipynb index 1e207a807..fe7eaf1e2 100644 --- a/users/rilling/evaluation/swer_eval.ipynb +++ b/users/rilling/evaluation/swer_eval.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -20,108 +20,154 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "['/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glowTTS_100ep_pe1/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glowTTS_ASR_conformer_two_forward_pass/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glowTTS_ASR_conformer_x_vector_v2/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/ddi/glowTTS_100ep_pe1_radam1e-9/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/ddi/glowTTS_100ep_pe1/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/ddi/glowTTS/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/ddi/glowTTS_ASR_conformer_two_forward_pass/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/ddi/glowTTS_x_vector/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/ddi/glowTTS_ASR_conformer_x_vector/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_spec_augment_ctc_scale_0.1/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + "['/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_ctc_scale_0.1/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2_ctc_scale_0.1/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_ctc_scale_0.1/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_spec_augment/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_ctc_scale_0.1/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_ctc_scale_0.1/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_blstm_x_vector/tuning/lm_4.5/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_blstm_x_vector/tuning/lm_3.0/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_blstm_x_vector/tuning/lm_2.0/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_blstm_x_vector/tuning/lm_3.5/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_blstm_x_vector/tuning/lm_2.5/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_blstm_x_vector/tuning/lm_4.0/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_blstm_x_vector/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_spec_augment_ctc_scale_0.1/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment_ctc_scale_0.1/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_ctc_scale_0.1/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_blstm_x_vector_specaug/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_strong_conformer_weak_specaug_ctc_scale_0.1/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_strong_conformer_weak_specaug_ctc_scale_0.1/tuned/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_strong_conformer_weak_specaug_ctc_scale_0.1/tuning/lm_3.5_ps_0.5/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_strong_conformer_weak_specaug_ctc_scale_0.1/tuning/lm_3.0_ps_0.3/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_strong_conformer_weak_specaug_ctc_scale_0.1/tuning/lm_3.5_ps_0/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_strong_conformer_weak_specaug_ctc_scale_0.1/tuning/lm_4.5_ps_0.5/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_strong_conformer_weak_specaug_ctc_scale_0.1/tuning/lm_4.0_ps_0.3/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_strong_conformer_weak_specaug_ctc_scale_0.1/tuning/lm_2.0_ps_0.3/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_strong_conformer_weak_specaug_ctc_scale_0.1/tuning/lm_3.0_ps_0/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_strong_conformer_weak_specaug_ctc_scale_0.1/tuning/lm_2.5_ps_0.5/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_strong_conformer_weak_specaug_ctc_scale_0.1/tuning/lm_3.5_ps_0.3/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_strong_conformer_weak_specaug_ctc_scale_0.1/tuning/lm_2.0_ps_0/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_strong_conformer_weak_specaug_ctc_scale_0.1/tuning/lm_4.5_ps_0/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_strong_conformer_weak_specaug_ctc_scale_0.1/tuning/lm_3.0_ps_0.5/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_strong_conformer_weak_specaug_ctc_scale_0.1/tuning/lm_4.5_ps_0.3/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_strong_conformer_weak_specaug_ctc_scale_0.1/tuning/lm_4.0_ps_0.5/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_strong_conformer_weak_specaug_ctc_scale_0.1/tuning/lm_2.0_ps_0.5/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_strong_conformer_weak_specaug_ctc_scale_0.1/tuning/lm_2.5_ps_0.3/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_strong_conformer_weak_specaug_ctc_scale_0.1/tuning/lm_4.0_ps_0/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_strong_conformer_weak_specaug_ctc_scale_0.1/tuning/lm_2.5_ps_0/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_ctc_scale_0.1/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_ctc_scale_0.1/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2/enc768/400ep/gin512/dec_drop_0.05/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2/enc768/400ep/dec_drop_0.05/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2/enc768/200ep/dec_drop_0.0/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2/enc768/200ep/dec_drop_0.05/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2/enc768/100ep/dec_drop_0.05/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2/enc768/200ep_long_cooldown/dec_drop_0.0/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2/enc768/200ep_long_cooldown/dec_drop_0.05/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2/enc192/200ep/dec_drop_0.05/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2/enc192/200ep/dec_drop_0.0/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc192/200ep/dec_drop_0.05_epsilon_1e-8/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_strong_conformer_weak_specaug_ctc_scale_1.0/tuning/lm_3.5_ps_0/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_strong_conformer_weak_specaug_ctc_scale_1.0/tuning/lm_3.0_ps_0.5/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_strong_conformer_weak_specaug_ctc_scale_1.0/tuning/lm_3.5_ps_0.3/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_strong_conformer_weak_specaug_ctc_scale_1.0/tuning/lm_4.0_ps_0.5/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_strong_conformer_weak_specaug_ctc_scale_1.0/tuning/lm_4.5_ps_0.3/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_strong_conformer_weak_specaug_ctc_scale_1.0/tuning/lm_2.5_ps_0.3/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_strong_conformer_weak_specaug_ctc_scale_1.0/tuning/lm_2.0_ps_0.5/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_strong_conformer_weak_specaug_ctc_scale_1.0/tuning/lm_3.0_ps_0/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_strong_conformer_weak_specaug_ctc_scale_1.0/tuning/lm_3.0_ps_0.3/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_strong_conformer_weak_specaug_ctc_scale_1.0/tuning/lm_3.5_ps_0.5/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_strong_conformer_weak_specaug_ctc_scale_1.0/tuning/lm_2.0_ps_0/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_strong_conformer_weak_specaug_ctc_scale_1.0/tuning/lm_4.0_ps_0.3/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_strong_conformer_weak_specaug_ctc_scale_1.0/tuning/lm_4.5_ps_0/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_strong_conformer_weak_specaug_ctc_scale_1.0/tuning/lm_4.5_ps_0.5/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_strong_conformer_weak_specaug_ctc_scale_1.0/tuning/lm_4.0_ps_0/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_strong_conformer_weak_specaug_ctc_scale_1.0/tuning/lm_2.5_ps_0.5/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_strong_conformer_weak_specaug_ctc_scale_1.0/tuning/lm_2.0_ps_0.3/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_strong_conformer_weak_specaug_ctc_scale_1.0/tuning/lm_2.5_ps_0/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_strong_conformer_weak_specaug_ctc_scale_1.0/tuned/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_strong_conformer_weak_specaug_ctc_scale_1.0/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_spec_augment/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/ddi/glowTTS_100ep_pe1/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/ddi/glowTTS/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/ddi/glowTTS_100ep_pe1_radam1e-9/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/ddi/glowTTS_x_vector/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glowTTS_100ep_pe1/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glowTTS_ASR_conformer_x_vector_v2/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2_multiscale/enc768/200ep/dec_drop_0.05/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2_conformer_coupling/enc768/200ep/dec_drop_0.05/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2_logdist_loss/ed_scale_0.1/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS_batch_norm/enc768/200ep/dec_drop_0.05/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector/enc768/100ep/dec_drop_0.05/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc192/100ep/dec_drop_0.05/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc192/200ep/dec_drop_0.05/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc192/200ep/dec_drop_0.05_epsilon_1e-8/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc192/200ep/dec_drop_0.0/grad_clip_10/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc192/100ep/dec_drop_0.05/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc768/400ep/grad_clip_10/dec_drop_0.05/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc768/400ep/gin512/grad_clip_10/dec_drop_0.05/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc768/200ep/dec_drop_0.0/epsilon_1e-8/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc768/200ep/dec_drop_0.0/grad_clip_10/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc768/200ep/dec_drop_0.05_epsilon_1e-8/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc768/200ep/dec_drop_0.05/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc192/400ep/grad_clip_10/dec_drop_0.05/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc768/100ep/dec_drop_0.00/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc768/100ep/dec_drop_0.05/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2_logdist_loss/ed_scale_0.1/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2_logdist_loss_grad_clip_10/ed_scale_1.0/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc768/200ep/dec_drop_0.05/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc768/200ep/dec_drop_0.0/grad_clip_10/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc768/200ep/dec_drop_0.0/epsilon_1e-8/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc768/200ep/dec_drop_0.05_epsilon_1e-8/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc768/400ep/gin512/grad_clip_10/dec_drop_0.05/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc768/400ep/grad_clip_10/dec_drop_0.05/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2/enc768/200ep/dec_drop_0.0/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2/enc768/200ep/dec_drop_0.05/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2/enc768/400ep/gin512/dec_drop_0.05/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2/enc768/400ep/dec_drop_0.05/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2/enc768/200ep_long_cooldown/dec_drop_0.05/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2/enc768/200ep_long_cooldown/dec_drop_0.0/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2/enc768/100ep/dec_drop_0.05/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2/enc192/200ep/dec_drop_0.0/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2/enc192/200ep/dec_drop_0.05/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2_logdist_loss_grad_clip_10/ed_scale_0.1/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS_simple_encoder/12cb/200ep/dec_drop_0.05/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2_logdist_loss_grad_clip_10/ed_scale_1.0/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS_simple_encoder/20cb/200ep/dec_drop_0.05/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2_conformer_coupling/enc768/200ep/dec_drop_0.05/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2_multiscale/enc768/200ep/dec_drop_0.05/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS_batch_norm/enc768/200ep/dec_drop_0.05/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector/enc768/100ep/dec_drop_0.05/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector/specaug/ce_ls_0.1/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector/specaug/ce_ls_1/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector/200ep/basic_init/no_specaug/tts_target_size/ce_ls_0.1/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector/200ep/tts_pretrained/no_specaug/tts_target_size/ce_ls_0.1/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector/no_specaug/ce_ls_1/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector/no_specaug/ce_ls_0.1/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_cnn_x_vector/tts_pretrained/no_specaug/tts_target_size/ce_ls_0.1/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_cnn_x_vector/basic_init/no_specaug/tts_target_size/ce_ls_0.1/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector_v2/mean_only/200ep/basic_init/no_specaug/tts_target_size/ce_ls_0.1/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector_v2/200ep/tts_pretrained/no_specaug/tts_target_size/ce_ls_0.1/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector_v2/200ep/basic_init/no_specaug/tts_target_size/ce_ls_1.0/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector_v2/200ep/basic_init/no_specaug/tts_target_size/ce_ls_0.1/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_cnn_x_vector_v2/tts_pretrained/ce_ls_0.1/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_cnn_x_vector_v2/tts_pretrained/ce_ls_0.01/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_cnn_x_vector_v2/basic_init/ce_ls_0.1/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_cnn_x_vector_v2/basic_init/ce_ls_0.01/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_ffn_x_vector_v2/200ep/tts_pretrained/ce_ls_1.0/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/glowTTS_simple_encoder/12cb/200ep/dec_drop_0.05/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_ffn_x_vector_v2/200ep/tts_pretrained/ce_ls_0.1/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_ffn_x_vector_v2/200ep/basic_init/ce_ls_1.0/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_ffn_x_vector_v2/200ep/tts_pretrained/ce_ls_1.0/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_ffn_x_vector_v2/200ep/basic_init/ce_ls_0.1/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_ffn_x_vector/200ep/basic_init/ce_ls_0.1/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_ffn_x_vector_v2/200ep/basic_init/ce_ls_1.0/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_cnn_x_vector_v2/tts_pretrained/ce_ls_0.01/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_cnn_x_vector_v2/tts_pretrained/ce_ls_0.1/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_cnn_x_vector_v2/basic_init/ce_ls_0.01/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_cnn_x_vector_v2/basic_init/ce_ls_0.1/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_ffn_x_vector/200ep/tts_pretrained/ce_ls_0.1/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_ffn_x_vector/200ep/basic_init/ce_ls_0.1/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_cnn_x_vector/tts_pretrained/ce_ls_0.1/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_cnn_x_vector/basic_init/ce_ls_0.1/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/tts_architecture/glow_tts/raw_audio/glowTTS/enc256/not_silence_preprocessed/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/tts_architecture/glow_tts/raw_audio/glowTTS/enc768/with_sigma/not_silence_preprocessed/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/tts_architecture/glow_tts/raw_audio/glowTTS/enc768/with_sigma/silence_preprocessed/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/tts_architecture/glow_tts/raw_audio/glowTTS/enc768/mean_only/silence_preprocessed/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/tts_architecture/glow_tts/raw_audio/glowTTS/enc768/mean_only/not_silence_preprocessed/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/tts_architecture/glow_tts/raw_audio/glowTTS/enc192/100ep/not_silence_preprocessed/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/tts_architecture/glow_tts/raw_audio/glowTTS/enc192/100ep/silence_preprocessed/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_cnn_x_vector/tts_pretrained/no_specaug/tts_target_size/ce_ls_0.1/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_cnn_x_vector/basic_init/no_specaug/tts_target_size/ce_ls_0.1/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector_v2/mean_only/200ep/basic_init/no_specaug/tts_target_size/ce_ls_0.1/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector_v2/200ep/tts_pretrained/no_specaug/tts_target_size/ce_ls_0.1/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector_v2/200ep/basic_init/no_specaug/tts_target_size/ce_ls_0.1/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector_v2/200ep/basic_init/no_specaug/tts_target_size/ce_ls_1.0/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector/200ep/basic_init/no_specaug/tts_target_size/ce_ls_0.1/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector/200ep/tts_pretrained/no_specaug/tts_target_size/ce_ls_0.1/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector/no_specaug/ce_ls_1/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector/no_specaug/ce_ls_0.1/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector/specaug/ce_ls_1/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector/specaug/ce_ls_0.1/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/tts_architecture/glow_tts/raw_audio/glowTTS_simple_encoder/silence_preprocessed/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/tts_architecture/glow_tts/raw_audio/glowTTS/enc192/200ep/long_cooldown/not_silence_preprocessed/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/tts_architecture/glow_tts/raw_audio/glowTTS/enc192/200ep/long_cooldown/silence_preprocessed/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/tts_architecture/glow_tts/raw_audio/glowTTS_simple_encoder/silence_preprocessed/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", - " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/tts_architecture/glow_tts/raw_audio/glowTTS_ddi_actnorm/enc192/100ep/not_silence_preprocessed/LR_scheduled/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/tts_architecture/glow_tts/raw_audio/glowTTS/enc192/100ep/not_silence_preprocessed/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/tts_architecture/glow_tts/raw_audio/glowTTS/enc192/100ep/silence_preprocessed/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/tts_architecture/glow_tts/raw_audio/glowTTS/enc768/mean_only/not_silence_preprocessed/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/tts_architecture/glow_tts/raw_audio/glowTTS/enc768/mean_only/silence_preprocessed/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/tts_architecture/glow_tts/raw_audio/glowTTS/enc768/with_sigma/silence_preprocessed/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/tts_architecture/glow_tts/raw_audio/glowTTS/enc768/with_sigma/not_silence_preprocessed/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/tts_architecture/glow_tts/raw_audio/glowTTS/enc256/not_silence_preprocessed/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/tts_architecture/glow_tts/raw_audio/glowTTS_nar_taco_encoder/silence_preprocessed/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/tts_architecture/glow_tts/raw_audio/glowTTS_nar_taco_encoder/not_silence_preprocessed/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/tts_architecture/glow_tts/raw_audio/glowTTS_nar_taco_encoder_no_blstm/silence_preprocessed/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/tts_architecture/glow_tts/raw_audio/glowTTS_x_vector/enc768/100ep/not_silence_preprocessed/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/tts_architecture/glow_tts/raw_audio/glowTTS_x_vector/enc192/100ep/not_silence_preprocessed/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/tts_architecture/glow_tts/raw_audio/glowTTS_nar_taco_encoder/not_silence_preprocessed/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/tts_architecture/glow_tts/raw_audio/glowTTS_nar_taco_encoder/silence_preprocessed/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/tts_architecture/glow_tts/raw_audio/glowTTS_nar_taco_encoder_no_blstm/silence_preprocessed/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer']" + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/tts_architecture/glow_tts/raw_audio/glowTTS_ddi_actnorm/enc192/100ep/not_silence_preprocessed/LR_scheduled/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer']" ] }, - "execution_count": 2, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } @@ -142,7 +188,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 4, "metadata": {}, "outputs": [ { @@ -155,10 +201,10 @@ { "data": { "text/plain": [ - "'100'" + "'250'" ] }, - "execution_count": 3, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } @@ -180,16 +226,16 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "(93, 93, 93, 93, 93, 93, 93, 93)" + "(139, 139, 139, 139, 139, 139, 139, 139)" ] }, - "execution_count": 4, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -228,7 +274,7 @@ " else:\n", " nisqa_confidence.append(None)\n", "\n", - " folders = [\"librispeech_glow_asr\", \"joint_training/default\", \"joint_training/conformer_coupling\", \"joint_training/given_alignments\", \"tts_architecture\"]\n", + " folders = [\"ASR_only\", \"joint_training/default\", \"joint_training/conformer_coupling\", \"joint_training/given_alignments\", \"TTS_only/v1\", \"TTS_only/v2\"]\n", " found = False\n", " for folder in folders:\n", " if folder in f:\n", @@ -306,7 +352,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 6, "metadata": {}, "outputs": [ { @@ -357,23 +403,23 @@ " \n", " \n", " \n", - " joint_training/conformer_coupling\n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glowTTS_100ep_pe1/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer\n", - " 12.4\n", - " 2.936271\n", - " NaN\n", - " 100\n", + " 
joint_training/default\n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_ctc_scale_0.1/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer\n", + " 13.3\n", + " 3.123151\n", + " 0.019940\n", + " 250\n", " 0.05\n", " False\n", " 768\n", - " [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05]\n", + " [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08]\n", " {'class': 'adam', 'epsilon': 1e-08}\n", " \n", " \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glowTTS_ASR_conformer_two_forward_pass/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer\n", - " 20.9\n", - " 2.449569\n", - " NaN\n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer\n", + " 9.5\n", + " 2.746789\n", + " 0.021347\n", " 250\n", " 0.05\n", " False\n", @@ -382,10 +428,10 @@ " {'class': 'adam', 'epsilon': 1e-08}\n", " \n", " \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glowTTS_ASR_conformer_x_vector_v2/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer\n", - " 5.2\n", - " 2.136262\n", - " NaN\n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2_ctc_scale_0.1/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer\n", + " 11.2\n", + " 3.176109\n", + " 0.023388\n", " 250\n", " 0.05\n", " False\n", @@ -394,27 +440,27 @@ " {'class': 'adam', 'epsilon': 1e-08}\n", " \n", " \n", - " 
/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/ddi/glowTTS_100ep_pe1_radam1e-9/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer\n", - " 14.7\n", - " 2.588368\n", - " NaN\n", - " 100\n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_blstm_x_vector/tuning/lm_4.5/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer\n", + " 16.0\n", + " 3.109557\n", + " 0.021113\n", + " 250\n", " 0.05\n", " False\n", " 768\n", - " [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05]\n", - " {'class': 'radam', 'epsilon': 1e-09}\n", + " [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08]\n", + " {'class': 'adam', 'epsilon': 1e-08}\n", " \n", " \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/ddi/glowTTS_100ep_pe1/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer\n", - " 14.4\n", - " 2.718031\n", - " NaN\n", - " 100\n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_blstm_x_vector/tuning/lm_3.0/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer\n", + " 16.0\n", + " 3.109557\n", + " 0.021113\n", + " 250\n", " 0.05\n", " False\n", " 768\n", - " [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05]\n", + " [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08]\n", " {'class': 'adam', 'epsilon': 1e-08}\n", " \n", " \n", @@ -432,6 +478,30 @@ " \n", " \n", " tts_architecture\n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/tts_architecture/glow_tts/raw_audio/glowTTS_nar_taco_encoder/not_silence_preprocessed/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer\n", + " 25.2\n", + " 3.463099\n", + " NaN\n", + " 200\n", + " 0.00\n", + " -\n", + " -\n", + " [0: 1e-05, 
49: 0.0005, 50: 0.0005, 200: 1e-05]\n", + " {'class': 'adam', 'epsilon': 1e-09}\n", + " \n", + " \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/tts_architecture/glow_tts/raw_audio/glowTTS_nar_taco_encoder_no_blstm/silence_preprocessed/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer\n", + " 105.6\n", + " 3.165388\n", + " NaN\n", + " 100\n", + " 0.00\n", + " -\n", + " -\n", + " [0: 0.0001, 49: 0.0005, 50: 0.0001, 100: 1e-06]\n", + " {'class': 'radam', 'epsilon': 1e-09}\n", + " \n", + " \n", " /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/tts_architecture/glow_tts/raw_audio/glowTTS_x_vector/enc768/100ep/not_silence_preprocessed/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer\n", " 20.9\n", " 3.312732\n", @@ -456,177 +526,153 @@ " {'class': 'adam', 'epsilon': 1e-09}\n", " \n", " \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/tts_architecture/glow_tts/raw_audio/glowTTS_nar_taco_encoder/not_silence_preprocessed/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer\n", - " 25.2\n", - " 3.463099\n", - " NaN\n", - " 200\n", - " 0.00\n", - " -\n", - " -\n", - " [0: 1e-05, 49: 0.0005, 50: 0.0005, 200: 1e-05]\n", - " {'class': 'adam', 'epsilon': 1e-09}\n", - " \n", - " \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/tts_architecture/glow_tts/raw_audio/glowTTS_nar_taco_encoder/silence_preprocessed/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer\n", - " 25.9\n", - " 3.391651\n", - " NaN\n", - " 200\n", - " 0.00\n", - " -\n", - " -\n", - " [0: 1e-05, 49: 0.0005, 50: 0.0005, 200: 1e-05]\n", - " {'class': 'adam', 'epsilon': 1e-09}\n", - " \n", - " \n", - " 
/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/tts_architecture/glow_tts/raw_audio/glowTTS_nar_taco_encoder_no_blstm/silence_preprocessed/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer\n", - " 105.6\n", - " 3.165388\n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/tts_architecture/glow_tts/raw_audio/glowTTS_ddi_actnorm/enc192/100ep/not_silence_preprocessed/LR_scheduled/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer\n", + " 100.0\n", + " 3.115826\n", " NaN\n", " 100\n", " 0.00\n", - " -\n", - " -\n", - " [0: 0.0001, 49: 0.0005, 50: 0.0001, 100: 1e-06]\n", - " {'class': 'radam', 'epsilon': 1e-09}\n", + " False\n", + " 192\n", + " [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05]\n", + " {'class': 'adam', 'epsilon': 1e-09}\n", " \n", " \n", "\n", - "

93 rows × 9 columns

\n", + "

139 rows × 9 columns

\n", "" ], "text/plain": [ - " sWER \\\n", - "Group Experiment \n", - "joint_training/conformer_coupling /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 12.4 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 20.9 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 5.2 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 14.7 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 14.4 \n", - "... ... \n", - "tts_architecture /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 20.9 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 95.4 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 25.2 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 25.9 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 105.6 \n", + " sWER \\\n", + "Group Experiment \n", + "joint_training/default /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 13.3 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 9.5 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 11.2 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 16.0 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 16.0 \n", + "... ... \n", + "tts_architecture /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 25.2 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 105.6 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 20.9 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 95.4 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 100.0 \n", "\n", - " autoMOS \\\n", - "Group Experiment \n", - "joint_training/conformer_coupling /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 2.936271 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 2.449569 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 2.136262 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 2.588368 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 2.718031 \n", - "... ... 
\n", - "tts_architecture /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 3.312732 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 2.633865 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 3.463099 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 3.391651 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 3.165388 \n", + " autoMOS \\\n", + "Group Experiment \n", + "joint_training/default /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 3.123151 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 2.746789 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 3.176109 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 3.109557 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 3.109557 \n", + "... ... \n", + "tts_architecture /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 3.463099 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 3.165388 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 3.312732 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 2.633865 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 3.115826 \n", "\n", - " autoMOS confidence \\\n", - "Group Experiment \n", - "joint_training/conformer_coupling /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... NaN \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... NaN \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... NaN \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... NaN \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... NaN \n", - "... ... \n", - "tts_architecture /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... NaN \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... NaN \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... NaN \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... NaN \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 
NaN \n", + " autoMOS confidence \\\n", + "Group Experiment \n", + "joint_training/default /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 0.019940 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 0.021347 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 0.023388 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 0.021113 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 0.021113 \n", + "... ... \n", + "tts_architecture /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... NaN \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... NaN \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... NaN \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... NaN \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... NaN \n", "\n", - " num_epochs \\\n", - "Group Experiment \n", - "joint_training/conformer_coupling /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 100 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 250 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 250 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 100 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 100 \n", - "... ... \n", - "tts_architecture /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 100 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 100 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 200 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 200 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 100 \n", + " num_epochs \\\n", + "Group Experiment \n", + "joint_training/default /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 250 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 250 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 250 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 250 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 250 \n", + "... ... 
\n", + "tts_architecture /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 200 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 100 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 100 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 100 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 100 \n", "\n", - " decoder dropout \\\n", - "Group Experiment \n", - "joint_training/conformer_coupling /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 0.05 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 0.05 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 0.05 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 0.05 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 0.05 \n", - "... ... \n", - "tts_architecture /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 0.00 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 0.00 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 0.00 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 0.00 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 0.00 \n", + " decoder dropout \\\n", + "Group Experiment \n", + "joint_training/default /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 0.05 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 0.05 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 0.05 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 0.05 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 0.05 \n", + "... ... \n", + "tts_architecture /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 0.00 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 0.00 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 0.00 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 0.00 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 
0.00 \n", "\n", - " mean only \\\n", - "Group Experiment \n", - "joint_training/conformer_coupling /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... False \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... False \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... False \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... False \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... False \n", - "... ... \n", - "tts_architecture /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... False \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... False \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... - \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... - \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... - \n", + " mean only \\\n", + "Group Experiment \n", + "joint_training/default /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... False \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... False \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... False \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... False \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... False \n", + "... ... \n", + "tts_architecture /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... - \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... - \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... False \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... False \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... False \n", "\n", - " encoder channels \\\n", - "Group Experiment \n", - "joint_training/conformer_coupling /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 768 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 768 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 768 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 768 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 768 \n", - "... ... 
\n", - "tts_architecture /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 768 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 192 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... - \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... - \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... - \n", + " encoder channels \\\n", + "Group Experiment \n", + "joint_training/default /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 768 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 768 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 768 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 768 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 768 \n", + "... ... \n", + "tts_architecture /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... - \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... - \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 768 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 192 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 192 \n", "\n", - " LR \\\n", - "Group Experiment \n", - "joint_training/conformer_coupling /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] \n", - "... ... \n", - "tts_architecture /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 
[0: 1e-05, 49: 0.0005, 50: 0.0005, 200: 1e-05] \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... [0: 1e-05, 49: 0.0005, 50: 0.0005, 200: 1e-05] \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... [0: 0.0001, 49: 0.0005, 50: 0.0001, 100: 1e-06] \n", + " LR \\\n", + "Group Experiment \n", + "joint_training/default /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] \n", + "... ... \n", + "tts_architecture /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... [0: 1e-05, 49: 0.0005, 50: 0.0005, 200: 1e-05] \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... [0: 0.0001, 49: 0.0005, 50: 0.0001, 100: 1e-06] \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] \n", "\n", - " Optimizer \n", - "Group Experiment \n", - "joint_training/conformer_coupling /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... {'class': 'adam', 'epsilon': 1e-08} \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... {'class': 'adam', 'epsilon': 1e-08} \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... {'class': 'adam', 'epsilon': 1e-08} \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... {'class': 'radam', 'epsilon': 1e-09} \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 
{'class': 'adam', 'epsilon': 1e-08} \n", - "... ... \n", - "tts_architecture /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... {'class': 'adam', 'epsilon': 1e-09} \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... {'class': 'adam', 'epsilon': 1e-09} \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... {'class': 'adam', 'epsilon': 1e-09} \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... {'class': 'adam', 'epsilon': 1e-09} \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... {'class': 'radam', 'epsilon': 1e-09} \n", + " Optimizer \n", + "Group Experiment \n", + "joint_training/default /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... {'class': 'adam', 'epsilon': 1e-08} \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... {'class': 'adam', 'epsilon': 1e-08} \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... {'class': 'adam', 'epsilon': 1e-08} \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... {'class': 'adam', 'epsilon': 1e-08} \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... {'class': 'adam', 'epsilon': 1e-08} \n", + "... ... \n", + "tts_architecture /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... {'class': 'adam', 'epsilon': 1e-09} \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... {'class': 'radam', 'epsilon': 1e-09} \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... {'class': 'adam', 'epsilon': 1e-09} \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... {'class': 'adam', 'epsilon': 1e-09} \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 
{'class': 'adam', 'epsilon': 1e-09} \n", "\n", - "[93 rows x 9 columns]" + "[139 rows x 9 columns]" ] }, - "execution_count": 5, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -643,7 +689,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 12, "metadata": {}, "outputs": [ { @@ -708,236 +754,236 @@ " \n", " \n", " \n", - " joint_training/conformer_coupling\n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glowTTS_100ep_pe1/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer\n", - " 12.4\n", - " 2.936271\n", - " NaN\n", - " 100\n", + " joint_training/default\n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_ctc_scale_0.1/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer\n", + " 13.3\n", + " 3.123151\n", + " 0.019940\n", + " 250\n", " 0.05\n", " False\n", " 768\n", - " [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05]\n", + " [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08]\n", " {'class': 'adam', 'epsilon': 1e-08}\n", - " -0.826781\n", - " -0.846025\n", - " None\n", - " 0.382039\n", - " 0.353906\n", - " None\n", - " False\n", + " -0.753013\n", + " -0.694853\n", + " -0.771774\n", + " 0.417359\n", + " 0.565613\n", + " 0.399888\n", + " True\n", " \n", " \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glowTTS_ASR_conformer_two_forward_pass/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer\n", - " 20.9\n", - " 2.449569\n", - " NaN\n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer\n", 
+ " 9.5\n", + " 2.746789\n", + " 0.021347\n", " 250\n", " 0.05\n", " False\n", " 768\n", " [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08]\n", " {'class': 'adam', 'epsilon': 1e-08}\n", - " False\n", - " False\n", - " False\n", - " False\n", - " False\n", - " False\n", - " False\n", + " -0.610471\n", + " -0.555647\n", + " -0.629746\n", + " 0.416831\n", + " 0.583815\n", + " 0.403638\n", + " True\n", " \n", " \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/no_ddi/glowTTS_ASR_conformer_x_vector_v2/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer\n", - " 5.2\n", - " 2.136262\n", - " NaN\n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2_ctc_scale_0.1/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer\n", + " 11.2\n", + " 3.176109\n", + " 0.023388\n", " 250\n", " 0.05\n", " False\n", " 768\n", " [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08]\n", " {'class': 'adam', 'epsilon': 1e-08}\n", - " False\n", - " False\n", - " False\n", - " False\n", - " False\n", - " False\n", - " False\n", + " -0.758909\n", + " -0.694360\n", + " -0.778751\n", + " 0.400894\n", + " 0.555330\n", + " 0.379635\n", + " True\n", " \n", " \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/ddi/glowTTS_100ep_pe1_radam1e-9/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer\n", - " 14.7\n", - " 2.588368\n", - " NaN\n", - " 100\n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_blstm_x_vector/tuning/lm_4.5/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer\n", + " 16.0\n", + " 3.109557\n", + " 0.021113\n", + " 250\n", " 0.05\n", " False\n", " 768\n", - " [0: 1e-05, 
49: 0.0005, 50: 0.0005, 100: 1e-05]\n", - " {'class': 'radam', 'epsilon': 1e-09}\n", - " -0.82573\n", - " -0.844433\n", - " None\n", - " 0.384814\n", - " 0.355806\n", - " None\n", - " False\n", + " [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08]\n", + " {'class': 'adam', 'epsilon': 1e-08}\n", + " -0.676933\n", + " -0.621956\n", + " -0.692515\n", + " 0.328548\n", + " 0.477463\n", + " 0.302857\n", + " True\n", " \n", " \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/conformer_coupling/raw_audio/ddi/glowTTS_100ep_pe1/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer\n", - " 14.4\n", - " 2.718031\n", - " NaN\n", - " 100\n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_blstm_x_vector/tuning/lm_3.0/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer\n", + " 16.0\n", + " 3.109557\n", + " 0.021113\n", + " 250\n", " 0.05\n", " False\n", " 768\n", - " [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05]\n", + " [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08]\n", " {'class': 'adam', 'epsilon': 1e-08}\n", - " -0.825686\n", - " -0.844577\n", - " None\n", - " 0.389056\n", - " 0.360854\n", - " None\n", - " False\n", + " -0.676933\n", + " -0.621956\n", + " -0.692515\n", + " 0.328548\n", + " 0.477463\n", + " 0.302857\n", + " True\n", " \n", " \n", "\n", "" ], "text/plain": [ - " sWER \\\n", - "Group Experiment \n", - "joint_training/conformer_coupling /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 12.4 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 20.9 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 5.2 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 14.7 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 14.4 \n", + " sWER \\\n", + "Group Experiment \n", + "joint_training/default /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 
13.3 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 9.5 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 11.2 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 16.0 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 16.0 \n", "\n", - " autoMOS \\\n", - "Group Experiment \n", - "joint_training/conformer_coupling /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 2.936271 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 2.449569 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 2.136262 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 2.588368 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 2.718031 \n", + " autoMOS \\\n", + "Group Experiment \n", + "joint_training/default /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 3.123151 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 2.746789 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 3.176109 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 3.109557 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 3.109557 \n", "\n", - " autoMOS confidence \\\n", - "Group Experiment \n", - "joint_training/conformer_coupling /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... NaN \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... NaN \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... NaN \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... NaN \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... NaN \n", + " autoMOS confidence \\\n", + "Group Experiment \n", + "joint_training/default /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 0.019940 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 0.021347 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 0.023388 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 0.021113 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 
0.021113 \n", "\n", - " num_epochs \\\n", - "Group Experiment \n", - "joint_training/conformer_coupling /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 100 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 250 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 250 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 100 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 100 \n", + " num_epochs \\\n", + "Group Experiment \n", + "joint_training/default /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 250 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 250 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 250 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 250 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 250 \n", "\n", - " decoder dropout \\\n", - "Group Experiment \n", - "joint_training/conformer_coupling /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 0.05 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 0.05 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 0.05 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 0.05 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 0.05 \n", + " decoder dropout \\\n", + "Group Experiment \n", + "joint_training/default /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 0.05 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 0.05 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 0.05 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 0.05 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 0.05 \n", "\n", - " mean only \\\n", - "Group Experiment \n", - "joint_training/conformer_coupling /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... False \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... False \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... False \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 
False \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... False \n", + " mean only \\\n", + "Group Experiment \n", + "joint_training/default /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... False \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... False \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... False \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... False \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... False \n", "\n", - " encoder channels \\\n", - "Group Experiment \n", - "joint_training/conformer_coupling /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 768 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 768 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 768 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 768 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 768 \n", + " encoder channels \\\n", + "Group Experiment \n", + "joint_training/default /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 768 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 768 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 768 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 768 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 768 \n", "\n", - " LR \\\n", - "Group Experiment \n", - "joint_training/conformer_coupling /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 
[0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] \n", + " LR \\\n", + "Group Experiment \n", + "joint_training/default /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] \n", "\n", - " Optimizer \\\n", - "Group Experiment \n", - "joint_training/conformer_coupling /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... {'class': 'adam', 'epsilon': 1e-08} \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... {'class': 'adam', 'epsilon': 1e-08} \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... {'class': 'adam', 'epsilon': 1e-08} \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... {'class': 'radam', 'epsilon': 1e-09} \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... {'class': 'adam', 'epsilon': 1e-08} \n", + " Optimizer \\\n", + "Group Experiment \n", + "joint_training/default /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... {'class': 'adam', 'epsilon': 1e-08} \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... {'class': 'adam', 'epsilon': 1e-08} \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... {'class': 'adam', 'epsilon': 1e-08} \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... {'class': 'adam', 'epsilon': 1e-08} \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... {'class': 'adam', 'epsilon': 1e-08} \n", "\n", - " MLE \\\n", - "Group Experiment \n", - "joint_training/conformer_coupling /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... -0.826781 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 
False \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... False \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... -0.82573 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... -0.825686 \n", + " MLE \\\n", + "Group Experiment \n", + "joint_training/default /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... -0.753013 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... -0.610471 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... -0.758909 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... -0.676933 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... -0.676933 \n", "\n", - " dev MLE \\\n", - "Group Experiment \n", - "joint_training/conformer_coupling /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... -0.846025 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... False \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... False \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... -0.844433 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... -0.844577 \n", + " dev MLE \\\n", + "Group Experiment \n", + "joint_training/default /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... -0.694853 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... -0.555647 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... -0.694360 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... -0.621956 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... -0.621956 \n", "\n", - " devtrain MLE \\\n", - "Group Experiment \n", - "joint_training/conformer_coupling /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... None \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... False \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... False \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... None \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 
None \n", + " devtrain MLE \\\n", + "Group Experiment \n", + "joint_training/default /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... -0.771774 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... -0.629746 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... -0.778751 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... -0.692515 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... -0.692515 \n", "\n", - " DP loss \\\n", - "Group Experiment \n", - "joint_training/conformer_coupling /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 0.382039 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... False \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... False \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 0.384814 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 0.389056 \n", + " DP loss \\\n", + "Group Experiment \n", + "joint_training/default /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 0.417359 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 0.416831 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 0.400894 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 0.328548 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 0.328548 \n", "\n", - " DP dev loss \\\n", - "Group Experiment \n", - "joint_training/conformer_coupling /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 0.353906 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... False \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... False \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 0.355806 \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 0.360854 \n", + " DP dev loss \\\n", + "Group Experiment \n", + "joint_training/default /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 0.565613 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 0.583815 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 
0.555330 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 0.477463 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 0.477463 \n", "\n", - " DP devtrain loss \\\n", - "Group Experiment \n", - "joint_training/conformer_coupling /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... None \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... False \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... False \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... None \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... None \n", + " DP devtrain loss \\\n", + "Group Experiment \n", + "joint_training/default /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 0.399888 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 0.403638 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 0.379635 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 0.302857 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 0.302857 \n", "\n", - " Joint \n", - "Group Experiment \n", - "joint_training/conformer_coupling /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... False \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... False \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... False \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... False \n", - " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... False " + " Joint \n", + "Group Experiment \n", + "joint_training/default /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... True \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... True \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... True \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... True \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 
True " ] }, - "execution_count": 13, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -999,6 +1045,311 @@ "df.head(5)" ] }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sWERautoMOSautoMOS confidencenum_epochsdecoder dropoutmean onlyencoder channelsLROptimizerMLEdev MLEdevtrain MLEDP lossDP dev lossDP devtrain lossJoint
GroupExperiment
joint_training/default/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_ctc_scale_0.1/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer13.33.1231510.0199402500.05False768[0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08]{'class': 'adam', 'epsilon': 1e-08}-0.753013-0.694853-0.7717740.4173590.5656130.399888True
/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer9.52.7467890.0213472500.05False768[0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08]{'class': 'adam', 'epsilon': 1e-08}-0.610471-0.555647-0.6297460.4168310.5838150.403638True
/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2_ctc_scale_0.1/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer11.23.1761090.0233882500.05False768[0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08]{'class': 'adam', 'epsilon': 1e-08}-0.758909-0.694360-0.7787510.4008940.5553300.379635True
/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_blstm_x_vector/tuning/lm_4.5/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer16.03.1095570.0211132500.05False768[0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08]{'class': 'adam', 'epsilon': 1e-08}-0.676933-0.621956-0.6925150.3285480.4774630.302857True
/u/lukas.rilling/experiments/glow_tts_asr_v2/output/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_blstm_x_vector/tuning/lm_3.0/tts_eval_gl/test-clean/swer/ls960eow_phon_ctc_50eps_fastsearch/sclite/wer16.03.1095570.0211132500.05False768[0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08]{'class': 'adam', 'epsilon': 1e-08}-0.676933-0.621956-0.6925150.3285480.4774630.302857True
\n", + "
" + ], + "text/plain": [ + " sWER \\\n", + "Group Experiment \n", + "joint_training/default /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 13.3 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 9.5 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 11.2 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 16.0 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 16.0 \n", + "\n", + " autoMOS \\\n", + "Group Experiment \n", + "joint_training/default /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 3.123151 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 2.746789 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 3.176109 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 3.109557 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 3.109557 \n", + "\n", + " autoMOS confidence \\\n", + "Group Experiment \n", + "joint_training/default /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 0.019940 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 0.021347 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 0.023388 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 0.021113 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 0.021113 \n", + "\n", + " num_epochs \\\n", + "Group Experiment \n", + "joint_training/default /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 250 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 250 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 250 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 250 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 250 \n", + "\n", + " decoder dropout \\\n", + "Group Experiment \n", + "joint_training/default /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 0.05 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 0.05 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 0.05 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 
0.05 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 0.05 \n", + "\n", + " mean only \\\n", + "Group Experiment \n", + "joint_training/default /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... False \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... False \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... False \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... False \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... False \n", + "\n", + " encoder channels \\\n", + "Group Experiment \n", + "joint_training/default /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 768 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 768 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 768 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 768 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 768 \n", + "\n", + " LR \\\n", + "Group Experiment \n", + "joint_training/default /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] \n", + "\n", + " Optimizer \\\n", + "Group Experiment \n", + "joint_training/default /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... {'class': 'adam', 'epsilon': 1e-08} \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... {'class': 'adam', 'epsilon': 1e-08} \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... {'class': 'adam', 'epsilon': 1e-08} \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 
{'class': 'adam', 'epsilon': 1e-08} \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... {'class': 'adam', 'epsilon': 1e-08} \n", + "\n", + " MLE \\\n", + "Group Experiment \n", + "joint_training/default /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... -0.753013 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... -0.610471 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... -0.758909 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... -0.676933 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... -0.676933 \n", + "\n", + " dev MLE \\\n", + "Group Experiment \n", + "joint_training/default /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... -0.694853 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... -0.555647 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... -0.694360 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... -0.621956 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... -0.621956 \n", + "\n", + " devtrain MLE \\\n", + "Group Experiment \n", + "joint_training/default /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... -0.771774 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... -0.629746 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... -0.778751 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... -0.692515 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... -0.692515 \n", + "\n", + " DP loss \\\n", + "Group Experiment \n", + "joint_training/default /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 0.417359 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 0.416831 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 0.400894 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 0.328548 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 0.328548 \n", + "\n", + " DP dev loss \\\n", + "Group Experiment \n", + "joint_training/default /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 
0.565613 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 0.583815 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 0.555330 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 0.477463 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 0.477463 \n", + "\n", + " DP devtrain loss \\\n", + "Group Experiment \n", + "joint_training/default /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 0.399888 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 0.403638 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 0.379635 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 0.302857 \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 0.302857 \n", + "\n", + " Joint \n", + "Group Experiment \n", + "joint_training/default /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... True \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... True \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... True \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... True \n", + " /u/lukas.rilling/experiments/glow_tts_asr_v2/ou... 
True " + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df[df[\"Joint\"]].head(5)" + ] + }, { "cell_type": "code", "execution_count": 14, @@ -1020,62 +1371,60 @@ "name": "stdout", "output_type": "stream", "text": [ - "| | Group | Experiment | sWER | autoMOS | autoMOS confidence | num_epochs | decoder dropout | mean only | encoder channels | LR | Optimizer | MLE | dev MLE | devtrain MLE | DP loss | DP dev loss | DP devtrain loss | Joint |\n", - "|---:|:----------------------------------|:------------------------------------------------------------------------------------------------------------------------|-------:|----------:|---------------------:|-------------:|------------------:|:------------|:-------------------|:-------------------------------------------------|:-------------------------------------|----------:|----------:|---------------:|----------:|--------------:|-------------------:|:--------|\n", - "| 0 | joint_training/conformer_coupling | joint_training/conformer_coupling/raw_audio/no_ddi/glowTTS_100ep_pe1/ | 12.4 | 2.94 | nan | 100 | 0.05 | False | 768 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | {'class': 'adam', 'epsilon': 1e-08} | -0.826781 | -0.846025 | | 0.382039 | 0.353906 | | False |\n", - "| 1 | joint_training/conformer_coupling | joint_training/conformer_coupling/raw_audio/no_ddi/glowTTS_ASR_conformer_two_forward_pass/ | 20.9 | 2.45 | nan | 250 | 0.05 | False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | 0 | 0 | 0 | 0 | 0 | 0 | False |\n", - "| 2 | joint_training/conformer_coupling | joint_training/conformer_coupling/raw_audio/no_ddi/glowTTS_ASR_conformer_x_vector_v2/ | 5.2 | 2.14 | nan | 250 | 0.05 | False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | 0 | 0 | 0 | 0 | 0 | 0 | False |\n", - "| 3 | joint_training/conformer_coupling | 
joint_training/conformer_coupling/raw_audio/ddi/glowTTS_100ep_pe1_radam1e-9/ | 14.7 | 2.59 | nan | 100 | 0.05 | False | 768 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.82573 | -0.844433 | | 0.384814 | 0.355806 | | False |\n", - "| 4 | joint_training/conformer_coupling | joint_training/conformer_coupling/raw_audio/ddi/glowTTS_100ep_pe1/ | 14.4 | 2.72 | nan | 100 | 0.05 | False | 768 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | {'class': 'adam', 'epsilon': 1e-08} | -0.825686 | -0.844577 | | 0.389056 | 0.360854 | | False |\n", - "| 5 | joint_training/conformer_coupling | joint_training/conformer_coupling/raw_audio/ddi/glowTTS/ | 97.7 | 1.61 | nan | 250 | 0.05 | False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | -0.824271 | -0.830326 | | 0.964433 | 0.725438 | | False |\n", - "| 7 | joint_training/conformer_coupling | joint_training/conformer_coupling/raw_audio/ddi/glowTTS_x_vector/ | 99.5 | 1.93 | nan | 250 | 0.05 | False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | -0.809305 | -0.755239 | -0.813211 | 1.00198 | 1.07779 | 1.04241 | False |\n", - "| 8 | joint_training/conformer_coupling | joint_training/conformer_coupling/raw_audio/ddi/glowTTS_ASR_conformer_x_vector/ | 5.2 | 2.14 | nan | 250 | 0.05 | False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | 0 | 0 | 0 | 0 | 0 | 0 | False |\n", - "| 23 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2/enc768/400ep/gin512/dec_drop_0.05/ | 12.9 | 3.33 | 0.02 | 400 | 0.05 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 400: 1e-06] | {'class': 'radam', 'epsilon': 1e-09} | -0.822881 | -0.834023 | -0.844881 | 0.376549 | 0.455059 | 0.340289 | False |\n", - "| 24 | joint_training/given_alignments | 
joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2/enc768/400ep/dec_drop_0.05/ | 12.9 | 3.33 | 0.02 | 400 | 0.05 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 400: 1e-06] | {'class': 'radam', 'epsilon': 1e-09} | 0 | 0 | 0 | 0 | 0 | 0 | False |\n", - "| 25 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2/enc768/200ep/dec_drop_0.0/ | 96.2 | 2.29 | nan | 200 | 0 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.813627 | -0.797961 | -0.803228 | 0.975631 | 0.866819 | 0.882062 | False |\n", - "| 26 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2/enc768/200ep/dec_drop_0.05/ | 14.6 | 3.26 | 0.02 | 200 | 0.05 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.809709 | -0.82178 | -0.830623 | 0.390625 | 0.4432 | 0.357604 | False |\n", - "| 27 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2/enc768/100ep/dec_drop_0.05/ | 16.3 | 3.18 | 0.02 | 100 | 0.05 | False | 768 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.795013 | -0.809868 | -0.816514 | 0.408781 | 0.440684 | 0.381301 | False |\n", - "| 28 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2/enc768/200ep_long_cooldown/dec_drop_0.0/ | 16.7 | 3.32 | nan | 200 | 0 | False | 768 | [0: 1e-06, 49: 0.0005, 50: 0.0005, 200: 1e-06] | {'class': 'radam', 'epsilon': 1e-09} | -0.846667 | -0.76151 | -0.845555 | 0.409992 | 0.468342 | 0.378645 | False |\n", - "| 29 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2/enc768/200ep_long_cooldown/dec_drop_0.05/ | 15.6 | 3.21 | nan | 200 | 0.05 | False | 768 | [0: 1e-06, 49: 0.0005, 50: 0.0005, 200: 1e-06] | 
{'class': 'radam', 'epsilon': 1e-09} | -0.80793 | -0.820998 | -0.829611 | 0.394024 | 0.448099 | 0.360531 | False |\n", - "| 30 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2/enc192/200ep/dec_drop_0.05/ | 13.3 | 3.3 | nan | 200 | 0.05 | False | 192 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.811426 | -0.824218 | -0.832125 | 0.375146 | 0.412309 | 0.342878 | False |\n", - "| 31 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2/enc192/200ep/dec_drop_0.0/ | 95.4 | 2.41 | nan | 200 | 0 | False | 192 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.841694 | -0.819818 | -0.83016 | 0.965626 | 1.15174 | 1.18063 | False |\n", - "| 32 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc192/200ep/dec_drop_0.05_epsilon_1e-8/ | 96.7 | 2.27 | nan | 200 | 0.05 | False | 192 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-08} | -0.808277 | -0.817505 | -0.821914 | 0.982278 | 0.814173 | 0.826286 | False |\n", - "| 33 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc192/200ep/dec_drop_0.05/ | 98.8 | 1.88 | nan | 200 | 0.05 | False | 192 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.809477 | -0.815064 | -0.819234 | 0.989373 | 0.615026 | 0.622701 | False |\n", - "| 34 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc192/200ep/dec_drop_0.0/grad_clip_10/ | 12.8 | 3.34 | nan | 200 | 0 | False | 192 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.852834 | -0.773976 | -0.851532 | 0.392373 | 0.431975 | 0.362269 | False |\n", - "| 35 | joint_training/given_alignments | 
joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc192/100ep/dec_drop_0.05/ | 98.1 | 1.52 | nan | 100 | 0.05 | False | 192 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.797353 | -0.804897 | -0.808365 | 0.989832 | 0.777391 | 0.784409 | False |\n", - "| 36 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc768/400ep/grad_clip_10/dec_drop_0.05/ | 14.4 | 3.29 | 0.02 | 400 | 0.05 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 400: 1e-06] | {'class': 'radam', 'epsilon': 1e-09} | 0 | 0 | 0 | 0 | 0 | 0 | False |\n", - "| 37 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc768/400ep/gin512/grad_clip_10/dec_drop_0.05/ | 14.4 | 3.29 | 0.02 | 400 | 0.05 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 400: 1e-06] | {'class': 'radam', 'epsilon': 1e-09} | -0.824197 | -0.835761 | -0.846551 | 0.372315 | 0.440626 | 0.33371 | False |\n", - "| 38 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc768/200ep/dec_drop_0.0/epsilon_1e-8/ | 95.9 | 3.06 | nan | 200 | 0 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-08} | -0.82712 | -0.750288 | -0.813842 | 0.979225 | 0.454561 | 0.462357 | False |\n", - "| 39 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc768/200ep/dec_drop_0.0/grad_clip_10/ | 14.5 | 3.31 | nan | 200 | 0 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.850649 | -0.838267 | -0.849902 | 0.396277 | 0.445801 | 0.364157 | False |\n", - "| 40 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc768/200ep/dec_drop_0.05_epsilon_1e-8/ | 15.7 | 3.35 | nan | 200 | 0.05 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 
1e-08} | -0.811807 | -0.826657 | -0.834532 | 0.386067 | 0.438248 | 0.351594 | False |\n", - "| 41 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc768/200ep/dec_drop_0.05/ | 15.1 | 3.36 | 0.02 | 200 | 0.05 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.812429 | -0.827038 | -0.834987 | 0.387377 | 0.441218 | 0.352695 | False |\n", - "| 42 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc768/100ep/dec_drop_0.00/ | 16.3 | 3.27 | nan | 100 | 0 | False | 768 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.831177 | -0.821651 | -0.830556 | 0.409946 | 0.437995 | 0.384226 | False |\n", - "| 43 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc768/100ep/dec_drop_0.05/ | 17.2 | 3.3 | nan | 100 | 0.05 | False | 768 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.798251 | -0.814911 | -0.821276 | 0.397457 | 0.432689 | 0.367326 | False |\n", - "| 44 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2_logdist_loss/ed_scale_0.1/ | 95.8 | 3.09 | nan | 200 | 0.05 | False | 192 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.782615 | -0.779794 | -0.782747 | 1.08489 | 2.0684 | 2.07562 | False |\n", - "| 45 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2_logdist_loss_grad_clip_10/ed_scale_1.0/ | 98.8 | 2.27 | nan | 200 | 0.05 | False | 192 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.743438 | -0.756143 | -0.757724 | 1.13249 | 1.54302 | 1.5535 | False |\n", - "| 46 | joint_training/given_alignments | 
joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2_logdist_loss_grad_clip_10/ed_scale_0.1/ | 95.7 | 3.21 | nan | 200 | 0.05 | False | 192 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.798058 | -0.786642 | -0.791474 | 1.07707 | 1.81449 | 1.83075 | False |\n", - "| 47 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS_simple_encoder/12cb/200ep/dec_drop_0.05/ | 70.3 | 3.47 | nan | 200 | 0.05 | False | - | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.80368 | -0.821293 | -0.825658 | 0.428016 | 0.498157 | 0.402439 | False |\n", - "| 48 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS_simple_encoder/20cb/200ep/dec_drop_0.05/ | 57.3 | 3.46 | nan | 200 | 0.05 | False | - | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.834191 | -0.847676 | -0.856235 | 0.402502 | 0.472223 | 0.376431 | False |\n", - "| 49 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2_conformer_coupling/enc768/200ep/dec_drop_0.05/ | 9.6 | 1.79 | 0.03 | 200 | 0.05 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.842848 | -0.852489 | -0.860671 | 0.370438 | 0.41724 | 0.344025 | False |\n", - "| 50 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2_multiscale/enc768/200ep/dec_drop_0.05/ | 100 | 1.25 | nan | 200 | 0.05 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.785697 | -0.800581 | -0.807986 | 0.533892 | 0.569833 | 0.522571 | False |\n", - "| 51 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS_batch_norm/enc768/200ep/dec_drop_0.05/ | 15.1 | 3.4 | nan | 200 | 0.05 | False | 768 
| [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.80533 | -0.817184 | -0.826384 | 0.389733 | 0.44642 | 0.357169 | False |\n", - "| 52 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector/enc768/100ep/dec_drop_0.05/ | 97.9 | 1.51 | nan | 100 | 0.05 | False | 768 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.781911 | -0.787168 | -0.790923 | 0.991812 | 1.38362 | 1.38305 | False |\n", - "| 77 | tts_architecture | tts_architecture/glow_tts/raw_audio/glowTTS/enc256/not_silence_preprocessed/ | 97.9 | 1.69 | nan | 200 | 0 | False | 256 | [0: 5e-05, 49: 0.0005, 50: 0.0005, 200: 1e-06] | {'class': 'adam', 'epsilon': 1e-09} | -0.851128 | -0.54391 | | 0.962189 | 0.501158 | | False |\n", - "| 78 | tts_architecture | tts_architecture/glow_tts/raw_audio/glowTTS/enc768/with_sigma/not_silence_preprocessed/ | 15.7 | 3.35 | nan | 200 | 0 | False | 768 | [0: 5e-05, 49: 0.0005, 50: 0.0005, 200: 1e-06] | {'class': 'adam', 'epsilon': 1e-09} | -0.851343 | -0.833649 | | 0.401905 | 0.450686 | | False |\n", - "| 79 | tts_architecture | tts_architecture/glow_tts/raw_audio/glowTTS/enc768/with_sigma/silence_preprocessed/ | 15.3 | 3.4 | nan | 200 | 0 | False | 768 | [0: 5e-05, 49: 0.0005, 50: 0.0005, 200: 1e-06] | {'class': 'adam', 'epsilon': 1e-09} | -0.736796 | -0.463881 | | 0.371603 | 0.409141 | | False |\n", - "| 80 | tts_architecture | tts_architecture/glow_tts/raw_audio/glowTTS/enc768/mean_only/silence_preprocessed/ | 16.9 | 3.52 | nan | 200 | 0 | True | 768 | [0: 5e-05, 49: 0.0005, 50: 0.0005, 200: 1e-06] | {'class': 'adam', 'epsilon': 1e-09} | -0.735579 | -0.714157 | | 0.366752 | 0.410529 | | False |\n", - "| 81 | tts_architecture | tts_architecture/glow_tts/raw_audio/glowTTS/enc768/mean_only/not_silence_preprocessed/ | 15.5 | 3.47 | nan | 200 | 0 | True | 768 | [0: 5e-05, 49: 0.0005, 50: 0.0005, 200: 1e-06] | {'class': 'adam', 'epsilon': 1e-09} 
| -0.85035 | -0.763903 | | 0.395024 | 0.448581 | | False |\n", - "| 82 | tts_architecture | tts_architecture/glow_tts/raw_audio/glowTTS/enc192/100ep/not_silence_preprocessed/ | 14.3 | 3.31 | nan | 100 | 0 | False | 192 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | {'class': 'adam', 'epsilon': 1e-09} | -0.8365 | -0.770533 | | 0.406652 | 0.423657 | | False |\n", - "| 83 | tts_architecture | tts_architecture/glow_tts/raw_audio/glowTTS/enc192/100ep/silence_preprocessed/ | 13.5 | 3.3 | nan | 100 | 0 | False | 192 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | {'class': 'adam', 'epsilon': 1e-09} | -0.819508 | -0.80574 | | 0.373986 | 0.385518 | | False |\n", - "| 84 | tts_architecture | tts_architecture/glow_tts/raw_audio/glowTTS/enc192/200ep/long_cooldown/not_silence_preprocessed/ | 14.7 | 3.29 | nan | 200 | 0 | False | 192 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 200: 1e-07] | {'class': 'adam', 'epsilon': 1e-08} | -0.839926 | -0.694879 | | 0.411271 | 0.432932 | | False |\n", - "| 85 | tts_architecture | tts_architecture/glow_tts/raw_audio/glowTTS/enc192/200ep/long_cooldown/silence_preprocessed/ | 13.5 | 3.28 | nan | 200 | 0 | False | 192 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 200: 1e-07] | {'class': 'adam', 'epsilon': 1e-08} | -0.82215 | -0.807288 | | 0.377117 | 0.390598 | | False |\n", - "| 86 | tts_architecture | tts_architecture/glow_tts/raw_audio/glowTTS_simple_encoder/silence_preprocessed/ | 56.4 | 3.33 | nan | 100 | 0 | - | - | [0: 0.0001, 49: 0.0005, 50: 0.0005, 100: 1e-06] | {'class': 'radam', 'epsilon': 1e-09} | -0.842638 | -0.822322 | | 0.384891 | 0.428803 | | False |\n", - "| 87 | tts_architecture | tts_architecture/glow_tts/raw_audio/glowTTS_ddi_actnorm/enc192/100ep/not_silence_preprocessed/LR_scheduled/ | 100 | 3.12 | nan | 100 | 0 | False | 192 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | {'class': 'adam', 'epsilon': 1e-09} | -0.837399 | -0.801737 | | 0.42113 | 0.441682 | | False |\n", - "| 88 | tts_architecture | 
tts_architecture/glow_tts/raw_audio/glowTTS_x_vector/enc768/100ep/not_silence_preprocessed/ | 20.9 | 3.31 | nan | 100 | 0 | False | 768 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | {'class': 'adam', 'epsilon': 1e-09} | -0.810979 | -0.801251 | | 0.459904 | 0.467786 | | False |\n", - "| 89 | tts_architecture | tts_architecture/glow_tts/raw_audio/glowTTS_x_vector/enc192/100ep/not_silence_preprocessed/ | 95.4 | 2.63 | nan | 100 | 0 | False | 192 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | {'class': 'adam', 'epsilon': 1e-09} | -0.805523 | -0.788462 | | 0.972006 | 1.27149 | | False |\n", - "| 90 | tts_architecture | tts_architecture/glow_tts/raw_audio/glowTTS_nar_taco_encoder/not_silence_preprocessed/ | 25.2 | 3.46 | nan | 200 | 0 | - | - | [0: 1e-05, 49: 0.0005, 50: 0.0005, 200: 1e-05] | {'class': 'adam', 'epsilon': 1e-09} | -0.844504 | -0.70768 | | 0.411171 | 0.455673 | | False |\n", - "| 91 | tts_architecture | tts_architecture/glow_tts/raw_audio/glowTTS_nar_taco_encoder/silence_preprocessed/ | 25.9 | 3.39 | nan | 200 | 0 | - | - | [0: 1e-05, 49: 0.0005, 50: 0.0005, 200: 1e-05] | {'class': 'adam', 'epsilon': 1e-09} | -0.829622 | -0.813688 | | 0.386848 | 0.423963 | | False |\n", - "| 92 | tts_architecture | tts_architecture/glow_tts/raw_audio/glowTTS_nar_taco_encoder_no_blstm/silence_preprocessed/ | 105.6 | 3.17 | nan | 100 | 0 | - | - | [0: 0.0001, 49: 0.0005, 50: 0.0001, 100: 1e-06] | {'class': 'radam', 'epsilon': 1e-09} | -0.799977 | -0.788486 | | 0.8847 | 0.873624 | | False |\n" + "| | Group | Experiment | sWER | autoMOS | autoMOS confidence | num_epochs | decoder dropout | mean only | encoder channels | LR | Optimizer | MLE | dev MLE | devtrain MLE | DP loss | DP dev loss | DP devtrain loss | Joint |\n", + 
"|----:|:----------------------------------|:------------------------------------------------------------------------------------------------------------------------|-------:|----------:|---------------------:|-------------:|------------------:|:------------|:-------------------|:-------------------------------------------------|:-------------------------------------|------:|----------:|---------------:|----------:|--------------:|-------------------:|:--------|\n", + "| 62 | joint_training/conformer_coupling | joint_training/conformer_coupling/raw_audio/ddi/glowTTS_100ep_pe1/ | 14.4 | 2.72 | nan | 100 | 0.05 | False | 768 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | {'class': 'adam', 'epsilon': 1e-08} | -0.83 | -0.84 | nan | 0.39 | 0.36 | nan | False |\n", + "| 63 | joint_training/conformer_coupling | joint_training/conformer_coupling/raw_audio/ddi/glowTTS/ | 97.7 | 1.61 | nan | 250 | 0.05 | False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | -0.82 | -0.83 | nan | 0.96 | 0.73 | nan | False |\n", + "| 64 | joint_training/conformer_coupling | joint_training/conformer_coupling/raw_audio/ddi/glowTTS_100ep_pe1_radam1e-9/ | 14.7 | 2.59 | nan | 100 | 0.05 | False | 768 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.83 | -0.84 | nan | 0.38 | 0.36 | nan | False |\n", + "| 65 | joint_training/conformer_coupling | joint_training/conformer_coupling/raw_audio/ddi/glowTTS_x_vector/ | 99.5 | 1.93 | nan | 250 | 0.05 | False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | -0.81 | -0.76 | -0.81 | 1 | 1.08 | 1.04 | False |\n", + "| 66 | joint_training/conformer_coupling | joint_training/conformer_coupling/raw_audio/no_ddi/glowTTS_100ep_pe1/ | 12.4 | 2.94 | nan | 100 | 0.05 | False | 768 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | {'class': 'adam', 'epsilon': 1e-08} | -0.83 | -0.85 | nan | 0.38 | 0.35 | nan | False |\n", + "| 
68 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2_multiscale/enc768/200ep/dec_drop_0.05/ | 100 | 1.25 | nan | 200 | 0.05 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.79 | -0.8 | -0.81 | 0.53 | 0.57 | 0.52 | False |\n", + "| 69 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2_conformer_coupling/enc768/200ep/dec_drop_0.05/ | 9.6 | 1.79 | 0.03 | 200 | 0.05 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.84 | -0.85 | -0.86 | 0.37 | 0.42 | 0.34 | False |\n", + "| 70 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2_logdist_loss/ed_scale_0.1/ | 95.8 | 3.09 | nan | 200 | 0.05 | False | 192 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.78 | -0.78 | -0.78 | 1.08 | 2.07 | 2.08 | False |\n", + "| 71 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS_batch_norm/enc768/200ep/dec_drop_0.05/ | 15.1 | 3.4 | nan | 200 | 0.05 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.81 | -0.82 | -0.83 | 0.39 | 0.45 | 0.36 | False |\n", + "| 72 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector/enc768/100ep/dec_drop_0.05/ | 97.9 | 1.51 | nan | 100 | 0.05 | False | 768 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.78 | -0.79 | -0.79 | 0.99 | 1.38 | 1.38 | False |\n", + "| 73 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc192/100ep/dec_drop_0.05/ | 98.1 | 1.52 | nan | 100 | 0.05 | False | 192 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.8 | -0.8 | 
-0.81 | 0.99 | 0.78 | 0.78 | False |\n", + "| 74 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc192/200ep/dec_drop_0.05/ | 98.8 | 1.88 | nan | 200 | 0.05 | False | 192 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.81 | -0.82 | -0.82 | 0.99 | 0.62 | 0.62 | False |\n", + "| 75 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc192/200ep/dec_drop_0.05_epsilon_1e-8/ | 96.7 | 2.27 | nan | 200 | 0.05 | False | 192 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-08} | -0.81 | -0.82 | -0.82 | 0.98 | 0.81 | 0.83 | False |\n", + "| 76 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc192/200ep/dec_drop_0.0/grad_clip_10/ | 12.8 | 3.34 | nan | 200 | 0 | False | 192 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.85 | -0.77 | -0.85 | 0.39 | 0.43 | 0.36 | False |\n", + "| 77 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc192/400ep/grad_clip_10/dec_drop_0.05/ | 11.7 | 3.47 | nan | 400 | 0.05 | False | 192 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 400: 1e-06] | {'class': 'radam', 'epsilon': 1e-09} | -0.82 | -0.84 | -0.85 | 0.37 | 0.42 | 0.33 | False |\n", + "| 78 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc768/100ep/dec_drop_0.00/ | 16.3 | 3.27 | nan | 100 | 0 | False | 768 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.83 | -0.82 | -0.83 | 0.41 | 0.44 | 0.38 | False |\n", + "| 79 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc768/100ep/dec_drop_0.05/ | 17.2 | 3.3 | nan | 100 | 0.05 | False | 768 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.8 | -0.81 | 
-0.82 | 0.4 | 0.43 | 0.37 | False |\n", + "| 80 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc768/200ep/dec_drop_0.05/ | 15.1 | 3.36 | 0.02 | 200 | 0.05 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.81 | -0.83 | -0.83 | 0.39 | 0.44 | 0.35 | False |\n", + "| 81 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc768/200ep/dec_drop_0.0/grad_clip_10/ | 14.5 | 3.31 | nan | 200 | 0 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.85 | -0.84 | -0.85 | 0.4 | 0.45 | 0.36 | False |\n", + "| 82 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc768/200ep/dec_drop_0.0/epsilon_1e-8/ | 95.9 | 3.06 | nan | 200 | 0 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-08} | -0.83 | -0.75 | -0.81 | 0.98 | 0.45 | 0.46 | False |\n", + "| 83 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc768/200ep/dec_drop_0.05_epsilon_1e-8/ | 15.7 | 3.35 | nan | 200 | 0.05 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-08} | -0.81 | -0.83 | -0.83 | 0.39 | 0.44 | 0.35 | False |\n", + "| 84 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc768/400ep/gin512/grad_clip_10/dec_drop_0.05/ | 14.4 | 3.29 | 0.02 | 400 | 0.05 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 400: 1e-06] | {'class': 'radam', 'epsilon': 1e-09} | -0.82 | -0.84 | -0.85 | 0.37 | 0.44 | 0.33 | False |\n", + "| 85 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS/enc768/400ep/grad_clip_10/dec_drop_0.05/ | 13.3 | 3.35 | 0.02 | 400 | 0.05 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 400: 1e-06] | {'class': 'radam', 
'epsilon': 1e-09} | -0.82 | -0.84 | -0.85 | 0.37 | 0.44 | 0.33 | False |\n", + "| 86 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2/enc768/200ep/dec_drop_0.0/ | 96.2 | 2.29 | nan | 200 | 0 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.81 | -0.8 | -0.8 | 0.98 | 0.87 | 0.88 | False |\n", + "| 87 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2/enc768/200ep/dec_drop_0.05/ | 14.6 | 3.26 | 0.02 | 200 | 0.05 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.81 | -0.82 | -0.83 | 0.39 | 0.44 | 0.36 | False |\n", + "| 88 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2/enc768/400ep/gin512/dec_drop_0.05/ | 12.9 | 3.33 | 0.02 | 400 | 0.05 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 400: 1e-06] | {'class': 'radam', 'epsilon': 1e-09} | -0.82 | -0.83 | -0.84 | 0.38 | 0.46 | 0.34 | False |\n", + "| 89 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2/enc768/400ep/dec_drop_0.05/ | 13.1 | 3.3 | 0.02 | 400 | 0.05 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 400: 1e-06] | {'class': 'radam', 'epsilon': 1e-09} | -0.82 | -0.83 | -0.84 | 0.38 | 0.46 | 0.34 | False |\n", + "| 90 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2/enc768/200ep_long_cooldown/dec_drop_0.05/ | 15.6 | 3.21 | nan | 200 | 0.05 | False | 768 | [0: 1e-06, 49: 0.0005, 50: 0.0005, 200: 1e-06] | {'class': 'radam', 'epsilon': 1e-09} | -0.81 | -0.82 | -0.83 | 0.39 | 0.45 | 0.36 | False |\n", + "| 91 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2/enc768/200ep_long_cooldown/dec_drop_0.0/ | 16.7 | 3.32 | nan | 200 | 0 | False | 768 | 
[0: 1e-06, 49: 0.0005, 50: 0.0005, 200: 1e-06] | {'class': 'radam', 'epsilon': 1e-09} | -0.85 | -0.76 | -0.85 | 0.41 | 0.47 | 0.38 | False |\n", + "| 92 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2/enc768/100ep/dec_drop_0.05/ | 16.3 | 3.18 | 0.02 | 100 | 0.05 | False | 768 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.8 | -0.81 | -0.82 | 0.41 | 0.44 | 0.38 | False |\n", + "| 93 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2/enc192/200ep/dec_drop_0.0/ | 95.4 | 2.41 | nan | 200 | 0 | False | 192 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.84 | -0.82 | -0.83 | 0.97 | 1.15 | 1.18 | False |\n", + "| 94 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2/enc192/200ep/dec_drop_0.05/ | 13.3 | 3.3 | nan | 200 | 0.05 | False | 192 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.81 | -0.82 | -0.83 | 0.38 | 0.41 | 0.34 | False |\n", + "| 95 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2_logdist_loss_grad_clip_10/ed_scale_0.1/ | 95.7 | 3.21 | nan | 200 | 0.05 | False | 192 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.8 | -0.79 | -0.79 | 1.08 | 1.81 | 1.83 | False |\n", + "| 96 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS_x_vector_v2_logdist_loss_grad_clip_10/ed_scale_1.0/ | 98.8 | 2.27 | nan | 200 | 0.05 | False | 192 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.74 | -0.76 | -0.76 | 1.13 | 1.54 | 1.55 | False |\n", + "| 97 | joint_training/given_alignments | 
joint_training/given_alignments/raw_audio/TTS_models/glowTTS_simple_encoder/20cb/200ep/dec_drop_0.05/ | 57.3 | 3.46 | nan | 200 | 0.05 | False | - | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.83 | -0.85 | -0.86 | 0.4 | 0.47 | 0.38 | False |\n", + "| 98 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/TTS_models/glowTTS_simple_encoder/12cb/200ep/dec_drop_0.05/ | 70.3 | 3.47 | nan | 200 | 0.05 | False | - | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.8 | -0.82 | -0.83 | 0.43 | 0.5 | 0.4 | False |\n", + "| 123 | tts_architecture | tts_architecture/glow_tts/raw_audio/glowTTS_simple_encoder/silence_preprocessed/ | 56.4 | 3.33 | nan | 100 | 0 | - | - | [0: 0.0001, 49: 0.0005, 50: 0.0005, 100: 1e-06] | {'class': 'radam', 'epsilon': 1e-09} | -0.84 | -0.82 | nan | 0.38 | 0.43 | nan | False |\n", + "| 124 | tts_architecture | tts_architecture/glow_tts/raw_audio/glowTTS/enc192/200ep/long_cooldown/not_silence_preprocessed/ | 14.7 | 3.29 | nan | 200 | 0 | False | 192 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 200: 1e-07] | {'class': 'adam', 'epsilon': 1e-08} | -0.84 | -0.69 | nan | 0.41 | 0.43 | nan | False |\n", + "| 125 | tts_architecture | tts_architecture/glow_tts/raw_audio/glowTTS/enc192/200ep/long_cooldown/silence_preprocessed/ | 13.5 | 3.28 | nan | 200 | 0 | False | 192 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 200: 1e-07] | {'class': 'adam', 'epsilon': 1e-08} | -0.82 | -0.81 | nan | 0.38 | 0.39 | nan | False |\n", + "| 126 | tts_architecture | tts_architecture/glow_tts/raw_audio/glowTTS/enc192/100ep/not_silence_preprocessed/ | 14.3 | 3.31 | nan | 100 | 0 | False | 192 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | {'class': 'adam', 'epsilon': 1e-09} | -0.84 | -0.77 | nan | 0.41 | 0.42 | nan | False |\n", + "| 127 | tts_architecture | tts_architecture/glow_tts/raw_audio/glowTTS/enc192/100ep/silence_preprocessed/ | 13.5 | 3.3 | nan | 100 | 0 | False | 192 
| [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | {'class': 'adam', 'epsilon': 1e-09} | -0.82 | -0.81 | nan | 0.37 | 0.39 | nan | False |\n", + "| 128 | tts_architecture | tts_architecture/glow_tts/raw_audio/glowTTS/enc768/mean_only/not_silence_preprocessed/ | 15.5 | 3.47 | nan | 200 | 0 | True | 768 | [0: 5e-05, 49: 0.0005, 50: 0.0005, 200: 1e-06] | {'class': 'adam', 'epsilon': 1e-09} | -0.85 | -0.76 | nan | 0.4 | 0.45 | nan | False |\n", + "| 129 | tts_architecture | tts_architecture/glow_tts/raw_audio/glowTTS/enc768/mean_only/silence_preprocessed/ | 16.9 | 3.52 | nan | 200 | 0 | True | 768 | [0: 5e-05, 49: 0.0005, 50: 0.0005, 200: 1e-06] | {'class': 'adam', 'epsilon': 1e-09} | -0.74 | -0.71 | nan | 0.37 | 0.41 | nan | False |\n", + "| 130 | tts_architecture | tts_architecture/glow_tts/raw_audio/glowTTS/enc768/with_sigma/silence_preprocessed/ | 15.3 | 3.4 | nan | 200 | 0 | False | 768 | [0: 5e-05, 49: 0.0005, 50: 0.0005, 200: 1e-06] | {'class': 'adam', 'epsilon': 1e-09} | -0.74 | -0.46 | nan | 0.37 | 0.41 | nan | False |\n", + "| 131 | tts_architecture | tts_architecture/glow_tts/raw_audio/glowTTS/enc768/with_sigma/not_silence_preprocessed/ | 15.7 | 3.35 | nan | 200 | 0 | False | 768 | [0: 5e-05, 49: 0.0005, 50: 0.0005, 200: 1e-06] | {'class': 'adam', 'epsilon': 1e-09} | -0.85 | -0.83 | nan | 0.4 | 0.45 | nan | False |\n", + "| 132 | tts_architecture | tts_architecture/glow_tts/raw_audio/glowTTS/enc256/not_silence_preprocessed/ | 97.9 | 1.69 | nan | 200 | 0 | False | 256 | [0: 5e-05, 49: 0.0005, 50: 0.0005, 200: 1e-06] | {'class': 'adam', 'epsilon': 1e-09} | -0.85 | -0.54 | nan | 0.96 | 0.5 | nan | False |\n", + "| 133 | tts_architecture | tts_architecture/glow_tts/raw_audio/glowTTS_nar_taco_encoder/silence_preprocessed/ | 25.9 | 3.39 | nan | 200 | 0 | - | - | [0: 1e-05, 49: 0.0005, 50: 0.0005, 200: 1e-05] | {'class': 'adam', 'epsilon': 1e-09} | -0.83 | -0.81 | nan | 0.39 | 0.42 | nan | False |\n", + "| 134 | tts_architecture | 
tts_architecture/glow_tts/raw_audio/glowTTS_nar_taco_encoder/not_silence_preprocessed/ | 25.2 | 3.46 | nan | 200 | 0 | - | - | [0: 1e-05, 49: 0.0005, 50: 0.0005, 200: 1e-05] | {'class': 'adam', 'epsilon': 1e-09} | -0.84 | -0.71 | nan | 0.41 | 0.46 | nan | False |\n", + "| 135 | tts_architecture | tts_architecture/glow_tts/raw_audio/glowTTS_nar_taco_encoder_no_blstm/silence_preprocessed/ | 105.6 | 3.17 | nan | 100 | 0 | - | - | [0: 0.0001, 49: 0.0005, 50: 0.0001, 100: 1e-06] | {'class': 'radam', 'epsilon': 1e-09} | -0.8 | -0.79 | nan | 0.88 | 0.87 | nan | False |\n", + "| 136 | tts_architecture | tts_architecture/glow_tts/raw_audio/glowTTS_x_vector/enc768/100ep/not_silence_preprocessed/ | 20.9 | 3.31 | nan | 100 | 0 | False | 768 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | {'class': 'adam', 'epsilon': 1e-09} | -0.81 | -0.8 | nan | 0.46 | 0.47 | nan | False |\n", + "| 137 | tts_architecture | tts_architecture/glow_tts/raw_audio/glowTTS_x_vector/enc192/100ep/not_silence_preprocessed/ | 95.4 | 2.63 | nan | 100 | 0 | False | 192 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | {'class': 'adam', 'epsilon': 1e-09} | -0.81 | -0.79 | nan | 0.97 | 1.27 | nan | False |\n", + "| 138 | tts_architecture | tts_architecture/glow_tts/raw_audio/glowTTS_ddi_actnorm/enc192/100ep/not_silence_preprocessed/LR_scheduled/ | 100 | 3.12 | nan | 100 | 0 | False | 192 | [0: 1e-05, 49: 0.0005, 50: 0.0005, 100: 1e-05] | {'class': 'adam', 'epsilon': 1e-09} | -0.84 | -0.8 | nan | 0.42 | 0.44 | nan | False |\n" ] } ], @@ -1092,47 +1441,95 @@ "name": "stdout", "output_type": "stream", "text": [ - "| | Group | Experiment | sWER | autoMOS | autoMOS confidence | num_epochs | decoder dropout | mean only | encoder channels | LR | Optimizer | MLE | dev MLE | devtrain MLE | DP loss | DP dev loss | DP devtrain loss | Joint |\n", - 
"|---:|:----------------------------------|:----------------------------------------------------------------------------------------------------------------------------------------------------|-------:|----------:|---------------------:|-------------:|------------------:|:------------|-------------------:|:-------------------------------------------------|:-------------------------------------|----------:|----------:|---------------:|----------:|--------------:|-------------------:|:--------|\n", - "| 6 | joint_training/conformer_coupling | joint_training/conformer_coupling/raw_audio/ddi/glowTTS_ASR_conformer_two_forward_pass/ | 95.7 | 1.82 | nan | 250 | 0.05 | False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | -0.657168 | | -0.670671 | 1.0046 | | 2.09134 | True |\n", - "| 9 | joint_training/default | joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_spec_augment_ctc_scale_0.1/ | 4.6 | 3.11 | 0.02 | 250 | 0.05 | False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | -0.74139 | -0.690156 | -0.759844 | 0.346902 | 0.481245 | 0.329289 | True |\n", - "| 10 | joint_training/default | joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass/ | 9.5 | 2.75 | 0.02 | 250 | 0.05 | False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | -0.610471 | -0.555647 | -0.629746 | 0.416831 | 0.583815 | 0.403638 | True |\n", - "| 11 | joint_training/default | joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2_ctc_scale_0.1/ | 11.2 | 3.18 | 0.02 | 250 | 0.05 | False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | -0.758909 | -0.69436 | -0.778751 | 0.400894 | 0.55533 | 0.379635 | True |\n", - "| 12 | joint_training/default | joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_ctc_scale_0.1/ | 13.3 | 
3.12 | 0.02 | 250 | 0.05 | False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | -0.753013 | -0.694853 | -0.771774 | 0.417359 | 0.565613 | 0.399888 | True |\n", - "| 13 | joint_training/default | joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_spec_augment/ | 5.2 | 2.3 | 0.02 | 250 | 0.05 | False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | -0.506816 | -0.458788 | -0.521236 | 0.245737 | 0.346906 | 0.225934 | True |\n", - "| 14 | joint_training/default | joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2/ | 9.9 | 2.64 | 0.02 | 250 | 0.05 | False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | -0.621646 | -0.565345 | -0.641523 | 0.402243 | 0.552866 | 0.379653 | True |\n", - "| 15 | joint_training/default | joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_ctc_scale_0.1/ | 7.9 | 3.19 | 0.02 | 250 | 0.05 | False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | -0.754171 | -0.704233 | -0.772961 | 0.369618 | 0.50542 | 0.346386 | True |\n", - "| 16 | joint_training/default | joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2/ | 6.7 | 2.48 | 0.02 | 250 | 0.05 | False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | -0.577228 | -0.531676 | -0.591349 | 0.349734 | 0.46387 | 0.326762 | True |\n", - "| 17 | joint_training/default | joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_ctc_scale_0.1/ | 7.6 | 3.15 | 0.02 | 250 | 0.05 | False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | -0.750811 | -0.700515 | -0.769183 | 0.380467 | 0.502884 | 0.36247 | True |\n", - "| 18 | joint_training/default | joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment_ctc_scale_0.1/ | 4.4 
| 3.2 | 0.02 | 250 | 0.05 | False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | -0.745622 | -0.693835 | -0.764738 | 0.330255 | 0.457476 | 0.306534 | True |\n", - "| 19 | joint_training/default | joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass/ | 14.8 | 2.61 | 0.03 | 250 | 0.05 | False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | -0.679734 | | -0.699266 | 0.399065 | | 0.370426 | True |\n", - "| 20 | joint_training/default | joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment/ | 6 | 2.32 | 0.02 | 250 | 0.05 | False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | -0.520932 | -0.470559 | -0.535346 | 0.229065 | 0.345538 | 0.203592 | True |\n", - "| 21 | joint_training/default | joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_ctc_scale_0.1/ | 15.2 | 3.16 | 0.02 | 250 | 0.05 | False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | -0.784652 | | -0.805268 | 0.383526 | | 0.355401 | True |\n", - "| 22 | joint_training/default | joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector/ | 98 | 1.58 | 0.01 | 250 | 0.05 | False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | -0.552728 | -0.497187 | -0.548408 | 1.01904 | 0.823625 | 0.701494 | True |\n", - "| 53 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector/specaug/ce_ls_0.1/ | 25.2 | 3.11 | nan | 250 | 0.05 | False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | -0.802836 | -0.81678 | -0.823897 | 0.434087 | 0.464465 | 0.407059 | True |\n", - "| 54 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector/specaug/ce_ls_1/ | 88.5 | 
1.87 | nan | 250 | 0.05 | False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | -0.72108 | -0.734634 | -0.74084 | 0.747725 | 0.789389 | 0.691204 | True |\n", - "| 55 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector/200ep/basic_init/no_specaug/tts_target_size/ce_ls_0.1/ | 98.1 | 1.82 | nan | 200 | 0.05 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.785492 | -0.790627 | -0.79625 | 0.890597 | 0.574966 | 0.575373 | True |\n", - "| 56 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector/200ep/tts_pretrained/no_specaug/tts_target_size/ce_ls_0.1/ | 32.9 | 2.77 | nan | 200 | 0.05 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.803716 | -0.816318 | -0.825113 | 0.439343 | 0.490279 | 0.409061 | True |\n", - "| 57 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector/no_specaug/ce_ls_1/ | 83.9 | 1.92 | nan | 250 | 0.05 | False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | -0.690569 | -0.702706 | -0.708801 | 0.760198 | 0.943391 | 0.716967 | True |\n", - "| 58 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector/no_specaug/ce_ls_0.1/ | 41.9 | 2.75 | nan | 250 | 0.05 | False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | -0.794999 | -0.80951 | -0.816001 | 0.505974 | 0.538897 | 0.493393 | True |\n", - "| 59 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_cnn_x_vector/tts_pretrained/no_specaug/tts_target_size/ce_ls_0.1/ | 14.1 | 3.2 | nan | 200 | 0.05 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 
1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.793062 | -0.805013 | -0.81407 | 0.39621 | 0.451179 | 0.36325 | True |\n", - "| 60 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_cnn_x_vector/basic_init/no_specaug/tts_target_size/ce_ls_0.1/ | 23.3 | 3.13 | nan | 200 | 0.05 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.77557 | -0.785888 | -0.794856 | 0.432057 | 0.483314 | 0.399083 | True |\n", - "| 61 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector_v2/mean_only/200ep/basic_init/no_specaug/tts_target_size/ce_ls_0.1/ | 96.4 | 2.37 | nan | 200 | 0.05 | True | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.803728 | -0.802102 | -0.808006 | 0.932394 | 0.721534 | 0.725609 | True |\n", - "| 62 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector_v2/200ep/tts_pretrained/no_specaug/tts_target_size/ce_ls_0.1/ | 91.7 | 2.66 | nan | 200 | 0.05 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.805783 | -0.821988 | -0.827055 | 0.625962 | 0.62989 | 0.592278 | True |\n", - "| 63 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector_v2/200ep/basic_init/no_specaug/tts_target_size/ce_ls_1.0/ | 97.2 | 2.12 | nan | 200 | 0.05 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.801181 | -0.803144 | -0.808404 | 0.834689 | 0.611662 | 0.619833 | True |\n", - "| 64 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector_v2/200ep/basic_init/no_specaug/tts_target_size/ce_ls_0.1/ | 99.9 | 1.56 | nan | 200 | 0.05 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 
1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.799504 | -0.772613 | -0.798104 | 0.818331 | 0.531823 | 0.536853 | True |\n", - "| 65 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_cnn_x_vector_v2/tts_pretrained/ce_ls_0.1/ | 99.7 | 1.83 | nan | 200 | 0.05 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.790621 | -0.805995 | -0.811551 | 0.0523576 | 0.0489216 | 0.0415039 | True |\n", - "| 66 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_cnn_x_vector_v2/tts_pretrained/ce_ls_0.01/ | 97.2 | 1.97 | nan | 200 | 0.05 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.797074 | -0.812303 | -0.818065 | 0.0631472 | 0.0577708 | 0.0508676 | True |\n", - "| 67 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_cnn_x_vector_v2/basic_init/ce_ls_0.1/ | 99.1 | 1.64 | nan | 200 | 0.05 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.782616 | -0.797878 | -0.803697 | 0.0519058 | 0.0493118 | 0.0413708 | True |\n", - "| 68 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_cnn_x_vector_v2/basic_init/ce_ls_0.01/ | 98.2 | 2.18 | nan | 200 | 0.05 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.787278 | -0.802672 | -0.808277 | 0.0638544 | 0.0582362 | 0.0520775 | True |\n", - "| 69 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_ffn_x_vector_v2/200ep/tts_pretrained/ce_ls_1.0/ | 100 | 1.97 | nan | 200 | 0.05 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | 
{'class': 'radam', 'epsilon': 1e-09} | -0.72889 | -0.745734 | -0.75171 | 0.075977 | 0.0820037 | 0.0666421 | True |\n", - "| 70 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_ffn_x_vector_v2/200ep/tts_pretrained/ce_ls_0.1/ | 100 | 1.98 | nan | 200 | 0.05 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.760167 | -0.775606 | -0.781364 | 0.075069 | 0.0806164 | 0.0651269 | True |\n", - "| 71 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_ffn_x_vector_v2/200ep/basic_init/ce_ls_1.0/ | 100 | 1.98 | nan | 200 | 0.05 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.729906 | -0.74441 | -0.750384 | 0.0698329 | 0.0841703 | 0.0607633 | True |\n", - "| 72 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_ffn_x_vector_v2/200ep/basic_init/ce_ls_0.1/ | 100 | 1.98 | nan | 200 | 0.05 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.757141 | -0.771994 | -0.778037 | 0.071102 | 0.0818373 | 0.0614895 | True |\n", - "| 73 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_ffn_x_vector/200ep/basic_init/ce_ls_0.1/ | 97.9 | 1.78 | nan | 200 | 0.05 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.779347 | -0.79411 | -0.800602 | 0.255822 | 0.238816 | 0.246318 | True |\n", - "| 74 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_ffn_x_vector/200ep/tts_pretrained/ce_ls_0.1/ | 95.4 | 2.53 | nan | 200 | 0.05 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 
'radam', 'epsilon': 1e-09} | -0.792338 | -0.806682 | -0.812767 | 0.247139 | 0.215995 | 0.232498 | True |\n", - "| 75 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_cnn_x_vector/tts_pretrained/ce_ls_0.1/ | 95.2 | 2.18 | nan | 200 | 0.05 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.775121 | -0.788423 | -0.794789 | 0.263264 | 0.235715 | 0.253516 | True |\n", - "| 76 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_cnn_x_vector/basic_init/ce_ls_0.1/ | 96.1 | 2.35 | nan | 200 | 0.05 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.76667 | -0.781423 | -0.785757 | 0.265068 | 0.264766 | 0.261945 | True |\n" + "| | Group | Experiment | sWER | autoMOS | autoMOS confidence | num_epochs | decoder dropout | mean only | encoder channels | LR | Optimizer | MLE | dev MLE | devtrain MLE | DP loss | DP dev loss | DP devtrain loss | Joint |\n", + "|----:|:----------------------------------|:----------------------------------------------------------------------------------------------------------------------------------------------------|-------:|----------:|---------------------:|-------------:|------------------:|:------------|-------------------:|:-------------------------------------------------|:-------------------------------------|------:|----------:|---------------:|----------:|--------------:|-------------------:|:--------|\n", + "| 0 | joint_training/default | joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_ctc_scale_0.1/ | 13.3 | 3.12 | 0.02 | 250 | 0.05 | False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | -0.75 | -0.69 | -0.77 | 0.42 | 0.57 | 0.4 | True |\n", + "| 1 | joint_training/default | 
joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass/ | 9.5 | 2.75 | 0.02 | 250 | 0.05 | False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | -0.61 | -0.56 | -0.63 | 0.42 | 0.58 | 0.4 | True |\n", + "| 2 | joint_training/default | joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2_ctc_scale_0.1/ | 11.2 | 3.18 | 0.02 | 250 | 0.05 | False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | -0.76 | -0.69 | -0.78 | 0.4 | 0.56 | 0.38 | True |\n", + "| 3 | joint_training/default | joint_training/default/raw_audio/glowTTS_ASR_blstm_x_vector/tuning/lm_4.5/ | 16 | 3.11 | 0.02 | 250 | 0.05 | False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | -0.68 | -0.62 | -0.69 | 0.33 | 0.48 | 0.3 | True |\n", + "| 4 | joint_training/default | joint_training/default/raw_audio/glowTTS_ASR_blstm_x_vector/tuning/lm_3.0/ | 16 | 3.11 | 0.02 | 250 | 0.05 | False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | -0.68 | -0.62 | -0.69 | 0.33 | 0.48 | 0.3 | True |\n", + "| 5 | joint_training/default | joint_training/default/raw_audio/glowTTS_ASR_blstm_x_vector/tuning/lm_2.0/ | 16 | 3.11 | 0.02 | 250 | 0.05 | False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | -0.68 | -0.62 | -0.69 | 0.33 | 0.48 | 0.3 | True |\n", + "| 6 | joint_training/default | joint_training/default/raw_audio/glowTTS_ASR_blstm_x_vector/tuning/lm_3.5/ | 16 | 3.11 | 0.02 | 250 | 0.05 | False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | -0.68 | -0.62 | -0.69 | 0.33 | 0.48 | 0.3 | True |\n", + "| 7 | joint_training/default | joint_training/default/raw_audio/glowTTS_ASR_blstm_x_vector/tuning/lm_2.5/ | 16 | 3.11 | 0.02 | 250 | 0.05 | False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 
250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | -0.68 | -0.62 | -0.69 | 0.33 | 0.48 | 0.3 | True |\n", + "| 8 | joint_training/default | joint_training/default/raw_audio/glowTTS_ASR_blstm_x_vector/tuning/lm_4.0/ | 16 | 3.11 | 0.02 | 250 | 0.05 | False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | -0.68 | -0.62 | -0.69 | 0.33 | 0.48 | 0.3 | True |\n", + "| 9 | joint_training/default | joint_training/default/raw_audio/glowTTS_ASR_blstm_x_vector/ | 16 | 3.11 | 0.02 | 250 | 0.05 | False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | -0.68 | -0.62 | -0.69 | 0.33 | 0.48 | 0.3 | True |\n", + "| 10 | joint_training/default | joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_spec_augment_ctc_scale_0.1/ | 4.6 | 3.11 | 0.02 | 250 | 0.05 | False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | -0.74 | -0.69 | -0.76 | 0.35 | 0.48 | 0.33 | True |\n", + "| 11 | joint_training/default | joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment_ctc_scale_0.1/ | 4.4 | 3.2 | 0.02 | 250 | 0.05 | False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | -0.75 | -0.69 | -0.76 | 0.33 | 0.46 | 0.31 | True |\n", + "| 12 | joint_training/default | joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass/ | 14.8 | 2.61 | 0.03 | 250 | 0.05 | False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | -0.68 | nan | -0.7 | 0.4 | nan | 0.37 | True |\n", + "| 13 | joint_training/default | joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_two_forward_pass_v2/ | 9.9 | 2.64 | 0.02 | 250 | 0.05 | False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | -0.62 | -0.57 | -0.64 | 0.4 | 0.55 | 0.38 | True |\n", + "| 14 | joint_training/default | 
joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_ctc_scale_0.1/ | 7.6 | 3.15 | 0.02 | 250 | 0.05 | False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | -0.75 | -0.7 | -0.77 | 0.38 | 0.5 | 0.36 | True |\n", + "| 15 | joint_training/default | joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2/ | 6.7 | 2.48 | 0.02 | 250 | 0.05 | False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | -0.58 | -0.53 | -0.59 | 0.35 | 0.46 | 0.33 | True |\n", + "| 16 | joint_training/default | joint_training/default/raw_audio/glowTTS_ASR_blstm_x_vector_specaug/ | 6.3 | 3.01 | 0.02 | 250 | 0.05 | False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | -0.62 | -0.56 | -0.64 | 0.23 | 0.38 | 0.2 | True |\n", + "| 17 | joint_training/default | joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_strong_conformer_weak_specaug_ctc_scale_0.1/ | 16.1 | 3.22 | 0.02 | 250 | 0.05 | False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | -0.79 | nan | -0.81 | 0.39 | nan | 0.37 | True |\n", + "| 18 | joint_training/default | joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_strong_conformer_weak_specaug_ctc_scale_0.1/tuned/ | 16.1 | 3.22 | 0.02 | 250 | 0.05 | False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | -0.79 | nan | -0.81 | 0.39 | nan | 0.37 | True |\n", + "| 19 | joint_training/default | joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_strong_conformer_weak_specaug_ctc_scale_0.1/tuning/lm_3.5_ps_0.5/ | 16.1 | 3.22 | 0.02 | 250 | 0.05 | False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | -0.79 | nan | -0.81 | 0.39 | nan | 0.37 | True |\n", + "| 20 | joint_training/default | 
joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_strong_conformer_weak_specaug_ctc_scale_0.1/tuning/lm_3.0_ps_0.3/ | 16.1 | 3.22 | 0.02 | 250 | 0.05 | False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | -0.79 | nan | -0.81 | 0.39 | nan | 0.37 | True |\n", + "| 21 | joint_training/default | joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_strong_conformer_weak_specaug_ctc_scale_0.1/tuning/lm_3.5_ps_0/ | 16.1 | 3.22 | 0.02 | 250 | 0.05 | False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | -0.79 | nan | -0.81 | 0.39 | nan | 0.37 | True |\n", + "| 22 | joint_training/default | joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_strong_conformer_weak_specaug_ctc_scale_0.1/tuning/lm_4.5_ps_0.5/ | 16.1 | 3.22 | 0.02 | 250 | 0.05 | False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | -0.79 | nan | -0.81 | 0.39 | nan | 0.37 | True |\n", + "| 23 | joint_training/default | joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_strong_conformer_weak_specaug_ctc_scale_0.1/tuning/lm_4.0_ps_0.3/ | 16.1 | 3.22 | 0.02 | 250 | 0.05 | False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | -0.79 | nan | -0.81 | 0.39 | nan | 0.37 | True |\n", + "| 24 | joint_training/default | joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_strong_conformer_weak_specaug_ctc_scale_0.1/tuning/lm_2.0_ps_0.3/ | 16.1 | 3.22 | 0.02 | 250 | 0.05 | False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | -0.79 | nan | -0.81 | 0.39 | nan | 0.37 | True |\n", + "| 25 | joint_training/default | joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_strong_conformer_weak_specaug_ctc_scale_0.1/tuning/lm_3.0_ps_0/ | 16.1 | 3.22 | 0.02 | 250 | 0.05 | 
False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | -0.79 | nan | -0.81 | 0.39 | nan | 0.37 | True |\n", + "| 26 | joint_training/default | joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_strong_conformer_weak_specaug_ctc_scale_0.1/tuning/lm_2.5_ps_0.5/ | 16.1 | 3.22 | 0.02 | 250 | 0.05 | False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | -0.79 | nan | -0.81 | 0.39 | nan | 0.37 | True |\n", + "| 27 | joint_training/default | joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_strong_conformer_weak_specaug_ctc_scale_0.1/tuning/lm_3.5_ps_0.3/ | 16.1 | 3.22 | 0.02 | 250 | 0.05 | False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | -0.79 | nan | -0.81 | 0.39 | nan | 0.37 | True |\n", + "| 28 | joint_training/default | joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_strong_conformer_weak_specaug_ctc_scale_0.1/tuning/lm_2.0_ps_0/ | 16.1 | 3.22 | 0.02 | 250 | 0.05 | False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | -0.79 | nan | -0.81 | 0.39 | nan | 0.37 | True |\n", + "| 29 | joint_training/default | joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_strong_conformer_weak_specaug_ctc_scale_0.1/tuning/lm_4.5_ps_0/ | 16.1 | 3.22 | 0.02 | 250 | 0.05 | False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | -0.79 | nan | -0.81 | 0.39 | nan | 0.37 | True |\n", + "| 30 | joint_training/default | joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_strong_conformer_weak_specaug_ctc_scale_0.1/tuning/lm_3.0_ps_0.5/ | 16.1 | 3.22 | 0.02 | 250 | 0.05 | False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | -0.79 | nan | -0.81 | 0.39 | nan | 0.37 | True |\n", + "| 31 | 
joint_training/default | joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_strong_conformer_weak_specaug_ctc_scale_0.1/tuning/lm_4.5_ps_0.3/ | 16.1 | 3.22 | 0.02 | 250 | 0.05 | False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | -0.79 | nan | -0.81 | 0.39 | nan | 0.37 | True |\n", + "| 32 | joint_training/default | joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_strong_conformer_weak_specaug_ctc_scale_0.1/tuning/lm_4.0_ps_0.5/ | 16.1 | 3.22 | 0.02 | 250 | 0.05 | False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | -0.79 | nan | -0.81 | 0.39 | nan | 0.37 | True |\n", + "| 33 | joint_training/default | joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_strong_conformer_weak_specaug_ctc_scale_0.1/tuning/lm_2.0_ps_0.5/ | 16.1 | 3.22 | 0.02 | 250 | 0.05 | False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | -0.79 | nan | -0.81 | 0.39 | nan | 0.37 | True |\n", + "| 34 | joint_training/default | joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_strong_conformer_weak_specaug_ctc_scale_0.1/tuning/lm_2.5_ps_0.3/ | 16.1 | 3.22 | 0.02 | 250 | 0.05 | False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | -0.79 | nan | -0.81 | 0.39 | nan | 0.37 | True |\n", + "| 35 | joint_training/default | joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_strong_conformer_weak_specaug_ctc_scale_0.1/tuning/lm_4.0_ps_0/ | 16.1 | 3.22 | 0.02 | 250 | 0.05 | False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | -0.79 | nan | -0.81 | 0.39 | nan | 0.37 | True |\n", + "| 36 | joint_training/default | joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_strong_conformer_weak_specaug_ctc_scale_0.1/tuning/lm_2.5_ps_0/ | 16.1 | 3.22 | 
0.02 | 250 | 0.05 | False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | -0.79 | nan | -0.81 | 0.39 | nan | 0.37 | True |\n", + "| 37 | joint_training/default | joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_ctc_scale_0.1/ | 15.2 | 3.16 | 0.02 | 250 | 0.05 | False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | -0.78 | nan | -0.81 | 0.38 | nan | 0.36 | True |\n", + "| 38 | joint_training/default | joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_ctc_scale_0.1/ | 7.9 | 3.19 | 0.02 | 250 | 0.05 | False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | -0.75 | -0.7 | -0.77 | 0.37 | 0.51 | 0.35 | True |\n", + "| 39 | joint_training/default | joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector/ | 98 | 1.58 | 0.01 | 250 | 0.05 | False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | -0.55 | -0.5 | -0.55 | 1.02 | 0.82 | 0.7 | True |\n", + "| 40 | joint_training/default | joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment/ | 6 | 2.32 | 0.02 | 250 | 0.05 | False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | -0.52 | -0.47 | -0.54 | 0.23 | 0.35 | 0.2 | True |\n", + "| 41 | joint_training/default | joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_strong_conformer_weak_specaug_ctc_scale_1.0/tuning/lm_3.5_ps_0/ | 15.8 | 3.01 | 0.03 | 250 | 0.05 | False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | -0.74 | nan | -0.76 | 0.39 | nan | 0.36 | True |\n", + "| 42 | joint_training/default | joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_strong_conformer_weak_specaug_ctc_scale_1.0/tuning/lm_3.0_ps_0.5/ | 15.8 | 3.01 | 0.03 | 250 | 0.05 | False | 768 | [0: 7e-06, 109: 
0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | -0.74 | nan | -0.76 | 0.39 | nan | 0.36 | True |\n", + "| 43 | joint_training/default | joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_strong_conformer_weak_specaug_ctc_scale_1.0/tuning/lm_3.5_ps_0.3/ | 15.8 | 3.01 | 0.03 | 250 | 0.05 | False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | -0.74 | nan | -0.76 | 0.39 | nan | 0.36 | True |\n", + "| 44 | joint_training/default | joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_strong_conformer_weak_specaug_ctc_scale_1.0/tuning/lm_4.0_ps_0.5/ | 15.8 | 3.01 | 0.03 | 250 | 0.05 | False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | -0.74 | nan | -0.76 | 0.39 | nan | 0.36 | True |\n", + "| 45 | joint_training/default | joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_strong_conformer_weak_specaug_ctc_scale_1.0/tuning/lm_4.5_ps_0.3/ | 15.8 | 3.01 | 0.03 | 250 | 0.05 | False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | -0.74 | nan | -0.76 | 0.39 | nan | 0.36 | True |\n", + "| 46 | joint_training/default | joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_strong_conformer_weak_specaug_ctc_scale_1.0/tuning/lm_2.5_ps_0.3/ | 15.8 | 3.01 | 0.03 | 250 | 0.05 | False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | -0.74 | nan | -0.76 | 0.39 | nan | 0.36 | True |\n", + "| 47 | joint_training/default | joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_strong_conformer_weak_specaug_ctc_scale_1.0/tuning/lm_2.0_ps_0.5/ | 15.8 | 3.01 | 0.03 | 250 | 0.05 | False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | -0.74 | nan | -0.76 | 0.39 | nan | 0.36 | True |\n", + "| 48 | joint_training/default | 
joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_strong_conformer_weak_specaug_ctc_scale_1.0/tuning/lm_3.0_ps_0/ | 15.8 | 3.01 | 0.03 | 250 | 0.05 | False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | -0.74 | nan | -0.76 | 0.39 | nan | 0.36 | True |\n", + "| 49 | joint_training/default | joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_strong_conformer_weak_specaug_ctc_scale_1.0/tuning/lm_3.0_ps_0.3/ | 15.8 | 3.01 | 0.03 | 250 | 0.05 | False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | -0.74 | nan | -0.76 | 0.39 | nan | 0.36 | True |\n", + "| 50 | joint_training/default | joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_strong_conformer_weak_specaug_ctc_scale_1.0/tuning/lm_3.5_ps_0.5/ | 15.8 | 3.01 | 0.03 | 250 | 0.05 | False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | -0.74 | nan | -0.76 | 0.39 | nan | 0.36 | True |\n", + "| 51 | joint_training/default | joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_strong_conformer_weak_specaug_ctc_scale_1.0/tuning/lm_2.0_ps_0/ | 15.8 | 3.01 | 0.03 | 250 | 0.05 | False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | -0.74 | nan | -0.76 | 0.39 | nan | 0.36 | True |\n", + "| 52 | joint_training/default | joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_strong_conformer_weak_specaug_ctc_scale_1.0/tuning/lm_4.0_ps_0.3/ | 15.8 | 3.01 | 0.03 | 250 | 0.05 | False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | -0.74 | nan | -0.76 | 0.39 | nan | 0.36 | True |\n", + "| 53 | joint_training/default | joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_strong_conformer_weak_specaug_ctc_scale_1.0/tuning/lm_4.5_ps_0/ | 15.8 | 3.01 | 0.03 | 250 | 0.05 | False 
| 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | -0.74 | nan | -0.76 | 0.39 | nan | 0.36 | True |\n", + "| 54 | joint_training/default | joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_strong_conformer_weak_specaug_ctc_scale_1.0/tuning/lm_4.5_ps_0.5/ | 15.8 | 3.01 | 0.03 | 250 | 0.05 | False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | -0.74 | nan | -0.76 | 0.39 | nan | 0.36 | True |\n", + "| 55 | joint_training/default | joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_strong_conformer_weak_specaug_ctc_scale_1.0/tuning/lm_4.0_ps_0/ | 15.8 | 3.01 | 0.03 | 250 | 0.05 | False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | -0.74 | nan | -0.76 | 0.39 | nan | 0.36 | True |\n", + "| 56 | joint_training/default | joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_strong_conformer_weak_specaug_ctc_scale_1.0/tuning/lm_2.5_ps_0.5/ | 15.8 | 3.01 | 0.03 | 250 | 0.05 | False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | -0.74 | nan | -0.76 | 0.39 | nan | 0.36 | True |\n", + "| 57 | joint_training/default | joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_strong_conformer_weak_specaug_ctc_scale_1.0/tuning/lm_2.0_ps_0.3/ | 15.8 | 3.01 | 0.03 | 250 | 0.05 | False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | -0.74 | nan | -0.76 | 0.39 | nan | 0.36 | True |\n", + "| 58 | joint_training/default | joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_strong_conformer_weak_specaug_ctc_scale_1.0/tuning/lm_2.5_ps_0/ | 15.8 | 3.01 | 0.03 | 250 | 0.05 | False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | -0.74 | nan | -0.76 | 0.39 | nan | 0.36 | True |\n", + "| 59 | 
joint_training/default | joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_strong_conformer_weak_specaug_ctc_scale_1.0/tuned/ | 15.8 | 3.01 | 0.03 | 250 | 0.05 | False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | -0.74 | nan | -0.76 | 0.39 | nan | 0.36 | True |\n", + "| 60 | joint_training/default | joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_strong_conformer_weak_specaug_ctc_scale_1.0/ | 15.8 | 3.01 | 0.03 | 250 | 0.05 | False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | -0.74 | nan | -0.76 | 0.39 | nan | 0.36 | True |\n", + "| 61 | joint_training/default | joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_spec_augment/ | 5.2 | 2.3 | 0.02 | 250 | 0.05 | False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | -0.51 | -0.46 | -0.52 | 0.25 | 0.35 | 0.23 | True |\n", + "| 67 | joint_training/conformer_coupling | joint_training/conformer_coupling/raw_audio/no_ddi/glowTTS_ASR_conformer_x_vector_v2/ | 4.2 | 2.05 | nan | 250 | 0.05 | False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | -0.79 | -0.74 | -0.8 | 0.33 | 0.46 | 0.31 | True |\n", + "| 99 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_ffn_x_vector_v2/200ep/tts_pretrained/ce_ls_0.1/ | 100 | 1.98 | nan | 200 | 0.05 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.76 | -0.78 | -0.78 | 0.08 | 0.08 | 0.07 | True |\n", + "| 100 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_ffn_x_vector_v2/200ep/tts_pretrained/ce_ls_1.0/ | 100 | 1.97 | nan | 200 | 0.05 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 
'epsilon': 1e-09} | -0.73 | -0.75 | -0.75 | 0.08 | 0.08 | 0.07 | True |\n", + "| 101 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_ffn_x_vector_v2/200ep/basic_init/ce_ls_0.1/ | 100 | 1.98 | nan | 200 | 0.05 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.76 | -0.77 | -0.78 | 0.07 | 0.08 | 0.06 | True |\n", + "| 102 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_ffn_x_vector_v2/200ep/basic_init/ce_ls_1.0/ | 100 | 1.98 | nan | 200 | 0.05 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.73 | -0.74 | -0.75 | 0.07 | 0.08 | 0.06 | True |\n", + "| 103 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_cnn_x_vector_v2/tts_pretrained/ce_ls_0.01/ | 97.2 | 1.97 | nan | 200 | 0.05 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.8 | -0.81 | -0.82 | 0.06 | 0.06 | 0.05 | True |\n", + "| 104 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_cnn_x_vector_v2/tts_pretrained/ce_ls_0.1/ | 99.7 | 1.83 | nan | 200 | 0.05 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.79 | -0.81 | -0.81 | 0.05 | 0.05 | 0.04 | True |\n", + "| 105 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_cnn_x_vector_v2/basic_init/ce_ls_0.01/ | 98.2 | 2.18 | nan | 200 | 0.05 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.79 | -0.8 | -0.81 | 0.06 | 0.06 | 0.05 | True |\n", + "| 106 | joint_training/given_alignments | 
joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_cnn_x_vector_v2/basic_init/ce_ls_0.1/ | 99.1 | 1.64 | nan | 200 | 0.05 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.78 | -0.8 | -0.8 | 0.05 | 0.05 | 0.04 | True |\n", + "| 107 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_ffn_x_vector/200ep/tts_pretrained/ce_ls_0.1/ | 95.4 | 2.53 | nan | 200 | 0.05 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.79 | -0.81 | -0.81 | 0.25 | 0.22 | 0.23 | True |\n", + "| 108 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_ffn_x_vector/200ep/basic_init/ce_ls_0.1/ | 97.9 | 1.78 | nan | 200 | 0.05 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.78 | -0.79 | -0.8 | 0.26 | 0.24 | 0.25 | True |\n", + "| 109 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_cnn_x_vector/tts_pretrained/ce_ls_0.1/ | 95.2 | 2.18 | nan | 200 | 0.05 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.78 | -0.79 | -0.79 | 0.26 | 0.24 | 0.25 | True |\n", + "| 110 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/joint_models/flow_given_alignment/ga_glowTTS_ASR_cnn_x_vector/basic_init/ce_ls_0.1/ | 96.1 | 2.35 | nan | 200 | 0.05 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.77 | -0.78 | -0.79 | 0.27 | 0.26 | 0.26 | True |\n", + "| 111 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_cnn_x_vector/tts_pretrained/no_specaug/tts_target_size/ce_ls_0.1/ | 14.1 | 3.2 | nan | 
200 | 0.05 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.79 | -0.81 | -0.81 | 0.4 | 0.45 | 0.36 | True |\n", + "| 112 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_cnn_x_vector/basic_init/no_specaug/tts_target_size/ce_ls_0.1/ | 23.3 | 3.13 | nan | 200 | 0.05 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.78 | -0.79 | -0.79 | 0.43 | 0.48 | 0.4 | True |\n", + "| 113 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector_v2/mean_only/200ep/basic_init/no_specaug/tts_target_size/ce_ls_0.1/ | 96.4 | 2.37 | nan | 200 | 0.05 | True | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.8 | -0.8 | -0.81 | 0.93 | 0.72 | 0.73 | True |\n", + "| 114 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector_v2/200ep/tts_pretrained/no_specaug/tts_target_size/ce_ls_0.1/ | 91.7 | 2.66 | nan | 200 | 0.05 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.81 | -0.82 | -0.83 | 0.63 | 0.63 | 0.59 | True |\n", + "| 115 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector_v2/200ep/basic_init/no_specaug/tts_target_size/ce_ls_0.1/ | 99.9 | 1.56 | nan | 200 | 0.05 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.8 | -0.77 | -0.8 | 0.82 | 0.53 | 0.54 | True |\n", + "| 116 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector_v2/200ep/basic_init/no_specaug/tts_target_size/ce_ls_1.0/ | 97.2 | 2.12 | nan | 200 | 0.05 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | 
-0.8 | -0.8 | -0.81 | 0.83 | 0.61 | 0.62 | True |\n", + "| 117 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector/200ep/basic_init/no_specaug/tts_target_size/ce_ls_0.1/ | 98.1 | 1.82 | nan | 200 | 0.05 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.79 | -0.79 | -0.8 | 0.89 | 0.57 | 0.58 | True |\n", + "| 118 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector/200ep/tts_pretrained/no_specaug/tts_target_size/ce_ls_0.1/ | 32.9 | 2.77 | nan | 200 | 0.05 | False | 768 | [0: 1e-05, 99: 0.0005, 100: 0.0005, 200: 1e-05] | {'class': 'radam', 'epsilon': 1e-09} | -0.8 | -0.82 | -0.83 | 0.44 | 0.49 | 0.41 | True |\n", + "| 119 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector/no_specaug/ce_ls_1/ | 83.9 | 1.92 | nan | 250 | 0.05 | False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | -0.69 | -0.7 | -0.71 | 0.76 | 0.94 | 0.72 | True |\n", + "| 120 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector/no_specaug/ce_ls_0.1/ | 41.9 | 2.75 | nan | 250 | 0.05 | False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | -0.79 | -0.81 | -0.82 | 0.51 | 0.54 | 0.49 | True |\n", + "| 121 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector/specaug/ce_ls_1/ | 88.5 | 1.87 | nan | 250 | 0.05 | False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | -0.72 | -0.73 | -0.74 | 0.75 | 0.79 | 0.69 | True |\n", + "| 122 | joint_training/given_alignments | joint_training/given_alignments/raw_audio/joint_models/glowTTS_ASR_ffn_x_vector/specaug/ce_ls_0.1/ | 25.2 | 3.11 | nan | 250 | 0.05 | 
False | 768 | [0: 7e-06, 109: 0.0007, 110: 0.0007, 250: 1e-08] | {'class': 'adam', 'epsilon': 1e-08} | -0.8 | -0.82 | -0.82 | 0.43 | 0.46 | 0.41 | True |\n" ] } ], @@ -1157,7 +1554,7 @@ }, { "data": { - "image/png": "", + "image/png": "", "text/plain": [ "
" ] @@ -1187,7 +1584,7 @@ }, { "data": { - "image/png": "", + "image/png": "", "text/plain": [ "
" ] @@ -1217,7 +1614,7 @@ }, { "data": { - "image/png": "", + "image/png": "", "text/plain": [ "
" ] @@ -1247,7 +1644,7 @@ }, { "data": { - "image/png": "iVBORw0KGgoAAAANSUhEUgAAAjcAAAGwCAYAAABVdURTAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjcuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/bCgiHAAAACXBIWXMAAA9hAAAPYQGoP6dpAAA5U0lEQVR4nO3de3hU9YH/8c/kQkiATIDcQAMJJtVyESMQCKEBKpUiVal26+bHLqjoPm65iJbtSl3ddavEbuulRlu0KjRtQbyitValkJACkXusrhZDc4EqgQm5kQRBkvP7w82UIclkJpnJnDnzfj3PPI/5njMz3znBnM98rzbDMAwBAABYRFigKwAAAOBLhBsAAGAphBsAAGAphBsAAGAphBsAAGAphBsAAGAphBsAAGApEYGuQH9rb2/XZ599piFDhshmswW6OgAAwAOGYejUqVMaOXKkwsLct82EXLj57LPPlJKSEuhqAACAXjh69Kguvvhit+eEXLgZMmSIpC8vTmxsbIBrAwAAPNHU1KSUlBTnfdydkAs3HV1RsbGxhBsAAIKMJ0NKGFAMAAAshXADAAAshXADAAAshXADAAAshXADAAAshXADAAAshXADAAAshXADAAAshXADAAAshXADAAAsJeS2XwAAIJhUOJpVXdeq1OGDlBY/KNDVCQqEGwAATKih9axWbCxTSbnDWZabkaCCvEzZYyIDWDPzo1sKAAATWrGxTDsP17qU7Txcq+UbDwaoRsGDcAMAgMlUOJpVUu5Qm2G4lLcZhkrKHaqsbQlQzYID4QYAAJOprmt1e7zqJOHGHcINAAAmM3pYjNvjqcMZWOwO4QYAAJMZkzBYuRkJCrfZXMrDbTblZiQwa6oHhBsAAEyoIC9TOenxLmU56fEqyMsMUI2CB1PBAQAwIXtMpAqXZKmytkVVJ1tY58YLhBsAAEwsLZ5Q4y26pQAAgKUENNzk5+drypQpGjJkiBITE7VgwQIdOnSox+e99NJLuuyyyzRw4EBNmDBBb731Vj/UFgAABIOAhpvt27dr6dKleu+997RlyxZ98cUXuvrqq9XS0v38/V27dikvL09LlizRwYMHtWDBAi1YsEAffvhhP9YcAACYlc0wLlj+MIAcDocSExO1fft25ebmdnnOTTfdpJaWFr355pvOsmnTpumKK67Q2rVre3yPpqYm2e12NTY2KjY21md1BwAA/uPN/dtUY24aGxslScOGDev2nNLSUs2ZM8elbO7cuSotLe3y/DNnzqipqcnlAQAArMs04aa9vV0rV65UTk6Oxo8f3+15NTU1SkpKcilLSkpSTU1Nl+fn5+fLbrc7HykpKT6tNwAAMBfThJulS5fqww8/1AsvvODT1129erUaGxudj6NHj/r09QEAgLmYYp2bZcuW6c0331RJSYkuvvhit+cmJyfr+PHjLmXHjx9XcnJyl+dHRUUpKirKZ3UFAADmFtCWG8MwtGzZMr322mvatm2b0tLSenxOdna2tm7d6lK2ZcsWZWdn+6uaAAAgiAS05Wbp0qXasGGDXn/9dQ0ZMsQ5bsZutys6OlqStGjRIl100UXKz8+XJN15552aOXOmHnnkEc2fP18vvPCC9u3bp2eeeSZgnwMAAJhHQFtufvGLX6ixsVGzZs3SiBEjnI9NmzY5zzly5IiOHTvm/Hn69OnasGGDnnnmGU2cOFEvv/yyNm/e7HYQMgAACB2mWuemP7DODQAAwSdo17kBAADoK8INAACwFMINAACwFMINAACwFMINAACwFMINAACwFMINAACwFMINAACwFMINAACwFMINAACwFMINAACwFMINAACwFMINAACwFMINAACwFMINAACwFMINAACwFMINAACwFMINAACwFMINAACwlIhAVwAAAHSvwtGs6rpWpQ4fpLT4QYGuTlAg3AAAYEINrWe1YmOZSsodzrL
cjAQV5GXKHhMZwJqZH91SAACY0IqNZdp5uNalbOfhWi3feDBANQoehBsAAEymwtGsknKH2gzDpbzNMFRS7lBlbUuAahYcCDcAAJhMdV2r2+NVJwk37hBuAAAwmdHDYtweTx3OwGJ3CDcAAJjMmITBys1IULjN5lIebrMpNyOBWVM9INwAAGBCBXmZykmPdynLSY9XQV5mgGoUPJgKDgCACdljIlW4JEuVtS2qOtnCOjdeINwAAGBiafGEGm/RLQUAACyFcAMAACyFcAMAACyFcAMAACyFcAMAACyFcAMAACyFcAMAACyFcAMAACyFcAMAACyFcAMAACyFcAMAACyFcAMAACyFcAMAACyFcAMAACyFcAMAACyFcAMAACyFcAMAACyFcAMAACyFcAMAACyFcAMAACyFcAMAACyFcAMAACyFcAMAACyFcAMAACyFcAMAACyFcAMAACyFcAMAACyFcAMAACyFcAMAACyFcAMAACyFcAMAACyFcAMAACyFcAMAACyFcAMAACwlItAVAAAAvlXhaFZ1XatShw9SWvygQFen3xFuAACwiIbWs1qxsUwl5Q5nWW5GggryMmWPiQxgzfoX3VIAAFjEio1l2nm41qVs5+FaLd94MEA1CgzCDQAAFlDhaFZJuUNthuFS3mYYKil3qLK2JUA163+EGwAALKC6rtXt8aqThBsAABBERg+LcXs8dXjoDCwm3AAAYAFjEgYrNyNB4TabS3m4zabcjISQmjVFuAEAwCIK8jKVkx7vUpaTHq+CvMwA1SgwAhpuSkpKdO2112rkyJGy2WzavHmz2/OLi4tls9k6PWpqavqnwgAAmJg9JlKFS7JUtGqW1t0yRUWrZqlwSVZITQOXArzOTUtLiyZOnKhbb71VN9xwg8fPO3TokGJjY50/JyYm+qN6AAAEpbT40Fy8r0NAw828efM0b948r5+XmJiouLg4j849c+aMzpw54/y5qanJ6/cDAADBIyjH3FxxxRUaMWKEvvGNb2jnzp1uz83Pz5fdbnc+UlJS+qmWAAAgEIIq3IwYMUJr167VK6+8oldeeUUpKSmaNWuWDhw40O1zVq9ercbGRufj6NGj/VhjAADQ34Jqb6lLL71Ul156qfPn6dOn669//asee+wx/frXv+7yOVFRUYqKiuqvKgIAgAALqpabrmRlZenw4cOBrgYAADCJoA83ZWVlGjFiRKCrAQAATCKg3VLNzc0urS6VlZUqKyvTsGHDNGrUKK1evVqffvqpCgsLJUmPP/640tLSNG7cOH3++ed69tlntW3bNr377ruB+ggAAMBkAhpu9u3bp9mzZzt/vvvuuyVJixcv1vr163Xs2DEdOXLEefzs2bP6/ve/r08//VQxMTG6/PLL9cc//tHlNQAAQGizGcYFe6NbXFNTk+x2uxobG10WAgQAAOblzf076MfcAAAAnI9wAwAALIVwAwAALIVwAwAALIVwAwAALIVwAwAALIVwAwAALCWoNs4EAACdVTiaVV3XqtThg5QWPyjQ1Qk4wg0AAEGqofWsVmwsU0m5w1mWm5GggrxM2WMiA1izwKJbCgCAILViY5l2Hq51Kdt5uFbLNx4MUI3MgXADAEAQqnA0q6TcobYLdlFqMwyVlDtUWdsSoJoFHuEGAIAgVF3X6vZ41UnCDQAACCKjh8W4PZ46PHQHFhNuAAAIQmMSBis3I0HhNptLebjNptyMhJCeNUW4AQAgSBXkZSonPd6lLCc9XgV5mQGqkTkwFRwAgCBlj4lU4ZIsVda2qOpkC+vc/B/CDQAAQS4tnlBzPrqlAACApRBuAACApRBuAACApRBuAACApRBuAACApRBuAACApRBuAACApRBuAACApbCIHwAAJlThaFZ1XSurDvcC4QYAABNpaD2rFRvLVFLucJblZiSoIC9T9pjIANYseNAtBQCAiazYWKadh2tdynYertXyjQcDVKPgQ7gBAMAkKhzNKil3qM0wXMrbDEMl5Q5V1rY
EqGbBhXADAIBJVNe1uj1edZJw4wmvw83p06fV2vr3i19dXa3HH39c7777rk8rBgBAqBk9LMbt8dThDCz2hNfh5vrrr1dhYaEkqaGhQVOnTtUjjzyi66+/Xr/4xS98XkEAAELFmITBys1IULjN5lIebrMpNyOBWVMe8jrcHDhwQF/72tckSS+//LKSkpJUXV2twsJCPfHEEz6vIAAAoaQgL1M56fEuZTnp8SrIywxQjYKP11PBW1tbNWTIEEnSu+++qxtuuEFhYWGaNm2aqqurfV5BAABCiT0mUoVLslRZ26Kqky2sc9MLXrfcpKena/PmzTp69KjeeecdXX311ZKkEydOKDY21ucVBAAgFKXFD9LsSxMJNr3gdbi5//77tWrVKqWmpmrq1KnKzs6W9GUrTmYmTWYAACCwbIZxwWR6D9TU1OjYsWOaOHGiwsK+zEd79uxRbGysLrvsMp9X0peamppkt9vV2NhISxMAAEHCm/t3r7ZfSE5OVnJysvPNtm3bpksvvdT0wQYAAFif191S3/3ud/Xkk09K+nLNm8mTJ+u73/2uLr/8cr3yyis+ryAAAIA3vA43JSUlzqngr732mgzDUENDg5544gk9+OCDPq8gAACAN7wON42NjRo2bJgk6e2339aNN96omJgYzZ8/X+Xl5T6vIAAAgDe8DjcpKSkqLS1VS0uL3n77bedU8Pr6eg0cONDnFQQAAPCG1wOKV65cqYULF2rw4MEaPXq0Zs2aJenL7qoJEyb4un4AAABe8TrcfO9731NWVpaOHj2qb3zjG86p4GPGjGHMDQAACLherXPToeOptgs2+DIz1rkBACD4eHP/9nrMjSQVFhZqwoQJio6OVnR0tC6//HL9+te/7lVlAQBAZxWOZhUdOqHK2pZAVyXoeN0t9eijj+q+++7TsmXLlJOTI0nasWOH7rjjDtXW1uquu+7yeSUBAAgVDa1ntWJjmUrKHc6y3IwEFeRlyh4TGcCaBQ+vu6XS0tL0wAMPaNGiRS7lv/rVr/Rf//Vfqqys9GkFfY1uKQCAmS16bo92Hq5V23m353CbTTnp8SpckhXAmgWWX7uljh07punTp3cqnz59uo4dO+btywEAgP9T4WhWSbnDJdhIUpthqKTcQReVh7wON+np6XrxxRc7lW/atEkZGRk+qRQAAKGouq7V7fGqk4QbT3g95uaBBx7QTTfdpJKSEueYm507d2rr1q1dhh4AAOCZ0cNi3B5PHT6on2oS3Lxuubnxxhu1e/duxcfHa/Pmzdq8ebPi4+O1Z88effvb3/ZHHQEACAljEgYrNyNB4RcssRJusyk3I0Fp8YQbT/RpnZtgxIBiAICZNbZ+oeUbDzJb6gLe3L896pZqamry+M0JDAAA9J49JlKFS7JUWduiqpMtSh0+iBYbL3kUbuLi4npchdgwDNlsNrW1tfmkYgAAhLK0eEJNb3kUboqKivxdDwAAAJ/wKNzMnDnT3/UAAADwCa+nggMAgOBS4WhWdV1ryIzfIdwAAGBRobpPVa92BQcAAOa3YmOZdh6udSnbebhWyzceDFCN+gfhBgAACwrlfaq8DjcPPvig6Xf+BgAg1IXyPlVeh5uXXnpJ6enpmj59un7+85+rtra25ycBAIB+Fcr7VHkdbt5//339+c9/1qxZs/TTn/5UI0eO1Pz587Vhwwa1trpPiQAAoH+E8j5Vfd5baufOndqwYYNeeuklff75515t1RAI7C0FAAgVVtqnyud7S7kzaNAgRUdHa8CAATp16lRfXw4AAPhIqO5T1avZUpWVlXrooYc0btw4TZ48WQcPHtQDDzygmpoaX9cPAAD0UVr8IM2+NDEkgo3Ui5abadOmae/evbr88st1yy23KC8vTxdddJE/6gYAAOA1r8PNVVddpeeff15jx471R30AAAD6pNcDis+ePavKykpdcskliogInl0cGFAMAEDw8eb+7fWYm9OnT2vJkiWKiYnRuHHjdOTIEUnS8uXL9fD
DD3v1WiUlJbr22ms1cuRI2Ww2bd68ucfnFBcX68orr1RUVJTS09O1fv16bz8CAACwMK/DzT333KP3339fxcXFGjhwoLN8zpw52rRpk1ev1dLSookTJ+qpp57y6PzKykrNnz9fs2fPVllZmVauXKnbbrtN77zzjlfvCwAArMvr/qTNmzdr06ZNmjZtmmznLQw0btw4/fWvf/XqtebNm6d58+Z5fP7atWuVlpamRx55RJL01a9+VTt27NBjjz2muXPndvmcM2fO6MyZM86fzb4ODwAA6BuvW24cDocSExM7lbe0tLiEHX8oLS3VnDlzXMrmzp2r0tLSbp+Tn58vu93ufKSkpPi1jgAAILC8DjeTJ0/W73//e+fPHYHm2WefVXZ2tu9q1oWamholJSW5lCUlJampqUmnT5/u8jmrV69WY2Oj83H06FG/1hEAAASW191Sa9as0bx58/TRRx/p3Llz+tnPfqaPPvpIu3bt0vbt2/1Rxz6JiopSVFRUoKsBAAD6idctNzNmzFBZWZnOnTunCRMm6N1331ViYqJKS0s1adIkf9TRKTk5WcePH3cpO378uGJjYxUdHe3X9wYAAMGhVwvUXHLJJfrlL3/p67r0KDs7W2+99ZZL2ZYtW/zeHQYAAIKHR+HGmxlG3iyM19zcrMOHDzt/rqysVFlZmYYNG6ZRo0Zp9erV+vTTT1VYWChJuuOOO/Tkk0/qBz/4gW699VZt27ZNL774ossYIAAAENo8CjdxcXEez4Rqa2vz+M337dun2bNnO3++++67JUmLFy/W+vXrdezYMecigZKUlpam3//+97rrrrv0s5/9TBdffLGeffbZbqeBAwBCS4WjWdV1rSGz+zW65tH2C+cPFK6qqtI999yjm2++2dkdVFpaql/96lfKz8/X4sWL/VdbH2D7BQCwnobWs1qxsUwl5Q5nWW5GggryMmWPiQxgzeAr3ty/vd5b6qqrrtJtt92mvLw8l/INGzbomWeeUXFxsdcV7k+EGwCwnkXP7dHOw7VqO++WFm6zKSc9XoVLsgJYM/iKX/eWKi0t1eTJkzuVT548WXv27PH25QAA6JMKR7NKyh0uwUaS2gxDJeUOVda2BKhmCBSvw01KSkqXM6WeffZZVv8FAPS76rpWt8erThJuQo3XU8Efe+wx3XjjjfrDH/6gqVOnSpL27Nmj8vJyvfLKKz6vIAAA7oweFuP2eOpwBhaHGq9bbq655hqVl5fruuuuU11dnerq6nTttdfqk08+0TXXXOOPOgIA0K0xCYOVm5Gg8Atm9YbbbMrNSGDWVAjyekBxsGNAMQBYT2PrF1q+8SCzpSzMm/t3r1YoBgDATOwxkSpckqXK2hZVnWxhnZsQR7gBAFhGWjyhBr0YcwMAAGBmtNwAAACfMcMWGF6Fm/fee0+/+93vdPbsWV111VX65je/6a96AQCA85ghNLhjpi0wPJ4t9fLLL+umm25SdHS0IiMj1dTUpB//+MdatWqVv+voU8yWAgAEEzOFBnf8vQWGX7ZfyM/P1+23367GxkbV19frwQcf1Jo1a/pcWQAA0L0VG8u083CtS9nOw7VavvFggGrUmdm2wPA43Bw6dEirVq1SeHi4JOn73/++Tp06pRMnTvitcgAAhDKzhYbumG0LDI/DTWtrq0sz0IABAzRw4EA1Nzf7pWIAAIQ6s4WG7phtCwyvBhQ/++yzGjx4sPPnc+fOaf369YqPj3eWrVixwne1AwAghJktNHSnYwuM7sbc9PcAaI8HFKempsp2wb4dnV7MZlNFRYVPKuYvDCgGAAQTfw/U9RV/b4Hhzf2bvaUAADAxT0ODWaaK+2sLDMKNG4QbAEAw6i40BMtU8b7yy1RwSWpvb9fzzz+vb33rWxo/frwmTJig6667ToWFhQqxjAQAQL9Kix+k2ZcmyjAMFR064ZwpFQxTxfubxwOKDcPQddddp7feeksTJ07UhAkTZBiGPv74Y91888169dVXtXnzZj9WFQC
A0NVVC82U1KHaW1Xf6dzzp4qbcTVjf/M43Kxfv14lJSXaunWrZs+e7XJs27ZtWrBggQoLC7Vo0SKfVxIAgFDXVQvN/urOweZ8VSdDM9x43C21ceNG/fCHP+wUbCTp61//uu655x799re/9WnlAABA94v5tfcwIsQsU8X7m8fh5s9//rPbjTLnzZun999/3yeVAgDALCoczS5jXAKhp8X8LryZh9tsys1ICMlWG8mLbqm6ujolJSV1ezwpKUn19e6bxwAACBZmmoXU02J+k0YP1d7zuqhy0uNVkJfp72qZlsfhpq2tTRER3Z8eHh6uc+fO+aRSAAAEmrtZSP29eF5PKwAXLslSyScOHTxarytHDdXXMhL6tX5m49VsqZtvvllRUVFdHj9z5ozPKgUAQCB1jHG5UCBnIRXkZXZazC8nPV4PLhivRc/tMUULk1l4HG4WL17c4znMlAIAWIEnG1b2d7ixx0SqcElWp8X8OrZnOF+gWpjMwuNws27dOn/WAwAA0zDzhpVp8X9fodiMLUxm4NUKxQAAhIKOMS7hF2wYbbZZSJ60MIUiwg0AAF0oyMtUTnq8S5nZZiGZuYUpkDzulgIAIJR0N8bFTHqaRWW2+vYXWm4AAHCjY8NKswaFYGhh6m+03AAAEMTsMZH6r+vGandlnWySpo4Zbtog1l8INwAABIkKR7Oq61qdXWRmWkXZTAg3AACYXHch5ou2du2prHM5N9TXuJEINwAAmF5XW0HsOOzoclfwUF/jRmJAMQAAptaxUN/5s6EkdRlszheqa9xIhBsAAEytp4X6uhOqa9xIdEsBAGBqPS3UFyap/byfQ32NG4mWGwAATM3dVhDTLxmuGRkJLuWhvsaNRMsNAACmV5CXqeUbD7rMluoIMfaYSFOvohwINsMwehiSZC1NTU2y2+1qbGxUbGxsoKsDAIDHQjnEeHP/puUGAIAgkRYfeqGmNxhzAwAALIVwAwAALIVwAwAALIVwAwAALIVwAwAALIXZUgAA+EGFo1nVda0hOW070Ag3AAD4UEPrWa3YWOay4F5uRoJzwT34H91SAAD40IqNZdp5uNalbOfhWi3feLDf6lDhaFbRoROqrA3NncFpuQEAwEcqHM0uLTYd2gxDJeUOVda2+KyLqqtuL1qNvkS4AQDAR6rrWt0erzrZ93DjLsC4azUqXJLVp/cNJnRLAQDgI6OHxbg9njq876023QWY2wr3qqTcobYLtow8v9UoVBBuAADwkTEJg5WbkaBwm82lPNxmU25GQp9bbTq6vboKMHur6t0+t+ok4QYmE+qDwwAgWBTkZSonPd6lLCc9XgV5mX1+7Z66vdzxRatRsGDMjckxOAwAgos9JlKFS7JUWduiqpMtPl3npqdurymjh+rAkQaXlp1wm0056fEhtdYOLTcmZ4YphQAA76XFD9LsSxN9GircdXtNSR2qm6en6spRcS7Hzm81CpVeAFpuTMyXUwpZKRMArKEgL1PLNx50uT/ERkdob1W9c9zNlNShWjw9VeNG2pUWP0gNrWe16Lk9IdMLQLgxMV9MKaRbCwCs5cJur59vO6wDRxpczjlQ3aDoyL+pcMlISe57Aaw4RZxuKRPzdEqhu2ZGurUAwJrS4gdp9LAY7a2udzv9290MK6tOEaflxsQ6+lZ3lDvUfsGxoTGRCrfJbTNjf66UCQDof5608PfEFwsLmg0tNybXXfdR0+kvdP1TO922yvjiHz2sIVQGEQKhxpMW/v5YWNBsaLkxuZMtZ1Tf+kWn8jZD3ZT/vVUmFP9BwxVjrgBr62jh33m41u30b0/OsRJabkyutws2VZ1s8ftKmTA/xlwB1ufJooEFeZnKdDNF3GpouTG5nlpfutPRKtPVlEEr/4PG3zHmCggNPS0a2NGCu6/679szTEkdaukWXMKNyblrcoyNjlDT6XNumxn9uVImzK0/dicGYB5p8V3/fe+qBfdAdYNlp4FLdEsFhe6aHN9YOsPj/Uv8sVImzI0
xVwBCcRq4RMuNKVQ4mrW78qQkm6aNGd4pgLhrfaFVBt3xdKAhAOsK1RZcU7TcPPXUU0pNTdXAgQM1depU7dmzp9tz169fL5vN5vIYOHBgP9bWdxpaz+r//fI9ff2R7Vr96oda/eoHmv3TYuU9854au5gJ1V3rC60y6I4/dyeG+bEEQOjo7ncdqi24AW+52bRpk+6++26tXbtWU6dO1eOPP665c+fq0KFDSkxM7PI5sbGxOnTokPNn2wWzgYLFio1l2vXXk53KSytOWrov1IrMuncXY65CE0sAhI6efteh2oIb8JabRx99VLfffrtuueUWjR07VmvXrlVMTIyef/75bp9js9mUnJzsfCQlJfVjjX2ju5ksHazcF+qpYPjW2bEZ3dcf2a5b1u3V7J8Wa9Fze7pseQskWvdCC0sAhA5Pfteh2IIb0Jabs2fPav/+/Vq9erWzLCwsTHPmzFFpaWm3z2tubtbo0aPV3t6uK6+8UmvWrNG4ceO6PPfMmTM6c+aM8+empibffYAuePoN3pP1a6zaF9qTYPrWGWqb0cH8WAIgdHj6uw7FFtyAttzU1taqra2tU8tLUlKSampqunzOpZdequeff16vv/66fvOb36i9vV3Tp0/X3/72ty7Pz8/Pl91udz5SUlJ8/jkk77/Be7J+jVX7QnsSLN86Q3UWglUEQ8tgb7DtSujw9ncdSi24Ae+W8lZ2drYWLVqkK664QjNnztSrr76qhIQEPf30012ev3r1ajU2NjofR48e9Uu9vL0hd/SDdsdsKwj76kbQ0+t4GxgCeYPiJhKcgqUrsbdCdQBpKOJ33b2AdkvFx8crPDxcx48fdyk/fvy4kpOTPXqNyMhIZWZm6vDhw10ej4qKUlRUVJ/r6o63zcAdXVer5n5F59rbOw0qzh4z3DR9ob7qIvL0dTydtmiGriv+sAQnq3clhuoA0lDE77p7AW25GTBggCZNmqStW7c6y9rb27V161ZlZ2d79BptbW364IMPNGLECH9Vs0eefoO/8BvjdU/uVERYmN5YmqOHb5ig/BsmqGjVLG38l2k+vUH3pXXDV11Enr6Op4HBDF1X7N0VfEKlKzEUB5BaXXd/x/lddy3gU8HvvvtuLV68WJMnT1ZWVpYef/xxtbS06JZbbpEkLVq0SBdddJHy8/MlSf/93/+tadOmKT09XQ0NDfrJT36i6upq3XbbbQH7DEOj3QeRnm7IkvzyjdFd68bJljMuA587WpPCbTaVHa1XXctZjR0R65OBid60bHnyTcRMAybZuyu4hMqCZqE4gNSqemql5nfdtYCHm5tuukkOh0P333+/ampqdMUVV+jtt992DjI+cuSIwsL+3sBUX1+v22+/XTU1NRo6dKgmTZqkXbt2aezYsYH6CHp0S3m3xzq+wQfihtxVmNpR7tCsnxap/rzxBVERYTpzrt3r1/f0RuDtDaWnwGCmGxR/WIJLqHUldrfXEIKHp92onv6uzboml68FPNxI0rJly7Rs2bIujxUXF7v8/Nhjj+mxxx7rh1p5pqf1alZd/RVJ/X9D7q5e7ZJLsJHUq2AjeX4j8PaG0lNg6KkvNSKs/xd15CYSHBijgGDiyy/FZhin2J+CbraU2fQUWk62npXU/98YPVlHp7fCJE0ZPdTj/6l6Ozalu2mLPUWxc+1GD2cgFHWMWVh19VcYo4Cg4MsZmWYYp9ifTNFyE8w8DS39/Y2xp3FAfdEuaW91vRY9t8fj1O/LsSk9XfNAtNzAvLr7xvrGshydbDlr+eZ5BC9ffSk20zjF/kLLTR950yrRl1Ht3s54cjcOyFe8Sf0dXU1Fq2Zp3S1TVLRqlgqXZPWqObS7a97hn5/bY6l1S9A33X1j/ek7n4TMgmYITr6akRmKa3LZDMMIqTb8pqYm2e12NTY2KjY21iev2dj6RadWCXd9md4MPu1NP2mFo1lff2R7Lz+N94pWzer3G0RX1/x8HS1iVli3BL3X0/8Lgfi3C3jD2/tLV6z
y/4E392+6pXzA2xkz3gw+7c2CY/4cb9OVQEyf7bjmJZ84tOj5PZ2OW7m5FZ4z08w6oDd8MSMzFAfS0y3lQ77et6O3C455sm+VLwVy+uyF1+ZCVmxuhedCbeo3rKuv95dQW+yPlhsT6+23zo6UvuOwQ/6eOOTNrCl/4OYFd0LxGyvQlVBbk4uWGxPry427IC9TY0f2fUzRLdNT3R6/uYfj/sYWCOhJqH1jBdwJlZ3BCTcm1pcbtz0mUk/8Y9//eM++LNHt8bEX2fv8Hn3FzQvu+HKmHoDgQLeUyfVlfZgxCYM1efRQ7auu7/ac/Bsm6NX9f9OBIw1dNtvnfiXB9M36odbcit5hFWkgdBBuTK6vN+5rJoxwG25skp5dPMVtgAqWzSG5eQEAJMJN0OjtjTt6QLjb43+padLUMcPdBihaRgAAwYRwY1EdO79eFBft9rz1u6q1fle1xl8UqzXfnqDZl3Y/xoaWEQBAMCDcWExXKxrboyPUePqc2+d9+GmTrntyp6V3iQUAhAZmS/nJhXtBebs3VG91taLxqdPnNNTDsLKj3GHZXWIBAKGBlhsf66rlZGhMpOrP28TRX60j3e382i6pvvUL/XpJlg4cqddjbjbVbJfYtgAAENRoufGxrlpO6i/Yndqb3bS90dOKxufaDV17+UiPXottCwAAwYpw40Pd7QV1oZ72huotT1Y07lgYMMzm9lS2LQAABC3CjQ95uxu3r1tHPF3RuCAvUzPSE7p8DbYtAAAEO8KND3m7G7c/Wkc82YqgY92aN5bmaPwF+0+ZcXE+AAC8YTOMHvpQLKapqUl2u12NjY2Kje37xpIXWvTcnk5bFVyoY+uCwiVZPn//Dt4suMfifAAAs/Pm/k248bHG1i86bVXQX7OlAACwKm/u30wF97HutiqgdQQAgP5BuPGTC7cqYOsCAAD6BwOKAQCApRBuAACApRBuAACApTDmBgAAk6twNKu6rpVJKR4i3AAAYFJdbcbMciI9o1sKAACT6mozZn9tvmwlhBsAAEyou82Y/bX5spUQbgAAMKGeNmM+f/PlCkezig6dIPD8H8bcAABgMhWOZtU0fu72nNThgxiT0w3CDQAAJtFVWLlQx+bLafGDnJs1n69jTI4/N2c2O7qlAAAwia4GEF8oJz1eBXmZjMlxg5YbAABMoCOsdOfhGyZo6pjhznVuDhytd/t6VSdbQnZNHFpuAAAwgZ4GEA+JinAJK6OHxbg9P3V4aAYbiXADAIAp9BRW1u+qcvl5TMJg5WYkKNxmcykPt9mUm5EQsq02EuEGAABTGJMwWFNSh3Z7fG91fadxNAV5mcpJj3cp6xiTE8oYcwMAgEksnp6qvVXdj6W5cByNPSZShUuyVFnboqqTLew99X8INwAAmMTYEbFuj3c3jiYtnlBzPrqlAAAwCcbR+AbhBgAAE2EcTd/RLQUAgIkwjqbvCDcAAJgQ42h6j24pAABgKYQbAABgKYQbAABgKYQbAABgKYQbAABgKYQbAABgKYQbAABgKYQbAABgKYQbAABgKYQbAABgKYQbAABgKewtBQBAiKhwNKu6rtXym3ESbgAAsLiG1rNasbFMJeUOZ1luRoIK8jJlj4kMYM38g24pAAAs7tZ1e1yCjSSVlDt0y7o9AaqRfxFuAACwsApHsw4cbezy2IGjDaqsbennGvkf4QYAAIuocDSr6NAJl8Dy69Iqt8/5TQ/HgxFjbgAACELnDw4eGhPZ7Ziao/Wn3b5OdV2rv6va7wg3AAAEka4GBw+NiVTT6S9cztt5uFa3Fe7V+JF2t683d1yyX+oZSIQbAACCyIqNZdp5uNalrL71i07ntRmG9lbVa29VfbevFREm/cPkFJ/XMdAINwAABIkKR3OnWU+9FRFm0xtLc3zyWmZDuAEAIEj4cnzMlrtnWnYhP2ZLAQAQJEYPi/HZa1WdtN4U8A6EGwAAgsSYhMHKzUhQuM3mUh6mLwcVeyN1uDVbbSTCDQAAQaUgL1M
56fEuZTMyElS8araKVs3SulumaMrooZ0CUIdwm025GQmW7ZKSJJthGEagK9GfmpqaZLfb1djYqNjY2EBXBwCAXqmsbVHVyZYuN8FsbP1Cyzce7HLwcbDuKeXN/dsU4eapp57ST37yE9XU1GjixIkqKChQVlZWt+e/9NJLuu+++1RVVaWMjAz9+Mc/1jXXXOPRe/VXuNl+6ITK/tagK0cN1dcyEvz2Pp7obhfYQOwOGyo70gKAGXQEoIgwm861G0H9t9eb+3fAZ0tt2rRJd999t9auXaupU6fq8ccf19y5c3Xo0CElJiZ2On/Xrl3Ky8tTfn6+vvWtb2nDhg1asGCBDhw4oPHjxwfgE7iqPtmiBU/tdFlzYGhMpN5YOkMpw303EMwT3e0C++CC8fqPzR/26+6wobYjLQCYQVp88IaZvgh4y83UqVM1ZcoUPfnkk5Kk9vZ2paSkaPny5brnnns6nX/TTTeppaVFb775prNs2rRpuuKKK7R27doe38/fLTeZ//1ul4spDY2J1MH7r/b5+7mz6Lk92nm4Vm3n/YrDbTbFRkeo6fS5TuU56fEqXNJ9i5k/6uLP9wQAWIc39++ADig+e/as9u/frzlz5jjLwsLCNGfOHJWWlnb5nNLSUpfzJWnu3Lndnn/mzBk1NTW5PPxl+6ETXQYb6cvVI//ko4WXPNGx0FPbBdm1zTBU3/pFl+Ul5Q6/7A7rri7+ek8AQOgKaLipra1VW1ubkpKSXMqTkpJUU1PT5XNqamq8Oj8/P192u935SEnx3zLTZX9rcHv8wJHul8D2td4u9OSPdQ96qouV11oAAPQ/y08FX716tRobG52Po0eP+u29rrg4zu3xK0cN9dt7X6i3Cz35Y92Dnupi5bUWAAD9L6DhJj4+XuHh4Tp+/LhL+fHjx5Wc3PUupcnJyV6dHxUVpdjYWJeHv8y8NLHbRZSGxkT266yp7hZ6CrfZNDQmsstyf6174K4uVl9rAQDQ/wIabgYMGKBJkyZp69atzrL29nZt3bpV2dnZXT4nOzvb5XxJ2rJlS7fn97c3ls7oFHA6Zkv1t64WespJj9cbS2d0WV6Ql9nvdfHnewIAQlPAZ0tt2rRJixcv1tNPP62srCw9/vjjevHFF/WXv/xFSUlJWrRokS666CLl5+dL+nIq+MyZM/Xwww9r/vz5euGFF7RmzRqPp4L31zo3fyp36MCRelOsc9PdQk/uFoDq77oAAOBOUK1zc9NNN8nhcOj+++9XTU2NrrjiCr399tvOQcNHjhxRWNjfG5imT5+uDRs26D/+4z/0wx/+UBkZGdq8ebMp1rg539cyEgIeajp0t85BINY/CNU1FwAA/SfgLTf9je0XAAAIPkGzzg0AAICvEW4AAIClEG4AAIClEG4AAIClEG4AAIClEG4AAIClEG4AAIClEG4AAIClEG4AAIClBHz7hf7WsSBzU1NTgGsCAAA81XHf9mRjhZALN6dOnZIkpaSkBLgmAADAW6dOnZLdbnd7TsjtLdXe3q7PPvtMQ4YM0alTp5SSkqKjR4+yz5SfNDU1cY39jGvcP7jO/sc19r9gvsaGYejUqVMaOXKky4baXQm5lpuwsDBdfPHFkiSbzSZJio2NDbpfcrDhGvsf17h/cJ39j2vsf8F6jXtqsenAgGIAAGAphBsAAGApIR1uoqKi9J//+Z+KiooKdFUsi2vsf1zj/sF19j+usf+FyjUOuQHFAADA2kK65QYAAFgP4QYAAFgK4QYAAFgK4QYAAFhKSIebp556SqmpqRo4cKCmTp2qPXv2BLpKQSs/P19TpkzRkCFDlJiYqAULFujQoUMu53z++edaunSphg8frsGDB+vGG2/U8ePHA1Tj4Pbwww/LZrNp5cqVzjKur298+umn+qd/+icNHz5c0dHRmjBhgvbt2+c8bhiG7r//fo0YMULR0dGaM2eOysvLA1jj4NLW1qb77rtPaWlpio6O1iWXXKIf/ehHLvsFcY29U1JSomu
vvVYjR46UzWbT5s2bXY57cj3r6uq0cOFCxcbGKi4uTkuWLFFzc3M/fgofM0LUCy+8YAwYMMB4/vnnjf/93/81br/9diMuLs44fvx4oKsWlObOnWusW7fO+PDDD42ysjLjmmuuMUaNGmU0Nzc7z7njjjuMlJQUY+vWrca+ffuMadOmGdOnTw9grYPTnj17jNTUVOPyyy837rzzTmc517fv6urqjNGjRxs333yzsXv3bqOiosJ45513jMOHDzvPefjhhw273W5s3rzZeP/9943rrrvOSEtLM06fPh3AmgePhx56yBg+fLjx5ptvGpWVlcZLL71kDB482PjZz37mPIdr7J233nrLuPfee41XX33VkGS89tprLsc9uZ7f/OY3jYkTJxrvvfee8ac//clIT0838vLy+vmT+E7IhpusrCxj6dKlzp/b2tqMkSNHGvn5+QGslXWcOHHCkGRs377dMAzDaGhoMCIjI42XXnrJec7HH39sSDJKS0sDVc2gc+rUKSMjI8PYsmWLMXPmTGe44fr6xr//+78bM2bM6PZ4e3u7kZycbPzkJz9xljU0NBhRUVHGxo0b+6OKQW/+/PnGrbfe6lJ2ww03GAsXLjQMg2vcVxeGG0+u50cffWRIMvbu3es85w9/+INhs9mMTz/9tN/q7ksh2S119uxZ7d+/X3PmzHGWhYWFac6cOSotLQ1gzayjsbFRkjRs2DBJ0v79+/XFF1+4XPPLLrtMo0aN4pp7YenSpZo/f77LdZS4vr7yxhtvaPLkyfqHf/gHJSYmKjMzU7/85S+dxysrK1VTU+Nyne12u6ZOncp19tD06dO1detWffLJJ5Kk999/Xzt27NC8efMkcY19zZPrWVpaqri4OE2ePNl5zpw5cxQWFqbdu3f3e519IeQ2zpSk2tpatbW1KSkpyaU8KSlJf/nLXwJUK+tob2/XypUrlZOTo/Hjx0uSampqNGDAAMXFxbmcm5SUpJqamgDUMvi88MILOnDggPbu3dvpGNfXNyoqKvSLX/xCd999t374wx9q7969WrFihQYMGKDFixc7r2VXfzu4zp6555571NTUpMsuu0zh4eFqa2vTQw89pIULF0oS19jHPLmeNTU1SkxMdDkeERGhYcOGBe01D8lwA/9aunSpPvzwQ+3YsSPQVbGMo0eP6s4779SWLVs0cODAQFfHstrb2zV58mStWbNGkpSZmakPP/xQa9eu1eLFiwNcO2t48cUX9dvf/lYbNmzQuHHjVFZWppUrV2rkyJFcY/hMSHZLxcfHKzw8vNNMkuPHjys5OTlAtbKGZcuW6c0331RRUZEuvvhiZ3lycrLOnj2rhoYGl/O55p7Zv3+/Tpw4oSuvvFIRERGKiIjQ9u3b9cQTTygiIkJJSUlcXx8YMWKExo4d61L21a9+VUeOHJEk57Xkb0fv/du//Zvuuece/eM//qMmTJigf/7nf9Zdd92l/Px8SVxjX/PkeiYnJ+vEiRMux8+dO6e6urqgveYhGW4GDBigSZMmaevWrc6y9vZ2bd26VdnZ2QGsWfAyDEPLli3Ta6+9pm3btiktLc3l+KRJkxQZGelyzQ8dOqQjR45wzT1w1VVX6YMPPlBZWZnzMXnyZC1cuND531zfvsvJyem0hMEnn3yi0aNHS5LS0tKUnJzscp2bmpq0e/durrOHWltbFRbmeusJDw9Xe3u7JK6xr3lyPbOzs9XQ0KD9+/c7z9m2bZva29s1derUfq+zTwR6RHOgvPDCC0ZUVJSxfv1646OPPjL+5V/+xYiLizNqamoCXbWg9K//+q+G3W43iouLjWPHjjkfra2tznPuuOMOY9SoUca2bduMffv2GdnZ2UZ2dnYAax3czp8tZRhcX1/Ys2ePERERYTz00ENGeXm58dvf/taIiYkxfvOb3zjPefjhh424uDjj9ddfN/785z8b119/PdOUvbB48WLjoosuck4Ff/XVV434+HjjBz/4gfMcrrF3Tp06ZRw8eNA4ePC
gIcl49NFHjYMHDxrV1dWGYXh2Pb/5zW8amZmZxu7du40dO3YYGRkZTAUPVgUFBcaoUaOMAQMGGFlZWcZ7770X6CoFLUldPtatW+c85/Tp08b3vvc9Y+jQoUZMTIzx7W9/2zh27FjgKh3kLgw3XF/f+N3vfmeMHz/eiIqKMi677DLjmWeecTne3t5u3HfffUZSUpIRFRVlXHXVVcahQ4cCVNvg09TUZNx5553GqFGjjIEDBxpjxowx7r33XuPMmTPOc7jG3ikqKury7+/ixYsNw/Dsep48edLIy8szBg8ebMTGxhq33HKLcerUqQB8Gt+wGcZ5y0ICAAAEuZAccwMAAKyLcAMAACyFcAMAACyFcAMAACyFcAMAACyFcAMAACyFcAMAACyFcAMAACyFcAMAACyFcAPAtN5++23ZbDbV1NS4lI8YMUKpqakuZVVVVbLZbM4NAmfNmiWbzdbpcccddzifc355bGyspkyZotdff93vnwuAfxFuAJjWjBkzFBERoeLiYmfZxx9/rNOnT6u+vl5VVVXO8qKiIkVFRSknJ8dZdvvtt+vYsWMuj//5n/9xeY9169bp2LFj2rdvn3JycvSd73xHH3zwgb8/GgA/ItwAMIWXX35ZEyZMUHR0tIYPH645c+bIZrNpypQpLuGmuLhYM2bMUE5OTqfyadOmaeDAgc6ymJgYJScnuzxiY2Nd3jcuLk7Jycn6yle+oh/96Ec6d+6cioqK/P1xAfgR4QZAwB07dkx5eXm69dZb9fHHH6u4uFg33HCDDMPQ7NmzXcJGUVGRZs2apZkzZ7qUFxcXa/bs2b2uw7lz5/Tcc89JkgYMGND7DwMg4NgVHEDAHThwQJMmTVJVVZVGjx7tcuyPf/yjvvGNb+izzz7TiBEjlJSUpDfffFPnzp1TXl6eqqqqVFFRoUsuuUTbt29Xbm6upC/H3OzatatTUHn66ae1cOFCSV+OuRk4cKDCw8N1+vRptbe3KzU1Vfv379ewYcP658MD8LmIQFcAACZOnKirrrpKEyZM0Ny5c3X11VfrO9/5joYOHarp06drwIABKi4u1sSJE3X69GldeeWVam9vl8PhUGVlpYqLixUdHa1p06a5vO7ChQt17733upQlJSW5/PzYY49pzpw5qqio0F133aUnnniCYAMEOcINgIALDw/Xli1btGvXLr377rsqKCjQvffeq927dystLU1ZWVkqKipSXV2dZsyYofDwcIWHh2v69OkqKipSUVGRcnJyOrXS2O12paenu33v5ORkpaenKz09XevWrdM111yjjz76SImJif78yAD8iDE3AEzBZrMpJydHDzzwgA4ePKgBAwbotddekyTNnj1bxcXFKi4u1qxZs5zPyc3NVXFxsbZv396n8TYdsrKyNGnSJD300EN9fi0AgUO4ARBwu3fv1po1a7Rv3z4dOXJEr776qhwOh7761a9K+jLclJeX65133tHMmTOdz5s5c6Y2b96so0ePdhluWltbVVNT4/Kor693W5eVK1fq6aef1qeffurbDwmg3zCgGEDAffzxx7rrrrt04MABNTU1afTo0Vq+fLmWLVsmSfr8888VFxenyMhI1dfXKyLiyx71M2fOKC4uThERES7l0pcDirdv397pvebOnau3335b0petRa+99poWLFjgPG4YhsaOHavZs2fr5z//uR8/NQB/IdwAAABLoVsKAABYCuEGAABYCuEGAABYCuEGAABYCuEGAABYCuEGAABYCuEGAABYCuEGAABYCuEGAABYCuEGAABYCuEGAABYyv8Hvl5sOEFaNGQAAAAASUVORK5CYII=", + "image/png": "", "text/plain": [ "
" ] @@ -1277,7 +1674,7 @@ }, { "data": { - "image/png": "", + "image/png": "", "text/plain": [ "
" ] diff --git a/users/rilling/evaluation/wer_eval.ipynb b/users/rilling/evaluation/wer_eval.ipynb index ebc9639f8..69804a5a4 100644 --- a/users/rilling/evaluation/wer_eval.ipynb +++ b/users/rilling/evaluation/wer_eval.ipynb @@ -1062,7 +1062,7 @@ " with open(f, \"r\") as ff:\n", " wers.append(float(ff.readline().replace(\"\\n\", \"\")))\n", " folders = [\n", - " \"librispeech_glow_asr\",\n", + " \"ASR_only\",\n", " \"joint_training/default\",\n", " \"joint_training/conformer_coupling\",\n", " \"joint_training/given_alignments\",\n", @@ -2283,6 +2283,7 @@ "source": [ "ctc_scores = []\n", "ctc_dev_scores = []\n", + "ctc_devtrain_scores = []\n", "overfitting = []\n", "mle_scores = []\n", "dp_scores = []\n", @@ -2299,6 +2300,7 @@ " finished.append(False)\n", " ctc_scores.append(np.nan)\n", " ctc_dev_scores.append(np.nan)\n", + " ctc_devtrain_scores.append(np.nan)\n", " mle_scores.append(np.nan)\n", " dp_scores.append(np.nan)\n", " overfitting.append(np.nan)\n", @@ -2307,7 +2309,12 @@ " finished.append(True)\n", " ctc_scores.append(last_epoch_data[\"error\"][\"ctc\"])\n", " ctc_dev_scores.append(last_epoch_data[\"error\"][\"dev_loss_ctc\"])\n", - " overfitting.append(ctc_dev_scores[-1] / ctc_scores[-1])\n", + " if \"devtrain_loss_ctc\" in last_epoch_data[\"error\"]:\n", + " ctc_devtrain_scores.append(last_epoch_data[\"error\"][\"devtrain_loss_ctc\"])\n", + " overfitting.append(ctc_dev_scores[-1] / ctc_scores[-1])\n", + " else:\n", + " ctc_devtrain_scores.append(np.nan)\n", + " overfitting.append(np.nan)\n", "\n", " if \"mle\" in last_epoch_data[\"error\"]:\n", " breakpoint()\n", diff --git a/users/rilling/experiments/librispeech/librispeech_glow_asr/experiments.py b/users/rilling/experiments/librispeech/librispeech_glow_asr/experiments.py index 90c4d0fbd..a5777f2e4 100644 --- a/users/rilling/experiments/librispeech/librispeech_glow_asr/experiments.py +++ b/users/rilling/experiments/librispeech/librispeech_glow_asr/experiments.py @@ -37,7 +37,7 @@ def 
glowASR(TTS_experiments: dict): :param dict TTS_experiments: Dictionary containing the TTS-only experiments from ../librispeech_glowtts to import Glow-TTS decoder parameters """ - prefix_name = "experiments/librispeech/librispeech_glow_asr/pytorch/" + prefix_name = "experiments/librispeech/ASR_only/raw_audio/" train_settings = TrainingDatasetSettings( custom_processing_function=None, partition_epoch=3, epoch_wise_filters=[], seq_ordering="laplace:.1000" diff --git a/users/rilling/experiments/librispeech/librispeech_glowtts/glowTTS/experiments.py b/users/rilling/experiments/librispeech/librispeech_glowtts/glowTTS/experiments.py index 56164bf9f..9827b994f 100644 --- a/users/rilling/experiments/librispeech/librispeech_glowtts/glowTTS/experiments.py +++ b/users/rilling/experiments/librispeech/librispeech_glowtts/glowTTS/experiments.py @@ -29,7 +29,7 @@ def get_pytorch_glowTTS(x_vector_exp: dict, gl_checkpoint: dict): :return dict: Dictionary containing the experiment dictionaries to import attributes of experiment jobs in other experiment folders (should be done using storage) """ - prefix = "experiments/librispeech/tts_architecture/glow_tts/raw_audio/" + prefix = "experiments/librispeech/TTS_only/v1/raw_audio/" experiments = {} def run_exp( diff --git a/users/rilling/experiments/librispeech/librispeech_glowtts/glowTTS/gt_extraction.py b/users/rilling/experiments/librispeech/librispeech_glowtts/glowTTS/gt_extraction.py index d63eda851..03367cd63 100644 --- a/users/rilling/experiments/librispeech/librispeech_glowtts/glowTTS/gt_extraction.py +++ b/users/rilling/experiments/librispeech/librispeech_glowtts/glowTTS/gt_extraction.py @@ -21,7 +21,7 @@ def get_ground_truth_audio_and_spectrograms(): :return: durations_hdf """ - prefix = "experiments/librispeech/tts_architecture/glow_tts/ground_truth/" + prefix = "experiments/librispeech/ground_truth/" def run_exp(name, args, dataset): forward_config = get_forward_config( diff --git 
a/users/rilling/experiments/librispeech/librispeech_joint_training/experiments.py b/users/rilling/experiments/librispeech/librispeech_joint_training/experiments.py index dd1ecffe2..6d620552e 100644 --- a/users/rilling/experiments/librispeech/librispeech_joint_training/experiments.py +++ b/users/rilling/experiments/librispeech/librispeech_joint_training/experiments.py @@ -1727,6 +1727,104 @@ def run_exp( search_args=default_search_args, tts_forward=False, ) + + #=================== 200EP training ================================= + model_config_specaug = copy.deepcopy(model_config) + model_config_specaug.specaug_config = specaug_config + train_args_200ep = { + "net_args": {"fe_config": asdict(fe_config), "model_config": asdict(model_config_specaug)}, + "network_module": net_module, + "debug": True, + "config": { + "optimizer": {"class": "adam", "epsilon": 1e-8}, + "learning_rates": list(np.linspace(7e-6, 7e-4, 88)) + + list(np.linspace(7e-4, 7e-5, 88)) + + list(np.linspace(7e-5, 1e-8, 24)), + "batch_size": 300 * 16000, + "max_seq_length": {"audio_features": 25 * 16000}, + "max_seqs": 60, + }, + } + + train_args_200ep["config"]["preload_from_files"] = { + "x_vector_model": { + "filename": x_vect_train_job.out_checkpoints[x_vect_train_job.returnn_config.get("num_epochs", 100)], + "init_for_train": True, + "prefix": "x_vector.", + "ignore_missing": True, + } + } + + net_module = "glowTTS_ASR_conformer_x_vector_v2" + train_args_200ep["network_module"] = net_module + + exp_dict = run_exp( + net_module + "_200ep_spec_augment_ctc_scale_0.1", + train_args_200ep, + training_datasets_pe1, + asr_test_datasets, + 200, + training_args={"ctc_scale": 0.1}, + forward_args=forward_args, + search_args=default_search_args, + eval_tts=True, + tts_eval_datasets=tts_forward_datasets_xvectors, + eval_invertibility=True, + ) + + net_module = "glowTTS_ASR_conformer_two_forward_pass" + train_args_200ep_two_forward = copy.deepcopy(train_args_200ep) + del 
train_args_200ep_two_forward["config"]["preload_from_files"] + train_args_200ep_two_forward["network_module"] = net_module + + exp_dict = run_exp( + net_module + "_200ep_ctc_scale_0.1", + train_args_200ep_two_forward, + training_datasets_pe1, + asr_test_datasets, + 200, + training_args={"ctc_scale": 0.1}, + forward_args=forward_args, + search_args=default_search_args, + eval_tts=True, + tts_eval_datasets=tts_forward_datasets, + eval_invertibility=True, + ) + + for lm in [2.0, 2.5, 3.0, 3.5, 4.0, 4.5]: + for ps in [0, 0.3, 0.5]: + additional_search_args = {"lm_weight": lm, "prior_scale": ps} + suffix = f"/tuning/lm_{lm}_ps_{ps}" + + exp_dict = run_exp( + train_args_200ep["network_module"] + "_200ep_spec_augment_ctc_scale_0.1" + suffix, + train_args_200ep, + training_datasets_pe1, + asr_test_datasets, + 200, + training_args={"ctc_scale": 0.1}, + forward_args=forward_args, + search_args={**default_search_args, **additional_search_args}, + eval_tts=True, + tts_eval_datasets=tts_forward_datasets_xvectors, + ) + breakpoint() + + exp_dict = run_exp( + train_args_200ep_two_forward["network_module"] + "_200ep_ctc_scale_0.1" + suffix, + train_args_200ep_two_forward, + training_datasets_pe1, + asr_test_datasets, + 200, + training_args={"ctc_scale": 0.1}, + forward_args=forward_args, + search_args={**default_search_args, **additional_search_args}, + eval_tts=True, + tts_eval_datasets=tts_forward_datasets, + eval_invertibility=True, + with_prior=True, + ) + # ================== BLSTM ================= model_config_blstm = ModelConfigV2( specaug_config=None, diff --git a/users/rilling/experiments/librispeech/librispeech_joint_training/pytorch_networks/glowTTS_ASR_conformer_two_forward_pass.py b/users/rilling/experiments/librispeech/librispeech_joint_training/pytorch_networks/glowTTS_ASR_conformer_two_forward_pass.py index c9b6ec80b..622fc9883 100644 --- a/users/rilling/experiments/librispeech/librispeech_joint_training/pytorch_networks/glowTTS_ASR_conformer_two_forward_pass.py 
+++ b/users/rilling/experiments/librispeech/librispeech_joint_training/pytorch_networks/glowTTS_ASR_conformer_two_forward_pass.py @@ -520,7 +520,6 @@ def train_step(*, model: Model, data, run_ctx, **kwargs): ) num_phonemes = torch.sum(phonemes_eow_len) - breakpoint() if "ctc_scale" in kwargs: ctc_scale = kwargs["ctc_scale"] else: diff --git a/users/rilling/experiments/librispeech/librispeech_joint_training/pytorch_networks/shared/eval_invertibility.py b/users/rilling/experiments/librispeech/librispeech_joint_training/pytorch_networks/shared/eval_invertibility.py index 7dcde6680..7eafca4d7 100644 --- a/users/rilling/experiments/librispeech/librispeech_joint_training/pytorch_networks/shared/eval_invertibility.py +++ b/users/rilling/experiments/librispeech/librispeech_joint_training/pytorch_networks/shared/eval_invertibility.py @@ -111,26 +111,11 @@ def forward_step_asr_invertibility(*, model, data, run_ctx, **kwargs): phonemes = data["phonemes"] phonemes_len = data["phonemes:size1"] - if "xvectors" in data: - g = data["xvectors"] - elif "speaker_labels" in data: - g = data["speaker_labels"] - else: - raise Exception("Missing speaker embedding!") - squeezed_audio = torch.squeeze(raw_audio) y, y_lengths = model.feature_extraction(squeezed_audio, raw_audio_len) # [B, T, F] y = y.transpose(1, 2) # [B, F, T] - if hasattr(model, "x_vector"): - _, _, g = model.x_vector(y, y_lengths) - - if hasattr(model, "x_vector_bottleneck"): - g = model.x_vector_bottleneck(g) - elif hasattr(model, "emb_g"): - g = torch.nn.functional.normalize(model.emb_g(g.squeeze(-1))).unsqueeze(-1) - else: - g = None + g = None y_max_length = y.size(2) diff --git a/users/rilling/experiments/librispeech/librispeech_joint_training/training_comparison.ipynb b/users/rilling/experiments/librispeech/librispeech_joint_training/training_comparison.ipynb index 56937615a..ce73c9e82 100644 --- a/users/rilling/experiments/librispeech/librispeech_joint_training/training_comparison.ipynb +++ 
b/users/rilling/experiments/librispeech/librispeech_joint_training/training_comparison.ipynb @@ -2,18 +2,9 @@ "cells": [ { "cell_type": "code", - "execution_count": 15, + "execution_count": 1, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "The autoreload extension is already loaded. To reload it, use:\n", - " %reload_ext autoreload\n" - ] - } - ], + "outputs": [], "source": [ "import sys\n", "sys.path.append(\"/u/lukas.rilling/dev/\")\n", @@ -32,29 +23,30 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_ctc_scale_0.1/training': '/glowTTS_ASR_conformer_two_forward_pass_ctc_scale_0.1/',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_strong_conformer_ctc_scale_1.0/training': '/glowTTS_ASR_conformer_two_forward_pass_strong_conformer_ctc_scale_1.0/',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_strong_conformer_weak_specaug_ctc_scale_1.0/training': '/glowTTS_ASR_conformer_two_forward_pass_strong_conformer_weak_specaug_ctc_scale_1.0/',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_ddi_actnorm/training': '/glowTTS_ASR_conformer_two_forward_pass_ddi_actnorm/',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_200ep_ctc_scale_0.1/training': '/glowTTS_ASR_conformer_two_forward_pass_200ep_ctc_scale_0.1/',\n", " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_strong_conformer_ctc_scale_0.1/training': '/glowTTS_ASR_conformer_two_forward_pass_strong_conformer_ctc_scale_0.1/',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_strong_conformer_weak_specaug_ctc_scale_0.1/training': '/glowTTS_ASR_conformer_two_forward_pass_strong_conformer_weak_specaug_ctc_scale_0.1/',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass/training': '/glowTTS_ASR_conformer_two_forward_pass/'}" + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_ctc_scale_0.1/training': '/glowTTS_ASR_conformer_x_vector_v2_ctc_scale_0.1/',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_200ep_spec_augment_ctc_scale_0.1/training': '/glowTTS_ASR_conformer_x_vector_v2_200ep_spec_augment_ctc_scale_0.1/',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment_ctc_scale_0.1/training': '/glowTTS_ASR_conformer_x_vector_v2_spec_augment_ctc_scale_0.1/'}" ] }, - "execution_count": 16, + "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "globs = [\n", - " \"/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass*/training\",\n", + " 
\"/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass*_ctc_scale_0.1/training\",\n", + " \"/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2*_ctc_scale_0.1/training\",\n", " # \"/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_strong_conformer_ctc_scale_1.0/training\",\n", " # \"/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2/training\",\n", " # \"/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_blstm_x_vector*/training\",\n", @@ -88,26 +80,26 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_ctc_scale_0.1/training',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_strong_conformer_ctc_scale_1.0/training',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_strong_conformer_weak_specaug_ctc_scale_1.0/training',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_ddi_actnorm/training',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_200ep_ctc_scale_0.1/training',\n", " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_strong_conformer_ctc_scale_0.1/training',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_strong_conformer_weak_specaug_ctc_scale_0.1/training',\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass/training',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_ctc_scale_0.1/training',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_200ep_spec_augment_ctc_scale_0.1/training',\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment_ctc_scale_0.1/training',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/tts_architecture/glow_tts/raw_audio/glowTTS/enc192/100ep/not_silence_preprocessed/training',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/tts_architecture/glow_tts/raw_audio/glowTTS_x_vector/enc768/100ep/not_silence_preprocessed/training',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/weak_baseline/training',\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/no_glow/training']" ] }, - "execution_count": 17, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } @@ -147,23 +139,21 @@ }, { "cell_type": "code", - "execution_count": 18, + 
"execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "{'/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/tts_architecture/glow_tts/raw_audio/glowTTS/enc192/100ep/not_silence_preprocessed/training': 1,\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/tts_architecture/glow_tts/raw_audio/glowTTS_x_vector_v3_norm_xvector/enc768/100ep/not_silence_preprocessed/training': 1,\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/tts_architecture/glow_tts/raw_audio/glowTTS_x_vector/enc768/100ep/not_silence_preprocessed/training': 1,\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/tts_architecture/glow_tts/raw_audio/glowTTS_x_vector_v3/enc768/100ep/not_silence_preprocessed/training': 1,\n", + "{'/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/default/raw_audio/glowTTS_x_vector_pe1/training': 1,\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/default/raw_audio/glowTTS_x_vector_pe1_radam/training': 1,\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/default/raw_audio/glowTTS_x_vector_pe1/training': 1,\n", " '/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/default/raw_audio/glowTTS_x_vector_pe1_radam_no_dec_dropout/training': 1,\n", - " '/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/default/raw_audio/glowTTS/training': 1}" + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/default/raw_audio/glowTTS/training': 1,\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_200ep_ctc_scale_0.1/training': 1,\n", + " 
'/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_200ep_spec_augment_ctc_scale_0.1/training': 1}" ] }, - "execution_count": 18, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } @@ -174,6 +164,8 @@ " \"/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/tts_architecture/glow_tts/raw_audio/glowTTS_x_vector*/enc768/100ep/not_silence_preprocessed/training\": 1, \n", " \"/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/default/raw_audio/glowTTS_x_vector_pe*/training\": 1,\n", " \"/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/default/raw_audio/glowTTS/training\": 1,\n", + " '/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer*200ep*_ctc_scale_0.1/training': 1,\n", + "\n", "\n", "}\n", "\n", @@ -188,7 +180,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -196,16 +188,12 @@ "output_type": "stream", "text": [ "/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_ctc_scale_0.1/training: 3\n", - "/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_strong_conformer_ctc_scale_1.0/training: 3\n", - "/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_strong_conformer_weak_specaug_ctc_scale_1.0/training: 3\n", - "/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_ddi_actnorm/training: 3\n", + 
"/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_200ep_ctc_scale_0.1/training: 1\n", "/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_strong_conformer_ctc_scale_0.1/training: 3\n", "/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass_strong_conformer_weak_specaug_ctc_scale_0.1/training: 3\n", - "/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_two_forward_pass/training: 3\n", - "/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/tts_architecture/glow_tts/raw_audio/glowTTS/enc192/100ep/not_silence_preprocessed/training: 1\n", - "/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/tts_architecture/glow_tts/raw_audio/glowTTS_x_vector/enc768/100ep/not_silence_preprocessed/training: 1\n", - "/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/weak_baseline/training: 3\n", - "/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/librispeech_glow_asr/pytorch/conformer/asr_dataset/spec_augment/no_glow/training: 3\n", + "/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_ctc_scale_0.1/training: 3\n", + "/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_200ep_spec_augment_ctc_scale_0.1/training: 1\n", + "/u/lukas.rilling/experiments/glow_tts_asr_v2/alias/experiments/librispeech/joint_training/default/raw_audio/glowTTS_ASR_conformer_x_vector_v2_spec_augment_ctc_scale_0.1/training: 3\n", 
"Large Font: False\n", "Setup Interactive Legend\n", "Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous view', 'arrow-left', 'back'), ('Forward', 'Forward to next view', 'arrow-right', 'forward'), ('Pan', 'Left button pans, Right button zooms\\nx/y fixes axis, CTRL fixes aspect', 'arrows', 'pan'), ('Zoom', 'Zoom to rectangle\\nx/y fixes axis', 'square-o', 'zoom'), ('Download', 'Download plot', 'floppy-o', 'save_figure')]))\n" @@ -214,18 +202,18 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "79c0076f856b4e01889533746fe74923", + "model_id": "f96371033c3046fcb1d49e08b1b32573", "version_major": 2, "version_minor": 0 }, - "image/png": "", + "image/png": "", "text/html": [ "\n", "
\n", "
\n", " Figure\n", "
\n", - " \n", + " \n", "
\n", " " ], @@ -287,7 +275,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.12" + "version": "3.10.13" } }, "nbformat": 4, diff --git a/users/rilling/experiments/librispeech/librispeech_joint_training_given_alignments/exp_tts/experiments.py b/users/rilling/experiments/librispeech/librispeech_joint_training_given_alignments/exp_tts/experiments.py index 72af00dba..b4b595bca 100644 --- a/users/rilling/experiments/librispeech/librispeech_joint_training_given_alignments/exp_tts/experiments.py +++ b/users/rilling/experiments/librispeech/librispeech_joint_training_given_alignments/exp_tts/experiments.py @@ -43,7 +43,7 @@ def get_glow_tts(x_vector_exp, joint_exps, tts_exps, gl_checkpoint): but in a cleaner setup similar to the setup used for joint training and joint training with external alignments """ - prefix = "experiments/librispeech/joint_training/given_alignments/raw_audio/TTS_models/" + prefix = "experiments/librispeech/TTS_only/v2/raw_audio/" def run_exp( name, From da239e06896b34fb5b7367da6e1d1d2328ae610b Mon Sep 17 00:00:00 2001 From: Benedikt Hilmes Date: Mon, 3 Jun 2024 22:18:21 +0200 Subject: [PATCH 087/227] updates quant --- .../hybrid/torch_baselines/torch_args.py | 43 +- .../tedlium2/standalone/data/common.py | 30 ++ .../tedlium2/standalone/default_tools.py | 2 +- .../experiments/ctc_phon/baseline.py | 252 +++++---- .../experiments/ctc_phon/tune_eval.py | 14 +- .../experiments/ctc_phon/uni_mod_aggr_v1.py | 253 +++++++++ ...elsV2_VGG4LayerActFrontendV1_auxloss_v1.py | 197 +++++++ ...2_VGG4LayerActFrontendV1_auxloss_v1_cfg.py | 90 ++++ .../conformer_1023/quant/baseline_quant_v2.py | 507 ++++++++++++++++++ .../quant/baseline_quant_v2_modules.py | 358 +++++++++++++ .../experiments/tedlium2/standalone/report.py | 50 +- 11 files changed, 1672 insertions(+), 124 deletions(-) create mode 100644 users/hilmes/experiments/tedlium2/standalone/experiments/ctc_phon/uni_mod_aggr_v1.py create mode 100644 
users/hilmes/experiments/tedlium2/standalone/pytorch_networks/ctc/conformer_0106/i6modelsV2_VGG4LayerActFrontendV1_auxloss_v1.py create mode 100644 users/hilmes/experiments/tedlium2/standalone/pytorch_networks/ctc/conformer_0106/i6modelsV2_VGG4LayerActFrontendV1_auxloss_v1_cfg.py create mode 100644 users/hilmes/experiments/tedlium2/standalone/pytorch_networks/ctc/conformer_1023/quant/baseline_quant_v2.py create mode 100644 users/hilmes/experiments/tedlium2/standalone/pytorch_networks/ctc/conformer_1023/quant/baseline_quant_v2_modules.py diff --git a/users/hilmes/experiments/tedlium2/asr_2023/hybrid/torch_baselines/torch_args.py b/users/hilmes/experiments/tedlium2/asr_2023/hybrid/torch_baselines/torch_args.py index fcb8a5f70..d0d38ef09 100644 --- a/users/hilmes/experiments/tedlium2/asr_2023/hybrid/torch_baselines/torch_args.py +++ b/users/hilmes/experiments/tedlium2/asr_2023/hybrid/torch_baselines/torch_args.py @@ -146,6 +146,47 @@ def get_nn_args(num_outputs: int = 9001, num_epochs: int = 250, debug=False, **n "torch_jj_config2", ], }, + "quant-paper": { + "epochs": evaluation_epochs, + "feature_flow_key": "fb", + "prior_scales": [0.9], + "pronunciation_scales": [0.0], + "lm_scales": [10.0], + "lm_lookahead": True, + "lookahead_options": None, + "create_lattice": True, + "eval_single_best": True, + "eval_best_in_lattice": True, + "search_parameters": { + "beam-pruning": 15.0, + "beam-pruning-limit": 10000, + "word-end-pruning": 0.5, + "word-end-pruning-limit": 15000, + }, + "lattice_to_ctm_kwargs": { + "fill_empty_segments": True, + "best_path_algo": "bellman-ford", + }, + "optimize_am_lm_scale": True, + "rtf": 50, + "mem": 7, + "lmgc_mem": 16, + "cpu": 2, + "parallelize_conversion": True, + "needs_features_size": True, + # "quantize": [10, 15, 25, 100, 250, 500, 750, 1000, 2500, 5000], + "quantize": [100], + "random_seed_draws": 100, + "quant_modes": [CalibrationMethod.MinMax, CalibrationMethod.Percentile], + "quant_ops": [["Conv", "MatMul"]], + "quant_sym_modes": 
[False], + "quant_avg_modes": [False], + "quant_percentiles": [99.999], + "quant_num_bin_ls": [2048], + "training_whitelist": [ + "torch_jj_config2", + ], + }, # "quant-base": { # "epochs": evaluation_epochs, # "feature_flow_key": "fb", @@ -743,7 +784,7 @@ def get_nn_args(num_outputs: int = 9001, num_epochs: int = 250, debug=False, **n # recognition_args.update(speed) test_recognition_args = { "dev": {}, - "quant-base": {} + "quant-paper": {} } nn_args = HybridArgs( diff --git a/users/hilmes/experiments/tedlium2/standalone/data/common.py b/users/hilmes/experiments/tedlium2/standalone/data/common.py index f16454a37..c1421930a 100644 --- a/users/hilmes/experiments/tedlium2/standalone/data/common.py +++ b/users/hilmes/experiments/tedlium2/standalone/data/common.py @@ -198,3 +198,33 @@ def build_test_dataset( ) return test_dataset, bliss_dict[dataset_key] + + +def build_st_test_dataset( + corpus_path: tk.Path, + settings: DatasetSettings, +) -> Tuple[Dataset, tk.Path]: + """ + Create ASR test set that only contains the audio stream + + :param corpus_path: + :param settings: settings object for the RETURNN data pipeline + :return: tuple of the test dataset and a path to the corresponding bliss corpus file + """ + from i6_core.returnn.oggzip import BlissToOggZipJob + + ogg_zip = BlissToOggZipJob(bliss_corpus=corpus_path, returnn_root=MINI_RETURNN_ROOT, returnn_python_exe=RETURNN_EXE) + + + audio_datastream = get_audio_raw_datastream(settings.preemphasis, settings.peak_normalization) + + data_map = {"raw_audio": ("zip_dataset", "data")} + + test_zip_dataset = OggZipDataset( + files=[ogg_zip.out_ogg_zip], audio_options=audio_datastream.as_returnn_audio_opts(), seq_ordering="sorted_reverse" + ) + test_dataset = MetaDataset( + data_map=data_map, datasets={"zip_dataset": test_zip_dataset}, seq_order_control_dataset="zip_dataset" + ) + + return test_dataset, corpus_path diff --git a/users/hilmes/experiments/tedlium2/standalone/default_tools.py 
b/users/hilmes/experiments/tedlium2/standalone/default_tools.py index 8c1b48070..42549a8fe 100644 --- a/users/hilmes/experiments/tedlium2/standalone/default_tools.py +++ b/users/hilmes/experiments/tedlium2/standalone/default_tools.py @@ -19,7 +19,7 @@ I6_MODELS_REPO_PATH = CloneGitRepositoryJob( url="https://github.com/rwth-i6/i6_models", - commit="3c9173691521778b1e8b4070c172cbe929e4826b", + commit="645d4e1a00349ed46593ab2c82dc373db353c33f", checkout_folder_name="i6_models", ).out_repository.copy() I6_MODELS_REPO_PATH.hash_overwrite = "TEDLIUM_STANDALONE_DEFAULT_I6_MODELS" diff --git a/users/hilmes/experiments/tedlium2/standalone/experiments/ctc_phon/baseline.py b/users/hilmes/experiments/tedlium2/standalone/experiments/ctc_phon/baseline.py index 77e80e7b9..ea21e48f8 100644 --- a/users/hilmes/experiments/tedlium2/standalone/experiments/ctc_phon/baseline.py +++ b/users/hilmes/experiments/tedlium2/standalone/experiments/ctc_phon/baseline.py @@ -1,10 +1,11 @@ from dataclasses import asdict import numpy as np from typing import cast +from sisyphus import tk from i6_experiments.common.setups.returnn.datastreams.vocabulary import LabelDatastream from .tune_eval import QuantArgs -from ...data.common import DatasetSettings, build_test_dataset +from ...data.common import DatasetSettings, build_test_dataset, build_st_test_dataset from ...data.phon import build_eow_phon_training_datasets, get_text_lexicon from ...default_tools import RETURNN_EXE, MINI_RETURNN_ROOT from ...lm import get_4gram_binary_lm @@ -40,9 +41,18 @@ def eow_phon_ted_1023_base(quant=False): ) test_dataset_tuples = {} - for testset in ["test"]: - test_dataset_tuples[testset] = build_test_dataset( - dataset_key=testset, + # for testset in ["test"]: + # test_dataset_tuples[testset] = build_test_dataset( + # dataset_key=testset, + # settings=train_settings, + # ) + for path in [ + "/work/smt3/bahar/expriments/st/iwslt2018/iwslt2018/iwslt/sets/iwslt.dev2010.xml.gz", + 
"/work/smt3/bahar/expriments/st/iwslt2018/iwslt2018/iwslt/sets/iwslt.tst2015.xml.gz", + "/work/smt3/bahar/expriments/st/iwslt2018/iwslt2018/iwslt/sets/iwslt.tst2014.xml.gz", + ]: + test_dataset_tuples[path.split("/")[-1].split(".")[1]] = build_st_test_dataset( + corpus_path=tk.Path(path), settings=train_settings, ) @@ -208,7 +218,51 @@ def eow_phon_ted_1023_base(quant=False): from ...pytorch_networks.ctc.conformer_1023.quant.baseline_quant_v1_cfg import QuantModelConfigV1 num_iterations = 100 # what if we give more information to the activation instead? - for activation_bit in [8, 7, 6, 5, 4, 3, 2, 1]: + if False: + for activation_bit in [8, 7, 6, 5, 4, 3, 2, 1]: + for weight_bit in [8, 7, 6, 5, 4, 3, 2, 1]: + results = {} + model_config_quant_v1 = QuantModelConfigV1( + weight_quant_dtype="qint8", + weight_quant_method="per_tensor", + activation_quant_dtype="qint8", + activation_quant_method="per_tensor", + dot_quant_dtype="qint8", + dot_quant_method="per_tensor", + Av_quant_dtype="qint8", + Av_quant_method="per_tensor", + moving_average=0.01, + weight_bit_prec=weight_bit, + activation_bit_prec=activation_bit, + linear_quant_output=False, + ) + quant_args = QuantArgs( + sample_ls=[10] if weight_bit < 8 or activation_bit < 8 else [10, 100, 1000, 10000], + quant_config_dict={"quant_config_dict": asdict(model_config_quant_v1)}, + decoder="ctc.decoder.flashlight_quant_stat_phoneme_ctc", + num_iterations=num_iterations, + datasets=train_data, + network_module="ctc.conformer_1023.quant.baseline_quant_v1", + ) + quant_str = f"_weight_{weight_bit}_act_{activation_bit}" + asr_model = prepare_asr_model( + training_name+quant_str, + train_job, + train_args, + with_prior=True, + datasets=train_data, + get_specific_checkpoint=250, + ) + res, _ = tune_and_evaluate_helper( # only take best for now, since otherwise too many searches + training_name, asr_model, default_decoder_config, lm_scales=[2.8], + prior_scales=[0.7], quant_args=quant_args, quant_str=quant_str, + 
dev_dataset_tuples=dev_dataset_tuples, + ) + results.update(res) + generate_report(results=results, exp_name=training_name + quant_str) + del results + transcribers = [(8, 8)] + for activation_bit in [8]: for weight_bit in [8, 7, 6, 5, 4, 3, 2, 1]: results = {} model_config_quant_v1 = QuantModelConfigV1( @@ -231,9 +285,9 @@ def eow_phon_ted_1023_base(quant=False): decoder="ctc.decoder.flashlight_quant_stat_phoneme_ctc", num_iterations=num_iterations, datasets=train_data, - network_module="ctc.conformer_1023.quant.baseline_quant_v1", + network_module="ctc.conformer_1023.quant.baseline_quant_v2", ) - quant_str = f"_weight_{weight_bit}_act_{activation_bit}" + quant_str = f"_v2_weight_{weight_bit}_act_{activation_bit}" asr_model = prepare_asr_model( training_name+quant_str, train_job, @@ -246,19 +300,65 @@ def eow_phon_ted_1023_base(quant=False): training_name, asr_model, default_decoder_config, lm_scales=[2.8], prior_scales=[0.7], quant_args=quant_args, quant_str=quant_str, dev_dataset_tuples=dev_dataset_tuples, + test_dataset_tuples=test_dataset_tuples if (activation_bit, weight_bit) in transcribers else None ) results.update(res) generate_report(results=results, exp_name=training_name + quant_str) del results num_iterations = 250 - for filter in [ - ({"unique_tags": 0.0}, "unique"), - ({"single_tag": 0.0}, "single"), - ({"max_dur": 1.0}, "max_dur_1"), - ({"min_dur": 15.0}, "min_dur_15") - ]: + if False: + for filter in [ + ({"unique_tags": 0.0}, "unique"), + ({"single_tag": 0.0}, "single"), + ({"max_dur": 1.0}, "max_dur_1"), + ({"min_dur": 15.0}, "min_dur_15") + ]: + for activation_bit in [8]: + for weight_bit in [8]: + results = {} + model_config_quant_v1 = QuantModelConfigV1( + weight_quant_dtype="qint8", + weight_quant_method="per_tensor", + activation_quant_dtype="qint8", + activation_quant_method="per_tensor", + dot_quant_dtype="qint8", + dot_quant_method="per_tensor", + Av_quant_dtype="qint8", + Av_quant_method="per_tensor", + moving_average=0.01, + 
weight_bit_prec=weight_bit, + activation_bit_prec=activation_bit, + linear_quant_output=False, + ) + quant_args = QuantArgs( + sample_ls=[1], #§, 10, 25, 5], + quant_config_dict={"quant_config_dict": asdict(model_config_quant_v1)}, + decoder="ctc.decoder.flashlight_quant_stat_phoneme_ctc", + num_iterations=num_iterations, + datasets=train_data, + network_module="ctc.conformer_1023.quant.baseline_quant_v1", + filter_args=filter[0], + ) + quant_str = f"_weight_{weight_bit}_act_{activation_bit}_{filter[1]}" + asr_model = prepare_asr_model( + training_name+quant_str, + train_job, + train_args, + with_prior=True, + datasets=train_data, + get_specific_checkpoint=250, + ) + res, _ = tune_and_evaluate_helper( # only take best for now, since otherwise too many searches + training_name, asr_model, default_decoder_config, lm_scales=[2.8], + prior_scales=[0.7], quant_args=quant_args, quant_str=quant_str, dev_dataset_tuples=dev_dataset_tuples + ) + results.update(res) + generate_report(results=results, exp_name=training_name + quant_str) + del results + + num_iterations = 100 for activation_bit in [8]: - for weight_bit in [8]: + for weight_bit in [8, 7, 6, 5, 4, 3, 2, 1]: results = {} model_config_quant_v1 = QuantModelConfigV1( weight_quant_dtype="qint8", @@ -272,18 +372,54 @@ def eow_phon_ted_1023_base(quant=False): moving_average=0.01, weight_bit_prec=weight_bit, activation_bit_prec=activation_bit, + linear_quant_output=True, + ) + quant_args = QuantArgs( + sample_ls=[10] if weight_bit < 8 or activation_bit < 8 else [10, 100, 1000, 10000], + quant_config_dict={"quant_config_dict": asdict(model_config_quant_v1)}, + decoder="ctc.decoder.flashlight_quant_stat_phoneme_ctc", + num_iterations=num_iterations, + datasets=train_data, + network_module="ctc.conformer_1023.quant.baseline_quant_v1", + ) + quant_str = f"_weight_{weight_bit}_act_{activation_bit}_qlin" + asr_model = prepare_asr_model( + training_name+quant_str, train_job, train_args, with_prior=True, datasets=train_data, 
get_specific_checkpoint=250 + ) + res, _ = tune_and_evaluate_helper( # only take best for now, since otherwise too many searches + training_name, asr_model, default_decoder_config, lm_scales=[2.8], + prior_scales=[0.7], quant_args=quant_args, quant_str=quant_str, dev_dataset_tuples=dev_dataset_tuples + ) + results.update(res) + generate_report(results=results, exp_name=training_name+quant_str) + del results + + for activation_bit in [8]: + for weight_bit in [8]: + results = {} + model_config_quant_v1 = QuantModelConfigV1( + weight_quant_dtype="qint8", + weight_quant_method="per_tensor", + activation_quant_dtype="qint8", + activation_quant_method="per_tensor", + dot_quant_dtype="qint8", + dot_quant_method="per_tensor", + Av_quant_dtype="qint8", + Av_quant_method="per_tensor", + moving_average=None, + weight_bit_prec=weight_bit, + activation_bit_prec=activation_bit, linear_quant_output=False, ) quant_args = QuantArgs( - sample_ls=[1], #§, 10, 25, 5], + sample_ls=[10] if weight_bit < 8 or activation_bit < 8 else [10, 100, 1000, 10000], quant_config_dict={"quant_config_dict": asdict(model_config_quant_v1)}, decoder="ctc.decoder.flashlight_quant_stat_phoneme_ctc", num_iterations=num_iterations, datasets=train_data, network_module="ctc.conformer_1023.quant.baseline_quant_v1", - filter_args=filter[0], ) - quant_str = f"_weight_{weight_bit}_act_{activation_bit}_{filter[1]}" + quant_str = f"_weight_{weight_bit}_act_{activation_bit}_no_avg" asr_model = prepare_asr_model( training_name+quant_str, train_job, @@ -294,92 +430,12 @@ def eow_phon_ted_1023_base(quant=False): ) res, _ = tune_and_evaluate_helper( # only take best for now, since otherwise too many searches training_name, asr_model, default_decoder_config, lm_scales=[2.8], - prior_scales=[0.7], quant_args=quant_args, quant_str=quant_str, dev_dataset_tuples=dev_dataset_tuples + prior_scales=[0.7], quant_args=quant_args, quant_str=quant_str, dev_dataset_tuples=dev_dataset_tuples, ) results.update(res) 
generate_report(results=results, exp_name=training_name + quant_str) del results - num_iterations = 100 - for activation_bit in [8]: - for weight_bit in [8, 7, 6, 5, 4, 3, 2, 1]: - results = {} - model_config_quant_v1 = QuantModelConfigV1( - weight_quant_dtype="qint8", - weight_quant_method="per_tensor", - activation_quant_dtype="qint8", - activation_quant_method="per_tensor", - dot_quant_dtype="qint8", - dot_quant_method="per_tensor", - Av_quant_dtype="qint8", - Av_quant_method="per_tensor", - moving_average=0.01, - weight_bit_prec=weight_bit, - activation_bit_prec=activation_bit, - linear_quant_output=True, - ) - quant_args = QuantArgs( - sample_ls=[10] if weight_bit < 8 or activation_bit < 8 else [10, 100, 1000, 10000], - quant_config_dict={"quant_config_dict": asdict(model_config_quant_v1)}, - decoder="ctc.decoder.flashlight_quant_stat_phoneme_ctc", - num_iterations=num_iterations, - datasets=train_data, - network_module="ctc.conformer_1023.quant.baseline_quant_v1", - ) - quant_str = f"_weight_{weight_bit}_act_{activation_bit}_qlin" - asr_model = prepare_asr_model( - training_name+quant_str, train_job, train_args, with_prior=True, datasets=train_data, get_specific_checkpoint=250 - ) - res, _ = tune_and_evaluate_helper( # only take best for now, since otherwise too many searches - training_name, asr_model, default_decoder_config, lm_scales=[2.8], - prior_scales=[0.7], quant_args=quant_args, quant_str=quant_str, dev_dataset_tuples=dev_dataset_tuples - ) - results.update(res) - generate_report(results=results, exp_name=training_name+quant_str) - del results - - for activation_bit in [8]: - for weight_bit in [8]: - results = {} - model_config_quant_v1 = QuantModelConfigV1( - weight_quant_dtype="qint8", - weight_quant_method="per_tensor", - activation_quant_dtype="qint8", - activation_quant_method="per_tensor", - dot_quant_dtype="qint8", - dot_quant_method="per_tensor", - Av_quant_dtype="qint8", - Av_quant_method="per_tensor", - moving_average=None, - 
weight_bit_prec=weight_bit, - activation_bit_prec=activation_bit, - linear_quant_output=False, - ) - quant_args = QuantArgs( - sample_ls=[10] if weight_bit < 8 or activation_bit < 8 else [10, 100, 1000, 10000], - quant_config_dict={"quant_config_dict": asdict(model_config_quant_v1)}, - decoder="ctc.decoder.flashlight_quant_stat_phoneme_ctc", - num_iterations=num_iterations, - datasets=train_data, - network_module="ctc.conformer_1023.quant.baseline_quant_v1", - ) - quant_str = f"_weight_{weight_bit}_act_{activation_bit}_no_avg" - asr_model = prepare_asr_model( - training_name+quant_str, - train_job, - train_args, - with_prior=True, - datasets=train_data, - get_specific_checkpoint=250, - ) - res, _ = tune_and_evaluate_helper( # only take best for now, since otherwise too many searches - training_name, asr_model, default_decoder_config, lm_scales=[2.8], - prior_scales=[0.7], quant_args=quant_args, quant_str=quant_str, dev_dataset_tuples=dev_dataset_tuples, - ) - results.update(res) - generate_report(results=results, exp_name=training_name + quant_str) - del results - # E-Branchformer branchformer_module = "ctc.conformer_1023.i6models_ebranchformer_v1" train_config = { diff --git a/users/hilmes/experiments/tedlium2/standalone/experiments/ctc_phon/tune_eval.py b/users/hilmes/experiments/tedlium2/standalone/experiments/ctc_phon/tune_eval.py index 520ca392c..05f208c4c 100644 --- a/users/hilmes/experiments/tedlium2/standalone/experiments/ctc_phon/tune_eval.py +++ b/users/hilmes/experiments/tedlium2/standalone/experiments/ctc_phon/tune_eval.py @@ -113,11 +113,23 @@ def tune_and_evaluate_helper( **default_returnn, ) results.update(wers) + if test_dataset_tuples is not None and seed in [0]: + decoder_config = copy.deepcopy(base_decoder_config) + search_jobs, wers = search( + search_name, + forward_config={}, + asr_model=quant_model, + decoder_module=quant_args.decoder, + decoder_args={"config": asdict(decoder_config)}, + test_dataset_tuples=test_dataset_tuples, + 
**default_returnn, + ) + #results.update(wers) pick_optimal_params_job = GetOptimalParametersAsVariableJob( parameters=tune_parameters, values=tune_values, mode="minimize" ) pick_optimal_params_job.add_alias(training_name + f"/pick_best_dev") - if test_dataset_tuples is not None: + if test_dataset_tuples is not None and False: for key, tune_values in [("test", tune_values)]: decoder_config = copy.deepcopy(base_decoder_config) decoder_config.lm_weight = pick_optimal_params_job.out_optimal_parameters[0] diff --git a/users/hilmes/experiments/tedlium2/standalone/experiments/ctc_phon/uni_mod_aggr_v1.py b/users/hilmes/experiments/tedlium2/standalone/experiments/ctc_phon/uni_mod_aggr_v1.py new file mode 100644 index 000000000..0e1a893c2 --- /dev/null +++ b/users/hilmes/experiments/tedlium2/standalone/experiments/ctc_phon/uni_mod_aggr_v1.py @@ -0,0 +1,253 @@ +from dataclasses import asdict +import numpy as np +from typing import cast +import copy + +from i6_experiments.common.setups.returnn.datastreams.vocabulary import LabelDatastream + +from ...data.common import DatasetSettings, build_test_dataset +from ...data.phon import build_eow_phon_training_datasets, get_text_lexicon +from ...default_tools import RETURNN_EXE, MINI_RETURNN_ROOT +from ...lm import get_4gram_binary_lm +from ...pipeline import training, prepare_asr_model +from ...report import generate_report +from .tune_eval import tune_and_evaluate_helper + + + +def eow_phon_ted_0106_unimod(): + prefix_name = "experiments/tedlium2/ctc_rnnt_standalone_2024/ctc_eow_phon_unimod" + + train_settings = DatasetSettings( + preemphasis=0.97, # TODO: Check if this is really useful + peak_normalization=True, # TODO: Also check if really useful, older Attention setups did not have that + # training + train_partition_epoch=5, + train_seq_ordering="laplace:.1000", + ) + + # build the training datasets object containing train, cv, dev-train and the extern_data dict + train_data = build_eow_phon_training_datasets( + 
prefix=prefix_name, + settings=train_settings, + ) + label_datastream = cast(LabelDatastream, train_data.datastreams["labels"]) + vocab_size_without_blank = label_datastream.vocab_size + + dev_dataset_tuples = {} + for testset in ["dev"]: + dev_dataset_tuples[testset] = build_test_dataset( + dataset_key=testset, + settings=train_settings, + ) + + test_dataset_tuples = {} + for testset in ["test"]: + test_dataset_tuples[testset] = build_test_dataset( + dataset_key=testset, + settings=train_settings, + ) + + arpa_4gram_lm = get_4gram_binary_lm(prefix_name=prefix_name) + + default_returnn = { + "returnn_exe": RETURNN_EXE, + "returnn_root": MINI_RETURNN_ROOT, + } + + from ...pytorch_networks.ctc.decoder.flashlight_ctc_v1 import DecoderConfig + + default_decoder_config = DecoderConfig( + lexicon=get_text_lexicon(), + returnn_vocab=label_datastream.vocab, + beam_size=1024, + beam_size_token=12, # makes it much faster + arpa_lm=arpa_4gram_lm, + beam_threshold=14, + ) + + from ...pytorch_networks.ctc.conformer_0106.i6modelsV2_VGG4LayerActFrontendV1_auxloss_v1_cfg import ( + SpecaugConfig, + VGG4LayerActFrontendV1Config_mod, + ModelConfig, + LogMelFeatureExtractionV1Config, + ) + + fe_config = LogMelFeatureExtractionV1Config( + sample_rate=16000, + win_size=0.025, + hop_size=0.01, + f_min=60, + f_max=7600, + min_amp=1e-10, + num_filters=80, + center=False, + ) + specaug_config = SpecaugConfig( + repeat_per_n_frames=25, + max_dim_time=20, + max_dim_feat=8, # Jingjing style + num_repeat_feat=5, + ) + frontend_config = VGG4LayerActFrontendV1Config_mod( + in_features=80, + conv1_channels=32, + conv2_channels=64, + conv3_channels=64, + conv4_channels=32, + conv_kernel_size=(3, 3), + conv_padding=None, + pool1_kernel_size=(2, 1), + pool1_stride=(2, 1), + pool1_padding=None, + pool2_kernel_size=(2, 1), + pool2_stride=(2, 1), + pool2_padding=None, + activation_str="ReLU", + out_features=384, + activation=None, + ) + + model_config = ModelConfig( + 
feature_extraction_config=fe_config, + frontend_config=frontend_config, + specaug_config=specaug_config, + label_target_size=vocab_size_without_blank, + conformer_size=384, + num_layers=12, + num_heads=4, + ff_dim=1536, + att_weights_dropout=0.2, + conv_dropout=0.2, + ff_dropout=0.2, + mhsa_dropout=0.2, + conv_kernel_size=31, + final_dropout=0.2, + specauc_start_epoch=1, + module_list=["ff", "conv", "mhsa", "ff"], + module_scales=[0.5, 1.0, 1.0, 0.5], + aux_ctc_loss_layers=[3, 7, 11], # 4, 8, 12 when counting from 1 + aux_ctc_loss_scales=[0.3, 0.3, 1.0], + ) + model_config_decoding = copy.deepcopy(model_config) + model_config_decoding.aux_ctc_loss_scales = [0.0, 0.0, 1.0] # for decoding use result only of last layer + + network_module = "ctc.conformer_0106.i6modelsV2_VGG4LayerActFrontendV1_auxloss_v1" + + train_config = { + "optimizer": {"class": "adamw", "epsilon": 1e-16, "weight_decay": 1e-3}, + "learning_rates": list(np.linspace(7e-6, 5e-4, 110)) + + list(np.linspace(5e-4, 5e-5, 110)) + + list(np.linspace(5e-5, 1e-7, 30)), + ############# + "batch_size": 180 * 16000, + "max_seq_length": {"audio_features": 35 * 16000}, + "accum_grad_multiple_step": 1, + } + train_args = { + "config": train_config, + "network_module": network_module, + "net_args": {"model_config_dict": asdict(model_config)}, + "debug": False, + } + train_args_decoding = copy.deepcopy(train_args) + train_args_decoding["net_args"] = {"model_config_dict": asdict(model_config_decoding)} + + results = {} + training_name = prefix_name + "/" + network_module + "_384dim_sub4_50eps" + train_job = training(training_name, train_data, train_args, num_epochs=250, **default_returnn) + asr_model = prepare_asr_model( + training_name, train_job, train_args_decoding, with_prior=True, datasets=train_data, get_specific_checkpoint=250 + ) + lm_scales = [2.0, 2.2, 2.4, 2.6, 2.8] + prior_scales = [0.7, 0.9] + res, _ = tune_and_evaluate_helper( + training_name, asr_model, default_decoder_config, lm_scales=lm_scales, + 
prior_scales=prior_scales, dev_dataset_tuples=dev_dataset_tuples + ) + results.update(res) + asr_model_best4 = prepare_asr_model( + training_name + "/best4", train_job, train_args, with_prior=True, datasets=train_data, + get_best_averaged_checkpoint=(4, "dev_loss_ctc") + ) + res, _ = tune_and_evaluate_helper(training_name + "/best4", asr_model_best4, default_decoder_config, + lm_scales=lm_scales, prior_scales=prior_scales, dev_dataset_tuples=dev_dataset_tuples) + results.update(res) + asr_model_best = prepare_asr_model( + training_name + "/best", train_job, train_args, with_prior=True, datasets=train_data, + get_best_averaged_checkpoint=(1, "dev_loss_ctc") + ) + res, _ = tune_and_evaluate_helper(training_name + "/best", asr_model_best, default_decoder_config, + lm_scales=lm_scales, prior_scales=prior_scales, dev_dataset_tuples=dev_dataset_tuples) + results.update(res) + generate_report(results=results, exp_name=training_name) + del results + + return + # TODO + unimod_module = "ctc.conformer_1023.conformer_v1_uni_aggr_v1" + from ...pytorch_networks.ctc.conformer_1023.conformer_v1_uni_aggr_cfg_v1 import ModelConfig as UniAggrConfig + uni_aggr_model_config = UniAggrConfig( + feature_extraction_config=fe_config, + frontend_config=frontend_config, + specaug_config=specaug_config, + label_target_size=vocab_size_without_blank, + conformer_size=384, + num_layers=12, + num_heads=4, + ff_dim=1536, + att_weights_dropout=0.2, + conv_dropout=0.2, + ff_dropout=0.2, + mhsa_dropout=0.2, + conv_kernel_size=31, + final_dropout=0.2, + specauc_start_epoch=1, + aggr_layer=9, + ) + train_config = { + "optimizer": {"class": "adamw", "epsilon": 1e-16, "weight_decay": 1e-3}, + "learning_rates": list(np.linspace(7e-6, 5e-4, 110)) + + list(np.linspace(5e-4, 5e-5, 110)) + + list(np.linspace(5e-5, 1e-7, 30)), + ############# + "batch_size": 180 * 16000, + "max_seq_length": {"audio_features": 35 * 16000}, + "accum_grad_multiple_step": 1, + } + # Unimodal Aggregation + train_args = { + 
"config": train_config, + "network_module": unimod_module, + "net_args": {"model_config_dict": asdict(uni_aggr_model_config)}, + "debug": False, + } + results = {} + training_name = prefix_name + "/" + unimod_module + "_384dim_sub4_50eps" + train_job = training(training_name, train_data, train_args, num_epochs=250, **default_returnn) + asr_model = prepare_asr_model( + training_name, train_job, train_args, with_prior=True, datasets=train_data, get_specific_checkpoint=111 + ) + lm_scales = [2.0, 2.2, 2.4, 2.6, 2.8] + prior_scales = [0.7, 0.9] + res, _ = tune_and_evaluate_helper( + training_name, asr_model, default_decoder_config, lm_scales=lm_scales, + prior_scales=prior_scales + ) + results.update(res) + asr_model_best4 = prepare_asr_model( + training_name + "/best4", train_job, train_args, with_prior=True, datasets=train_data, + get_best_averaged_checkpoint=(4, "dev_loss_ctc") + ) + res, _ = tune_and_evaluate_helper(training_name + "/best4", asr_model_best4, default_decoder_config, + lm_scales=lm_scales, prior_scales=prior_scales) + results.update(res) + asr_model_best = prepare_asr_model( + training_name + "/best", train_job, train_args, with_prior=True, datasets=train_data, + get_best_averaged_checkpoint=(1, "dev_loss_ctc") + ) + res, _ = tune_and_evaluate_helper(training_name + "/best", asr_model_best, default_decoder_config, + lm_scales=lm_scales, prior_scales=prior_scales) + results.update(res) + generate_report(results=results, exp_name=training_name) + del results diff --git a/users/hilmes/experiments/tedlium2/standalone/pytorch_networks/ctc/conformer_0106/i6modelsV2_VGG4LayerActFrontendV1_auxloss_v1.py b/users/hilmes/experiments/tedlium2/standalone/pytorch_networks/ctc/conformer_0106/i6modelsV2_VGG4LayerActFrontendV1_auxloss_v1.py new file mode 100644 index 000000000..31c7f48ad --- /dev/null +++ b/users/hilmes/experiments/tedlium2/standalone/pytorch_networks/ctc/conformer_0106/i6modelsV2_VGG4LayerActFrontendV1_auxloss_v1.py @@ -0,0 +1,197 @@ +""" +Like v2, 
but with i6_models specaugment (v3) +and now controllable start time for when specaugment is applied (v4) +and with the proper feature extraction from i6-models +""" + +import numpy as np +import torch +from torch import nn + +from i6_models.parts.conformer.norm import LayerNormNC +from i6_models.assemblies.conformer.conformer_v2 import ConformerEncoderV2, ConformerEncoderV2Config, ConformerBlockV2Config +from i6_models.config import ModuleFactoryV1 +from i6_models.parts.frontend.vgg_act import VGG4LayerActFrontendV1 + +from i6_models.parts.conformer.convolution import ConformerConvolutionV1Config +from i6_models.parts.conformer.feedforward import ConformerPositionwiseFeedForwardV1Config +from i6_models.parts.conformer.mhsa import ConformerMHSAV1Config +from i6_models.primitives.specaugment import specaugment_v1_by_length +from i6_models.primitives.feature_extraction import LogMelFeatureExtractionV1 + +from returnn.torch.context import get_run_ctx + +from .i6modelsV2_VGG4LayerActFrontendV1_auxloss_v1_cfg import ModelConfig + + +def mask_tensor(tensor: torch.Tensor, seq_len: torch.Tensor) -> torch.Tensor: + """ + mask a tensor with a "positive" mask (boolean true means position is used) + + This function is traceable. + + :param tensor: [B,T,....] 
+ :param seq_len: [B] + :return: [B,T] + """ + seq_len = seq_len.to(device=tensor.device) + r = torch.arange(tensor.shape[1], device=tensor.device) # [T] + seq_mask = torch.less(r[None, :], seq_len[:, None]) # broadcast to [B,T] + return seq_mask + + +class Model(torch.nn.Module): + def __init__(self, model_config_dict, **kwargs): + super().__init__() + self.cfg = ModelConfig.from_dict(model_config_dict) + frontend_config = self.cfg.frontend_config + conformer_size = self.cfg.conformer_size + conformer_config = ConformerEncoderV2Config( + num_layers=self.cfg.num_layers, + frontend=ModuleFactoryV1(module_class=VGG4LayerActFrontendV1, cfg=frontend_config), + block_cfg=ConformerBlockV2Config( + ff_cfg=ConformerPositionwiseFeedForwardV1Config( + input_dim=conformer_size, + hidden_dim=self.cfg.ff_dim, + dropout=self.cfg.ff_dropout, + activation=nn.functional.silu, + ), + mhsa_cfg=ConformerMHSAV1Config( + input_dim=conformer_size, + num_att_heads=self.cfg.num_heads, + att_weights_dropout=self.cfg.att_weights_dropout, + dropout=self.cfg.mhsa_dropout, + ), + conv_cfg=ConformerConvolutionV1Config( + channels=conformer_size, kernel_size=self.cfg.conv_kernel_size, dropout=self.cfg.conv_dropout, activation=nn.functional.silu, + norm=LayerNormNC(conformer_size) + ), + modules=self.cfg.module_list, + scales=self.cfg.module_scales, + ), + ) + + self.feature_extraction = LogMelFeatureExtractionV1(cfg=self.cfg.feature_extraction_config) + self.conformer = ConformerEncoderV2(cfg=conformer_config) + self.num_output_linears = 1 if self.cfg.aux_ctc_loss_layers is None else len(self.cfg.aux_ctc_loss_layers) + self.output_linears = nn.ModuleList([ + nn.Linear(conformer_size, self.cfg.label_target_size + 1) # + CTC blank + for _ in range(self.num_output_linears) + ]) + self.output_dropout = nn.Dropout(p=self.cfg.final_dropout) + self.specaug_start_epoch = self.cfg.specauc_start_epoch + + # No particular weight init! 
def train_step(*, model: Model, data, run_ctx, **kwargs):
    """
    Compute one CTC loss per active auxiliary output and register it on ``run_ctx``.

    The model forward pass emits no output for a layer whose loss scale is 0.0 and returns a bare
    tensor instead of a list when only one output is left. Both cases are handled here, so every
    returned log-prob tensor is paired with the layer index and the scale it belongs to.

    :param model: called with ``raw_audio`` / ``raw_audio_len``, returns (log-probs, output lengths)
    :param data: batch dict with "raw_audio", "raw_audio:size1", "labels" and "labels:size1"
    :param run_ctx: run context, losses are registered through ``run_ctx.mark_as_loss``
    :raises ValueError: if the number of model outputs differs from the number of active layers
    """
    raw_audio = data["raw_audio"]  # [B, T', F]
    raw_audio_len = data["raw_audio:size1"].to("cpu")  # [B]

    labels = data["labels"]  # [B, N] (sparse)
    labels_len = data["labels:size1"]  # [B]

    logprobs_list, audio_features_len = model(
        raw_audio=raw_audio,
        raw_audio_len=raw_audio_len,
    )
    if isinstance(logprobs_list, torch.Tensor):
        # a single active output comes back as a bare tensor; iterating it would walk the batch dim
        logprobs_list = [logprobs_list]

    # same fallback as in the model forward pass: without aux layers only the last layer is used
    layers = model.cfg.aux_ctc_loss_layers or [model.cfg.num_layers - 1]
    # outputs with a scale of 0.0 are skipped by the model, so they have to be skipped here too
    active = [(layer, scale) for layer, scale in zip(layers, model.cfg.aux_ctc_loss_scales) if scale != 0.0]
    if len(active) != len(logprobs_list):
        raise ValueError(
            f"Model returned {len(logprobs_list)} outputs but {len(active)} loss layers are active"
        )

    num_phonemes = torch.sum(labels_len)  # identical for all layers, used as normalization
    for logprobs, (layer_index, scale) in zip(logprobs_list, active):
        transposed_logprobs = torch.permute(logprobs, (1, 0, 2))  # CTC needs [T, B, F]
        ctc_loss = nn.functional.ctc_loss(
            transposed_logprobs,
            labels,
            input_lengths=audio_features_len,
            target_lengths=labels_len,
            blank=model.cfg.label_target_size,  # blank is the extra last output index
            reduction="sum",
            zero_infinity=True,
        )
        run_ctx.mark_as_loss(
            name=f"ctc_loss_layer{layer_index + 1}", loss=ctc_loss, scale=scale, inv_norm_factor=num_phonemes
        )
@dataclass(kw_only=True)
class VGG4LayerActFrontendV1Config_mod(VGG4LayerActFrontendV1Config):
    """
    Variant of the VGG frontend config in which the activation is given by name.

    ``from_dict`` resolves ``activation_str`` to a module instance and returns a plain
    ``VGG4LayerActFrontendV1Config`` (without the ``activation_str`` entry).
    """

    # name of the activation, only "ReLU" is supported by from_dict
    activation_str: str = ""
    activation: Optional[Union[nn.Module, Callable[[torch.Tensor], torch.Tensor]]] = None

    @classmethod
    def from_dict(cls, d):
        """
        :param d: config dict containing "activation_str" plus the fields of the base frontend config
        :return: base frontend config with ``activation`` set to the resolved module
        :raises KeyError: if "activation_str" is missing
        :raises ValueError: if the activation name is not supported
        """
        d = d.copy()  # the caller's dict must not be modified
        activation_str = d.pop("activation_str")
        if activation_str == "ReLU":
            activation = nn.ReLU()
        else:
            # use the popped value here, d no longer contains the "activation_str" key
            raise ValueError("Unsupported activation %s" % activation_str)
        d["activation"] = activation
        return VGG4LayerActFrontendV1Config(**d)
@dataclass
class ModelConfig:
    """
    All settings of the CTC conformer model with auxiliary losses.

    Use ``from_dict`` to build it from a plain (serializable) dict.
    """

    # nested configurations
    feature_extraction_config: LogMelFeatureExtractionV1Config
    frontend_config: VGG4LayerActFrontendV1Config
    specaug_config: SpecaugConfig
    # first epoch in which specaugment is applied (field name is spelled "specauc", kept as is)
    specauc_start_epoch: int
    # number of labels without the CTC blank
    label_target_size: int
    conformer_size: int
    num_layers: int
    num_heads: int
    ff_dim: int
    att_weights_dropout: float
    conv_dropout: float
    ff_dropout: float
    mhsa_dropout: float
    conv_kernel_size: int
    final_dropout: float
    module_list: List[str]
    module_scales: List[float]
    # encoder layers that get an output/loss and the scale of each of these losses
    aux_ctc_loss_layers: Optional[List[int]]
    aux_ctc_loss_scales: Optional[List[float]]

    @classmethod
    def from_dict(cls, d):
        """
        :param d: dict with one entry per field, the three nested configs given as dicts
        :return: ModelConfig with the nested configs converted to their config classes
        """
        feature_extraction = LogMelFeatureExtractionV1Config(**d["feature_extraction_config"])
        frontend = VGG4LayerActFrontendV1Config_mod.from_dict(d["frontend_config"])
        specaug = SpecaugConfig(**d["specaug_config"])
        # later keys override the raw nested dicts, the passed dict itself stays untouched
        fields = {
            **d,
            "feature_extraction_config": feature_extraction,
            "frontend_config": frontend,
            "specaug_config": specaug,
        }
        return ModelConfig(**fields)
class ConformerPositionwiseFeedForwardQuant(nn.Module):
    """
    Conformer feedforward module whose two projections are quantizable linear layers
    """

    def __init__(self, cfg: ConformerPositionwiseFeedForwardQuantV1Config):
        super().__init__()

        # settings that are identical for the up- and the down-projection
        quant_args = dict(
            weight_bit_prec=cfg.weight_bit_prec,
            weight_quant_dtype=cfg.weight_quant_dtype,
            weight_quant_method=cfg.weight_quant_method,
            activation_bit_prec=cfg.activation_bit_prec,
            activation_quant_dtype=cfg.activation_quant_dtype,
            activation_quant_method=cfg.activation_quant_method,
            moving_average=cfg.moving_average,
            bias=True,
            quant_output=cfg.linear_quant_output,
        )

        self.layer_norm = nn.LayerNorm(cfg.input_dim)
        self.linear_ff = LinearQuant(in_features=cfg.input_dim, out_features=cfg.hidden_dim, **quant_args)
        self.activation = cfg.activation
        self.linear_out = LinearQuant(in_features=cfg.hidden_dim, out_features=cfg.input_dim, **quant_args)
        self.dropout = cfg.dropout  # dropout probability, applied functionally in forward

    def forward(self, tensor: torch.Tensor) -> torch.Tensor:
        """
        :param tensor: shape [B,T,F], F=input_dim
        :return: shape [B,T,F], F=input_dim
        """
        hidden = self.activation(self.linear_ff(self.layer_norm(tensor)))  # [B,T,hidden_dim]
        hidden = nn.functional.dropout(hidden, p=self.dropout, training=self.training)
        projected = self.linear_out(hidden)  # [B,T,F]
        return nn.functional.dropout(projected, p=self.dropout, training=self.training)
def __init__(self, model_cfg: ConformerConvolutionQuantV1Config):
    """
    :param model_cfg: model configuration for this module
    """
    super().__init__()
    model_cfg.check_valid()

    channels = model_cfg.channels
    # quantization settings shared by both pointwise projections
    pointwise_args = dict(
        weight_bit_prec=model_cfg.weight_bit_prec,
        weight_quant_dtype=model_cfg.weight_quant_dtype,
        weight_quant_method=model_cfg.weight_quant_method,
        activation_bit_prec=model_cfg.activation_bit_prec,
        activation_quant_dtype=model_cfg.activation_quant_dtype,
        activation_quant_method=model_cfg.activation_quant_method,
        moving_average=model_cfg.moving_average,
        bias=True,
        quant_output=model_cfg.linear_quant_output,
    )

    # the first pointwise projection doubles the channels (input of the GLU in forward)
    self.pointwise_conv1 = LinearQuant(in_features=channels, out_features=2 * channels, **pointwise_args)
    # TODO: Quantize this
    self.depthwise_conv = nn.Conv1d(
        in_channels=channels,
        out_channels=channels,
        kernel_size=model_cfg.kernel_size,
        padding=(model_cfg.kernel_size - 1) // 2,  # explicit padding instead of padding="same"
        groups=channels,
    )
    self.pointwise_conv2 = LinearQuant(in_features=channels, out_features=channels, **pointwise_args)
    self.layer_norm = nn.LayerNorm(channels)
    # each module instance gets its own copy of the norm given in the config
    self.norm = copy.deepcopy(model_cfg.norm)
    self.dropout = nn.Dropout(model_cfg.dropout)
    self.activation = model_cfg.activation
class ConformerBlockQuant(nn.Module):
    """
    Conformer block module built from the quantizable sub-modules.

    Order of the sub-modules: feed-forward, self-attention, convolution, feed-forward, each with a
    residual connection (the feed-forward outputs are scaled by 0.5), followed by a layer norm.
    """

    def __init__(self, cfg: ConformerBlockQuantV1Config):
        """
        :param cfg: conformer block configuration with subunits for the different conformer parts
        """
        super().__init__()
        self.ff1 = ConformerPositionwiseFeedForwardQuant(cfg=cfg.ff_cfg)
        self.mhsa = ConformerMHSAQuant(cfg=cfg.mhsa_cfg)
        self.conv = ConformerConvolutionQuant(model_cfg=cfg.conv_cfg)
        # both feed-forward modules use the same config but have their own parameters
        self.ff2 = ConformerPositionwiseFeedForwardQuant(cfg=cfg.ff_cfg)
        self.final_layer_norm = torch.nn.LayerNorm(cfg.ff_cfg.input_dim)

    def forward(self, x: torch.Tensor, /, sequence_mask: torch.Tensor) -> torch.Tensor:
        """
        :param x: input tensor of shape [B, T, F]
        :param sequence_mask: bool mask of shape [B, T], True for positions within the sequence and
            False for padding; it is handed to the self-attention module, which inverts it itself
        :return: torch.Tensor of shape [B, T, F]
        """
        x = 0.5 * self.ff1(x) + x  # [B, T, F]
        x = self.mhsa(x, sequence_mask) + x  # [B, T, F]
        x = self.conv(x) + x  # [B, T, F]
        x = 0.5 * self.ff2(x) + x  # [B, T, F]
        x = self.final_layer_norm(x)  # [B, T, F]
        return x
def forward(self, data_tensor: torch.Tensor, sequence_mask: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
    """
    Apply the frontend and then all conformer blocks in order.

    :param data_tensor: input tensor of shape [B, T', F]
    :param sequence_mask: bool mask of shape [B, T'], True for positions within the sequence and
        False for padding (the same "positive" mask that mask_tensor creates)
    :return: (output, out_seq_mask)
        where output is torch.Tensor of shape [B, T, F'],
        out_seq_mask is a torch.Tensor of shape [B, T]

    F: input feature dim, F': internal and output feature dim
    T': data time dim, T: down-sampled time dim (internal time dim)
    """
    # the frontend returns the mask that matches its output, this one is used for all blocks
    x, sequence_mask = self.frontend(data_tensor, sequence_mask)  # [B, T, F']
    for module in self.module_list:
        x = module(x, sequence_mask)  # [B, T, F']

    return x, sequence_mask
def __init__(self, model_config_dict, quant_config_dict=None, **kwargs):
    """
    Build the CTC conformer, either with the quantizable blocks or with the regular conformer blocks.

    :param model_config_dict: dict form of QuantModelTrainConfigV1
    :param quant_config_dict: dict form of QuantModelConfigV1; if it is empty or None the
        non-quantizable ConformerEncoderV1 is built
    :param kwargs: has to contain "epoch" and "step" (both are not used here) and may contain one
        more argument whose name contains "random"
    """
    kwargs.pop("epoch")
    kwargs.pop("step")
    assert len(kwargs) < 2, f"You did not use all kwargs: {kwargs}"
    if len(kwargs) == 1:
        assert "random" in next(iter(kwargs)), "This must only be RETURNN random arg"

    super().__init__()
    train_cfg = QuantModelTrainConfigV1.from_dict(model_config_dict)
    self.train_config = train_cfg
    conformer_size = train_cfg.conformer_size
    self.feature_extraction = LogMelFeatureExtractionV1(cfg=train_cfg.feature_extraction_config)

    if quant_config_dict:
        print("Using Quantizable Model")
        quant_cfg = QuantModelConfigV1.from_dict(quant_config_dict)
        self.cfg = quant_cfg

        # settings that are passed unchanged to the ff, mhsa and conv configs
        shared_quant = dict(
            weight_quant_dtype=quant_cfg.weight_quant_dtype,
            weight_quant_method=quant_cfg.weight_quant_method,
            activation_quant_dtype=quant_cfg.activation_quant_dtype,
            activation_quant_method=quant_cfg.activation_quant_method,
            activation_bit_prec=quant_cfg.activation_bit_prec,
            moving_average=quant_cfg.moving_average,
            linear_quant_output=quant_cfg.linear_quant_output,
        )
        # all bit precisions of the attention, including the dot product and A*v ones,
        # are taken from weight_bit_prec
        mhsa_bit_precs = dict.fromkeys(
            ("bit_prec_W_q", "bit_prec_W_k", "bit_prec_W_v", "bit_prec_dot", "bit_prec_A_v", "bit_prec_W_o"),
            quant_cfg.weight_bit_prec,
        )

        frontend = ModuleFactoryV1(module_class=VGG4LayerActFrontendV1, cfg=train_cfg.frontend_config)
        ff_cfg = ConformerPositionwiseFeedForwardQuantV1Config(
            input_dim=conformer_size,
            hidden_dim=train_cfg.ff_dim,
            dropout=train_cfg.ff_dropout,
            activation=nn.functional.silu,
            weight_bit_prec=quant_cfg.weight_bit_prec,
            **shared_quant,
        )
        mhsa_cfg = QuantizedMultiheadAttentionV1Config(
            input_dim=conformer_size,
            num_att_heads=train_cfg.num_heads,
            att_weights_dropout=train_cfg.att_weights_dropout,
            dropout=train_cfg.mhsa_dropout,
            dot_quant_dtype=quant_cfg.dot_quant_dtype,
            dot_quant_method=quant_cfg.dot_quant_method,
            Av_quant_dtype=quant_cfg.Av_quant_dtype,
            Av_quant_method=quant_cfg.Av_quant_method,
            **mhsa_bit_precs,
            **shared_quant,
        )
        conv_cfg = ConformerConvolutionQuantV1Config(
            channels=conformer_size,
            kernel_size=train_cfg.conv_kernel_size,
            dropout=train_cfg.conv_dropout,
            activation=nn.functional.silu,
            norm=LayerNormNC(conformer_size),
            weight_bit_prec=quant_cfg.weight_bit_prec,
            **shared_quant,
        )
        encoder_cfg = ConformerEncoderQuantV1Config(
            num_layers=train_cfg.num_layers,
            frontend=frontend,
            block_cfg=ConformerBlockQuantV1Config(ff_cfg=ff_cfg, mhsa_cfg=mhsa_cfg, conv_cfg=conv_cfg),
        )
        self.conformer = ConformerEncoderQuant(cfg=encoder_cfg)
    else:
        frontend = ModuleFactoryV1(module_class=VGG4LayerActFrontendV1, cfg=train_cfg.frontend_config)
        ff_cfg = ConformerPositionwiseFeedForwardV1Config(
            input_dim=conformer_size,
            hidden_dim=train_cfg.ff_dim,
            dropout=train_cfg.ff_dropout,
            activation=nn.functional.silu,
        )
        mhsa_cfg = ConformerMHSAV1Config(
            input_dim=conformer_size,
            num_att_heads=train_cfg.num_heads,
            att_weights_dropout=train_cfg.att_weights_dropout,
            dropout=train_cfg.mhsa_dropout,
        )
        conv_cfg = ConformerConvolutionV1Config(
            channels=conformer_size,
            kernel_size=train_cfg.conv_kernel_size,
            dropout=train_cfg.conv_dropout,
            activation=nn.functional.silu,
            norm=LayerNormNC(conformer_size),
        )
        encoder_cfg = ConformerEncoderV1Config(
            num_layers=train_cfg.num_layers,
            frontend=frontend,
            block_cfg=ConformerBlockV1Config(ff_cfg=ff_cfg, mhsa_cfg=mhsa_cfg, conv_cfg=conv_cfg),
        )
        self.conformer = ConformerEncoderV1(cfg=encoder_cfg)

    # + CTC blank TODO: do we quant here too?
    self.final_linear = nn.Linear(conformer_size, train_cfg.label_target_size + 1)
    self.final_dropout = nn.Dropout(p=train_cfg.final_dropout)
    self.specaug_start_epoch = train_cfg.specauc_start_epoch

    # No particular weight init!
def prior_finish_hook(run_ctx, **kwargs):
    """
    Turn the accumulated probability sums into the average prior and store it in prior.txt.

    :param run_ctx: run context with ``sum_probs`` (summed probabilities per label) and
        ``sum_frames`` (number of summed frames), both as tensors
    """
    frame_count = run_ctx.sum_frames.detach().cpu().numpy()
    prob_sums = run_ctx.sum_probs.detach().cpu().numpy()
    prior = prob_sums / frame_count
    log_prior = np.log(prior)  # the file contains the prior in (positive) log space
    print("Prior sum in std-space (should be close to 1.0):", np.sum(prior))
    with open("prior.txt", "w") as prior_file:
        np.savetxt(prior_file, log_prior, delimiter=" ")
    print("Saved prior in prior.txt in +log space.")
def static_quant_step(*, model: Model, data, run_ctx, **kwargs):
    """
    Run one batch through the model in eval mode and log the lengths of every sequence.

    The model output itself is discarded, only the output lengths are used.

    :param model: has to be in eval mode already, called with ``raw_audio`` / ``raw_audio_len``
    :param data: batch dict with "raw_audio", "raw_audio:size1" and "seq_tag"
    :param run_ctx: run context, ``run_ctx.tag_file`` is the open file the lengths are written to
    """
    audio = data["raw_audio"]  # [B, T', F]
    audio_len = data["raw_audio:size1"]  # [B]

    # this step must never run on a model that is in training mode
    assert not model.training
    assert model.eval()
    model.eval()

    _, feature_len = model(raw_audio=audio, raw_audio_len=audio_len)
    per_sequence = zip(data["seq_tag"], feature_len, audio_len)
    for tag, feat_len, raw_len in per_sequence:
        run_ctx.tag_file.write(tag + f" len: {feat_len} raw_len: {raw_len}\n")
class WeightQuantizer(nn.Module):
    """
    Observes a weight tensor and fake-quantizes it, depending on the flags of the run context.

    In training mode the tensor is passed through unchanged.
    """

    def __init__(self, bit_precision: int, dtype: torch.dtype, method: str, reduce_range: bool = False):
        """
        :param bit_precision: number of bits, determines quant_min / quant_max
        :param dtype: quantized dtype the range is computed for
        :param method: quantization method, only 'per_tensor' is supported
        :param reduce_range: passed on to the observer
        """
        super().__init__()

        value_range = get_quantization_range_from_bit_precision(bit_precision, dtype)
        self.quant_min, self.quant_max = value_range
        self.dtype = dtype
        self.reduce_range = reduce_range
        self.quant_fn, self.observer = None, None
        self.quant_fn, self.observer = self.__get_quant_fn_and_observer_for_method(method)
        # both are only set once set_scale_and_zp has been called
        self.scale = None
        self.zero_point = None

    def __get_quant_fn_and_observer_for_method(self, method):
        """
        :param method: quantization method
        :return: (fake quantization function, observer module)
        :raises ValueError: for every method other than 'per_tensor'
        """
        already_created = self.quant_fn is not None and self.observer is not None
        if already_created:
            return self.quant_fn, self.observer
        if method != 'per_tensor':
            raise ValueError(f'Unknown quantization method: {method}!')

        min_max_observer = torch_quant.observer.MinMaxObserver(
            quant_min=self.quant_min,
            quant_max=self.quant_max,
            dtype=self.dtype,
            reduce_range=self.reduce_range
        )
        return torch.fake_quantize_per_tensor_affine, min_max_observer

    def forward(self, tensor: torch.Tensor):
        """
        :param tensor: weight tensor
        :return: unchanged tensor in training, otherwise the observed and/or fake-quantized tensor
        """
        if self.training:
            # This module does not do anything in training
            return tensor

        run_ctx = get_run_ctx()
        if not run_ctx.apply_quant:
            tensor = self.observer(tensor)
        if not (run_ctx.iterative_quant or run_ctx.apply_quant):
            return tensor

        # scale and zero point are updated on every call and not only once at the very end,
        # i.e. every batch is quantized iteratively
        self.set_scale_and_zp()
        assert self.scale is not None and self.zero_point is not None
        return self.quant_fn(tensor, self.scale, self.zero_point, self.quant_min, self.quant_max)

    def set_scale_and_zp(self):
        """
        Compute scale and zero point from the min/max values recorded by the observer.
        """
        observer = self.observer
        assert observer is not None
        assert check_min_max_valid(observer.min_val, observer.max_val), "Need to init observer first"
        self.scale, self.zero_point = observer.calculate_qparams()
def forward(self, tensor: torch.Tensor):
    """
    Observe and/or fake-quantize an activation tensor, depending on the flags of the run context.

    :param tensor: activation tensor
    :return: unchanged tensor in training, otherwise the observed and/or fake-quantized tensor
    """
    if self.training:
        # This module does not do anything in training
        return tensor

    run_ctx = get_run_ctx()
    # the flags are compared by identity, so only a real True enables or disables a step
    if run_ctx.apply_quant is not True:
        tensor = self.observer(tensor)
    if run_ctx.iterative_quant is True or run_ctx.apply_quant is True:
        self.set_scale_and_zp()
        assert self.scale is not None and self.zero_point is not None, "Need to calibrate before applying quant, disable apply_calibration"
        # No copy of the un-quantized tensor is made here: it was only needed for a debug check
        # that is disabled, and copying every activation is expensive.
        # base_observer_args holds [quant_min, quant_max] for 'per_tensor' and
        # [channel_axis, quant_min, quant_max] for 'per_channel', which is the argument order of
        # the respective fake quantization function.
        # NOTE(review): the per-channel zero point may additionally need an int32 cast,
        # confirm against the torch version in use
        tensor = self.quant_fn(tensor, self.scale, self.zero_point, *self.base_observer_args)
    return tensor
self.zero_point = self.observer.calculate_qparams() + + +class LinearQuant(nn.Module): + + def __init__( + self, + in_features: int, + out_features: int, + weight_bit_prec: int, + weight_quant_dtype: torch.dtype, + weight_quant_method: str, + activation_bit_prec: int, + activation_quant_dtype: torch.dtype, + activation_quant_method: str, + moving_average: Optional[float], # default if enabled should be 0.01, if set enables moving average + bias: bool, + quant_output: bool + ): + super().__init__() + self.weight = nn.Parameter(torch.empty((out_features, in_features)), requires_grad=True) + if bias: + self.bias = nn.Parameter(torch.empty((out_features,)), requires_grad=True) + + self.weight_bit_prec = weight_bit_prec + self.weight_quant_dtype = weight_quant_dtype + self.weight_quant_method = weight_quant_method + self.weight_quantizer = WeightQuantizer( + bit_precision=self.weight_bit_prec, + dtype=self.weight_quant_dtype, + method=self.weight_quant_method + ) + + self.activation_bit_prec = activation_bit_prec + self.activation_quant_dtype = activation_quant_dtype + self.activation_quant_method = activation_quant_method + self.activation_quantizer = ActivationQuantizer( + bit_precision=self.activation_bit_prec, + dtype=self.activation_quant_dtype, + method=self.activation_quant_method, + channel_axis=2, + moving_avrg=moving_average) + + self.quant_output = quant_output + if self.quant_output: + self.output_quantizer = ActivationQuantizer( + bit_precision=self.activation_bit_prec, + dtype=self.activation_quant_dtype, + method=self.activation_quant_method, + channel_axis=2, + moving_avrg=moving_average + ) + + def forward(self, tensor: torch.Tensor): + lin = F.linear(self.activation_quantizer(tensor), self.weight_quantizer(self.weight), self.bias) + if self.quant_output: + return self.output_quantizer(lin) + else: + return lin + +class QuantizedMultiheadAttention(nn.Module): + def __init__( + self, + cfg: QuantizedMultiheadAttentionV1Config + ): + super().__init__() + 
self.cfg = cfg + self.num_att_heads = cfg.num_att_heads + self.input_dim = cfg.input_dim + self.dim_heads = self.input_dim // self.num_att_heads + + self.bit_prec_dot = cfg.bit_prec_dot + self.bit_prec_Av = cfg.bit_prec_A_v + self.weight_quant_dtype = cfg.weight_quant_dtype + self.weight_quant_method = cfg.weight_quant_method + self.activation_quant_dtype = cfg.activation_quant_dtype + self.activation_quant_method = cfg.activation_quant_method + self.dot_quant_dtype = cfg.dot_quant_dtype + self.dot_quant_method = cfg.dot_quant_method + self.Av_quant_dtype = cfg.Av_quant_dtype + self.Av_quant_method = cfg.Av_quant_method + self.linear_quant_output = cfg.linear_quant_output + + self.out_proj = self._create_linear_layer( + weight_bits=cfg.bit_prec_W_o, + act_bits=cfg.activation_bit_prec, + ) + + # For some reason pytorch saves the in_proj_weight and bias in this format not with . so we need to adjust + self.in_proj = self._create_linear_layer( + weight_bits=cfg.bit_prec_W_q, + act_bits=cfg.activation_bit_prec, + output_dim=3 * self.input_dim + ) + self.register_parameter("in_proj_weight", self.in_proj.weight) + self.register_parameter("in_proj_bias", self.in_proj.bias) + + if self.bit_prec_dot < 16: + self.q_quantizer = ActivationQuantizer( + self.bit_prec_dot, + self.dot_quant_dtype, + self.dot_quant_method, + channel_axis=None if self.dot_quant_method == "per_tensor" else 3, + moving_avrg=cfg.moving_average + ) + self.k_quantizer = ActivationQuantizer( + self.bit_prec_dot, + self.dot_quant_dtype, + self.dot_quant_method, + channel_axis=None if self.dot_quant_method == "per_tensor" else 2, + moving_avrg=cfg.moving_average + ) + + if self.bit_prec_Av < 16: + self.a_quantizer = WeightQuantizer( + self.bit_prec_Av, + self.Av_quant_dtype, + self.Av_quant_method + ) + self.v_quantizer = ActivationQuantizer( + self.bit_prec_Av, + self.Av_quant_dtype, + self.Av_quant_method, + moving_avrg=cfg.moving_average, + channel_axis=None if self.dot_quant_method == "per_tensor" else 
NotImplementedError, + ) + self.norm = math.sqrt(self.dim_heads) + self.softmax = nn.Softmax(-1) + self.dropout = nn.Dropout(cfg.att_weights_dropout) + + def _create_linear_layer(self, weight_bits, act_bits, output_dim=None): + return LinearQuant( + in_features=self.input_dim, + out_features=output_dim or self.input_dim, + weight_bit_prec=weight_bits, + weight_quant_dtype=self.weight_quant_dtype, + weight_quant_method=self.weight_quant_method, + activation_bit_prec=act_bits, + activation_quant_dtype=self.activation_quant_dtype, + activation_quant_method=self.activation_quant_method, + moving_average=self.cfg.moving_average, + bias=True, + quant_output=self.linear_quant_output + ) + + def forward(self, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, mask: Optional[torch.Tensor] = None): + + batch_dim = query.shape[0] + + #query = self.W_q(query) + #key = self.W_k(key) + #value = self.W_v(value) + assert query is value is key, "currently only this case is implemented" + + x = self.in_proj(query) + hidden_dim = query.size(-1) + query, key, value = x.unflatten(-1, (3, hidden_dim)).unsqueeze(0).transpose(0, -2).squeeze(-2).contiguous() + + query = query.view(batch_dim, -1, self.num_att_heads, self.dim_heads) # [B, T, D//H, D'] + key = key.view(batch_dim, -1, self.num_att_heads, self.dim_heads) # [B, T, D//H, D'] + value = value.view(batch_dim, -1, self.num_att_heads, self.dim_heads) # [B, T, D//H, D'] + + query = torch.transpose(query, 1, 2) # [B, D//H, T, D'] + key = torch.transpose(key, 1, 2) # [B, D//H, T, D'] + value = torch.transpose(value, 1, 2) # [B, D//H, T, D'] + + key = torch.transpose(key, -2, -1) # [B, D//H, D', T] + + if self.bit_prec_dot < 16: + query = self.q_quantizer(query) + key = self.k_quantizer(key) + + dot = torch.matmul(query, key) # [B, D//H, T, T] + dot = dot / self.norm + if mask is not None: + mask = mask.view(batch_dim, 1, 1, mask.size(1)) + dot = dot.masked_fill(mask, -float('inf')) + alpha = self.softmax(dot) + alpha = 
self.dropout(alpha) + + if self.bit_prec_Av < 16: + alpha = self.a_quantizer(alpha) + value = self.v_quantizer(value) + + att_out = torch.matmul(alpha, value) # [B, D//H, T, D'] + att_out = torch.transpose(att_out, 1, 2) # [B, D//H, T, D'] + att_out = att_out.reshape(batch_dim, -1, self.input_dim) # [B, T, D] + att_out = self.out_proj(att_out) + + return att_out, alpha diff --git a/users/hilmes/experiments/tedlium2/standalone/report.py b/users/hilmes/experiments/tedlium2/standalone/report.py index e8df79852..917bceb58 100644 --- a/users/hilmes/experiments/tedlium2/standalone/report.py +++ b/users/hilmes/experiments/tedlium2/standalone/report.py @@ -8,7 +8,7 @@ def calc_stat(ls): max = np.max([float(x[1]) for x in ls]) median = np.median([float(x[1]) for x in ls]) std = np.std([float(x[1]) for x in ls]) - ex_str = f"Avrg: {avrg}, Min {min}, Max {max}, Median {median}, Std {std}, ({avrg},{min},{max},{median},{std})" + ex_str = f"Avrg: {avrg}, Min {min}, Max {max}, Median {median}, Std {std}, ({avrg},{min},{max},{median},{std}) Num Values: {len(ls)}" return ex_str def baseline_report_format(report: _Report_Type) -> str: @@ -18,34 +18,38 @@ def baseline_report_format(report: _Report_Type) -> str: :return: """ extra_ls = ["quantize_static"] + sets = set() + for recog in report: + sets.add(recog.split("/")[-1]) out = [(" ".join(recog.split("/")[3:]), str(report[recog])) for recog in report if not any(extra in recog for extra in extra_ls)] out = sorted(out, key=lambda x: float(x[1])) best_ls = [out[0]] - for extra in extra_ls: - if extra == "quantize_static": - tmp = {recog: report[recog] for recog in report if extra in recog} - iters = set() - for recog in tmp: - x = recog.split("/") - for sub in x: - if "samples" in sub: - iters.add(sub[len("samples_"):]) - for samples in iters: - out2 = [(" ".join(recog.split("/")[3:]), str(report[recog])) for recog in report if f"samples_{samples}/" in recog] + for dataset in sets: + for extra in extra_ls: + if extra == 
"quantize_static": + tmp = {recog: report[recog] for recog in report if extra in recog and dataset in recog} + iters = set() + for recog in tmp: + x = recog.split("/") + for sub in x: + if "samples" in sub: + iters.add(sub[len("samples_"):]) + for samples in iters: + out2 = [(" ".join(recog.split("/")[3:]), str(report[recog])) for recog in report if f"samples_{samples}/" in recog and dataset in recog] + out2 = sorted(out2, key=lambda x: float(x[1])) + if len(out2) > 0: + ex_str = calc_stat(out2) + out.append((dataset + " " + extra + f"_samples_{samples}", ex_str)) + out.extend(out2[:3]) + out.extend(out2[-3:]) + best_ls.append(out2[0]) + else: + out2 = [(" ".join(recog.split("/")[3:]), str(report[recog])) for recog in report if extra in recog and dataset in recog] out2 = sorted(out2, key=lambda x: float(x[1])) if len(out2) > 0: - ex_str = calc_stat(out2) - out.append((extra + f"_samples_{samples}", ex_str)) - out.extend(out2[:3]) - out.extend(out2[-3:]) + out.append((dataset + " " + extra, "")) + out.extend(out2) best_ls.append(out2[0]) - else: - out2 = [(" ".join(recog.split("/")[3:]), str(report[recog])) for recog in report if extra in recog] - out2 = sorted(out2, key=lambda x: float(x[1])) - if len(out2) > 0: - out.append((extra, "")) - out.extend(out2) - best_ls.append(out2[0]) best_ls = sorted(best_ls, key=lambda x: float(x[1])) best_ls += [("Base Results", "")] out = best_ls + out From 045aa35a47d4b69edc20860d3fdb5041fe3472bb Mon Sep 17 00:00:00 2001 From: marvin84 Date: Tue, 4 Jun 2024 14:07:49 +0200 Subject: [PATCH 088/227] added factored bw --- common/datasets/tedlium2_v2/corpus.py | 136 +++++++ common/datasets/tedlium2_v2/download.py | 48 +++ common/datasets/tedlium2_v2/export.py | 96 +++++ common/datasets/tedlium2_v2/lexicon.py | 171 ++++++++ common/datasets/tedlium2_v2/textual_data.py | 39 ++ common/datasets/tedlium2_v2/vocab.py | 51 +++ .../configs/LFR_factored/baseline/config.py | 9 + .../common/BASE_factored_hybrid_system.py | 2 +- 
.../common/TF_factored_hybrid_system.py | 42 +- .../decoder/BASE_factored_hybrid_search.py | 20 +- users/raissi/setups/common/decoder/config.py | 6 + .../setups/common/helpers/network/augment.py | 200 +++++++++- .../setups/common/helpers/priors/__init__.py | 2 +- .../priors/estimate_povey_like_prior_fh.py | 372 +++++++++++------- .../helpers/priors/factored_estimation.py | 249 +++++++----- .../common/helpers/priors/transcription.py | 108 +++-- .../setups/common/helpers/priors/util.py | 13 +- users/raissi/setups/common/util/tdp.py | 4 +- .../decoder/LBS_factored_hybrid_search.py | 61 ++- .../setups/librispeech/helpers/__init__.py | 0 .../librispeech/helpers/priors/__init__.py | 0 .../helpers/priors/transcription.py | 56 +++ users/raissi/utils/default_tools.py | 1 + 23 files changed, 1385 insertions(+), 301 deletions(-) create mode 100644 common/datasets/tedlium2_v2/corpus.py create mode 100644 common/datasets/tedlium2_v2/download.py create mode 100644 common/datasets/tedlium2_v2/export.py create mode 100644 common/datasets/tedlium2_v2/lexicon.py create mode 100644 common/datasets/tedlium2_v2/textual_data.py create mode 100644 common/datasets/tedlium2_v2/vocab.py create mode 100644 users/raissi/setups/librispeech/helpers/__init__.py create mode 100644 users/raissi/setups/librispeech/helpers/priors/__init__.py create mode 100644 users/raissi/setups/librispeech/helpers/priors/transcription.py diff --git a/common/datasets/tedlium2_v2/corpus.py b/common/datasets/tedlium2_v2/corpus.py new file mode 100644 index 000000000..f74a7acbf --- /dev/null +++ b/common/datasets/tedlium2_v2/corpus.py @@ -0,0 +1,136 @@ +import os +from functools import lru_cache +from typing import Dict, Optional, Any + +from sisyphus import tk + +from i6_core.audio.encoding import BlissChangeEncodingJob + +from i6_core.meta import CorpusObject + +from ..tedlium2.constants import DURATIONS +from .download import download_data_dict + + +@lru_cache() +def get_bliss_corpus_dict(audio_format: str = "wav", 
output_prefix: str = "datasets") -> Dict[str, tk.Path]: + """ + creates a dictionary of all corpora in the TedLiumV2 dataset in the bliss xml format + + :param audio_format: options: wav, ogg, flac, sph, nist. nist (NIST sphere format) and sph are the same. + :param output_prefix: + :return: + """ + assert audio_format in ["flac", "ogg", "wav", "sph", "nist"] + + output_prefix = os.path.join(output_prefix, "Ted-Lium-2") + + bliss_corpus_dict = download_data_dict(output_prefix=output_prefix).bliss_nist + + audio_format_options = { + "wav": { + "output_format": "wav", + "codec": "pcm_s16le", + }, + "ogg": {"output_format": "ogg", "codec": "libvorbis"}, + "flac": {"output_format": "flac", "codec": "flac"}, + } + + converted_bliss_corpus_dict = {} + if audio_format not in ["sph", "nist"]: + for corpus_name, sph_corpus in bliss_corpus_dict.items(): + bliss_change_encoding_job = BlissChangeEncodingJob( + corpus_file=sph_corpus, + sample_rate=16000, + recover_duration=False, + **audio_format_options[audio_format], + ) + bliss_change_encoding_job.add_alias( + os.path.join( + output_prefix, + "%s_conversion" % audio_format, + corpus_name, + ) + ) + converted_bliss_corpus_dict[corpus_name] = bliss_change_encoding_job.out_corpus + else: + converted_bliss_corpus_dict = bliss_corpus_dict + + return converted_bliss_corpus_dict + + +@lru_cache() +def get_corpus_object_dict(audio_format: str = "flac", output_prefix: str = "datasets") -> Dict[str, CorpusObject]: + """ + creates a dict of all corpora in the TedLiumV2 dataset as a `meta.CorpusObject` + + :param audio_format: options: wav, ogg, flac, sph, nist. nist (NIST sphere format) and sph are the same. 
+ :param output_prefix: + :return: + """ + bliss_corpus_dict = get_bliss_corpus_dict(audio_format=audio_format, output_prefix=output_prefix) + + corpus_object_dict = {} + + for corpus_name, bliss_corpus in bliss_corpus_dict.items(): + corpus_object = CorpusObject() + corpus_object.corpus_file = bliss_corpus + corpus_object.audio_format = audio_format + corpus_object.audio_dir = None + corpus_object.duration = DURATIONS[corpus_name] + + corpus_object_dict[corpus_name] = corpus_object + + return corpus_object_dict + + +@lru_cache() +def get_stm_dict(output_prefix: str = "datasets") -> Dict[str, tk.Path]: + """ + fetches the STM files for TedLiumV2 dataset + + :param output_prefix: + :return: + """ + return download_data_dict(output_prefix=output_prefix).stm + + +def get_ogg_zip_dict( + subdir_prefix: str = "datasets", + returnn_python_exe: Optional[tk.Path] = None, + returnn_root: Optional[tk.Path] = None, + bliss_to_ogg_job_rqmt: Optional[Dict[str, Any]] = None, + extra_args: Optional[Dict[str, Dict[str, Any]]] = None, +) -> Dict[str, tk.Path]: + """ + Get a dictionary containing the paths to the ogg_zip for each corpus part. + + No outputs will be registered. 
+ + :param subdir_prefix: dir name prefix for aliases and outputs + :param returnn_python_exe: path to returnn python executable + :param returnn_root: python to returnn root + :param bliss_to_ogg_job_rqmt: rqmt for bliss to ogg job + :param extra_args: extra args for each dataset for bliss to ogg job + :return: dictionary with ogg zip paths for each corpus (train, dev, test) + """ + from i6_core.returnn.oggzip import BlissToOggZipJob + + ogg_zip_dict = {} + bliss_corpus_dict = get_bliss_corpus_dict(audio_format="wav", output_prefix=subdir_prefix) + if extra_args is None: + extra_args = {} + for name, bliss_corpus in bliss_corpus_dict.items(): + ogg_zip_job = BlissToOggZipJob( + bliss_corpus, + no_conversion=False, # cannot be used for corpus with multiple segments per recording + returnn_python_exe=returnn_python_exe, + returnn_root=returnn_root, + **extra_args.get(name, {}), + ) + if bliss_to_ogg_job_rqmt: + ogg_zip_job.rqmt = bliss_to_ogg_job_rqmt + ogg_zip_job.add_alias(os.path.join(subdir_prefix, "Ted-Lium-2", "%s_ogg_zip_job" % name)) + ogg_zip_dict[name] = ogg_zip_job.out_ogg_zip + + return ogg_zip_dict diff --git a/common/datasets/tedlium2_v2/download.py b/common/datasets/tedlium2_v2/download.py new file mode 100644 index 000000000..948224ae7 --- /dev/null +++ b/common/datasets/tedlium2_v2/download.py @@ -0,0 +1,48 @@ +import os +from dataclasses import dataclass +from functools import lru_cache +from typing import Any, Dict + +from sisyphus import tk + +from i6_core.datasets.tedlium2 import ( + DownloadTEDLIUM2CorpusJob, + CreateTEDLIUM2BlissCorpusJobV2, +) + + +@dataclass(frozen=True) +class TedLium2Data: + """Class for storing the TedLium2 data""" + + data_dir: Dict[str, tk.Path] + lm_dir: tk.Path + vocab: tk.Path + bliss_nist: Dict[str, tk.Path] + stm: Dict[str, tk.Path] + + +@lru_cache() +def download_data_dict(output_prefix: str = "datasets") -> TedLium2Data: + """ + downloads the TedLiumV2 dataset and performs the initial data processing steps + Uses 
the fixed job CreateTEDLIUM2BlissCorpusJobV2 from: https://github.com/rwth-i6/i6_core/pull/490 + + :param output_prefix: + :return: + """ + download_tedlium2_job = DownloadTEDLIUM2CorpusJob() + download_tedlium2_job.add_alias(os.path.join(output_prefix, "download", "raw_corpus_job")) + + bliss_corpus_tedlium2_job = CreateTEDLIUM2BlissCorpusJobV2(download_tedlium2_job.out_corpus_folders) + bliss_corpus_tedlium2_job.add_alias(os.path.join(output_prefix, "create_bliss", "bliss_corpus_job")) + + tl2_data = TedLium2Data( + data_dir=download_tedlium2_job.out_corpus_folders, + lm_dir=download_tedlium2_job.out_lm_folder, + vocab=download_tedlium2_job.out_vocab_dict, + bliss_nist=bliss_corpus_tedlium2_job.out_corpus_files, + stm=bliss_corpus_tedlium2_job.out_stm_files, + ) + + return tl2_data diff --git a/common/datasets/tedlium2_v2/export.py b/common/datasets/tedlium2_v2/export.py new file mode 100644 index 000000000..1919fa8c0 --- /dev/null +++ b/common/datasets/tedlium2_v2/export.py @@ -0,0 +1,96 @@ +import os + +from sisyphus import tk + +from .corpus import get_bliss_corpus_dict, get_stm_dict +from .lexicon import get_bliss_lexicon, get_g2p_augmented_bliss_lexicon +from .textual_data import get_text_data_dict + +TEDLIUM_PREFIX = "Ted-Lium-2" + + +def _export_datasets(output_prefix: str = "datasets"): + """ + exports all datasets for TedLiumV2 with all available audio formats + + :param output_prefix: + :return: + """ + for audio_format in ["flac", "ogg", "wav", "nist", "sph"]: + bliss_corpus_dict = get_bliss_corpus_dict(audio_format=audio_format, output_prefix=output_prefix) + for name, bliss_corpus in bliss_corpus_dict.items(): + tk.register_output( + os.path.join( + output_prefix, + TEDLIUM_PREFIX, + "corpus", + f"{name}-{audio_format}.xml.gz", + ), + bliss_corpus, + ) + + +def _export_stms(output_prefix: str = "datasets"): + """ + exports all STMs for TedLiumV2 + + :param output_prefix: + :return: + """ + stm_dict = get_stm_dict(output_prefix=output_prefix) + for 
name, stm_file in stm_dict.items(): + tk.register_output( + os.path.join( + output_prefix, + TEDLIUM_PREFIX, + "stm", + f"{name}.txt", + ), + stm_file, + ) + + +def _export_text_data(output_prefix: str = "datasets"): + """ + exports all the textual data for TedLiumV2 dataset + + :param output_prefix: + :return: + """ + txt_data_dict = get_text_data_dict(output_prefix=output_prefix) + for k, v in txt_data_dict.items(): + tk.register_output(os.path.join(output_prefix, TEDLIUM_PREFIX, "text_data", f"{k}.gz"), v) + + +def _export_lexicon(output_prefix: str = "datasets"): + """ + exports the lexicon for TedLiumV2 + + :param output_prefix: + :return: + """ + lexicon_output_prefix = os.path.join(output_prefix, TEDLIUM_PREFIX, "lexicon") + + bliss_lexicon = get_bliss_lexicon(output_prefix=output_prefix) + tk.register_output(os.path.join(lexicon_output_prefix, "tedlium2.lexicon.xml.gz"), bliss_lexicon) + + g2p_bliss_lexicon = get_g2p_augmented_bliss_lexicon( + add_unknown_phoneme_and_mapping=False, output_prefix=output_prefix + ) + tk.register_output( + os.path.join(lexicon_output_prefix, "tedlium2.lexicon_with_g2p.xml.gz"), + g2p_bliss_lexicon, + ) + + +def export_all(output_prefix: str = "datasets"): + """ + exports everything for TedLiumV2 + + :param output_prefix: + :return: + """ + _export_datasets(output_prefix=output_prefix) + _export_stms(output_prefix=output_prefix) + _export_text_data(output_prefix=output_prefix) + _export_lexicon(output_prefix=output_prefix) diff --git a/common/datasets/tedlium2_v2/lexicon.py b/common/datasets/tedlium2_v2/lexicon.py new file mode 100644 index 000000000..4d8366155 --- /dev/null +++ b/common/datasets/tedlium2_v2/lexicon.py @@ -0,0 +1,171 @@ +import os +from functools import lru_cache +from sisyphus import tk + +from i6_core.lexicon import LexiconFromTextFileJob +from i6_core.lexicon.modification import WriteLexiconJob, MergeLexiconJob +from i6_core.lib import lexicon +from i6_experiments.common.helpers.g2p import 
G2PBasedOovAugmenter + +from ..tedlium2.constants import SILENCE_PHONEME, UNKNOWN_PHONEME +from .corpus import get_bliss_corpus_dict +from .download import download_data_dict + + +@lru_cache() +def _get_special_lemma_lexicon( + add_unknown_phoneme_and_mapping: bool = False, + add_silence: bool = True, +) -> lexicon.Lexicon: + """ + creates the special lemma used in RASR + + :param add_unknown_phoneme_and_mapping: adds [unknown] as label with [UNK] as phoneme and as LM token + :param add_silence: adds [silence] label with [SILENCE] phoneme, + use False for CTC/RNN-T setups without silence modelling. + :return: + """ + lex = lexicon.Lexicon() + if add_silence: + lex.add_lemma( + lexicon.Lemma( + orth=["[silence]", ""], + phon=[SILENCE_PHONEME], + synt=[], + special="silence", + eval=[[]], + ) + ) + if add_unknown_phoneme_and_mapping: + lex.add_lemma( + lexicon.Lemma( + orth=["[unknown]"], + phon=[UNKNOWN_PHONEME], + synt=[""], + special="unknown", + eval=[[]], + ) + ) + else: + lex.add_lemma( + lexicon.Lemma( + orth=["[unknown]"], + synt=[""], + special="unknown", + eval=[[]], + ) + ) + + lex.add_lemma( + lexicon.Lemma( + orth=["[sentence-begin]"], + synt=[""], + special="sentence-begin", + eval=[[]], + ) + ) + lex.add_lemma( + lexicon.Lemma( + orth=["[sentence-end]"], + synt=[""], + special="sentence-end", + eval=[[]], + ) + ) + if add_silence: + lex.add_phoneme(SILENCE_PHONEME, variation="none") + if add_unknown_phoneme_and_mapping: + lex.add_phoneme(UNKNOWN_PHONEME, variation="none") + + return lex + + +@lru_cache() +def _get_raw_bliss_lexicon( + output_prefix: str, +) -> tk.Path: + """ + downloads the vocabulary file from the TedLiumV2 dataset and creates a bliss lexicon + + :param output_prefix: + :return: + """ + vocab = download_data_dict(output_prefix=output_prefix).vocab + + convert_lexicon_job = LexiconFromTextFileJob( + text_file=vocab, + compressed=True, + ) + convert_lexicon_job.add_alias(os.path.join(output_prefix, "convert_text_to_bliss_lexicon_job")) 
+ + return convert_lexicon_job.out_bliss_lexicon + + +@lru_cache() +def get_bliss_lexicon( + add_unknown_phoneme_and_mapping: bool = True, + add_silence: bool = True, + output_prefix: str = "datasets", +) -> tk.Path: + """ + merges the lexicon with special RASR tokens with the lexicon created from the downloaded TedLiumV2 vocabulary + + :param add_unknown_phoneme_and_mapping: add an unknown phoneme and mapping unknown phoneme:lemma + :param add_silence: include silence lemma and phoneme + :param output_prefix: + :return: + """ + static_lexicon = _get_special_lemma_lexicon(add_unknown_phoneme_and_mapping, add_silence) + static_lexicon_job = WriteLexiconJob(static_lexicon, sort_phonemes=True, sort_lemmata=False) + static_lexicon_job.add_alias(os.path.join(output_prefix, "static_lexicon_job")) + + raw_tedlium2_lexicon = _get_raw_bliss_lexicon(output_prefix=output_prefix) + + merge_lexicon_job = MergeLexiconJob( + bliss_lexica=[ + static_lexicon_job.out_bliss_lexicon, + raw_tedlium2_lexicon, + ], + sort_phonemes=True, + sort_lemmata=True, + compressed=True, + ) + merge_lexicon_job.add_alias(os.path.join(output_prefix, "merge_lexicon_job")) + + return merge_lexicon_job.out_bliss_lexicon + + +@lru_cache() +def get_g2p_augmented_bliss_lexicon( + add_unknown_phoneme_and_mapping: bool = False, + add_silence: bool = True, + audio_format: str = "wav", + output_prefix: str = "datasets", +) -> tk.Path: + """ + augment the kernel lexicon with unknown words from the training corpus + + :param add_unknown_phoneme_and_mapping: add an unknown phoneme and mapping unknown phoneme:lemma + :param add_silence: include silence lemma and phoneme + :param audio_format: options: wav, ogg, flac, sph, nist. nist (NIST sphere format) and sph are the same. 
+ :param output_prefix: + :return: + """ + original_bliss_lexicon = get_bliss_lexicon( + add_unknown_phoneme_and_mapping, add_silence=add_silence, output_prefix=output_prefix + ) + corpus_name = "train" + bliss_corpus = get_bliss_corpus_dict(audio_format=audio_format, output_prefix=output_prefix)[corpus_name] + + g2p_augmenter = G2PBasedOovAugmenter( + original_bliss_lexicon=original_bliss_lexicon, + train_lexicon=original_bliss_lexicon, + ) + augmented_bliss_lexicon = g2p_augmenter.get_g2p_augmented_bliss_lexicon( + bliss_corpus=bliss_corpus, + corpus_name=corpus_name, + alias_path=os.path.join(output_prefix, "g2p"), + casing="lower", + ) + + return augmented_bliss_lexicon diff --git a/common/datasets/tedlium2_v2/textual_data.py b/common/datasets/tedlium2_v2/textual_data.py new file mode 100644 index 000000000..553489a0d --- /dev/null +++ b/common/datasets/tedlium2_v2/textual_data.py @@ -0,0 +1,39 @@ +from functools import lru_cache +from typing import Dict + +from sisyphus import tk + +from i6_core.corpus import CorpusToTxtJob +from i6_core.text import ConcatenateJob + +from i6_experiments.common.datasets.tedlium2.corpus_v2 import get_bliss_corpus_dict + +from .download import download_data_dict + + +@lru_cache() +def get_text_data_dict(output_prefix: str = "datasets") -> Dict[str, tk.Path]: + """ + gather all the textual data provided within the TedLiumV2 dataset + + :param output_prefix: + :return: + """ + lm_dir = download_data_dict(output_prefix=output_prefix).lm_dir + + text_corpora = [ + "commoncrawl-9pc", + "europarl-v7-6pc", + "giga-fren-4pc", + "news-18pc", + "news-commentary-v8-9pc", + "yandex-1m-31pc", + ] + + txt_dict = {name: lm_dir.join_right("%s.en.gz" % name) for name in text_corpora} + txt_dict["audio-transcriptions"] = CorpusToTxtJob( + get_bliss_corpus_dict(audio_format="wav", output_prefix="corpora")["train"] + ).out_txt + txt_dict["background-data"] = ConcatenateJob(list(txt_dict.values())).out + + return txt_dict diff --git 
a/common/datasets/tedlium2_v2/vocab.py b/common/datasets/tedlium2_v2/vocab.py new file mode 100644 index 000000000..14d4455f5 --- /dev/null +++ b/common/datasets/tedlium2_v2/vocab.py @@ -0,0 +1,51 @@ +from i6_experiments.common.helpers.text_labels.subword_nmt_bpe import ( + get_returnn_subword_nmt, + get_bpe_settings, + BPESettings, +) +from .corpus import get_bliss_corpus_dict + + +def get_subword_nmt_bpe(bpe_size: int, unk_label: str = "", subdir_prefix: str = "") -> BPESettings: + """ + Get the BPE tokens via the Returnn subword-nmt for a Tedlium2 setup. + + :param bpe_size: the number of BPE merge operations. This is NOT the resulting vocab size! + :param unk_label: unknown label symbol + :param subdir_prefix: dir name prefix for aliases and outputs + """ + subword_nmt_repo = get_returnn_subword_nmt(output_prefix=subdir_prefix) + train_corpus = get_bliss_corpus_dict()["train"] + bpe_settings = get_bpe_settings( + train_corpus, + bpe_size=bpe_size, + unk_label=unk_label, + output_prefix=subdir_prefix, + subword_nmt_repo_path=subword_nmt_repo, + ) + return bpe_settings + + +def get_subword_nmt_bpe_v2(bpe_size: int, unk_label: str = "", subdir_prefix: str = "") -> BPESettings: + """ + Get the BPE tokens via the Returnn subword-nmt for a Tedlium2 setup. + + V2: Uses subword-nmt version corrected for Apptainer related bug, adds hash overwrite for repo + + :param bpe_size: the number of BPE merge operations. This is NOT the resulting vocab size! 
+ :param unk_label: unknown label symbol + :param subdir_prefix: dir name prefix for aliases and outputs + """ + subword_nmt_repo = get_returnn_subword_nmt( + commit_hash="5015a45e28a958f800ef1c50e7880c0c9ef414cf", output_prefix=subdir_prefix + ) + subword_nmt_repo.hash_overwrite = "I6_SUBWORD_NMT_V2" + train_corpus = get_bliss_corpus_dict()["train"] + bpe_settings = get_bpe_settings( + train_corpus, + bpe_size=bpe_size, + unk_label=unk_label, + output_prefix=subdir_prefix, + subword_nmt_repo_path=subword_nmt_repo, + ) + return bpe_settings diff --git a/users/raissi/experiments/librispeech/configs/LFR_factored/baseline/config.py b/users/raissi/experiments/librispeech/configs/LFR_factored/baseline/config.py index 545954399..5eb18de33 100644 --- a/users/raissi/experiments/librispeech/configs/LFR_factored/baseline/config.py +++ b/users/raissi/experiments/librispeech/configs/LFR_factored/baseline/config.py @@ -79,6 +79,15 @@ out_joint_diphone="output/output_batch_major", ) +CONF_FH_TRIPHONE_FS_DECODING_TENSOR_CONFIG_V2 = dataclasses.replace( + DecodingTensorMap.default(), + in_encoder_output="conformer_12_output/add", + out_encoder_output="encoder__output/output_batch_major", + out_right_context="right__output/output_batch_major", + out_left_context="left__output/output_batch_major", + out_center_state="center__output/output_batch_major", + out_joint_diphone="output/output_batch_major", +) BLSTM_FH_DECODING_TENSOR_CONFIG = dataclasses.replace( CONF_FH_DECODING_TENSOR_CONFIG, diff --git a/users/raissi/setups/common/BASE_factored_hybrid_system.py b/users/raissi/setups/common/BASE_factored_hybrid_system.py index 1ec82b301..a82d23919 100644 --- a/users/raissi/setups/common/BASE_factored_hybrid_system.py +++ b/users/raissi/setups/common/BASE_factored_hybrid_system.py @@ -530,7 +530,7 @@ def _set_native_lstm_path(self, search_numpy_blas=True, blas_lib=None): self.native_lstm2_path = compile_native_op_job.out_op def set_local_flf_tool_for_decoding(self, path): - 
self.csp["base"].flf_tool_exe = path + self.crp["base"].flf_tool_exe = path # --------------------- Init procedure ----------------- def set_initial_nn_args(self, initial_nn_args): diff --git a/users/raissi/setups/common/TF_factored_hybrid_system.py b/users/raissi/setups/common/TF_factored_hybrid_system.py index b758c266f..81eb0b02a 100644 --- a/users/raissi/setups/common/TF_factored_hybrid_system.py +++ b/users/raissi/setups/common/TF_factored_hybrid_system.py @@ -47,6 +47,8 @@ import i6_experiments.users.raissi.setups.common.helpers.train as train_helpers import i6_experiments.users.raissi.setups.common.helpers.decode as decode_helpers +from i6_experiments.users.raissi.setups.common.helpers.priors.factored_estimation import get_triphone_priors +from i6_experiments.users.raissi.setups.common.helpers.priors.util import PartitionDataSetup # user based modules from i6_experiments.users.raissi.setups.common.data.backend import BackendInfo @@ -74,7 +76,7 @@ from i6_experiments.users.raissi.setups.common.data.backend import Backend, BackendInfo - +from i6_experiments.users.raissi.setups.common.decoder.BASE_factored_hybrid_search import DecodingTensorMap from i6_experiments.users.raissi.setups.common.decoder.config import ( PriorInfo, PriorConfig, @@ -160,9 +162,6 @@ def get_model_checkpoint(self, model_job, epoch): def get_model_path(self, model_job, epoch): return model_job.out_checkpoints[epoch].ckpt_path - def set_local_flf_tool_for_decoding(self, path=None): - self.csp["base"].flf_tool_exe = path - # -------------------------------------------- Training -------------------------------------------------------- # -------------encoder architectures ------------------------------- @@ -279,7 +278,7 @@ def get_conformer_network_zhou_variant( network["classes_"]["from"] = "slice_classes" else: - network=encoder_net + network = encoder_net return network @@ -736,9 +735,38 @@ def set_diphone_priors_returnn_rasr( self.experiments[key]["priors"] = p_info - - def 
set_triphone_priors_factored(self): + def set_triphone_priors_factored( + self, + key: str, + epoch: int, + tensor_map: DecodingTensorMap, + partition_data_setup: PartitionDataSetup = None, + model_path: tk.Path = None, + ): self.create_hdf() + if self.experiments[key]["graph"].get("inference", None) is None: + self.set_graph_for_experiment(key) + if partition_data_setup is None: + partition_data_setup = PartitionDataSetup() + + if model_path is None: + model_path = DelayedFormat(self.get_model_path(model_job=self.experiments[key]["train_job"], epoch=epoch)) + triphone_priors = get_triphone_priors( + name=f"{self.experiments[key]['name']}/e{epoch}", + graph_path=self.experiments[key]["graph"]["inference"], + model_path=model_path, + data_paths=self.hdfs[self.train_key], + tensor_map=tensor_map, + partition_data_setup=partition_data_setup, + label_info=self.label_info, + ) + + p_info = PriorInfo( + center_state_prior=PriorConfig(file=triphone_priors[1], scale=0.0), + left_context_prior=PriorConfig(file=triphone_priors[2], scale=0.0), + right_context_prior=PriorConfig(file=triphone_priors[0], scale=0.0), + ) + self.experiments[key]["priors"] = p_info def set_triphone_priors_returnn_rasr( self, diff --git a/users/raissi/setups/common/decoder/BASE_factored_hybrid_search.py b/users/raissi/setups/common/decoder/BASE_factored_hybrid_search.py index d0ed08923..c27141a02 100644 --- a/users/raissi/setups/common/decoder/BASE_factored_hybrid_search.py +++ b/users/raissi/setups/common/decoder/BASE_factored_hybrid_search.py @@ -671,8 +671,11 @@ def recognize( if search_parameters.tdp_scale is not None: if name_override is None: name += f"-tdpScale-{search_parameters.tdp_scale}" - name += f"-spTdp-{format_tdp(search_parameters.tdp_speech)}" name += f"-silTdp-{format_tdp(search_parameters.tdp_silence)}" + if search_parameters.tdp_nonword is not None: + name += f"-nwTdp-{format_tdp(search_parameters.tdp_nonword)}" + name += f"-spTdp-{format_tdp(search_parameters.tdp_speech)}" + if 
self.feature_scorer_type.is_factored(): if search_parameters.transition_scales is not None: @@ -758,6 +761,12 @@ def recognize( adv_search_extra_config = ( copy.deepcopy(adv_search_extra_config) if adv_search_extra_config is not None else rasr.RasrConfig() ) + + if search_parameters.word_recombination_limit is not None: + adv_search_extra_config.flf_lattice_tool.network.recognizer.recognizer.reduce_context_word_recombination = True + adv_search_extra_config.flf_lattice_tool.network.recognizer.recognizer.reduce_context_word_recombination_limit = search_parameters.word_recombination_limit + name += f"recombLim{search_parameters.word_recombination_limit}" + if search_parameters.altas is not None: adv_search_extra_config.flf_lattice_tool.network.recognizer.recognizer.acoustic_lookahead_temporal_approximation_scale = ( search_parameters.altas @@ -907,7 +916,7 @@ def recognize( if add_sis_alias_and_output: tk.register_output(f"{pre_path}/{name}.wer", scorer.out_report_dir) - if opt_lm_am and search_parameters.altas is None: + if opt_lm_am and (search_parameters.altas is None or search_parameters.altas < 3.0): assert search_parameters.beam >= 15.0 if pron_scale is not None: if isinstance(pron_scale, DelayedBase) and pron_scale.is_set(): @@ -1311,14 +1320,16 @@ def push_delayed_tuple( best_priors = best_overall_n.out_argmin[0] best_tdp_scale = best_overall_n.out_argmin[1] best_tdp_sil = best_overall_n.out_argmin[2] - best_tdp_sp = best_overall_n.out_argmin[3] + best_tdp_nw = best_overall_n.out_argmin[3] + best_tdp_sp = best_overall_n.out_argmin[4] if use_pron: - best_pron = best_overall_n.out_argmin[4] + best_pron = best_overall_n.out_argmin[5] base_cfg = dataclasses.replace( search_parameters, tdp_scale=best_tdp_scale, tdp_silence=push_delayed_tuple(best_tdp_sil), + tdp_nonword=push_delayed_tuple(best_tdp_nw), tdp_speech=push_delayed_tuple(best_tdp_sp), pron_scale=best_pron, ) @@ -1327,6 +1338,7 @@ def push_delayed_tuple( search_parameters, tdp_scale=best_tdp_scale, 
tdp_silence=push_delayed_tuple(best_tdp_sil), + tdp_nonword=push_delayed_tuple(best_tdp_nw), tdp_speech=push_delayed_tuple(best_tdp_sp), ) diff --git a/users/raissi/setups/common/decoder/config.py b/users/raissi/setups/common/decoder/config.py index 455f023cf..1d5788bf5 100644 --- a/users/raissi/setups/common/decoder/config.py +++ b/users/raissi/setups/common/decoder/config.py @@ -157,6 +157,7 @@ class SearchParameters: altas: Optional[float] = None lm_lookahead_scale: Optional[float] = None lm_lookahead_history_limit: Int = 1 + word_recombination_limit: Optional[Int] = None posterior_scales: Optional[PosteriorScales] = None silence_penalties: Optional[Tuple[Float, Float]] = None # loop, fwd state_dependent_tdps: Optional[Union[str, tk.Path]] = None @@ -189,6 +190,11 @@ def with_lm_lookahead_scale(self, scale: Float) -> "SearchParameters": def with_lm_lookahead_history_limit(self, history_limit: Int) -> "SearchParameters": return dataclasses.replace(self, lm_lookahead_history_limit=history_limit) + def with_word_recombination_limit(self, word_recombination_limit: Int) -> "SearchParameters": + return dataclasses.replace(self, word_recombination_limit=word_recombination_limit) + + + def with_prior_scale( self, center: Optional[Float] = None, diff --git a/users/raissi/setups/common/helpers/network/augment.py b/users/raissi/setups/common/helpers/network/augment.py index 71a639926..5613a379f 100644 --- a/users/raissi/setups/common/helpers/network/augment.py +++ b/users/raissi/setups/common/helpers/network/augment.py @@ -29,11 +29,29 @@ class LogLinearScales: label_posterior_scale: float transition_scale: float + context_label_posterior_scale: float = 1.0 label_prior_scale: Optional[float] = None @classmethod def default(cls) -> "LogLinearScales": - return cls(label_posterior_scale=0.3, label_prior_scale=None, transition_scale=0.3) + return cls(label_posterior_scale=0.3, transition_scale=0.3, label_prior_scale=None, context_label_posterior_scale=1.0) + 
+@dataclass(frozen=True, eq=True) +class LossScales: + center_scale:int = 1.0 + right_scale: int = 1.0 + left_scale: int = 1.0 + + def get_scale(self, label_name: str): + if 'center' in label_name: + return self.center_scale + elif 'right' in label_name: + return self.right_scale + elif 'left' in label_name: + return self.left_scale + else: + raise NotImplemented("Not recognized label name for output loss scale") + Layer = Dict[str, Any] @@ -889,3 +907,183 @@ def add_fast_bw_layer_to_returnn_config( # ToDo: handel the import model part return returnn_config + +def add_fast_bw_factored_layer_to_network( + crp: rasr.CommonRasrParameters, + network: Network, + log_linear_scales: LogLinearScales, + loss_scales: LossScales, + label_info: LabelInfo, + reference_layers: [str] = ["left-output", "center-output" "right-output"], + label_prior_type: Optional[PriorType] = None, + label_prior: Optional[returnn.CodeWrapper] = None, + label_prior_estimation_axes: str = None, + extra_rasr_config: Optional[rasr.RasrConfig] = None, + extra_rasr_post_config: Optional[rasr.RasrConfig] = None, +) -> Network: + + crp = correct_rasr_FSA_bug(crp) + + if label_prior_type is not None: + assert log_linear_scales.label_prior_scale is not None, "If you plan to use the prior, please set the scale for it" + if label_prior_type == PriorType.TRANSCRIPT: + assert label_prior is not None, "You forgot to set the label prior file" + + inputs = [] + for reference_layer in reference_layers: + for attribute in ["loss", "loss_opts", "target"]: + if reference_layer in network: + network[reference_layer].pop(attribute, None) + + out_denot = reference_layer.split("-")[0] + am_scale = log_linear_scales.label_posterior_scale if "center" in reference_layer else log_linear_scales.context_label_posterior_scale + # prior calculation + if label_prior_type is not None: + prior_name = ("_").join(["label_prior", out_denot]) + comb_name = ("_").join(["comb-prior", out_denot]) + prior_eval_string = "(safe_log(source(1)) 
* prior_scale)" + inputs.append(comb_name) + if label_prior_type == PriorType.TRANSCRIPT: + network[prior_name] = {"class": "constant", "dtype": "float32", "value": label_prior} + elif label_prior_type == PriorType.AVERAGE: + network[prior_name] = { + "class": "accumulate_mean", + "exp_average": 0.001, + "from": reference_layer, + "is_prob_distribution": True, + } + elif label_prior_type == PriorType.ONTHEFLY: + assert label_prior_estimation_axes is not None, "You forgot to set one which axis you want to average the prior, eg. bt" + network[prior_name] = { + "class": "reduce", + "mode": "mean", + "from": reference_layer, + "axis": label_prior_estimation_axes, + } + prior_eval_string = "tf.stop_gradient((safe_log(source(1)) * prior_scale))" + else: + raise NotImplementedError("Unknown PriorType") + + network[comb_name] = { + "class": "combine", + "kind": "eval", + "eval": f"am_scale*(safe_log(source(0)) - {prior_eval_string})", + "eval_locals": { + "am_scale": am_scale, + "prior_scale": log_linear_scales.label_prior_scale, + }, + "from": [reference_layer, prior_name], + } + + else: + comb_name = ("_").join(["multiply-scale", out_denot]) + inputs.append(comb_name) + network[comb_name] = { + "class": "combine", + "kind": "eval", + "eval": "am_scale*(safe_log(source(0)))", + "eval_locals": {"am_scale": am_scale}, + "from": [reference_layer], + } + + bw_out = ("_").join(["output-bw", out_denot]) + network[bw_out] = { + "class": "copy", + "from": reference_layer, + "loss": "via_layer", + "loss_opts": { + "align_layer": ("/").join(["fast_bw", out_denot]), + "loss_wrt_to_act_in": "softmax", + }, + "loss_scale": loss_scales.get_scale(reference_layer), + } + + network["fast_bw"] = { + "class": "fast_bw_factored", + "align_target": "hmm-monophone", + "hmm_opts": {"num_contexts": label_info.n_contexts}, + "from": inputs, + "tdp_scale": log_linear_scales.transition_scale, + "n_out": label_info.n_contexts*2 + label_info.get_n_state_classes() + } + + # Create additional Rasr 
config file for the automaton + mapping = { + "corpus": "neural-network-trainer.corpus", + "lexicon": ["neural-network-trainer.alignment-fsa-exporter.model-combination.lexicon"], + "acoustic_model": ["neural-network-trainer.alignment-fsa-exporter.model-combination.acoustic-model"], + } + config, post_config = rasr.build_config_from_mapping(crp, mapping) + post_config["*"].output_channel.file = "fastbw.log" + + # Define action + config.neural_network_trainer.action = "python-control" + # neural_network_trainer.alignment_fsa_exporter.allophone_state_graph_builder + config.neural_network_trainer.alignment_fsa_exporter.allophone_state_graph_builder.orthographic_parser.allow_for_silence_repetitions = ( + False + ) + config.neural_network_trainer.alignment_fsa_exporter.allophone_state_graph_builder.orthographic_parser.normalize_lemma_sequence_scores = ( + False + ) + # neural_network_trainer.alignment_fsa_exporter + config.neural_network_trainer.alignment_fsa_exporter.model_combination.acoustic_model.fix_allophone_context_at_word_boundaries = ( + True + ) + config.neural_network_trainer.alignment_fsa_exporter.model_combination.acoustic_model.transducer_builder_filter_out_invalid_allophones = ( + True + ) + + # additional config + config._update(extra_rasr_config) + post_config._update(extra_rasr_post_config) + + automaton_config = rasr.WriteRasrConfigJob(config, post_config).out_config + tk.register_output("train/bw.config", automaton_config) + + network["fast_bw"]["sprint_opts"] = { + "sprintExecPath": rasr.RasrCommand.select_exe(crp.nn_trainer_exe, "nn-trainer"), + "sprintConfigStr": DelayedFormat("--config={}", automaton_config), + "sprintControlConfig": {"verbose": True}, + "usePythonSegmentOrder": False, + "numInstances": 1, + } + + return network + + +def add_fast_bw_factored_layer_to_returnn_config( + crp: rasr.CommonRasrParameters, + returnn_config: returnn.ReturnnConfig, + log_linear_scales: LogLinearScales, + loss_scales: LossScales, + label_info: LabelInfo, + 
import_model: [tk.Path, str] = None, + reference_layers: [str] = ["left-output", "center-output", "right-output"], + label_prior_type: Optional[PriorType] = None, + label_prior: Optional[returnn.CodeWrapper] = None, + label_prior_estimation_axes: str = None, + extra_rasr_config: Optional[rasr.RasrConfig] = None, + extra_rasr_post_config: Optional[rasr.RasrConfig] = None, +) -> returnn.ReturnnConfig: + + returnn_config.config["network"] = add_fast_bw_factored_layer_to_network( + crp=crp, + network=returnn_config.config["network"], + log_linear_scales=log_linear_scales, + loss_scales=loss_scales, + label_info=label_info, + reference_layers=reference_layers, + label_prior_type=label_prior_type, + label_prior=label_prior, + label_prior_estimation_axes=label_prior_estimation_axes, + extra_rasr_config=extra_rasr_config, + extra_rasr_post_config=extra_rasr_post_config, + ) + + if "chunking" in returnn_config.config: + del returnn_config.config["chunking"] + if "pretrain" in returnn_config.config and import_model is not None: + del returnn_config.config["pretrain"] + + return returnn_config + diff --git a/users/raissi/setups/common/helpers/priors/__init__.py b/users/raissi/setups/common/helpers/priors/__init__.py index 26b313536..cb3f4df67 100644 --- a/users/raissi/setups/common/helpers/priors/__init__.py +++ b/users/raissi/setups/common/helpers/priors/__init__.py @@ -7,5 +7,5 @@ from .flat import CreateFlatPriorsJob from .smoothen import smoothen_priors, SmoothenPriorsJob from .scale import scale_priors, ScalePriorsJob -from .transcription import get_mono_transcription_priors +from .transcription import get_prior_from_transcription from .tri_join import JoinRightContextPriorsJob, ReshapeCenterStatePriorsJob diff --git a/users/raissi/setups/common/helpers/priors/estimate_povey_like_prior_fh.py b/users/raissi/setups/common/helpers/priors/estimate_povey_like_prior_fh.py index 986ffa287..1cae2bc1b 100644 --- 
a/users/raissi/setups/common/helpers/priors/estimate_povey_like_prior_fh.py +++ b/users/raissi/setups/common/helpers/priors/estimate_povey_like_prior_fh.py @@ -2,9 +2,10 @@ import h5py +import logging import numpy as np import math -import tensorflow as tf +from typing import List, Optional, Union from IPython import embed @@ -14,11 +15,11 @@ import pickle from sisyphus import * - -from i6_core.lib.rasr_cache import FileArchive +from sisyphus.delayed_ops import DelayedFormat Path = setup_path(__package__) +from i6_core.lib.rasr_cache import FileArchive from i6_experiments.users.raissi.setups.common.data.factored_label import LabelInfo from i6_experiments.users.raissi.setups.common.decoder.BASE_factored_hybrid_search import DecodingTensorMap @@ -26,9 +27,10 @@ initialize_dicts, initialize_dicts_with_zeros, get_batch_from_segments, - ) +from i6_experiments.users.raissi.setups.common.util.cache_manager import cache_file + ################################### # Triphone ################################### @@ -36,7 +38,7 @@ class EstimateFactoredTriphonePriorsJob(Job): def __init__( self, graph_path: Path, - model_path: Path, + model_path: DelayedFormat, tensor_map: Optional[Union[dict, DecodingTensorMap]], data_paths: [Path], data_indices: [int], @@ -44,10 +46,10 @@ def __init__( end_ind_segment: int, label_info: LabelInfo, tf_library_path: str = None, - n_batch=15000, + n_batch=10000, cpu=2, gpu=1, - mem=4, + mem=32, time=1, ): self.graph_path = graph_path @@ -56,10 +58,12 @@ def __init__( self.data_indices = data_indices self.segment_slice = (start_ind_segment, end_ind_segment) self.tf_library_path = tf_library_path - self.triphone_means, self.diphone_means = initialize_dicts_with_zeros(label_info.n_contexts, label_info.get_n_state_classes()) + self.triphone_means, self.diphone_means = initialize_dicts_with_zeros( + label_info.n_contexts, label_info.get_n_state_classes() + ) self.context_means = np.zeros(label_info.n_contexts) self.num_segments = [ - 
self.output_path("segmentLength.%d.%d-%d" % (index, start_ind_segment, end_ind_segment), cached=False) + self.output_path("segment_length.%d.%d-%d" % (index, start_ind_segment, end_ind_segment), cached=False) for index in self.data_indices ] self.triphone_files = [ @@ -70,7 +74,7 @@ def __init__( self.output_path("diphone_means.%d.%d-%d" % (index, start_ind_segment, end_ind_segment), cached=False) for index in self.data_indices ] - self.context_means = [ + self.context_files = [ self.output_path("context_means.%d.%d-%d" % (index, start_ind_segment, end_ind_segment), cached=False) for index in self.data_indices ] @@ -84,10 +88,14 @@ def tasks(self): yield Task("run", resume="run", rqmt=self.rqmt, args=range(1, (len(self.data_indices) + 1))) def get_dense_label(self, left_context, center_state, right_context=0): - return (((center_state * self.label_info.n_contexts) + left_context) * self.label_info.n_contexts) + right_context + return ( + ((center_state * self.label_info.n_contexts) + left_context) * self.label_info.n_contexts + ) + right_context def get_segment_features_from_hdf(self, dataIndex): - hf = h5py.File(self.data_paths[dataIndex].get_path()) + logging.info(f"processing {self.data_paths[dataIndex]}") + file_path = self.data_paths[dataIndex] + hf = h5py.File(file_path) segment_names = list(hf["streams"]["features"]["data"]) segments = [] for name in segment_names: @@ -96,28 +104,35 @@ def get_segment_features_from_hdf(self, dataIndex): def get_encoder_output(self, session, feature_vector): return session.run( - [self.tensor_map.out_encoder_output], + [f"{self.tensor_map.out_encoder_output}:0"], feed_dict={ - self.tensor_map.in_data: feature_vector.reshape(1, feature_vector.shape[0], feature_vector.shape[1]), - self.tensor_map.in_seq_length: [feature_vector.shape[0]], + f"{self.tensor_map.in_data}:0": feature_vector.reshape( + 1, feature_vector.shape[0], feature_vector.shape[1] + ), + f"{self.tensor_map.in_seq_length}:0": [feature_vector.shape[0]], }, ) def 
get_posteriors_given_encoder_output(self, session, feature_vector, class_label_vector): feature_in = ( feature_vector.reshape(feature_vector.shape[1], 1, feature_vector.shape[2]) - if "fwd" in self.tensor_map.in_encoder_output + if "fwd" in tensor_map.in_encoder_output else feature_vector ) return session.run( - [self.tensor_map.out_left_context, self.tensor_map.out_center_state, self.tensor_map.out_right_context], + [ + f"{self.tensor_map.out_left_context}:0", + f"{self.tensor_map.out_center_state}:0", + f"{self.tensor_map.out_right_context}:0", + ], feed_dict={ - self.tensor_map.in_encoder_output: feature_in, - self.tensor_map.in_seq_length: [[class_label_vector] * feature_vector.shape[1]], + f"{self.tensor_map.in_encoder_output}:0": feature_in, + f"{self.tensor_map.in_classes}:0": [[class_label_vector] * feature_vector.shape[1]], }, ) - def calculateMeanPosteriors(self, session, task_id): + def calculate_mean_posteriors(self, session, task_id): + logging.info(f"starting with {task_id}") sample_count = 0 segments = self.get_segment_features_from_hdf(self.data_indices[task_id - 1]) @@ -127,21 +142,21 @@ def calculateMeanPosteriors(self, session, task_id): if len(batch) == 0: break encoder_output = self.get_encoder_output(session, batch) - for pastContextId in range(self.label_info.n_contexts): - for currentState in range(self.label_info.get_n_state_classes()): - denselabel = self.get_dense_label(left_context=pastContextId, center_state=currentState) + for left_context in range(self.label_info.n_contexts): + for center_state in range(self.label_info.get_n_state_classes()): + denselabel = self.get_dense_label(left_context=left_context, center_state=center_state) p = self.get_posteriors_given_encoder_output(session, encoder_output[0], denselabel) # triphone is calculates for each center and left context - tri = (sample_count * self.triphone_means[pastContextId][currentState]) + ( + tri = (sample_count * self.triphone_means[left_context][center_state]) + ( b_size * 
np.mean(p[0][0], axis=0) ) - self.triphone_means[pastContextId][currentState] = np.divide(tri, denom) + self.triphone_means[left_context][center_state] = np.divide(tri, denom) # diphone is calculated for each context with centerstate 0 - if not currentState: - di = (sample_count * self.diphone_means[pastContextId]) + (b_size * np.mean(p[1][0], axis=0)) - self.diphone_means[pastContextId] = np.divide(di, denom) + if not center_state: + di = (sample_count * self.diphone_means[left_context]) + (b_size * np.mean(p[1][0], axis=0)) + self.diphone_means[left_context] = np.divide(di, denom) # context is not label dependent - if not pastContextId: + if not left_context: ctx = (sample_count * self.context_means) + (b_size * np.mean(p[2][0], axis=0)) self.context_means = np.divide(ctx, denom) sample_count += b_size @@ -149,7 +164,8 @@ def calculateMeanPosteriors(self, session, task_id): with open(self.num_segments[task_id - 1].get_path(), "wb") as fp: pickle.dump(sample_count, fp, protocol=pickle.HIGHEST_PROTOCOL) - def dumpMeans(self, task_id): + def dump_means(self, task_id): + logging.info(f"dumping means") with open(self.triphone_files[task_id - 1].get_path(), "wb") as f1: pickle.dump(self.triphone_means, f1, protocol=pickle.HIGHEST_PROTOCOL) with open(self.diphone_files[task_id - 1].get_path(), "wb") as f2: @@ -158,25 +174,104 @@ def dumpMeans(self, task_id): pickle.dump(self.context_means, f3, protocol=pickle.HIGHEST_PROTOCOL) def run(self, task_id): - tf.load_op_library(self.tf_library_path) + import tensorflow as tf + if self.tf_library_path is not None: + tf.load_op_library(self.tf_library_path) mg = tf.compat.v1.MetaGraphDef() mg.ParseFromString(open(self.graph_path.get_path(), "rb").read()) - tf.compat.v1.import_graph_def(mg.graph_def, name="") + tf.import_graph_def(mg.graph_def, name="") # session s = tf.compat.v1.Session() - returnValue = s.run(["save/restore_all"], feed_dict={"save/Const:0": self.model.get_path()}) + returnValue = s.run(["save/restore_all"], 
feed_dict={"save/Const:0": self.model_path.get()}) + + self.calculate_mean_posteriors(s, task_id) + self.dump_means(task_id) + + +class CombineMeansForTriphoneForward(Job): + def __init__( + self, + triphone_files: List[Path], + diphone_files: List[Path], + context_files: List[Path], + num_segment_files: List[Path], + label_info: LabelInfo, + ): + self.triphone_files = triphone_files + self.diphone_files = diphone_files + self.context_files = context_files + self.num_segment_files = num_segment_files + self.label_info = label_info + self.num_segments = [] + self.triphone_means, self.diphoneMeans = initialize_dicts( + n_contexts=label_info.n_contexts, n_state_classes=label_info.get_n_state_classes() + ) + self.context_means = [] + self.num_segments_out = self.output_path("segment_length", cached=False) + self.triphone_files_out = self.output_path("triphone_means", cached=False) + self.diphone_files_out = self.output_path("diphoneMeans", cached=False) + self.context_files_out = self.output_path("context_means", cached=False) + self.rqmt = {"cpu": 1, "mem": 1, "time": 0.5} - self.calculateMeanPosteriors(s, task_id) - self.dumpMeans(task_id) + def tasks(self): + yield Task("run", resume="run", rqmt=self.rqmt) + + def read_num_segments(self): + for filename in self.num_segment_files: + with open(tk.uncached_path(filename), "rb") as f: + self.num_segments.append(pickle.load(f)) + + def calculate_weighted_averages(self): + coeffs = [self.num_segments[i] / np.sum(self.num_segments) for i in range(len(self.num_segment_files))] + for filename in self.triphone_files: + with open(tk.uncached_path(filename), "rb") as f: + triphoneDict = pickle.load(f) + for i in range(self.nContexts): + for j in range(self.nStates): + self.triphone_means[i][j].append( + np.dot(coeffs[self.triphone_files.index(filename)], triphoneDict[i][j]) + ) + for filename in self.diphone_files: + with open(tk.uncached_path(filename), "rb") as f: + diphoneDict = pickle.load(f) + for i in 
range(self.nContexts): + self.diphoneMeans[i].append(np.dot(coeffs[self.diphone_files.index(filename)], diphoneDict[i])) + for filename in self.context_files: + with open(tk.uncached_path(filename), "rb") as f: + means = pickle.load(f) + self.context_means.append(np.dot(coeffs[self.context_files.index(filename)], means)) + for i in range(self.nContexts): + self.diphoneMeans[i] = np.sum(self.diphoneMeans[i], axis=0) + for j in range(self.nStates): + self.triphone_means[i][j] = np.sum(self.triphone_means[i][j], axis=0) + self.context_means = np.sum(self.context_means, axis=0) + + def dump_means(self): + with open(tk.uncached_path(self.triphone_files_out), "wb") as f1: + pickle.dump(self.triphone_means, f1, protocol=pickle.HIGHEST_PROTOCOL) + with open(tk.uncached_path(self.diphone_files_out), "wb") as f2: + pickle.dump(self.diphoneMeans, f2, protocol=pickle.HIGHEST_PROTOCOL) + with open(tk.uncached_path(self.context_files_out), "wb") as f3: + pickle.dump(self.context_means, f3, protocol=pickle.HIGHEST_PROTOCOL) + sumSegNums = np.sum(self.num_segments) + with open(tk.uncached_path(self.num_segments_out), "wb") as f4: + pickle.dump(sumSegNums, f4, protocol=pickle.HIGHEST_PROTOCOL) + + def run(self): + self.read_num_segments() + self.calculate_weighted_averages() + self.dump_means() class DumpXmlForTriphoneForwardJob(Job): - def __init__(self, - triphone_files: List, - diphone_files: List, - context_files: List, - num_segment_files: List, - label_info: LabelInfo): + def __init__( + self, + triphone_files: List, + diphone_files: List, + context_files: List, + num_segment_files: List, + label_info: LabelInfo, + ): self.triphone_files = triphone_files self.diphone_files = diphone_files self.context_files = context_files @@ -184,9 +279,9 @@ def __init__(self, self.label_info = label_info self.num_segments = [] self.triphone_means, self.diphone_means = initialize_dicts( - n_contexts=n_contexts, n_state_classes=n_state_classes + n_contexts=label_info.n_contexts, 
n_state_classes=label_info.get_n_state_classes() ) - self.contextMeans = [] + self.context_means = [] self.triphone_xml = self.output_path("triphone_scores.xml", cached=False) self.diphone_xml = self.output_path("diphone_scores.xml", cached=False) self.context_xml = self.output_path("context_scores.xml", cached=False) @@ -195,95 +290,93 @@ def __init__(self, def tasks(self): yield Task("run", resume="run", rqmt=self.rqmt) - def readnum_segments(self): + def read_num_segments(self): for filename in self.num_segment_files: with open(filename.get_path(), "rb") as f: self.num_segments.append(pickle.load(f)) - def calculateWeightedAverages(self): + def calculate_weighted_averages(self): coeffs = [self.num_segments[i] / np.sum(self.num_segments) for i in range(len(self.num_segment_files))] for filename in self.triphone_files: with open(filename.get_path(), "rb") as f: triphoneDict = pickle.load(f) - for i in range(self.n_contexts): - for j in range(self.n_state_classes): + for i in range(self.label_info.n_contexts): + for j in range(self.label_info.get_n_state_classes()): self.triphone_means[i][j].append( np.dot(coeffs[self.triphone_files.index(filename)], triphoneDict[i][j]) ) for filename in self.diphone_files: with open(filename.get_path(), "rb") as f: - diphoneDict = pickle.load(f) - for i in range(self.n_contexts): - self.diphone_means[i].append(np.dot(coeffs[self.diphone_files.index(filename)], diphoneDict[i])) + diphone_dict = pickle.load(f) + for i in range(self.label_info.n_contexts): + self.diphone_means[i].append(np.dot(coeffs[self.diphone_files.index(filename)], diphone_dict[i])) for filename in self.context_files: with open(filename.get_path(), "rb") as f: means = pickle.load(f) - self.contextMeans.append(np.dot(coeffs[self.context_files.index(filename)], means)) - for i in range(self.n_contexts): + self.context_means.append(np.dot(coeffs[self.context_files.index(filename)], means)) + for i in range(self.label_info.n_contexts): self.diphone_means[i] = 
np.sum(self.diphone_means[i], axis=0) - for j in range(self.n_state_classes): + for j in range(self.label_info.get_n_state_classes()): self.triphone_means[i][j] = np.sum(self.triphone_means[i][j], axis=0) - self.contextMeans = np.sum(self.contextMeans, axis=0) + self.context_means = np.sum(self.context_means, axis=0) - def dumpXml(self): - for pastId in range(self.n_contexts): - for currentstateId in range(self.n_state_classes): - for i, s in enumerate(self.triphone_means[pastId][currentstateId]): + def dump_xml(self): + for context_id in range(self.label_info.n_contexts): + for center_stateId in range(self.label_info.get_n_state_classes()): + for i, s in enumerate(self.triphone_means[context_id][center_stateId]): if s == 0: - self.triphone_means[pastId][currentstateId][i] += 1e-5 + self.triphone_means[context_id][center_stateId][i] += 1e-5 with open(self.triphone_xml.get_path(), "wt") as f: f.write( '\n\n' - % (self.n_contexts * self.n_state_classes, self.n_contexts) + % (self.label_info.n_contexts * self.label_info.get_n_state_classes(), self.label_info.n_contexts) ) - for pastId in range(self.n_contexts): - for currentstateId in range(self.n_state_classes): - for i, s in enumerate(self.triphone_means[pastId][currentstateId]): + for context_id in range(self.label_info.n_contexts): + for center_stateId in range(self.label_info.get_n_state_classes()): + for i, s in enumerate(self.triphone_means[context_id][center_stateId]): if s == 0: - self.triphone_means[pastId][currentstateId][i] += 1e-5 - f.write(" ".join("%.20e" % math.log(s) for s in self.triphone_means[pastId][currentstateId]) + "\n") + self.triphone_means[context_id][center_stateId][i] += 1e-5 + f.write(" ".join("%.20e" % math.log(s) for s in self.triphone_means[context_id][center_stateId]) + "\n") f.write("") with open(self.diphone_xml.get_path(), "wt") as f: f.write( '\n\n' - % (self.n_contexts, self.n_state_classes) + % (self.label_info.n_contexts, self.label_info.get_n_state_classes()) ) - for pastId in 
range(self.n_contexts): - for i, c in enumerate(self.diphone_means[pastId]): + for context_id in range(self.label_info.n_contexts): + for i, c in enumerate(self.diphone_means[context_id]): if c == 0: - self.diphone_means[pastId][i] += 1e-5 - f.write(" ".join("%.20e" % math.log(s) for s in self.diphone_means[pastId]) + "\n") + self.diphone_means[context_id][i] += 1e-5 + f.write(" ".join("%.20e" % math.log(s) for s in self.diphone_means[context_id]) + "\n") f.write("") with open(self.context_xml.get_path(), "wt") as f: - f.write('\n\n' % (self.n_contexts)) - f.write(" ".join("%.20e" % math.log(s) for s in np.nditer(self.contextMeans)) + "\n") + f.write('\n\n' % (self.label_info.n_contexts)) + f.write(" ".join("%.20e" % math.log(s) for s in np.nditer(self.context_means)) + "\n") f.write("") def run(self): - self.readnum_segments() - print("number of segments read") - self.calculateWeightedAverages() - self.dumpXml() - - + self.read_num_segments() + logging.info("number of segments read") + self.calculate_weighted_averages() + self.dump_xml() +# needs refactoring class EstimateRasrDiphoneAndContextPriors(Job): def __init__( self, graph_path: Path, - model_path: Path, + model_path: DelayedFormat, tensor_map: Optional[Union[dict, DecodingTensorMap]], data_paths: [Path], data_indices: [int], label_info: LabelInfo, tf_library_path: str = None, - n_batch=15000, + n_batch=12000, cpu=2, gpu=1, mem=4, time=1, - ): self.graph_path = graph_path self.model_path = model_path @@ -291,11 +384,13 @@ def __init__( self.data_paths = data_paths self.data_indices = data_indices self.tf_library_path = tf_library_path - self.diphoneMeans = dict(zip(range(label_info.n_contexts), [np.zeros(nStateClasses) for _ in range(label_info.n_contexts)])) - self.contextMeans = np.zeros(label_info.n_contexts) + self.diphoneMeans = dict( + zip(range(label_info.n_contexts), [np.zeros(nStateClasses) for _ in range(label_info.n_contexts)]) + ) + self.context_means = np.zeros(label_info.n_contexts) 
self.num_segments = [self.output_path("segmentLength.%d" % index, cached=False) for index in self.data_indices] - self.diphoneFiles = [self.output_path("diphoneMeans.%d" % index, cached=False) for index in self.data_indices] - self.contextFiles = [self.output_path("contextMeans.%d" % index, cached=False) for index in self.data_indices] + self.diphone_files = [self.output_path("diphoneMeans.%d" % index, cached=False) for index in self.data_indices] + self.context_files = [self.output_path("context_means.%d" % index, cached=False) for index in self.data_indices] self.n_batch = n_batch if not gpu: @@ -307,7 +402,6 @@ def tasks(self): def get_segment_features_from_hdf(self, dataIndex): hf = h5py.File(tk.uncached_path(self.data_paths[dataIndex])) - print(self.data_paths[dataIndex]) segmentNames = list(hf["streams"]["features"]["data"]) segments = [] for name in segmentNames: @@ -340,9 +434,12 @@ def getPosteriorsOfBothOutputsWithEncoded(self, session, feature_vector, class_l ) def get_dense_label(self, left_context, center_state, right_context=0): - return (((center_state * self.label_info.n_contexts) + left_context) * self.label_info.n_contexts) + right_context + return ( + ((center_state * self.label_info.n_contexts) + left_context) * self.label_info.n_contexts + ) + right_context - def calculateMeanPosteriors(self, session, task_id): + def calculate_mean_posteriors(self, session, task_id): + logging.info(f"starting with {task_id}") sample_count = 0 segments = self.get_segment_features_from_hdf(self.data_indices[task_id - 1]) @@ -353,49 +450,52 @@ def calculateMeanPosteriors(self, session, task_id): break encoder_output = self.get_encoder_output(session, batch) - for pastContextId in range(self.label_info.n_contexts): + for left_context in range(self.label_info.n_contexts): p = self.getPosteriorsOfBothOutputsWithEncoded( - session, encoder_output[0], self.get_dense_label(pastContextId) + session, encoder_output[0], self.get_dense_label(left_context) ) - di = 
(sample_count * self.diphoneMeans[pastContextId]) + (b_size * np.mean(p[0][0], axis=0)) - self.diphoneMeans[pastContextId] = np.divide(di, denom) + di = (sample_count * self.diphoneMeans[left_context]) + (b_size * np.mean(p[0][0], axis=0)) + self.diphoneMeans[left_context] = np.divide(di, denom) # context is not label dependent - if not pastContextId: - ctx = (sample_count * self.contextMeans) + (b_size * np.mean(p[1][0], axis=0)) - self.contextMeans = np.divide(ctx, denom) + if not left_context: + ctx = (sample_count * self.context_means) + (b_size * np.mean(p[1][0], axis=0)) + self.context_means = np.divide(ctx, denom) sample_count += b_size with open(tk.uncached_path(self.num_segments[task_id - 1]), "wb") as fp: pickle.dump(sample_count, fp, protocol=pickle.HIGHEST_PROTOCOL) - def dumpMeans(self, task_id): - with open(tk.uncached_path(self.diphoneFiles[task_id - 1]), "wb") as fp: + def dump_means(self, task_id): + with open(tk.uncached_path(self.diphone_files[task_id - 1]), "wb") as fp: pickle.dump(self.diphoneMeans, fp, protocol=pickle.HIGHEST_PROTOCOL) - with open(tk.uncached_path(self.contextFiles[task_id - 1]), "wb") as fp: - pickle.dump(self.contextMeans, fp, protocol=pickle.HIGHEST_PROTOCOL) + with open(tk.uncached_path(self.context_files[task_id - 1]), "wb") as fp: + pickle.dump(self.context_means, fp, protocol=pickle.HIGHEST_PROTOCOL) def run(self, task_id): - tf.load_op_library(self.tf_library_path) + import tensorflow as tf + if self.tf_library_path is not None: + tf.load_op_library(self.tf_library_path) mg = tf.MetaGraphDef() mg.ParseFromString(open(self.graph_path.get_path(), "rb").read()) tf.import_graph_def(mg.graph_def, name="") # session s = tf.Session() - returnValue = s.run(["save/restore_all"], feed_dict={"save/Const:0": self.model_path.get_path()}) + returnValue = s.run(["save/restore_all"], feed_dict={"save/Const:0": self.model_path.get()}) - self.calculateMeanPosteriors(s, task_id) - self.dumpMeans(task_id) + 
self.calculate_mean_posteriors(s, task_id) + self.dump_means(task_id) -# you can use DumpXmlForDiphone and have an attribute called isSprint, with which you call your additional function. +# needs refactoring +# you can use dump_xmlForDiphone and have an attribute called isSprint, with which you call your additional function. # Generally think to merge all functions -class DumpXmlRasrForDiphone(Job): +class dump_xmlRasrForDiphone(Job): def __init__( self, - diphoneFiles, - contextFiles, - numSegmentFiles, + diphone_files, + context_files, + num_segment_files, nContexts, nStateClasses, adjustSilence=True, @@ -404,12 +504,12 @@ def __init__( nonWordIndices=None, ): - self.diphoneFiles = diphoneFiles - self.contextFiles = contextFiles - self.numSegmentFiles = numSegmentFiles + self.diphone_files = diphone_files + self.context_files = context_files + self.num_segment_files = num_segment_files self.num_segments = [] self.diphoneMeans = dict(zip(range(nContexts), [[] for _ in range(nContexts)])) - self.contextMeans = [] + self.context_means = [] self.diphoneXml = self.output_path("diphoneScores.xml", cached=False) self.contextXml = self.output_path("contextScores.xml", cached=False) self.nContexts = nContexts @@ -423,30 +523,30 @@ def __init__( def tasks(self): yield Task("run", resume="run", rqmt=self.rqmt) - def readnum_segments(self): - for filename in self.numSegmentFiles: + def read_num_segments(self): + for filename in self.num_segment_files: with open(tk.uncached_path(filename), "rb") as f: self.num_segments.append(pickle.load(f)) - def calculateWeightedAverages(self): - coeffs = [self.num_segments[i] / np.sum(self.num_segments) for i in range(len(self.numSegmentFiles))] - for filename in self.diphoneFiles: + def calculate_weighted_averages(self): + coeffs = [self.num_segments[i] / np.sum(self.num_segments) for i in range(len(self.num_segment_files))] + for filename in self.diphone_files: with open(tk.uncached_path(filename), "rb") as f: - diphoneDict = 
pickle.load(f) + diphone_dict = pickle.load(f) for i in range(self.label_info.n_contexts): - self.diphoneMeans[i].append(np.dot(coeffs[self.diphoneFiles.index(filename)], diphoneDict[i])) - for filename in self.contextFiles: + self.diphoneMeans[i].append(np.dot(coeffs[self.diphone_files.index(filename)], diphone_dict[i])) + for filename in self.context_files: with open(tk.uncached_path(filename), "rb") as f: means = pickle.load(f) - self.contextMeans.append(np.dot(coeffs[self.contextFiles.index(filename)], means)) + self.context_means.append(np.dot(coeffs[self.context_files.index(filename)], means)) for i in range(self.label_info.n_contexts): self.diphoneMeans[i] = np.sum(self.diphoneMeans[i], axis=0) - self.contextMeans = np.sum(self.contextMeans, axis=0) + self.context_means = np.sum(self.context_means, axis=0) def setSilenceAndNonWordValues(self): # context vectors - sil = sum([self.contextMeans[i] for i in self.silBoundaryIndices]) - noise = sum([self.contextMeans[i] for i in self.nonWordIndices]) + sil = sum([self.context_means[i] for i in self.silBoundaryIndices]) + noise = sum([self.context_means[i] for i in self.nonWordIndices]) # center given context vectors meansListSil = [self.diphoneMeans[i] for i in self.silBoundaryIndices] @@ -455,24 +555,24 @@ def setSilenceAndNonWordValues(self): dpNoise = [sum(x) for x in zip(*meansListNonword)] for i in self.silBoundaryIndices: - self.contextMeans[i] = sil + self.context_means[i] = sil self.diphoneMeans[i] = dpSil for i in self.nonWordIndices: - self.contextMeans[i] = noise + self.context_means[i] = noise self.diphoneMeans[i] = dpNoise def setSilenceValues(self): - sil = sum([self.contextMeans[i] for i in self.silBoundaryIndices]) + sil = sum([self.context_means[i] for i in self.silBoundaryIndices]) # center given context vectors meansListSil = [self.diphoneMeans[i] for i in self.silBoundaryIndices] dpSil = [np.sum(x) for x in zip(*meansListSil)] for i in self.silBoundaryIndices: - self.contextMeans[i] = sil + 
self.context_means[i] = sil self.diphoneMeans[i] = dpSil - def dumpXml(self): + def dump_xml(self): perturbation = 1e-8 with open(tk.uncached_path(self.diphoneXml), "wt") as f: f.write( @@ -484,25 +584,25 @@ def dumpXml(self): f.write(" ".join("%.20e" % math.log(s) for s in self.diphoneMeans[i]) + "\n") f.write("") with open(tk.uncached_path(self.contextXml), "wt") as f: - self.contextMeans[self.contextMeans == 0] = perturbation + self.context_means[self.context_means == 0] = perturbation f.write('\n\n' % (self.label_info.n_contexts)) - f.write(" ".join("%.20e" % math.log(s) for s in np.nditer(self.contextMeans)) + "\n") + f.write(" ".join("%.20e" % math.log(s) for s in np.nditer(self.context_means)) + "\n") f.write("") def dumpPickle(self): with open("/u/raissi/experiments/notebooks/diphones.pickle", "wb") as fp: pickle.dump(self.diphoneMeans, fp, protocol=pickle.HIGHEST_PROTOCOL) with open("/u/raissi/experiments/notebooks/context.pickle", "wb") as fp: - pickle.dump(self.contextMeans, fp, protocol=pickle.HIGHEST_PROTOCOL) + pickle.dump(self.context_means, fp, protocol=pickle.HIGHEST_PROTOCOL) def run(self): - self.readnum_segments() - self.calculateWeightedAverages() + self.read_num_segments() + self.calculate_weighted_averages() if self.adjustSilence: if self.adjustNonWord: self.setSilenceAndNonWordValues() else: self.setSilenceValues() - self.dumpXml() - self.dumpPickle() \ No newline at end of file + self.dump_xml() + self.dumpPickle() diff --git a/users/raissi/setups/common/helpers/priors/factored_estimation.py b/users/raissi/setups/common/helpers/priors/factored_estimation.py index d177daa2d..b7036b83f 100644 --- a/users/raissi/setups/common/helpers/priors/factored_estimation.py +++ b/users/raissi/setups/common/helpers/priors/factored_estimation.py @@ -1,32 +1,153 @@ - -def get_diphone_priors(graphPath, model, dataPaths, datasetIndices, - nStateClasses=141, nContexts=47, gpu=1, time=20, isSilMapped=True, name=None, nBatch=10000, tf_library=None, tm=None): 
+import numpy as np +from typing import List + +from sisyphus import * + + +from i6_experiments.users.raissi.setups.common.data.factored_label import LabelInfo +from i6_experiments.users.raissi.setups.common.decoder.BASE_factored_hybrid_search import DecodingTensorMap +from i6_experiments.users.raissi.setups.common.helpers.priors.estimate_povey_like_prior_fh import ( + EstimateFactoredTriphonePriorsJob, + CombineMeansForTriphoneForward, + DumpXmlForTriphoneForwardJob, +) + +from i6_experiments.users.raissi.setups.common.helpers.priors.util import PartitionDataSetup + +Path = setup_path(__package__) +RANDOM_SEED = 42 + + +def get_triphone_priors( + name: str, + graph_path: Path, + model_path: Path, + data_paths: List[Path], + label_info: LabelInfo, + tensor_map: DecodingTensorMap, + partition_data_setup: PartitionDataSetup, + tf_library=None, + n_batch=10000, + cpu: int = 2, + gpu: int = 1, + time: int = 1, +): + + triphone_files = [] + diphone_files = [] + context_files = [] + num_segments = [] + + np.random.seed(RANDOM_SEED) + for i in np.random.choice(range(len(data_paths)//partition_data_setup.data_offset), partition_data_setup.n_data_indices, replace=False): + start_ind = i * partition_data_setup.data_offset + end_ind = (i + 1) * partition_data_setup.data_offset + for j in range(partition_data_setup.n_segment_indices): + start_ind_seg = j * partition_data_setup.segment_offset + end_ind_seg = (j + 1) * partition_data_setup.segment_offset + # if end_ind_seg > 1248: end_ind_seg = 1248 + data_indices = list(range(start_ind, end_ind)) + estimateJob = EstimateFactoredTriphonePriorsJob( + graph_path=graph_path, + model_path=model_path, + tensor_map=tensor_map, + data_paths=data_paths, + data_indices=data_indices, + start_ind_segment=start_ind_seg, + end_ind_segment=end_ind_seg, + label_info=label_info, + tf_library_path=tf_library, + n_batch=n_batch, + cpu=cpu, + gpu=gpu, + time=time, + ) + if name is not None: + 
estimateJob.add_alias(f"priors/{name}-{data_indices}_{start_ind_seg}") + triphone_files.extend(estimateJob.triphone_files) + diphone_files.extend(estimateJob.diphone_files) + context_files.extend(estimateJob.context_files) + num_segments.extend(estimateJob.num_segments) + + comb_jobs = [] + for spliter in range(0, len(triphone_files), partition_data_setup.split_step): + start = spliter + end = min(spliter + partition_data_setup.split_step, len(triphone_files)) + comb_jobs.append( + CombineMeansForTriphoneForward( + triphone_files=triphone_files[start:end], + diphone_files=diphone_files[start:end], + context_files=context_files[start:end], + num_segment_files=num_segments[start:end], + label_info=label_info, + ) + ) + + comb_triphone_files = [c.triphone_files_out for c in comb_jobs] + comb_diphone_files = [c.diphone_files_out for c in comb_jobs] + comb_context_files = [c.context_files_out for c in comb_jobs] + comb_num_segs = [c.num_segments_out for c in comb_jobs] + xmlJob = DumpXmlForTriphoneForwardJob( + triphone_files=comb_triphone_files, + diphone_files=comb_diphone_files, + context_files=comb_context_files, + num_segment_files=comb_num_segs, + label_info=label_info + ) + + prior_files_triphone = [xmlJob.triphone_xml, xmlJob.diphone_xml, xmlJob.context_xml] + xml_name = f"priors/{name}" + tk.register_output(xml_name, prior_files_triphone[0]) + + return prior_files_triphone + + +# needs refactoring +def get_diphone_priors( + graph_path, + model_path, + data_paths, + data_indices, + nStateClasses=141, + nContexts=47, + gpu=1, + time=20, + isSilMapped=True, + name=None, + n_batch=10000, + tf_library=None, + tensor_map=None, +): if tf_library is None: tf_library = libraryPath - if tm is None: - tm = defaultTfMap - - estimateJob = EstimateSprintDiphoneAndContextPriors(graphPath, - model, - dataPaths, - datasetIndices, - tf_library, - nContexts=nContexts, - nStateClasses=nStateClasses, - gpu=gpu, - time=time, - tensorMap=tm, - nBatch=nBatch ,) + if tensor_map is 
None: + tensor_map = defaultTfMap + + estimateJob = EstimateSprintDiphoneAndContextPriors( + graph_path, + model_path, + data_paths, + data_indices, + tf_library, + nContexts=nContexts, + nStateClasses=nStateClasses, + gpu=gpu, + time=time, + tensorMap=tensor_map, + n_batch=n_batch, + ) if name is not None: estimateJob.add_alias(f"priors/{name}") - xmlJob = DumpXmlSprintForDiphone(estimateJob.diphoneFiles, - estimateJob.contextFiles, - estimateJob.numSegments, - nContexts=nContexts, - nStateClasses=nStateClasses, - adjustSilence=isSilMapped) + xmlJob = DumpXmlSprintForDiphone( + estimateJob.diphone_files, + estimateJob.context_files, + estimateJob.num_segments, + nContexts=nContexts, + nStateClasses=nStateClasses, + adjustSilence=isSilMapped, + ) priorFiles = [xmlJob.diphoneXml, xmlJob.contextXml] @@ -34,85 +155,3 @@ def get_diphone_priors(graphPath, model, dataPaths, datasetIndices, tk.register_output(xmlName, priorFiles[0]) return priorFiles - - - -def get_triphone_priors(graphPath, model, dataPaths, nStateClasses=282, nContexts=47, nPhones=47, nStates=3, - cpu=2, gpu=1, time=1, nBatch=18000, dNum=3, sNum=20, step=200, dataOffset=10, segmentOffset=10, - name=None, tf_library=None, tm=None, isMulti=False): - if tf_library is None: - tf_library = libraryPath - if tm is None: - tm = defaultTfMap - - triphoneFiles = [] - diphoneFiles = [] - contextFiles = [] - numSegments = [] - - - for i in range(2, dNum + 2): - startInd = i * dataOffset - endInd = (i + 1) * dataOffset - for j in range(sNum): - startSegInd = j * segmentOffset - endSegInd = (j + 1) * segmentOffset - if endSegInd > 1248: endSegInd = 1248 - - datasetIndices = list(range(startInd, endInd)) - estimateJob = EstimateSprintTriphonePriorsForward(graphPath, - model, - dataPaths, - datasetIndices, - startSegInd, endSegInd, - tf_library, - nContexts=nContexts, - nStateClasses=nStateClasses, - nStates=nStates, - nPhones=nPhones, - nBatch=nBatch, - cpu=cpu, - gpu=gpu, - time=time, - tensorMap=tm, - 
isMultiEncoder=isMulti) - if name is not None: - estimateJob.add_alias(f"priors/{name}-startind{startSegInd}") - triphoneFiles.extend(estimateJob.triphoneFiles) - diphoneFiles.extend(estimateJob.diphoneFiles) - contextFiles.extend(estimateJob.contextFiles) - numSegments.extend(estimateJob.numSegments) - - - - comJobs = [] - for spliter in range(0, len(triphoneFiles), step): - start = spliter - end = spliter + step - if end > len(triphoneFiles): - end = triphoneFiles - comJobs.append(CombineMeansForTriphoneForward(triphoneFiles[start:end], - diphoneFiles[start:end], - contextFiles[start:end], - numSegments[start:end], - nContexts=nContexts, - nStates=nStateClasses, - )) - - combTriphoneFiles = [c.triphoneFilesOut for c in comJobs] - combDiphoneFiles = [c.diphoneFilesOut for c in comJobs] - combContextFiles = [c.contextFilesOut for c in comJobs] - combNumSegs = [c.numSegmentsOut for c in comJobs] - xmlJob = DumpXmlForTriphoneForward(combTriphoneFiles, - combDiphoneFiles, - combContextFiles, - combNumSegs, - nContexts=nContexts, - nStates=nStateClasses) - - priorFilesTriphone = [xmlJob.triphoneXml, xmlJob.diphoneXml, xmlJob.contextXml] - xmlName = f"priors/{name}" - tk.register_output(xmlName, priorFilesTriphone[0]) - - - return priorFilesTriphone \ No newline at end of file diff --git a/users/raissi/setups/common/helpers/priors/transcription.py b/users/raissi/setups/common/helpers/priors/transcription.py index 8bb8c01b7..f8879ada1 100644 --- a/users/raissi/setups/common/helpers/priors/transcription.py +++ b/users/raissi/setups/common/helpers/priors/transcription.py @@ -1,56 +1,82 @@ -__all__ = ["get_mono_transcription_priors"] +from sisyphus import * +from sisyphus.tools import try_get -import numpy as np -from typing import Iterator, List -import pickle +import os -from sisyphus import Job, Task +from i6_core.corpus.transform import ApplyLexiconToCorpusJob +from i6_core.lexicon.allophones import DumpStateTyingJob +from i6_core.lexicon.modification import 
AddEowPhonemesToLexiconJob -from i6_experiments.users.raissi.setups.common.decoder.config import PriorInfo, PriorConfig -from i6_experiments.users.raissi.setups.common.helpers.priors.util import write_prior_xml +from i6_experiments.users.mann.experimental.statistics import AllophoneCounts +from i6_experiments.users.mann.setups.prior import PriorFromTranscriptionCounts -pickles = { - ( - 1, - False, - ): "/work/asr4/raissi/setups/librispeech/960-ls/dependencies/priors/daniel/monostate/monostate.pickle", - ( - 1, - True, - ): "/work/asr4/raissi/setups/librispeech/960-ls/dependencies/priors/daniel/monostate/monostate.we.pickle", - ( - 3, - False, - ): "/work/asr4/raissi/setups/librispeech/960-ls/dependencies/priors/daniel/threepartite/threepartite.pickle", - ( - 3, - True, - ): "/work/asr4/raissi/setups/librispeech/960-ls/dependencies/priors/daniel/threepartite/threepartite.we.pickle", -} +def output(name, value): + opath = os.path.join(fname, name) + if isinstance(value, dict): + tk.register_report(opath, DescValueReport(value)) + return + tk.register_report(opath, SimpleValueReport(value)) -class LoadTranscriptionPriorsJob(Job): - def __init__(self, n: int, eow: bool): - assert n in [1, 3] +from sisyphus.delayed_ops import DelayedBase - self.n = n - self.eow = eow +class DelayedGetDefault(DelayedBase): + def __init__(self, a, b, default=None): + super().__init__(a, b) + self.default = default - self.out_priors = self.output_path("priors.xml") + def get(self): + try: + return try_get(self.a)[try_get(self.b)] + except KeyError: + return self.default - def tasks(self) -> Iterator[Task]: - yield Task("run", mini_task=True) - def run(self): - file = pickles[(self.n, self.eow)] +def get_prior_from_transcription( + crp, + total_frames, + average_phoneme_frames, + epsilon=1e-12, + lemma_end_probability=0.0, - with open(file, "rb") as f: - priors: List[float] = pickle.load(f) +): - write_prior_xml(log_priors=np.log(priors), path=self.out_priors) + lexicon_w_we = 
AddEowPhonemesToLexiconJob( + crp.lexicon_config.file, + boundary_marker=" #", # the prepended space is important + ) + corpus = crp.corpus_config.file + if not isinstance(crp.corpus_config.file, tk.Path): + corpus = tk.Path(crp.corpus_config.file) -def get_mono_transcription_priors(states_per_phone: int, with_word_end: bool) -> PriorInfo: - load_j = LoadTranscriptionPriorsJob(states_per_phone, with_word_end) - return PriorInfo(center_state_prior=PriorConfig(file=load_j.out_priors, scale=0.0)) + + transcribe_job = ApplyLexiconToCorpusJob( + corpus, + lexicon_w_we.out_lexicon, + ) + + count_phonemes = AllophoneCounts( + transcribe_job.out_corpus, + lemma_end_probability=lemma_end_probability, + ) + + state_tying_file = DumpStateTyingJob(crp).out_state_tying + + + + prior_job = PriorFromTranscriptionCounts( + allophone_counts=count_phonemes.counts, + total_count=count_phonemes.total, + state_tying=state_tying_file, + average_phoneme_frames=average_phoneme_frames, + num_frames=total_frames, + eps=epsilon, + ) + + return { + "txt": prior_job.out_prior_txt_file, + "xml": prior_job.out_prior_xml_file, + "png": prior_job.out_prior_png_file + } \ No newline at end of file diff --git a/users/raissi/setups/common/helpers/priors/util.py b/users/raissi/setups/common/helpers/priors/util.py index 77f6f6e33..c90fb2339 100644 --- a/users/raissi/setups/common/helpers/priors/util.py +++ b/users/raissi/setups/common/helpers/priors/util.py @@ -2,12 +2,21 @@ from dataclasses import dataclass import numpy as np -from typing import List, Tuple, Union +from typing import List, Tuple, Union import xml.etree.ElementTree as ET from sisyphus import Path +@dataclass(frozen=True, eq=True) +class PartitionDataSetup: + n_segment_indices: int = 20 + n_data_indices: int = 3 + segment_offset: int = 10 + data_offset: int = 10 + split_step: int = 200 + + @dataclass(frozen=True, eq=True) class ParsedPriors: priors_log: List[float] @@ -81,4 +90,4 @@ def get_batch_from_segments(segments: List, 
batchSize=10000): yield segments[index * batchSize : (index + 1) * batchSize] index += 1 except IndexError: - index = 0 \ No newline at end of file + index = 0 diff --git a/users/raissi/setups/common/util/tdp.py b/users/raissi/setups/common/util/tdp.py index 5833c99d2..7fc7cc4c5 100644 --- a/users/raissi/setups/common/util/tdp.py +++ b/users/raissi/setups/common/util/tdp.py @@ -3,7 +3,7 @@ from typing import Union, Tuple from sisyphus import tk -from sisyphus.delayed_ops import DelayedBase +from sisyphus.delayed_ops import DelayedBase, DelayedGetItem from i6_experiments.common.setups.rasr.config.am_config import Tdp from i6_experiments.users.raissi.setups.common.data.typings import TDP @@ -14,6 +14,8 @@ def to_tdp(tdp_tuple: Tuple[TDP, TDP, TDP, TDP]) -> Tdp: def format_tdp_val(val) -> str: + if isinstance(val, DelayedGetItem): + val = val.get() return "inf" if val == "infinity" else f"{val}" diff --git a/users/raissi/setups/librispeech/decoder/LBS_factored_hybrid_search.py b/users/raissi/setups/librispeech/decoder/LBS_factored_hybrid_search.py index 2bca15a42..736be9b91 100644 --- a/users/raissi/setups/librispeech/decoder/LBS_factored_hybrid_search.py +++ b/users/raissi/setups/librispeech/decoder/LBS_factored_hybrid_search.py @@ -97,6 +97,7 @@ def __init__( lm_gc_simple_hash=lm_gc_simple_hash, gpu=gpu, ) + self.trafo_lm_config = self.get_eugen_trafo_with_quant_and_compress_config() def get_ls_kazuki_lstm_lm_config( self, @@ -115,7 +116,7 @@ def get_ls_kazuki_lstm_lm_config( state_manager="lstm", ).get() - def get_eugen_trafo_config( + def get_eugen_trafo_with_quant_and_compress_config( self, min_batch_size: int = 0, opt_batch_size: int = 64, @@ -229,6 +230,62 @@ def get_eugen_trafo_config( return trafo_config + def get_eugen_trafo_config( + self, + min_batch_size: int = 0, + opt_batch_size: int = 64, + max_batch_size: int = 64, + scale: Optional[float] = None, + ) -> rasr.RasrConfig: + # assert self.library_path is not None + + + trafo_config = rasr.RasrConfig() + 
+ trafo_config.min_batch_size = min_batch_size + trafo_config.opt_batch_size = opt_batch_size + trafo_config.max_batch_size = max_batch_size + trafo_config.allow_reduced_history = True + if scale is not None: + trafo_config.scale = scale + trafo_config.type = "tfrnn" + trafo_config.vocab_file = tk.Path("/work/asr3/raissi/shared_workspaces/gunz/dependencies/ls-eugen-trafo-lm/vocabulary", cached=True) + trafo_config.transform_output_negate = True + trafo_config.vocab_unknown_word = "" + + trafo_config.input_map.info_0.param_name = "word" + trafo_config.input_map.info_0.tensor_name = "extern_data/placeholders/delayed/delayed" + trafo_config.input_map.info_0.seq_length_tensor_name = "extern_data/placeholders/delayed/delayed_dim0_size" + + trafo_config.input_map.info_1.param_name = "state-lengths" + trafo_config.input_map.info_1.tensor_name = "output/rec/dec_0_self_att_att/state_lengths" + + trafo_config.loader.type = "meta" + trafo_config.loader.meta_graph_file = ( + "/work/asr4/raissi/setups/librispeech/960-ls/dependencies/trafo-lm_eugen/integrated_fixup_graph_no_cp_no_quant.meta" + ) + model_path = "/work/asr3/raissi/shared_workspaces/gunz/dependencies/ls-eugen-trafo-lm/epoch.030" + trafo_config.loader.saved_model_file = rasr.StringWrapper(model_path, f"{model_path}.index") + trafo_config.loader.required_libraries = self.library_path + + trafo_config.output_map.info_0.param_name = "softmax" + trafo_config.output_map.info_0.tensor_name = "output/rec/decoder/add" + + trafo_config.output_map.info_1.param_name = "weights" + trafo_config.output_map.info_1.tensor_name = "output/rec/output/W/read" + + trafo_config.output_map.info_2.param_name = "bias" + trafo_config.output_map.info_2.tensor_name = "output/rec/output/b/read" + + + trafo_config.state_manager.cache_prefix = True + trafo_config.state_manager.min_batch_size = min_batch_size + trafo_config.state_manager.min_common_prefix_length = 0 + trafo_config.state_manager.type = "transformer" + 
trafo_config.softmax_adapter.type = "blas-nce" + + return trafo_config + def recognize_ls_trafo_lm( self, *, @@ -265,7 +322,7 @@ def recognize_ls_trafo_lm( is_nn_lm=True, keep_value=keep_value, label_info=label_info, - lm_config=self.get_eugen_trafo_config(), + lm_config=self.trafo_lm_config, name_override=name_override, name_prefix=name_prefix, num_encoder_output=num_encoder_output, diff --git a/users/raissi/setups/librispeech/helpers/__init__.py b/users/raissi/setups/librispeech/helpers/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/users/raissi/setups/librispeech/helpers/priors/__init__.py b/users/raissi/setups/librispeech/helpers/priors/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/users/raissi/setups/librispeech/helpers/priors/transcription.py b/users/raissi/setups/librispeech/helpers/priors/transcription.py new file mode 100644 index 000000000..81ea888e3 --- /dev/null +++ b/users/raissi/setups/librispeech/helpers/priors/transcription.py @@ -0,0 +1,56 @@ +__all__ = ["get_mono_transcription_priors"] + +import numpy as np +from typing import Iterator, List +import pickle + +from sisyphus import Job, Task + +from i6_experiments.users.raissi.setups.common.decoder.config import PriorInfo, PriorConfig +from i6_experiments.users.raissi.setups.common.helpers.priors.util import write_prior_xml + + +pickles = { + ( + 1, + False, + ): "/work/asr4/raissi/setups/librispeech/960-ls/dependencies/priors/daniel/monostate/monostate.pickle", + ( + 1, + True, + ): "/work/asr4/raissi/setups/librispeech/960-ls/dependencies/priors/daniel/monostate/monostate.we.pickle", + ( + 3, + False, + ): "/work/asr4/raissi/setups/librispeech/960-ls/dependencies/priors/daniel/threepartite/threepartite.pickle", + ( + 3, + True, + ): "/work/asr4/raissi/setups/librispeech/960-ls/dependencies/priors/daniel/threepartite/threepartite.we.pickle", +} + + +class LoadTranscriptionPriorsJob(Job): + def __init__(self, n: int, eow: bool): + assert n in [1, 
3] + + self.n = n + self.eow = eow + + self.out_priors = self.output_path("priors.xml") + + def tasks(self) -> Iterator[Task]: + yield Task("run", mini_task=True) + + def run(self): + file = pickles[(self.n, self.eow)] + + with open(file, "rb") as f: + priors: List[float] = pickle.load(f) + + write_prior_xml(log_priors=np.log(priors), path=self.out_priors) + + +def get_mono_transcription_priors(states_per_phone: int, with_word_end: bool) -> PriorInfo: + load_j = LoadTranscriptionPriorsJob(states_per_phone, with_word_end) + return PriorInfo(center_state_prior=PriorConfig(file=load_j.out_priors, scale=0.0)) \ No newline at end of file diff --git a/users/raissi/utils/default_tools.py b/users/raissi/utils/default_tools.py index 47d9ee5b3..baef3b003 100644 --- a/users/raissi/utils/default_tools.py +++ b/users/raissi/utils/default_tools.py @@ -93,6 +93,7 @@ def get_rasr_binary_path(rasr_path): hash_overwrite="CONFORMER_RETURNN_Len_FIX", ) RETURNN_ROOT_TORCH = tk.Path("/work/tools/users/raissi/returnn_versions/torch", hash_overwrite="TORCH_RETURNN_ROOT") +RETURNN_ROOT_BW_FACTORED = tk.Path("/work/tools/users/raissi/returnn_versions/bw-factored", hash_overwrite="BW_RETURNN_ROOT") SCTK_BINARY_PATH = compile_sctk(branch="v2.4.12") # use last published version SCTK_BINARY_PATH.hash_overwrite = "DEFAULT_SCTK_BINARY_PATH" From d45e75c91a171b8d4e5459223d31b6bd578821ac Mon Sep 17 00:00:00 2001 From: marvin84 Date: Tue, 4 Jun 2024 14:09:50 +0200 Subject: [PATCH 089/227] deleted wrong stashed --- common/datasets/tedlium2_v2/corpus.py | 136 ---------------- common/datasets/tedlium2_v2/download.py | 48 ------ common/datasets/tedlium2_v2/export.py | 96 ----------- common/datasets/tedlium2_v2/lexicon.py | 171 -------------------- common/datasets/tedlium2_v2/textual_data.py | 39 ----- common/datasets/tedlium2_v2/vocab.py | 51 ------ 6 files changed, 541 deletions(-) delete mode 100644 common/datasets/tedlium2_v2/corpus.py delete mode 100644 common/datasets/tedlium2_v2/download.py delete 
mode 100644 common/datasets/tedlium2_v2/export.py delete mode 100644 common/datasets/tedlium2_v2/lexicon.py delete mode 100644 common/datasets/tedlium2_v2/textual_data.py delete mode 100644 common/datasets/tedlium2_v2/vocab.py diff --git a/common/datasets/tedlium2_v2/corpus.py b/common/datasets/tedlium2_v2/corpus.py deleted file mode 100644 index f74a7acbf..000000000 --- a/common/datasets/tedlium2_v2/corpus.py +++ /dev/null @@ -1,136 +0,0 @@ -import os -from functools import lru_cache -from typing import Dict, Optional, Any - -from sisyphus import tk - -from i6_core.audio.encoding import BlissChangeEncodingJob - -from i6_core.meta import CorpusObject - -from ..tedlium2.constants import DURATIONS -from .download import download_data_dict - - -@lru_cache() -def get_bliss_corpus_dict(audio_format: str = "wav", output_prefix: str = "datasets") -> Dict[str, tk.Path]: - """ - creates a dictionary of all corpora in the TedLiumV2 dataset in the bliss xml format - - :param audio_format: options: wav, ogg, flac, sph, nist. nist (NIST sphere format) and sph are the same. 
- :param output_prefix: - :return: - """ - assert audio_format in ["flac", "ogg", "wav", "sph", "nist"] - - output_prefix = os.path.join(output_prefix, "Ted-Lium-2") - - bliss_corpus_dict = download_data_dict(output_prefix=output_prefix).bliss_nist - - audio_format_options = { - "wav": { - "output_format": "wav", - "codec": "pcm_s16le", - }, - "ogg": {"output_format": "ogg", "codec": "libvorbis"}, - "flac": {"output_format": "flac", "codec": "flac"}, - } - - converted_bliss_corpus_dict = {} - if audio_format not in ["sph", "nist"]: - for corpus_name, sph_corpus in bliss_corpus_dict.items(): - bliss_change_encoding_job = BlissChangeEncodingJob( - corpus_file=sph_corpus, - sample_rate=16000, - recover_duration=False, - **audio_format_options[audio_format], - ) - bliss_change_encoding_job.add_alias( - os.path.join( - output_prefix, - "%s_conversion" % audio_format, - corpus_name, - ) - ) - converted_bliss_corpus_dict[corpus_name] = bliss_change_encoding_job.out_corpus - else: - converted_bliss_corpus_dict = bliss_corpus_dict - - return converted_bliss_corpus_dict - - -@lru_cache() -def get_corpus_object_dict(audio_format: str = "flac", output_prefix: str = "datasets") -> Dict[str, CorpusObject]: - """ - creates a dict of all corpora in the TedLiumV2 dataset as a `meta.CorpusObject` - - :param audio_format: options: wav, ogg, flac, sph, nist. nist (NIST sphere format) and sph are the same. 
- :param output_prefix: - :return: - """ - bliss_corpus_dict = get_bliss_corpus_dict(audio_format=audio_format, output_prefix=output_prefix) - - corpus_object_dict = {} - - for corpus_name, bliss_corpus in bliss_corpus_dict.items(): - corpus_object = CorpusObject() - corpus_object.corpus_file = bliss_corpus - corpus_object.audio_format = audio_format - corpus_object.audio_dir = None - corpus_object.duration = DURATIONS[corpus_name] - - corpus_object_dict[corpus_name] = corpus_object - - return corpus_object_dict - - -@lru_cache() -def get_stm_dict(output_prefix: str = "datasets") -> Dict[str, tk.Path]: - """ - fetches the STM files for TedLiumV2 dataset - - :param output_prefix: - :return: - """ - return download_data_dict(output_prefix=output_prefix).stm - - -def get_ogg_zip_dict( - subdir_prefix: str = "datasets", - returnn_python_exe: Optional[tk.Path] = None, - returnn_root: Optional[tk.Path] = None, - bliss_to_ogg_job_rqmt: Optional[Dict[str, Any]] = None, - extra_args: Optional[Dict[str, Dict[str, Any]]] = None, -) -> Dict[str, tk.Path]: - """ - Get a dictionary containing the paths to the ogg_zip for each corpus part. - - No outputs will be registered. 
- - :param subdir_prefix: dir name prefix for aliases and outputs - :param returnn_python_exe: path to returnn python executable - :param returnn_root: python to returnn root - :param bliss_to_ogg_job_rqmt: rqmt for bliss to ogg job - :param extra_args: extra args for each dataset for bliss to ogg job - :return: dictionary with ogg zip paths for each corpus (train, dev, test) - """ - from i6_core.returnn.oggzip import BlissToOggZipJob - - ogg_zip_dict = {} - bliss_corpus_dict = get_bliss_corpus_dict(audio_format="wav", output_prefix=subdir_prefix) - if extra_args is None: - extra_args = {} - for name, bliss_corpus in bliss_corpus_dict.items(): - ogg_zip_job = BlissToOggZipJob( - bliss_corpus, - no_conversion=False, # cannot be used for corpus with multiple segments per recording - returnn_python_exe=returnn_python_exe, - returnn_root=returnn_root, - **extra_args.get(name, {}), - ) - if bliss_to_ogg_job_rqmt: - ogg_zip_job.rqmt = bliss_to_ogg_job_rqmt - ogg_zip_job.add_alias(os.path.join(subdir_prefix, "Ted-Lium-2", "%s_ogg_zip_job" % name)) - ogg_zip_dict[name] = ogg_zip_job.out_ogg_zip - - return ogg_zip_dict diff --git a/common/datasets/tedlium2_v2/download.py b/common/datasets/tedlium2_v2/download.py deleted file mode 100644 index 948224ae7..000000000 --- a/common/datasets/tedlium2_v2/download.py +++ /dev/null @@ -1,48 +0,0 @@ -import os -from dataclasses import dataclass -from functools import lru_cache -from typing import Any, Dict - -from sisyphus import tk - -from i6_core.datasets.tedlium2 import ( - DownloadTEDLIUM2CorpusJob, - CreateTEDLIUM2BlissCorpusJobV2, -) - - -@dataclass(frozen=True) -class TedLium2Data: - """Class for storing the TedLium2 data""" - - data_dir: Dict[str, tk.Path] - lm_dir: tk.Path - vocab: tk.Path - bliss_nist: Dict[str, tk.Path] - stm: Dict[str, tk.Path] - - -@lru_cache() -def download_data_dict(output_prefix: str = "datasets") -> TedLium2Data: - """ - downloads the TedLiumV2 dataset and performs the initial data processing steps - 
Uses the fixed job CreateTEDLIUM2BlissCorpusJobV2 from: https://github.com/rwth-i6/i6_core/pull/490 - - :param output_prefix: - :return: - """ - download_tedlium2_job = DownloadTEDLIUM2CorpusJob() - download_tedlium2_job.add_alias(os.path.join(output_prefix, "download", "raw_corpus_job")) - - bliss_corpus_tedlium2_job = CreateTEDLIUM2BlissCorpusJobV2(download_tedlium2_job.out_corpus_folders) - bliss_corpus_tedlium2_job.add_alias(os.path.join(output_prefix, "create_bliss", "bliss_corpus_job")) - - tl2_data = TedLium2Data( - data_dir=download_tedlium2_job.out_corpus_folders, - lm_dir=download_tedlium2_job.out_lm_folder, - vocab=download_tedlium2_job.out_vocab_dict, - bliss_nist=bliss_corpus_tedlium2_job.out_corpus_files, - stm=bliss_corpus_tedlium2_job.out_stm_files, - ) - - return tl2_data diff --git a/common/datasets/tedlium2_v2/export.py b/common/datasets/tedlium2_v2/export.py deleted file mode 100644 index 1919fa8c0..000000000 --- a/common/datasets/tedlium2_v2/export.py +++ /dev/null @@ -1,96 +0,0 @@ -import os - -from sisyphus import tk - -from .corpus import get_bliss_corpus_dict, get_stm_dict -from .lexicon import get_bliss_lexicon, get_g2p_augmented_bliss_lexicon -from .textual_data import get_text_data_dict - -TEDLIUM_PREFIX = "Ted-Lium-2" - - -def _export_datasets(output_prefix: str = "datasets"): - """ - exports all datasets for TedLiumV2 with all available audio formats - - :param output_prefix: - :return: - """ - for audio_format in ["flac", "ogg", "wav", "nist", "sph"]: - bliss_corpus_dict = get_bliss_corpus_dict(audio_format=audio_format, output_prefix=output_prefix) - for name, bliss_corpus in bliss_corpus_dict.items(): - tk.register_output( - os.path.join( - output_prefix, - TEDLIUM_PREFIX, - "corpus", - f"{name}-{audio_format}.xml.gz", - ), - bliss_corpus, - ) - - -def _export_stms(output_prefix: str = "datasets"): - """ - exports all STMs for TedLiumV2 - - :param output_prefix: - :return: - """ - stm_dict = get_stm_dict(output_prefix=output_prefix) 
- for name, stm_file in stm_dict.items(): - tk.register_output( - os.path.join( - output_prefix, - TEDLIUM_PREFIX, - "stm", - f"{name}.txt", - ), - stm_file, - ) - - -def _export_text_data(output_prefix: str = "datasets"): - """ - exports all the textual data for TedLiumV2 dataset - - :param output_prefix: - :return: - """ - txt_data_dict = get_text_data_dict(output_prefix=output_prefix) - for k, v in txt_data_dict.items(): - tk.register_output(os.path.join(output_prefix, TEDLIUM_PREFIX, "text_data", f"{k}.gz"), v) - - -def _export_lexicon(output_prefix: str = "datasets"): - """ - exports the lexicon for TedLiumV2 - - :param output_prefix: - :return: - """ - lexicon_output_prefix = os.path.join(output_prefix, TEDLIUM_PREFIX, "lexicon") - - bliss_lexicon = get_bliss_lexicon(output_prefix=output_prefix) - tk.register_output(os.path.join(lexicon_output_prefix, "tedlium2.lexicon.xml.gz"), bliss_lexicon) - - g2p_bliss_lexicon = get_g2p_augmented_bliss_lexicon( - add_unknown_phoneme_and_mapping=False, output_prefix=output_prefix - ) - tk.register_output( - os.path.join(lexicon_output_prefix, "tedlium2.lexicon_with_g2p.xml.gz"), - g2p_bliss_lexicon, - ) - - -def export_all(output_prefix: str = "datasets"): - """ - exports everything for TedLiumV2 - - :param output_prefix: - :return: - """ - _export_datasets(output_prefix=output_prefix) - _export_stms(output_prefix=output_prefix) - _export_text_data(output_prefix=output_prefix) - _export_lexicon(output_prefix=output_prefix) diff --git a/common/datasets/tedlium2_v2/lexicon.py b/common/datasets/tedlium2_v2/lexicon.py deleted file mode 100644 index 4d8366155..000000000 --- a/common/datasets/tedlium2_v2/lexicon.py +++ /dev/null @@ -1,171 +0,0 @@ -import os -from functools import lru_cache -from sisyphus import tk - -from i6_core.lexicon import LexiconFromTextFileJob -from i6_core.lexicon.modification import WriteLexiconJob, MergeLexiconJob -from i6_core.lib import lexicon -from i6_experiments.common.helpers.g2p import 
G2PBasedOovAugmenter - -from ..tedlium2.constants import SILENCE_PHONEME, UNKNOWN_PHONEME -from .corpus import get_bliss_corpus_dict -from .download import download_data_dict - - -@lru_cache() -def _get_special_lemma_lexicon( - add_unknown_phoneme_and_mapping: bool = False, - add_silence: bool = True, -) -> lexicon.Lexicon: - """ - creates the special lemma used in RASR - - :param add_unknown_phoneme_and_mapping: adds [unknown] as label with [UNK] as phoneme and as LM token - :param add_silence: adds [silence] label with [SILENCE] phoneme, - use False for CTC/RNN-T setups without silence modelling. - :return: - """ - lex = lexicon.Lexicon() - if add_silence: - lex.add_lemma( - lexicon.Lemma( - orth=["[silence]", ""], - phon=[SILENCE_PHONEME], - synt=[], - special="silence", - eval=[[]], - ) - ) - if add_unknown_phoneme_and_mapping: - lex.add_lemma( - lexicon.Lemma( - orth=["[unknown]"], - phon=[UNKNOWN_PHONEME], - synt=[""], - special="unknown", - eval=[[]], - ) - ) - else: - lex.add_lemma( - lexicon.Lemma( - orth=["[unknown]"], - synt=[""], - special="unknown", - eval=[[]], - ) - ) - - lex.add_lemma( - lexicon.Lemma( - orth=["[sentence-begin]"], - synt=[""], - special="sentence-begin", - eval=[[]], - ) - ) - lex.add_lemma( - lexicon.Lemma( - orth=["[sentence-end]"], - synt=[""], - special="sentence-end", - eval=[[]], - ) - ) - if add_silence: - lex.add_phoneme(SILENCE_PHONEME, variation="none") - if add_unknown_phoneme_and_mapping: - lex.add_phoneme(UNKNOWN_PHONEME, variation="none") - - return lex - - -@lru_cache() -def _get_raw_bliss_lexicon( - output_prefix: str, -) -> tk.Path: - """ - downloads the vocabulary file from the TedLiumV2 dataset and creates a bliss lexicon - - :param output_prefix: - :return: - """ - vocab = download_data_dict(output_prefix=output_prefix).vocab - - convert_lexicon_job = LexiconFromTextFileJob( - text_file=vocab, - compressed=True, - ) - convert_lexicon_job.add_alias(os.path.join(output_prefix, "convert_text_to_bliss_lexicon_job")) 
- - return convert_lexicon_job.out_bliss_lexicon - - -@lru_cache() -def get_bliss_lexicon( - add_unknown_phoneme_and_mapping: bool = True, - add_silence: bool = True, - output_prefix: str = "datasets", -) -> tk.Path: - """ - merges the lexicon with special RASR tokens with the lexicon created from the downloaded TedLiumV2 vocabulary - - :param add_unknown_phoneme_and_mapping: add an unknown phoneme and mapping unknown phoneme:lemma - :param add_silence: include silence lemma and phoneme - :param output_prefix: - :return: - """ - static_lexicon = _get_special_lemma_lexicon(add_unknown_phoneme_and_mapping, add_silence) - static_lexicon_job = WriteLexiconJob(static_lexicon, sort_phonemes=True, sort_lemmata=False) - static_lexicon_job.add_alias(os.path.join(output_prefix, "static_lexicon_job")) - - raw_tedlium2_lexicon = _get_raw_bliss_lexicon(output_prefix=output_prefix) - - merge_lexicon_job = MergeLexiconJob( - bliss_lexica=[ - static_lexicon_job.out_bliss_lexicon, - raw_tedlium2_lexicon, - ], - sort_phonemes=True, - sort_lemmata=True, - compressed=True, - ) - merge_lexicon_job.add_alias(os.path.join(output_prefix, "merge_lexicon_job")) - - return merge_lexicon_job.out_bliss_lexicon - - -@lru_cache() -def get_g2p_augmented_bliss_lexicon( - add_unknown_phoneme_and_mapping: bool = False, - add_silence: bool = True, - audio_format: str = "wav", - output_prefix: str = "datasets", -) -> tk.Path: - """ - augment the kernel lexicon with unknown words from the training corpus - - :param add_unknown_phoneme_and_mapping: add an unknown phoneme and mapping unknown phoneme:lemma - :param add_silence: include silence lemma and phoneme - :param audio_format: options: wav, ogg, flac, sph, nist. nist (NIST sphere format) and sph are the same. 
- :param output_prefix: - :return: - """ - original_bliss_lexicon = get_bliss_lexicon( - add_unknown_phoneme_and_mapping, add_silence=add_silence, output_prefix=output_prefix - ) - corpus_name = "train" - bliss_corpus = get_bliss_corpus_dict(audio_format=audio_format, output_prefix=output_prefix)[corpus_name] - - g2p_augmenter = G2PBasedOovAugmenter( - original_bliss_lexicon=original_bliss_lexicon, - train_lexicon=original_bliss_lexicon, - ) - augmented_bliss_lexicon = g2p_augmenter.get_g2p_augmented_bliss_lexicon( - bliss_corpus=bliss_corpus, - corpus_name=corpus_name, - alias_path=os.path.join(output_prefix, "g2p"), - casing="lower", - ) - - return augmented_bliss_lexicon diff --git a/common/datasets/tedlium2_v2/textual_data.py b/common/datasets/tedlium2_v2/textual_data.py deleted file mode 100644 index 553489a0d..000000000 --- a/common/datasets/tedlium2_v2/textual_data.py +++ /dev/null @@ -1,39 +0,0 @@ -from functools import lru_cache -from typing import Dict - -from sisyphus import tk - -from i6_core.corpus import CorpusToTxtJob -from i6_core.text import ConcatenateJob - -from i6_experiments.common.datasets.tedlium2.corpus_v2 import get_bliss_corpus_dict - -from .download import download_data_dict - - -@lru_cache() -def get_text_data_dict(output_prefix: str = "datasets") -> Dict[str, tk.Path]: - """ - gather all the textual data provided within the TedLiumV2 dataset - - :param output_prefix: - :return: - """ - lm_dir = download_data_dict(output_prefix=output_prefix).lm_dir - - text_corpora = [ - "commoncrawl-9pc", - "europarl-v7-6pc", - "giga-fren-4pc", - "news-18pc", - "news-commentary-v8-9pc", - "yandex-1m-31pc", - ] - - txt_dict = {name: lm_dir.join_right("%s.en.gz" % name) for name in text_corpora} - txt_dict["audio-transcriptions"] = CorpusToTxtJob( - get_bliss_corpus_dict(audio_format="wav", output_prefix="corpora")["train"] - ).out_txt - txt_dict["background-data"] = ConcatenateJob(list(txt_dict.values())).out - - return txt_dict diff --git 
a/common/datasets/tedlium2_v2/vocab.py b/common/datasets/tedlium2_v2/vocab.py deleted file mode 100644 index 14d4455f5..000000000 --- a/common/datasets/tedlium2_v2/vocab.py +++ /dev/null @@ -1,51 +0,0 @@ -from i6_experiments.common.helpers.text_labels.subword_nmt_bpe import ( - get_returnn_subword_nmt, - get_bpe_settings, - BPESettings, -) -from .corpus import get_bliss_corpus_dict - - -def get_subword_nmt_bpe(bpe_size: int, unk_label: str = "", subdir_prefix: str = "") -> BPESettings: - """ - Get the BPE tokens via the Returnn subword-nmt for a Tedlium2 setup. - - :param bpe_size: the number of BPE merge operations. This is NOT the resulting vocab size! - :param unk_label: unknown label symbol - :param subdir_prefix: dir name prefix for aliases and outputs - """ - subword_nmt_repo = get_returnn_subword_nmt(output_prefix=subdir_prefix) - train_corpus = get_bliss_corpus_dict()["train"] - bpe_settings = get_bpe_settings( - train_corpus, - bpe_size=bpe_size, - unk_label=unk_label, - output_prefix=subdir_prefix, - subword_nmt_repo_path=subword_nmt_repo, - ) - return bpe_settings - - -def get_subword_nmt_bpe_v2(bpe_size: int, unk_label: str = "", subdir_prefix: str = "") -> BPESettings: - """ - Get the BPE tokens via the Returnn subword-nmt for a Tedlium2 setup. - - V2: Uses subword-nmt version corrected for Apptainer related bug, adds hash overwrite for repo - - :param bpe_size: the number of BPE merge operations. This is NOT the resulting vocab size! 
- :param unk_label: unknown label symbol - :param subdir_prefix: dir name prefix for aliases and outputs - """ - subword_nmt_repo = get_returnn_subword_nmt( - commit_hash="5015a45e28a958f800ef1c50e7880c0c9ef414cf", output_prefix=subdir_prefix - ) - subword_nmt_repo.hash_overwrite = "I6_SUBWORD_NMT_V2" - train_corpus = get_bliss_corpus_dict()["train"] - bpe_settings = get_bpe_settings( - train_corpus, - bpe_size=bpe_size, - unk_label=unk_label, - output_prefix=subdir_prefix, - subword_nmt_repo_path=subword_nmt_repo, - ) - return bpe_settings From ae73db3f6e2fa37165aa738c5b5fa3e013fcaab6 Mon Sep 17 00:00:00 2001 From: "luca.gaudino" Date: Tue, 4 Jun 2024 15:50:57 +0200 Subject: [PATCH 090/227] update trainings and initial rnnt decoder rf --- .../conformer_import_moh_att_2023_06_30.py | 4 +- .../conformer_import_moh_att_train.py | 14 +- .../conformer_import_moh_att_2023_10_19.py | 8 +- .../conformer_import_moh_att_train.py | 4 +- .../librispeech_960/conformer_ctc_train.py | 2 +- .../tedlium2/conformer_ctc_train.py | 205 ++- .../tedlium2/conformer_rnnt_train.py | 55 +- .../rf/conformer_ctc/model_conformer_ctc.py | 16 +- .../asr/rf/conformer_rnnt/model_recog_rnnt.py | 1130 ++++++++--------- 9 files changed, 770 insertions(+), 668 deletions(-) diff --git a/users/gaudino/experiments/rf_conformer_att_2023/librispeech_960/conformer_import_moh_att_2023_06_30.py b/users/gaudino/experiments/rf_conformer_att_2023/librispeech_960/conformer_import_moh_att_2023_06_30.py index ab9778259..52193c721 100644 --- a/users/gaudino/experiments/rf_conformer_att_2023/librispeech_960/conformer_import_moh_att_2023_06_30.py +++ b/users/gaudino/experiments/rf_conformer_att_2023/librispeech_960/conformer_import_moh_att_2023_06_30.py @@ -639,7 +639,7 @@ def sis_run_with_prefix(prefix_name: str = None): # opls att + ctc + trafo lm + ilm for scales, prior_scale, lm_scale, ilm_scale, beam_size in product( - [(0.85, 0.15)], [0.0], [0.5], [0.3, 0.35, 0.4, 0.45], [12, 32] + [(0.8, 0.2)], [0.05, 0.03, 
0.04, 0.06, 0.07], [0.65], [0.4], [32] ): att_scale, ctc_scale = scales recog_name = ( @@ -668,7 +668,7 @@ def sis_run_with_prefix(prefix_name: str = None): task, model_with_checkpoint, model_recog, - dev_sets=["dev-other"], + dev_sets=["dev-clean", "dev-other"], model_args=model_args, search_args=search_args, prefix_name=name, diff --git a/users/gaudino/experiments/rf_conformer_att_2023/librispeech_960/conformer_import_moh_att_train.py b/users/gaudino/experiments/rf_conformer_att_2023/librispeech_960/conformer_import_moh_att_train.py index 79cb8641f..5bdc5eff2 100644 --- a/users/gaudino/experiments/rf_conformer_att_2023/librispeech_960/conformer_import_moh_att_train.py +++ b/users/gaudino/experiments/rf_conformer_att_2023/librispeech_960/conformer_import_moh_att_train.py @@ -102,7 +102,7 @@ def sis_run_with_prefix(prefix_name: Optional[str] = None): # RF recog: {"dev-clean": 2.25, "dev-other": 5.34, "test-clean": 2.42, "test-other": 5.56} # _recog_imported() - train_exp("from-scratch-train", config_11gb, gpu_mem=11) + # train_exp("from-scratch-train", config_11gb, gpu_mem=11) # train_exp( # dev-other 7.6 # "base-24gb-bs30k-f32", @@ -334,6 +334,18 @@ def sis_run_with_prefix(prefix_name: Optional[str] = None): "learning_rate_piecewise_values": [1e-5, 1e-3, 1e-5, 1e-6], }, ) + model = train_exp( + "base-24gb-v6-lrlin1e_5_600k_noCTC", + config_24gb_v6, + config_updates={ + "learning_rate": 1.0, + "dynamic_learning_rate": dyn_lr_piecewise_linear, + # total steps after 2000 epochs: 982.312 + "learning_rate_piecewise_steps": [600_000, 900_000, 982_000], + "learning_rate_piecewise_values": [1e-5, 1e-3, 1e-5, 1e-6], + "aux_loss_layers": [], + }, + ) # All beam search experiments using model_recog_pure_torch, beam_search_sep_ended_keep_v6. 
# for name, recog_config in { # "beam12-batch200-lenReward01": { diff --git a/users/gaudino/experiments/rf_conformer_att_2023/tedlium2/conformer_import_moh_att_2023_10_19.py b/users/gaudino/experiments/rf_conformer_att_2023/tedlium2/conformer_import_moh_att_2023_10_19.py index d89d6fbf1..19b1bb220 100644 --- a/users/gaudino/experiments/rf_conformer_att_2023/tedlium2/conformer_import_moh_att_2023_10_19.py +++ b/users/gaudino/experiments/rf_conformer_att_2023/tedlium2/conformer_import_moh_att_2023_10_19.py @@ -482,7 +482,7 @@ def sis_run_with_prefix(prefix_name: str = None): # att + trafo lm + ilm correction for model_name, lm_scale, ilm_scale, beam_size in product( # ["model_baseline", "model_ctc0.5_att0.5"], [0.36] ,[0.28], [12] - ["model_baseline"], [0.36], [0.28], [] #12 + ["model_baseline"], [0.36], [0.28], [12,24] #12 ): ilm_model_args = copy.deepcopy(models_with_pt_ckpt[model_name]["model_args"]) ilm_model_args["preload_from_files"] = preload_from_files_ilm @@ -492,7 +492,7 @@ def sis_run_with_prefix(prefix_name: str = None): + "/" + model_name + f"/att_trafolm{lm_scale}_ilm{ilm_scale}" - + f"_beam{beam_size}" + + f"_beam{beam_size}_fffix" ) search_args = { "beam_size": beam_size, @@ -502,6 +502,7 @@ def sis_run_with_prefix(prefix_name: str = None): "bsf": 32, "use_first_lm": True, "use_zoneout_output": True, + "hash_overwrite": "ffix", } recog_res, recog_out = recog_model( task, @@ -532,7 +533,7 @@ def sis_run_with_prefix(prefix_name: str = None): + model_name + f"/opls_att{att_scale}_ctc{ctc_scale}_trafolm{lm_scale}_ilm{ilm_scale}" + (f"_prior{prior_scale}" if prior_scale > 0 else "") - + f"_beam{beam_size}" + + f"_beam{beam_size}_ffix" ) search_args = { "beam_size": beam_size, @@ -546,6 +547,7 @@ def sis_run_with_prefix(prefix_name: str = None): "prior_corr": True if prior_scale > 0 else False, "prior_scale": prior_scale, "ctc_prior_file": models[model_name]["prior"], + "use_first_lm": True, } recog_res, recog_out = recog_model( diff --git 
a/users/gaudino/experiments/rf_conformer_att_2023/tedlium2/conformer_import_moh_att_train.py b/users/gaudino/experiments/rf_conformer_att_2023/tedlium2/conformer_import_moh_att_train.py index f405cd911..b96e1f691 100644 --- a/users/gaudino/experiments/rf_conformer_att_2023/tedlium2/conformer_import_moh_att_train.py +++ b/users/gaudino/experiments/rf_conformer_att_2023/tedlium2/conformer_import_moh_att_train.py @@ -147,7 +147,7 @@ def sis_run_with_prefix(prefix_name: Optional[str] = None): # gpu_mem=11, # ) - train_exp( # + train_exp( # dev 7.8 test 7.22 "base-11gb-v3-lrlin1e_5_261k_aux4_8_zoneout_fix", my_config_11gb, config_updates={ @@ -203,7 +203,7 @@ def sis_run_with_prefix(prefix_name: Optional[str] = None): # ) - model = train_exp( # + model = train_exp( # dev 8.53 test 7.7 "base-24gb-v6-lrlin1e_5_85k_zoneout_fix", config_24gb_v6, config_updates={ diff --git a/users/gaudino/experiments/rf_conformer_rnnt_2024/librispeech_960/conformer_ctc_train.py b/users/gaudino/experiments/rf_conformer_rnnt_2024/librispeech_960/conformer_ctc_train.py index 4b08a9e96..53e2e77ee 100644 --- a/users/gaudino/experiments/rf_conformer_rnnt_2024/librispeech_960/conformer_ctc_train.py +++ b/users/gaudino/experiments/rf_conformer_rnnt_2024/librispeech_960/conformer_ctc_train.py @@ -160,7 +160,7 @@ def sis_run_with_prefix(prefix_name: Optional[str] = None): }, ) - train_exp( # dev-other + train_exp( # dev-other 7.17 "base-24gb-lrlin1e_5_600k_ctc_only_aux4_8_no_mel_norm", config_24gb_v6, config_updates={ diff --git a/users/gaudino/experiments/rf_conformer_rnnt_2024/tedlium2/conformer_ctc_train.py b/users/gaudino/experiments/rf_conformer_rnnt_2024/tedlium2/conformer_ctc_train.py index 3fd2c0362..e82d2cef9 100644 --- a/users/gaudino/experiments/rf_conformer_rnnt_2024/tedlium2/conformer_ctc_train.py +++ b/users/gaudino/experiments/rf_conformer_rnnt_2024/tedlium2/conformer_ctc_train.py @@ -370,70 +370,225 @@ def sis_run_with_prefix(prefix_name: Optional[str] = None): # gpu_mem=11, # ) - 
train_exp( - "from-scratch-11gb_lrmin1e-5_lrmax1e-3_aux4_8_adjSpec6k_noCurrL", + # train_exp( # does not converge + # "from-scratch-11gb_lrmin1e-5_lrmax1e-3_aux4_8_adjSpec6k_noCurrL", + # ctc_train_config, + # config_updates={ + # "learning_rate": 1.0, + # "dynamic_learning_rate": dyn_lr_piecewise_linear, + # # total steps after 2000 epochs: 982.312 + # "learning_rate_piecewise_steps": [261_000, 522_000, 580_000], # 45% 45 % 10% + # "learning_rate_piecewise_values": [1e-5, 1e-3, 1e-5, 1e-6], + # "aux_loss_layers": [4, 8], + # "specaugment_steps": (5_900, 18_000, 36_000), + # "epoch_wise_filter": { + # # (1, 5): {"max_mean_len": 1000}, # better? + # # older settings: + # # (1, 5): {"max_mean_len": 200}, + # # (6, 10): {"max_mean_len": 500}, + # }, + # }, + # num_epochs=400, + # gpu_mem=11, + # ) + + # train_exp( # does not converge + # "from-scratch-11gb_lrmin8e-5_lrmax8e-4_aux4_8_noSpec_noCurrL", + # ctc_train_config, + # config_updates={ + # "learning_rate": 1.0, + # "dynamic_learning_rate": dyn_lr_piecewise_linear, + # # total steps after 2000 epochs: 982.312 + # "learning_rate_piecewise_steps": [261_000, 522_000, 580_000], # 45% 45 % 10% + # "learning_rate_piecewise_values": [8e-5, 8e-4, 8e-5, 1e-6], + # "aux_loss_layers": [4, 8], + # "use_specaugment": False, + # "epoch_wise_filter": { + # # (1, 5): {"max_mean_len": 1000}, # better? 
+ # # older settings: + # # (1, 5): {"max_mean_len": 200}, + # # (6, 10): {"max_mean_len": 500}, + # }, + # }, + # num_epochs=400, + # gpu_mem=11, + # ) + + # train_exp( # does not converge + # "from-scratch-11gb_lrmin1e-5_lrmax1e-3_aux4_8_noSpec_noCurrL", + # ctc_train_config, + # config_updates={ + # "learning_rate": 1.0, + # "dynamic_learning_rate": dyn_lr_piecewise_linear, + # # total steps after 2000 epochs: 982.312 + # "learning_rate_piecewise_steps": [261_000, 522_000, 580_000], # 45% 45 % 10% + # "learning_rate_piecewise_values": [1e-5, 1e-3, 1e-5, 1e-6], + # "aux_loss_layers": [4, 8], + # "use_specaugment": False, + # "epoch_wise_filter": { + # # (1, 5): {"max_mean_len": 1000}, # better? + # # older settings: + # # (1, 5): {"max_mean_len": 200}, + # # (6, 10): {"max_mean_len": 500}, + # }, + # }, + # num_epochs=400, + # gpu_mem=11, + # ) + + # train_exp( # does not converge + # "from-scratch-11gb_lrmin1e-5_lrmax1e-3_aux4_8_noSpec", + # ctc_train_config, + # config_updates={ + # "learning_rate": 1.0, + # "dynamic_learning_rate": dyn_lr_piecewise_linear, + # # total steps after 2000 epochs: 982.312 + # "learning_rate_piecewise_steps": [261_000, 522_000, 580_000], # 45% 45 % 10% + # "learning_rate_piecewise_values": [1e-5, 1e-3, 1e-5, 1e-6], + # "aux_loss_layers": [4, 8], + # "use_specaugment": False, + # }, + # num_epochs=400, + # gpu_mem=11, + # ) + + # train_exp( # does not converge + # "from-scratch-11gb_lrmin1e-5_lrmax1e-3_aux4_8_adjSpec6k_adjCurrL", + # ctc_train_config, + # config_updates={ + # "learning_rate": 1.0, + # "dynamic_learning_rate": dyn_lr_piecewise_linear, + # # total steps after 2000 epochs: 982.312 + # "learning_rate_piecewise_steps": [261_000, 522_000, 580_000], # 45% 45 % 10% + # "learning_rate_piecewise_values": [1e-5, 1e-3, 1e-5, 1e-6], + # "aux_loss_layers": [4, 8], + # "specaugment_steps": (5_900, 18_000, 36_000), + # "epoch_wise_filter": { + # # (1, 5): {"max_mean_len": 1000}, # better? 
+ # # older settings: + # # (1, 5): {"max_mean_len": 200}, + # # (6, 10): {"max_mean_len": 500}, + # (1,2): {"max_mean_len": 400}, + # (2,4): {"max_mean_len": 800}, + # }, + # }, + # num_epochs=400, + # gpu_mem=11, + # ) + + # train_exp( # does not converge + # "from-scratch-11gb_lrmin1e-5_lrmax1e-3_aux4_8_adjSpec6k_accumGrad4", + # ctc_train_config, + # config_updates={ + # "learning_rate": 1.0, + # "dynamic_learning_rate": dyn_lr_piecewise_linear, + # # total steps after 2000 epochs: 982.312 + # "learning_rate_piecewise_steps": [261_000, 522_000, 580_000], # 45% 45 % 10% + # "learning_rate_piecewise_values": [1e-5, 1e-3, 1e-5, 1e-6], + # "aux_loss_layers": [4, 8], + # "specaugment_steps": (5_900, 18_000, 36_000), + # "accum_grad_multiple_step": 4, + # }, + # config_deletes=["learning_rate_warmup_steps", "learning_rate_invsqrt_norm"], + # num_epochs=400, + # gpu_mem=11, + # ) + + # train_exp( # does not converge + # "from-scratch-11gb_lrmax2e-3_aux4_8_adjSpec", + # ctc_train_config, + # config_updates={ + # "learning_rate": 1.0, + # "dynamic_learning_rate": dyn_lr_piecewise_linear, + # # total steps after 2000 epochs: 982.312 + # "learning_rate_piecewise_steps": [261_000, 522_000, 580_000], # 45% 45 % 10% + # "learning_rate_piecewise_values": [2e-4, 2e-3, 2e-4, 1e-6], + # "aux_loss_layers": [4, 8], + # "specaugment_steps": (5_900, 18_000, 36_000), + # }, + # num_epochs=400, + # gpu_mem=11, + # ) + + train_exp( # + "from-scratch-11gb_lrmax8e-4_aux4_8_adjSpec_no_norm_loss", ctc_train_config, config_updates={ "learning_rate": 1.0, "dynamic_learning_rate": dyn_lr_piecewise_linear, # total steps after 2000 epochs: 982.312 "learning_rate_piecewise_steps": [261_000, 522_000, 580_000], # 45% 45 % 10% - "learning_rate_piecewise_values": [1e-5, 1e-3, 1e-5, 1e-6], + "learning_rate_piecewise_values": [8e-5, 8e-4, 8e-5, 1e-6], "aux_loss_layers": [4, 8], "specaugment_steps": (5_900, 18_000, 36_000), - "epoch_wise_filter": { - # (1, 5): {"max_mean_len": 1000}, # better? 
- # older settings: - # (1, 5): {"max_mean_len": 200}, - # (6, 10): {"max_mean_len": 500}, - }, + "use_normalized_loss": False, }, num_epochs=400, gpu_mem=11, ) - train_exp( - "from-scratch-11gb_lrmin1e-5_lrmax1e-3_aux4_8_adjSpec6k_adjCurrL", + train_exp( # + "from-scratch-11gb_lrmax8e-4_aux4_8_adjSpec_no_norm_loss_no_grad_clip", ctc_train_config, config_updates={ "learning_rate": 1.0, "dynamic_learning_rate": dyn_lr_piecewise_linear, # total steps after 2000 epochs: 982.312 "learning_rate_piecewise_steps": [261_000, 522_000, 580_000], # 45% 45 % 10% - "learning_rate_piecewise_values": [1e-5, 1e-3, 1e-5, 1e-6], + "learning_rate_piecewise_values": [8e-5, 8e-4, 8e-5, 1e-6], "aux_loss_layers": [4, 8], "specaugment_steps": (5_900, 18_000, 36_000), - "epoch_wise_filter": { - # (1, 5): {"max_mean_len": 1000}, # better? - # older settings: - # (1, 5): {"max_mean_len": 200}, - # (6, 10): {"max_mean_len": 500}, - (1,2): {"max_mean_len": 400}, - (2,4): {"max_mean_len": 800}, - }, + "use_normalized_loss": False, }, + config_deletes=["learning_rate_warmup_steps", "learning_rate_invsqrt_norm", "gradient_clip_global_norm"], num_epochs=400, gpu_mem=11, ) - train_exp( - "from-scratch-11gb_lrmin1e-5_lrmax1e-3_aux4_8_adjSpec6k_accumGrad4", + train_exp( # + "from-scratch-11gb_lrmax8e-4_aux4_8_adjSpe_eps2e-8", ctc_train_config, config_updates={ "learning_rate": 1.0, "dynamic_learning_rate": dyn_lr_piecewise_linear, # total steps after 2000 epochs: 982.312 "learning_rate_piecewise_steps": [261_000, 522_000, 580_000], # 45% 45 % 10% - "learning_rate_piecewise_values": [1e-5, 1e-3, 1e-5, 1e-6], + "learning_rate_piecewise_values": [8e-5, 8e-4, 8e-5, 1e-6], "aux_loss_layers": [4, 8], "specaugment_steps": (5_900, 18_000, 36_000), - "accum_grad_multiple_step": 4, + "optimizer": { + "class": "adamw", + "epsilon": 2e-8, + "weight_decay": 1e-6, + }, }, - config_deletes=["learning_rate_warmup_steps", "learning_rate_invsqrt_norm"], num_epochs=400, gpu_mem=11, ) + train_exp( # + 
"from-scratch-11gb_lrmax8e-4_aux4_8_adjSpe_adam", + ctc_train_config, + config_updates={ + "learning_rate": 1.0, + "dynamic_learning_rate": dyn_lr_piecewise_linear, + # total steps after 2000 epochs: 982.312 + "learning_rate_piecewise_steps": [261_000, 522_000, 580_000], # 45% 45 % 10% + "learning_rate_piecewise_values": [8e-5, 8e-4, 8e-5, 1e-6], + "aux_loss_layers": [4, 8], + "specaugment_steps": (5_900, 18_000, 36_000), + "optimizer": { + "class": "adam", + "epsilon": 1e-8, + "weight_decay": 1e-6, + }, + }, + num_epochs=400, + gpu_mem=11, + ) + + + _sis_prefix: Optional[str] = None diff --git a/users/gaudino/experiments/rf_conformer_rnnt_2024/tedlium2/conformer_rnnt_train.py b/users/gaudino/experiments/rf_conformer_rnnt_2024/tedlium2/conformer_rnnt_train.py index beae49311..4a669a70b 100644 --- a/users/gaudino/experiments/rf_conformer_rnnt_2024/tedlium2/conformer_rnnt_train.py +++ b/users/gaudino/experiments/rf_conformer_rnnt_2024/tedlium2/conformer_rnnt_train.py @@ -56,7 +56,8 @@ def sis_run_with_prefix(prefix_name: Optional[str] = None): batch_size=15_000 * _batch_size_factor, max_seqs=200, # max_seq_length_default_target=75, - specaugment_steps=(10_000, 20_000, 40_000), + # specaugment_steps=(10_000, 20_000, 40_000), + specaugment_steps=(5_900, 18_000, 36_000), # gradient_clip=0, # gradient_clip_global_norm = 1.0 optimizer={ @@ -67,12 +68,12 @@ def sis_run_with_prefix(prefix_name: Optional[str] = None): # accum_grad_multiple_step=4, # gradient_noise=0.0, learning_rate=2.5e-3, - dynamic_learning_rate=dyn_lr_lin_warmup_invsqrt_decay, - learning_rate_warmup_steps=40_000, - learning_rate_invsqrt_norm=40_000, + dynamic_learning_rate=dyn_lr_piecewise_linear, + # learning_rate_piecewise_steps= [261_000, 522_000, 580_000], # 45% 45 % 10% # 11gb + learning_rate_piecewise_steps = [85_500, 171_000, 190_000], # 45% 45 % 10% # 24gb # aux_loss_layers=[4, 8], max_seq_length_default_target=None, - gradient_clip_global_norm=5.0, + # gradient_clip_global_norm=5.0, 
accum_grad_multiple_step=2, # aux_loss_layers=[12], ) @@ -80,39 +81,33 @@ def sis_run_with_prefix(prefix_name: Optional[str] = None): # train_exp("base-11gb", config_11gb, gpu_mem=11) # train_exp("base-11gb-v1", my_config_11gb, num_epochs=400, gpu_mem=11) train_exp( # - "from-scratch-11gb", + "from-scratch-24gb", rnnt_train_config, config_updates={ "learning_rate": 1.0, - "dynamic_learning_rate": dyn_lr_piecewise_linear, - # total steps after 2000 epochs: 982.312 - # "learning_rate_piecewise_steps": [600_000, 900_000, 982_000], - # "learning_rate_piecewise_values": [1e-5, 1e-3, 1e-5, 1e-6], - "learning_rate_piecewise_steps": [261_000, 522_000, 580_000], # 45% 45 % 10% - "learning_rate_piecewise_values": [1e-5, 1e-3, 1e-5, 1e-6], + "learning_rate_piecewise_values": [8e-5, 8e-4, 8e-5, 1e-6], }, - config_deletes=["learning_rate_warmup_steps", "learning_rate_invsqrt_norm"], - num_epochs=400, - gpu_mem=11, - ) - - train_exp( # does not converge - "from-scratch-11gb", - rnnt_train_config, - config_updates={ - "learning_rate": 1.0, - "dynamic_learning_rate": dyn_lr_piecewise_linear, - # total steps after 2000 epochs: 982.312 - # "learning_rate_piecewise_steps": [600_000, 900_000, 982_000], - # "learning_rate_piecewise_values": [1e-5, 1e-3, 1e-5, 1e-6], - "learning_rate_piecewise_steps": [261_000, 522_000, 580_000], # 45% 45 % 10% - "learning_rate_piecewise_values": [1e-5, 1e-3, 1e-5, 1e-6], - }, - config_deletes=["learning_rate_warmup_steps", "learning_rate_invsqrt_norm"], num_epochs=400, gpu_mem=24, ) + # train_exp( # does not converge (wrong steps + more mistakes) + # "from-scratch-11gb", + # rnnt_train_config, + # config_updates={ + # "learning_rate": 1.0, + # "dynamic_learning_rate": dyn_lr_piecewise_linear, + # # total steps after 2000 epochs: 982.312 + # # "learning_rate_piecewise_steps": [600_000, 900_000, 982_000], + # # "learning_rate_piecewise_values": [1e-5, 1e-3, 1e-5, 1e-6], + # "learning_rate_piecewise_steps": [261_000, 522_000, 580_000], # 45% 45 % 10% + # 
"learning_rate_piecewise_values": [1e-5, 1e-3, 1e-5, 1e-6], + # }, + # config_deletes=["learning_rate_warmup_steps", "learning_rate_invsqrt_norm"], + # num_epochs=400, + # gpu_mem=24, + # ) + _sis_prefix: Optional[str] = None diff --git a/users/gaudino/models/asr/rf/conformer_ctc/model_conformer_ctc.py b/users/gaudino/models/asr/rf/conformer_ctc/model_conformer_ctc.py index 21b5fac95..f32fc88dd 100644 --- a/users/gaudino/models/asr/rf/conformer_ctc/model_conformer_ctc.py +++ b/users/gaudino/models/asr/rf/conformer_ctc/model_conformer_ctc.py @@ -150,6 +150,7 @@ def __init__( config = get_global_config(return_empty_if_none=True) self.mel_normalization = config.typed_value("mel_normalization_ted2", True) + self.use_specaugment = config.typed_value("use_specaugment", True) self.in_dim = in_dim self.encoder = ConformerEncoder( @@ -288,13 +289,14 @@ def encode( if self._mixup: source = self._mixup(source, spatial_dim=in_spatial_dim) - # SpecAugment - source = rf.audio.specaugment( - source, - spatial_dim=in_spatial_dim, - feature_dim=self.in_dim, - **self._specaugment_opts, - ) + if self.use_specaugment: + # SpecAugment + source = rf.audio.specaugment( + source, + spatial_dim=in_spatial_dim, + feature_dim=self.in_dim, + **self._specaugment_opts, + ) # Encoder including convolutional frontend with _opt_apply_pretrain_to_encoder( self.encoder, collected_outputs, self._pretrain_opts diff --git a/users/gaudino/models/asr/rf/conformer_rnnt/model_recog_rnnt.py b/users/gaudino/models/asr/rf/conformer_rnnt/model_recog_rnnt.py index afd6c3eb8..5525e4369 100644 --- a/users/gaudino/models/asr/rf/conformer_rnnt/model_recog_rnnt.py +++ b/users/gaudino/models/asr/rf/conformer_rnnt/model_recog_rnnt.py @@ -24,6 +24,74 @@ from i6_experiments.users.gaudino.experiments.rf_conformer_att_2023.librispeech_960.trafo_lm import trafo_lm_kazuki_import + +Hypothesis = Tuple[List[int], Tensor, rf.State, float] +Hypothesis.__doc__ = """Hypothesis generated by RNN-T beam search decoder, + represented 
as tuple of (tokens, prediction network output, prediction network state, score). + """ + + +def _get_hypo_tokens(hypo: Hypothesis) -> List[int]: + return hypo[0] + + +def _get_hypo_predictor_out(hypo: Hypothesis) -> torch.Tensor: + return hypo[1] + + +def _get_hypo_state(hypo: Hypothesis) -> List[List[torch.Tensor]]: + return hypo[2] + + +def _get_hypo_score(hypo: Hypothesis) -> float: + return hypo[3] + + +def _get_hypo_key(hypo: Hypothesis) -> str: + return str(hypo[0]) + + +def _batch_state(hypos: List[Hypothesis]) -> List[List[torch.Tensor]]: # TODO + states: List[List[torch.Tensor]] = [] + for i in range(len(_get_hypo_state(hypos[0]))): + batched_state_components: List[torch.Tensor] = [] + for j in range(len(_get_hypo_state(hypos[0])[i])): + batched_state_components.append(torch.cat([_get_hypo_state(hypo)[i][j] for hypo in hypos])) + states.append(batched_state_components) + return states + + +def _slice_state(states: List[List[torch.Tensor]], idx: int, device: torch.device) -> List[List[torch.Tensor]]: # TODO + idx_tensor = torch.tensor([idx], device=device) + return [[state.index_select(0, idx_tensor) for state in state_tuple] for state_tuple in states] + + +def _default_hypo_sort_key(hypo: Hypothesis) -> float: + return _get_hypo_score(hypo) / (len(_get_hypo_tokens(hypo)) + 1) # is this doing length normalization ? 
+ + +def _compute_updated_scores( + hypos: List[Hypothesis], + next_token_probs: Tensor, + beam_width: int, +) -> Tuple[Tensor, Tensor, Tensor]: # TODO + hypo_scores = torch.tensor([_get_hypo_score(h) for h in hypos]).unsqueeze(1) + nonblank_scores = hypo_scores + next_token_probs[:, :-1] # [beam_width, num_tokens - 1] + nonblank_nbest_scores, nonblank_nbest_idx = nonblank_scores.reshape(-1).topk(beam_width) + nonblank_nbest_hypo_idx = nonblank_nbest_idx.div(nonblank_scores.shape[1], rounding_mode="trunc") + nonblank_nbest_token = nonblank_nbest_idx % nonblank_scores.shape[1] + return nonblank_nbest_scores, nonblank_nbest_hypo_idx, nonblank_nbest_token + + +def _remove_hypo(hypo: Hypothesis, hypo_list: List[Hypothesis]) -> None: # TODO + for i, elem in enumerate(hypo_list): + if _get_hypo_key(hypo) == _get_hypo_key(elem): + del hypo_list[i] + break + + + + def model_recog( *, model: Model, @@ -70,6 +138,166 @@ def model_recog( out_seq_len = rf.constant(0, dims=batch_dims_) seq_log_prob = rf.constant(0.0, dims=batch_dims_) + + # TODO implement rnnt search + temperature = 1.0 + step_max_tokens = 100 + + def _init_b_hypos(self, device: torch.device) -> List[Hypothesis]: + token = self.blank + state = None + + one_tensor = torch.tensor([1], device=device) + pred_out, _, pred_state = model.predict(torch.tensor([[token]], device=device), one_tensor, state) + init_hypo = ( + [token], + pred_out[0].detach(), + pred_state, + 0.0, + ) + return [init_hypo] + + def _gen_next_token_probs( + self, enc_out: torch.Tensor, hypos: List[Hypothesis], device: torch.device + ) -> torch.Tensor: + one_tensor = torch.tensor([1], device=device) + predictor_out = torch.stack([_get_hypo_predictor_out(h) for h in hypos], dim=0) + joined_out, _, _ = model.join( + enc_out, + one_tensor, + predictor_out, + torch.tensor([1] * len(hypos), device=device), + ) # [beam_width, 1, 1, num_tokens] + joined_out = torch.nn.functional.log_softmax(joined_out / self.temperature, dim=3) + return joined_out[:, 
0, 0] + + def _gen_b_hypos( + self, + b_hypos: List[Hypothesis], + a_hypos: List[Hypothesis], + next_token_probs: torch.Tensor, + key_to_b_hypo: Dict[str, Hypothesis], + ) -> List[Hypothesis]: + for i in range(len(a_hypos)): + h_a = a_hypos[i] + append_blank_score = _get_hypo_score(h_a) + next_token_probs[i, -1] + if _get_hypo_key(h_a) in key_to_b_hypo: + h_b = key_to_b_hypo[_get_hypo_key(h_a)] + _remove_hypo(h_b, b_hypos) + score = float(torch.tensor(_get_hypo_score(h_b)).logaddexp(append_blank_score)) + else: + score = float(append_blank_score) + h_b = ( + _get_hypo_tokens(h_a), + _get_hypo_predictor_out(h_a), + _get_hypo_state(h_a), + score, + ) + b_hypos.append(h_b) + key_to_b_hypo[_get_hypo_key(h_b)] = h_b + _, sorted_idx = torch.tensor([_get_hypo_score(hypo) for hypo in b_hypos]).sort() + return [b_hypos[idx] for idx in sorted_idx] + + def _gen_a_hypos( + self, + a_hypos: List[Hypothesis], + b_hypos: List[Hypothesis], + next_token_probs: torch.Tensor, + t: int, + beam_width: int, + device: torch.device, + ) -> List[Hypothesis]: + ( + nonblank_nbest_scores, + nonblank_nbest_hypo_idx, + nonblank_nbest_token, + ) = _compute_updated_scores(a_hypos, next_token_probs, beam_width) + + if len(b_hypos) < beam_width: + b_nbest_score = -float("inf") + else: + b_nbest_score = _get_hypo_score(b_hypos[-beam_width]) + + base_hypos: List[Hypothesis] = [] + new_tokens: List[int] = [] + new_scores: List[float] = [] + for i in range(beam_width): + score = float(nonblank_nbest_scores[i]) + if score > b_nbest_score: + a_hypo_idx = int(nonblank_nbest_hypo_idx[i]) + base_hypos.append(a_hypos[a_hypo_idx]) + new_tokens.append(int(nonblank_nbest_token[i])) + new_scores.append(score) + + if base_hypos: + new_hypos = self._gen_new_hypos(base_hypos, new_tokens, new_scores, t, device) + else: + new_hypos: List[Hypothesis] = [] + + return new_hypos + + def _gen_new_hypos( + self, + base_hypos: List[Hypothesis], + tokens: List[int], + scores: List[float], + t: int, + device: torch.device, + 
) -> List[Hypothesis]: + tgt_tokens = torch.tensor([[token] for token in tokens], device=device) + states = _batch_state(base_hypos) + pred_out, _, pred_states = self.model.predict( + tgt_tokens, + torch.tensor([1] * len(base_hypos), device=device), + states, + ) + new_hypos: List[Hypothesis] = [] + for i, h_a in enumerate(base_hypos): + new_tokens = _get_hypo_tokens(h_a) + [tokens[i]] + new_hypos.append((new_tokens, pred_out[i].detach(), _slice_state(pred_states, i, device), scores[i])) + return new_hypos + + # from _search function + n_time_steps = enc_out.shape[1] + device = enc_out.device + + a_hypos: List[Hypothesis] = [] + b_hypos = self._init_b_hypos(device) if hypo is None else hypo + for t in range(n_time_steps): + a_hypos = b_hypos + b_hypos = torch.jit.annotate(List[Hypothesis], []) + key_to_b_hypo: Dict[str, Hypothesis] = {} + symbols_current_t = 0 + + while a_hypos: + next_token_probs = self._gen_next_token_probs(enc_out[:, t: t + 1], a_hypos, device) + next_token_probs = next_token_probs.cpu() + b_hypos = self._gen_b_hypos(b_hypos, a_hypos, next_token_probs, key_to_b_hypo) + + if symbols_current_t == self.step_max_tokens: + break + + a_hypos = self._gen_a_hypos( + a_hypos, + b_hypos, + next_token_probs, + t, + beam_width, + device, + ) + if a_hypos: + symbols_current_t += 1 + + _, sorted_idx = torch.tensor([self.hypo_sort_key(hyp) for hyp in b_hypos]).topk(beam_width) + b_hypos = [b_hypos[idx] for idx in sorted_idx] + + # return b_hypos + # results is in b_hypoes + # TODO: extract results + + return seq_targets, seq_log_prob, out_spatial_dim, beam_dim + + # old search code i = 0 seq_targets = [] seq_backrefs = [] @@ -90,8 +318,6 @@ def model_recog( # logits = model.decode_logits(input_embed=input_embed, **step_out) label_log_prob = rf.log_softmax(step_out["output"], axis=model.target_dim) - # TODO: implement rnnt search - breakpoint() # Filter out finished beams label_log_prob = rf.where( ended, @@ -166,634 +392,344 @@ def model_recog( 
model_recog.batch_size_dependent = False -def model_recog_pure_torch( - *, - model: Model, - data: Tensor, - data_spatial_dim: Dim, - targets: Optional[Tensor] = None, - targets_spatial_dim: Optional[Dim] = None, - max_seq_len: Optional[int] = None, -) -> Tuple[Tensor, Tensor, Dict[str, Tensor], Dim, Dim]: - """ - Function is run within RETURNN. +### Copied from torchaudio +# TODO: Adapt to rf +from typing import Callable, Dict, List, Optional, Tuple - Earlier we used the generic beam_search function, - but now we just directly perform the search here, - as this is overall simpler and shorter. +import torch +from torchaudio.models import RNNT - :return: - recog results including beam {batch, beam, out_spatial}, - log probs {batch, beam}, - recog results info: key -> {batch, beam}, - out_spatial_dim, - final beam_dim + +__all__ = ["Hypothesis", "RNNTBeamSearch"] + + +Hypothesis = Tuple[List[int], torch.Tensor, List[List[torch.Tensor]], float] +Hypothesis.__doc__ = """Hypothesis generated by RNN-T beam search decoder, + represented as tuple of (tokens, prediction network output, prediction network state, score). """ - import torch - import time - from i6_experiments.users.zeyer.decoding.beam_search_torch.beam_search_v5 import ( - BeamSearchOptsV5, - beam_search_v5, - ) - from i6_experiments.users.zeyer.decoding.beam_search_torch.beam_search_sep_ended import ( - BeamSearchDynBeamOpts, - beam_search_sep_ended, - ) - from i6_experiments.users.zeyer.decoding.beam_search_torch.beam_search_sep_ended_keep_v6 import ( - BeamSearchSepEndedKeepOpts, - beam_search_sep_ended_keep_v6, - ) - from i6_experiments.users.zeyer.decoding.beam_search_torch.scorers.length_reward import ( - LengthRewardScorer, - ) - from i6_experiments.users.zeyer.decoding.beam_search_torch.scorers.shallow_fusion import ( - ShallowFusedLabelScorers, - ) - from returnn.config import get_global_config - config = get_global_config() - torch.cuda.set_sync_debug_mode( - 1 - ) # debug CUDA sync. 
does not hurt too much to leave this always in? - start_time = time.perf_counter_ns() +def _get_hypo_tokens(hypo: Hypothesis) -> List[int]: + return hypo[0] - data_concat_zeros = config.float("data_concat_zeros", 0) - if data_concat_zeros: - data_concat_zeros_dim = Dim( - int(data_concat_zeros * _batch_size_factor * 100), name="data_concat_zeros" - ) - data, data_spatial_dim = rf.concat( - (data, data_spatial_dim), - (rf.zeros([data_concat_zeros_dim]), data_concat_zeros_dim), - allow_broadcast=True, - ) - batch_dims = data.remaining_dims((data_spatial_dim, data.feature_dim)) - assert ( - len(batch_dims) == 1 - ), batch_dims # not implemented otherwise, simple to add... - batch_dim = batch_dims[0] - enc, enc_spatial_dim = model.encode(data, in_spatial_dim=data_spatial_dim) - if max_seq_len is None: - max_seq_len = enc_spatial_dim.get_size_tensor() - else: - max_seq_len = rf.convert_to_tensor(max_seq_len, dtype="int32") +def _get_hypo_predictor_out(hypo: Hypothesis) -> torch.Tensor: + return hypo[1] - if data.raw_tensor.device.type == "cuda": - # Just so that timing of encoder is correct. 
- torch.cuda.synchronize(data.raw_tensor.device) - - enc_end_time = time.perf_counter_ns() - - beam_search_version = config.typed_value("beam_search_version", 1) - beam_search_func = { - 5: beam_search_v5, - "sep_ended": beam_search_sep_ended, - "sep_ended_keep_v6": beam_search_sep_ended_keep_v6, - }[beam_search_version] - if beam_search_version == "sep_ended": - beam_search_opts_cls = BeamSearchDynBeamOpts - elif isinstance(beam_search_version, str) and beam_search_version.startswith( - "sep_ended_keep" - ): - beam_search_opts_cls = BeamSearchSepEndedKeepOpts - elif isinstance(beam_search_version, int) and beam_search_version >= 5: - beam_search_opts_cls = BeamSearchOptsV5 - else: - raise ValueError(f"unexpected {beam_search_version=}") - beam_search_opts = (config.typed_value("beam_search_opts", None) or {}).copy() - if beam_search_opts.get("beam_size") is None: - beam_search_opts["beam_size"] = config.int("beam_size", 12) - if beam_search_opts.get("length_normalization_exponent") is None: - beam_search_opts["length_normalization_exponent"] = config.float( - "length_normalization_exponent", 1.0 - ) - if beam_search_opts.get("length_reward") is None: - beam_search_opts["length_reward"] = config.float("length_reward", 0.0) - extra = {} - out_individual_seq_scores = None - if config.bool("beam_search_collect_individual_seq_scores", False): - out_individual_seq_scores = {} - extra["out_individual_seq_scores"] = out_individual_seq_scores - cheating = config.bool("cheating", False) - if cheating: - assert targets and targets_spatial_dim - extra["cheating_targets"] = targets.copy_compatible_to_dims_raw( - [batch_dim, targets_spatial_dim] - ) - extra[ - "cheating_targets_seq_len" - ] = targets_spatial_dim.dyn_size_ext.copy_compatible_to_dims_raw([batch_dim]) - coverage_scale = beam_search_opts.pop("attention_coverage_scale", 0.0) - coverage_opts = beam_search_opts.pop("attention_coverage_opts", {}) - neg_coverage_scale = 
beam_search_opts.pop("neg_attention_coverage_scale", 0.0) - neg_coverage_opts = beam_search_opts.pop("neg_attention_coverage_opts", {}) - monotonicity_scale = beam_search_opts.pop("attention_monotonicity_scale", 0.0) - monotonicity_opts = beam_search_opts.pop("attention_monotonicity_opts", {}) - max_seq_len_factor = beam_search_opts.pop("max_seq_len_factor", 1) - if max_seq_len_factor != 1: - max_seq_len = rf.cast(max_seq_len * max_seq_len_factor, max_seq_len.dtype) - label_scorer = ShallowFusedLabelScorers() - if coverage_scale or neg_coverage_scale or cheating: - label_scorer.label_scorers.update( - get_label_scorer_and_coverage_scorer_pure_torch( - model=model, - batch_dim=batch_dim, - enc=enc, - enc_spatial_dim=enc_spatial_dim, - coverage_opts=coverage_opts, - coverage_scale=coverage_scale, - neg_coverage_scale=neg_coverage_scale, - neg_coverage_opts=neg_coverage_opts, - monotonicity_scale=monotonicity_scale, - monotonicity_opts=monotonicity_opts, - always_add_scorers=cheating, - ) - ) - else: - label_scorer.label_scorers["decoder"] = ( - get_label_scorer_pure_torch( - model=model, - batch_dim=batch_dim, - enc=enc, - enc_spatial_dim=enc_spatial_dim, - ), - 1.0, - ) - if isinstance(beam_search_version, str) or beam_search_version >= 5: - len_reward = beam_search_opts.pop("length_reward", 0.0) - if len_reward or cheating: - label_scorer.label_scorers["length_reward"] = ( - LengthRewardScorer(), - len_reward, - ) - if model.language_model: - lm_scale = beam_search_opts.pop("lm_scale") # must be defined with LM - label_scorer.label_scorers["lm"] = ( - model.language_model_make_label_scorer(), - lm_scale, - ) - print("** max seq len:", max_seq_len.raw_tensor) +def _get_hypo_state(hypo: Hypothesis) -> List[List[torch.Tensor]]: + return hypo[2] - # Beam search happening here: - ( - seq_targets, # [Batch,FinalBeam,OutSeqLen] - seq_log_prob, # [Batch,FinalBeam] - out_seq_len, # [Batch,FinalBeam] - ) = beam_search_func( - label_scorer, - 
batch_size=int(batch_dim.get_dim_value()), - max_seq_len=max_seq_len.copy_compatible_to_dims_raw([batch_dim]), - device=data.raw_tensor.device, - opts=beam_search_opts_cls( - **beam_search_opts, - bos_label=model.bos_idx, - eos_label=model.eos_idx, - num_labels=model.target_dim.dimension, - ), - **extra, - ) - beam_dim = Dim(seq_log_prob.shape[1], name="beam") - out_spatial_dim = Dim( - rf.convert_to_tensor( - out_seq_len, dims=[batch_dim, beam_dim], name="out_spatial" - ) - ) - seq_targets_t = rf.convert_to_tensor( - seq_targets, - dims=[batch_dim, beam_dim, out_spatial_dim], - sparse_dim=model.target_dim, - ) - seq_log_prob_t = rf.convert_to_tensor(seq_log_prob, dims=[batch_dim, beam_dim]) +def _get_hypo_score(hypo: Hypothesis) -> float: + return hypo[3] - search_end_time = time.perf_counter_ns() - data_seq_len_sum = rf.reduce_sum( - data_spatial_dim.dyn_size_ext, axis=data_spatial_dim.dyn_size_ext.dims - ) - data_seq_len_sum_secs = data_seq_len_sum.raw_tensor / _batch_size_factor / 100.0 - data_seq_len_max_seqs = ( - data_spatial_dim.get_dim_value() / _batch_size_factor / 100.0 - ) - out_len_longest_sum = rf.reduce_sum( - rf.reduce_max(out_spatial_dim.dyn_size_ext, axis=beam_dim), axis=batch_dim - ) - print( - "TIMINGS:", - ", ".join( - ( - f"batch size {data.get_batch_dim_tag().get_dim_value()}", - f"data len max {data_spatial_dim.get_dim_value()} ({data_seq_len_max_seqs:.2f} secs)", - f"data len sum {data_seq_len_sum.raw_tensor} ({data_seq_len_sum_secs:.2f} secs)", - f"enc {enc_end_time - start_time} ns", - f"enc len max {enc_spatial_dim.get_dim_value()}", - f"dec {search_end_time - enc_end_time} ns", - f"out len max {out_spatial_dim.get_dim_value()}", - f"out len longest sum {out_len_longest_sum.raw_tensor}", - ) - ), - ) - extra_recog_results = {} - if out_individual_seq_scores: - for k, v in out_individual_seq_scores.items(): - extra_recog_results[f"score:{k}"] = rf.convert_to_tensor( - v.expand(batch_dim.get_dim_value(), beam_dim.get_dim_value()), - 
dims=[batch_dim, beam_dim], - ) +def _get_hypo_key(hypo: Hypothesis) -> str: + return str(hypo[0]) - return seq_targets_t, seq_log_prob_t, extra_recog_results, out_spatial_dim, beam_dim +def _batch_state(hypos: List[Hypothesis]) -> List[List[torch.Tensor]]: + states: List[List[torch.Tensor]] = [] + for i in range(len(_get_hypo_state(hypos[0]))): + batched_state_components: List[torch.Tensor] = [] + for j in range(len(_get_hypo_state(hypos[0])[i])): + batched_state_components.append(torch.cat([_get_hypo_state(hypo)[i][j] for hypo in hypos])) + states.append(batched_state_components) + return states -def get_label_scorer_pure_torch( - *, - model: Model, - batch_dim: Dim, - enc: Dict[str, Tensor], - enc_spatial_dim: Dim, -): - import torch - import functools - from i6_experiments.users.zeyer.decoding.beam_search_torch.interface import ( - LabelScorerIntf, - StateObjTensorExt, - StateObjIgnored, - ) - class LabelScorer(LabelScorerIntf): - """label scorer""" +def _slice_state(states: List[List[torch.Tensor]], idx: int, device: torch.device) -> List[List[torch.Tensor]]: + idx_tensor = torch.tensor([idx], device=device) + return [[state.index_select(0, idx_tensor) for state in state_tuple] for state_tuple in states] - def get_initial_state(self, *, batch_size: int, device: torch.device) -> Any: - """Initial state.""" - beam_dim = Dim(1, name="initial-beam") - batch_dims_ = [batch_dim, beam_dim] - decoder_state = model.decoder_default_initial_state( - batch_dims=batch_dims_, enc_spatial_dim=enc_spatial_dim - ) - return tree.map_structure( - functools.partial(self._map_tensor_to_raw, beam_dim=beam_dim), - decoder_state, - ) - def max_remaining_seq_score( - self, *, state: Any, max_remaining_steps: torch.Tensor, device: torch.device - ) -> torch.Tensor: - """max remaining""" - return torch.zeros((1, 1), device=device) - - def score_and_update_state( - self, - *, - prev_state: Any, - prev_label: torch.Tensor, - ) -> Tuple[torch.Tensor, Any]: - """update state""" - beam_dim = 
Dim(prev_label.shape[1], name="beam") - - def _map_raw_to_tensor(v): - if isinstance(v, StateObjTensorExt): - tensor: Tensor = v.extra - tensor = tensor.copy_template_new_dim_tags( - (batch_dim, beam_dim) + tensor.dims[2:], keep_special_axes=True - ) - tensor.raw_tensor = v.tensor - return tensor - elif isinstance(v, StateObjIgnored): - return v.content - else: - raise TypeError( - f"_map_raw_to_tensor: unexpected {v} ({type(v).__name__})" - ) - - input_embed = model.target_embed( - rf.convert_to_tensor( - prev_label, dims=[batch_dim, beam_dim], sparse_dim=model.target_dim - ) - ) - decode_out, decoder_state = model.loop_step( - **enc, - enc_spatial_dim=enc_spatial_dim, - input_embed=input_embed, - state=tree.map_structure(_map_raw_to_tensor, prev_state), - ) - logits = model.decode_logits(input_embed=input_embed, **decode_out) - label_log_prob = rf.log_softmax(logits, axis=model.target_dim) - assert set(label_log_prob.dims) == {batch_dim, beam_dim, model.target_dim} - - return ( - self._map_tensor_to_raw(label_log_prob, beam_dim=beam_dim).tensor, - tree.map_structure( - functools.partial(self._map_tensor_to_raw, beam_dim=beam_dim), - decoder_state, - ), - ) +def _default_hypo_sort_key(hypo: Hypothesis) -> float: + return _get_hypo_score(hypo) / (len(_get_hypo_tokens(hypo)) + 1) - @staticmethod - def _map_tensor_to_raw(v, *, beam_dim: Dim): - if isinstance(v, Tensor): - if beam_dim not in v.dims: - return StateObjIgnored(v) - batch_dims_ = [batch_dim, beam_dim] - v = v.copy_transpose( - batch_dims_ + [dim for dim in v.dims if dim not in batch_dims_] - ) - raw = v.raw_tensor - return StateObjTensorExt(raw, v.copy_template()) - elif isinstance(v, Dim): - return StateObjIgnored(v) - else: - raise TypeError( - f"_map_tensor_to_raw: unexpected {v} ({type(v).__name__})" - ) - return LabelScorer() +def _compute_updated_scores( + hypos: List[Hypothesis], + next_token_probs: torch.Tensor, + beam_width: int, +) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + 
hypo_scores = torch.tensor([_get_hypo_score(h) for h in hypos]).unsqueeze(1) + nonblank_scores = hypo_scores + next_token_probs[:, :-1] # [beam_width, num_tokens - 1] + nonblank_nbest_scores, nonblank_nbest_idx = nonblank_scores.reshape(-1).topk(beam_width) + nonblank_nbest_hypo_idx = nonblank_nbest_idx.div(nonblank_scores.shape[1], rounding_mode="trunc") + nonblank_nbest_token = nonblank_nbest_idx % nonblank_scores.shape[1] + return nonblank_nbest_scores, nonblank_nbest_hypo_idx, nonblank_nbest_token -# RecogDef API -model_recog_pure_torch: RecogDef[Model] -model_recog_pure_torch.output_with_beam = True -model_recog_pure_torch.output_blank_label = None -model_recog_pure_torch.batch_size_dependent = False +def _remove_hypo(hypo: Hypothesis, hypo_list: List[Hypothesis]) -> None: + for i, elem in enumerate(hypo_list): + if _get_hypo_key(hypo) == _get_hypo_key(elem): + del hypo_list[i] + break -def get_label_scorer_and_coverage_scorer_pure_torch( - *, - model: Model, - batch_dim: Dim, - enc: Dict[str, Tensor], - enc_spatial_dim: Dim, - coverage_scale: float = 0.0, - coverage_opts: Optional[Dict[str, Any]] = None, - neg_coverage_scale: float = 0.0, - neg_coverage_opts: Optional[Dict[str, Any]] = None, - monotonicity_scale: float = 0.0, - monotonicity_opts: Optional[Dict[str, Any]] = None, - always_add_scorers: bool = False, -): - import torch - import functools - from returnn.frontend.decoder.transformer import TransformerDecoderLayer - from i6_experiments.users.zeyer.decoding.beam_search_torch.interface import ( - LabelScorerIntf, - StateObjTensorExt, - StateObjIgnored, - ) +class RNNTBeamSearch(torch.nn.Module): + r"""Beam search decoder for RNN-T model. - accum_att_weights = rf.zeros(()) # [Batch,Beam,kv_axis] - att_weights_dec_frame: Tensor # [Batch,Beam,kv_axis] - beam_dim: Dim - - raise NotImplementedError("need more work here") # TODO... 
- - model_att_reduce_type = coverage_opts.get("model_att_reduce_type", "max") - - def hooked_cross_att( - self: rf.CrossAttention, q: Tensor, k: Tensor, v: Tensor, *, kv_axis: Dim - ) -> Tensor: - """apply attention""" - nonlocal att_weights_dec_frame - # Standard dot attention, inline rf.dot_attention. - q *= self.key_dim_per_head.dimension**-0.5 - energy = rf.matmul(q, k, reduce=self.key_dim_per_head) - att_weights = rf.softmax(energy, axis=kv_axis) - if model_att_reduce_type == "max": - att_weights_dec_frame = rf.maximum( - att_weights_dec_frame, rf.reduce_max(att_weights, axis=self.num_heads) - ) - elif model_att_reduce_type == "avg": - att_weights_dec_frame += rf.reduce_mean( - att_weights, axis=self.num_heads - ) * (1 / len(model.decoder.layers)) + See Also: + * :class:`torchaudio.pipelines.RNNTBundle`: ASR pipeline with pretrained model. + + Args: + model (RNNT): RNN-T model to use. + blank (int): index of blank token in vocabulary. + temperature (float, optional): temperature to apply to joint network output. + Larger values yield more uniform samples. (Default: 1.0) + hypo_sort_key (Callable[[Hypothesis], float] or None, optional): callable that computes a score + for a given hypothesis to rank hypotheses by. If ``None``, defaults to callable that returns + hypothesis score normalized by token sequence length. (Default: None) + step_max_tokens (int, optional): maximum number of tokens to emit per input time step. 
(Default: 100) + """ + + def __init__( + self, + model: RNNT, + blank: int, + temperature: float = 1.0, + hypo_sort_key: Optional[Callable[[Hypothesis], float]] = None, + step_max_tokens: int = 100, + ) -> None: + super().__init__() + self.model = model + self.blank = blank + self.temperature = temperature + + if hypo_sort_key is None: + self.hypo_sort_key = _default_hypo_sort_key else: - raise ValueError(f"invalid model_att_reduce_type {model_att_reduce_type!r}") - # Masking not needed because softmax should already have masked, - # so we have 0.0 att weights for padded frames. - att = rf.matmul(att_weights, v, reduce=kv_axis, use_mask=False) - if v.feature_dim in att.dims: - att.feature_dim = v.feature_dim - output, _ = rf.merge_dims( - att, - dims=(self.num_heads, self.value_dim_per_head), - out_dim=self.value_dim_total, + self.hypo_sort_key = hypo_sort_key + + self.step_max_tokens = step_max_tokens + + def _init_b_hypos(self, device: torch.device) -> List[Hypothesis]: + token = self.blank + state = None + + one_tensor = torch.tensor([1], device=device) + pred_out, _, pred_state = self.model.predict(torch.tensor([[token]], device=device), one_tensor, state) + init_hypo = ( + [token], + pred_out[0].detach(), + pred_state, + 0.0, ) - if self.proj: - output = self.proj(output) - return output - - for layer in model.decoder.layers: - layer: TransformerDecoderLayer - layer.cross_att.attention = functools.partial( - hooked_cross_att, self=layer.cross_att + return [init_hypo] + + def _gen_next_token_probs( + self, enc_out: torch.Tensor, hypos: List[Hypothesis], device: torch.device + ) -> torch.Tensor: + one_tensor = torch.tensor([1], device=device) + predictor_out = torch.stack([_get_hypo_predictor_out(h) for h in hypos], dim=0) + joined_out, _, _ = self.model.join( + enc_out, + one_tensor, + predictor_out, + torch.tensor([1] * len(hypos), device=device), + ) # [beam_width, 1, 1, num_tokens] + joined_out = torch.nn.functional.log_softmax(joined_out / self.temperature, 
dim=3) + return joined_out[:, 0, 0] + + def _gen_b_hypos( + self, + b_hypos: List[Hypothesis], + a_hypos: List[Hypothesis], + next_token_probs: torch.Tensor, + key_to_b_hypo: Dict[str, Hypothesis], + ) -> List[Hypothesis]: + for i in range(len(a_hypos)): + h_a = a_hypos[i] + append_blank_score = _get_hypo_score(h_a) + next_token_probs[i, -1] + if _get_hypo_key(h_a) in key_to_b_hypo: + h_b = key_to_b_hypo[_get_hypo_key(h_a)] + _remove_hypo(h_b, b_hypos) + score = float(torch.tensor(_get_hypo_score(h_b)).logaddexp(append_blank_score)) + else: + score = float(append_blank_score) + h_b = ( + _get_hypo_tokens(h_a), + _get_hypo_predictor_out(h_a), + _get_hypo_state(h_a), + score, + ) + b_hypos.append(h_b) + key_to_b_hypo[_get_hypo_key(h_b)] = h_b + _, sorted_idx = torch.tensor([_get_hypo_score(hypo) for hypo in b_hypos]).sort() + return [b_hypos[idx] for idx in sorted_idx] + + def _gen_a_hypos( + self, + a_hypos: List[Hypothesis], + b_hypos: List[Hypothesis], + next_token_probs: torch.Tensor, + t: int, + beam_width: int, + device: torch.device, + ) -> List[Hypothesis]: + ( + nonblank_nbest_scores, + nonblank_nbest_hypo_idx, + nonblank_nbest_token, + ) = _compute_updated_scores(a_hypos, next_token_probs, beam_width) + + if len(b_hypos) < beam_width: + b_nbest_score = -float("inf") + else: + b_nbest_score = _get_hypo_score(b_hypos[-beam_width]) + + base_hypos: List[Hypothesis] = [] + new_tokens: List[int] = [] + new_scores: List[float] = [] + for i in range(beam_width): + score = float(nonblank_nbest_scores[i]) + if score > b_nbest_score: + a_hypo_idx = int(nonblank_nbest_hypo_idx[i]) + base_hypos.append(a_hypos[a_hypo_idx]) + new_tokens.append(int(nonblank_nbest_token[i])) + new_scores.append(score) + + if base_hypos: + new_hypos = self._gen_new_hypos(base_hypos, new_tokens, new_scores, t, device) + else: + new_hypos: List[Hypothesis] = [] + + return new_hypos + + def _gen_new_hypos( + self, + base_hypos: List[Hypothesis], + tokens: List[int], + scores: List[float], + t: 
int, + device: torch.device, + ) -> List[Hypothesis]: + tgt_tokens = torch.tensor([[token] for token in tokens], device=device) + states = _batch_state(base_hypos) + pred_out, _, pred_states = self.model.predict( + tgt_tokens, + torch.tensor([1] * len(base_hypos), device=device), + states, ) + new_hypos: List[Hypothesis] = [] + for i, h_a in enumerate(base_hypos): + new_tokens = _get_hypo_tokens(h_a) + [tokens[i]] + new_hypos.append((new_tokens, pred_out[i].detach(), _slice_state(pred_states, i, device), scores[i])) + return new_hypos + + def _search( + self, + enc_out: torch.Tensor, + hypo: Optional[List[Hypothesis]], + beam_width: int, + ) -> List[Hypothesis]: + n_time_steps = enc_out.shape[1] + device = enc_out.device + + a_hypos: List[Hypothesis] = [] + b_hypos = self._init_b_hypos(device) if hypo is None else hypo + for t in range(n_time_steps): + a_hypos = b_hypos + b_hypos = torch.jit.annotate(List[Hypothesis], []) + key_to_b_hypo: Dict[str, Hypothesis] = {} + symbols_current_t = 0 + + while a_hypos: + next_token_probs = self._gen_next_token_probs(enc_out[:, t : t + 1], a_hypos, device) + next_token_probs = next_token_probs.cpu() + b_hypos = self._gen_b_hypos(b_hypos, a_hypos, next_token_probs, key_to_b_hypo) + + if symbols_current_t == self.step_max_tokens: + break + + a_hypos = self._gen_a_hypos( + a_hypos, + b_hypos, + next_token_probs, + t, + beam_width, + device, + ) + if a_hypos: + symbols_current_t += 1 - class LabelScorer(LabelScorerIntf): - """label scorer""" + _, sorted_idx = torch.tensor([self.hypo_sort_key(hyp) for hyp in b_hypos]).topk(beam_width) + b_hypos = [b_hypos[idx] for idx in sorted_idx] - def get_initial_state(self, *, batch_size: int, device: torch.device) -> Any: - """Initial state.""" - beam_dim = Dim(1, name="initial-beam") - batch_dims_ = [batch_dim, beam_dim] - decoder_state = model.decoder_default_initial_state( - batch_dims=batch_dims_, enc_spatial_dim=enc_spatial_dim - ) - if coverage_scale or neg_coverage_scale or 
always_add_scorers: - decoder_state["accum_att_weights"] = rf.zeros(batch_dims_) - return tree.map_structure( - functools.partial(self._map_tensor_to_raw, beam_dim=beam_dim), - decoder_state, - ) + return b_hypos - def max_remaining_seq_score( - self, *, state: Any, max_remaining_steps: torch.Tensor, device: torch.device - ) -> torch.Tensor: - """max remaining""" - return torch.zeros((1, 1), device=device) - - def score_and_update_state( - self, - *, - prev_state: Any, - prev_label: torch.Tensor, - ) -> Tuple[torch.Tensor, Any]: - """update state""" - nonlocal beam_dim - beam_dim = Dim(prev_label.shape[1], name="beam") - - def _map_raw_to_tensor(v): - if isinstance(v, StateObjTensorExt): - tensor: Tensor = v.extra - tensor = tensor.copy_template_new_dim_tags( - (batch_dim, beam_dim) + tensor.dims[2:], keep_special_axes=True - ) - tensor.raw_tensor = v.tensor - return tensor - elif isinstance(v, StateObjIgnored): - return v.content - else: - raise TypeError( - f"_map_raw_to_tensor: unexpected {v} ({type(v).__name__})" - ) - - prev_state = tree.map_structure(_map_raw_to_tensor, prev_state) - - nonlocal accum_att_weights, att_weights_dec_frame - accum_att_weights = prev_state["accum_att_weights"] - att_weights_dec_frame = rf.zeros(()) - logits, decoder_state = model.decoder( - rf.convert_to_tensor( - prev_label, dims=[batch_dim, beam_dim], sparse_dim=model.target_dim - ), - spatial_dim=single_step_dim, - encoder=enc, - state=prev_state, - ) - accum_att_weights += att_weights_dec_frame - if coverage_scale or neg_coverage_scale or always_add_scorers: - decoder_state["accum_att_weights"] = accum_att_weights - label_log_prob = rf.log_softmax(logits, axis=model.target_dim) - assert set(label_log_prob.dims) == {batch_dim, beam_dim, model.target_dim} - - return ( - self._map_tensor_to_raw(label_log_prob, beam_dim=beam_dim).tensor, - tree.map_structure( - functools.partial(self._map_tensor_to_raw, beam_dim=beam_dim), - decoder_state, - ), - ) + def forward(self, input: 
torch.Tensor, length: torch.Tensor, beam_width: int) -> List[Hypothesis]: + r"""Performs beam search for the given input sequence. - @staticmethod - def _map_tensor_to_raw(v, *, beam_dim: Dim): - if isinstance(v, Tensor): - if beam_dim not in v.dims: - return StateObjIgnored(v) - batch_dims_ = [batch_dim, beam_dim] - v = v.copy_transpose( - batch_dims_ + [dim for dim in v.dims if dim not in batch_dims_] - ) - raw = v.raw_tensor - return StateObjTensorExt(raw, v.copy_template()) - elif isinstance(v, Dim): - return StateObjIgnored(v) - else: - raise TypeError( - f"_map_tensor_to_raw: unexpected {v} ({type(v).__name__})" - ) + T: number of frames; + D: feature dimension of each frame. - class CoverageScorer(LabelScorerIntf): - """coverage + Args: + input (torch.Tensor): sequence of input frames, with shape (T, D) or (1, T, D). + length (torch.Tensor): number of valid frames in input + sequence, with shape () or (1,). + beam_width (int): beam size to use during search. - Google NMT: https://arxiv.org/pdf/1609.08144.pdf - Alternative: https://arxiv.org/abs/1612.02695 - Another alternative: https://arxiv.org/pdf/2105.00982.pdf + Returns: + List[Hypothesis]: top-``beam_width`` hypotheses found by beam search. """ - - def __init__(self, opts: Dict[str, Any]): - self.opts = opts - - def get_initial_state(self, *, batch_size: int, device: torch.device) -> Any: - """Initial state.""" - return {"prev_score": torch.zeros([batch_size, 1], device=device)} - - def score_and_update_state( - self, - *, - prev_state: Any, - prev_label: torch.Tensor, - ) -> Tuple[torch.Tensor, Any]: - """update state""" - prev_label # noqa # unused - # We assume the label scorer has run before us (make sure by right ordering). 
- accum_att_weights_ = accum_att_weights - assert set(accum_att_weights_.dims) == { - batch_dim, - beam_dim, - enc_spatial_dim, - } - cov_type = self.opts.get("type", "log1p") - if self.opts.get("rescale", False): - accum_att_weights_ /= rf.maximum( - rf.reduce_max(accum_att_weights_, axis=enc_spatial_dim), 1.0 - ) - if ( - cov_type == "log1p" - ): # log1p, to avoid having lots of negative numbers. So this starts more around 0.0. - coverage_score = rf.log1p(rf.minimum(accum_att_weights_, 1.0)) - elif ( - cov_type == "log" - ): # orig Google NMT: https://arxiv.org/pdf/1609.08144.pdf, but clipped - eps = self.opts.get("eps", 0.0) - clip_min = self.opts.get("clip_min", 0.01) - coverage_score = rf.log( - rf.clip_by_value(accum_att_weights_, clip_min, 1.0) + eps - ) - elif cov_type == "indicator": - threshold = self.opts.get("threshold", 0.5) - coverage_score = rf.where(accum_att_weights_ >= threshold, 1.0, 0.0) - elif cov_type == "relu_upper": - threshold = self.opts.get("threshold", 0.5) - coverage_score = rf.where( - accum_att_weights_ >= threshold, accum_att_weights_ - threshold, 0.0 - ) - else: - raise ValueError(f"invalid coverage opts type {cov_type!r}") - coverage_score = rf.reduce_sum(coverage_score, axis=enc_spatial_dim) - coverage_score_raw = coverage_score.copy_compatible_to_dims_raw( - (batch_dim, beam_dim) - ) - state = {"prev_score": coverage_score_raw} - return (coverage_score_raw - prev_state["prev_score"])[:, :, None], state - - class MonotonicityScorer(LabelScorerIntf): - """score monotonicity""" - - def get_initial_state(self, *, batch_size: int, device: torch.device) -> Any: - """Initial state.""" - return {"att_pos": torch.zeros([batch_size, 1], device=device)} - - def score_and_update_state( - self, - *, - prev_state: Any, - prev_label: torch.Tensor, - ) -> Tuple[torch.Tensor, Any]: - """update state""" - prev_label # noqa # unused - # We assume the label scorer has run before us (make sure by right ordering). 
- assert set(att_weights_dec_frame.dims) == { - batch_dim, - beam_dim, - enc_spatial_dim, - } - att_pos = rf.matmul( - att_weights_dec_frame, - rf.range_over_dim(enc_spatial_dim, dtype=att_weights_dec_frame.dtype), - reduce=enc_spatial_dim, - use_mask=False, # not needed, att weights already 0 outside - ) # [Batch,Beam] - att_pos_raw = att_pos.copy_compatible_to_dims_raw((batch_dim, beam_dim)) - delta_raw = prev_state["att_pos"] - att_pos_raw - threshold = monotonicity_opts.get("threshold", 1.0) - # Penalize when below threshold. The more it is below (or even negative), the more. - score_raw = torch.where( - delta_raw < threshold, delta_raw - threshold, 0.0 - ) # [Batch,Beam] - return score_raw[:, :, None], {"att_pos": att_pos_raw} - - # Note: insertion order matters here, we want that decoder is scored first. - res = {"decoder": (LabelScorer(), 1.0)} - if coverage_scale or always_add_scorers: - res["attention_coverage"] = ( - CoverageScorer(coverage_opts or {}), - coverage_scale, - ) - if neg_coverage_scale or (neg_coverage_opts and always_add_scorers): - # Idea: Too much attention on some frames (e.g. repetitions) is scored negatively. 
- res["attention_neg_coverage"] = ( - CoverageScorer(neg_coverage_opts or {}), - -neg_coverage_scale, - ) - if monotonicity_scale or always_add_scorers: - res["attention_monotonicity"] = (MonotonicityScorer(), monotonicity_scale) - return res \ No newline at end of file + if input.dim() != 2 and not (input.dim() == 3 and input.shape[0] == 1): + raise ValueError("input must be of shape (T, D) or (1, T, D)") + if input.dim() == 2: + input = input.unsqueeze(0) + + if length.shape != () and length.shape != (1,): + raise ValueError("length must be of shape () or (1,)") + if length.dim() == 0: + length = length.unsqueeze(0) + + enc_out, _ = self.model.transcribe(input, length) + return self._search(enc_out, None, beam_width) + + @torch.jit.export + def infer( + self, + input: torch.Tensor, + length: torch.Tensor, + beam_width: int, + state: Optional[List[List[torch.Tensor]]] = None, + hypothesis: Optional[List[Hypothesis]] = None, + ) -> Tuple[List[Hypothesis], List[List[torch.Tensor]]]: + r"""Performs beam search for the given input sequence in streaming mode. + + T: number of frames; + D: feature dimension of each frame. + + Args: + input (torch.Tensor): sequence of input frames, with shape (T, D) or (1, T, D). + length (torch.Tensor): number of valid frames in input + sequence, with shape () or (1,). + beam_width (int): beam size to use during search. + state (List[List[torch.Tensor]] or None, optional): list of lists of tensors + representing transcription network internal state generated in preceding + invocation. (Default: ``None``) + hypothesis (List[Hypothesis] or None): hypotheses from preceding invocation to seed + search with. (Default: ``None``) + + Returns: + (List[Hypothesis], List[List[torch.Tensor]]): + List[Hypothesis] + top-``beam_width`` hypotheses found by beam search. + List[List[torch.Tensor]] + list of lists of tensors representing transcription network + internal state generated in current invocation. 
+ """ + if input.dim() != 2 and not (input.dim() == 3 and input.shape[0] == 1): + raise ValueError("input must be of shape (T, D) or (1, T, D)") + if input.dim() == 2: + input = input.unsqueeze(0) + + if length.shape != () and length.shape != (1,): + raise ValueError("length must be of shape () or (1,)") + if length.dim() == 0: + length = length.unsqueeze(0) + + enc_out, _, state = self.model.transcribe_streaming(input, length, state) + return self._search(enc_out, hypothesis, beam_width), state From d6c45435fc88761dc09b3770db5760421a16ba55 Mon Sep 17 00:00:00 2001 From: Albert Zeyer Date: Tue, 4 Jun 2024 23:59:03 +0200 Subject: [PATCH 091/227] feature batch norm --- .../exp2024_04_23_baselines/ctc.py | 20 +++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py b/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py index 265ed9c17..1aa784b0a 100644 --- a/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py +++ b/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py @@ -149,6 +149,20 @@ def py(): train_vocab_opts={"other_opts": {"enable_sampling": True, "alpha": 0.7}}, ) + train_exp( + "v6-bhv20-11gb-f32-bs15k-accgrad1-mgpu4-pavg100-wd1e_2-lrlin1e_5_295k-featBN-speedpertV2-spm10k-spmSample07", + config_11gb_v6_f32_accgrad1_mgpu4_pavg100_wd1e_4, + model_config={"feature_batch_norm": True}, + config_updates={ + **_get_cfg_lrlin_oclr_by_bs_nep(15_000, 500), + "optimizer.weight_decay": 1e-2, + "__train_audio_preprocess": speed_pert_librosa_config, + "speed_pert_discrete_values": [0.7, 0.8, 0.9, 1.0, 1.1], + }, + vocab="spm10k", + train_vocab_opts={"other_opts": {"enable_sampling": True, "alpha": 0.7}}, + ) + # noinspection PyShadowingNames def train_exp( @@ -545,6 +559,10 @@ def __init__( vocab_labels, user_defined_symbols={model_recog.output_blank_label: blank_idx} ) + self.feature_batch_norm = None + if config.bool("feature_batch_norm", False): + self.feature_batch_norm = rf.BatchNorm(self.in_dim, 
affine=False, use_mask=True) + self._specaugment_opts = { "steps": config.typed_value("specaugment_steps") or (0, 1000, 2000), "max_consecutive_spatial_dims": config.typed_value("specaugment_max_consecutive_spatial_dims") or 20, @@ -574,6 +592,8 @@ def __call__( out_dim=self.in_dim, sampling_rate=16_000, ) + if self.feature_batch_norm: + source = self.feature_batch_norm(source) if self._mixup: source = self._mixup(source, spatial_dim=in_spatial_dim) # SpecAugment From e243cc23d2003c802ba7d244b31cd60ad40a23b4 Mon Sep 17 00:00:00 2001 From: Albert Zeyer Date: Wed, 5 Jun 2024 21:51:22 +0200 Subject: [PATCH 092/227] feature normalization --- .../experiments/exp2024_04_23_baselines/ctc.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py b/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py index 1aa784b0a..b254ab994 100644 --- a/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py +++ b/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py @@ -163,6 +163,20 @@ def py(): train_vocab_opts={"other_opts": {"enable_sampling": True, "alpha": 0.7}}, ) + train_exp( + "v6-bhv20-11gb-f32-bs15k-accgrad1-mgpu4-pavg100-wd1e_2-lrlin1e_5_295k-featNorm-speedpertV2-spm10k-spmSample07", + config_11gb_v6_f32_accgrad1_mgpu4_pavg100_wd1e_4, + model_config={"feature_norm": True}, + config_updates={ + **_get_cfg_lrlin_oclr_by_bs_nep(15_000, 500), + "optimizer.weight_decay": 1e-2, + "__train_audio_preprocess": speed_pert_librosa_config, + "speed_pert_discrete_values": [0.7, 0.8, 0.9, 1.0, 1.1], + }, + vocab="spm10k", + train_vocab_opts={"other_opts": {"enable_sampling": True, "alpha": 0.7}}, + ) + # noinspection PyShadowingNames def train_exp( @@ -562,6 +576,7 @@ def __init__( self.feature_batch_norm = None if config.bool("feature_batch_norm", False): self.feature_batch_norm = rf.BatchNorm(self.in_dim, affine=False, use_mask=True) + self.feature_norm = config.bool("feature_norm", False) self._specaugment_opts = { 
"steps": config.typed_value("specaugment_steps") or (0, 1000, 2000), @@ -594,6 +609,8 @@ def __call__( ) if self.feature_batch_norm: source = self.feature_batch_norm(source) + if self.feature_norm: + source = rf.normalize(source, axis=in_spatial_dim) if self._mixup: source = self._mixup(source, spatial_dim=in_spatial_dim) # SpecAugment From e93e1719a4e6ea8b19816023ff44ce99449b4f5a Mon Sep 17 00:00:00 2001 From: Albert Zeyer Date: Wed, 5 Jun 2024 22:08:22 +0200 Subject: [PATCH 093/227] recog fix API doc --- users/zeyer/model_interfaces/recog.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/users/zeyer/model_interfaces/recog.py b/users/zeyer/model_interfaces/recog.py index 5847c3076..49a09ff27 100644 --- a/users/zeyer/model_interfaces/recog.py +++ b/users/zeyer/model_interfaces/recog.py @@ -4,7 +4,7 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Optional, Protocol +from typing import TYPE_CHECKING, Optional, Protocol, Tuple if TYPE_CHECKING: from returnn.tensor import Tensor, Dim @@ -24,13 +24,17 @@ def __call__( model: ModelT, data: Tensor, data_spatial_dim: Dim, - ) -> Tensor: + ) -> Tuple[Tensor, Tensor, Dim, Dim]: """ - :return: recog output, including beam or not, depending on output_with_beam + :return: + recog results including beam {batch, beam, out_spatial}, + log probs {batch, beam}, + out_spatial_dim, + final beam_dim """ raise NotImplementedError - output_with_beam: bool = True + output_with_beam: bool = True # False not really supported... 
output_blank_label: Optional[str] = None # A batched beam search can be dependent on the batch size, From 0804c99f2d46be22e614c3f202da6abefc78ea84 Mon Sep 17 00:00:00 2001 From: Albert Zeyer Date: Wed, 5 Jun 2024 23:33:24 +0200 Subject: [PATCH 094/227] collect stats, initial code --- users/zeyer/collect_model_dataset_stats.py | 289 +++++++++++++++++++++ users/zeyer/model_interfaces/__init__.py | 1 + users/zeyer/model_interfaces/forward.py | 22 ++ 3 files changed, 312 insertions(+) create mode 100644 users/zeyer/collect_model_dataset_stats.py create mode 100644 users/zeyer/model_interfaces/forward.py diff --git a/users/zeyer/collect_model_dataset_stats.py b/users/zeyer/collect_model_dataset_stats.py new file mode 100644 index 000000000..095bdcca2 --- /dev/null +++ b/users/zeyer/collect_model_dataset_stats.py @@ -0,0 +1,289 @@ +""" +Calculate statistics over any dataset, e.g. feature statistics. +Can also perform any computation on the dataset, e.g. forward pass through a model. +Thus, this can also be used to calculate prior statistics for a model. 
+""" + +from __future__ import annotations + +from typing import TYPE_CHECKING, Optional, Union, Any, Dict +from dataclasses import dataclass + +from sisyphus import tk +from i6_core.util import instanciate_delayed + +from i6_core.returnn import ReturnnConfig +from i6_core.returnn.forward import ReturnnForwardJobV2 +from returnn_common import nn +from returnn_common.datasets_old_2022_10.interface import DatasetConfig +from i6_experiments.common.setups import serialization +from i6_experiments.users.zeyer.utils.serialization import get_import_py_code + +from i6_experiments.users.zeyer import tools_paths +from i6_experiments.users.zeyer.model_interfaces import ModelDef, ModelDefWithCfg, ForwardDef, serialize_model_def +from i6_experiments.users.zeyer.model_with_checkpoints import ModelWithCheckpoint + +if TYPE_CHECKING: + from returnn.tensor import TensorDict + + +def collect_statistics( + *, + dataset: DatasetConfig, + model: Optional[ModelWithCheckpoint] = None, + forward_def: ForwardDef, + config: Optional[Dict[str, Any]] = None, + forward_post_config: Optional[Dict[str, Any]] = None, + forward_mem_rqmt: Union[int, float] = 6, + forward_rqmt: Optional[Dict[str, Any]] = None, + forward_alias_name: Optional[str] = None, +) -> StatisticsOutput: + """ + recog on the specific dataset + """ + env_updates = None + if (config and config.get("__env_updates")) or (forward_post_config and forward_post_config.get("__env_updates")): + env_updates = (config and config.pop("__env_updates", None)) or ( + forward_post_config and forward_post_config.pop("__env_updates", None) + ) + out_files = { + "mean": _prior_mean_out_filename, + "std_dev": _prior_std_dev_out_filename, + "min": _prior_min_out_filename, + "max": _prior_max_out_filename, + "info": _prior_info_out_filename, + } + forward_job = ReturnnForwardJobV2( + model_checkpoint=model.checkpoint if model else None, + returnn_config=_collect_stats_returnn_forward_config( + dataset, model.definition if model else None, 
forward_def, config=config, post_config=forward_post_config + ), + output_files=list(out_files.values()), + returnn_python_exe=tools_paths.get_returnn_python_exe(), + returnn_root=tools_paths.get_returnn_root(), + mem_rqmt=forward_mem_rqmt, + ) + if forward_rqmt: + forward_job.rqmt.update(forward_rqmt) + if env_updates: + for k, v in env_updates.items(): + forward_job.set_env(k, v) + if forward_alias_name: + forward_job.add_alias(forward_alias_name) + return StatisticsOutput(**forward_job.out_files) + + +@dataclass +class StatisticsOutput: + """statistics""" + + mean: tk.Path + std_dev: tk.Path + min: tk.Path + max: tk.Path + info: tk.Path + + +_prior_mean_out_filename = "stats.mean.txt" +_prior_std_dev_out_filename = "stats.std_dev.txt" +_prior_min_out_filename = "stats.min.txt" +_prior_max_out_filename = "stats.max.txt" +_prior_info_out_filename = "stats.info.txt" + + +def _returnn_get_forward_callback(): + from returnn.tensor import Tensor, TensorDict + from returnn.forward_iface import ForwardCallbackIface + from returnn.util.basic import Stats + + class _ReturnnCollectStatsForwardCallbackIface(ForwardCallbackIface): + def __init__(self): + self.stats: Optional[Stats] = None + + def init(self, *, model): + self.stats = Stats() + + def process_seq(self, *, seq_tag: str, outputs: TensorDict): + # see _returnn_forward_step + out: Tensor = outputs["output"].copy_with_feature_last() + assert out.batch_ndim == 2 # (time,feature) + self.stats.collect(out.raw_tensor) + + def finish(self): + self.stats.dump("stats") + + return _ReturnnCollectStatsForwardCallbackIface() + + +# Those are applied for both training, recog and potential others. +# The values are only used if they are neither set in config nor post_config already. +# They should also not infer with other things from the epilog. +SharedPostConfig = { + # In case pretraining overwrites some of these, they need a default. 
+ "accum_grad_multiple_step": None, + "use_last_best_model": None, +} + + +def _collect_stats_returnn_forward_config( + dataset: DatasetConfig, + model_def: Union[None, ModelDef, ModelDefWithCfg], + forward_def: ForwardDef, + *, + config: Optional[Dict[str, Any]] = None, + post_config: Optional[Dict[str, Any]] = None, +) -> ReturnnConfig: + """ + Create config for collecting stats. + + TODO should use sth like unhashed_package_root (https://github.com/rwth-i6/i6_experiments/pull/157) + """ + from i6_experiments.common.setups.returnn.serialization import get_serializable_config + + returnn_recog_config_dict = dict( + # dataset + default_input=dataset.get_default_input(), + target=dataset.get_default_target(), # only for get_model with model_def + forward_data=dataset.get_main_dataset(), + ) + if model_def: + returnn_recog_config_dict.update( + dict( + backend=model_def.backend, + behavior_version=model_def.behavior_version, + ) + ) + else: + assert config and config.get("backend") and config.get("behavior_version") + if config: + returnn_recog_config_dict.update(config) + if isinstance(model_def, ModelDefWithCfg): + returnn_recog_config_dict.update(model_def.config) + + extern_data_raw = dataset.get_extern_data() + # The extern_data is anyway not hashed, so we can also instanciate any delayed objects here. + # It's not hashed because we assume that all aspects of the dataset are already covered + # by the datasets itself as part in the config above. 
+ extern_data_raw = instanciate_delayed(extern_data_raw) + + returnn_forward_config = ReturnnConfig( + config=returnn_recog_config_dict, + python_epilog=[ + serialization.Collection( + [ + serialization.NonhashedCode(get_import_py_code()), + serialization.NonhashedCode( + nn.ReturnnConfigSerializer.get_base_extern_data_py_code_str_direct(extern_data_raw) + ), + *( + serialize_model_def(model_def) + if model_def + else [serialization.NonhashedCode("_model_def = None\n")] + ), + serialization.Import(_returnn_get_model, import_as="get_model"), + serialization.Import(forward_def, import_as="_forward_def", ignore_import_as_for_hash=True), + serialization.Import(_returnn_forward_step, import_as="forward_step"), + serialization.Import(_returnn_get_forward_callback, import_as="forward_callback"), + serialization.ExplicitHash( + { + # Increase the version whenever some incompatible change is made in this recog() function, + # which influences the outcome, but would otherwise not influence the hash. + "version": 2, + } + ), + serialization.PythonEnlargeStackWorkaroundNonhashedCode, + serialization.PythonCacheManagerFunctionNonhashedCode, + serialization.PythonModelineNonhashedCode, + ] + ) + ], + post_config=dict( # not hashed + log_batch_size=True, + # debug_add_check_numerics_ops = True + # debug_add_check_numerics_on_output = True + # flat_net_construction=True, + torch_log_memory_usage=True, + watch_memory=True, + use_lovely_tensors=True, + ), + sort_config=False, + ) + + # There might be some further functions in the config, e.g. some dataset postprocessing. + returnn_forward_config = get_serializable_config( + returnn_forward_config, + # The only dim tags we directly have in the config are via extern_data, maybe also model_outputs. + # All other dim tags are inside functions such as get_model or train_step, + # so we do not need to care about them here, only about the serialization of those functions. + # Those dim tags and those functions are already handled above. 
+ serialize_dim_tags=False, + ) + + batch_size_dependent = False + if "__batch_size_dependent" in returnn_forward_config.config: + batch_size_dependent = returnn_forward_config.config.pop("__batch_size_dependent") + if "__batch_size_dependent" in returnn_forward_config.post_config: + batch_size_dependent = returnn_forward_config.post_config.pop("__batch_size_dependent") + for k, v in dict( + batching="sorted", + batch_size=20000 * model_def.batch_size_factor, + max_seqs=200, + ).items(): + if k in returnn_forward_config.config: + v = returnn_forward_config.config.pop(k) + if k in returnn_forward_config.post_config: + v = returnn_forward_config.post_config.pop(k) + (returnn_forward_config.config if batch_size_dependent else returnn_forward_config.post_config)[k] = v + + if post_config: + returnn_forward_config.post_config.update(post_config) + + for k, v in SharedPostConfig.items(): + if k in returnn_forward_config.config or k in returnn_forward_config.post_config: + continue + returnn_forward_config.post_config[k] = v + + return returnn_forward_config + + +def _returnn_get_model(*, epoch: int, **_kwargs_unused): + from returnn.tensor import Tensor + from returnn.config import get_global_config + import returnn.frontend as rf + + config = get_global_config() + model_def = config.typed_value("_model_def") + if model_def is None: + return rf.Module() # empty dummy module + + default_input_key = config.typed_value("default_input") + default_target_key = config.typed_value("target") + extern_data_dict = config.typed_value("extern_data") + data = Tensor(name=default_input_key, **extern_data_dict[default_input_key]) + targets = Tensor(name=default_target_key, **extern_data_dict[default_target_key]) + assert targets.sparse_dim and targets.sparse_dim.vocab, f"no vocab for {targets}" + + model = model_def(epoch=epoch, in_dim=data.feature_dim, target_dim=targets.sparse_dim) + return model + + +def _returnn_forward_step(*, model, extern_data: TensorDict, **_kwargs_unused): + 
import returnn.frontend as rf + from returnn.tensor import Tensor, Dim, batch_dim + from returnn.config import get_global_config + + if rf.is_executing_eagerly(): + batch_size = int(batch_dim.get_dim_value()) + for batch_idx in range(batch_size): + seq_tag = extern_data["seq_tag"].raw_tensor[batch_idx].item() + print(f"batch {batch_idx+1}/{batch_size} seq_tag: {seq_tag!r}") + + config = get_global_config() + default_input_key = config.typed_value("default_input") + data = extern_data[default_input_key] + data_spatial_dim = data.get_time_dim_tag() + forward_def: ForwardDef = config.typed_value("_forward_def") + out, out_spatial_dim = forward_def(data, in_spatial_dim=data_spatial_dim, model=model) + assert isinstance(out, Tensor) and isinstance(out_spatial_dim, Dim) + assert out.feature_dim # we expect a feature dim + rf.get_run_ctx().mark_as_output(out, "output", dims=[batch_dim, out_spatial_dim, out.feature_dim]) diff --git a/users/zeyer/model_interfaces/__init__.py b/users/zeyer/model_interfaces/__init__.py index cd530259e..2fb721309 100644 --- a/users/zeyer/model_interfaces/__init__.py +++ b/users/zeyer/model_interfaces/__init__.py @@ -6,4 +6,5 @@ from .model import * from .model_with_checkpoints import * from .recog import * +from .forward import * from .training import * diff --git a/users/zeyer/model_interfaces/forward.py b/users/zeyer/model_interfaces/forward.py new file mode 100644 index 000000000..917539137 --- /dev/null +++ b/users/zeyer/model_interfaces/forward.py @@ -0,0 +1,22 @@ +""" +Any forwarding +""" + +from __future__ import annotations + +from typing import TYPE_CHECKING, Protocol, Tuple + +if TYPE_CHECKING: + from returnn.tensor import Tensor, Dim + +from .model import ModelT + + +class ForwardDef(Protocol[ModelT]): + def __call__(self, source: Tensor, /, in_spatial_dim: Dim, model: ModelT) -> Tuple[Tensor, Dim]: + """ + :param source: input tensor + :param in_spatial_dim: input spatial dimension + :param model: + :return: output tensor and 
output spatial dimension (can be same as in_spatial_dim) + """ From 68479b7991794929f3e72c7fac511231d4d552a6 Mon Sep 17 00:00:00 2001 From: Albert Zeyer Date: Wed, 5 Jun 2024 23:59:31 +0200 Subject: [PATCH 095/227] librispeech feature stats --- users/zeyer/datasets/librispeech.py | 55 ++++++++++++++++++++++++++++- 1 file changed, 54 insertions(+), 1 deletion(-) diff --git a/users/zeyer/datasets/librispeech.py b/users/zeyer/datasets/librispeech.py index 020e285fe..d511e8ade 100644 --- a/users/zeyer/datasets/librispeech.py +++ b/users/zeyer/datasets/librispeech.py @@ -3,7 +3,7 @@ """ from __future__ import annotations -from typing import Optional, Any, Union, Tuple, Dict +from typing import TYPE_CHECKING, Optional, Any, Union, Tuple, Dict from copy import deepcopy import re from functools import cache @@ -25,6 +25,10 @@ from .utils.bpe import Bpe from .utils.spm import SentencePieceModel +if TYPE_CHECKING: + from returnn.tensor import Tensor, Dim + from i6_experiments.users.zeyer.collect_model_dataset_stats import StatisticsOutput + librispeech_ogg_zip_dict = librispeech.get_ogg_zip_dict() @@ -681,6 +685,55 @@ def _score_recog_out_v2(dataset: DatasetConfig, recog_output: RecogOutput) -> Sc return ScoreResult(dataset_name=corpus_name, main_measure_value=score_job.out_wer, report=score_job.out_report_dir) +def get_librispeech_raw_audio_only(*, main_key: str = "train") -> LibrispeechOggZip: + """librispeech with raw audio""" + return LibrispeechOggZip(audio=_raw_audio_opts, audio_dim=1, main_key=main_key) + + +def get_librispeech_log_mel_stats(dim: int, **kwargs) -> StatisticsOutput: + """ + Get feature stats + + :param dim: feature dim + :param kwargs: all passed to rf.audio.log_mel_filterbank_from_raw. + Default sampling_rate is 16_000, which is exactly also what we have for Librispeech usually. + Note on log_base: Default is 10.0. 
+ Note that in some earlier setups, and also Mohammads original AED setup, + we used log_base=math.exp(2.3026), which is almost 10.0 but not exactly... + """ + from i6_experiments.users.zeyer.collect_model_dataset_stats import collect_statistics + + return collect_statistics( + dataset=get_librispeech_raw_audio_only(), + forward_def=_librispeech_log_mel_stats_returnn_forward, + config={ + "_audio_feature_dim": dim, + "_audio_feature_opts": kwargs, + }, + ) + + +def _librispeech_log_mel_stats_returnn_forward( + source: Tensor, /, in_spatial_dim: Dim, model: Any +) -> Tuple[Tensor, Dim]: + from returnn.config import get_global_config + import returnn.frontend as rf + from returnn.tensor import Dim + + model # noqa # unused + config = get_global_config() + feat_dim = config.int("_audio_feature_dim", -1) + assert feat_dim > 0 + feat_dim = Dim(feat_dim, name="audio", kind=Dim.Types.Feature) + opts = config.typed_value("_audio_feature_opts", None) + assert isinstance(opts, dict) + + source, out_spatial_dim = rf.audio.log_mel_filterbank_from_raw( + source, in_spatial_dim=in_spatial_dim, out_dim=feat_dim, **opts + ) + return source, out_spatial_dim + + def tests(): from sisyphus.hash import sis_hash_helper From b449d148d51a8242a5e7c359b692f713810b4838 Mon Sep 17 00:00:00 2001 From: Albert Zeyer Date: Thu, 6 Jun 2024 00:10:13 +0200 Subject: [PATCH 096/227] feature global norm --- .../exp2024_04_23_baselines/ctc.py | 24 +++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py b/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py index b254ab994..74ec1c794 100644 --- a/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py +++ b/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py @@ -177,6 +177,23 @@ def py(): train_vocab_opts={"other_opts": {"enable_sampling": True, "alpha": 0.7}}, ) + from i6_experiments.users.zeyer.datasets.librispeech import get_librispeech_log_mel_stats + + feature_stats = 
get_librispeech_log_mel_stats(_log_mel_feature_dim) + train_exp( + "v6-bhv20-11gb-f32-bs15k-accgrad1-mgpu4-pavg100-wd1e_2-lrlin1e_5_295k-featGN-speedpertV2-spm10k-spmSample07", + config_11gb_v6_f32_accgrad1_mgpu4_pavg100_wd1e_4, + model_config={"feature_stats": {"mean": feature_stats.mean, "std_dev": feature_stats.std_dev}}, + config_updates={ + **_get_cfg_lrlin_oclr_by_bs_nep(15_000, 500), + "optimizer.weight_decay": 1e-2, + "__train_audio_preprocess": speed_pert_librosa_config, + "speed_pert_discrete_values": [0.7, 0.8, 0.9, 1.0, 1.1], + }, + vocab="spm10k", + train_vocab_opts={"other_opts": {"enable_sampling": True, "alpha": 0.7}}, + ) + # noinspection PyShadowingNames def train_exp( @@ -577,6 +594,11 @@ def __init__( if config.bool("feature_batch_norm", False): self.feature_batch_norm = rf.BatchNorm(self.in_dim, affine=False, use_mask=True) self.feature_norm = config.bool("feature_norm", False) + self.feature_stats = None + feature_stats = config.typed_value("feature_stats") + if feature_stats: + assert isinstance(feature_stats, dict) + self.feature_stats = {k: rf.constant(v, dims=[self.in_dim]) for k, v in feature_stats.items()} self._specaugment_opts = { "steps": config.typed_value("specaugment_steps") or (0, 1000, 2000), @@ -611,6 +633,8 @@ def __call__( source = self.feature_batch_norm(source) if self.feature_norm: source = rf.normalize(source, axis=in_spatial_dim) + if self.feature_stats: + source = (source - self.feature_stats["mean"]) / self.feature_stats["std_dev"] if self._mixup: source = self._mixup(source, spatial_dim=in_spatial_dim) # SpecAugment From f140044ecc7e509f2a866fa076d8e33dfed75a1f Mon Sep 17 00:00:00 2001 From: Albert Zeyer Date: Thu, 6 Jun 2024 00:46:18 +0200 Subject: [PATCH 097/227] small fixes --- users/zeyer/collect_model_dataset_stats.py | 55 ++++++++++++++++++++-- users/zeyer/datasets/librispeech.py | 15 ++---- 2 files changed, 56 insertions(+), 14 deletions(-) diff --git a/users/zeyer/collect_model_dataset_stats.py 
b/users/zeyer/collect_model_dataset_stats.py index 095bdcca2..c65e13c4a 100644 --- a/users/zeyer/collect_model_dataset_stats.py +++ b/users/zeyer/collect_model_dataset_stats.py @@ -6,7 +6,7 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Optional, Union, Any, Dict +from typing import TYPE_CHECKING, Optional, Union, Any, Dict, Tuple from dataclasses import dataclass from sisyphus import tk @@ -24,7 +24,7 @@ from i6_experiments.users.zeyer.model_with_checkpoints import ModelWithCheckpoint if TYPE_CHECKING: - from returnn.tensor import TensorDict + from returnn.tensor import Tensor, Dim, TensorDict def collect_statistics( @@ -70,7 +70,54 @@ def collect_statistics( forward_job.set_env(k, v) if forward_alias_name: forward_job.add_alias(forward_alias_name) - return StatisticsOutput(**forward_job.out_files) + return StatisticsOutput(**{k: forward_job.out_files[v] for k, v in out_files.items()}) + + +def collect_log_mel_feature_statistics( + *, dataset: DatasetConfig, dim: int, backend: str = "torch", behavior_version: int = 21, **kwargs +): + """ + Get feature stats + + :param dataset: + :param dim: log mel feature dim + :param backend: + :param behavior_version: + :param kwargs: all passed to rf.audio.log_mel_filterbank_from_raw. + Default sampling_rate is 16_000, which is also what we have for Librispeech usually. + Note on log_base: Default is 10.0. + Note that in some earlier setups, and also Mohammads original AED setup, + we used log_base=math.exp(2.3026), which is almost 10.0 but not exactly... 
+ """ + return collect_statistics( + dataset=dataset, + forward_def=_log_mel_stats_returnn_forward, + config={ + "backend": backend, + "behavior_version": behavior_version, + "_audio_feature_dim": dim, + "_audio_feature_opts": kwargs, + }, + ) + + +def _log_mel_stats_returnn_forward(source: Tensor, /, in_spatial_dim: Dim, model: Any) -> Tuple[Tensor, Dim]: + from returnn.config import get_global_config + import returnn.frontend as rf + from returnn.tensor import Dim + + model # noqa # unused + config = get_global_config() + feat_dim = config.int("_audio_feature_dim", -1) + assert feat_dim > 0 + feat_dim = Dim(feat_dim, name="audio", kind=Dim.Types.Feature) + opts = config.typed_value("_audio_feature_opts", None) + assert isinstance(opts, dict) + + source, out_spatial_dim = rf.audio.log_mel_filterbank_from_raw( + source, in_spatial_dim=in_spatial_dim, out_dim=feat_dim, **opts + ) + return source, out_spatial_dim @dataclass @@ -226,7 +273,7 @@ def _collect_stats_returnn_forward_config( batch_size_dependent = returnn_forward_config.post_config.pop("__batch_size_dependent") for k, v in dict( batching="sorted", - batch_size=20000 * model_def.batch_size_factor, + batch_size=(20000 * model_def.batch_size_factor) if model_def else (20000 * 160), max_seqs=200, ).items(): if k in returnn_forward_config.config: diff --git a/users/zeyer/datasets/librispeech.py b/users/zeyer/datasets/librispeech.py index d511e8ade..630fc3fbe 100644 --- a/users/zeyer/datasets/librispeech.py +++ b/users/zeyer/datasets/librispeech.py @@ -340,6 +340,8 @@ def get_dataset(self, key: str, *, training: bool = False, subset: Optional[int] eos_id = vocab.get_eos_idx() assert eos_id is not None, f"{self}: vocab {vocab} does not define EOS" d["targets"]["seq_postfix"] = [eos_id] + else: + d["targets"] = None if training: d["partition_epoch"] = self.train_epoch_split if self.train_epoch_wise_filter is not None: @@ -701,16 +703,9 @@ def get_librispeech_log_mel_stats(dim: int, **kwargs) -> StatisticsOutput: 
Note that in some earlier setups, and also Mohammads original AED setup, we used log_base=math.exp(2.3026), which is almost 10.0 but not exactly... """ - from i6_experiments.users.zeyer.collect_model_dataset_stats import collect_statistics - - return collect_statistics( - dataset=get_librispeech_raw_audio_only(), - forward_def=_librispeech_log_mel_stats_returnn_forward, - config={ - "_audio_feature_dim": dim, - "_audio_feature_opts": kwargs, - }, - ) + from i6_experiments.users.zeyer.collect_model_dataset_stats import collect_log_mel_feature_statistics + + return collect_log_mel_feature_statistics(dataset=get_librispeech_raw_audio_only(), dim=dim, **kwargs) def _librispeech_log_mel_stats_returnn_forward( From 950e5788b346a9c0b929e0599c1fca0be45b7149 Mon Sep 17 00:00:00 2001 From: Albert Zeyer Date: Thu, 6 Jun 2024 00:48:07 +0200 Subject: [PATCH 098/227] small fix --- users/zeyer/experiments/exp2024_04_23_baselines/ctc.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py b/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py index 74ec1c794..70ed2178f 100644 --- a/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py +++ b/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py @@ -535,6 +535,7 @@ def __init__( ): super(Model, self).__init__() + import numpy from returnn.config import get_global_config config = get_global_config(return_empty_if_none=True) @@ -598,7 +599,9 @@ def __init__( feature_stats = config.typed_value("feature_stats") if feature_stats: assert isinstance(feature_stats, dict) - self.feature_stats = {k: rf.constant(v, dims=[self.in_dim]) for k, v in feature_stats.items()} + self.feature_stats = { + k: rf.constant(numpy.loadtxt(v), dims=[self.in_dim]) for k, v in feature_stats.items() + } self._specaugment_opts = { "steps": config.typed_value("specaugment_steps") or (0, 1000, 2000), From e666f61e17da159bd0146c3044cf0ae64672c80b Mon Sep 17 00:00:00 2001 From: Albert Zeyer 
Date: Thu, 6 Jun 2024 07:22:49 +0200 Subject: [PATCH 099/227] small fix --- users/zeyer/experiments/exp2024_04_23_baselines/ctc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py b/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py index 70ed2178f..8cbc86a79 100644 --- a/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py +++ b/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py @@ -600,7 +600,7 @@ def __init__( if feature_stats: assert isinstance(feature_stats, dict) self.feature_stats = { - k: rf.constant(numpy.loadtxt(v), dims=[self.in_dim]) for k, v in feature_stats.items() + k: rf.convert_to_tensor(numpy.loadtxt(v), dims=[self.in_dim]) for k, v in feature_stats.items() } self._specaugment_opts = { From e0fc149add997671af4593cab5ffe25d53a344d9 Mon Sep 17 00:00:00 2001 From: Albert Zeyer Date: Thu, 6 Jun 2024 09:28:54 +0200 Subject: [PATCH 100/227] small fix --- .../experiments/exp2024_04_23_baselines/ctc.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py b/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py index 8cbc86a79..450331bf0 100644 --- a/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py +++ b/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py @@ -599,9 +599,15 @@ def __init__( feature_stats = config.typed_value("feature_stats") if feature_stats: assert isinstance(feature_stats, dict) - self.feature_stats = { - k: rf.convert_to_tensor(numpy.loadtxt(v), dims=[self.in_dim]) for k, v in feature_stats.items() - } + self.feature_stats = rf.ParameterList( + { + k: rf.Parameter( + rf.convert_to_tensor(numpy.loadtxt(v), dims=[self.in_dim], dtype=rf.get_default_float_dtype()), + auxiliary=True, + ) + for k, v in feature_stats.items() + } + ) self._specaugment_opts = { "steps": config.typed_value("specaugment_steps") or (0, 1000, 2000), @@ -637,7 +643,7 @@ def __call__( if 
self.feature_norm: source = rf.normalize(source, axis=in_spatial_dim) if self.feature_stats: - source = (source - self.feature_stats["mean"]) / self.feature_stats["std_dev"] + source = (source - self.feature_stats.mean) / self.feature_stats.std_dev if self._mixup: source = self._mixup(source, spatial_dim=in_spatial_dim) # SpecAugment From b7480b8a44d11004587b7da2fbcce5d616874665 Mon Sep 17 00:00:00 2001 From: "luca.gaudino" Date: Thu, 6 Jun 2024 10:07:24 +0200 Subject: [PATCH 101/227] fix feat norm search --- .../conformer_import_moh_att_2023_06_30.py | 2 +- .../librispeech_960/conformer_ctc_train.py | 47 +++++++++++++--- .../tedlium2/conformer_rnnt_train.py | 54 ++++++++++++++----- .../asr/rf/conformer_rnnt/model_recog_rnnt.py | 51 ++++++++++-------- 4 files changed, 111 insertions(+), 43 deletions(-) diff --git a/users/gaudino/experiments/rf_conformer_att_2023/librispeech_960/conformer_import_moh_att_2023_06_30.py b/users/gaudino/experiments/rf_conformer_att_2023/librispeech_960/conformer_import_moh_att_2023_06_30.py index 52193c721..4b0b70543 100644 --- a/users/gaudino/experiments/rf_conformer_att_2023/librispeech_960/conformer_import_moh_att_2023_06_30.py +++ b/users/gaudino/experiments/rf_conformer_att_2023/librispeech_960/conformer_import_moh_att_2023_06_30.py @@ -639,7 +639,7 @@ def sis_run_with_prefix(prefix_name: str = None): # opls att + ctc + trafo lm + ilm for scales, prior_scale, lm_scale, ilm_scale, beam_size in product( - [(0.8, 0.2)], [0.05, 0.03, 0.04, 0.06, 0.07], [0.65], [0.4], [32] + [(0.8, 0.2)], [0.05, 0.07], [0.65], [0.4], [32, 48, 64] ): att_scale, ctc_scale = scales recog_name = ( diff --git a/users/gaudino/experiments/rf_conformer_rnnt_2024/librispeech_960/conformer_ctc_train.py b/users/gaudino/experiments/rf_conformer_rnnt_2024/librispeech_960/conformer_ctc_train.py index 53e2e77ee..66fe36592 100644 --- a/users/gaudino/experiments/rf_conformer_rnnt_2024/librispeech_960/conformer_ctc_train.py +++ 
b/users/gaudino/experiments/rf_conformer_rnnt_2024/librispeech_960/conformer_ctc_train.py @@ -10,6 +10,7 @@ import hashlib import contextlib import functools +from sisyphus import tk from returnn.tensor import Tensor, Dim, single_step_dim import returnn.frontend as rf @@ -32,10 +33,10 @@ if TYPE_CHECKING: from i6_experiments.users.zeyer.model_interfaces import ModelDef, RecogDef, TrainDef - from i6_experiments.users.zeyer.model_with_checkpoints import ( - ModelWithCheckpoints, - ModelWithCheckpoint, - ) +from i6_experiments.users.zeyer.model_with_checkpoints import ( + ModelWithCheckpoints, + ModelWithCheckpoint, +) # From Mohammad, 2023-06-29 # dev-clean 2.27 @@ -99,6 +100,9 @@ def sis_run_with_prefix(prefix_name: Optional[str] = None): """run the exp""" + + from i6_core.returnn.training import PtCheckpoint + _sis_setup_global_prefix(prefix_name) # Moh: dev-clean 2.27, dev-other 5.39, test-clean 2.41, test-other 5.51 @@ -158,9 +162,12 @@ def sis_run_with_prefix(prefix_name: Optional[str] = None): "aux_loss_layers":[], "mel_normalization_ted2": False, }, + search_config={ + "mel_normalization_ted2": False, + }, ) - train_exp( # dev-other 7.17 + train_exp( # dev-other 6.92 "base-24gb-lrlin1e_5_600k_ctc_only_aux4_8_no_mel_norm", config_24gb_v6, config_updates={ @@ -171,7 +178,34 @@ def sis_run_with_prefix(prefix_name: Optional[str] = None): "learning_rate_piecewise_values": [1e-5, 1e-3, 1e-5, 1e-6], "mel_normalization_ted2": False, }, + search_config = { + "mel_normalization_ted2": False, + }, + ) + + _torch_ckpt_path = "/u/luca.gaudino/setups/2023-08-10--rf-librispeech/work/i6_core/returnn/training/ReturnnTrainingJob.AWwVft0oGy8e/output/models/epoch.1981.pt" + + new_ckpt_path = tk.Path( + _torch_ckpt_path, + hash_overwrite= "ctc" + "_torch_ckpt", ) + new_ckpt = PtCheckpoint(new_ckpt_path) + + recog_config = { + "mel_normalization_ted2": False, + } + + # recog ctc only model + _recog( + 
"model_recogs/base-24gb-lrlin1e_5_600k_ctc_only_aux4_8_no_mel_norm/ep1981/ctc_greedy/recog_results", + ModelWithCheckpoint( + definition=from_scratch_model_def, checkpoint=new_ckpt + ), + model_recog, + recog_config=recog_config, + ) + + _sis_prefix: Optional[str] = None @@ -227,6 +261,7 @@ def train_exp( fine_tune: Optional[Union[int, List[Tuple[int, Dict[str, Any]]]]] = None, time_rqmt: Optional[int] = None, model_avg: bool = False, + search_config: Optional[Dict[str, Any]] = None, ) -> ModelWithCheckpoints: """ Train experiment @@ -264,7 +299,7 @@ def train_exp( time_rqmt=time_rqmt, ) recog_training_exp( - prefix, task, model_with_checkpoint, recog_def=model_recog, model_avg=model_avg + prefix, task, model_with_checkpoint, recog_def=model_recog, model_avg=model_avg, search_config=search_config ) if fine_tune: diff --git a/users/gaudino/experiments/rf_conformer_rnnt_2024/tedlium2/conformer_rnnt_train.py b/users/gaudino/experiments/rf_conformer_rnnt_2024/tedlium2/conformer_rnnt_train.py index 4a669a70b..d075dec27 100644 --- a/users/gaudino/experiments/rf_conformer_rnnt_2024/tedlium2/conformer_rnnt_train.py +++ b/users/gaudino/experiments/rf_conformer_rnnt_2024/tedlium2/conformer_rnnt_train.py @@ -11,6 +11,7 @@ import hashlib import contextlib import functools +from sisyphus import tk from returnn.tensor import Tensor, Dim, single_step_dim import returnn.frontend as rf @@ -30,10 +31,11 @@ if TYPE_CHECKING: from i6_experiments.users.gaudino.model_interfaces import ModelDef, RecogDef, TrainDef - from i6_experiments.users.gaudino.model_with_checkpoints import ( - ModelWithCheckpoints, - ModelWithCheckpoint, - ) + +from i6_experiments.users.gaudino.model_with_checkpoints import ( + ModelWithCheckpoints, + ModelWithCheckpoint, +) from i6_experiments.users.gaudino.models.asr.rf.conformer_rnnt.model_conformer_rnnt import from_scratch_model_def, from_scratch_training from i6_experiments.users.gaudino.models.asr.rf.conformer_rnnt.model_recog_rnnt import model_recog @@ -45,6 
+47,9 @@ def sis_run_with_prefix(prefix_name: Optional[str] = None): """run the exp""" + + from i6_core.returnn.training import PtCheckpoint + _sis_setup_global_prefix(prefix_name) # Moh: dev-clean 2.27, dev-other 5.39, test-clean 2.41, test-other 5.51 @@ -80,15 +85,38 @@ def sis_run_with_prefix(prefix_name: Optional[str] = None): # train_exp("base-11gb", config_11gb, gpu_mem=11) # train_exp("base-11gb-v1", my_config_11gb, num_epochs=400, gpu_mem=11) - train_exp( # - "from-scratch-24gb", - rnnt_train_config, - config_updates={ - "learning_rate": 1.0, - "learning_rate_piecewise_values": [8e-5, 8e-4, 8e-5, 1e-6], - }, - num_epochs=400, - gpu_mem=24, + + # train_exp( # TODO: runs in loss nan + # "from-scratch-24gb", + # rnnt_train_config, + # config_updates={ + # "learning_rate": 1.0, + # "learning_rate_piecewise_values": [8e-5, 8e-4, 8e-5, 1e-6], + # }, + # num_epochs=400, + # gpu_mem=24, + # ) + + _torch_ckpt_path = "/work/asr3/zeineldeen/hiwis/luca.gaudino/setups-data/2023-08-10--rf-librispeech/work/i6_core/returnn/training/ReturnnTrainingJob.J6Uj9xtt1v5J/output/models/epoch.003.pt" + + model_args = { + "mel_normalization": True, + } + new_ckpt_path = tk.Path( + _torch_ckpt_path, + hash_overwrite= "rnnt" + "_torch_ckpt", + ) + new_ckpt = PtCheckpoint(new_ckpt_path) + + + # recog ctc only model + _recog( + "model_recogs/from-scratch-24gb/rnnt_beam_search/recog_results", + ModelWithCheckpoint( + definition=from_scratch_model_def, checkpoint=new_ckpt + ), + model_recog, + dev_sets=["dev"] ) # train_exp( # does not converge (wrong steps + more mistakes) diff --git a/users/gaudino/models/asr/rf/conformer_rnnt/model_recog_rnnt.py b/users/gaudino/models/asr/rf/conformer_rnnt/model_recog_rnnt.py index 5525e4369..e47145a67 100644 --- a/users/gaudino/models/asr/rf/conformer_rnnt/model_recog_rnnt.py +++ b/users/gaudino/models/asr/rf/conformer_rnnt/model_recog_rnnt.py @@ -130,35 +130,39 @@ def model_recog( # Initial state. 
beam_dim = Dim(1, name="initial-beam") batch_dims_ = [beam_dim] + batch_dims - decoder_state = model.decoder_default_initial_state( - batch_dims=batch_dims_, enc_spatial_dim=enc_spatial_dim - ) + target = rf.constant(model.bos_idx, dims=batch_dims_, sparse_dim=model.target_dim_w_blank) ended = rf.constant(False, dims=batch_dims_) out_seq_len = rf.constant(0, dims=batch_dims_) seq_log_prob = rf.constant(0.0, dims=batch_dims_) + blank_idx = model.target_dim.get_dim_value() + + enc_out = enc_args["enc"] + # TODO implement rnnt search temperature = 1.0 step_max_tokens = 100 - def _init_b_hypos(self, device: torch.device) -> List[Hypothesis]: - token = self.blank - state = None + def _init_b_hypos(device: torch.device) -> List[Hypothesis]: + token = blank_idx + decoder_state = model.decoder_default_initial_state( + batch_dims=batch_dims_, enc_spatial_dim=enc_spatial_dim + ) - one_tensor = torch.tensor([1], device=device) - pred_out, _, pred_state = model.predict(torch.tensor([[token]], device=device), one_tensor, state) + blank_tensor = rf.constant(blank_idx, dims=Dim(1), sparse_dim=model.target_dim_w_blank) + pred_out, _, pred_state = model.predictor(blank_tensor, decoder_state.predictor) init_hypo = ( [token], - pred_out[0].detach(), + pred_out, # pred_out[0].detach(), TODO: what is this doing? 
pred_state, 0.0, ) return [init_hypo] def _gen_next_token_probs( - self, enc_out: torch.Tensor, hypos: List[Hypothesis], device: torch.device + enc_out: Tensor, hypos: List[Hypothesis], device: torch.device ) -> torch.Tensor: one_tensor = torch.tensor([1], device=device) predictor_out = torch.stack([_get_hypo_predictor_out(h) for h in hypos], dim=0) @@ -168,11 +172,10 @@ def _gen_next_token_probs( predictor_out, torch.tensor([1] * len(hypos), device=device), ) # [beam_width, 1, 1, num_tokens] - joined_out = torch.nn.functional.log_softmax(joined_out / self.temperature, dim=3) + joined_out = torch.nn.functional.log_softmax(joined_out / temperature, dim=3) return joined_out[:, 0, 0] def _gen_b_hypos( - self, b_hypos: List[Hypothesis], a_hypos: List[Hypothesis], next_token_probs: torch.Tensor, @@ -199,7 +202,6 @@ def _gen_b_hypos( return [b_hypos[idx] for idx in sorted_idx] def _gen_a_hypos( - self, a_hypos: List[Hypothesis], b_hypos: List[Hypothesis], next_token_probs: torch.Tensor, @@ -230,14 +232,13 @@ def _gen_a_hypos( new_scores.append(score) if base_hypos: - new_hypos = self._gen_new_hypos(base_hypos, new_tokens, new_scores, t, device) + new_hypos = _gen_new_hypos(base_hypos, new_tokens, new_scores, t, device) else: new_hypos: List[Hypothesis] = [] return new_hypos def _gen_new_hypos( - self, base_hypos: List[Hypothesis], tokens: List[int], scores: List[float], @@ -246,7 +247,7 @@ def _gen_new_hypos( ) -> List[Hypothesis]: tgt_tokens = torch.tensor([[token] for token in tokens], device=device) states = _batch_state(base_hypos) - pred_out, _, pred_states = self.model.predict( + pred_out, _, pred_states = model.predict( tgt_tokens, torch.tensor([1] * len(base_hypos), device=device), states, @@ -257,12 +258,16 @@ def _gen_new_hypos( new_hypos.append((new_tokens, pred_out[i].detach(), _slice_state(pred_states, i, device), scores[i])) return new_hypos + # TODO: call for every seq + # for enc_out in enc_out_batched: + # from _search function - n_time_steps = 
enc_out.shape[1] + n_time_steps = enc_out.get_dim(1) device = enc_out.device + breakpoint() a_hypos: List[Hypothesis] = [] - b_hypos = self._init_b_hypos(device) if hypo is None else hypo + b_hypos = _init_b_hypos(device) # used for streaming: if hypo is None else hypo for t in range(n_time_steps): a_hypos = b_hypos b_hypos = torch.jit.annotate(List[Hypothesis], []) @@ -270,14 +275,14 @@ def _gen_new_hypos( symbols_current_t = 0 while a_hypos: - next_token_probs = self._gen_next_token_probs(enc_out[:, t: t + 1], a_hypos, device) + next_token_probs = _gen_next_token_probs(enc_out[:, t: t + 1], a_hypos, device) next_token_probs = next_token_probs.cpu() - b_hypos = self._gen_b_hypos(b_hypos, a_hypos, next_token_probs, key_to_b_hypo) + b_hypos = _gen_b_hypos(b_hypos, a_hypos, next_token_probs, key_to_b_hypo) - if symbols_current_t == self.step_max_tokens: + if symbols_current_t == step_max_tokens: break - a_hypos = self._gen_a_hypos( + a_hypos = _gen_a_hypos( a_hypos, b_hypos, next_token_probs, @@ -288,7 +293,7 @@ def _gen_new_hypos( if a_hypos: symbols_current_t += 1 - _, sorted_idx = torch.tensor([self.hypo_sort_key(hyp) for hyp in b_hypos]).topk(beam_width) + _, sorted_idx = torch.tensor([hypo_sort_key(hyp) for hyp in b_hypos]).topk(beam_width) b_hypos = [b_hypos[idx] for idx in sorted_idx] # return b_hypos From 9a965e58bee455fdafb828c95de82267ffab8dee Mon Sep 17 00:00:00 2001 From: schmitt Date: Thu, 6 Jun 2024 10:10:59 +0200 Subject: [PATCH 102/227] update --- .../labels/v2/librispeech/bpe/bpe.py | 11 + .../v2/librispeech/bpe/bpe_alignments.py | 38 +- .../labels/v2/librispeech/bpe/bpe_labels.py | 23 +- .../labels/v2/librispeech/label_singletons.py | 8 +- .../notes/commands | 3 +- .../pipelines/pipeline_ls_conf/alias.py | 2 +- .../global_vs_segmental_2022_23/recog_new.py | 6 + .../returnn/config_builder_rf/base.py | 25 +- .../network_builder_rf/global_/model.py | 20 +- .../global_/model_import.py | 7 + .../network_builder_rf/global_/train.py | 2 +- 
.../returnn/network_builder_rf/recog.py | 7 +- .../network_builder_rf/segmental/model.py | 34 +- .../segmental/model_new/blank_model/model.py | 70 +- .../segmental/model_new/blank_model/train.py | 133 ++- .../segmental/model_new/label_model/train.py | 824 +++++++++++++++++- .../network_builder_rf/segmental/recog.py | 224 ++--- .../segmental/recombination.py | 204 +++++ .../network_builder_rf/segmental/train.py | 212 +++-- .../network_builder_rf/segmental/utils.py | 6 +- .../center_window_att/baseline_v3/__init__.py | 110 ++- .../center_window_att/baseline_v3/baseline.py | 3 +- .../center_window_att/baseline_v4/__init__.py | 73 +- .../center_window_att/baseline_v4/baseline.py | 2 + .../center_window_att/config_builder.py | 11 +- .../center_window_att/recog.py | 4 +- .../center_window_att/train.py | 195 ++++- .../pipelines/pipeline_ls_conf/checkpoints.py | 21 +- .../global_att/baseline_v1/__init__.py | 37 +- .../pipeline_ls_conf/global_att/train.py | 5 +- .../model_interfaces/training.py | 19 + 31 files changed, 1975 insertions(+), 364 deletions(-) create mode 100644 users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/recombination.py diff --git a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23/dependencies/labels/v2/librispeech/bpe/bpe.py b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23/dependencies/labels/v2/librispeech/bpe/bpe.py index c16f7037a..fab669018 100644 --- a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23/dependencies/labels/v2/librispeech/bpe/bpe.py +++ b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23/dependencies/labels/v2/librispeech/bpe/bpe.py @@ -1,4 +1,5 @@ from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23.dependencies.labels.v2.general import LabelDefinition +from i6_core.text.label.subword_nmt.train import ReturnnTrainBpeJob from typing 
import Dict from abc import ABC, abstractmethod @@ -14,3 +15,13 @@ def vocab_path(self) -> Path: @property def bpe_codes_path(self) -> Path: return Path("/u/zeineldeen/setups/librispeech/2022-11-28--conformer-att/work/i6_core/text/label/subword_nmt/train/ReturnnTrainBpeJob.vTq56NZ8STWt/output/bpe.codes") + + +class LibrispeechBPE1056(LabelDefinition, ABC): + @property + def vocab_path(self) -> Path: + return Path("/work/asr4/zeineldeen/setups-data/librispeech/2022-11-28--conformer-att/work/i6_core/text/label/subword_nmt/train/ReturnnTrainBpeJob.qhkNn2veTWkV/output/bpe.vocab") + + @property + def bpe_codes_path(self) -> Path: + return Path("/work/asr4/zeineldeen/setups-data/librispeech/2022-11-28--conformer-att/work/i6_core/text/label/subword_nmt/train/ReturnnTrainBpeJob.qhkNn2veTWkV/output/bpe.codes") diff --git a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23/dependencies/labels/v2/librispeech/bpe/bpe_alignments.py b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23/dependencies/labels/v2/librispeech/bpe/bpe_alignments.py index 87a226bad..5188c6984 100644 --- a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23/dependencies/labels/v2/librispeech/bpe/bpe_alignments.py +++ b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23/dependencies/labels/v2/librispeech/bpe/bpe_alignments.py @@ -5,7 +5,7 @@ from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23.dependencies.labels.v2.general import \ SegmentalLabelDefinition from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23.dependencies.labels.v2.librispeech.bpe.bpe import \ - LibrispeechBPE10025 + LibrispeechBPE10025, LibrispeechBPE1056 from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23.dependencies.labels.v2.librispeech.general import \ LibrispeechLabelDefinition from 
i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23.dependencies.general.hyperparameters import \ @@ -151,3 +151,39 @@ def alignment_paths(self) -> Dict[str, Path]: ) for corpus_key, alignment_path in self.librispeech_bpe_10025_ctc_alignment_instance.alignment_paths.items() } return self._alignment_paths + + +class LibrispeechBpe1056Alignment(LibrispeechBPE1056, LibrispeechLabelDefinition, SegmentalLabelDefinition): + """ + This is a forced alignment from the auxiliary CTC model in Mohammad's global AED setup (5.6% WER). + """ + def __init__(self): + super().__init__() + + self._alignment_paths = None + + @property + def model_hyperparameters(self) -> SegmentalModelHyperparameters: + return SegmentalModelHyperparameters( + sos_idx=0, target_num_labels=1056, sil_idx=None, blank_idx=0, target_num_labels_wo_blank=1055) + + @property + def rasr_format_paths(self) -> RasrFormats: + raise NotImplementedError + + @property + def alias(self) -> str: + return "att-transducer-alignment" + + @property + def alignment_paths(self) -> Dict[str, Path]: + if self._alignment_paths is None: + raise ValueError("Alignments first need to be set externally!") + return self._alignment_paths + + @alignment_paths.setter + def alignment_paths(self, value): + assert isinstance(value, dict) + assert self._alignment_paths is None, "Alignment paths are already set!" 
+ assert "train" in value and "cv" in value and "devtrain" in value + self._alignment_paths = value diff --git a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23/dependencies/labels/v2/librispeech/bpe/bpe_labels.py b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23/dependencies/labels/v2/librispeech/bpe/bpe_labels.py index 0a9495ec1..9ec17fa04 100644 --- a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23/dependencies/labels/v2/librispeech/bpe/bpe_labels.py +++ b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23/dependencies/labels/v2/librispeech/bpe/bpe_labels.py @@ -1,4 +1,4 @@ -from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23.dependencies.labels.v2.librispeech.bpe.bpe import LibrispeechBPE10025 +from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23.dependencies.labels.v2.librispeech.bpe.bpe import LibrispeechBPE10025, LibrispeechBPE1056 from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23.dependencies.labels.v2.librispeech.general import LibrispeechLabelDefinition from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23.dependencies.labels.v2.general import GlobalLabelDefinition from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23.dependencies.labels.v2.librispeech.phonemes.gmm_alignments import LIBRISPEECH_GMM_ALIGNMENT @@ -16,7 +16,7 @@ class LibrispeechBPE10025Labels(LibrispeechBPE10025, LibrispeechLabelDefinition, GlobalLabelDefinition): """ - These are the BPE labels of the SWB corpus. + These are the BPE labels of the Librispeech corpus. 
""" def __init__(self): super().__init__() @@ -33,7 +33,7 @@ def model_hyperparameters(self) -> GlobalModelHyperparameters: class LibrispeechBPE10025LabelsWithSilence(LibrispeechBPE10025, LibrispeechLabelDefinition, GlobalLabelDefinition): """ - These are the BPE labels of the SWB corpus. + These are the BPE labels of the Librispeech corpus. """ def __init__(self, librispeech_bpe_10025_labels_instance: LibrispeechBPE10025Labels): super().__init__() @@ -92,3 +92,20 @@ def model_hyperparameters(self) -> GlobalModelHyperparameters: @property def label_paths(self) -> Dict[str, Path]: return self._label_paths + + +class LibrispeechBPE1056Labels(LibrispeechBPE1056, LibrispeechLabelDefinition, GlobalLabelDefinition): + """ + These are the BPE labels of the Librispeech corpus. + """ + def __init__(self): + super().__init__() + + @property + def alias(self) -> str: + return "bpe" + + @property + def model_hyperparameters(self) -> GlobalModelHyperparameters: + return GlobalModelHyperparameters( + sos_idx=0, target_num_labels=1056, sil_idx=None, target_num_labels_wo_blank=1056) diff --git a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23/dependencies/labels/v2/librispeech/label_singletons.py b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23/dependencies/labels/v2/librispeech/label_singletons.py index a248e57ae..0efb0cc71 100644 --- a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23/dependencies/labels/v2/librispeech/label_singletons.py +++ b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23/dependencies/labels/v2/librispeech/label_singletons.py @@ -1,5 +1,9 @@ from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23.dependencies.labels.v2.librispeech.bpe.bpe_labels import LibrispeechBPE10025Labels, LibrispeechBPE10025LabelsWithSilence -from 
i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23.dependencies.labels.v2.librispeech.bpe.bpe_alignments import LibrispeechBpe10025CtcAlignment, LibrispeechBpe10025CtcAlignmentEos +from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23.dependencies.labels.v2.librispeech.bpe.bpe_alignments import ( + LibrispeechBpe10025CtcAlignment, + LibrispeechBpe10025CtcAlignmentEos, + LibrispeechBpe1056Alignment, +) from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23.dependencies.corpora.librispeech import LibrispeechCorpora @@ -9,3 +13,5 @@ LibrispeechBPE10025_LABELS_WITH_SILENCE = LibrispeechBPE10025LabelsWithSilence(LibrispeechBPE10025_LABELS) LibrispeechBPE10025_CTC_ALIGNMENT = LibrispeechBpe10025CtcAlignment() LibrispeechBPE10025_CTC_ALIGNMENT_EOS = LibrispeechBpe10025CtcAlignmentEos(LibrispeechBPE10025_CTC_ALIGNMENT) + +LibrispeechBPE1056_ALIGNMENT = LibrispeechBpe1056Alignment() diff --git a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23/notes/commands b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23/notes/commands index 0e7d139c5..ef4fb6c2b 100644 --- a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23/notes/commands +++ b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23/notes/commands @@ -1,4 +1,5 @@ apptainer shell --nv -B /work/asr4 -B /work/asr3 -B /work/common -B /work/tools -B /u/berger -B /u/zeineldeen -B /u/rossenbach -B /u/beck -B /work/speech/tuske -B /u/zeyer -B /u/schmitt -B /u/atanas.gruev -B /u/zhou /work/asr4/berger/apptainer/images/i6_tensorflow-2.8_onnx-1.15.sif /work/asr4/berger/apptainer/images/i6_torch-2.2_onnx-1.16.sif -export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/lib/python3.8/dist-packages/scipy/.libs \ No newline at end of file +export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/lib/python3.8/dist-packages/scipy/.libs +export 
NUMBA_CACHE_DIR=/var/tmp/numba_cache_schmitt/ # librosa \ No newline at end of file diff --git a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23/pipelines/pipeline_ls_conf/alias.py b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23/pipelines/pipeline_ls_conf/alias.py index fca544222..c1f07e2ed 100644 --- a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23/pipelines/pipeline_ls_conf/alias.py +++ b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23/pipelines/pipeline_ls_conf/alias.py @@ -2,4 +2,4 @@ from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23.pipelines.pipeline_ls_conf.checkpoints import default_import_model_name -alias = f"{base_alias}/ls_conformer/import_{default_import_model_name}" +alias = f"{base_alias}/ls_conformer" diff --git a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23/recog_new.py b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23/recog_new.py index a301f504c..5e88e111e 100644 --- a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23/recog_new.py +++ b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23/recog_new.py @@ -246,6 +246,12 @@ def __init__( self.search_rqmt = search_rqmt if search_rqmt is not None else {} self.alias += "/returnn_decoding" if search_alias is None else f"/{search_alias}" + + use_recombination = self.recog_opts.get("use_recombination") + if use_recombination is not None: + assert use_recombination in {"sum", "max"} + self.alias += f"_w-{use_recombination}-recomb" + if isinstance(self, ReturnnSegmentalAttDecodingExperiment): length_scale = self.config_builder.variant_params["network"]["length_scale"] if length_scale != 1.0: diff --git a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/config_builder_rf/base.py 
b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/config_builder_rf/base.py index 20af5a6d8..6130a6092 100644 --- a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/config_builder_rf/base.py +++ b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/config_builder_rf/base.py @@ -89,9 +89,20 @@ def get_train_config(self, opts: Dict): train_step_func = opts.pop("train_step_func") remaining_opt_keys = [ - "aux_loss_layers", "preload_from_files", "accum_grad_multiple_step", "optimizer", "batching", - "torch_distributed", "pos_emb_dropout", "rf_att_dropout_broadcast", "grad_scaler", "gradient_clip_global_norm", - "spec_augment_steps", "torch_amp" + "aux_loss_layers", + "preload_from_files", + "accum_grad_multiple_step", + "optimizer", + "batching", + "torch_distributed", + "pos_emb_dropout", + "rf_att_dropout_broadcast", + "grad_scaler", + "gradient_clip_global_norm", + # "specaugment_steps", + "torch_amp", + "full_sum_training_beam_size", + # "max_seq_length" ] config_dict.update( {k: opts.pop(k) for k in remaining_opt_keys if k in opts} @@ -220,6 +231,7 @@ def get_lr_settings(self, lr_opts, python_epilog: Optional[List] = None): }) elif lr_opts["type"] == "dyn_lr_piecewise_linear": _lrlin_oclr_steps_by_bs_nep = { + (3, 125): [194_000, 388_000, 430_000], # ~3450steps/ep, 125 eps -> 430k steps in total (8, 125): [139_000, 279_000, 310_000], # ~2485steps/ep, 125 eps -> 310k steps in total (8, 250): [279_000, 558_000, 621_000], # ~2485steps/ep, 250 eps -> 621k steps in total (8, 500): [558_000, 1_117_000, 1_242_000], # ~2485steps/ep, 500 eps -> 1.242k steps in total @@ -288,6 +300,7 @@ def get_train_dataset_dict(self, dataset_opts: Dict): pre_process=CodeWrapper("speed_pert") if dataset_opts.get("use_speed_pert") else None, seq_ordering=self.variant_params["config"]["train_seq_ordering"], 
epoch_wise_filter=dataset_opts.get("epoch_wise_filter", None), + seq_postfix=dataset_opts.get("seq_postfix", self.variant_params["dependencies"].model_hyperparameters.sos_idx), **self.get_default_dataset_opts("train", dataset_opts) ) else: @@ -307,6 +320,7 @@ def get_cv_dataset_dict(self, dataset_opts: Dict): pre_process=None, seq_ordering="sorted_reverse", epoch_wise_filter=None, + seq_postfix=dataset_opts.get("seq_postfix", self.variant_params["dependencies"].model_hyperparameters.sos_idx), **self.get_default_dataset_opts("cv", dataset_opts) ) else: @@ -326,6 +340,7 @@ def get_devtrain_dataset_dict(self, dataset_opts: Dict): pre_process=None, seq_ordering="sorted_reverse", epoch_wise_filter=None, + seq_postfix=dataset_opts.get("seq_postfix", self.variant_params["dependencies"].model_hyperparameters.sos_idx), **self.get_default_dataset_opts("devtrain", dataset_opts) ) else: @@ -516,4 +531,8 @@ def get_recog_config(self, opts: Dict): "eos_label": self.variant_params["dependencies"].model_hyperparameters.sos_idx, } + use_recombination = opts.get("use_recombination") + if use_recombination is not None: + recog_config.config["use_recombination"] = use_recombination + return recog_config diff --git a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/global_/model.py b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/global_/model.py index ed2961cd8..8af71daf3 100644 --- a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/global_/model.py +++ b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/global_/model.py @@ -87,12 +87,24 @@ def __init__( class MakeModel: """for import""" - def __init__(self, in_dim: int, target_dim: int, *, eos_label: int = 0, num_enc_layers: int = 12): + def __init__( + self, + in_dim: int, + 
target_dim: int, + *, + eos_label: int = 0, + num_enc_layers: int = 12, + enc_aux_logits: Sequence[int] = (), + ): self.in_dim = in_dim self.target_dim = target_dim self.eos_label = eos_label self.num_enc_layers = num_enc_layers + # do not set attribute otherwise to keep job hashes + if enc_aux_logits != (): + self.enc_aux_logits = enc_aux_logits + def __call__(self) -> GlobalAttentionModel: from returnn.datasets.util.vocabulary import Vocabulary @@ -102,7 +114,11 @@ def __call__(self) -> GlobalAttentionModel: [str(i) for i in range(target_dim.dimension)], eos_label=self.eos_label ) - return self.make_model(in_dim, target_dim, num_enc_layers=self.num_enc_layers) + extra = {} + if hasattr(self, "enc_aux_logits"): + extra["enc_aux_logits"] = self.enc_aux_logits + + return self.make_model(in_dim, target_dim, num_enc_layers=self.num_enc_layers, **extra) @classmethod def make_model( diff --git a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/global_/model_import.py b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/global_/model_import.py index 041ab883a..8964adc04 100644 --- a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/global_/model_import.py +++ b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/global_/model_import.py @@ -103,6 +103,13 @@ def _add_params(): _ParamMapping[ f"encoder.layers.{layer_idx}.final_layer_norm.bias" ] = f"conformer_block_{layer_idx + 1:02d}_ln/bias" + # ctc + _ParamMapping[ + f"encoder.enc_aux_logits_{layer_idx}.weight" + ] = f"ctc/W" + _ParamMapping[ + f"encoder.enc_aux_logits_{layer_idx}.bias" + ] = f"ctc/b" _add_params() diff --git a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/global_/train.py 
b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/global_/train.py index a2a1391d3..b068dc565 100644 --- a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/global_/train.py +++ b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/global_/train.py @@ -118,7 +118,7 @@ def from_scratch_training( for i, layer_idx in enumerate(aux_loss_layers): if layer_idx > len(model.encoder.layers): continue - linear = getattr(model, f"enc_aux_logits_{layer_idx}") + linear = getattr(model.encoder, f"enc_aux_logits_{layer_idx}") aux_logits = linear(collected_outputs[str(layer_idx - 1)]) aux_loss = rf.ctc_loss( logits=aux_logits, diff --git a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/recog.py b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/recog.py index 37a07cc37..902ee0481 100644 --- a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/recog.py +++ b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/recog.py @@ -24,8 +24,11 @@ def _returnn_v2_forward_step(*, model, extern_data: TensorDict, **_kwargs_unused default_target_key = config.typed_value("target") targets = extern_data[default_target_key] extra.update(dict(targets=targets, targets_spatial_dim=targets.get_time_dim_tag())) - if config.bool("use_recombination", False): - extra.update(dict(use_recombination=True)) + + use_recombination = config.typed_value("use_recombination", None) + if use_recombination: + extra.update(dict(use_recombination=use_recombination)) + recog_out = recog_def(model=model, data=data, data_spatial_dim=data_spatial_dim, **extra) if len(recog_out) == 5: 
# recog results including beam {batch, beam, out_spatial}, diff --git a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/model.py b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/model.py index 55f63067a..7e441184c 100644 --- a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/model.py +++ b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/model.py @@ -9,6 +9,8 @@ from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.network_builder_rf.segmental.model_new.blank_model.model import ( BlankDecoderV1, BlankDecoderV3, + BlankDecoderV5, + BlankDecoderV6, ) from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.network_builder_rf.segmental.model_new.label_model.model import ( SegmentalAttLabelDecoder, SegmentalAttEfficientLabelDecoder @@ -65,7 +67,7 @@ def __init__( l2=l2, ) - assert blank_decoder_version in {1, 3, 4} + assert blank_decoder_version in {1, 3, 4, 5, 6} assert label_decoder_state in {"nb-lstm", "joint-lstm"} if not use_joint_model: assert label_decoder_state == "nb-lstm" @@ -96,13 +98,24 @@ def __init__( align_target_dim=align_target_dim, encoder_out_dim=self.encoder.out_dim, ) - else: + elif blank_decoder_version in {3, 4}: # the logic for blank_decoder_version == 4 is in the train/recog code self.blank_decoder = BlankDecoderV3( length_model_state_dim=length_model_state_dim, label_state_dim=self.label_decoder.get_lstm().out_dim, encoder_out_dim=self.encoder.out_dim, ) + elif blank_decoder_version == 5: + self.blank_decoder = BlankDecoderV5( + label_state_dim=self.label_decoder.get_lstm().out_dim, + encoder_out_dim=self.encoder.out_dim, + ) + 
else: + self.blank_decoder = BlankDecoderV6( + length_model_state_dim=length_model_state_dim, + label_state_dim=self.label_decoder.get_lstm().out_dim, + encoder_out_dim=self.encoder.out_dim, + ) else: self.blank_decoder = None @@ -287,3 +300,20 @@ def _returnn_v2_get_model(*, epoch: int, **_kwargs_unused): model = model_def( epoch=epoch, in_dim=data.feature_dim, align_target_dim=targets.sparse_dim, target_dim=non_blank_targets.sparse_dim) return model + + +def _returnn_v2_get_model_for_full_sum_training(*, epoch: int, **_kwargs_unused): + from returnn.tensor import Tensor, Dim + from returnn.config import get_global_config + + config = get_global_config() + default_input_key = config.typed_value("default_input") + default_target_key = config.typed_value("target") + extern_data_dict = config.typed_value("extern_data") + data = Tensor(name=default_input_key, **extern_data_dict[default_input_key]) + targets = Tensor(name=default_target_key, **extern_data_dict[default_target_key]) + + model_def = config.typed_value("_model_def") + model = model_def( + epoch=epoch, in_dim=data.feature_dim, align_target_dim=targets.sparse_dim, target_dim=targets.sparse_dim) + return model diff --git a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/model_new/blank_model/model.py b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/model_new/blank_model/model.py index 746349237..3954b544f 100644 --- a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/model_new/blank_model/model.py +++ b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/model_new/blank_model/model.py @@ -11,11 +11,9 @@ class BlankDecoderBase(rf.Module, ABC): - def __init__(self, length_model_state_dim: Dim): + def __init__(self): 
super(BlankDecoderBase, self).__init__() - self.length_model_state_dim = length_model_state_dim self.emit_prob_dim = Dim(name="emit_prob", dimension=1) - self.emit_prob = rf.Linear(self.length_model_state_dim, self.emit_prob_dim) @property @abstractmethod @@ -55,11 +53,6 @@ def _get_am(enc: rf.Tensor, enc_spatial_dim: Dim, state: rf.State, spatial_dim) return rf.gather(enc, axis=enc_spatial_dim, indices=i, clip_to_valid=clip_to_valid) - def decode_logits(self, *, s_blank: Tensor) -> Tensor: - """logits for the decoder""" - logits = self.emit_prob(s_blank) - return logits - def get_label_decoder_deps(self) -> Optional[List[str]]: return None @@ -72,10 +65,9 @@ def __init__( align_target_dim: Dim, encoder_out_dim: Dim, ): - super(BlankDecoderV1, self).__init__(length_model_state_dim=length_model_state_dim) + super(BlankDecoderV1, self).__init__() self.length_model_state_dim = length_model_state_dim self.length_model_embed_dim = length_model_embed_dim - self.emit_prob_dim = Dim(name="emit_prob", dimension=1) self.target_embed = rf.Embedding(align_target_dim, self.length_model_embed_dim) self.s = rf.LSTM( @@ -125,10 +117,9 @@ def __init__( label_state_dim: Dim, encoder_out_dim: Dim, ): - super(BlankDecoderV2, self).__init__(length_model_state_dim=length_model_state_dim) + super(BlankDecoderV2, self).__init__() self.length_model_state_dim = length_model_state_dim self.length_model_embed_dim = length_model_embed_dim - self.emit_prob_dim = Dim(name="emit_prob", dimension=1) self.target_embed = rf.Embedding(target_dim, self.length_model_embed_dim) self.s = rf.LSTM( @@ -164,6 +155,11 @@ def loop_step( return {"s_blank": s_blank}, state_ + def decode_logits(self, *, s_blank: Tensor) -> Tensor: + """logits for the decoder""" + logits = self.emit_prob(s_blank) + return logits + def get_label_decoder_deps(self) -> Optional[List[str]]: return ["s"] @@ -175,9 +171,8 @@ def __init__( label_state_dim: Dim, encoder_out_dim: Dim, ): - super(BlankDecoderV3, 
self).__init__(length_model_state_dim=length_model_state_dim) + super(BlankDecoderV3, self).__init__() self.length_model_state_dim = length_model_state_dim - self.emit_prob_dim = Dim(name="emit_prob", dimension=1) self.s = rf.LSTM( encoder_out_dim + label_state_dim, @@ -211,5 +206,52 @@ def loop_step( return {"s_blank": s_blank}, state_ + def decode_logits(self, *, s_blank: Tensor) -> Tensor: + """logits for the decoder""" + logits = self.emit_prob(s_blank) + return logits + + def get_label_decoder_deps(self) -> Optional[List[str]]: + return ["s"] + + +class BlankDecoderV5(BlankDecoderBase): + @property + def _s(self) -> rf.LSTM: + raise NotImplementedError + + def __init__( + self, + label_state_dim: Dim, + encoder_out_dim: Dim, + ): + super(BlankDecoderV5, self).__init__() + + self.emit_prob = rf.Linear(encoder_out_dim + label_state_dim, self.emit_prob_dim) + + def get_label_decoder_deps(self) -> Optional[List[str]]: + return ["s"] + + +class BlankDecoderV6(BlankDecoderBase): + def __init__( + self, + length_model_state_dim: Dim, + label_state_dim: Dim, + encoder_out_dim: Dim, + ): + super(BlankDecoderV6, self).__init__() + self.length_model_state_dim = length_model_state_dim + + self.s = rf.LSTM( + encoder_out_dim, + self.length_model_state_dim, + ) + self.emit_prob = rf.Linear(self.length_model_state_dim + label_state_dim, self.emit_prob_dim) + + @property + def _s(self) -> rf.LSTM: + return self.s + def get_label_decoder_deps(self) -> Optional[List[str]]: return ["s"] diff --git a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/model_new/blank_model/train.py b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/model_new/blank_model/train.py index 101330b4e..0d961b411 100644 --- 
a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/model_new/blank_model/train.py +++ b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/model_new/blank_model/train.py @@ -3,22 +3,23 @@ from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.network_builder_rf.segmental.model_new.blank_model.model import ( BlankDecoderV1, BlankDecoderV3, + BlankDecoderV5, + BlankDecoderV6, BlankDecoderBase ) +from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.network_builder_rf.segmental import utils from returnn.tensor import Dim import returnn.frontend as rf -def decode_logits( +def get_packed_logits_and_emit_ground_truth( *, - model: BlankDecoderBase, - blank_loop_out: Dict, + blank_logits: rf.Tensor, align_targets_spatial_dim: Dim, emit_ground_truth: rf.Tensor, batch_dims: List[Dim] ): - blank_logits = model.decode_logits(**blank_loop_out) blank_logits_packed, pack_dim = rf.pack_padded( blank_logits, dims=batch_dims + [align_targets_spatial_dim], enforce_sorted=False) emit_ground_truth_packed, _ = rf.pack_padded( @@ -77,9 +78,8 @@ def viterbi_training( spatial_dim=align_targets_spatial_dim, ) - blank_logits_packed, pack_dim, emit_ground_truth_packed = decode_logits( - model=model, - blank_loop_out=blank_loop_out, + blank_logits_packed, pack_dim, emit_ground_truth_packed = get_packed_logits_and_emit_ground_truth( + blank_logits=model.decode_logits(**blank_loop_out), align_targets_spatial_dim=align_targets_spatial_dim, emit_ground_truth=emit_ground_truth, batch_dims=batch_dims @@ -112,9 +112,8 @@ def viterbi_training_v3( spatial_dim=label_states_unmasked_spatial_dim, ) - blank_logits_packed, pack_dim, emit_ground_truth_packed = decode_logits( - model=model, - blank_loop_out=blank_loop_out, + blank_logits_packed, 
pack_dim, emit_ground_truth_packed = get_packed_logits_and_emit_ground_truth( + blank_logits=model.decode_logits(**blank_loop_out), align_targets_spatial_dim=label_states_unmasked_spatial_dim, emit_ground_truth=emit_ground_truth, batch_dims=batch_dims @@ -128,16 +127,126 @@ def viterbi_training_v3( ) -# TODO: implement viterbi_training_v4 def viterbi_training_v4( *, model: BlankDecoderV3, enc_args: Dict, enc_spatial_dim: Dim, + label_states: rf.Tensor, + label_states_spatial_dim: Dim, + non_blank_mask: rf.Tensor, + non_blank_mask_dim: Dim, + non_blank_targets_spatial_dim: Dim, + emit_ground_truth: rf.Tensor, + emit_blank_target_dim: Dim, + batch_dims: List[Dim], +): + enc_spatial_dim.declare_same_as(non_blank_mask_dim) + am, _ = utils.get_masked( + input=enc_args["enc"], + mask=non_blank_mask, + mask_dim=non_blank_mask_dim, + batch_dims=batch_dims, + result_spatial_dim=non_blank_targets_spatial_dim, + ) + + singleton_dim = Dim(name="singleton", dimension=1) + first_enc_frame = rf.gather( + enc_args["enc"], + indices=rf.convert_to_tensor(0, dtype="int32"), + axis=enc_spatial_dim, + ) + first_enc_frame = rf.expand_dim(first_enc_frame, singleton_dim) + am, _ = rf.concat( + (first_enc_frame, singleton_dim), + (am, non_blank_targets_spatial_dim), + out_dim=label_states_spatial_dim + ) + + s, _ = model.s( + rf.concat_features(am, label_states), + state=model.s.default_initial_state(batch_dims=batch_dims), + spatial_dim=label_states_spatial_dim + ) + + s_unmasked = utils.get_unmasked( + s, + input_spatial_dim=label_states_spatial_dim, + mask=non_blank_mask, + mask_spatial_dim=non_blank_mask_dim + ) + + blank_logits_packed, pack_dim, emit_ground_truth_packed = get_packed_logits_and_emit_ground_truth( + blank_logits=model.decode_logits(s_blank=s_unmasked), + align_targets_spatial_dim=enc_spatial_dim, + emit_ground_truth=emit_ground_truth, + batch_dims=batch_dims + ) + + calc_loss( + blank_logits_packed=blank_logits_packed, + 
emit_ground_truth_packed=emit_ground_truth_packed, + emit_blank_target_dim=emit_blank_target_dim, + pack_dim=pack_dim + ) + + +def viterbi_training_v5( + *, + model: BlankDecoderV5, + enc_args: Dict, + enc_spatial_dim: Dim, + label_states_unmasked: rf.Tensor, + label_states_unmasked_spatial_dim: Dim, + emit_ground_truth: rf.Tensor, + emit_blank_target_dim: Dim, + batch_dims: List[Dim], +): + enc_spatial_dim.declare_same_as(label_states_unmasked_spatial_dim) + blank_logits = model.emit_prob(rf.concat_features(enc_args["enc"], label_states_unmasked)) + blank_logits_packed, pack_dim, emit_ground_truth_packed = get_packed_logits_and_emit_ground_truth( + blank_logits=blank_logits, + align_targets_spatial_dim=enc_spatial_dim, + emit_ground_truth=emit_ground_truth, + batch_dims=batch_dims + ) + + calc_loss( + blank_logits_packed=blank_logits_packed, + emit_ground_truth_packed=emit_ground_truth_packed, + emit_blank_target_dim=emit_blank_target_dim, + pack_dim=pack_dim + ) + + +def viterbi_training_v6( + *, + model: BlankDecoderV6, + enc_args: Dict, + enc_spatial_dim: Dim, label_states_unmasked: rf.Tensor, label_states_unmasked_spatial_dim: Dim, emit_ground_truth: rf.Tensor, emit_blank_target_dim: Dim, batch_dims: List[Dim], ): - pass + enc_spatial_dim.declare_same_as(label_states_unmasked_spatial_dim) + s, _ = model.s( + enc_args["enc"], + state=model.s.default_initial_state(batch_dims=batch_dims,), + spatial_dim=enc_spatial_dim + ) + blank_logits = model.emit_prob(rf.concat_features(s, label_states_unmasked)) + blank_logits_packed, pack_dim, emit_ground_truth_packed = get_packed_logits_and_emit_ground_truth( + blank_logits=blank_logits, + align_targets_spatial_dim=enc_spatial_dim, + emit_ground_truth=emit_ground_truth, + batch_dims=batch_dims + ) + + calc_loss( + blank_logits_packed=blank_logits_packed, + emit_ground_truth_packed=emit_ground_truth_packed, + emit_blank_target_dim=emit_blank_target_dim, + pack_dim=pack_dim + ) diff --git 
a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/model_new/label_model/train.py b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/model_new/label_model/train.py index 94468db9a..52fb27d5b 100644 --- a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/model_new/label_model/train.py +++ b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/model_new/label_model/train.py @@ -1,6 +1,10 @@ from typing import Optional, Dict, Any, Sequence, Tuple, List +import tree + +import torch from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.network_builder_rf.segmental import utils +from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.network_builder_rf.segmental import recombination from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.network_builder_rf.segmental.model_new.label_model.model import ( SegmentalAttLabelDecoder, SegmentalAttEfficientLabelDecoder @@ -8,6 +12,7 @@ from returnn.tensor import Dim, single_step_dim import returnn.frontend as rf +from returnn.frontend.tensor_array import TensorArray from i6_experiments.users.schmitt.returnn_frontend.model_interfaces.training import TrainDef @@ -104,9 +109,10 @@ def _label_loop_body(xs, state: rf.State): segment_lens=final_state.decoder.segment_lens, state=final_state.decoder, ) + singleton_dim = Dim(name="singleton", dimension=1) return rf.concat( (label_loop_out["s"], non_blank_targets_spatial_dim), - (rf.expand_dim(last_loop_out["s"], single_step_dim), single_step_dim), + (rf.expand_dim(last_loop_out["s"], singleton_dim), singleton_dim), ) 
return None @@ -187,14 +193,15 @@ def viterbi_training_efficient( targets_spatial_dim.get_size_tensor() - 1, input_embeddings.device), clip_to_valid=True, ) + singleton_dim = Dim(name="singleton", dimension=1) last_lstm_out, _ = model.s_wo_att( last_embedding, state=final_state, - spatial_dim=single_step_dim, + spatial_dim=singleton_dim, ) return rf.concat( (label_lstm_out, targets_spatial_dim), - (rf.expand_dim(last_lstm_out, single_step_dim), single_step_dim), + (rf.expand_dim(last_lstm_out, singleton_dim), singleton_dim), ) return None @@ -205,35 +212,199 @@ def full_sum_training( model: SegmentalAttEfficientLabelDecoder, enc_args: Dict, enc_spatial_dim: Dim, - non_blank_targets: rf.Tensor, + non_blank_targets: rf.Tensor, # [B, S, V] non_blank_targets_spatial_dim: Dim, - segment_starts: rf.Tensor, - segment_lens: rf.Tensor, + segment_starts: rf.Tensor, # [B, T] + segment_lens: rf.Tensor, # [B, T] batch_dims: List[Dim], ) -> Optional[Dict[str, Tuple[rf.Tensor, Dim]]]: - # print("full_sum_training") - # print("model", model) + non_blank_input_embeddings = model.target_embed(non_blank_targets) # [B, S, D] + singleton_dim = Dim(name="singleton", dimension=1) + singleton_zeros = rf.zeros(batch_dims + [singleton_dim, model.target_embed.out_dim]) + non_blank_input_embeddings_shifted, non_blank_targets_spatial_dim_ext = rf.concat( + (singleton_zeros, singleton_dim), + (non_blank_input_embeddings, non_blank_targets_spatial_dim), + allow_broadcast=True + ) # [B, S+1, D] + non_blank_input_embeddings_shifted.feature_dim = non_blank_input_embeddings.feature_dim + + label_lstm_out, _ = model.s_wo_att( + non_blank_input_embeddings_shifted, + state=model.s_wo_att.default_initial_state(batch_dims=batch_dims), + spatial_dim=non_blank_targets_spatial_dim_ext, + ) # [B, S+1, D] - import torch - from torch.profiler import profile, record_function, ProfilerActivity + att = model( + enc=enc_args["enc"], + enc_ctx=enc_args["enc_ctx"], + enc_spatial_dim=enc_spatial_dim, + 
s=label_lstm_out, + segment_starts=segment_starts, + segment_lens=segment_lens, + ) # [B, S+1, T, D] - with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], record_shapes=True, profile_memory=True) as prof: + logits = model.decode_logits( + input_embed=non_blank_input_embeddings_shifted, + att=att, + s=label_lstm_out, + ) # [B, S+1, T, D] - non_blank_input_embeddings = model.target_embed(non_blank_targets) # [B, S, D] - singleton_dim = Dim(name="singleton", dimension=1) - singleton_zeros = rf.zeros(batch_dims + [singleton_dim, model.target_embed.out_dim]) - non_blank_input_embeddings_shifted, non_blank_targets_spatial_dim_ext = rf.concat( - (singleton_zeros, singleton_dim), - (non_blank_input_embeddings, non_blank_targets_spatial_dim), - allow_broadcast=True - ) # [B, S+1, D] - non_blank_input_embeddings_shifted.feature_dim = non_blank_input_embeddings.feature_dim - - label_lstm_out, _ = model.s_wo_att( - non_blank_input_embeddings_shifted, - state=model.s_wo_att.default_initial_state(batch_dims=batch_dims), - spatial_dim=non_blank_targets_spatial_dim_ext, - ) # [B, S+1, D] + + logits_packed, pack_dim = rf.pack_padded( + logits, + dims=batch_dims + [enc_spatial_dim, non_blank_targets_spatial_dim_ext], + enforce_sorted=False + ) # [B * T * (S+1), D] + + from returnn.extern_private.BergerMonotonicRNNT.monotonic_rnnt.pytorch_binding import monotonic_rnnt_loss + + loss = monotonic_rnnt_loss( + acts=logits_packed.raw_tensor, + labels=non_blank_targets.copy_transpose(batch_dims + [non_blank_targets_spatial_dim]).raw_tensor, + input_lengths=rf.copy_to_device(enc_spatial_dim.dyn_size_ext, logits.device).raw_tensor, + label_lengths=rf.copy_to_device(non_blank_targets_spatial_dim.dyn_size_ext, logits.device).raw_tensor.int(), + blank_label=model.blank_idx, + ) + + loss = rf.convert_to_tensor(loss, name="full_sum_loss") + loss.mark_as_loss("full_sum_loss", scale=1.0, use_normalized_loss=True) + + return None + + +def full_sum_training_w_beam( + *, + model: 
SegmentalAttEfficientLabelDecoder, + enc_args: Dict, + enc_spatial_dim: Dim, + non_blank_targets: rf.Tensor, # [B, S, V] + non_blank_targets_spatial_dim: Dim, + # segment_starts: rf.Tensor, # [B, T] + # segment_lens: rf.Tensor, # [B, T] + batch_dims: List[Dim], + beam_size: int, +) -> Optional[Dict[str, Tuple[rf.Tensor, Dim]]]: + assert len(batch_dims) == 1, "not supported yet" + + non_blank_input_embeddings = model.target_embed(non_blank_targets) # [B, S, D] + non_blank_input_embeddings, non_blank_targets_padded_spatial_dim = rf.pad( + non_blank_input_embeddings, + axes=[non_blank_targets_spatial_dim], + padding=[(1, 0)], + value=0.0, + ) + non_blank_targets_padded_spatial_dim = non_blank_targets_padded_spatial_dim[0] + + # add blank idx on the right + # this way, when the label index for gathering reached the last non-blank index, it will gather blank after that + # which then only allows corresponding hypotheses to be extended by blank + non_blank_targets_padded, _ = rf.pad( + non_blank_targets, + axes=[non_blank_targets_spatial_dim], + padding=[(0, 1)], + value=model.blank_idx, + out_dims=[non_blank_targets_padded_spatial_dim] + ) + + non_blank_targets_padded_sizes = rf.copy_to_device( + non_blank_targets_padded_spatial_dim.dyn_size_ext, non_blank_targets.device + ) + non_blank_targets_spatial_sizes = rf.copy_to_device( + non_blank_targets_spatial_dim.dyn_size_ext, non_blank_targets.device) + enc_spatial_sizes = rf.copy_to_device(enc_spatial_dim.dyn_size_ext, non_blank_targets.device) + + linear_label_positions = enc_spatial_sizes / non_blank_targets_spatial_sizes + linear_label_positions = linear_label_positions * rf.range_over_dim(non_blank_targets_spatial_dim) + # print("linear_label_positions", linear_label_positions.raw_tensor) + # exit() + + # print("non_blank_targets_padded", non_blank_targets_padded.raw_tensor) + + beam_dim = Dim(1, name="initial-beam") + batch_dims_ = [beam_dim] + batch_dims + bos_idx = 0 + seq_log_prob = rf.constant(0.0, 
dims=batch_dims_) + + max_seq_len = enc_spatial_dim.get_size_tensor() + max_seq_len = rf.reduce_max(max_seq_len, axis=max_seq_len.dims) + + label_lstm_state = model.s_wo_att.default_initial_state(batch_dims=batch_dims_) + target = rf.constant(bos_idx, dims=batch_dims_, sparse_dim=model.target_dim) + # target_non_blank = target.copy() + update_state_mask = rf.convert_to_tensor(target != model.blank_idx) + label_indices = rf.zeros(batch_dims_, dtype="int64") + + # input_embed = rf.zeros( + # batch_dims_ + [model.target_embed.out_dim], + # feature_dim=model.target_embed.out_dim, + # dtype="float32" + # ) + + vocab_range = rf.range_over_dim(model.target_dim) + blank_tensor = rf.convert_to_tensor(model.blank_idx, dtype=vocab_range.dtype) + log_lambda = rf.log(rf.convert_to_tensor(0.004)) * rf.ones([model.target_dim], dtype="float32") + # lambda_ = rf.shift_right(lambda_, axis=model.target_dim, pad_value=0.0) + log_lambda = rf.where( + vocab_range == model.blank_idx, + rf.constant(0.0, dims=[model.target_dim], dtype="float32"), + log_lambda + ) + + old_beam_dim = beam_dim.copy() + backrefs = rf.zeros(batch_dims_, dtype="int32") + + i = 0 + seq_targets = [] + seq_backrefs = [] + while i < max_seq_len.raw_tensor: + if i > 0: + # target_non_blank = rf.where(update_state_mask, target, rf.gather(target_non_blank, indices=backrefs)) + # input_embed = rf.where( + # update_state_mask, + # model.target_embed(target_non_blank), + # rf.gather(input_embed, indices=backrefs) + # ) + prev_label_indices = rf.gather(label_indices, indices=backrefs) + label_indices = rf.where( + update_state_mask, + rf.where( + prev_label_indices == non_blank_targets_padded_sizes - 1, + prev_label_indices, + prev_label_indices + 1 + ), + prev_label_indices + ) + + ground_truth = rf.gather( + non_blank_targets_padded, + indices=label_indices, + axis=non_blank_targets_padded_spatial_dim, + clip_to_valid=True + ) + input_embed = rf.gather( + non_blank_input_embeddings, + indices=label_indices, + 
axis=non_blank_targets_padded_spatial_dim, + clip_to_valid=True + ) + + label_lstm_out, label_lstm_state_updated = model.s_wo_att( + input_embed, + state=label_lstm_state, + spatial_dim=single_step_dim, + ) + + center_position = rf.minimum( + rf.full(dims=[beam_dim] + batch_dims, fill_value=i, dtype="int32"), + rf.copy_to_device(enc_spatial_dim.get_size_tensor() - 1, label_lstm_out.device) + ) + segment_starts = rf.maximum( + rf.convert_to_tensor(0, dtype="int32"), center_position - model.center_window_size // 2) + segment_ends = rf.minimum( + rf.copy_to_device(enc_spatial_dim.get_size_tensor() - 1, label_lstm_out.device), + center_position + model.center_window_size // 2 + ) + segment_lens = segment_ends - segment_starts + 1 att = model( enc=enc_args["enc"], @@ -243,41 +414,602 @@ def full_sum_training( segment_starts=segment_starts, segment_lens=segment_lens, ) # [B, S+1, T, D] + # print("att", att) logits = model.decode_logits( - input_embed=non_blank_input_embeddings_shifted, + input_embed=input_embed, att=att, s=label_lstm_out, ) # [B, S+1, T, D] + # print("logits", logits) + + label_log_prob = rf.log_softmax(logits, axis=model.target_dim) + + def custom_backward(grad): + grad[:, :, 0] *= 0.001 + return grad + + if rf.get_run_ctx().train_flag: + label_log_prob.raw_tensor.register_hook(custom_backward) + + # log prob needs to correspond to the next non-blank label... + log_prob_mask = vocab_range == ground_truth + rem_frames = enc_spatial_sizes - i + rem_labels = non_blank_targets_spatial_sizes - label_indices + # ... 
or to blank if there are more frames than labels left + log_prob_mask = rf.logical_or( + log_prob_mask, + rf.logical_and( + vocab_range == blank_tensor, + rem_frames > rem_labels + ) + ) + label_log_prob = rf.where( + log_prob_mask, + label_log_prob, + rf.constant(-float("inf"), dims=batch_dims + [beam_dim, model.target_dim]) + ) - print("logits", logits.raw_tensor.shape) + seq_log_prob = seq_log_prob + label_log_prob # Batch, InBeam, Vocab + old_beam_dim = beam_dim.copy() + seq_log_prob, (backrefs, target), beam_dim = rf.top_k( + seq_log_prob, + k_dim=Dim(beam_size, name=f"dec-step{i}-beam"), + axis=[beam_dim, model.target_dim] + ) # seq_log_prob, backrefs, target: Batch, Beam + seq_targets.append(target) + seq_backrefs.append(backrefs) + + update_state_mask = rf.logical_and( + rf.convert_to_tensor(target != model.blank_idx), + seq_log_prob != rf.convert_to_tensor(-float("inf"), dtype="float32") + ) - logits_packed, pack_dim = rf.pack_padded( - logits, - dims=batch_dims + [enc_spatial_dim, non_blank_targets_spatial_dim_ext], - enforce_sorted=False - ) # [B * T * (S+1), D] + def _get_masked_state(old, new, mask): + old = rf.gather(old, indices=backrefs, axis=old_beam_dim) + new = rf.gather(new, indices=backrefs, axis=old_beam_dim) + return rf.where(mask, new, old) - print("logits_packed", logits_packed.raw_tensor.shape) + label_lstm_state = tree.map_structure( + lambda old_state, new_state: _get_masked_state(old_state, new_state, update_state_mask), + label_lstm_state, label_lstm_state_updated + ) - print(prof.key_averages(group_by_input_shape=True).table(sort_by="self_cuda_memory_usage", row_limit=10)) - exit() + i += 1 + + # # Backtrack via backrefs, resolve beams. 
+ # seq_targets_ = [] + # indices = rf.range_over_dim(beam_dim) # FinalBeam -> FinalBeam + # for backrefs, target in zip(seq_backrefs[::-1], seq_targets[::-1]): + # # indices: FinalBeam -> Beam + # # backrefs: Beam -> PrevBeam + # seq_targets_.insert(0, rf.gather(target, indices=indices)) + # indices = rf.gather(backrefs, indices=indices) # FinalBeam -> PrevBeam + # + # seq_targets__ = TensorArray(seq_targets_[0]) + # for target in seq_targets_: + # seq_targets__ = seq_targets__.push_back(target) + # seq_targets = seq_targets__.stack(axis=enc_spatial_dim) + + # torch.set_printoptions(threshold=10_000) + # print("seq_targets", seq_targets.copy_transpose(batch_dims + [beam_dim, enc_spatial_dim]).raw_tensor[0, 0]) + # print("seq_log_prob", seq_log_prob.raw_tensor[0]) + + # calculate full-sum loss using the log-sum-exp trick + max_log_prob = rf.reduce_max(seq_log_prob, axis=beam_dim) + loss = -1 * (max_log_prob + rf.log(rf.reduce_sum(rf.exp(seq_log_prob - max_log_prob), axis=beam_dim))) + + # loss = -rf.log(rf.reduce_sum(rf.exp(seq_log_prob), axis=beam_dim)) + loss.mark_as_loss("full_sum_loss", scale=1.0, use_normalized_loss=True) - from returnn.extern_private.BergerMonotonicRNNT.monotonic_rnnt.pytorch_binding import monotonic_rnnt_loss + return None - loss = monotonic_rnnt_loss( - acts=logits_packed.raw_tensor, - labels=non_blank_targets.copy_transpose(batch_dims + [non_blank_targets_spatial_dim]).raw_tensor, - input_lengths=rf.copy_to_device(enc_spatial_dim.dyn_size_ext, logits.device).raw_tensor, - label_lengths=rf.copy_to_device(non_blank_targets_spatial_dim.dyn_size_ext, logits.device).raw_tensor.int(), - blank_label=model.blank_idx, + +def full_sum_training_w_beam_eff( + *, + model: SegmentalAttEfficientLabelDecoder, + enc_args: Dict, + enc_spatial_dim: Dim, + non_blank_targets: rf.Tensor, # [B, S, V] + non_blank_targets_spatial_dim: Dim, + segment_starts: rf.Tensor, # [B, T] + segment_lens: rf.Tensor, # [B, T] + batch_dims: List[Dim], + beam_size: int, +) -> 
Optional[Dict[str, Tuple[rf.Tensor, Dim]]]: + assert len(batch_dims) == 1, "not supported yet" + assert model.blank_idx == 0, "blank idx needs to be zero because of the way the gradient is scaled" + + # ------------------------ init some variables ------------------------ + beam_dim = Dim(1, name="initial-beam") + batch_dims_ = [beam_dim] + batch_dims + bos_idx = 0 + seq_log_prob = rf.constant(0.0, dims=batch_dims_) + max_seq_len = enc_spatial_dim.get_size_tensor() + max_seq_len = rf.reduce_max(max_seq_len, axis=max_seq_len.dims) + label_lstm_state = model.s_wo_att.default_initial_state(batch_dims=batch_dims) + target = rf.constant(bos_idx, dims=batch_dims_, sparse_dim=model.target_dim) + update_state_mask = rf.convert_to_tensor(target != model.blank_idx) + label_indices = rf.zeros(batch_dims_, dtype="int32") + vocab_range = rf.range_over_dim(model.target_dim) + blank_tensor = rf.convert_to_tensor(model.blank_idx, dtype=vocab_range.dtype) + backrefs = rf.zeros(batch_dims_, dtype="int32") + seq_hash = rf.constant(0, dims=batch_dims_, dtype="int64") + + # ------------------------ targets/embeddings ------------------------ + + non_blank_input_embeddings = model.target_embed(non_blank_targets) # [B, S, D] + non_blank_input_embeddings, non_blank_targets_padded_spatial_dim = rf.pad( + non_blank_input_embeddings, + axes=[non_blank_targets_spatial_dim], + padding=[(1, 0)], + value=0.0, + ) # [B, S+1, D] + non_blank_targets_padded_spatial_dim = non_blank_targets_padded_spatial_dim[0] + + # add blank idx on the right + # this way, when the label index for gathering reached the last non-blank index, it will gather blank after that + # which then only allows corresponding hypotheses to be extended by blank + non_blank_targets_padded, _ = rf.pad( + non_blank_targets, + axes=[non_blank_targets_spatial_dim], + padding=[(0, 1)], + value=model.blank_idx, + out_dims=[non_blank_targets_padded_spatial_dim] ) - # print("loss", loss.shape) + # ------------------------ sizes 
------------------------ - exit() + non_blank_targets_padded_spatial_sizes = rf.copy_to_device( + non_blank_targets_padded_spatial_dim.dyn_size_ext, non_blank_targets.device + ) + non_blank_targets_spatial_sizes = rf.copy_to_device( + non_blank_targets_spatial_dim.dyn_size_ext, non_blank_targets.device) + max_num_labels = rf.reduce_max( + non_blank_targets_spatial_sizes, axis=non_blank_targets_spatial_sizes.dims + ).raw_tensor.item() + enc_spatial_sizes = rf.copy_to_device(enc_spatial_dim.dyn_size_ext, non_blank_targets.device) + + # ------------------------ compute LSTM sequence ------------------------ + + label_lstm_out_seq, _ = model.s_wo_att( + non_blank_input_embeddings, + state=label_lstm_state, + spatial_dim=non_blank_targets_padded_spatial_dim, + ) - loss = rf.convert_to_tensor(loss, name="full_sum_loss") + # ------------------------ chunk dim ------------------------ + + chunk_size = 20 + chunk_dim = Dim(chunk_size, name="chunk") + chunk_range = rf.expand_dim(rf.range_over_dim(chunk_dim), batch_dims[0]) + + i = 0 + seq_targets = [] + seq_backrefs = [] + while i < max_seq_len.raw_tensor: + # get current number of labels for each hypothesis + if i > 0: + prev_label_indices = rf.gather(label_indices, indices=backrefs) + label_indices = rf.where( + update_state_mask, + rf.where( + prev_label_indices == non_blank_targets_padded_spatial_sizes - 1, + prev_label_indices, + prev_label_indices + 1 + ), + prev_label_indices + ) + + # gather ground truth, input embeddings and LSTM output for current label index + ground_truth = rf.gather( + non_blank_targets_padded, + indices=label_indices, + axis=non_blank_targets_padded_spatial_dim, + clip_to_valid=True + ) + input_embed = rf.gather( + non_blank_input_embeddings, + indices=label_indices, + axis=non_blank_targets_padded_spatial_dim, + clip_to_valid=True + ) + label_lstm_out = rf.gather( + label_lstm_out_seq, + indices=label_indices, + axis=non_blank_targets_padded_spatial_dim, + clip_to_valid=True + ) + + # 
precompute attention for the current chunk (more efficient than computing it individually for each label index) + if i % chunk_size == 0: + seg_starts = rf.gather( + segment_starts, + indices=chunk_range, + axis=enc_spatial_dim, + clip_to_valid=True + ) + seg_lens = rf.gather( + segment_lens, + indices=chunk_range, + axis=enc_spatial_dim, + clip_to_valid=True + ) + att = model( + enc=enc_args["enc"], + enc_ctx=enc_args["enc_ctx"], + enc_spatial_dim=enc_spatial_dim, + s=label_lstm_out_seq, + segment_starts=seg_starts, + segment_lens=seg_lens, + ) # [B, S+1, T, D] + chunk_range += chunk_size + + # gather attention for the current label index + att_step = rf.gather( + att, + indices=label_indices, + axis=non_blank_targets_padded_spatial_dim, + clip_to_valid=True + ) + att_step = rf.gather( + att_step, + indices=rf.constant(i % chunk_size, dims=batch_dims, device=att_step.device), + axis=chunk_dim, + clip_to_valid=True + ) + + logits = model.decode_logits( + input_embed=input_embed, + att=att_step, + s=label_lstm_out, + ) # [B, S+1, T, D] + + label_log_prob = rf.log_softmax(logits, axis=model.target_dim) + + # alpha = 0. + # label_log_prob = label_log_prob + rf.stop_gradient(label_log_prob * (alpha - 1)) + + # # scale down blank gradient to avoid outputting only blanks in the beginning + # # and then all other labels in the end + # def custom_backward(grad): + # grad[:, :, 0] *= 0.00005 + # return grad + # + # if rf.get_run_ctx().train_flag: + # label_log_prob.raw_tensor.register_hook(custom_backward) + + # mask label log prob in order to only allow hypotheses corresponding to the ground truth: + # log prob needs to correspond to the next non-blank label... + log_prob_mask = vocab_range == ground_truth + rem_frames = enc_spatial_sizes - i + rem_labels = non_blank_targets_spatial_sizes - label_indices + # ... 
or to blank if there are more frames than labels left + log_prob_mask = rf.logical_or( + log_prob_mask, + rf.logical_and( + vocab_range == blank_tensor, + rem_frames > rem_labels + ) + ) + label_log_prob = rf.where( + log_prob_mask, + label_log_prob, + rf.constant(-1.0e30, dims=batch_dims + [beam_dim, model.target_dim]) + ) + + # recombine hypotheses corresponding to the same node in the lattice (= same hash value -> same label history) + # do this by setting the log prob of all but the best hypothesis to -inf + # and setting the log prob of the best hypothesis to either the max or the sum of the equivalent hypotheses + seq_log_prob = recombination.recombine_seqs(seq_targets, seq_log_prob, seq_hash, beam_dim, batch_dims[0]) + + # set the beam size as low as possible according to the following rules (using recombination): + # 1) in frame i, there are i+1 nodes in the lattice and from each node, we can spawn 2 hypotheses + # 2) if T-i frames remain, only (T-i)*2 hypotheses can survive in order to reach the last node + # 3) in a frame, there are at most S+1 nodes, i.e. 
(S+1)*2 hypotheses can be spawned (see 1)) + # 4) the beam size should not exceed the given beam size + beam_size_ = min( + min((i + 1) * 2, rf.reduce_max(rem_frames, axis=rem_frames.dims).raw_tensor.item() * 2), + min((max_num_labels + 1) * 2 - 1, beam_size) + ) + + # update sequence log prob and beam indices + seq_log_prob = seq_log_prob + label_log_prob # Batch, InBeam, Vocab + seq_log_prob, (backrefs, target), beam_dim = rf.top_k( + seq_log_prob, + k_dim=Dim(beam_size_, name=f"dec-step{i}-beam"), + axis=[beam_dim, model.target_dim] + ) # seq_log_prob, backrefs, target: Batch, Beam + seq_targets.append(target) + seq_backrefs.append(backrefs) + + seq_hash = recombination.update_seq_hash(seq_hash, target, backrefs, model.blank_idx) + + # mask blank label + update_state_mask = rf.convert_to_tensor(target != model.blank_idx) + + i += 1 + + # last recombination + seq_log_prob = recombination.recombine_seqs(seq_targets, seq_log_prob, seq_hash, beam_dim, batch_dims[0]) + + # # Backtrack via backrefs, resolve beams. 
+ # seq_targets_ = [] + # indices = rf.range_over_dim(beam_dim) # FinalBeam -> FinalBeam + # for backrefs, target in zip(seq_backrefs[::-1], seq_targets[::-1]): + # # indices: FinalBeam -> Beam + # # backrefs: Beam -> PrevBeam + # seq_targets_.insert(0, rf.gather(target, indices=indices)) + # indices = rf.gather(backrefs, indices=indices) # FinalBeam -> PrevBeam + # + # seq_targets__ = TensorArray(seq_targets_[0]) + # for target in seq_targets_: + # seq_targets__ = seq_targets__.push_back(target) + # seq_targets = seq_targets__.stack(axis=enc_spatial_dim) + # + # torch.set_printoptions(threshold=10_000) + # print("seq_targets", seq_targets.copy_transpose(batch_dims + [beam_dim, enc_spatial_dim]).raw_tensor[0, :]) + # print("seq_log_prob", seq_log_prob.raw_tensor[0]) + + # calculate full-sum loss using the log-sum-exp trick + max_log_prob = rf.reduce_max(seq_log_prob, axis=beam_dim) + loss = -1 * (max_log_prob + rf.log(rf.reduce_sum(rf.exp(seq_log_prob - max_log_prob), axis=beam_dim))) + + # loss = -rf.log(rf.reduce_sum(rf.exp(seq_log_prob), axis=beam_dim)) loss.mark_as_loss("full_sum_loss", scale=1.0, use_normalized_loss=True) return None + + +def full_sum_training_w_beam_eff_w_recomb( + *, + model: SegmentalAttEfficientLabelDecoder, + enc_args: Dict, + enc_spatial_dim: Dim, + non_blank_targets: rf.Tensor, # [B, S, V] + non_blank_targets_spatial_dim: Dim, + segment_starts: rf.Tensor, # [B, T] + segment_lens: rf.Tensor, # [B, T] + batch_dims: List[Dim], + beam_size: int, +) -> Optional[Dict[str, Tuple[rf.Tensor, Dim]]]: + assert len(batch_dims) == 1, "not supported yet" + assert model.blank_idx == 0, "blank idx needs to be zero because of the way the gradient is scaled" + + # ------------------------ init some variables ------------------------ + beam_dim = Dim(1, name="initial-beam") + batch_dims_ = [beam_dim] + batch_dims + bos_idx = 0 + seq_log_prob = rf.constant(0.0, dims=batch_dims_) + max_seq_len = enc_spatial_dim.get_size_tensor() + max_seq_len = 
rf.reduce_max(max_seq_len, axis=max_seq_len.dims) + label_lstm_state = model.s_wo_att.default_initial_state(batch_dims=batch_dims) + target = rf.constant(bos_idx, dims=batch_dims_, sparse_dim=model.target_dim) + vocab_range = rf.range_over_dim(model.target_dim) + blank_tensor = rf.convert_to_tensor(model.blank_idx, dtype=vocab_range.dtype) + backrefs = rf.zeros(batch_dims_, dtype="int32") + + # ------------------------ targets/embeddings ------------------------ + + non_blank_input_embeddings = model.target_embed(non_blank_targets) # [B, S, D] + non_blank_input_embeddings, non_blank_targets_padded_spatial_dim = rf.pad( + non_blank_input_embeddings, + axes=[non_blank_targets_spatial_dim], + padding=[(1, 0)], + value=0.0, + ) # [B, S+1, D] + non_blank_targets_padded_spatial_dim = non_blank_targets_padded_spatial_dim[0] + + # add blank idx on the right + # this way, when the label index for gathering reached the last non-blank index, it will gather blank after that + # which then only allows corresponding hypotheses to be extended by blank + non_blank_targets_padded, _ = rf.pad( + non_blank_targets, + axes=[non_blank_targets_spatial_dim], + padding=[(0, 1)], + value=model.blank_idx, + out_dims=[non_blank_targets_padded_spatial_dim] + ) + + # ------------------------ sizes ------------------------ + + non_blank_targets_padded_spatial_sizes = rf.copy_to_device( + non_blank_targets_padded_spatial_dim.dyn_size_ext, non_blank_targets.device + ) + non_blank_targets_spatial_sizes = rf.copy_to_device( + non_blank_targets_spatial_dim.dyn_size_ext, non_blank_targets.device) + max_num_labels = rf.reduce_max( + non_blank_targets_spatial_sizes, axis=non_blank_targets_spatial_sizes.dims + ).raw_tensor.item() + single_col_dim = Dim(dimension=max_num_labels + 1, name="max-num-labels") + label_indices = rf.zeros(batch_dims_, dtype="int32", sparse_dim=single_col_dim) + + enc_spatial_sizes = rf.copy_to_device(enc_spatial_dim.dyn_size_ext, non_blank_targets.device) + + # 
------------------------ compute LSTM sequence ------------------------ + + label_lstm_out_seq, _ = model.s_wo_att( + non_blank_input_embeddings, + state=label_lstm_state, + spatial_dim=non_blank_targets_padded_spatial_dim, + ) + + # ------------------------ chunk dim ------------------------ + + chunk_size = 20 + chunk_dim = Dim(chunk_size, name="chunk") + chunk_range = rf.expand_dim(rf.range_over_dim(chunk_dim), batch_dims[0]) + + i = 0 + seq_targets = [] + seq_backrefs = [] + while i < max_seq_len.raw_tensor: + # get current number of labels for each hypothesis + if i > 0: + prev_label_indices = rf.gather(label_indices, indices=backrefs) + # mask blank label + update_state_mask = rf.convert_to_tensor(target != prev_label_indices) + label_indices = rf.where( + update_state_mask, + rf.where( + prev_label_indices == non_blank_targets_padded_spatial_sizes - 1, + prev_label_indices, + prev_label_indices + 1 + ), + prev_label_indices + ) + + # gather ground truth, input embeddings and LSTM output for current label index + ground_truth = rf.gather( + non_blank_targets_padded, + indices=label_indices, + axis=non_blank_targets_padded_spatial_dim, + clip_to_valid=True + ) + input_embed = rf.gather( + non_blank_input_embeddings, + indices=label_indices, + axis=non_blank_targets_padded_spatial_dim, + clip_to_valid=True + ) + label_lstm_out = rf.gather( + label_lstm_out_seq, + indices=label_indices, + axis=non_blank_targets_padded_spatial_dim, + clip_to_valid=True + ) + + # precompute attention for the current chunk (more efficient than computing it individually for each label index) + if i % chunk_size == 0: + seg_starts = rf.gather( + segment_starts, + indices=chunk_range, + axis=enc_spatial_dim, + clip_to_valid=True + ) + seg_lens = rf.gather( + segment_lens, + indices=chunk_range, + axis=enc_spatial_dim, + clip_to_valid=True + ) + att = model( + enc=enc_args["enc"], + enc_ctx=enc_args["enc_ctx"], + enc_spatial_dim=enc_spatial_dim, + s=label_lstm_out_seq, + 
segment_starts=seg_starts, + segment_lens=seg_lens, + ) # [B, S+1, T, D] + chunk_range += chunk_size + + # gather attention for the current label index + att_step = rf.gather( + att, + indices=label_indices, + axis=non_blank_targets_padded_spatial_dim, + clip_to_valid=True + ) + att_step = rf.gather( + att_step, + indices=rf.constant(i % chunk_size, dims=batch_dims, device=att_step.device), + axis=chunk_dim, + clip_to_valid=True + ) + + logits = model.decode_logits( + input_embed=input_embed, + att=att_step, + s=label_lstm_out, + ) # [B, S+1, T, D] + + label_log_prob = rf.log_softmax(logits, axis=model.target_dim) + + # mask label log prob in order to only allow hypotheses corresponding to the ground truth: + # log prob needs to correspond to the next non-blank label... + log_prob_mask = vocab_range == ground_truth + rem_frames = enc_spatial_sizes - i + rem_labels = non_blank_targets_spatial_sizes - label_indices + # ... or to blank if there are more frames than labels left + log_prob_mask = rf.logical_or( + log_prob_mask, + rf.logical_and( + vocab_range == blank_tensor, + rem_frames > rem_labels + ) + ) + label_log_prob = rf.where( + log_prob_mask, + label_log_prob, + rf.constant(-1.0e30, dims=batch_dims + [beam_dim, model.target_dim]) + ) + + label_log_prob = rf.where( + rf.convert_to_tensor(i >= rf.copy_to_device(enc_spatial_dim.get_size_tensor(), label_log_prob.device)), + rf.sparse_to_dense( + model.blank_idx, + axis=model.target_dim, + label_value=0.0, + other_value=-1.0e30 + ), + label_log_prob + ) + + seq_log_prob = recombination.recombine_seqs_train( + seq_log_prob=seq_log_prob, + label_log_prob=label_log_prob, + label_indices=label_indices, + ground_truth=ground_truth, + target_dim=model.target_dim, + single_col_dim=single_col_dim, + labels_padded_spatial_sizes=non_blank_targets_padded_spatial_sizes, + beam_dim=beam_dim, + batch_dims=batch_dims, + blank_idx=model.blank_idx, + ) + + beam_size_ = min( + min((i + 2), rf.reduce_max(rem_frames, 
axis=rem_frames.dims).raw_tensor.item()), + min((max_num_labels + 1), beam_size) + ) + + # update sequence log prob and beam indices + # seq_log_prob = seq_log_prob + label_log_prob # Batch, InBeam, Vocab + seq_log_prob, (backrefs, target), beam_dim = rf.top_k( + seq_log_prob, + k_dim=Dim(beam_size_, name=f"dec-step{i}-beam"), + axis=[beam_dim, single_col_dim] + ) # seq_log_prob, backrefs, target: Batch, Beam + seq_targets.append(target) + seq_backrefs.append(backrefs) + + i += 1 + + # Backtrack via backrefs, resolve beams. + seq_targets_ = [] + indices = rf.range_over_dim(beam_dim) # FinalBeam -> FinalBeam + for backrefs, target in zip(seq_backrefs[::-1], seq_targets[::-1]): + # indices: FinalBeam -> Beam + # backrefs: Beam -> PrevBeam + seq_targets_.insert(0, rf.gather(target, indices=indices)) + indices = rf.gather(backrefs, indices=indices) # FinalBeam -> PrevBeam + + seq_targets__ = TensorArray(seq_targets_[0]) + for target in seq_targets_: + seq_targets__ = seq_targets__.push_back(target) + seq_targets = seq_targets__.stack(axis=enc_spatial_dim) + + torch.set_printoptions(threshold=10_000) + print("seq_targets", seq_targets.copy_transpose(batch_dims + [beam_dim, enc_spatial_dim]).raw_tensor[0, 0]) + + loss = -1 * seq_log_prob + + # loss = -rf.log(rf.reduce_sum(rf.exp(seq_log_prob), axis=beam_dim)) + loss.mark_as_loss("full_sum_loss", scale=1.0, use_normalized_loss=True) + + # print("loss", loss.raw_tensor) + # print("single_col_dim", single_col_dim.dimension) + # exit() + + return None + diff --git a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/recog.py b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/recog.py index cd7886330..53e39d302 100644 --- a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/recog.py +++ 
b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/recog.py @@ -8,6 +8,7 @@ from i6_experiments.users.schmitt.returnn_frontend.model_interfaces.recog import RecogDef from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.network_builder_rf.base import _batch_size_factor from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.network_builder_rf.segmental.model import SegmentalAttentionModel +from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.network_builder_rf.segmental import recombination from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.network_builder_rf.segmental.utils import get_masked, get_non_blank_mask from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.network_builder_rf.beam_search import utils as beam_search_utils from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.network_builder_rf.segmental.model_new.blank_model.model import ( @@ -16,82 +17,12 @@ ) -def recombine_seqs( - seq_targets: list, - seq_log_prob: Tensor, - seq_backrefs: list, - seq_hash: Tensor, - beam_dim: Dim, - batch_dim: Dim, - i: int -) -> Tensor: - if len(seq_targets) in (0, 1): - return seq_log_prob - - print("seq_hash: ", seq_hash.raw_tensor) - print("seq_log_prob before: ", seq_log_prob.raw_tensor) - - seq_hash_cpu = rf.copy_to_device(seq_hash.copy_transpose([batch_dim, beam_dim]), device="cpu") - # convert from neg log prob to log prob - seq_log_prob = rf.copy_to_device(seq_log_prob.copy_transpose([batch_dim, beam_dim]), device="cpu") - - for b in range(batch_dim.dyn_size_ext.raw_tensor.item()): - # for each batch dim, we need to find the 
seqs that have the same hash value - seq_sets = {} - for h in range(beam_dim.dimension): - # hash value of current hypothesis - seq_hash_value = seq_hash_cpu.raw_tensor[b, h].item() - if seq_hash_value not in seq_sets: - seq_sets[seq_hash_value] = [] - # insert hypothesis index into the list of hypotheses with the same hash value - seq_sets[seq_hash_value].append(h) - # for each set of hypotheses with the same hash value, we keep the one with the highest log prob - for seq_set in seq_sets.values(): - if len(seq_set) == 1: - continue - best_score = float("-inf") - best_idx = -1 - for idx in seq_set: - if seq_log_prob.raw_tensor[b, idx] > best_score: - best_score = seq_log_prob.raw_tensor[b, idx] - best_idx = idx - # print("batch: ", b, "seq_set: ", seq_set, "best_idx: ", best_idx, "best_score: ", best_score) - # exit() - for idx in seq_set: - if idx != best_idx: - seq_log_prob.raw_tensor[b, idx] = float("-inf") - else: - seq_log_prob.raw_tensor[b, idx] = best_score - - seq_log_prob = seq_log_prob - print("seq_log_prob after: ", seq_log_prob.raw_tensor) - exit() - - return rf.copy_to_device(seq_log_prob, device="gpu") - - -def update_seq_hash(seq_hash: Tensor, target: Tensor, backrefs: Tensor) -> Tensor: - print("update_seq_hash") - print("old seq_hash", seq_hash.raw_tensor) - print("target", target.raw_tensor) - print("backrefs", backrefs.raw_tensor) - print("\n\n") - - old_seq_hash = rf.gather(seq_hash, indices=backrefs) - seq_hash = rf.where( - target == 10025, - old_seq_hash, - (old_seq_hash * 257 + (target + 1)) % (10 ** 9 + 7) - ) - return seq_hash - - def model_recog( *, model: SegmentalAttentionModel, data: Tensor, data_spatial_dim: Dim, - use_recombination: bool = False, + use_recombination: Optional[str] = None, ) -> Tuple[Tensor, Tensor, Dim, Dim]: """ Function is run within RETURNN. 
@@ -129,6 +60,27 @@ def model_recog( blank_decoder_state = model.blank_decoder.default_initial_state(batch_dims=batch_dims_) if model.language_model: lm_state = model.language_model.default_initial_state(batch_dims=batch_dims_) + for state in lm_state: + if state == "pos": + # pass + lm_state[state] = rf.zeros(batch_dims_, dtype="int32") + else: + self_att_expand_dim = Dim(rf.zeros(batch_dims_, dtype="int32"), name="self_att_expand_dim_init") + lm_state[state].self_att.accum_axis = self_att_expand_dim + + k_accum = lm_state[state].self_att.k_accum # type: rf.Tensor + k_accum_raw = k_accum.raw_tensor + lm_state[state].self_att.k_accum = k_accum.copy_template_replace_dim_tag( + k_accum.get_axis_from_description("stag:self_att_expand_dim_init"), self_att_expand_dim + ) + lm_state[state].self_att.k_accum.raw_tensor = k_accum_raw + + v_accum = lm_state[state].self_att.v_accum # type: rf.Tensor + v_accum_raw = v_accum.raw_tensor + lm_state[state].self_att.v_accum = v_accum.copy_template_replace_dim_tag( + v_accum.get_axis_from_description("stag:self_att_expand_dim_init"), self_att_expand_dim + ) + lm_state[state].self_att.v_accum.raw_tensor = v_accum_raw bos_idx = 0 @@ -144,6 +96,7 @@ def model_recog( seq_log_prob = rf.constant(0.0, dims=batch_dims_) if use_recombination: assert len(batch_dims) == 1 + assert use_recombination in {"sum", "max"} seq_hash = rf.constant(0, dims=batch_dims_, dtype="int64") input_embed = rf.zeros( @@ -212,7 +165,13 @@ def model_recog( spatial_dim=single_step_dim, state=lm_state, ) - label_log_prob += rf.log_softmax(lm_logits, axis=model.target_dim) + lm_label_log_prob = rf.log_softmax(lm_logits, axis=model.target_dim) + # print(i) + # print("lm_label_log_prob: ", lm_label_log_prob.copy_transpose(batch_dims + [beam_dim, model.target_dim]).raw_tensor[0, :, :6]) + # print() + # if i == 10: + # exit() + label_log_prob += 0.4 * lm_label_log_prob # ------------------- blank step ------------------- @@ -243,12 +202,29 @@ def model_recog( else: 
output_log_prob = label_log_prob + # for shorter seqs in the batch, set the blank score to zero and the others to ~-inf + output_log_prob = rf.where( + rf.convert_to_tensor(i >= rf.copy_to_device(enc_spatial_dim.get_size_tensor(), data.device)), + rf.sparse_to_dense( + model.blank_idx, + axis=model.target_dim if model.use_joint_model else model.align_target_dim, + label_value=0.0, + other_value=-1.0e30 + ), + output_log_prob + ) + # ------------------- top-k ------------------- if use_recombination: - seq_log_prob = recombine_seqs(seq_targets, seq_log_prob, seq_backrefs, seq_hash, beam_dim, batch_dims[0], i) - if i == 3: - exit() + seq_log_prob = recombination.recombine_seqs( + seq_targets, + seq_log_prob, + seq_hash, + beam_dim, + batch_dims[0], + use_sum=use_recombination == "sum" + ) seq_log_prob = seq_log_prob + output_log_prob # Batch, InBeam, Vocab old_beam_dim = beam_dim.copy() @@ -257,11 +233,12 @@ def model_recog( k_dim=Dim(beam_size, name=f"dec-step{i}-beam"), axis=[beam_dim, model.target_dim if model.use_joint_model else model.align_target_dim] ) # seq_log_prob, backrefs, target: Batch, Beam + # print("seq_log_prob: ", seq_log_prob.raw_tensor) seq_targets.append(target) seq_backrefs.append(backrefs) if use_recombination: - seq_hash = update_seq_hash(seq_hash, target, backrefs) + seq_hash = recombination.update_seq_hash(seq_hash, target, backrefs, model.blank_idx) # mask for updating label-sync states update_state_mask = rf.convert_to_tensor(target != model.blank_idx) @@ -289,33 +266,84 @@ def _get_masked_state(old, new, mask): # ------------------- update external LM state ------------------- if model.language_model: - def _get_masked_state_lm(old: rf.Tensor, new: rf.Tensor, mask: rf.Tensor): - if isinstance(old, Dim): - return new - - def _update(tensor: rf.Tensor): - tensor = tensor.copy_transpose(batch_dims + [old_beam_dim] + tensor.remaining_dims(batch_dims_)) - tensor_raw_tensor = beam_search_utils.batch_gather( - tensor.raw_tensor, - 
indices=backrefs.copy_transpose(batch_dims + [beam_dim]).raw_tensor + for state in lm_state: + if state == "pos": + lm_state[state] = rf.where( + update_state_mask, + rf.gather(lm_state_updated[state], indices=backrefs), + rf.gather(lm_state[state], indices=backrefs) ) - tensor = tensor.copy_template_replace_dim_tag(1, beam_dim) - tensor.raw_tensor = tensor_raw_tensor - return tensor - - old = _update(old) - new = _update(new) + else: + updated_accum_axis = lm_state_updated[state].self_att.accum_axis - return rf.where(mask, new, old) + updated_self_att_expand_dim_dyn_size_ext = rf.gather(updated_accum_axis.dyn_size_ext, indices=backrefs) + masked_self_att_expand_dim_dyn_size_ext = rf.where( + update_state_mask, + updated_self_att_expand_dim_dyn_size_ext, + updated_self_att_expand_dim_dyn_size_ext - 1 + ) + masked_self_att_expand_dim = Dim(masked_self_att_expand_dim_dyn_size_ext, name="self_att_expand_dim_init") + lm_state[state].self_att.accum_axis = masked_self_att_expand_dim + + def _mask_lm_state(tensor: rf.Tensor): + tensor = rf.gather(tensor, indices=backrefs) + tensor = tensor.copy_transpose( + [updated_accum_axis] + tensor.remaining_dims(updated_accum_axis)) + tensor_raw = tensor.raw_tensor + tensor_raw = tensor_raw[:rf.reduce_max( + masked_self_att_expand_dim_dyn_size_ext, + axis=masked_self_att_expand_dim_dyn_size_ext.dims + ).raw_tensor.item()] + tensor = tensor.copy_template_replace_dim_tag( + tensor.get_axis_from_description(updated_accum_axis), masked_self_att_expand_dim + ) + tensor.raw_tensor = tensor_raw + return tensor + + lm_state[state].self_att.k_accum = _mask_lm_state(lm_state_updated[state].self_att.k_accum) + lm_state[state].self_att.v_accum = _mask_lm_state(lm_state_updated[state].self_att.v_accum) + + # lm_state[state].self_att.k_accum = rf.gather(lm_state_updated[state].self_att.k_accum, indices=backrefs) + # lm_state[state].self_att.k_accum = lm_state[state].self_att.k_accum.copy_transpose( + # [updated_accum_axis] + 
lm_state[state].self_att.k_accum.remaining_dims(updated_accum_axis)) + # k_accum_raw = lm_state[state].self_att.k_accum.raw_tensor + # k_accum_raw = k_accum_raw[:rf.reduce_max( + # masked_self_att_expand_dim_dyn_size_ext, + # axis=masked_self_att_expand_dim_dyn_size_ext.dims + # ).raw_tensor.item()] + # lm_state[state].self_att.k_accum = lm_state[state].self_att.k_accum.copy_template_replace_dim_tag( + # lm_state[state].self_att.k_accum.get_axis_from_description(updated_accum_axis), masked_self_att_expand_dim + # ) + # lm_state[state].self_att.k_accum.raw_tensor = k_accum_raw + # + # lm_state[state].self_att.v_accum = rf.gather(lm_state_updated[state].self_att.v_accum, indices=backrefs) + # lm_state[state].self_att.v_accum = lm_state[state].self_att.v_accum.copy_transpose( + # [updated_accum_axis] + lm_state[state].self_att.v_accum.remaining_dims(updated_accum_axis)) + # v_accum_raw = lm_state[state].self_att.v_accum.raw_tensor + # v_accum_raw = v_accum_raw[:rf.reduce_max( + # masked_self_att_expand_dim_dyn_size_ext, + # axis=masked_self_att_expand_dim_dyn_size_ext.dims + # ).raw_tensor.item()] + # lm_state[state].self_att.v_accum = lm_state[state].self_att.v_accum.copy_template_replace_dim_tag( + # lm_state[state].self_att.v_accum.get_axis_from_description(updated_accum_axis), masked_self_att_expand_dim + # ) + # lm_state[state].self_att.v_accum.raw_tensor = v_accum_raw - lm_state = tree.map_structure( - lambda old_state, new_state: _get_masked_state_lm(old_state, new_state, update_state_mask), - lm_state, lm_state_updated - ) + i += 1 - exit() + # last recombination + if use_recombination: + seq_log_prob = recombination.recombine_seqs( + seq_targets, + seq_log_prob, + seq_hash, + beam_dim, + batch_dims[0], + use_sum=use_recombination == "sum" + ) - i += 1 + # print("seq_log_prob: ", seq_log_prob.raw_tensor) + # exit() # Backtrack via backrefs, resolve beams. 
seq_targets_ = [] diff --git a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/recombination.py b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/recombination.py new file mode 100644 index 000000000..b8017325c --- /dev/null +++ b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/recombination.py @@ -0,0 +1,204 @@ +from typing import Optional, Dict, Any, Tuple, Sequence +import tree +import numpy as np +import torch + +from returnn.tensor import Tensor, Dim +import returnn.frontend as rf + + +def recombine_seqs( + seq_targets: list, + seq_log_prob: Tensor, + seq_hash: Tensor, + beam_dim: Dim, + batch_dim: Dim, + use_sum: bool = True, +) -> Tensor: + if len(seq_targets) in (0, 1): + return seq_log_prob + + seq_hash_cpu = rf.copy_to_device(seq_hash.copy_transpose([batch_dim, beam_dim]), device="cpu") + # convert from neg log prob to log prob + seq_log_prob_cpu = rf.copy_to_device(seq_log_prob.copy_transpose([batch_dim, beam_dim]), device="cpu") + + for b in range(batch_dim.dyn_size_ext.raw_tensor.item()): + # for each batch dim, we need to find the seqs that have the same hash value + seq_sets = {} + for h in range(beam_dim.dimension): + # hash value of current hypothesis + seq_hash_value = seq_hash_cpu.raw_tensor[b, h].item() + if seq_hash_value not in seq_sets: + seq_sets[seq_hash_value] = [] + # insert hypothesis index into the list of hypotheses with the same hash value + seq_sets[seq_hash_value].append(h) + # for each set of hypotheses with the same hash value, we keep the one with the highest log prob + for seq_set in seq_sets.values(): + # skip if there is only one hypothesis in the set + if len(seq_set) == 1: + continue + best_score = -1.0e30 + best_idx = -1 + # find the hypothesis with the highest log prob + for idx in seq_set: + if 
seq_log_prob_cpu.raw_tensor[b, idx] > best_score: + best_score = seq_log_prob_cpu.raw_tensor[b, idx] + best_idx = idx + + if use_sum: + sum_score = torch.zeros(1, device="cpu") + # calculate log of sum of probs via log-sum-exp trick + for idx in seq_set: + sum_score += torch.exp(seq_log_prob_cpu.raw_tensor[b, idx] - best_score) + recomb_score = torch.log(sum_score) + best_score + else: + recomb_score = best_score + + for idx in seq_set: + if idx != best_idx: + seq_log_prob_cpu.raw_tensor[b, idx] = -1.0e30 + else: + seq_log_prob_cpu.raw_tensor[b, idx] = recomb_score + + return rf.copy_to_device(seq_log_prob_cpu, device=seq_log_prob.device) + + +def recombine_seqs_train( + seq_log_prob: Tensor, + label_log_prob: Tensor, + label_indices: Tensor, + ground_truth: Tensor, + labels_padded_spatial_sizes: Tensor, + target_dim: Dim, + single_col_dim: Dim, + beam_dim: Dim, + batch_dims: Sequence[Dim], + blank_idx: int, +) -> Tensor: + # local horizontal scores for each hyp: [B, beam] + horizontal_scores = rf.gather( + label_log_prob, + indices=rf.constant(blank_idx, dtype="int32", dims=batch_dims), + axis=target_dim + ) + + # combined horizontal scores for each hyp: [B, beam] + horizontal_scores = seq_log_prob + horizontal_scores + + # if a hypothesis has score < -1.0e30, it means that it got recombined with another hypothesis in the previous step + # in this case, it has the same label index as the hypothesis it got combined with and therefore, the scatter + # below would result in a horizontal score < -1.0e30, which is wrong. 
+ # therefore, set the label index to S+2 and cut off this score afterwards + label_indices_ext = rf.where( + seq_log_prob <= -1.0e30, + single_col_dim.dimension, + label_indices, + ) + single_col_dim_ext = single_col_dim + 1 + label_indices_ext.sparse_dim = single_col_dim_ext + + # lattice column with horizontal scores: [B, S+2] + # horizontal -> label index stays the same + horizontal_scores = rf.scatter( + horizontal_scores, + indices=label_indices_ext, + indices_dim=beam_dim, + ) + horizontal_scores = horizontal_scores.copy_transpose(batch_dims + [single_col_dim_ext]) + # cut off the last row as mentioned above + horizontal_scores_raw = horizontal_scores.raw_tensor[:, :-1] + # lattice column with horizontal scores: [B, S+1] + horizontal_scores = horizontal_scores.copy_template_replace_dim_tag( + horizontal_scores.get_axis_from_description(single_col_dim_ext), + single_col_dim + ) + horizontal_scores.raw_tensor = horizontal_scores_raw + + # lattice column with horizontal scores for each hyp: [B, beam, S+1] + # each hyp only has one node with score > -1.0e30, which is the node corresponding to the horizontal transition + horizontal_scores = rf.where( + rf.range_over_dim(single_col_dim) == label_indices_ext, + horizontal_scores, + rf.constant(-1.0e30, dims=batch_dims) + ) + + # local diagonal scores for each hyp: [B, beam] + diagonal_scores = rf.gather( + label_log_prob, + indices=ground_truth, + axis=target_dim + ) + + # if the ground truth is blank, it means the hypothesis is already at the top-most row of the lattice + # in this case, set the diagonal score to -1.0e30 + diagonal_scores = rf.where( + ground_truth == blank_idx, + rf.constant(-1.0e30, dims=batch_dims), + diagonal_scores + ) + # combined diagonal scores for each hyp: [B, beam] + diagonal_scores = seq_log_prob + diagonal_scores + + # the updated label indices after diagonal transition: [B, beam] + label_indices_updated = label_indices + 1 + + + # same as with horizontal_scores, see above + 
label_indices_updated_ext = rf.where( + seq_log_prob <= -1.0e30, + single_col_dim.dimension, + label_indices_updated, + ) + label_indices_updated_ext.sparse_dim = single_col_dim_ext + + # lattice column with diagonal scores: [B, S+2] + # diagonal -> label index is updated + diagonal_scores = rf.scatter( + diagonal_scores, + indices=label_indices_updated_ext, + indices_dim=beam_dim, + ) + diagonal_scores = diagonal_scores.copy_transpose(batch_dims + [single_col_dim_ext]) + # cut off last row, see above + diagonal_scores_raw = diagonal_scores.raw_tensor[:, :-1] + # lattice column with diagonal scores: [B, S+1] + diagonal_scores = diagonal_scores.copy_template_replace_dim_tag( + diagonal_scores.get_axis_from_description(single_col_dim_ext), + single_col_dim + ) + diagonal_scores.raw_tensor = diagonal_scores_raw + + # lattice column with diagonal scores for each hyp: [B, beam, S+1] + # each hyp only has one node with score > -1.0e30, which is the node corresponding to the diagonal transition + diagonal_scores = rf.where( + rf.range_over_dim(single_col_dim) == label_indices_updated_ext, + diagonal_scores, + rf.constant(-1.0e30, dims=batch_dims) + ) + + best_scores = rf.maximum(horizontal_scores, diagonal_scores) + sum_scores = rf.exp(horizontal_scores - best_scores) + rf.exp(diagonal_scores - best_scores) + sum_scores = best_scores + rf.safe_log(sum_scores) + + best_scores = rf.reduce_max(sum_scores, axis=beam_dim) + is_max = sum_scores == best_scores + sum_scores = best_scores + rf.log(rf.reduce_sum(rf.exp(sum_scores - best_scores), axis=beam_dim)) + + sum_scores = rf.expand_dim(sum_scores, beam_dim) + sum_scores = rf.where( + is_max, + sum_scores, + rf.constant(-1.0e30, dims=batch_dims) + ) + + return sum_scores + + +def update_seq_hash(seq_hash: Tensor, target: Tensor, backrefs: Tensor, blank_idx: int) -> Tensor: + old_seq_hash = rf.gather(seq_hash, indices=backrefs) + seq_hash = rf.where( + target == blank_idx, + old_seq_hash, + (old_seq_hash * 257 + (target + 1)) % 
(10 ** 9 + 7) + ) + return seq_hash diff --git a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/train.py b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/train.py index 082d8525d..bb5c56b40 100644 --- a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/train.py +++ b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/train.py @@ -1,6 +1,6 @@ import torch -from i6_experiments.users.schmitt.returnn_frontend.model_interfaces.training import FramewiseTrainDef +from i6_experiments.users.schmitt.returnn_frontend.model_interfaces.training import FramewiseTrainDef, FullSumTrainDef from returnn.tensor import TensorDict @@ -16,15 +16,35 @@ from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.network_builder_rf.segmental.model_new.label_model.train import ( full_sum_training as label_model_full_sum_training ) +from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.network_builder_rf.segmental.model_new.label_model.train import ( + full_sum_training_w_beam as label_model_full_sum_training_w_beam +) +from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.network_builder_rf.segmental.model_new.label_model.train import ( + full_sum_training_w_beam_eff as label_model_full_sum_training_w_beam_eff +) +from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.network_builder_rf.segmental.model_new.label_model.train import ( + full_sum_training_w_beam_eff_w_recomb as label_model_full_sum_training_w_beam_eff_w_recomb +) from 
i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.network_builder_rf.segmental.model_new.blank_model.train import ( viterbi_training as blank_model_viterbi_training ) from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.network_builder_rf.segmental.model_new.blank_model.train import ( viterbi_training_v3 as blank_model_viterbi_training_v3 ) +from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.network_builder_rf.segmental.model_new.blank_model.train import ( + viterbi_training_v4 as blank_model_viterbi_training_v4 +) +from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.network_builder_rf.segmental.model_new.blank_model.train import ( + viterbi_training_v5 as blank_model_viterbi_training_v5 +) +from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.network_builder_rf.segmental.model_new.blank_model.train import ( + viterbi_training_v6 as blank_model_viterbi_training_v6 +) from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.network_builder_rf.segmental.model_new.blank_model.model import ( BlankDecoderV1, BlankDecoderV3, + BlankDecoderV5, + BlankDecoderV6 ) from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.network_builder_rf.segmental.model_new.label_model.model import ( SegmentalAttLabelDecoder, @@ -57,6 +77,26 @@ def _returnn_v2_train_step(*, model, extern_data: TensorDict, **_kwargs_unused): ) +def _returnn_v2_full_sum_train_step(*, model, extern_data: TensorDict, **_kwargs_unused): + from returnn.config import get_global_config + + config = get_global_config() + default_input_key = config.typed_value("default_input") + default_target_key 
= config.typed_value("target") + data = extern_data[default_input_key] + data_spatial_dim = data.get_time_dim_tag() + targets = extern_data[default_target_key] + targets_spatial_dim = targets.get_time_dim_tag() + train_def: FullSumTrainDef = config.typed_value("_train_def") + train_def( + model=model, + data=data, + data_spatial_dim=data_spatial_dim, + non_blank_targets=targets, + non_blank_targets_spatial_dim=targets_spatial_dim, + ) + + def viterbi_training( *, model: SegmentalAttentionModel, @@ -71,6 +111,8 @@ def viterbi_training( config = get_global_config() # noqa aux_loss_layers = config.typed_value("aux_loss_layers") aux_loss_scales = config.typed_value("aux_loss_scales", ([1.0] * len(aux_loss_layers)) if aux_loss_layers else None) + use_ctc_loss = config.typed_value("use_ctc_loss", True) + generate_ctc_alignments_on_the_fly = config.typed_value("generate_ctc_alignments_on_the_fly", False) force_inefficient_loop = config.typed_value("force_inefficient_loop", False) if data.feature_dim and data.feature_dim.dimension == 1: @@ -94,7 +136,6 @@ def viterbi_training( # TODO: use rf.window() instead segment_starts, segment_lens = utils.get_segment_starts_and_lens( non_blank_mask=rf.sequence_mask(align_targets.dims), # this way, every frame is interpreted as non-blank - align_targets=align_targets, align_targets_spatial_dim=align_targets_spatial_dim, model=model, batch_dims=batch_dims, @@ -118,7 +159,6 @@ def viterbi_training( segment_starts, segment_lens = utils.get_segment_starts_and_lens( non_blank_mask, - align_targets, align_targets_spatial_dim, model, batch_dims, @@ -132,24 +172,46 @@ def viterbi_training( data, in_spatial_dim=data_spatial_dim, collected_outputs=collected_outputs) if aux_loss_layers: - for i, layer_idx in enumerate(aux_loss_layers): - if layer_idx > len(model.encoder.layers): - continue - linear = getattr(model, f"enc_aux_logits_{layer_idx}") - aux_logits = linear(collected_outputs[str(layer_idx - 1)]) - aux_loss = rf.ctc_loss( - 
logits=aux_logits, - targets=non_blank_targets, - input_spatial_dim=enc_spatial_dim, - targets_spatial_dim=non_blank_targets_spatial_dim, - blank_index=model.blank_idx, - ) - aux_loss.mark_as_loss( - f"ctc_{layer_idx}", - scale=aux_loss_scales[i], - custom_inv_norm_factor=align_targets_spatial_dim.get_size_tensor(), - use_normalized_loss=True, + if use_ctc_loss: + for i, layer_idx in enumerate(aux_loss_layers): + if layer_idx > len(model.encoder.layers): + continue + linear = getattr(model.encoder, f"enc_aux_logits_{layer_idx}") + aux_logits = linear(collected_outputs[str(layer_idx - 1)]) + aux_loss = rf.ctc_loss( + logits=aux_logits, + targets=non_blank_targets, + input_spatial_dim=enc_spatial_dim, + targets_spatial_dim=non_blank_targets_spatial_dim, + blank_index=model.blank_idx, + ) + aux_loss.mark_as_loss( + f"ctc_{layer_idx}", + scale=aux_loss_scales[i], + custom_inv_norm_factor=align_targets_spatial_dim.get_size_tensor(), + use_normalized_loss=True, + ) + elif generate_ctc_alignments_on_the_fly: + assert len(aux_loss_layers) == 1 + assert len(batch_dims) == 1 + assert model.blank_idx == model.target_dim.dimension + print("Generating CTC alignments on the fly") + layer_idx = aux_loss_layers[0] + linear = getattr(model.encoder, f"enc_aux_logits_{layer_idx}") + aux_logits = linear(collected_outputs[str(layer_idx - 1)]) # type: rf.Tensor + print("aux_logits", aux_logits) + + from torchaudio.functional import forced_align + rem_dims = aux_logits.remaining_dims(batch_dims + [enc_spatial_dim]) + ctc_align = forced_align( + log_probs=aux_logits.copy_transpose(batch_dims + [enc_spatial_dim] + rem_dims).raw_tensor, + targets=non_blank_targets.copy_transpose(batch_dims + [non_blank_targets_spatial_dim]).raw_tensor.contiguous(), + input_lengths=enc_spatial_dim.get_size_tensor().raw_tensor, + target_lengths=non_blank_targets_spatial_dim.get_size_tensor().raw_tensor, + blank=model.blank_idx, ) + print("ctc_align", ctc_align.shape) + exit() if model.use_joint_model: @@ 
-241,8 +303,11 @@ def viterbi_training( emit_blank_target_dim=emit_blank_target_dim, batch_dims=batch_dims, ) - else: - assert isinstance(model.blank_decoder, BlankDecoderV3) and model.blank_decoder_version in (3,) + elif model.blank_decoder_version in (3, 5, 6): + assert isinstance( + model.blank_decoder, BlankDecoderV3) or isinstance( + model.blank_decoder, BlankDecoderV5) or isinstance( + model.blank_decoder, BlankDecoderV6) label_states_unmasked = utils.get_unmasked( input=label_decoder_outputs[0], @@ -250,12 +315,51 @@ def viterbi_training( mask=non_blank_mask, mask_spatial_dim=align_targets_spatial_dim ) - blank_model_viterbi_training_v3( + + if model.blank_decoder_version == 3: + blank_model_viterbi_training_v3( + model=model.blank_decoder, + enc_args=enc_args, + enc_spatial_dim=enc_spatial_dim, + label_states_unmasked=label_states_unmasked, + label_states_unmasked_spatial_dim=align_targets_spatial_dim, + emit_ground_truth=emit_ground_truth, + emit_blank_target_dim=emit_blank_target_dim, + batch_dims=batch_dims, + ) + elif model.blank_decoder_version == 5: + blank_model_viterbi_training_v5( + model=model.blank_decoder, + enc_args=enc_args, + enc_spatial_dim=enc_spatial_dim, + label_states_unmasked=label_states_unmasked, + label_states_unmasked_spatial_dim=align_targets_spatial_dim, + emit_ground_truth=emit_ground_truth, + emit_blank_target_dim=emit_blank_target_dim, + batch_dims=batch_dims, + ) + else: + blank_model_viterbi_training_v6( + model=model.blank_decoder, + enc_args=enc_args, + enc_spatial_dim=enc_spatial_dim, + label_states_unmasked=label_states_unmasked, + label_states_unmasked_spatial_dim=align_targets_spatial_dim, + emit_ground_truth=emit_ground_truth, + emit_blank_target_dim=emit_blank_target_dim, + batch_dims=batch_dims, + ) + else: + assert model.blank_decoder_version == 4 and isinstance(model.blank_decoder, BlankDecoderV3) + blank_model_viterbi_training_v4( model=model.blank_decoder, enc_args=enc_args, enc_spatial_dim=enc_spatial_dim, - 
label_states_unmasked=label_states_unmasked, - label_states_unmasked_spatial_dim=align_targets_spatial_dim, + label_states=label_decoder_outputs[0], + label_states_spatial_dim=label_decoder_outputs[1], + non_blank_mask=non_blank_mask, + non_blank_mask_dim=align_targets_spatial_dim, + non_blank_targets_spatial_dim=non_blank_targets_spatial_dim, emit_ground_truth=emit_ground_truth, emit_blank_target_dim=emit_blank_target_dim, batch_dims=batch_dims, @@ -270,8 +374,8 @@ def full_sum_training( model: SegmentalAttentionModel, data: rf.Tensor, data_spatial_dim: Dim, - align_targets: rf.Tensor, - align_targets_spatial_dim: Dim + non_blank_targets: rf.Tensor, + non_blank_targets_spatial_dim: Dim ): assert model.use_joint_model assert isinstance(model.label_decoder, SegmentalAttEfficientLabelDecoder) @@ -279,28 +383,18 @@ def full_sum_training( from returnn.config import get_global_config - # torch.cuda.memory._record_memory_history(enabled=True) - config = get_global_config() # noqa aux_loss_layers = config.typed_value("aux_loss_layers") aux_loss_scales = config.typed_value("aux_loss_scales", ([1.0] * len(aux_loss_layers)) if aux_loss_layers else None) + full_sum_training_beam_size = config.int("full_sum_training_beam_size", None) + if data.feature_dim and data.feature_dim.dimension == 1: data = rf.squeeze(data, axis=data.feature_dim) assert not data.feature_dim # raw audio batch_dims = data.remaining_dims(data_spatial_dim) - # set blank indices in alignment to 0 (= EOS index of imported global att model which is not used otherwise) - align_targets.raw_tensor[align_targets.raw_tensor == model.target_dim.dimension] = 0 - align_targets.sparse_dim = model.target_dim - - # get non-blank targets ([B,S]) - non_blank_mask = utils.get_non_blank_mask(align_targets, model.blank_idx) - non_blank_targets, non_blank_targets_spatial_dim = utils.get_masked( - align_targets, non_blank_mask, align_targets_spatial_dim, batch_dims - ) - # ------------------- encoder aux loss 
------------------- collected_outputs = {} @@ -323,16 +417,15 @@ def full_sum_training( aux_loss.mark_as_loss( f"ctc_{layer_idx}", scale=aux_loss_scales[i], - custom_inv_norm_factor=align_targets_spatial_dim.get_size_tensor(), + custom_inv_norm_factor=enc_spatial_dim.get_size_tensor(), use_normalized_loss=True, ) # for every frame position, get the corresponding window around it ([B,T,W]) # TODO: use rf.window() instead segment_starts, segment_lens = utils.get_segment_starts_and_lens( - utils.get_non_blank_mask(align_targets, blank_idx=-1), # this way, every frame is interpreted as non-blank - align_targets, - align_targets_spatial_dim, + rf.sequence_mask(batch_dims + [enc_spatial_dim]), # this way, every frame is interpreted as non-blank + enc_spatial_dim, model, batch_dims, enc_spatial_dim @@ -340,16 +433,29 @@ def full_sum_training( # ------------------- joint loop ------------------- - label_model_full_sum_training( - model=model.label_decoder, - enc_args=enc_args, - enc_spatial_dim=enc_spatial_dim, - non_blank_targets=non_blank_targets, - non_blank_targets_spatial_dim=non_blank_targets_spatial_dim, - segment_starts=segment_starts, - segment_lens=segment_lens, - batch_dims=batch_dims, - ) + if full_sum_training_beam_size: + label_model_full_sum_training_w_beam_eff_w_recomb( + model=model.label_decoder, + enc_args=enc_args, + enc_spatial_dim=enc_spatial_dim, + non_blank_targets=non_blank_targets, + non_blank_targets_spatial_dim=non_blank_targets_spatial_dim, + segment_starts=segment_starts, + segment_lens=segment_lens, + batch_dims=batch_dims, + beam_size=full_sum_training_beam_size + ) + else: + label_model_full_sum_training( + model=model.label_decoder, + enc_args=enc_args, + enc_spatial_dim=enc_spatial_dim, + non_blank_targets=non_blank_targets, + non_blank_targets_spatial_dim=non_blank_targets_spatial_dim, + segment_starts=segment_starts, + segment_lens=segment_lens, + batch_dims=batch_dims, + ) full_sum_training: TrainDef[SegmentalAttentionModel] diff --git 
a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/utils.py b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/utils.py index 18074b34b..73da27aa7 100644 --- a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/utils.py +++ b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/utils.py @@ -34,11 +34,12 @@ def get_masked( result = rf.scatter( input, indices=idxs, indices_dim=mask_dim, out_dim=result_spatial_dim_temp) # remove accumulated blanks at the last position - result = result.copy_transpose([result_spatial_dim_temp] + batch_dims) + rem_dims = result.remaining_dims([result_spatial_dim_temp] + batch_dims) + result = result.copy_transpose([result_spatial_dim_temp] + batch_dims + rem_dims) result_raw_tensor = result.raw_tensor result = result.copy_template_replace_dim_tag(0, result_spatial_dim) result.raw_tensor = result_raw_tensor[:-1] - result = result.copy_transpose(batch_dims + [result_spatial_dim]) + result = result.copy_transpose(batch_dims + [result_spatial_dim] + rem_dims) return result, result_spatial_dim @@ -67,7 +68,6 @@ def get_unmasked( def get_segment_starts_and_lens( non_blank_mask: Tensor, - align_targets: Tensor, align_targets_spatial_dim: Dim, model: SegmentalAttentionModel, batch_dims: Sequence[Dim], diff --git a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/center_window_att/baseline_v3/__init__.py b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/center_window_att/baseline_v3/__init__.py index 21f3ce8fc..8be9edbfb 100644 --- 
a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/center_window_att/baseline_v3/__init__.py +++ b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/center_window_att/baseline_v3/__init__.py @@ -10,9 +10,8 @@ def run_exps(): - # baseline model for checking consistency of train and recog implementations for model_alias, config_builder in baseline.center_window_att_baseline_rf( - win_size_list=(5,), + win_size_list=(5,), blank_decoder_version=4, ): for train_alias, checkpoint in train.train_center_window_att_viterbi_import_global_tf( alias=model_alias, @@ -26,14 +25,62 @@ def run_exps(): checkpoint=checkpoint, ) + # for model_alias, config_builder in baseline.center_window_att_baseline_rf( + # win_size_list=(5,), blank_decoder_version=5, + # ): + # for train_alias, checkpoint in train.train_center_window_att_viterbi_import_global_tf( + # alias=model_alias, + # config_builder=config_builder, + # n_epochs_list=(10,), + # const_lr_list=(1e-4,), + # ): + # pass + # recog.center_window_returnn_frame_wise_beam_search( + # alias=train_alias, + # config_builder=config_builder, + # checkpoint=checkpoint, + # ) + + # for model_alias, config_builder in baseline.center_window_att_baseline_rf( + # win_size_list=(5,), blank_decoder_version=6, + # ): + # for train_alias, checkpoint in train.train_center_window_att_viterbi_import_global_tf( + # alias=model_alias, + # config_builder=config_builder, + # n_epochs_list=(10,), + # const_lr_list=(1e-4,), + # ): + # pass + # recog.center_window_returnn_frame_wise_beam_search( + # alias=train_alias, + # config_builder=config_builder, + # checkpoint=checkpoint, + # ) + + # for model_alias, config_builder in baseline.center_window_att_baseline_rf( + # win_size_list=(5,) + # ): + # for max_shift, num_iterations in [(1, 1), (2, 1), (1, 2)]: + # for train_alias, checkpoint in train.train_center_window_att_viterbi_import_global_tf( + # 
alias=model_alias, + # config_builder=config_builder, + # n_epochs_list=(100,), + # const_lr_list=(1e-4,), + # alignment_augmentation_opts={"max_shift": max_shift, "num_iterations": num_iterations}, + # ): + # recog.center_window_returnn_frame_wise_beam_search( + # alias=train_alias, + # config_builder=config_builder, + # checkpoint=checkpoint, + # ) + for model_alias, config_builder in baseline.center_window_att_baseline_rf( - win_size_list=(5, 129), + win_size_list=(5,), ): - for train_alias, checkpoint in train.train_center_window_att_viterbi_import_global_tf( + for train_alias, checkpoint in train.train_center_window_att_viterbi_from_scratch( alias=model_alias, config_builder=config_builder, - n_epochs_list=(100,), - const_lr_list=(1e-4,), + n_epochs_list=(500,), ): recog.center_window_returnn_frame_wise_beam_search( alias=train_alias, @@ -41,8 +88,24 @@ def run_exps(): checkpoint=checkpoint, ) + # for model_alias, config_builder in baseline.center_window_att_baseline_rf( + # win_size_list=(5,), + # ): + # for train_alias, checkpoint in train.train_center_window_att_viterbi_from_scratch( + # alias=model_alias, + # config_builder=config_builder, + # n_epochs_list=(500,), + # use_speed_pert=True, + # ): + # recog.center_window_returnn_frame_wise_beam_search( + # alias=train_alias, + # config_builder=config_builder, + # checkpoint=checkpoint, + # ) + + # ------------------------------------- best models: KEEP! 
------------------------------------- for model_alias, config_builder in baseline.center_window_att_baseline_rf( - win_size_list=(5, 129), use_att_ctx_in_state=False, use_weight_feedback=False, + win_size_list=(5, 129), ): for train_alias, checkpoint in train.train_center_window_att_viterbi_import_global_tf( alias=model_alias, @@ -57,32 +120,19 @@ def run_exps(): ) for model_alias, config_builder in baseline.center_window_att_baseline_rf( - win_size_list=(5,) + win_size_list=(5,), ): - for max_shift, num_iterations in [(1, 1), (2, 1), (1, 2)]: - for train_alias, checkpoint in train.train_center_window_att_viterbi_import_global_tf( - alias=model_alias, - config_builder=config_builder, - n_epochs_list=(100,), - const_lr_list=(1e-4,), - alignment_augmentation_opts={"max_shift": max_shift, "num_iterations": num_iterations}, - ): + for train_alias, checkpoint in train.train_center_window_att_viterbi_import_global_tf( + alias=model_alias, + config_builder=config_builder, + n_epochs_list=(100,), + const_lr_list=(1e-4,), + ): + for use_recombination in {"max", "sum"}: recog.center_window_returnn_frame_wise_beam_search( alias=train_alias, config_builder=config_builder, checkpoint=checkpoint, + checkpoint_aliases=("best-4-avg",), + use_recombination=use_recombination, ) - - for model_alias, config_builder in baseline.center_window_att_baseline_rf( - win_size_list=(5,), - ): - for train_alias, checkpoint in train.train_center_window_att_viterbi_from_scratch( - alias=model_alias, - config_builder=config_builder, - n_epochs_list=(500,), - ): - recog.center_window_returnn_frame_wise_beam_search( - alias=train_alias, - config_builder=config_builder, - checkpoint=checkpoint, - ) diff --git a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/center_window_att/baseline_v3/baseline.py b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/center_window_att/baseline_v3/baseline.py index 
d6fff6356..8068a8061 100644 --- a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/center_window_att/baseline_v3/baseline.py +++ b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/center_window_att/baseline_v3/baseline.py @@ -8,12 +8,13 @@ def center_window_att_baseline_rf( win_size_list: Tuple[int, ...] = (5, 129), use_att_ctx_in_state: bool = True, use_weight_feedback: bool = True, + blank_decoder_version: int = 3, ): for win_size in win_size_list: alias, config_builder = get_center_window_att_config_builder_rf( win_size=win_size, use_att_ctx_in_state=use_att_ctx_in_state, - blank_decoder_version=3, + blank_decoder_version=blank_decoder_version, use_joint_model=False, use_weight_feedback=use_weight_feedback, ) diff --git a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/center_window_att/baseline_v4/__init__.py b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/center_window_att/baseline_v4/__init__.py index d9f80c03e..478e6957d 100644 --- a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/center_window_att/baseline_v4/__init__.py +++ b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/center_window_att/baseline_v4/__init__.py @@ -13,7 +13,7 @@ def run_exps(): for train_alias, checkpoint in train.train_center_window_att_viterbi_import_global_tf( alias=model_alias, config_builder=config_builder, - n_epochs_list=(100,), + n_epochs_list=(200, 300), ): recog.center_window_returnn_frame_wise_beam_search( alias=train_alias, @@ -27,10 +27,79 @@ def run_exps(): for train_alias, checkpoint in train.train_center_window_att_viterbi_import_global_tf( alias=model_alias, config_builder=config_builder, - n_epochs_list=(100,), + n_epochs_list=(200, 300), ): 
recog.center_window_returnn_frame_wise_beam_search( alias=train_alias, config_builder=config_builder, checkpoint=checkpoint, ) + + for model_alias, config_builder in baseline.center_window_att_baseline_rf( + win_size_list=(5,), + label_decoder_state="nb-lstm", + use_att_ctx_in_state=False, + use_weight_feedback=False, + bpe_vocab_size=1056, + ): + for train_alias, checkpoint in train.train_center_window_att_full_sum_from_scratch( + alias=model_alias, + config_builder=config_builder, + n_epochs_list=(125,), + use_speed_pert=True, + batch_size=3_000, + time_rqmt=80, + use_mgpu=False, + ): + pass + # recog.center_window_returnn_frame_wise_beam_search( + # alias=train_alias, + # config_builder=config_builder, + # checkpoint=checkpoint, + # ) + + for model_alias, config_builder in baseline.center_window_att_baseline_rf( + win_size_list=(5,), + label_decoder_state="nb-lstm", + use_att_ctx_in_state=False, + use_weight_feedback=False, + bpe_vocab_size=1056, + ): + for train_alias, checkpoint in train.train_center_window_att_full_sum_from_scratch( + alias=model_alias, + config_builder=config_builder, + n_epochs_list=(125,), + use_speed_pert=True, + batch_size=3_000, + time_rqmt=1, + use_mgpu=True, + ): + pass + # recog.center_window_returnn_frame_wise_beam_search( + # alias=train_alias, + # config_builder=config_builder, + # checkpoint=checkpoint, + # ) + + for model_alias, config_builder in baseline.center_window_att_baseline_rf( + win_size_list=(1,), + label_decoder_state="nb-lstm", + use_att_ctx_in_state=False, + use_weight_feedback=False, + ): + for train_alias, checkpoint in train.train_center_window_att_full_sum_from_scratch( + alias=model_alias, + config_builder=config_builder, + n_epochs_list=(125,), + use_speed_pert=True, + batch_size=3_000, + time_rqmt=80, + use_mgpu=False, + beam_size=100 + ): + pass + # recog.center_window_returnn_frame_wise_beam_search( + # alias=train_alias, + # config_builder=config_builder, + # checkpoint=checkpoint, + # ) diff --git 
a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/center_window_att/baseline_v4/baseline.py b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/center_window_att/baseline_v4/baseline.py index 8e5566d4f..5cef2bc0b 100644 --- a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/center_window_att/baseline_v4/baseline.py +++ b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/center_window_att/baseline_v4/baseline.py @@ -9,6 +9,7 @@ def center_window_att_baseline_rf( use_att_ctx_in_state: bool = True, label_decoder_state: str = "nb-lstm", use_weight_feedback: bool = True, + bpe_vocab_size: int = 10025, ): for win_size in win_size_list: alias, config_builder = get_center_window_att_config_builder_rf( @@ -18,6 +19,7 @@ def center_window_att_baseline_rf( use_joint_model=True, label_decoder_state=label_decoder_state, use_weight_feedback=use_weight_feedback, + bpe_vocab_size=bpe_vocab_size, ) alias = f"{base_alias}/baseline_rf/{alias}" yield alias, config_builder diff --git a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/center_window_att/config_builder.py b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/center_window_att/config_builder.py index a4f9b7d68..d43cc8813 100644 --- a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/center_window_att/config_builder.py +++ b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/center_window_att/config_builder.py @@ -2,6 +2,7 @@ from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23.dependencies.labels.v2.librispeech.label_singletons import ( LibrispeechBPE10025_CTC_ALIGNMENT, + 
LibrispeechBPE1056_ALIGNMENT, LIBRISPEECH_CORPUS ) from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23.dependencies.general.returnn.exes import RETURNN_EXE_NEW, RETURNN_CURRENT_ROOT @@ -17,9 +18,11 @@ def get_center_window_att_config_builder_rf( use_joint_model: bool, use_weight_feedback: bool = True, label_decoder_state: str = "nb-lstm", + bpe_vocab_size: int = 10025, ) -> Tuple[str, SegmentalAttConfigBuilderRF]: + assert bpe_vocab_size in {10025, 1056} variant_params = { - "dependencies": LibrispeechBPE10025_CTC_ALIGNMENT, + "dependencies": LibrispeechBPE10025_CTC_ALIGNMENT if bpe_vocab_size == 10025 else LibrispeechBPE1056_ALIGNMENT, "dataset": { "feature_type": "raw", "corpus": LIBRISPEECH_CORPUS @@ -45,9 +48,11 @@ def get_center_window_att_config_builder_rf( ) alias = ( + f"bpe-size-{bpe_vocab_size}/" f"win-size-{win_size}/" - f"{'w' if use_weight_feedback else 'wo'}-weight-feedback/" - f"{'w' if use_att_ctx_in_state else 'wo'}-att-ctx-in-state/" + f"{'w' if use_weight_feedback else 'wo'}-wf_" + f"{'w' if use_att_ctx_in_state else 'wo'}-ctx-in-s/" + f"bd-{blank_decoder_version}/" f"{label_decoder_state}" ) diff --git a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/center_window_att/recog.py b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/center_window_att/recog.py index 3d7c75192..dc0ffa994 100644 --- a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/center_window_att/recog.py +++ b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/center_window_att/recog.py @@ -22,7 +22,8 @@ def center_window_returnn_frame_wise_beam_search( checkpoint_aliases: Tuple[str, ...] 
= ("last", "best", "best-4-avg"), run_analysis: bool = False, att_weight_seq_tags: Optional[List] = None, - pure_torch: bool = False + pure_torch: bool = False, + use_recombination: Optional[str] = None, ): ilm_opts = {"type": ilm_type} if ilm_type == "mini_att": @@ -46,6 +47,7 @@ def center_window_returnn_frame_wise_beam_search( "recog_def": model_recog_pure_torch if pure_torch else model_recog, "forward_step_func": _returnn_v2_forward_step, "forward_callback": _returnn_v2_get_forward_callback, + "use_recombination": use_recombination, }, search_alias=f'returnn_decoding{"_pure_torch" if pure_torch else ""}' ).run() diff --git a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/center_window_att/train.py b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/center_window_att/train.py index 468a525d2..61e373208 100644 --- a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/center_window_att/train.py +++ b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/center_window_att/train.py @@ -1,8 +1,15 @@ +import copy from typing import Tuple, Optional, List, Dict, Union, Callable from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.config_builder_rf.base import SegmentalAttConfigBuilderRF from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23.train_new import SegmentalTrainExperiment -from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.network_builder_rf.segmental.train import _returnn_v2_train_step, viterbi_training +from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.network_builder_rf.segmental.train import ( + _returnn_v2_train_step, + 
_returnn_v2_full_sum_train_step, + viterbi_training, + full_sum_training, +) +from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.network_builder_rf.segmental.model import _returnn_v2_get_model_for_full_sum_training from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.pipelines.pipeline_ls_conf.checkpoints import ( external_checkpoints, default_import_model_name, @@ -15,54 +22,157 @@ def train_center_window_att_viterbi_from_scratch( config_builder: SegmentalAttConfigBuilderRF, n_epochs_list: Tuple[int, ...], time_rqmt: int = 80, + use_speed_pert: bool = False, + batch_size: int = 15_000, + use_mgpu: bool = True, ): - batch_size = 15_000 for n_epochs in n_epochs_list: - alias += "/train_from_scratch/%d-epochs_w-ctc-loss" % (n_epochs,) + alias += f"/viterbi-train_from_scratch/{n_epochs}-epochs_bs-{batch_size}_w-ctc-loss_{'w' if use_speed_pert else 'wo'}-speed-pert" + + train_opts = { + "dataset_opts": { + "use_speed_pert": use_speed_pert, + "epoch_wise_filter": {(1, 5): {"max_mean_len": 1000}} + }, + # "import_model_train_epoch1": None, + "accum_grad_multiple_step": 4, + "pos_emb_dropout": 0.1, + "rf_att_dropout_broadcast": False, + "batch_size": batch_size, + "batching": "laplace:.1000", + "lr_opts": { + "type": "dyn_lr_piecewise_linear", + "batch_size": batch_size, + "num_epochs": n_epochs, + "learning_rate": 1e-3, + }, + "aux_loss_layers": None, + "specaugment_steps": (5_000, 15_000, 25_000), + "grad_scaler": None, + "gradient_clip_global_norm": 5.0, + "optimizer": { + "class": "adamw", + "weight_decay_modules_blacklist": [ + "rf.Embedding", + "rf.LearnedRelativePositionalEncoding", + ], + "epsilon": 1e-16, + "weight_decay": 1e-6, + }, + "train_def": viterbi_training, + "train_step_func": _returnn_v2_train_step, + } + + if use_speed_pert: + train_opts["preload_from_files"] = { + "pretrained_ctc_weights": { + "filename": 
external_checkpoints[default_import_model_name + "_w_ctc"], + "init_for_train": True, + "ignore_missing": False, + } + } + + train_rqmt = { + "time": time_rqmt, + } + if use_mgpu: + train_rqmt.update({ + "horovod_num_processes": 4, + "distributed_launch_cmd": "torchrun" + }) + train_opts["torch_distributed"] = {} train_exp = SegmentalTrainExperiment( config_builder=config_builder, alias=alias, num_epochs=n_epochs, - train_rqmt={ - "time": time_rqmt, - "horovod_num_processes": 4, - "distributed_launch_cmd": "torchrun" + train_rqmt=train_rqmt, + train_opts=train_opts + ) + checkpoints, model_dir, learning_rates = train_exp.run_train() + + checkpoint = { + "model_dir": model_dir, + "learning_rates": learning_rates, + "key": "dev_loss_non_blank_ce", + "checkpoints": checkpoints, + "n_epochs": n_epochs + } + yield alias, checkpoint + + +def train_center_window_att_full_sum_from_scratch( + alias: str, + config_builder: SegmentalAttConfigBuilderRF, + n_epochs_list: Tuple[int, ...], + time_rqmt: int = 80, + use_speed_pert: bool = False, + batch_size: int = 15_000, + use_mgpu: bool = True, + beam_size: Optional[int] = None, +): + # TODO: do this in a nicer way + config_builder = copy.deepcopy(config_builder) + config_builder.get_model_func = _returnn_v2_get_model_for_full_sum_training + for n_epochs in n_epochs_list: + alias += f"/full-sum-train_from_scratch/{n_epochs}-epochs_bs-{batch_size}_w-ctc-loss_{'w' if use_speed_pert else 'wo'}-speed-pert" + + train_opts = { + "dataset_opts": { + "use_speed_pert": use_speed_pert, + "epoch_wise_filter": {(1, 5): {"max_mean_len": 1000}}, + "hdf_targets": {}, # do not use alignment for full sum training + "seq_postfix": None, }, - train_opts={ - "dataset_opts": { - "use_speed_pert": False, - "epoch_wise_filter": {(1, 5): {"max_mean_len": 1000}} - }, - # "import_model_train_epoch1": None, - "accum_grad_multiple_step": 4, - "torch_distributed": {}, - "pos_emb_dropout": 0.1, - "rf_att_dropout_broadcast": False, + # 
"import_model_train_epoch1": None, + "accum_grad_multiple_step": 4, + "pos_emb_dropout": 0.1, + "rf_att_dropout_broadcast": False, + "batch_size": batch_size, + "batching": "laplace:.1000", + "lr_opts": { + "type": "dyn_lr_piecewise_linear", "batch_size": batch_size, - "batching": "laplace:.1000", - "lr_opts": { - "type": "dyn_lr_piecewise_linear", - "batch_size": batch_size, - "num_epochs": n_epochs, - "learning_rate": 1e-3, - }, - "aux_loss_layers": None, - "specaugment_steps": (5_000, 15_000, 25_000), - "grad_scaler": None, - "gradient_clip_global_norm": 5.0, - "optimizer": { - "class": "adamw", - "weight_decay_modules_blacklist": [ - "rf.Embedding", - "rf.LearnedRelativePositionalEncoding", - ], - "epsilon": 1e-16, - "weight_decay": 1e-6, - }, - "train_def": viterbi_training, - "train_step_func": _returnn_v2_train_step, - } + "num_epochs": n_epochs, + "learning_rate": 1e-3, + }, + "aux_loss_layers": None, + "specaugment_steps": (5_000, 15_000, 25_000), + "grad_scaler": None, + "gradient_clip_global_norm": 5.0, + "optimizer": { + "class": "adamw", + "weight_decay_modules_blacklist": [ + "rf.Embedding", + "rf.LearnedRelativePositionalEncoding", + ], + "epsilon": 1e-16, + "weight_decay": 1e-6, + }, + # "max_seq_length": {"targets": 75}, + "train_def": full_sum_training, + "train_step_func": _returnn_v2_full_sum_train_step, + } + + if beam_size is not None: + train_opts["full_sum_training_beam_size"] = beam_size + + train_rqmt = { + "time": time_rqmt, + } + if use_mgpu: + train_rqmt.update({ + "horovod_num_processes": 4, + "distributed_launch_cmd": "torchrun" + }) + train_opts["torch_distributed"] = {} + + train_exp = SegmentalTrainExperiment( + config_builder=config_builder, + alias=alias, + num_epochs=n_epochs, + train_rqmt=train_rqmt, + train_opts=train_opts ) checkpoints, model_dir, learning_rates = train_exp.run_train() @@ -83,6 +193,7 @@ def train_center_window_att_viterbi_import_global_tf( const_lr_list: Tuple[float, ...] 
= (1e-4,), time_rqmt: int = 80, alignment_augmentation_opts: Optional[Dict] = None, + import_model_name: str = default_import_model_name, ): if not config_builder.use_att_ctx_in_state: # only randomly init FF weights, since only the input dim of the lstm layer is different @@ -92,7 +203,7 @@ def train_center_window_att_viterbi_import_global_tf( for n_epochs in n_epochs_list: for const_lr in const_lr_list: - train_alias = alias + f"/train_from_global_att_tf_checkpoint/standard-training/{n_epochs}-epochs_{const_lr}-const-lr_wo-ctc-loss" + train_alias = alias + f"/train_from_{import_model_name}/standard-training/{n_epochs}-epochs_{const_lr}-const-lr_wo-ctc-loss" if alignment_augmentation_opts: opts = alignment_augmentation_opts train_alias += f"_align-aug-{opts['num_iterations']}-iters_{opts['max_shift']}-max-shift" @@ -100,7 +211,7 @@ def train_center_window_att_viterbi_import_global_tf( train_opts = { "preload_from_files": { "pretrained_global_att_params": { - "filename": external_checkpoints[default_import_model_name], + "filename": external_checkpoints[import_model_name], "init_for_train": True, "ignore_missing": True, # because of length model params } diff --git a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/checkpoints.py b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/checkpoints.py index 2b3dc216b..56f161b86 100644 --- a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/checkpoints.py +++ b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/checkpoints.py @@ -16,19 +16,30 @@ from sisyphus import tk +external_checkpoints = {} -global_att_checkpoint = ConvertTfCheckpointToRfPtJob( +for checkpoint_name, checkpoint in external_checkpoints_tf.items(): + global_att_checkpoint = ConvertTfCheckpointToRfPtJob( + checkpoint=checkpoint, + 
make_model_func=MakeModelGlobal( + in_dim=80, + target_dim=10025, + ), + map_func=map_param_func_v2_global + ).out_checkpoint + + external_checkpoints[checkpoint_name] = PtCheckpoint(global_att_checkpoint) + +global_att_checkpoint_w_ctc = ConvertTfCheckpointToRfPtJob( checkpoint=external_checkpoints_tf[default_import_model_name], make_model_func=MakeModelGlobal( in_dim=80, target_dim=10025, + enc_aux_logits=(11,) ), map_func=map_param_func_v2_global ).out_checkpoint - -external_checkpoints = { - default_import_model_name: PtCheckpoint(global_att_checkpoint) -} +external_checkpoints[default_import_model_name + "_w_ctc"] = PtCheckpoint(global_att_checkpoint_w_ctc) def get_center_window_baseline_v1_tf_checkpoint(): diff --git a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/global_att/baseline_v1/__init__.py b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/global_att/baseline_v1/__init__.py index 31c6c5bab..c6c6301de 100644 --- a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/global_att/baseline_v1/__init__.py +++ b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/global_att/baseline_v1/__init__.py @@ -11,7 +11,8 @@ def run_exps(): for use_weight_feedback in (True,): for model_alias, config_builder in baseline.global_att_baseline_rf(use_weight_feedback=use_weight_feedback): for train_alias, checkpoint in ( - (f"{model_alias}/import-global-tf_no-finetuning", external_checkpoints[default_import_model_name]), + (f"{model_alias}/import_{default_import_model_name}", external_checkpoints[default_import_model_name]), + (f"{model_alias}/import_glob.conformer.mohammad.5.4", external_checkpoints["glob.conformer.mohammad.5.4"]), ): recog.global_att_returnn_label_sync_beam_search( alias=train_alias, @@ -31,37 +32,3 @@ def run_exps(): config_builder=config_builder, 
checkpoint=checkpoint, ) - - for use_weight_feedback in (True,): - for model_alias, config_builder in baseline.global_att_baseline_rf( - use_weight_feedback=use_weight_feedback, - use_att_ctx_in_state=False, - ): - for train_alias, checkpoint in train.train_import_global_tf( - alias=model_alias, - config_builder=config_builder, - n_epochs_list=(100,), - const_lr_list=(1e-4,), - ): - recog.global_att_returnn_label_sync_beam_search( - alias=train_alias, - config_builder=config_builder, - checkpoint=checkpoint, - ) - - for use_weight_feedback in (False,): - for model_alias, config_builder in baseline.global_att_baseline_rf( - use_weight_feedback=use_weight_feedback, - use_att_ctx_in_state=False, - ): - for train_alias, checkpoint in train.train_import_global_tf( - alias=model_alias, - config_builder=config_builder, - n_epochs_list=(300,), - const_lr_list=(1e-4,), - ): - recog.global_att_returnn_label_sync_beam_search( - alias=train_alias, - config_builder=config_builder, - checkpoint=checkpoint, - ) diff --git a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/global_att/train.py b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/global_att/train.py index 6d02cab56..c4c3949b3 100644 --- a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/global_att/train.py +++ b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/global_att/train.py @@ -58,6 +58,7 @@ def train_import_global_tf( n_epochs_list: Tuple[int, ...], const_lr_list: Tuple[float, ...], time_rqmt: int = 80, + import_model_name: str = default_import_model_name, ): if not config_builder.use_att_ctx_in_state: # only randomly init FF weights, since only the input dim of the lstm layer is different @@ -66,12 +67,12 @@ def train_import_global_tf( custom_missing_load_func = None for n_epochs, const_lr in 
itertools.product(n_epochs_list, const_lr_list): - train_alias = alias + f"/train_from_global_att_tf_checkpoint/standard-training/{n_epochs}-epochs_{const_lr}-const-lr_wo-ctc-loss" + train_alias = alias + f"/train_from_{import_model_name}/standard-training/{n_epochs}-epochs_{const_lr}-const-lr_wo-ctc-loss" train_opts = { "preload_from_files": { "pretrained_global_att_params": { - "filename": external_checkpoints[default_import_model_name], + "filename": external_checkpoints[import_model_name], "init_for_train": True, } }, diff --git a/users/schmitt/returnn_frontend/model_interfaces/training.py b/users/schmitt/returnn_frontend/model_interfaces/training.py index 6edef2ad1..d834db360 100644 --- a/users/schmitt/returnn_frontend/model_interfaces/training.py +++ b/users/schmitt/returnn_frontend/model_interfaces/training.py @@ -48,3 +48,22 @@ def __call__( raise NotImplementedError learning_rate_control_error_measure: Optional[str] = None + + +class FullSumTrainDef(Protocol[ModelT]): + """ + Defines the losses (mark_as_loss). 
+ """ + + def __call__( + self, + *, + model: ModelT, + data: Tensor, + data_spatial_dim: Dim, + non_blank_targets: Tensor, + non_blank_targets_spatial_dim: Dim, + ): + raise NotImplementedError + + learning_rate_control_error_measure: Optional[str] = None From b76c488cd3c6698361f34367af8092a43c0f8f2b Mon Sep 17 00:00:00 2001 From: Mohammad Zeineldeen Date: Thu, 6 Jun 2024 12:18:11 +0200 Subject: [PATCH 103/227] update --- .../tedlium2/configs/ted2_att_baseline.py | 197 +++++++----------- 1 file changed, 75 insertions(+), 122 deletions(-) diff --git a/users/zeineldeen/experiments/conformer_att_2023/tedlium2/configs/ted2_att_baseline.py b/users/zeineldeen/experiments/conformer_att_2023/tedlium2/configs/ted2_att_baseline.py index 3c9f21f2c..76080e7fd 100644 --- a/users/zeineldeen/experiments/conformer_att_2023/tedlium2/configs/ted2_att_baseline.py +++ b/users/zeineldeen/experiments/conformer_att_2023/tedlium2/configs/ted2_att_baseline.py @@ -1161,68 +1161,19 @@ def get_base_v1_args(lr, ep, enc_drop=0.1, pretrain_reps=3, use_legacy_stats=Tru recog_datasets_tuples = get_test_dataset_tuples(bpe_size=BPE_1K) - # baseline: 7.41/6.85 - # dev_coverage0.03_0.11_max/wer 7.34 - # test_coverage0.03_0.11_max/wer 6.85 - # for test_set in ["test"]: - # for cov_update in ["max"]: - # for cov_scale in [0.03, 0.04]: - # for cov_thre in [0.11, 0.13]: - # search_args = copy.deepcopy(args) - # search_args["decoder_args"].coverage_scale = cov_scale - # search_args["decoder_args"].coverage_threshold = cov_thre - # name_ = f"/average_4/{test_set}_coverage{cov_scale}_{cov_thre}" - # if cov_update == "max": - # name_ += "_max" - # search_args["decoder_args"].coverage_update = "max" - # run_single_search( - # exp_name=name + name_, - # train_data=train_data, - # search_args=search_args, - # checkpoint=train_job_avg_ckpt[name], - # feature_extraction_net=log10_net_10ms, - # recog_dataset=recog_datasets_tuples[test_set][0], - # recog_ref=recog_datasets_tuples[test_set][1], - # 
recog_bliss=recog_datasets_tuples[test_set][2], - # ) - - # # TODO: only CTC - # only_ctc_args = copy.deepcopy(args) - # only_ctc_args["decoder_args"].ce_loss_scale = 0.0 - # _, train_data = run_exp( - # name + "_onlyCTC", - # only_ctc_args, - # num_epochs=ep, - # epoch_wise_filter=None, - # bpe_size=BPE_1K, - # partition_epoch=4, - # search_args={"ctc_decode": True, "ctc_blank_idx": 1057, **only_ctc_args}, - # avg_key="dev_score_ctc", - # ) - # - # # TODO: scale CTC - # scale_ctc_args = copy.deepcopy(args) - # scale_ctc_args["encoder_args"].ctc_loss_scale = 0.3 / 0.7 # AED scale is 1.0 - # _, train_data = run_exp( - # name + "_ctcScale0.3", - # scale_ctc_args, - # num_epochs=ep, - # epoch_wise_filter=None, - # bpe_size=BPE_1K, - # partition_epoch=4, - # ) - # - # # TODO: grad clip 5 - # grad_clip_args = copy.deepcopy(args) - # grad_clip_args["gradient_clip_global_norm"] = 5 - # _, train_data = run_exp( - # name + "_gradClipNorm5", - # grad_clip_args, - # num_epochs=ep, - # epoch_wise_filter=None, - # bpe_size=BPE_1K, - # partition_epoch=4, - # ) + # TODO: only CTC + only_ctc_args = copy.deepcopy(args) + only_ctc_args["decoder_args"].ce_loss_scale = 0.0 + _, train_data = run_exp( + name + "_onlyCTC", + only_ctc_args, + num_epochs=ep, + epoch_wise_filter=None, + bpe_size=BPE_1K, + partition_epoch=4, + search_args={"ctc_decode": True, "ctc_blank_idx": 1057, **only_ctc_args}, + avg_key="dev_score_ctc", + ) # base_bpe1000_peakLR0.0008_ep400_globalNorm_epochOCLR_pre3_fixZoneout_encDrop0.15_woDepthConvPre_weightDrop0.1_decAttDrop0.0_embedDim256_numBlocks12 # 7.4 6.85 avg @@ -1270,6 +1221,7 @@ def get_base_v1_args(lr, ep, enc_drop=0.1, pretrain_reps=3, use_legacy_stats=Tru if ctc_scale == 0.3: args_ = copy.deepcopy(args) args_["with_pretrain"] = False + reset_params_init(args_["encoder_args"]) specaug_steps = {"step0": 10_000, "step1": 15_000, "step2": 20_000} args_["specaug_str_func_opts"] = { "version": 2, @@ -1280,7 +1232,7 @@ def get_base_v1_args(lr, ep, enc_drop=0.1, 
pretrain_reps=3, use_legacy_stats=Tru "freq_dim_factor": 5, } run_exp( - exp_name + "_woPretrain", + exp_name + "_woPretrain_defaultInit", args_, num_epochs=ep, epoch_wise_filter=[(1, 2, 400), (3, 4, 800)], @@ -1289,75 +1241,76 @@ def get_base_v1_args(lr, ep, enc_drop=0.1, pretrain_reps=3, use_legacy_stats=Tru ) args_["encoder_args"].with_ctc = False run_exp( - exp_name + "_woPretrain_noCTC", + exp_name + "_woPretrain_noCTC_defaultInit", args_, num_epochs=ep, epoch_wise_filter=[(1, 2, 400), (3, 4, 800)], bpe_size=BPE_1K, partition_epoch=4, + avg_key="dev_score", ) # TODO: multi-gpu # base_bpe1000_peakLR0.0016_ep200_globalNorm_epochOCLR_pre3_fixZoneout_encDrop0.15_woDepthConvPre_weightDrop0.1_decAttDrop0.0_embedDim256_numBlocks12_gradClipNorm5.0_paramSync_step100_accum1_ctcScale0.3_gpu4 # 8.66 7.97 avg - - for num_blocks in [12]: - for ep in [50 * 4]: - for lr in [8e-4, 13e-4, 16e-4]: - for target_embed_dim in [256]: - for att_drop in [0.0]: - for weight_drop in [0.1]: - for enc_drop in [0.15]: - for ctc_scale in [0.3]: - for sync_step in [100]: - for grad_clip in [None, 1.0, 5.0]: - if grad_clip is None and lr != 8e-4: - continue - - base_v1_args, exp_name = get_base_v1_args( - lr, ep, enc_drop=enc_drop, use_legacy_stats=False - ) - - args = copy.deepcopy(base_v1_args) - args["encoder_args"].num_blocks = num_blocks - args["encoder_args"].mhsa_weight_dropout = weight_drop - args["encoder_args"].ff_weight_dropout = weight_drop - args["encoder_args"].conv_weight_dropout = weight_drop - - args["decoder_args"].embed_dim = target_embed_dim - args["decoder_args"].att_dropout = att_drop - - args["horovod_params"] = { - "horovod_reduce_type": "param", - "horovod_param_sync_step": sync_step, - "horovod_dataset_distribution": "random_seed_offset", - } - - args["batch_size"] = 15_000 * 160 - args["pretrain_opts"]["initial_batch_size"] = 15_000 * 160 - args["accum_grad"] = 1 - - exp_name += 
f"_weightDrop{weight_drop}_decAttDrop{att_drop}_embedDim{target_embed_dim}_numBlocks{num_blocks}" - if grad_clip: - args["gradient_clip_global_norm"] = grad_clip - exp_name += f"_gradClipNorm{grad_clip}" - - exp_name += f"_paramSync_step{sync_step}_accum1" - - if ctc_scale != 1.0: - args["encoder_args"].ctc_loss_scale = ctc_scale - args["decoder_args"].ce_loss_scale = 1.0 - ctc_scale - exp_name += f"_ctcScale{ctc_scale}" - - run_exp( - exp_name + "_gpu4", - args, - num_epochs=ep, - epoch_wise_filter=None, - bpe_size=BPE_1K, - partition_epoch=4 * 4, - horovod_num_processes=4, - ) + # + # for num_blocks in [12]: + # for ep in [50 * 4]: + # for lr in [8e-4, 13e-4, 16e-4]: + # for target_embed_dim in [256]: + # for att_drop in [0.0]: + # for weight_drop in [0.1]: + # for enc_drop in [0.15]: + # for ctc_scale in [0.3]: + # for sync_step in [100]: + # for grad_clip in [None, 1.0, 5.0]: + # if grad_clip is None and lr != 8e-4: + # continue + # + # base_v1_args, exp_name = get_base_v1_args( + # lr, ep, enc_drop=enc_drop, use_legacy_stats=False + # ) + # + # args = copy.deepcopy(base_v1_args) + # args["encoder_args"].num_blocks = num_blocks + # args["encoder_args"].mhsa_weight_dropout = weight_drop + # args["encoder_args"].ff_weight_dropout = weight_drop + # args["encoder_args"].conv_weight_dropout = weight_drop + # + # args["decoder_args"].embed_dim = target_embed_dim + # args["decoder_args"].att_dropout = att_drop + # + # args["horovod_params"] = { + # "horovod_reduce_type": "param", + # "horovod_param_sync_step": sync_step, + # "horovod_dataset_distribution": "random_seed_offset", + # } + # + # args["batch_size"] = 15_000 * 160 + # args["pretrain_opts"]["initial_batch_size"] = 15_000 * 160 + # args["accum_grad"] = 1 + # + # exp_name += f"_weightDrop{weight_drop}_decAttDrop{att_drop}_embedDim{target_embed_dim}_numBlocks{num_blocks}" + # if grad_clip: + # args["gradient_clip_global_norm"] = grad_clip + # exp_name += f"_gradClipNorm{grad_clip}" + # + # exp_name += 
f"_paramSync_step{sync_step}_accum1" + # + # if ctc_scale != 1.0: + # args["encoder_args"].ctc_loss_scale = ctc_scale + # args["decoder_args"].ce_loss_scale = 1.0 - ctc_scale + # exp_name += f"_ctcScale{ctc_scale}" + # + # run_exp( + # exp_name + "_gpu4", + # args, + # num_epochs=ep, + # epoch_wise_filter=None, + # bpe_size=BPE_1K, + # partition_epoch=4 * 4, + # horovod_num_processes=4, + # ) # # TODO: mixup # for num_blocks in [12]: From f7ca71a0fe4d0b236fe5c6ef042628e1f9a296c3 Mon Sep 17 00:00:00 2001 From: Mohammad Zeineldeen Date: Thu, 6 Jun 2024 12:20:07 +0200 Subject: [PATCH 104/227] update --- .../configs/libri_chunked_aed_variants.py | 1470 +++++++++++++++++ .../configs/ted2_chunked_aed_variants.py | 71 +- 2 files changed, 1508 insertions(+), 33 deletions(-) create mode 100644 users/zeineldeen/experiments/chunkwise_att_2023/librispeech_960/configs/libri_chunked_aed_variants.py diff --git a/users/zeineldeen/experiments/chunkwise_att_2023/librispeech_960/configs/libri_chunked_aed_variants.py b/users/zeineldeen/experiments/chunkwise_att_2023/librispeech_960/configs/libri_chunked_aed_variants.py new file mode 100644 index 000000000..ceddb4d61 --- /dev/null +++ b/users/zeineldeen/experiments/chunkwise_att_2023/librispeech_960/configs/libri_chunked_aed_variants.py @@ -0,0 +1,1470 @@ +""" +based on: +users/zeineldeen/experiments/conformer_att_2022/librispeech_960/configs/baseline_960h_v2.py +""" + +from __future__ import annotations +from typing import Optional, Union, List +import copy +import os + +import numpy, math + +from sisyphus import tk + +from i6_experiments.users.zeineldeen.experiments.chunkwise_att_2023.librispeech_960.chunkwise_attention_asr_config import ( + create_config, + ConformerEncoderArgs, + RNNDecoderArgs, +) + +from i6_experiments.users.zeineldeen.experiments.conformer_att_2022.librispeech_960.additional_config import ( + apply_fairseq_init_to_conformer, +) +from 
i6_experiments.users.zeineldeen.experiments.chunkwise_att_2023.librispeech_960.data import ( + build_training_datasets, + build_test_dataset, + build_chunkwise_training_datasets, +) +from i6_experiments.users.zeineldeen.experiments.conformer_att_2023.librispeech_960.default_tools import ( + RETURNN_ROOT, + RETURNN_CPU_EXE, +) +from i6_experiments.users.zeineldeen.experiments.conformer_att_2023.tedlium2.default_tools import RETURNN_ROOT_V2 +from i6_experiments.users.zeineldeen.experiments.conformer_att_2022.librispeech_960.feature_extraction_net import ( + log10_net_10ms, +) +from i6_experiments.users.zeineldeen.experiments.conformer_att_2022.librispeech_960.pipeline import ( + training, + search, + get_average_checkpoint, + get_best_checkpoint, + search_single, +) +from i6_experiments.users.zeineldeen.models.lm import generic_lm +from i6_experiments.users.zeineldeen.models.lm.transformer_lm import TransformerLM +from i6_experiments.users.zeineldeen.experiments.conformer_att_2022.librispeech_960 import ( + ilm_helpers, +) +from i6_experiments.users.rossenbach.experiments.librispeech.kazuki_lm.experiment import ( + get_lm, + ZeineldeenLM, +) + +from i6_experiments.users.zeineldeen.experiments.chunkwise_att_2023 import ( + tools_eval_funcs, + tools_eval_funcs_old, +) + +from i6_core.returnn.config import ReturnnConfig +from i6_core.returnn.training import Checkpoint +from i6_core.returnn.forward import ReturnnForwardJob + +train_jobs_map = {} # dict[str, ReturnnTrainJob] +train_job_avg_ckpt = {} +train_job_best_epoch = {} + +BPE_10K = 10000 +BPE_5K = 5000 +BPE_1K = 1000 + +# dev-other: +# Seq-length 'audio_features' Stats: +# 2864 seqs +# Mean: 102995.8959497207 (6.4 sec) +# Std dev: 69081.77143805166 (4.3 sec) +# Min/max: 17040 / 562480 (1.1 / 35.2 sec) +# Seq-length 'bpe_labels' Stats: +# 2864 seqs +# Mean: 21.13966480446923 +# Std dev: 13.536136898032625 +# Min/max: 2 / 110 +# +# test-other: +# Seq-length 'audio_features' Stats: +# 2939 seqs +# Mean: 
104686.32936372912 (6.5 sec) +# Std dev: 70821.55181403323 (4.4 sec) +# Min/max: 20000 / 552160 (1.2 / 34.5 sec) +# Seq-length 'bpe_labels' Stats: +# 2939 seqs +# Mean: 21.22796869683565 +# Std dev: 14.559673655773087 +# Min/max: 2 / 137 + + +# --------------------------- LM --------------------------- # + +lstm_10k_lm_opts = { + "lm_subnet": generic_lm.libri_lstm_bpe10k_net, + "lm_model": generic_lm.libri_lstm_bpe10k_model, + "name": "lstm", +} + +lstm_lm_opts_map = { + BPE_10K: lstm_10k_lm_opts, +} + +trafo_lm_net = TransformerLM(source="prev:output", num_layers=24, vocab_size=10025, use_as_ext_lm=True) +trafo_lm_net.create_network() +trafo_10k_lm_opts = { + "lm_subnet": trafo_lm_net.network.get_net(), + "load_on_init_opts": { + "filename": "/work/asr3/irie/experiments/lm/librispeech/2018-03-05--lmbpe-zeyer/data-train/transfo_24_d00.4096_1024.sgd.lr1.8_heads/bk-net-model/network.023", + "params_prefix": "", + "load_if_prefix": "lm_output_masked/", + }, + "name": "trafo", +} + +bpe5k_lm = get_lm("ls960_trafo24_bs3000_5ep_5kbpe") # type: ZeineldeenLM +trafo_5k_lm_opts = { + "lm_subnet": bpe5k_lm.combination_network, + "load_on_init_opts": { + "filename": get_best_checkpoint(bpe5k_lm.train_job, key="dev_score_output/output"), + "params_prefix": "", + "load_if_prefix": "lm_output/", + }, + "name": "trafo", +} + +trafo_lm_opts_map = { + BPE_10K: trafo_10k_lm_opts, + BPE_5K: trafo_5k_lm_opts, +} + + +# ----------------------------------------------------------- # + + +abs_name = os.path.abspath(__file__) +prefix_name = os.path.basename(abs_name)[: -len(".py")] + + +def get_test_dataset_tuples(bpe_size, selected_datasets=None): + test_dataset_tuples = {} + for testset in ["dev-clean", "dev-other", "test-clean", "test-other"]: + if selected_datasets and testset not in selected_datasets: + continue + test_dataset_tuples[testset] = build_test_dataset( + testset, + use_raw_features=True, + bpe_size=bpe_size, + ) + return test_dataset_tuples + + +def run_train( + 
prefix_name: str, + exp_name: str, + train_args, + train_data, + feature_extraction_net, + num_epochs, + recog_epochs, + time_rqmt=168, + **kwargs, +): + exp_prefix = os.path.join(prefix_name, exp_name) + returnn_config = create_config( + training_datasets=train_data, + **train_args, + feature_extraction_net=feature_extraction_net, + recog_epochs=recog_epochs, + ) + train_job = training( + exp_prefix, + returnn_config, + RETURNN_CPU_EXE, + kwargs.get("returnn_root", RETURNN_ROOT), + num_epochs=num_epochs, + time_rqmt=time_rqmt, + gpu_mem=kwargs.get("gpu_mem", 11), + ) + return train_job + + +def run_single_search( + prefix_name: str, + exp_name: str, + train_data, + search_args, + checkpoint, + feature_extraction_net, + recog_dataset, + recog_ref, + recog_bliss_corpus, + mem_rqmt=8, + time_rqmt=4, + **kwargs, +): + exp_prefix = os.path.join(prefix_name, exp_name) + returnn_search_config = create_config( + training_datasets=train_data, + **search_args, + feature_extraction_net=feature_extraction_net, + is_recog=True, + ) + search_single( + exp_prefix, + returnn_search_config, + checkpoint, + recognition_dataset=recog_dataset, + recognition_reference=recog_ref, + recognition_bliss_corpus=recog_bliss_corpus, + returnn_exe=RETURNN_CPU_EXE, + returnn_root=kwargs.get("returnn_root", RETURNN_ROOT), + mem_rqmt=mem_rqmt, + time_rqmt=time_rqmt, + use_sclite=True, + use_gpu_test=kwargs.get("use_gpu_test", False), + gpu_mem=kwargs.get("gpu_mem", 11), + ) + + +def run_lm_fusion( + lm_type, + prefix_name: str, + exp_name: str, + epoch: Union[str, int], + test_set_names: Union[str, List[str]], + lm_scales, + train_job, + train_data, + feature_net, + bpe_size, + args, + am_scale=1.0, + beam_size=12, + prior_scales=None, + prior_type=None, + mini_lstm_ckpt=None, + length_norm=True, + prior_type_name=None, + coverage_scale=None, + coverage_threshold=None, + lm_desc_suffix="", + **kwargs, +): + assert lm_type in ["lstm", "trafo"], "lm type should be lstm or trafo" + + if 
isinstance(lm_scales, float): + lm_scales = [lm_scales] + if prior_scales and isinstance(prior_scales, float): + prior_scales = [prior_scales] + if isinstance(test_set_names, str): + test_set_names = [test_set_names] + assert isinstance(test_set_names, list) + + if epoch == "avg": + search_checkpoint = train_job_avg_ckpt[exp_name] + elif epoch == "best": + search_checkpoint = train_job_best_epoch[exp_name] + else: + assert isinstance(epoch, int), "epoch must be either a defined integer or a string in {avg, best}." + search_checkpoint = train_job.out_checkpoints[epoch] + + ext_lm_opts = lstm_lm_opts_map[bpe_size] if lm_type == "lstm" else trafo_lm_opts_map[bpe_size] + + ext_lm_opts["am_scale"] = am_scale + + time_rqmt = 1.0 + + search_args = copy.deepcopy(args) + + if lm_type == "lstm": + if beam_size > 128: + search_args["batch_size"] = 4000 * 160 + + if lm_type == "trafo": + search_args["batch_size"] = 4000 * 160 if beam_size <= 32 else 2000 * 160 + time_rqmt = 2 + if beam_size > 50: + time_rqmt = 3 + + search_args["beam_size"] = beam_size + if kwargs.get("batch_size", None): + search_args["batch_size"] = kwargs["batch_size"] + + if not length_norm: + search_args["decoder_args"].length_normalization = False + + if "decoder_args" in kwargs: + for k, v in kwargs["decoder_args"].items(): + setattr(search_args["decoder_args"], k, v) + + scales = [(e,) for e in lm_scales] + + for test_set in test_set_names: + if prior_scales: + import itertools + + scales = itertools.product(lm_scales, prior_scales) + + for scale in scales: + lm_scale = scale[0] + prior_scale = scale[1] if len(scale) == 2 else None + if prior_scale and prior_scale > lm_scale: + continue + + # External LM opts + ext_lm_opts["lm_scale"] = lm_scale + search_args["ext_lm_opts"] = ext_lm_opts + + # ILM opts + if prior_scale: + ilm_opts = { + "scale": prior_scale, + "type": prior_type, + "ctx_dim": search_args["encoder_args"].enc_key_dim, # this is needed for mini-lstm + } + # this is needed for 
mini-self-att + if hasattr(search_args["decoder_args"], "num_layers"): + ilm_opts["num_dec_layers"] = search_args["decoder_args"].num_layers + search_args["decoder_args"].create_ilm_decoder = True + search_args["decoder_args"].ilm_type = prior_type + + ilm_opts.update(kwargs.get("ilm_train_opts", {})) # example for FFN, etc + + search_args["prior_lm_opts"] = ilm_opts + search_args["preload_from_files"] = { + "prior_lm": { + "filename": search_checkpoint, # copy ASR decoder to be used as ILM decoder + "prefix": "prior_", + } + } + if prior_type == "mini_lstm" or prior_type == "ffn": + assert mini_lstm_ckpt, "Mini-LSTM checkpoint not set." + search_args["preload_from_files"].update( + { + "mini_lstm": { + "filename": mini_lstm_ckpt, + "prefix": "mini_", + } + } + ) + + if prior_type_name is None: + prior_type_name = prior_type + + lm_desc = "" + + if am_scale: + lm_desc += f"am-scale-{am_scale}-" + + lm_desc += f"lm-scale-{lm_scale}" + if prior_scale: + lm_desc += f"-prior-{prior_scale}-{prior_type_name}" + lm_desc += f"-beam-{beam_size}" + if length_norm is False: + lm_desc += "-woLenNorm" + + if coverage_scale and coverage_threshold: + assert isinstance(search_args["decoder_args"], RNNDecoderArgs) + search_args["decoder_args"].coverage_scale = coverage_scale + search_args["decoder_args"].coverage_threshold = coverage_threshold + lm_desc += f"_coverage-thre{coverage_threshold}-scale{coverage_scale}" + + if lm_desc_suffix: + lm_desc += f"{lm_desc_suffix}" + + name = f"{exp_name}/recog-{lm_type}-lm/ep-{epoch}/{lm_desc}/{test_set}" + + test_dataset_tuples = get_test_dataset_tuples(bpe_size=bpe_size) + + run_single_search( + prefix_name=prefix_name, + exp_name=name, + train_data=train_data, + search_args=search_args, + checkpoint=search_checkpoint, + feature_extraction_net=feature_net, + recog_dataset=test_dataset_tuples[test_set][0], + recog_ref=test_dataset_tuples[test_set][1], + recog_bliss_corpus=test_dataset_tuples[test_set][2], + time_rqmt=kwargs.get("time_rqmt", 
time_rqmt), + gpu_mem=kwargs.get("gpu_mem", 11), + use_gpu_test=kwargs.get("use_gpu_test", False), + returnn_root=kwargs.get("returnn_root", RETURNN_ROOT), + ) + + +def run_search( + prefix_name: str, + exp_name: str, + train_args, + train_data, + train_job, + feature_extraction_net, + num_epochs, + search_args, + recog_epochs, + bpe_size, + run_all_for_best_last_avg=False, + recog_ext_pipeline=False, + **kwargs, +): + exp_prefix = os.path.join(prefix_name, exp_name) + + search_args = search_args if search_args is not None else copy.deepcopy(train_args) + search_args["search_type"] = None + + returnn_search_config = create_config( + training_datasets=train_data, + **search_args, + feature_extraction_net=feature_extraction_net, + is_recog=True, + recog_ext_pipeline=recog_ext_pipeline, + ) + + num_avg = kwargs.get("num_avg", 4) + averaged_checkpoint = get_average_checkpoint( + train_job, + returnn_exe=RETURNN_CPU_EXE, + returnn_root=kwargs.get("returnn_root", RETURNN_ROOT), + num_average=num_avg, + key=kwargs.get("key", "dev_score_output/output_prob"), + ) + if num_avg == 4: # TODO: just for now to not break hashes + train_job_avg_ckpt[exp_name] = averaged_checkpoint + + best_checkpoint = get_best_checkpoint(train_job, key=kwargs.get("key", "dev_score_output/output_prob")) + train_job_best_epoch[exp_name] = best_checkpoint + + if recog_epochs is None: + default_recog_epochs = [40] + [80 * i for i in range(1, int(num_epochs / 80) + 1)] + if num_epochs % 80 != 0: + default_recog_epochs += [num_epochs] + else: + default_recog_epochs = recog_epochs + + test_dataset_tuples = get_test_dataset_tuples( + bpe_size=bpe_size, selected_datasets=kwargs.get("selected_datasets", None) + ) + + all_test_dataset_tuples = get_test_dataset_tuples(bpe_size=bpe_size) + + remove_label = {"", "", ""} if recog_ext_pipeline else None + + for ep in default_recog_epochs: + search( + exp_prefix + f"/recogs/ep-{ep}", + returnn_search_config, + train_job.out_checkpoints[ep], + test_dataset_tuples, 
+ RETURNN_CPU_EXE, + kwargs.get("returnn_root", RETURNN_ROOT), + use_sclite=kwargs.get("use_sclite", False), + recog_ext_pipeline=recog_ext_pipeline, + remove_label=remove_label, + ) + + search( + exp_prefix + "/default_last", + returnn_search_config, + train_job.out_checkpoints[num_epochs], + all_test_dataset_tuples if run_all_for_best_last_avg else test_dataset_tuples, + RETURNN_CPU_EXE, + kwargs.get("returnn_root", RETURNN_ROOT), + use_sclite=kwargs.get("use_sclite", False), + recog_ext_pipeline=recog_ext_pipeline, + remove_label=remove_label, + ) + + search( + exp_prefix + "/default_best", + returnn_search_config, + best_checkpoint, + all_test_dataset_tuples if run_all_for_best_last_avg else test_dataset_tuples, + RETURNN_CPU_EXE, + kwargs.get("returnn_root", RETURNN_ROOT), + use_sclite=kwargs.get("use_sclite", False), + recog_ext_pipeline=recog_ext_pipeline, + remove_label=remove_label, + ) + + search( + exp_prefix + f"/average_{num_avg}", + returnn_search_config, + averaged_checkpoint, + all_test_dataset_tuples if run_all_for_best_last_avg else test_dataset_tuples, + RETURNN_CPU_EXE, + kwargs.get("returnn_root", RETURNN_ROOT), + use_sclite=kwargs.get("use_sclite", False), + recog_ext_pipeline=recog_ext_pipeline, + remove_label=remove_label, + enable_mail=True, + ) + + +def run_exp( + prefix_name: str, + exp_name: str, + train_args, + feature_extraction_net=log10_net_10ms, + num_epochs=300, + search_args=None, + recog_epochs=None, + bpe_size=10000, + partition_epoch=20, + time_rqmt=168, + train_fixed_alignment=None, + cv_fixed_alignment=None, + recog_ext_pipeline=False, + **kwargs, +): + if train_fixed_alignment: + assert cv_fixed_alignment, "cv alignment is not set." 
+ train_data = build_chunkwise_training_datasets( + train_fixed_alignment=train_fixed_alignment, + cv_fixed_alignment=cv_fixed_alignment, + bpe_size=bpe_size, + use_raw_features=True, + partition_epoch=partition_epoch, + epoch_wise_filter=kwargs.get("epoch_wise_filter", [(1, 5, 1000)]), + link_speed_perturbation=train_args.get("speed_pert", True), + seq_ordering=kwargs.get("seq_ordering", "laplace:.1000"), + ) + else: + train_data = build_training_datasets( + bpe_size=bpe_size, + use_raw_features=True, + partition_epoch=partition_epoch, + epoch_wise_filter=kwargs.get("epoch_wise_filter", [(1, 5, 1000)]), + link_speed_perturbation=train_args.get("speed_pert", True), + seq_ordering=kwargs.get("seq_ordering", "laplace:.1000"), + seq_postfix=kwargs.get("seq_postfix", 0), + ) + + train_job = run_train( + prefix_name, + exp_name, + train_args, + train_data, + feature_extraction_net, + num_epochs, + recog_epochs, + time_rqmt=time_rqmt, + **kwargs, + ) + train_jobs_map[exp_name] = train_job + + run_search( + prefix_name, + exp_name, + train_args, + train_data, + train_job, + feature_extraction_net, + num_epochs, + search_args, + recog_epochs, + bpe_size=bpe_size, + recog_ext_pipeline=recog_ext_pipeline, + **kwargs, + ) + return train_job, train_data + + +def run_forward( + prefix_name: str, + exp_name: str, + train_args, + model_ckpt, + hdf_layers=None, + feature_extraction_net=log10_net_10ms, + bpe_size=10000, + time_rqmt=12, + mem_rqmt=15, + override_returnn_config=None, + seq_postfix=0, + **kwargs, +): + # build train, dev, and devtrain + # - No speed pert + # - Partition epoch 1 + # - No curr. 
learning + + train_data = build_training_datasets( + bpe_size=bpe_size, + use_raw_features=True, + partition_epoch=1, + epoch_wise_filter=None, + link_speed_perturbation=False, + seq_postfix=seq_postfix, + seq_ordering=kwargs.get("seq_ordering", "laplace:.1000"), + ) + + if train_args.get("dump_alignments_dataset", None): + dump_dataset = train_args["dump_alignments_dataset"] + elif train_args.get("dump_ctc_dataset", None): + dump_dataset = train_args["dump_ctc_dataset"] + else: + raise Exception("No dump dataset specified.") + + assert dump_dataset in ["train", "dev"] + + exp_prefix = os.path.join(prefix_name, exp_name) + + if override_returnn_config: + returnn_config = copy.deepcopy(override_returnn_config) + else: + returnn_config = create_config( + training_datasets=train_data, + **train_args, + feature_extraction_net=feature_extraction_net, + ) + + if isinstance(model_ckpt, str): + model_ckpt_index_path = tk.Path(model_ckpt + ".index") + model_ckpt = Checkpoint(index_path=model_ckpt_index_path) + elif isinstance(model_ckpt, Checkpoint): + pass + else: + raise TypeError(f"model_ckpt must be str or Checkpoint, got {type(model_ckpt)}") + forward_j = ReturnnForwardJob( + model_checkpoint=model_ckpt, + hdf_outputs=hdf_layers, + returnn_config=returnn_config, + returnn_python_exe=RETURNN_CPU_EXE, + returnn_root=RETURNN_ROOT, + time_rqmt=time_rqmt, + mem_rqmt=mem_rqmt, + eval_mode=kwargs.get("do_eval", True), + device=kwargs.get("device", "gpu"), + ) + if kwargs.get("cpu_type", None): + assert "sbatch_args" not in forward_j.rqmt + forward_j.rqmt["cpu_type"] = kwargs["cpu_type"] + + forward_j.add_alias(exp_prefix + "/forward_hdf/" + dump_dataset) + + if hdf_layers is None: + hdf_layers = ["output.hdf"] + + for layer in hdf_layers: + tk.register_output( + os.path.join(exp_prefix, "hdfs", dump_dataset), + forward_j.out_hdf_files[layer], + ) + + return forward_j.out_hdf_files + + +def train_mini_lstm( + prefix_name: str, + exp_name: str, + checkpoint, + args, + 
num_epochs=20, + lr=8e-4, + time_rqmt=4, + l2=1e-4, + name="mini_lstm", + w_drop=False, + use_dec_state=False, + use_ffn=False, + ffn_opts=None, + train_fixed_alignment=None, + cv_fixed_alignment=None, + **kwargs, +): + if not w_drop: + params_freeze_str = ilm_helpers.get_mini_lstm_params_freeze_str() + else: + if use_ffn: + params_freeze_str = ilm_helpers.get_ffn_params_freeze_str_w_drop(ffn_opts["num_ffn_layers"]) + else: + params_freeze_str = ilm_helpers.get_mini_lstm_params_freeze_str_w_drop() + + mini_lstm_args = copy.deepcopy(args) + mini_lstm_args["batch_size"] = 20000 * 160 + mini_lstm_args["with_pretrain"] = False + mini_lstm_args["lr"] = lr + mini_lstm_args["allow_lr_scheduling"] = False + mini_lstm_args["encoder_args"].with_ctc = False + mini_lstm_args["keep_all_epochs"] = True # keep everything + mini_lstm_args["extra_str"] = params_freeze_str + mini_lstm_args["preload_from_files"] = { + "import": { + "init_for_train": True, + "ignore_missing": True, + "filename": checkpoint, + } + } + mini_lstm_args.update(kwargs) + + exp_prefix = os.path.join(prefix_name, exp_name, name) + if train_fixed_alignment is None: + mini_lstm_train_data = build_training_datasets( + bpe_size=10000, + use_raw_features=True, + epoch_wise_filter=None, + link_speed_perturbation=False, # depends only on text + seq_ordering=kwargs.get("seq_ordering", "laplace:.1000"), + ) + else: + mini_lstm_train_data = build_chunkwise_training_datasets( + train_fixed_alignment=train_fixed_alignment, + cv_fixed_alignment=cv_fixed_alignment, + bpe_size=BPE_10K, + use_raw_features=True, + partition_epoch=20, + epoch_wise_filter=None, + link_speed_perturbation=False, + seq_ordering=kwargs.get("seq_ordering", "laplace:.1000"), + ) + + returnn_config = create_config( + training_datasets=mini_lstm_train_data, + **mini_lstm_args, + feature_extraction_net=log10_net_10ms, + ) + + inp = "s" if use_dec_state else "prev:target_embed" + + if use_ffn: + x = inp + activations = ffn_opts["activations"] + for l in 
range(ffn_opts["num_ffn_layers"]): + returnn_config.config["network"]["output"]["unit"]["ffn_%02i" % (l + 1)] = { + "class": "linear", + "n_out": ffn_opts["ffn_dims"][l], + "L2": l2, + "from": inp, + "activation": activations[l] if activations and l < len(activations) else None, + } + x = "ffn_%02i" % (l + 1) + + returnn_config.config["network"]["output"]["unit"]["att"] = { + "class": "linear", + "from": x, + "activation": None, + "n_out": mini_lstm_args["encoder_args"].enc_key_dim, + "L2": l2, + } + else: + # Mini-LSTM + FF + + returnn_config.config["network"]["output"]["unit"]["att_lstm"] = { + "class": "rec", + "unit": "nativelstm2", + "from": inp, + "n_out": 50, + } + + returnn_config.config["network"]["output"]["unit"]["att"] = { + "class": "linear", + "from": "att_lstm", + "activation": None, + "n_out": mini_lstm_args["encoder_args"].enc_key_dim, + "L2": l2, + } + + train_job = training( + exp_prefix, + returnn_config, + RETURNN_CPU_EXE, + RETURNN_ROOT, + num_epochs=num_epochs, + time_rqmt=time_rqmt, + ) + return train_job + + +def train_mini_self_att( + prefix_name: str, + exp_name: str, + checkpoint, + args, + num_epochs=20, + lr=8e-4, + time_rqmt=4, + name="mini_self_att", + **kwargs, +): + """ + Same idea as Mini-LSTM but use masked (mini-)self-attention models instead of cross attention. + Note that each layer has its own (mini-)self-attention. + + In the case of transformer decoder, we want to replace cross-attention layers namely: + transformer_decoder_{idx}_att_linear + with masked self-attention models. + """ + + params_freeze_str = ilm_helpers.get_mini_self_att_params_freeze_str_w_drop(args["decoder_args"].num_layers) + + mini_self_att = copy.deepcopy(args) + mini_self_att["batch_size"] = 20000 * 160 # TODO: does this fit now? 
+ mini_self_att["with_pretrain"] = False + mini_self_att["lr"] = lr + mini_self_att["allow_lr_scheduling"] = False + mini_self_att["encoder_args"].with_ctc = False + # mini_self_att['keep_all_epochs'] = True # keep everything + mini_self_att["extra_str"] = params_freeze_str + mini_self_att["preload_from_files"] = { + "import": { + "init_for_train": True, + "ignore_missing": True, + "filename": checkpoint, + } + } + if "decoder_args" in kwargs: + assert isinstance(kwargs["decoder_args"], dict) + for k, v in kwargs["decoder_args"].items(): + setattr(mini_self_att["decoder_args"], k, v) + kwargs.pop("decoder_args") + mini_self_att.update(kwargs) + + exp_prefix = os.path.join(prefix_name, exp_name, name) + mini_self_att_train_data = build_training_datasets( + bpe_size=10000, + use_raw_features=True, + epoch_wise_filter=None, + link_speed_perturbation=False, # depends only on text + seq_ordering=kwargs.get("seq_ordering", "laplace:.1000"), + ) + + # use masked self-att instead of cross-att with layer names having "ilm_" as prefix + mini_self_att["decoder_args"].replace_cross_att_w_masked_self_att = True + + returnn_config = create_config( + training_datasets=mini_self_att_train_data, + **mini_self_att, + feature_extraction_net=log10_net_10ms, + ) + train_job = training( + exp_prefix, + returnn_config, + RETURNN_CPU_EXE, + RETURNN_ROOT, + num_epochs=num_epochs, + time_rqmt=time_rqmt, + ) + return train_job + + +# --------------------------- General Settings --------------------------- # + +conformer_enc_args = ConformerEncoderArgs( + num_blocks=12, + input_layer="conv-6", + att_num_heads=8, + ff_dim=2048, + enc_key_dim=512, + conv_kernel_size=32, + pos_enc="rel", + dropout=0.1, + att_dropout=0.1, + l2=0.0001, + use_sqrd_relu=True, + frontend_conv_l2=0.0001, +) +apply_fairseq_init_to_conformer(conformer_enc_args) +conformer_enc_args.ctc_loss_scale = 1.0 + +rnn_dec_args = RNNDecoderArgs() + +training_args = dict() +training_args["speed_pert"] = True 
+training_args["with_pretrain"] = False + +lstm_training_args = copy.deepcopy(training_args) +lstm_training_args["batch_size"] = 15000 * 160 # frames * samples per frame + +lstm_dec_exp_args = copy.deepcopy( + { + **lstm_training_args, + "encoder_args": conformer_enc_args, + "decoder_args": rnn_dec_args, + } +) + +# --------------------------- Experiments --------------------------- # + +# Global attention baseline: +# +# dev-clean 2.28 +# dev-other 5.63 +# test-clean 2.48 +# test-other 5.71 + +global_att_best_ckpt = "/work/asr4/zeineldeen/setups-data/librispeech/2022-11-28--conformer-att/models-backup/best_att_100/avg_ckpt/epoch.2029" +global_att_v2 = "/work/asr4/zeineldeen/setups-data/librispeech/2022-11-28--conformer-att/work/i6_core/returnn/training/AverageTFCheckpointsJob.BxqgICRSGkgb/output/model/epoch.570" + +# from Albert: +# with task=“train” and search_type=“end-of-chunk”, it would align on-the-fly +# with task=“eval”, add a hdf-dump-layer, and search_type=“end-of-chunk”, you can dump it +# with task=“train” and search_type default (None), it would train using a fixed alignment + +default_args = copy.deepcopy(lstm_dec_exp_args) +default_args["learning_rates_list"] = list(numpy.linspace(8e-4, 1e-5, 60)) +default_args["retrain_checkpoint"] = global_att_best_ckpt +default_args["chunk_size"] = 20 +default_args["chunk_step"] = 20 * 3 // 4 +default_args["search_type"] = "end-of-chunk" # align on-the-fly + + +def get_ctc_chunksyn_align_config( + dataset_name, + ctc_alignments, + chunk_step, + eoc_idx=0, + hash_full_python_code=False, + ignore_eoc_in_input=False, # workaround for broken CTC/RNA alignments which include EOS (=EOC) +): + from i6_experiments.common.setups.returnn import serialization + + config = ReturnnConfig( + { + "extern_data": { + "bpe_labels": { + "available_for_inference": False, + "dim": 10026, # from CTC so +1 for blank + "shape": (None,), + "sparse": True, + }, + }, + "eval_datasets": { + dataset_name: { + "class": "MetaDataset", + 
"data_map": {"bpe_labels": ("hdf_dataset", "data")}, + "datasets": { + "hdf_dataset": { + "class": "HDFDataset", + "files": [ctc_alignments], + }, + }, + "seq_order_control_dataset": "hdf_dataset", + }, + }, + "network": { + "chunked_align": { + "class": "eval", + "eval": tools_eval_funcs.get_chunked_align, + "out_type": tools_eval_funcs.get_chunked_align_out_type, + "from": "data:bpe_labels", + "eval_locals": {"chunk_step": chunk_step, "eoc_idx": eoc_idx}, + }, + "output": { + "class": "hdf_dump", + "from": "chunked_align", + "filename": f"alignments-{dataset_name}.hdf", + }, + }, + "batch_size": 5000, + } + ) + config.post_config["use_tensorflow"] = True + if ignore_eoc_in_input: + config.config["network"]["chunked_align"]["eval_locals"].setdefault("ignore_indices", []).append(eoc_idx) + return serialization.get_serializable_config(config, hash_full_python_code=hash_full_python_code) + + +def get_ctc_rna_based_chunk_alignments( + *, + base_model_train_args: Optional[dict] = None, + ctc_dump_exp_name: Optional[str] = None, + fixed_ctc_rna_align_without_eos: bool = True, + ignore_eoc_in_input: bool = False, + chunk_sizes: Optional[List[int]] = None, + chunk_step_factors: Optional[List[Union[int, float]]] = None, + model_ckpt: Optional[Union[str, Checkpoint]] = None, +): + """ + Get CTC/RNA based chunk alignments for train/dev datasets. + """ + # save time-sync -> chunk-sync converted alignments. 
+ ctc_align_wo_speed_pert = { + "train": {}, + "dev": {}, + } + + if model_ckpt is None: + model_ckpt = global_att_best_ckpt + + if fixed_ctc_rna_align_without_eos: + assert not ignore_eoc_in_input # should not be needed then + + if not ctc_dump_exp_name: + ctc_dump_exp_name = "dump_ctc_alignment_wo_speedPert" + if fixed_ctc_rna_align_without_eos: + ctc_dump_exp_name += "_wo_eos" + have_custom_exp_name = False + else: + have_custom_exp_name = True + + for dataset in ["train", "dev"]: + args = copy.deepcopy(base_model_train_args or default_args) + args["dump_ctc_dataset"] = dataset + args["batch_size"] *= 2 + + # CTC alignment with blank. + j = run_forward( + prefix_name=prefix_name, + exp_name=ctc_dump_exp_name, + train_args=args, + model_ckpt=model_ckpt, + hdf_layers=[f"alignments-{dataset}.hdf"], + seq_postfix=None if fixed_ctc_rna_align_without_eos else 0, + ) + + # convert w.r.t different chunk sizes and chunk steps + if not chunk_sizes: + chunk_sizes = [1, 2, 5, 8] + list(range(10, 55, 5)) + [60, 70, 80, 100] + for chunk_size in chunk_sizes: + if not chunk_step_factors: + chunk_step_factors = [1 / 2, 3 / 4, 0.9, 1] # 1 = no overlap + for chunk_step_factor in chunk_step_factors: + chunk_step = max(1, int(chunk_size * chunk_step_factor)) + + if have_custom_exp_name: + ctc_chunk_sync_align_exp_name = f"{ctc_dump_exp_name}_chunk{chunk_size}-{chunk_step}" + else: + ctc_chunk_sync_align_exp_name = f"ctc_chunk_sync_align_wo_speedPert_{chunk_size}-{chunk_step}" + if fixed_ctc_rna_align_without_eos: + ctc_chunk_sync_align_exp_name += "_wo_eos" + + ctc_chunk_sync_align = run_forward( + prefix_name=prefix_name, + exp_name=ctc_chunk_sync_align_exp_name, + train_args=args, + model_ckpt=model_ckpt, + hdf_layers=[f"alignments-{dataset}.hdf"], + override_returnn_config=get_ctc_chunksyn_align_config( + dataset, + ctc_alignments=j[f"alignments-{dataset}.hdf"], + chunk_step=chunk_step, + ignore_eoc_in_input=ignore_eoc_in_input, + ), + device="cpu", + time_rqmt=1.0, + 
cpu_type="cpu_short", + ) + + ctc_align_wo_speed_pert[dataset][f"{chunk_size}_{chunk_step}"] = ctc_chunk_sync_align[ + f"alignments-{dataset}.hdf" + ] + + return ctc_align_wo_speed_pert + + +def run_chunkwise_train( + total_epochs: List[int], + chunk_sizes: List[Optional[int]], + chunk_step_factors: List[Optional[float]], + enc_stream_type: str = "global", + suffix: str = "", + enable_check_align: bool = True, + on_the_fly_align: bool = False, + with_ctc: bool = False, + ctc_self_align_delay: int = None, + ctc_self_align_delay_scale: float = 0.5, + batch_size: int = 15_000, + accum_grad: int = 2, + time_rqmt: float = 72, + start_lrs: Union[float, List[Optional[float]]] = 1e-4, + decay_pt_factors: Union[float, List[Optional[float]]] = 1 / 3, + min_lr: float = 1e-6, + window_left_padding: Optional[int] = None, + end_slice_size: Optional[int] = None, + end_slice_start: Optional[int] = None, + pos_enc: Optional[str] = "rel", + conf_mem_opts: Optional[dict] = None, + full_sum_approx: bool = False, + retrain_ckpt: Optional[Union[tk.Path, str]] = None, + chunked_decoder: bool = True, + epoch_oclr_lr: Optional[float] = None, + decoder_mask_eoc: Optional[bool] = None, + speed_pert: bool = False, + from_scratch_train: bool = False, + lrs_list: Optional[List[float]] = None, + lr_list_desc: Optional[str] = None, + return_args: bool = False, + **kwargs, +): + if isinstance(start_lrs, float): + start_lrs = [start_lrs] + if isinstance(decay_pt_factors, float): + decay_pt_factors = [decay_pt_factors] + + # train with ctc chunk-sync alignment + ctc_chunksync_align = get_ctc_rna_based_chunk_alignments( + fixed_ctc_rna_align_without_eos=True, + chunk_sizes=chunk_sizes, + chunk_step_factors=chunk_step_factors, + model_ckpt=retrain_ckpt, + ) + + for total_epoch in total_epochs: + for chunk_size in chunk_sizes: + for chunk_step_factor in chunk_step_factors: + for start_lr in start_lrs: + for decay_pt_factor in decay_pt_factors: + train_args = copy.deepcopy(default_args) + 
train_args["speed_pert"] = speed_pert # no speed pert + train_args["search_type"] = None # fixed alignment + + train_args["max_seq_length"] = None # no filtering! + + train_args["encoder_args"].with_ctc = with_ctc + if ctc_self_align_delay: + assert with_ctc, "need CTC for self-align" + train_args["encoder_args"].ctc_self_align_delay = ctc_self_align_delay + train_args["encoder_args"].ctc_self_align_scale = ctc_self_align_delay_scale + + if enc_stream_type == "causal" or enc_stream_type.startswith("causal-"): + if enc_stream_type == "causal": + train_args["encoder_args"].use_causal_layers = True # causal MHSA and conv + elif enc_stream_type == "causal-mhsa": + train_args["encoder_args"].use_causal_layers = True + train_args["encoder_args"].use_causal_conv = False # causal MHSA only + elif enc_stream_type == "causal-reset-conv": + train_args["encoder_args"].use_causal_layers = True + train_args["encoder_args"].conv_alternative_name = "depthwise_conv2_causal" + train_args.setdefault("retrain_checkpoint_opts", {}).setdefault( + "ignore_params_prefixes", [] + ).extend( + [ + "conformer_block_%02i_conv_mod_depthwise_conv2_causal/" % (i + 1) + for i in range(train_args["encoder_args"].num_blocks) + ] + ) + + train_args["batch_size"] = batch_size * 160 + train_args["accum_grad"] = accum_grad + + train_args["enable_check_align"] = enable_check_align # to not break hashes + + train_args["chunk_size"] = chunk_size + if chunk_size is None: + train_args["chunk_step"] = None + chunk_step = None + else: + chunk_step = max(1, int(chunk_size * chunk_step_factor)) + train_args["chunk_step"] = chunk_step + + if lrs_list is not None: + train_args["learning_rates_list"] = lrs_list + elif epoch_oclr_lr: + assert start_lr is None + cyc_ep = int(0.45 * total_epoch) + train_args["learning_rates_list"] = ( + list(numpy.linspace(epoch_oclr_lr / 10, epoch_oclr_lr, cyc_ep)) + + list(numpy.linspace(epoch_oclr_lr, epoch_oclr_lr / 10, cyc_ep)) + + list(numpy.linspace(epoch_oclr_lr / 10, 1e-6, 
total_epoch - 2 * cyc_ep)) + ) + else: + decay_pt = int(total_epoch * decay_pt_factor) + train_args["learning_rates_list"] = [start_lr] * decay_pt + list( + numpy.linspace(start_lr, min_lr, total_epoch - decay_pt) + ) + + chunk_level = "input" if enc_stream_type == "chunked" else "encoder" + train_args["chunk_level"] = chunk_level + train_args["eoc_idx"] = 0 + + exp_name = f"{enc_stream_type}_att_chunk" + if chunk_size is not None: + assert chunk_step is not None + exp_name += f"-{chunk_size}_step-{chunk_step}" + else: + exp_name += "-globalAtt" # no chunking + + if start_lr: + exp_name += f"_linDecay{total_epoch}_{start_lr}_decayPt{decay_pt_factor}" + if min_lr != 1e-6: + exp_name += f"_minLR{min_lr}" + elif epoch_oclr_lr: + exp_name += f"_epochOCLR-{epoch_oclr_lr}_ep{total_epoch}" + elif lrs_list: + assert lr_list_desc + exp_name += f"_{lr_list_desc}" + + exp_name += f"_bs{batch_size}_accum{accum_grad}" + + if window_left_padding is not None: + train_args["window_left_padding"] = window_left_padding + exp_name += f"_winLeft{window_left_padding}" + + if end_slice_size is not None: + train_args["end_slice_size"] = end_slice_size + assert end_slice_start is not None, "need end_slice_start" + train_args["end_slice_start"] = end_slice_start + exp_name += f"_endSliceStart{end_slice_start}_endSlice{end_slice_size}" + + if pos_enc is None: + train_args["encoder_args"].pos_enc = pos_enc + exp_name += f"_woPosEnc" + else: + assert pos_enc == "rel" + + if conf_mem_opts is not None: + train_args["conf_mem_opts"] = conf_mem_opts + exp_name += f"_memVariant{conf_mem_opts['self_att_version']}" + mem_size = conf_mem_opts.get("mem_size", 1) + if mem_size > 1: + exp_name += f"_memSize{mem_size}" + if conf_mem_opts.get("mask_paddings", False): + exp_name += f"_memMaskPad" + if conf_mem_opts.get("conv_cache_size", None): + exp_name += f"_convCache{conf_mem_opts['conv_cache_size']}" + if conf_mem_opts.get("use_cached_prev_kv", False): + exp_name += f"_useCachedKV" + if 
conf_mem_opts.get("mem_slice_start", None) is not None: + assert conf_mem_opts.get("mem_slice_size", None) is not None + exp_name += ( + f"_memSlice{conf_mem_opts['mem_slice_start']}-{conf_mem_opts['mem_slice_size']}" + ) + train_args["recursion_limit"] = 4000 + + if with_ctc: + exp_name += "_withCtc" + + if full_sum_approx: + # NOTE: no need to mask EOC for the decoder since the targets do not contain EOC (just bpe labels) + train_args["decoder_args"].prev_target_embed_direct = True + train_args["decoder_args"].full_sum_simple_approx = True + exp_name += "_fullSumApprox" + + if decoder_mask_eoc: + train_args["decoder_args"].masked_computation_blank_idx = train_args["eoc_idx"] + exp_name += "_maskEOC" + + if retrain_ckpt: + assert suffix, "set suffix for retrain to avoid overwriting" + train_args["retrain_checkpoint"] = retrain_ckpt + + train_args["chunked_decoder"] = chunked_decoder + if not chunked_decoder: + exp_name += "_noChunkedDec" + + if from_scratch_train: + train_args.update(get_base_v1_args(train_args, lr=epoch_oclr_lr, ep=total_epoch)) + train_args["with_pretrain"] = True + train_args["retrain_checkpoint"] = None + exp_name += "_fromScratch" + + if kwargs.get("rel_pos_clipping", None): + train_args["encoder_args"].rel_pos_clipping = kwargs["rel_pos_clipping"] + exp_name += f"_relPosClip{kwargs['rel_pos_clipping']}" + + if kwargs.get("freeze_bn", False): + train_args["freeze_bn"] = True + exp_name += "_freezeBN" + + if kwargs.get("remove_att_ctx_from_dec_state", False): + train_args["remove_att_ctx_from_dec_state"] = True + exp_name += "_woDecAtt" + + if suffix: + exp_name += suffix + + # override + if with_ctc: + search_score_key = "dev_score_output/output_prob" + else: + search_score_key = "dev_score" + + if chunk_size is None or chunked_decoder is False or from_scratch_train: + run_exp( + prefix_name=prefix_name, + exp_name=exp_name, + train_args=train_args, + num_epochs=total_epoch, + epoch_wise_filter=None, + time_rqmt=time_rqmt, + 
key=search_score_key, + use_sclite=True, + speed_pert=speed_pert, + **kwargs, + ) + elif on_the_fly_align: + train_args["search_type"] = "end-of-chunk" # on-the-fly alignment + run_exp( + prefix_name=prefix_name, + exp_name=exp_name, + train_args=train_args, + num_epochs=total_epoch, + epoch_wise_filter=None, + time_rqmt=time_rqmt, + key=search_score_key, + use_sclite=True, + **kwargs, + ) + else: + if full_sum_approx: + # just use original targets without EOC + train_fixed_alignment = None + cv_fixed_alignment = None + else: + assert ctc_chunksync_align, "Need CTC chunk-sync alignments" + train_fixed_alignment = ctc_chunksync_align["train"][f"{chunk_size}_{chunk_step}"] + cv_fixed_alignment = ctc_chunksync_align["dev"][f"{chunk_size}_{chunk_step}"] + _, train_data = run_exp( + prefix_name=prefix_name, + exp_name=exp_name, + train_args=train_args, + num_epochs=total_epoch, + train_fixed_alignment=train_fixed_alignment, + cv_fixed_alignment=cv_fixed_alignment, + epoch_wise_filter=None, + time_rqmt=time_rqmt, + key=search_score_key, + use_sclite=True, + seq_postfix=None if full_sum_approx else 0, + **kwargs, + ) + + if return_args: + assert len(total_epochs) == 1 + assert len(chunk_sizes) == 1 + assert len(chunk_step_factors) == 1 + assert len(start_lrs) == 1 + assert len(decay_pt_factors) == 1 + + return train_args, exp_name, train_data, train_fixed_alignment, cv_fixed_alignment + + +def _run_exp_full_sum_simple_approx( + *, + enc_stream_type: Optional[str], + chunk_size: int, + chunk_step_factor: float, + total_epochs: int, + with_ctc: bool = False, +): + start_lr = 1e-4 + decay_pt_factor = 1 / 3 + train_args = copy.deepcopy(default_args) + + train_args["speed_pert"] = False # no speed pert + train_args["search_type"] = None + train_args["max_seq_length"] = None # no filtering! 
+ + train_args["encoder_args"].with_ctc = with_ctc + + if enc_stream_type == "causal" or enc_stream_type.startswith("causal-"): + train_args["encoder_args"].use_causal_layers = True + if enc_stream_type == "causal-reset-conv": + train_args["encoder_args"].conv_alternative_name = "depthwise_conv2_causal" + train_args.setdefault("retrain_checkpoint_opts", {}).setdefault("ignore_params_prefixes", []).extend( + [ + "conformer_block_%02i_conv_mod_depthwise_conv2_causal/" % (i + 1) + for i in range(train_args["encoder_args"].num_blocks) + ] + ) + + decay_pt = int(total_epochs * decay_pt_factor) + + train_args["chunk_size"] = chunk_size + + chunk_step = max(1, int(chunk_size * chunk_step_factor)) + train_args["chunk_step"] = chunk_step + + chunk_level = "input" if enc_stream_type == "chunked" else "encoder" + train_args["chunk_level"] = chunk_level + train_args["eoc_idx"] = 0 + train_args["decoder_args"].prev_target_embed_direct = True + train_args["decoder_args"].full_sum_simple_approx = True + + if chunk_level == "input": + # It needs more memory because there are mini batches + # where the chunk size is larger than the sequences, + # thus increasing the overall memory consumption of the whole encoder. 
+ train_args["batch_size"] = int(train_args["batch_size"] * 0.75) + train_args["accum_grad"] = int(train_args.get("accum_grad", 2) * 1.5) + + train_args["learning_rates_list"] = [start_lr] * decay_pt + list( + numpy.linspace(start_lr, 1e-6, total_epochs - decay_pt) + ) + + train_args["enable_check_align"] = False + + train_args["batch_size"] = int(0.75 * train_args["batch_size"]) + train_args["accum_grad"] = int(1.5 * train_args.get("accum_grad", 2)) + + exp_name_parts = [ + "chunk_att_simpleFS", + f"enc-{enc_stream_type}-conf", + f"chunksize-{chunk_size}", + f"chunkstep-{chunk_step}", + f"linDecay{total_epochs}_{start_lr}_decayPt{decay_pt_factor}", + f"ctc{with_ctc}", + ] + + run_exp( + prefix_name=prefix_name, + exp_name="_".join(exp_name_parts), + train_args=train_args, + num_epochs=total_epochs, + epoch_wise_filter=None, + time_rqmt=72, + selected_datasets=["dev-other"], + key="dev_score_output/output_prob" if with_ctc else "dev_score", + use_sclite=True, + ) + + +def py(): + for mask_eoc in [True, False]: + run_chunkwise_train( + enc_stream_type="global", + run_all_for_best_last_avg=True, + enable_check_align=False, + chunk_sizes=[1, 5, 10, 25], + chunk_step_factors=[1], + start_lrs=[2e-4], + decay_pt_factors=[0.25, 1 / 3], + gpu_mem=11, + total_epochs=[10 * 20, 20 * 20], + batch_size=15_000, + accum_grad=2, + time_rqmt=120, + decoder_mask_eoc=mask_eoc, + ) + + # global_att_chunk-1_step-1_linDecay200_0.0002_decayPt0.25_bs15000_accum2 + # 2.37 5.8 2.51 6.03 200 + for mask_eoc in [True, False]: + run_chunkwise_train( + enc_stream_type="global", + run_all_for_best_last_avg=True, + enable_check_align=False, + chunk_sizes=[1, 5, 10, 25], + chunk_step_factors=[1], + start_lrs=[2e-4], + decay_pt_factors=[0.25], + gpu_mem=11, + total_epochs=[20 * 20], + batch_size=15_000, + accum_grad=2, + time_rqmt=120, + decoder_mask_eoc=mask_eoc, + remove_att_ctx_from_dec_state=True + ) + + # TODO: with prev:att, just as-is, no change (done above) + + # TODO: change it to h_t, with 
att out linear transformation (should then be same kind of embedding, also same dim) + + # TODO: h_t without linear trafo (might be different dim) + + # TODO: no h_t at all (also different dim) + # run_chunkwise_train( + # enc_stream_type="global", + # run_all_for_best_last_avg=True, + # enable_check_align=False, + # chunk_sizes=[1, 20], + # chunk_step_factors=[1], + # start_lrs=[2e-4], + # decay_pt_factors=[0.25], + # gpu_mem=24, + # total_epochs=[300, 400], + # batch_size=30_000, + # accum_grad=1, + # time_rqmt=120, + # decoder_mask_eoc=True, + # remove_att_ctx_from_dec_state=True, + # returnn_root=RETURNN_ROOT_V2, + # ) diff --git a/users/zeineldeen/experiments/chunkwise_att_2023/tedlium2/configs/ted2_chunked_aed_variants.py b/users/zeineldeen/experiments/chunkwise_att_2023/tedlium2/configs/ted2_chunked_aed_variants.py index 91a1ba87b..df2209c86 100644 --- a/users/zeineldeen/experiments/chunkwise_att_2023/tedlium2/configs/ted2_chunked_aed_variants.py +++ b/users/zeineldeen/experiments/chunkwise_att_2023/tedlium2/configs/ted2_chunked_aed_variants.py @@ -1710,16 +1710,33 @@ def py(): decoder_mask_eoc=decoder_mask_eoc, ) - # TODO: mask out EOC symbol + ctx from decoder state + for decoder_mask_eoc in [True, False]: + run_chunkwise_train( + enc_stream_type="global", + run_all_for_best_last_avg=True, + enable_check_align=False, + chunk_sizes=[1, 5, 10, 25], + chunk_step_factors=[1], + start_lrs=[2e-4], + decay_pt_factors=[0.25], + final_lrs=[1e-6], + gpu_mem=11, + total_epochs=[20 * 4, 30 * 4, 40 * 4], + batch_size=15_000, + accum_grad=2, + time_rqmt=120, + decoder_mask_eoc=decoder_mask_eoc, + ) + # TODO: mask out EOC symbol + ctx from decoder state run_chunkwise_train( enc_stream_type="global", run_all_for_best_last_avg=True, enable_check_align=False, - chunk_sizes=[5, 25], + chunk_sizes=[1, 5, 10, 25], chunk_step_factors=[1], start_lrs=[2e-4], - decay_pt_factors=[0.25], + decay_pt_factors=[0.25, 1 / 3], final_lrs=[1e-6], gpu_mem=11, total_epochs=[20 * 4, 40 * 4], 
@@ -1730,33 +1747,21 @@ def py(): remove_att_ctx_from_dec_state=True, ) - # TODO: large chunks + overlap - # for left_context, center_context, right_context, conv_size, mem_size in [ - # (0, 80, 0, 0, 0), - # ]: - # run_chunkwise_train( - # enc_stream_type="chunked", - # run_all_for_best_last_avg=True, - # enable_check_align=False, - # chunk_sizes=[25], - # chunk_step_factors=[20 / 25], - # start_lrs=[2e-4], - # decay_pt_factors=[1 / 3], - # gpu_mem=24, - # total_epochs=[120], - # batch_size=15_000, - # accum_grad=2, - # time_rqmt=120, - # end_slice_start=left_context, - # end_slice_size=center_context, - # window_left_padding=left_context * 6, - # # conf_mem_opts={ - # # "self_att_version": 1, - # # "mem_size": mem_size, - # # "use_cached_prev_kv": True, - # # "conv_cache_size": conv_size, - # # "mem_slice_start": left_context, - # # "mem_slice_size": 20, - # # }, - # suffix=f"_L{left_context}_C{center_context}_R{right_context}", - # ) + # # TODO: use overlap + # run_chunkwise_train( + # enc_stream_type="global", + # run_all_for_best_last_avg=True, + # enable_check_align=False, + # chunk_sizes=[5, 10, 25], + # chunk_step_factors=[0.5, 0.75], # 50%, 25% overlaps + # start_lrs=[2e-4], + # decay_pt_factors=[0.25, 1 / 3], + # final_lrs=[1e-6], + # gpu_mem=11, + # total_epochs=[20 * 4, 40 * 4], + # batch_size=15_000, + # accum_grad=2, + # time_rqmt=120, + # decoder_mask_eoc=True, + # remove_att_ctx_from_dec_state=True, + # ) From 170f7762a118b91ccb405a79b3d6ffe3f8091382 Mon Sep 17 00:00:00 2001 From: Mohammad Zeineldeen Date: Thu, 6 Jun 2024 10:36:49 +0000 Subject: [PATCH 105/227] add more weight drop to rnn decoder --- .../models/asr/decoder/rnn_decoder.py | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/users/zeineldeen/models/asr/decoder/rnn_decoder.py b/users/zeineldeen/models/asr/decoder/rnn_decoder.py index b292d2ed8..586a1cdc0 100644 --- a/users/zeineldeen/models/asr/decoder/rnn_decoder.py +++ 
b/users/zeineldeen/models/asr/decoder/rnn_decoder.py @@ -408,14 +408,28 @@ def create_network(self): ) else: self.base_model.network.add_linear_layer( - "enc_ctx", "encoder", with_bias=True, n_out=self.enc_key_dim, l2=self.base_model.l2 + "enc_ctx", + "encoder", + with_bias=True, + n_out=self.enc_key_dim, + l2=self.base_model.l2, + param_dropout=self.ff_weight_drop, + param_dropout_min_ndim=2, + param_variational_noise=self.ff_weight_noise, ) self.base_model.network.add_split_dim_layer( "enc_value", "encoder", dims=(self.att_num_heads, self.enc_value_dim // self.att_num_heads) ) self.base_model.network.add_linear_layer( - "inv_fertility", "encoder", activation="sigmoid", n_out=self.att_num_heads, with_bias=False + "inv_fertility", + "encoder", + activation="sigmoid", + n_out=self.att_num_heads, + with_bias=False, + param_dropout=self.ff_weight_drop, + param_dropout_min_ndim=2, + param_variational_noise=self.ff_weight_noise, ) decision_layer_name = self.base_model.network.add_decide_layer("decision", self.dec_output, target=self.target) From 734dd697c5a5b1810f1ff07615b15ac31ea8a672 Mon Sep 17 00:00:00 2001 From: Mohammad Zeineldeen Date: Thu, 6 Jun 2024 10:38:48 +0000 Subject: [PATCH 106/227] add chunked rnn decoder --- .../chunkwise_attention_asr_config.py | 2 +- .../models/asr/decoder/chunked_rnn_decoder.py | 1046 +++++++++++++++++ 2 files changed, 1047 insertions(+), 1 deletion(-) create mode 100644 users/zeineldeen/models/asr/decoder/chunked_rnn_decoder.py diff --git a/users/zeineldeen/experiments/chunkwise_att_2023/librispeech_960/chunkwise_attention_asr_config.py b/users/zeineldeen/experiments/chunkwise_att_2023/librispeech_960/chunkwise_attention_asr_config.py index 81cb1b1f7..b611a8f8d 100644 --- a/users/zeineldeen/experiments/chunkwise_att_2023/librispeech_960/chunkwise_attention_asr_config.py +++ b/users/zeineldeen/experiments/chunkwise_att_2023/librispeech_960/chunkwise_attention_asr_config.py @@ -23,7 +23,7 @@ ExternalLMDecoder, ) -from 
i6_experiments.users.zeyer.experiments.exp2023_02_16_chunked_attention.model import ( +from i6_experiments.users.zeineldeen.models.asr.decoder.chunked_rnn_decoder import ( RNNDecoder as ChunkwiseRNNDecoder, _check_alignment, ) diff --git a/users/zeineldeen/models/asr/decoder/chunked_rnn_decoder.py b/users/zeineldeen/models/asr/decoder/chunked_rnn_decoder.py new file mode 100644 index 000000000..d93edffa4 --- /dev/null +++ b/users/zeineldeen/models/asr/decoder/chunked_rnn_decoder.py @@ -0,0 +1,1046 @@ +""" +Model, based on Mohammads code. +""" + + +from __future__ import annotations +from typing import Optional, Callable +import tensorflow as tf + +from returnn.util.basic import NotSpecified +from returnn.tf.util.data import Data, Dim, SpatialDim, FeatureDim, single_step_dim +from returnn.tf.layers.basic import LayerBase + +from i6_experiments.users.zeineldeen.modules.network import ReturnnNetwork +from i6_experiments.users.zeineldeen.modules.abs_module import AbsModule +from i6_experiments.users.zeineldeen.modules.attention import AdditiveLocAwareness + + +class AttentionMechanism(AbsModule): + """ + Single-head or Multi-head attention mechanism + """ + + def __init__( + self, + enc_key_dim, + att_num_heads, + att_dropout, + l2, + loc_filter_size, + loc_num_channels, + use_weight_feedback, + weight_drop, + weight_noise, + ): + super().__init__() + self.enc_key_dim = enc_key_dim + assert isinstance(att_num_heads, Dim) + self.att_num_heads = att_num_heads + + self.att_dropout = att_dropout + self.l2 = l2 + + self.loc_filter_size = loc_filter_size + self.loc_num_channels = loc_num_channels + + self.select_base_enc: Optional[Callable[[str], str]] = None + self.enc_time_dim = None + + self.use_weight_feedback = use_weight_feedback + + self.weight_drop = weight_drop + self.weight_noise = weight_noise + + def create(self): + out_net = ReturnnNetwork() + + out_net.add_linear_layer( + "s_transformed", + "s", + n_out=self.enc_key_dim, + with_bias=False, + l2=self.l2, + 
param_dropout=self.weight_drop, + param_dropout_min_ndim=2, + param_variational_noise=self.weight_noise, + ) # project query + + if self.use_weight_feedback: + weight_feedback = AdditiveLocAwareness( + enc_key_dim=self.enc_key_dim, + att_num_heads=self.att_num_heads.dimension, + weight_drop=self.weight_drop, + weight_noise=self.weight_noise, + ) + out_net.update(weight_feedback.create()) # add weight feedback to network + else: + weight_feedback = None + + enc_ctx = "base:enc_ctx" + if self.select_base_enc: + enc_ctx = self.select_base_enc(enc_ctx) + out_net.add_combine_layer( + "energy_in", + [enc_ctx, "s_transformed"] if not weight_feedback else [enc_ctx, weight_feedback.name, "s_transformed"], + kind="add", + n_out=self.enc_key_dim, + ) + + # compute energies + out_net.add_activation_layer("energy_tanh", "energy_in", activation="tanh") + energy = out_net.add_linear_layer( + "energy", + "energy_tanh", + n_out=self.att_num_heads.dimension, + out_dim=self.att_num_heads, + with_bias=False, + l2=self.l2, + param_dropout=self.weight_drop, + param_dropout_min_ndim=2, + param_variational_noise=self.weight_noise, + ) + + att_sm_opts = {} + if self.enc_time_dim: + att_sm_opts["axis"] = self.enc_time_dim + if self.att_dropout: + att_weights0 = out_net.add_softmax_over_spatial_layer("att_weights0", energy, **att_sm_opts) + att_weights = out_net.add_dropout_layer( + "att_weights", + att_weights0, + dropout=self.att_dropout, + dropout_noise_shape={"*": None}, + ) + else: + att_weights = out_net.add_softmax_over_spatial_layer("att_weights", energy, **att_sm_opts) + + enc_value = "base:enc_value" + if self.select_base_enc: + enc_value = self.select_base_enc(enc_value) + if self.enc_time_dim: + att0 = out_net.add_dot_layer( + "att0", + [att_weights, enc_value], + reduce=self.enc_time_dim, + var1="auto", + var2="auto", + ) + else: + att0 = out_net.add_generic_att_layer("att0", weights=att_weights, base=enc_value) + self.name = out_net.add_merge_dims_layer("att", att0, 
axes="static") + + return out_net.get_net() + + +class RNNDecoder: + """ + Represents RNN LSTM Attention-based decoder + + Related: + * Single headed attention based sequence-to-sequence model for state-of-the-art results on Switchboard + ref: https://arxiv.org/abs/2001.07263 + """ + + def __init__( + self, + base_model, + source=None, + dropout=0.0, + softmax_dropout=0.3, + label_smoothing=0.1, + target="bpe", + beam_size=12, + embed_dim=621, + embed_dropout=0.0, + lstm_num_units=1024, + output_num_units=1024, + enc_key_dim=1024, + l2=None, + att_dropout=None, + rec_weight_dropout=None, + zoneout=False, + ff_init=None, + add_lstm_lm=False, + lstm_lm_dim=1024, + loc_conv_att_filter_size=None, + loc_conv_att_num_channels=None, + reduceout=True, + att_num_heads=1, + embed_weight_init=None, + lstm_weights_init=None, + lstm_lm_proj_dim=1024, + length_normalization=True, + coverage_threshold=None, + coverage_scale=None, + enc_chunks_dim: Optional[Dim] = None, + enc_time_dim: Optional[Dim] = None, + eos_id=0, + search_type: Optional[str] = None, + enable_check_align=True, + masked_computation_blank_idx: Optional[int] = None, + full_sum_simple_approx: bool = False, + prev_target_embed_direct: bool = False, + use_zoneout_output: bool = False, + ff_weight_dropout=None, + ff_weight_noise=None, + ): + """ + :param base_model: base/encoder model instance + :param str|None source: input to decoder subnetwork + :param float softmax_dropout: Dropout applied to the softmax input + :param float label_smoothing: label smoothing value applied to softmax + :param str target: target data key name + :param int beam_size: value of the beam size + :param int embed_dim: target embedding dimension + :param float|None embed_dropout: dropout to be applied on the target embedding + :param int lstm_num_units: the number of hidden units for the decoder LSTM + :param int output_num_units: the number of hidden dimensions for the last layer before softmax + :param int enc_key_dim: the number of 
hidden dimensions for the encoder key + :param float|None l2: weight decay with l2 norm + :param float|None att_dropout: dropout applied to attention weights + :param float|None rec_weight_dropout: dropout applied to weight parameters + :param bool zoneout: if set, zoneout LSTM cell is used in the decoder instead of nativelstm2 + :param str|None ff_init: feed-forward weights initialization + :param bool add_lstm_lm: add separate LSTM layer that acts as LM-like model + same as here: https://arxiv.org/abs/2001.07263 + :param float lstm_lm_dim: + :param int|None loc_conv_att_filter_size: + :param int|None loc_conv_att_num_channels: + :param bool reduceout: if set to True, maxout layer is used + :param int att_num_heads: number of attention heads + :param enc_chunks_dim: + :param enc_time_dim: + :param int eos_id: end-of-sentence id, or end-of-chunk id if chunking is used + :param search_type: + None -> use RETURNN default handling via search flag (i.e. disabled in training, enabled in search mode). + "end-of-chunk" -> assume given targets without EOC, and search for EOC. + :param enable_check_align: if set, the targets are checked whether M + U = T + :param masked_computation_blank_idx: if set, it uses masked computation for the LSTM/prev:target, + and the mask is for all non-blank indices + :param full_sum_simple_approx: if enabled, it creates a 4D tensor [B, M, U+1, V] via a simple approximation + by only attending to one fixed chunk in M for the whole sequence U+1, and then it uses the RNN-T loss. + The decoder gets only the non-blank labels as input in this case, including BOS (U+1). + This only makes sense in training. Then in recog, you do align-sync search, + and you should set masked_computation_blank_idx to get consistent behavior, + i.e. that blank labels are not used. + :param prev_target_embed_direct: if False, uses "prev:target_embed", + otherwise "prev_target_embed" uses "prev:output". should be like "apply(0)" as initial_output.
+ """ + + self.base_model = base_model + + self.source = source + + self.dropout = dropout + self.softmax_dropout = softmax_dropout + self.label_smoothing = label_smoothing + + self.enc_key_dim = enc_key_dim + self.enc_value_dim = base_model.enc_value_dim + if isinstance(att_num_heads, int): + att_num_heads = SpatialDim("dec-att-num-heads", att_num_heads) + assert isinstance(att_num_heads, Dim) + self.att_num_heads = att_num_heads + + self.target = target + + self.beam_size = beam_size + + self.embed_dim = embed_dim + self.embed_dropout = embed_dropout + + self.dec_lstm_num_units = lstm_num_units + self.dec_output_num_units = output_num_units + + self.ff_init = ff_init + + self.decision_layer_name = None # this is set in the end-point config + + self.l2 = l2 + self.att_dropout = att_dropout + self.rec_weight_dropout = rec_weight_dropout + self.dec_zoneout = zoneout + + self.ff_weight_drop = ff_weight_dropout + self.ff_weight_noise = ff_weight_noise + + self.add_lstm_lm = add_lstm_lm + self.lstm_lm_dim = lstm_lm_dim + self.lstm_lm_proj_dim = lstm_lm_proj_dim + + self.loc_conv_att_filter_size = loc_conv_att_filter_size + self.loc_conv_att_num_channels = loc_conv_att_num_channels + + self.embed_weight_init = embed_weight_init + self.lstm_weights_init = lstm_weights_init + + self.reduceout = reduceout + + self.length_normalization = length_normalization + self.coverage_threshold = coverage_threshold + self.coverage_scale = coverage_scale + + self.enc_chunks_dim = enc_chunks_dim + self.enc_time_dim = enc_time_dim + self.eos_id = eos_id + self.search_type = search_type + + self.network = ReturnnNetwork() + self.subnet_unit = ReturnnNetwork() + self.dec_output = None + self.output_prob = None + + self.enable_check_align = enable_check_align + self.masked_computation_blank_idx = masked_computation_blank_idx + self.full_sum_simple_approx = full_sum_simple_approx + if full_sum_simple_approx: + assert enc_chunks_dim is not None, "full_sum_simple_approx requires 
enc_chunks_dim" + self.prev_target_embed_direct = prev_target_embed_direct + + self.use_zoneout_output = use_zoneout_output + + def add_decoder_subnetwork( + self, + subnet_unit: ReturnnNetwork, + target: str = NotSpecified, + search_type: Optional[str] = NotSpecified, + rec_layer_name: Optional[str] = None, + rec_layer_opts: Optional[dict] = None, + ): + if target is NotSpecified: + target = self.target + if search_type is NotSpecified: + search_type = self.search_type + if rec_layer_opts is None: + rec_layer_opts = {} + if not rec_layer_name: + if search_type == "end-of-chunk": + rec_layer_name = "output_align" + rec_layer_opts.setdefault("name_scope", "output/rec") + else: + rec_layer_name = "output" + + if self.full_sum_simple_approx: + pass + elif self.enc_chunks_dim: # use chunking + subnet_unit["new_label_pos"] = { + "class": "eval", + "from": ["output", "prev:new_label_pos"], + "eval": f"tf.where(tf.equal(source(0), {self.eos_id}), source(1), source(1) + 1)", + "out_type": { + "dtype": "int32", + "dim": None, + "sparse_dim": self.enc_chunks_dim, + }, + "initial_output": 0, + } + + subnet_unit["label_pos"] = {"class": "copy", "from": "prev:new_label_pos"} + + subnet_unit["label_pos_reached_end"] = { + "class": "compare", + "from": ["label_pos", "ground_truth_label_seq_len"], + "kind": "greater_equal", + } + + subnet_unit["chunk_idx_reached_last"] = { + "class": "compare", + "from": ["chunk_idx", "last_chunk_idx"], + "kind": "equal", + } + + subnet_unit["chunk_idx_can_be_finished"] = { + "class": "eval", + "from": ["chunk_idx_reached_last", "label_pos_reached_end"], + "eval": "tf.logical_or(tf.logical_and(source(0), source(1)), tf.logical_not(source(0)))", + } + + subnet_unit["ground_truth_label"] = { + "class": "gather", + "from": f"base:data:{target}", + "axis": "T", + "position": "label_pos", + "clip_to_valid": True, + } + + subnet_unit["ground_truth_label_seq_len"] = { + "class": "length", + "from": f"base:data:{target}", + "axis": "T", + } + + 
subnet_unit["ground_truth_last_label_pos"] = { + "class": "eval", + "from": "ground_truth_label_seq_len", + "eval": "source(0) - 1", + } + + subnet_unit["new_chunk_idx"] = { + "class": "eval", + "from": ["output", "prev:new_chunk_idx"], + "eval": f"tf.where(tf.equal(source(0), {self.eos_id}), source(1) + 1, source(1))", + "out_type": { + "dtype": "int32", + "dim": None, + "sparse_dim": self.enc_chunks_dim, + }, + "initial_output": 0, + } + + subnet_unit["chunk_idx"] = {"class": "copy", "from": "prev:new_chunk_idx"} + + subnet_unit["num_chunks"] = { + "class": "length", + "from": "base:encoder", + "axis": self.enc_chunks_dim, + } + + subnet_unit["last_chunk_idx"] = { + "class": "eval", + "from": "num_chunks", + "eval": "source(0) - 1", + } + + subnet_unit["end"] = { + "class": "compare", + "from": ["new_chunk_idx", "num_chunks"], + "kind": "greater_equal", + } + + else: # no chunking + subnet_unit.add_compare_layer("end", source="output", value=self.eos_id) # sentence end token + + if self.masked_computation_blank_idx is not None: + subnet_unit["masked_comp_mask"] = { + "class": "compare", + "from": "output", + "kind": "not_equal", + "value": self.masked_computation_blank_idx, + "initial_output": True, + } + + prev_output = "prev:output" + if self.full_sum_simple_approx: + assert self.prev_target_embed_direct + assert not self.source + prev_output = "data:source" + + # target embedding + _name = subnet_unit.add_linear_layer( + "prev_target_embed0" if self.prev_target_embed_direct else "target_embed0", + prev_output if self.prev_target_embed_direct else "output", + n_out=self.embed_dim, + with_bias=False, + l2=self.l2, + forward_weights_init=self.embed_weight_init, + initial_output=self.eos_id, + param_dropout=self.ff_weight_drop, + param_dropout_min_ndim=2, + param_variational_noise=self.ff_weight_noise, + ) + if self.masked_computation_blank_idx is not None: + target_embed_layer_dict = subnet_unit[_name] + target_embed_layer_dict = { + "class": 
"masked_computation", + "unit": target_embed_layer_dict, + "mask": "prev:masked_comp_mask" if self.prev_target_embed_direct else "masked_comp_mask", + "initial_output": self.eos_id, + "from": target_embed_layer_dict["from"], + } + target_embed_layer_dict["unit"]["from"] = "data" + subnet_unit[_name] = target_embed_layer_dict + if self.prev_target_embed_direct: + subnet_unit[_name]["name_scope"] = "target_embed0" # params compatible + + subnet_unit.add_dropout_layer( + "prev_target_embed" if self.prev_target_embed_direct else "target_embed", + _name, + dropout=self.embed_dropout, + dropout_noise_shape={"*": None}, + ) + prev_target_embed = "prev_target_embed" if self.prev_target_embed_direct else "prev:target_embed" + + # attention + att = AttentionMechanism( + enc_key_dim=self.enc_key_dim, + att_num_heads=self.att_num_heads, + att_dropout=self.att_dropout, + l2=self.l2, + loc_filter_size=self.loc_conv_att_filter_size, + loc_num_channels=self.loc_conv_att_num_channels, + use_weight_feedback=not self.enc_chunks_dim, # TODO: allow when chunked + weight_drop=self.ff_weight_drop, + weight_noise=self.ff_weight_noise, + ) + if self.masked_computation_blank_idx is not None: + subnet_unit["prev_att_masked"] = { + "class": "masked_computation", + "mask": "prev:masked_comp_mask", + "unit": {"class": "copy", "from": "data"}, + "from": "prev:att", + } + + if self.enc_chunks_dim: + att.enc_time_dim = self.enc_time_dim + if self.full_sum_simple_approx: + pass + else: + + def _gather_chunk(source: str) -> str: + name = source.replace("base:", "") + subnet_unit[name + "_gather"] = { + "class": "gather", + "from": source, + "position": "chunk_idx", + "axis": self.enc_chunks_dim, + "clip_to_valid": True, + } + subnet_unit[name + "_set_time"] = { + "class": "reinterpret_data", + "from": name + "_gather", + "set_axes": {"T": self.enc_time_dim}, + } + return name + "_set_time" + + assert self.enc_time_dim + att.select_base_enc = _gather_chunk + subnet_unit.update(att.create()) + + # 
LM-like component same as here https://arxiv.org/pdf/2001.07263.pdf + lstm_lm_component_proj = None + if self.add_lstm_lm: + assert self.masked_computation_blank_idx is None # not implemented... + lstm_lm_component = subnet_unit.add_rec_layer( + "lm_like_s", + prev_target_embed, + n_out=self.lstm_lm_dim, + l2=self.l2, + unit="NativeLSTM2", + rec_weight_dropout=self.rec_weight_dropout, + weights_init=self.lstm_weights_init, + ) + lstm_lm_component_proj = subnet_unit.add_linear_layer( + "lm_like_s_proj", + lstm_lm_component, + n_out=self.lstm_lm_proj_dim, + l2=self.l2, + with_bias=False, + dropout=self.dropout, + param_dropout=self.ff_weight_drop, + param_dropout_min_ndim=2, + param_variational_noise=self.ff_weight_noise, + ) + + lstm_inputs = [] + if lstm_lm_component_proj: + lstm_inputs += [lstm_lm_component_proj] + else: + lstm_inputs += [prev_target_embed] + + if self.masked_computation_blank_idx is not None: + lstm_inputs += ["prev_att_masked"] + else: + lstm_inputs += ["prev:att"] + + if self.add_lstm_lm: + # element-wise addition is applied instead of concat + lstm_inputs = subnet_unit.add_combine_layer( + "add_embed_ctx", lstm_inputs, kind="add", n_out=self.lstm_lm_proj_dim + ) + + # LSTM decoder (or decoder state) + if self.dec_zoneout and not self.full_sum_simple_approx: + # It's bad to use rnn_cell here... Just annoying to keep this just to preserve hash... 
+ zoneout_unit_opts = {"zoneout_factor_cell": 0.15, "zoneout_factor_output": 0.05} + if self.use_zoneout_output: + zoneout_unit_opts["use_zoneout_output"] = True + subnet_unit.add_rnn_cell_layer( + "s", + lstm_inputs, + n_out=self.dec_lstm_num_units, + l2=self.l2, + weights_init=self.lstm_weights_init, + unit="zoneoutlstm", + unit_opts=zoneout_unit_opts, + ) + else: + subnet_unit.add_rec_layer( + "s", + lstm_inputs, + n_out=self.dec_lstm_num_units, + l2=self.l2, + unit="zoneoutlstm" if self.dec_zoneout else "NativeLSTM2", + rec_weight_dropout=self.rec_weight_dropout, + weights_init=self.lstm_weights_init, + ) + if self.dec_zoneout: + subnet_unit["s"].setdefault("unit_opts", {}).update( + {"zoneout_factor_cell": 0.15, "zoneout_factor_output": 0.05} + ) + if self.full_sum_simple_approx: + subnet_unit["s"]["axis"] = single_step_dim + if self.masked_computation_blank_idx is not None: + subnet_unit["_s_input"] = {"class": "copy", "from": lstm_inputs} + layer_dict = subnet_unit["s"] + subnet_unit["s"] = { + "class": "masked_computation", + "unit": layer_dict, + "from": "_s_input", + "mask": "prev:masked_comp_mask", + } + layer_dict["from"] = "data" + + s_name = "s" + if self.add_lstm_lm: + s_name = subnet_unit.add_linear_layer( + "s_proj", + "s", + n_out=self.lstm_lm_proj_dim, + with_bias=False, + dropout=self.dropout, + l2=self.l2, + param_dropout=self.ff_weight_drop, + param_dropout_min_ndim=2, + param_variational_noise=self.ff_weight_noise, + ) + + readout_in_src = subnet_unit.add_combine_layer( + "add_s_att", [s_name, "att"], kind="add", n_out=self.lstm_lm_proj_dim + ) + else: + readout_in_src = [s_name, prev_target_embed, "att"] + + subnet_unit.add_linear_layer( + "readout_in", + readout_in_src, + n_out=self.dec_output_num_units, + l2=self.l2, + param_dropout=self.ff_weight_drop, + param_dropout_min_ndim=2, + param_variational_noise=self.ff_weight_noise, + ) + + if self.reduceout: + subnet_unit.add_reduceout_layer("readout", "readout_in") + else: + 
subnet_unit.add_copy_layer("readout", "readout_in") + + out_prob_opts = {} + if not search_type and not self.full_sum_simple_approx: + out_prob_opts.update( + dict( + loss="ce", + loss_opts={"label_smoothing": self.label_smoothing}, + ) + ) + + if self.full_sum_simple_approx: + # we only need the logits for full_sum training + self.output_logits = subnet_unit.add_linear_layer( + "output_logits", + "readout", + l2=self.l2, + target=f"layer:base:data:{target}", + dropout=self.softmax_dropout, + name_scope="output_prob", # for ckpt loading + param_dropout=self.ff_weight_drop, + param_dropout_min_ndim=2, + param_variational_noise=self.ff_weight_noise, + ) + # used for recognition + self.output_prob = subnet_unit.add_activation_layer( + "output_prob", + "output_logits", + activation="softmax", + ) + else: + self.output_prob = subnet_unit.add_softmax_layer( + "output_prob", + "readout", + l2=self.l2, + target=f"layer:base:data:{target}" + if (search_type == "end-of-chunk" or self.full_sum_simple_approx) + else target, + dropout=self.softmax_dropout, + **out_prob_opts, + param_dropout=self.ff_weight_drop, + param_dropout_min_ndim=2, + param_variational_noise=self.ff_weight_noise, + ) + + if self.full_sum_simple_approx: + assert self.enc_chunks_dim + subnet_unit["full_sum_simple_approx_loss"] = { + "class": "eval", + "from": ["output_logits", f"base:data:{target}"], + # Pickling/serialization of the func ref should work when this is a global function of this module. + # But depending on your setup, there might anyway not be any serialization. 
+ "eval": _rnnt_full_sum_log_prob_eval_layer_func, + "eval_locals": { + "blank_index": self.eos_id, + "input_spatial_dim": self.enc_chunks_dim, + }, + "out_type": _rnnt_full_sum_log_prob_eval_layer_out, + "loss": "as_is", + } + + if search_type == "end-of-chunk": + subnet_unit["_label_indices"] = { + "class": "range_in_axis", + "from": f"base:data:{target}", + "axis": "sparse_dim", + } + subnet_unit["_label_indices_eq_eoc"] = { + "class": "compare", + "from": "_label_indices", + "value": self.eos_id, + "kind": "equal", + } + subnet_unit["_label_indices_eq_eoc_"] = { + "class": "switch", + "condition": "chunk_idx_can_be_finished", + "true_from": "_label_indices_eq_eoc", + "false_from": False, + } + subnet_unit["_label_indices_eq_true_label"] = { + "class": "compare", + "from": ["_label_indices", "ground_truth_label"], + "kind": "equal", + } + subnet_unit["_label_indices_eq_true_label_"] = { + "class": "switch", + "condition": "label_pos_reached_end", + "true_from": False, + "false_from": "_label_indices_eq_true_label", + } + subnet_unit["eoc_label_mask"] = { + "class": "combine", + "kind": "logical_or", + "from": ["_label_indices_eq_eoc_", "_label_indices_eq_true_label_"], + } + subnet_unit["output_prob_filter_eoc"] = { + "class": "switch", + "condition": "eoc_label_mask", + "true_from": "output_prob", + "false_from": 1e-20, + } + self.output_prob = "output_prob_filter_eoc" + + if self.coverage_scale and self.coverage_threshold: + assert ( + self.att_num_heads.dimension == 1 + ), "Not supported for multi-head attention." # TODO: just average the heads? 
+ accum_w = self.subnet_unit.add_eval_layer( + "accum_w", + source=["prev:att_weights", "att_weights"], + eval="source(0) + source(1)", + ) # [B,enc-T,H=1] + merge_accum_w = self.subnet_unit.add_merge_dims_layer( + "merge_accum_w", accum_w, axes="except_batch" + ) # [B,enc-T] + coverage_mask = self.subnet_unit.add_compare_layer( + "coverage_mask", + merge_accum_w, + kind="greater", + value=self.coverage_threshold, + ) # [B,enc-T] + float_coverage_mask = self.subnet_unit.add_cast_layer( + "float_coverage_mask", coverage_mask, dtype="float32" + ) # [B,enc-T] + accum_coverage = self.subnet_unit.add_reduce_layer( + "accum_coverage", + float_coverage_mask, + mode="sum", + axes=-1, + keep_dims=True, + ) # [B,1] + + self.output_prob = self.subnet_unit.add_eval_layer( + "output_prob_coverage", + source=[self.output_prob, accum_coverage], + eval=f"source(0) * (source(1) ** {self.coverage_scale})", + ) + + if self.full_sum_simple_approx: + subnet_unit.add_copy_layer("output", "data:source") + else: + choice_opts = dict(target=target) + if not self.length_normalization: + choice_opts["length_normalization"] = False + if not search_type: + pass + elif search_type == "end-of-chunk": + choice_opts["search"] = True + choice_opts["target"] = None + else: + raise ValueError(f"Unknown search type: {search_type!r}") + subnet_unit.add_choice_layer( + "output", + self.output_prob, + beam_size=self.beam_size, + initial_output=0, + **choice_opts, + ) + + # recurrent subnetwork + rec_opts = dict(target=target) + if search_type == "end-of-chunk": + # search_flag is False in training, but we anyway want to search, and we don't want the seq len + # from the ground truth labels (without EOC labels), so we must not use the target here. 
+ rec_opts["target"] = None + if self.full_sum_simple_approx: + assert self.prev_target_embed_direct + self.network["_targets_with_bos"] = { + "class": "prefix_in_time", + "from": f"data:{target}", + "prefix": self.eos_id, + } + rec_opts["source"] = "_targets_with_bos" + rec_opts["target"] = None + elif self.source: + rec_opts["source"] = self.source + if self.enc_chunks_dim: + assert self.enc_time_dim and self.enc_time_dim.dimension is not None + rec_opts["include_eos"] = True + # TODO warning this is wrong, needs to be larger, + # but we can't easily change it now because it changes the hash + rec_opts["max_seq_len"] = f"max_len_from('base:encoder') * {self.enc_time_dim.dimension}" + if rec_layer_opts: + rec_opts.update(rec_layer_opts) + dec_output = self.network.add_subnet_rec_layer(rec_layer_name, unit=subnet_unit.get_net(), **rec_opts) + + return dec_output + + def create_network(self): + self.dec_output = self.add_decoder_subnetwork(self.subnet_unit) + target = self.target + if self.search_type == "end-of-chunk": + self.base_model.network["_02_alignment_on_the_fly"] = { + "class": "copy", + "from": "out_best", + "register_as_extern_data": "alignment_on_the_fly", + } + target = "alignment_on_the_fly" + # Add another output layer for potential training. 
+ subnet_unit = ReturnnNetwork() + self.add_decoder_subnetwork(subnet_unit, search_type=None, target=target) + + # Add to Base/Encoder network + + if hasattr(self.base_model, "enc_proj_dim") and self.base_model.enc_proj_dim: + self.base_model.network.add_copy_layer("enc_ctx", "encoder_proj") + self.base_model.network.add_split_dim_layer( + "enc_value", + "encoder_proj", + dims=( + self.att_num_heads, + FeatureDim("val", self.enc_value_dim // self.att_num_heads.dimension), + ), + ) + else: + self.base_model.network.add_linear_layer( + "enc_ctx", + "encoder", + with_bias=True, + n_out=self.enc_key_dim, + l2=self.base_model.l2, + param_dropout=self.ff_weight_drop, + param_dropout_min_ndim=2, + param_variational_noise=self.ff_weight_noise, + ) + self.base_model.network.add_split_dim_layer( + "enc_value", + "encoder", + dims=( + self.att_num_heads, + FeatureDim("val", self.enc_value_dim // self.att_num_heads.dimension), + ), + ) + + if not self.enc_chunks_dim: # use weight feedback + # there was a bug where n_out should be dimension and not the tag + # this does not override the layer to not break hashes of old exps without weight feedback. 
super annoying + self.base_model.network.add_linear_layer( + "inv_fertility", + "encoder", + activation="sigmoid", + n_out=self.att_num_heads.dimension, + with_bias=False, + param_dropout=self.ff_weight_drop, + param_dropout_min_ndim=2, + param_variational_noise=self.ff_weight_noise, + ) + else: + # this layer is not used anw and kept just to not break hashes + self.base_model.network.add_linear_layer( + "inv_fertility", + "encoder", + activation="sigmoid", + n_out=self.att_num_heads, # Note: this is a bug + with_bias=False, + param_dropout=self.ff_weight_drop, + param_dropout_min_ndim=2, + param_variational_noise=self.ff_weight_noise, + ) + + self.base_model.network["out_best"] = { + "class": "decide", + "from": self.dec_output, + } + + self.base_model.network["enc_seq_len"] = {"class": "length", "from": "encoder"} + self.base_model.network["targets_seq_len"] = { + "class": "length", + "from": f"data:{self.target}", + } + self.base_model.network["out_best_seq_len"] = { + "class": "length", + "from": "out_best", + } + if False: + for name in [ + "enc_seq_len", + "out_best", + "out_best_seq_len", + f"data:{self.target}", + "targets_seq_len", + ]: + name_ = name.replace("data:", "") + self.base_model.network[f"debug_print_{name_}"] = { + "class": "print", + "from": name, + "is_output_layer": True, + } + + # Filter blank / EOS / EOC + if not self.full_sum_simple_approx: + self.base_model.network["out_best_non_blank_mask"] = { + "class": "compare", + "from": "out_best", + "value": self.eos_id, + "kind": "not_equal", + } + + self.base_model.network["out_best_wo_blank"] = { + "class": "masked_computation", + "mask": "out_best_non_blank_mask", + "from": "out_best", + "unit": {"class": "copy"}, + } + self.decision_layer_name = "out_best_wo_blank" + + self.base_model.network["edit_distance"] = { + "class": "copy", + "from": "out_best_wo_blank", + "only_on_search": True, + "loss": "edit_distance", + "target": self.target, + } + + if self.enc_chunks_dim and 
self.enable_check_align: + self.base_model.network["_check_alignment"] = { + "class": "eval", + "from": "out_best_wo_blank", + "eval": _check_alignment, + "eval_locals": {"target": target}, # with blank + "is_output_layer": True, + } + + return self.dec_output + + +# noinspection PyShadowingNames +def _check_alignment(source, self, target, **_kwargs): + import tensorflow as tf + from returnn.tf.util.data import Data + + out_wo_blank = source(0, as_data=True) + assert isinstance(out_wo_blank, Data) + if not self.network.eval_flag: + # Targets are not available during recognition. + return out_wo_blank.placeholder + out_with_blank = self.network.get_layer(f"data:{target}").output + assert isinstance(out_with_blank, Data) + encoder = self.network.get_layer("encoder").output + assert isinstance(encoder, Data) + num_chunks = encoder.get_sequence_lengths() + num_labels_wo_blank = out_wo_blank.get_sequence_lengths() + num_labels_w_blank = out_with_blank.get_sequence_lengths() + deps = [ + tf.Assert( + tf.reduce_all(tf.equal(num_labels_wo_blank + num_chunks, num_labels_w_blank)), + [ + "num labels wo blank, num chunks, with blank:", + num_labels_wo_blank, + num_chunks, + num_labels_w_blank, + "labels wo blank, with blank:", + out_wo_blank.placeholder, + out_with_blank.placeholder, + ], + summarize=100, + ), + ] + self.network.register_post_control_dependencies(deps) + with tf.control_dependencies(deps): + return tf.identity(out_wo_blank.placeholder) + + +# Taken from returnn_common, adopted. 
+def _rnnt_full_sum_log_prob_eval_layer_func( + *, + self: LayerBase, + source, + input_spatial_dim: Dim, + blank_index: int, +) -> tf.Tensor: + from returnn.tf.util.data import Data + from returnn.tf.layers.basic import LayerBase + from returnn.extern.HawkAaronWarpTransducer import rnnt_loss + + assert isinstance(self, LayerBase) + logits = source(0, auto_convert=False, as_data=True) + labels = source(1, auto_convert=False, as_data=True) + assert isinstance(logits, Data) and isinstance(labels, Data) + assert labels.batch_ndim == 2 and labels.have_batch_axis() and labels.have_time_axis() + labels_spatial_dim = labels.get_time_dim_tag() + prev_labels_spatial_dim = 1 + labels_spatial_dim + batch_dims = list(self.output.dim_tags) + feat_dim = logits.feature_dim_or_sparse_dim + if blank_index < 0: + blank_index += feat_dim.dimension + assert 0 <= blank_index < feat_dim.dimension + assert labels.sparse_dim.dimension <= feat_dim.dimension + # Move axes into the right order (no-op if they already are). 
+ logits = logits.copy_compatible_to( + Data("logits", dim_tags=batch_dims + [input_spatial_dim, prev_labels_spatial_dim, feat_dim]), + check_dtype=False, + ) + labels = labels.copy_compatible_to( + Data("labels", dim_tags=batch_dims + [labels_spatial_dim], sparse_dim=labels.sparse_dim), check_dtype=False + ) + input_lengths = input_spatial_dim.get_dyn_size_ext_for_batch_ctx( + logits.batch, logits.control_flow_ctx + ).copy_compatible_to(Data("input_lengths", dim_tags=batch_dims), check_dtype=False) + label_lengths = labels_spatial_dim.get_dyn_size_ext_for_batch_ctx( + logits.batch, logits.control_flow_ctx + ).copy_compatible_to(Data("label_lengths", dim_tags=batch_dims), check_dtype=False) + + return rnnt_loss( + acts=logits.placeholder, + labels=labels.placeholder, + input_lengths=input_lengths.placeholder, + label_lengths=label_lengths.placeholder, + blank_label=blank_index, + ) + + +def _rnnt_full_sum_log_prob_eval_layer_out( + *, + name: str, + **_kwargs, +) -> Data: + from returnn.tf.util.data import Data, batch_dim + + return Data("%s_output" % name, dim_tags=[batch_dim]) From 454d2b155374948bfedba232f71c694f0bba3bb7 Mon Sep 17 00:00:00 2001 From: Mohammad Zeineldeen Date: Thu, 6 Jun 2024 10:39:35 +0000 Subject: [PATCH 107/227] update --- users/zeineldeen/models/asr/decoder/chunked_rnn_decoder.py | 1 + 1 file changed, 1 insertion(+) diff --git a/users/zeineldeen/models/asr/decoder/chunked_rnn_decoder.py b/users/zeineldeen/models/asr/decoder/chunked_rnn_decoder.py index d93edffa4..0a10baba6 100644 --- a/users/zeineldeen/models/asr/decoder/chunked_rnn_decoder.py +++ b/users/zeineldeen/models/asr/decoder/chunked_rnn_decoder.py @@ -1,5 +1,6 @@ """ Model, based on Mohammads code. 
+Starting point here: i6_experiments.users.zeyer.experiments.exp2023_02_16_chunked_attention.model """ From 79e126386d22b168833c2d9bb7cde1a9c0625383 Mon Sep 17 00:00:00 2001 From: Mohammad Zeineldeen Date: Thu, 6 Jun 2024 13:24:24 +0000 Subject: [PATCH 108/227] fix --- .../chunkwise_attention_asr_config.py | 8 ++++++- .../configs/ted2_chunked_aed_variants.py | 22 +++++++++++++++++++ 2 files changed, 29 insertions(+), 1 deletion(-) diff --git a/users/zeineldeen/experiments/chunkwise_att_2023/librispeech_960/chunkwise_attention_asr_config.py b/users/zeineldeen/experiments/chunkwise_att_2023/librispeech_960/chunkwise_attention_asr_config.py index b611a8f8d..cab33c1b0 100644 --- a/users/zeineldeen/experiments/chunkwise_att_2023/librispeech_960/chunkwise_attention_asr_config.py +++ b/users/zeineldeen/experiments/chunkwise_att_2023/librispeech_960/chunkwise_attention_asr_config.py @@ -571,6 +571,9 @@ class RNNDecoderArgs(DecoderArgs): use_zoneout_output: bool = False + ff_weight_dropout: Optional[float] = None + ff_weight_noise: Optional[float] = None + def create_config( training_datasets, @@ -1082,7 +1085,10 @@ def create_config( exp_config["network"]["output"]["unit"].pop("s", None) # change inputs - exp_config["network"]["output"]["unit"]["s_wo_att"]["from"] = "prev:target_embed" # remove prev:att + if decoder_args["full_sum_simple_approx"]: + exp_config["network"]["output"]["unit"]["s_wo_att"]["from"] = "prev_target_embed" + else: + exp_config["network"]["output"]["unit"]["s_wo_att"]["from"] = "prev:target_embed" exp_config["network"]["output"]["unit"]["s_transformed"]["from"] = "s_wo_att" assert exp_config["network"]["output"]["unit"]["readout_in"]["from"][0] == "s" exp_config["network"]["output"]["unit"]["readout_in"]["from"][0] = "s_wo_att" diff --git a/users/zeineldeen/experiments/chunkwise_att_2023/tedlium2/configs/ted2_chunked_aed_variants.py b/users/zeineldeen/experiments/chunkwise_att_2023/tedlium2/configs/ted2_chunked_aed_variants.py index 
df2209c86..fbb28d0a5 100644 --- a/users/zeineldeen/experiments/chunkwise_att_2023/tedlium2/configs/ted2_chunked_aed_variants.py +++ b/users/zeineldeen/experiments/chunkwise_att_2023/tedlium2/configs/ted2_chunked_aed_variants.py @@ -1747,6 +1747,28 @@ def py(): remove_att_ctx_from_dec_state=True, ) + # TODO: exact full-sum training + run_chunkwise_train( + enc_stream_type="global", + run_all_for_best_last_avg=True, + enable_check_align=False, + chunk_sizes=[25], + chunk_step_factors=[1], + start_lrs=[2e-4], + decay_pt_factors=[0.25, 1 / 3], + final_lrs=[1e-6], + gpu_mem=11, + total_epochs=[20 * 4, 40 * 4], + batch_size=15_000, + accum_grad=2, + time_rqmt=120, + decoder_mask_eoc=False, # there are no blanks in the target seq + remove_att_ctx_from_dec_state=True, # remove att ctx dependency so we can do exact full sum + full_sum_approx=True, + ) + + # TODO: exact full-sum training starting from viterbi-trained model + # # TODO: use overlap # run_chunkwise_train( # enc_stream_type="global", From 58c4d07e991cecd8cf8aef902f19cf29f5711308 Mon Sep 17 00:00:00 2001 From: Mohammad Zeineldeen Date: Thu, 6 Jun 2024 13:53:11 +0000 Subject: [PATCH 109/227] fix --- .../tedlium2/configs/ted2_chunked_aed_variants.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/users/zeineldeen/experiments/chunkwise_att_2023/tedlium2/configs/ted2_chunked_aed_variants.py b/users/zeineldeen/experiments/chunkwise_att_2023/tedlium2/configs/ted2_chunked_aed_variants.py index fbb28d0a5..e0050a9ef 100644 --- a/users/zeineldeen/experiments/chunkwise_att_2023/tedlium2/configs/ted2_chunked_aed_variants.py +++ b/users/zeineldeen/experiments/chunkwise_att_2023/tedlium2/configs/ted2_chunked_aed_variants.py @@ -1759,8 +1759,8 @@ def py(): final_lrs=[1e-6], gpu_mem=11, total_epochs=[20 * 4, 40 * 4], - batch_size=15_000, - accum_grad=2, + batch_size=10_000, + accum_grad=3, time_rqmt=120, decoder_mask_eoc=False, # there are no blanks in the target seq remove_att_ctx_from_dec_state=True, # 
remove att ctx dependency so we can do exact full sum From 202004d01a0fe8b162ce0bcd7cbff719fbf56bb7 Mon Sep 17 00:00:00 2001 From: Albert Zeyer Date: Thu, 6 Jun 2024 16:58:14 +0200 Subject: [PATCH 110/227] more --- .../exp2024_04_23_baselines/ctc.py | 24 ++++++++++++------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py b/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py index 450331bf0..76197d180 100644 --- a/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py +++ b/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py @@ -55,15 +55,21 @@ def py(): - Luca uses older behavior_version 21 -> 16. """ - train_exp( - f"v6-bhv20-11gb-f32-bs15k-accgrad5-mgpu4-pavg100-wd1e_5-lrlin1e_5_295k-bpe10k", - config_11gb_v6_f32_accgrad1_mgpu4_pavg100_wd1e_4, - config_updates={ - **_get_cfg_lrlin_oclr_by_bs_nep(15_000, 500), - "accum_grad_multiple_step": 5, - "optimizer.weight_decay": 1e-5, - }, - ) + for wd in [ + # 1e-5, # 9.9 + 1e-4, + 1e-3, + 1e-2, + ]: + train_exp( + f"v6-bhv20-11gb-f32-bs15k-accgrad5-mgpu4-pavg100-wd{str(wd).replace('-','_')}-lrlin1e_5_295k-bpe10k", + config_11gb_v6_f32_accgrad1_mgpu4_pavg100_wd1e_4, + config_updates={ + **_get_cfg_lrlin_oclr_by_bs_nep(15_000, 500), + "accum_grad_multiple_step": 5, + "optimizer.weight_decay": wd, + }, + ) train_exp( # 9.24 f"v6-bhv20-11gb-f32-bs15k-accgrad1-mgpu4-pavg100-wd1e_4-lrlin1e_5_295k-bpe10k", From cd83e0e9b8a05aa7598b258441a7aefa109d0002 Mon Sep 17 00:00:00 2001 From: Albert Zeyer Date: Thu, 6 Jun 2024 17:02:55 +0200 Subject: [PATCH 111/227] more --- .../exp2024_04_23_baselines/ctc.py | 48 +++++++------------ 1 file changed, 16 insertions(+), 32 deletions(-) diff --git a/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py b/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py index 76197d180..d8d936b04 100644 --- a/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py +++ b/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py @@ 
-55,38 +55,22 @@ def py(): - Luca uses older behavior_version 21 -> 16. """ - for wd in [ - # 1e-5, # 9.9 - 1e-4, - 1e-3, - 1e-2, - ]: - train_exp( - f"v6-bhv20-11gb-f32-bs15k-accgrad5-mgpu4-pavg100-wd{str(wd).replace('-','_')}-lrlin1e_5_295k-bpe10k", - config_11gb_v6_f32_accgrad1_mgpu4_pavg100_wd1e_4, - config_updates={ - **_get_cfg_lrlin_oclr_by_bs_nep(15_000, 500), - "accum_grad_multiple_step": 5, - "optimizer.weight_decay": wd, - }, - ) - - train_exp( # 9.24 - f"v6-bhv20-11gb-f32-bs15k-accgrad1-mgpu4-pavg100-wd1e_4-lrlin1e_5_295k-bpe10k", - config_11gb_v6_f32_accgrad1_mgpu4_pavg100_wd1e_4, - config_updates={ - **_get_cfg_lrlin_oclr_by_bs_nep(15_000, 500), - }, - ) - - train_exp( - f"v6-bhv20-11gb-f32-bs15k-accgrad1-mgpu4-pavg100-wd1e_2-lrlin1e_5_295k-bpe10k", - config_11gb_v6_f32_accgrad1_mgpu4_pavg100_wd1e_4, - config_updates={ - **_get_cfg_lrlin_oclr_by_bs_nep(15_000, 500), - "optimizer.weight_decay": 1e-2, - }, - ) + for acc in [5, 1]: + for wd in [ + # 1e-5, # accum=5, wd=1e-5: 9.90 + 1e-4, # accum=1, wd=1e-4: 9.24 + 1e-3, + 1e-2, + ]: + train_exp( + f"v6-bhv20-11gb-f32-bs15k-accgrad{acc}-mgpu4-pavg100-wd{str(wd).replace('-','_')}-lrlin1e_5_295k-bpe10k", + config_11gb_v6_f32_accgrad1_mgpu4_pavg100_wd1e_4, + config_updates={ + **_get_cfg_lrlin_oclr_by_bs_nep(15_000, 500), + "accum_grad_multiple_step": acc, + "optimizer.weight_decay": wd, + }, + ) train_exp( # 8.79 f"v6-bhv20-11gb-f32-bs15k-accgrad1-mgpu4-pavg100-wd1e_4-lrlin1e_5_295k-speedpertV2-bpe10k", From 5f620eeb0580d629e99c18a97a234da8d2350ccc Mon Sep 17 00:00:00 2001 From: Albert Zeyer Date: Thu, 6 Jun 2024 17:18:50 +0200 Subject: [PATCH 112/227] fix name --- users/zeyer/experiments/exp2024_04_23_baselines/ctc.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py b/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py index d8d936b04..c749b0941 100644 --- a/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py +++ 
b/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py @@ -63,7 +63,9 @@ def py(): 1e-2, ]: train_exp( - f"v6-bhv20-11gb-f32-bs15k-accgrad{acc}-mgpu4-pavg100-wd{str(wd).replace('-','_')}-lrlin1e_5_295k-bpe10k", + f"v6-bhv20-11gb-f32-bs15k-accgrad{acc}" + f"-mgpu4-pavg100-wd{('%.e'%wd).replace('e-0', 'e_')}" + f"-lrlin1e_5_295k-bpe10k", config_11gb_v6_f32_accgrad1_mgpu4_pavg100_wd1e_4, config_updates={ **_get_cfg_lrlin_oclr_by_bs_nep(15_000, 500), From 2f4fa9a6d0ac4d6907f19b0aab100d63c35977e9 Mon Sep 17 00:00:00 2001 From: Albert Zeyer Date: Thu, 6 Jun 2024 17:20:47 +0200 Subject: [PATCH 113/227] more --- users/zeyer/experiments/exp2024_04_23_baselines/ctc.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py b/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py index c749b0941..5a37e66a0 100644 --- a/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py +++ b/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py @@ -62,6 +62,8 @@ def py(): 1e-3, 1e-2, ]: + if (acc, wd) == (1, 1e-4): + continue # skip for now train_exp( f"v6-bhv20-11gb-f32-bs15k-accgrad{acc}" f"-mgpu4-pavg100-wd{('%.e'%wd).replace('e-0', 'e_')}" From 4e835195605293f5df35047c7c3dced5a77426df Mon Sep 17 00:00:00 2001 From: Albert Zeyer Date: Thu, 6 Jun 2024 17:24:19 +0200 Subject: [PATCH 114/227] more --- .../exp2024_04_23_baselines/ctc.py | 39 +++++++++---------- 1 file changed, 19 insertions(+), 20 deletions(-) diff --git a/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py b/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py index 5a37e66a0..e023ae15a 100644 --- a/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py +++ b/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py @@ -55,26 +55,25 @@ def py(): - Luca uses older behavior_version 21 -> 16. 
""" - for acc in [5, 1]: - for wd in [ - # 1e-5, # accum=5, wd=1e-5: 9.90 - 1e-4, # accum=1, wd=1e-4: 9.24 - 1e-3, - 1e-2, - ]: - if (acc, wd) == (1, 1e-4): - continue # skip for now - train_exp( - f"v6-bhv20-11gb-f32-bs15k-accgrad{acc}" - f"-mgpu4-pavg100-wd{('%.e'%wd).replace('e-0', 'e_')}" - f"-lrlin1e_5_295k-bpe10k", - config_11gb_v6_f32_accgrad1_mgpu4_pavg100_wd1e_4, - config_updates={ - **_get_cfg_lrlin_oclr_by_bs_nep(15_000, 500), - "accum_grad_multiple_step": acc, - "optimizer.weight_decay": wd, - }, - ) + for acc, wd in [ + # (5, 1e-5), # 9.90 + (5, 1e-3), + (5, 1e-2), + # (1, 1e-4), # 9.24 + (1, 1e-3), + (1, 1e-2), + ]: + train_exp( + f"v6-bhv20-11gb-f32-bs15k-accgrad{acc}" + f"-mgpu4-pavg100-wd{('%.e'%wd).replace('e-0', 'e_')}" + f"-lrlin1e_5_295k-bpe10k", + config_11gb_v6_f32_accgrad1_mgpu4_pavg100_wd1e_4, + config_updates={ + **_get_cfg_lrlin_oclr_by_bs_nep(15_000, 500), + "accum_grad_multiple_step": acc, + "optimizer.weight_decay": wd, + }, + ) train_exp( # 8.79 f"v6-bhv20-11gb-f32-bs15k-accgrad1-mgpu4-pavg100-wd1e_4-lrlin1e_5_295k-speedpertV2-bpe10k", From 8963e44194d790965e06bd8e858fccc096fdb7dc Mon Sep 17 00:00:00 2001 From: Albert Zeyer Date: Fri, 7 Jun 2024 09:26:54 +0200 Subject: [PATCH 115/227] small fix --- users/zeyer/experiments/exp2024_04_23_baselines/aed.py | 2 +- users/zeyer/experiments/exp2024_04_23_baselines/ctc.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/users/zeyer/experiments/exp2024_04_23_baselines/aed.py b/users/zeyer/experiments/exp2024_04_23_baselines/aed.py index 59572b25a..8099b81df 100644 --- a/users/zeyer/experiments/exp2024_04_23_baselines/aed.py +++ b/users/zeyer/experiments/exp2024_04_23_baselines/aed.py @@ -171,7 +171,7 @@ def train_exp( num_epochs=num_epochs, gpu_mem=gpu_mem, num_processes=num_processes, - distributed_launch_cmd="torchrun" if num_processes else None, + distributed_launch_cmd="torchrun" if num_processes else "mpirun", time_rqmt=time_rqmt, ) recog_training_exp(prefix, task, 
model_with_checkpoint, recog_def=model_recog) diff --git a/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py b/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py index e023ae15a..3606c888f 100644 --- a/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py +++ b/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py @@ -247,7 +247,7 @@ def train_exp( num_epochs=num_epochs, gpu_mem=gpu_mem, num_processes=num_processes, - distributed_launch_cmd="torchrun" if num_processes else None, + distributed_launch_cmd="torchrun" if num_processes else "mpirun", time_rqmt=time_rqmt, ) From f21297dfb634e74f13a33cd150453266dba049de Mon Sep 17 00:00:00 2001 From: Albert Zeyer Date: Fri, 7 Jun 2024 09:28:13 +0200 Subject: [PATCH 116/227] more --- .../experiments/exp2024_04_23_baselines/ctc.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py b/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py index 3606c888f..84eaf6a6c 100644 --- a/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py +++ b/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py @@ -55,6 +55,23 @@ def py(): - Luca uses older behavior_version 21 -> 16. 
""" + train_exp( + f"v6-bhv21-24gb-bf16-bs40k-accgrad2-wd1e_6-lrlin1e_5_450k-bpe10k", + config_24gb_v6, + config_updates={ + **_get_cfg_lrlin_oclr_by_bs_nep(40_000, 2000), + }, + ) + + train_exp( + f"v6-bhv21-24gb-bf16-bs40k-accgrad2-wd1e_6-lrlin1e_5_600k-bpe10k", + config_24gb_v6, + config_updates={ + **_get_cfg_lrlin_oclr_by_bs_nep(40_000, 2000), + "learning_rate_piecewise_steps": [600_000, 900_000, 982_000], + }, + ) + for acc, wd in [ # (5, 1e-5), # 9.90 (5, 1e-3), From 49438b92fe95093ac8de2facbf1fbef82f124257 Mon Sep 17 00:00:00 2001 From: Albert Zeyer Date: Fri, 7 Jun 2024 09:35:06 +0200 Subject: [PATCH 117/227] more --- .../experiments/exp2024_04_23_baselines/ctc.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py b/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py index 84eaf6a6c..257dc1374 100644 --- a/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py +++ b/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py @@ -123,6 +123,19 @@ def py(): vocab=vocab, ) + train_exp( + f"v6-bhv20-11gb-f32-bs15k-accgrad1-mgpu4-pavg100-wd1e_2-lrlin1e_5_393k-speedpertV2-bpe10k", + config_11gb_v6_f32_accgrad1_mgpu4_pavg100_wd1e_4, + config_updates={ + **_get_cfg_lrlin_oclr_by_bs_nep(15_000, 500), + "learning_rate_piecewise_steps": [393_000, 590_000, 652_000], # total steps after 500 epochs: ~652k + "optimizer.weight_decay": 1e-2, + "__train_audio_preprocess": speed_pert_librosa_config, + "speed_pert_discrete_values": [0.7, 0.8, 0.9, 1.0, 1.1], + }, + vocab="bpe10k", + ) + for alpha in [ 0.3, # 7.88 0.5, # 7.13 From 16b34cc858509637e4a65309db2539225eb5750a Mon Sep 17 00:00:00 2001 From: Albert Zeyer Date: Fri, 7 Jun 2024 11:26:47 +0200 Subject: [PATCH 118/227] cleanup --- .../exp2024_04_23_baselines/ctc.py | 63 +++++++------------ 1 file changed, 22 insertions(+), 41 deletions(-) diff --git a/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py 
b/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py index 257dc1374..9659ffd21 100644 --- a/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py +++ b/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py @@ -172,50 +172,31 @@ def py(): train_vocab_opts={"other_opts": {"enable_sampling": True, "alpha": 0.7}}, ) - train_exp( - "v6-bhv20-11gb-f32-bs15k-accgrad1-mgpu4-pavg100-wd1e_2-lrlin1e_5_295k-featBN-speedpertV2-spm10k-spmSample07", - config_11gb_v6_f32_accgrad1_mgpu4_pavg100_wd1e_4, - model_config={"feature_batch_norm": True}, - config_updates={ - **_get_cfg_lrlin_oclr_by_bs_nep(15_000, 500), - "optimizer.weight_decay": 1e-2, - "__train_audio_preprocess": speed_pert_librosa_config, - "speed_pert_discrete_values": [0.7, 0.8, 0.9, 1.0, 1.1], - }, - vocab="spm10k", - train_vocab_opts={"other_opts": {"enable_sampling": True, "alpha": 0.7}}, - ) - - train_exp( - "v6-bhv20-11gb-f32-bs15k-accgrad1-mgpu4-pavg100-wd1e_2-lrlin1e_5_295k-featNorm-speedpertV2-spm10k-spmSample07", - config_11gb_v6_f32_accgrad1_mgpu4_pavg100_wd1e_4, - model_config={"feature_norm": True}, - config_updates={ - **_get_cfg_lrlin_oclr_by_bs_nep(15_000, 500), - "optimizer.weight_decay": 1e-2, - "__train_audio_preprocess": speed_pert_librosa_config, - "speed_pert_discrete_values": [0.7, 0.8, 0.9, 1.0, 1.1], - }, - vocab="spm10k", - train_vocab_opts={"other_opts": {"enable_sampling": True, "alpha": 0.7}}, - ) - from i6_experiments.users.zeyer.datasets.librispeech import get_librispeech_log_mel_stats feature_stats = get_librispeech_log_mel_stats(_log_mel_feature_dim) - train_exp( - "v6-bhv20-11gb-f32-bs15k-accgrad1-mgpu4-pavg100-wd1e_2-lrlin1e_5_295k-featGN-speedpertV2-spm10k-spmSample07", - config_11gb_v6_f32_accgrad1_mgpu4_pavg100_wd1e_4, - model_config={"feature_stats": {"mean": feature_stats.mean, "std_dev": feature_stats.std_dev}}, - config_updates={ - **_get_cfg_lrlin_oclr_by_bs_nep(15_000, 500), - "optimizer.weight_decay": 1e-2, - "__train_audio_preprocess": speed_pert_librosa_config, - 
"speed_pert_discrete_values": [0.7, 0.8, 0.9, 1.0, 1.1], - }, - vocab="spm10k", - train_vocab_opts={"other_opts": {"enable_sampling": True, "alpha": 0.7}}, - ) + + # Test different feature normalization schemes. + for name, model_opts in { + None: None, + "featBN": {"feature_batch_norm": True}, + "featNorm": {"feature_norm": True}, + "featGN": {"feature_stats": {"mean": feature_stats.mean, "std_dev": feature_stats.std_dev}}, + }.items(): + train_exp( + "v6-bhv20-11gb-f32-bs15k-accgrad1-mgpu4-pavg100-wd1e_2-lrlin1e_5_295k-" + f"{(name + '-') if name else ''}speedpertV2-spm10k-spmSample07", + config_11gb_v6_f32_accgrad1_mgpu4_pavg100_wd1e_4, + model_config=model_opts, + config_updates={ + **_get_cfg_lrlin_oclr_by_bs_nep(15_000, 500), + "optimizer.weight_decay": 1e-2, + "__train_audio_preprocess": speed_pert_librosa_config, + "speed_pert_discrete_values": [0.7, 0.8, 0.9, 1.0, 1.1], + }, + vocab="spm10k", + train_vocab_opts={"other_opts": {"enable_sampling": True, "alpha": 0.7}}, + ) # noinspection PyShadowingNames From acdfdf47483f483abb6f6bbce3e4895f9b863260 Mon Sep 17 00:00:00 2001 From: Mohammad Zeineldeen Date: Fri, 7 Jun 2024 11:49:28 +0000 Subject: [PATCH 119/227] fix --- .../librispeech_960/chunkwise_attention_asr_config.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/users/zeineldeen/experiments/chunkwise_att_2023/librispeech_960/chunkwise_attention_asr_config.py b/users/zeineldeen/experiments/chunkwise_att_2023/librispeech_960/chunkwise_attention_asr_config.py index cab33c1b0..bbbac39c1 100644 --- a/users/zeineldeen/experiments/chunkwise_att_2023/librispeech_960/chunkwise_attention_asr_config.py +++ b/users/zeineldeen/experiments/chunkwise_att_2023/librispeech_960/chunkwise_attention_asr_config.py @@ -933,6 +933,8 @@ def create_config( decoder_args = asdict(decoder_args) decoder_args.update({"target": target, "beam_size": beam_size}) + chunked_decoder_trained_with_fs = False + if chunked_decoder: decoder_args["enc_chunks_dim"] = 
chunked_time_dim decoder_args["enc_time_dim"] = chunk_size_dim @@ -941,6 +943,7 @@ def create_config( decoder_args["enable_check_align"] = enable_check_align # just here to keep some old changes if decoder_args["full_sum_simple_approx"] and is_recog: + chunked_decoder_trained_with_fs = True decoder_args["full_sum_simple_approx"] = False decoder_args["masked_computation_blank_idx"] = eoc_idx elif chunk_size and not dump_ctc_dataset and not dump_ctc and not dump_alignments_dataset: @@ -1085,7 +1088,7 @@ def create_config( exp_config["network"]["output"]["unit"].pop("s", None) # change inputs - if decoder_args["full_sum_simple_approx"]: + if decoder_args["full_sum_simple_approx"] or chunked_decoder_trained_with_fs: exp_config["network"]["output"]["unit"]["s_wo_att"]["from"] = "prev_target_embed" else: exp_config["network"]["output"]["unit"]["s_wo_att"]["from"] = "prev:target_embed" From 50ba89a8b7ac7b3320649e2f84fdaada00d50bcb Mon Sep 17 00:00:00 2001 From: Albert Zeyer Date: Fri, 7 Jun 2024 11:28:11 +0200 Subject: [PATCH 120/227] comment --- users/zeyer/experiments/exp2024_04_23_baselines/ctc.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py b/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py index 9659ffd21..07743cec6 100644 --- a/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py +++ b/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py @@ -179,9 +179,9 @@ def py(): # Test different feature normalization schemes. 
for name, model_opts in { None: None, - "featBN": {"feature_batch_norm": True}, - "featNorm": {"feature_norm": True}, - "featGN": {"feature_stats": {"mean": feature_stats.mean, "std_dev": feature_stats.std_dev}}, + "featBN": {"feature_batch_norm": True}, # batch norm + "featNorm": {"feature_norm": True}, # normalize (on sequence level) + "featGN": {"feature_stats": {"mean": feature_stats.mean, "std_dev": feature_stats.std_dev}}, # global norm }.items(): train_exp( "v6-bhv20-11gb-f32-bs15k-accgrad1-mgpu4-pavg100-wd1e_2-lrlin1e_5_295k-" From f34dfdffcb2cad3d4c9bc17496c22c58e6f4d8b7 Mon Sep 17 00:00:00 2001 From: Albert Zeyer Date: Fri, 7 Jun 2024 12:01:02 +0200 Subject: [PATCH 121/227] more --- .../exp2024_04_23_baselines/ctc.py | 23 +++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py b/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py index 07743cec6..e79993e77 100644 --- a/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py +++ b/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py @@ -140,6 +140,7 @@ def py(): 0.3, # 7.88 0.5, # 7.13 0.7, # 6.99 + 0.8, ]: train_exp( "v6-bhv20-11gb-f32-bs15k-accgrad1-mgpu4-pavg100-wd1e_2-lrlin1e_5_295k-speedpertV2-spm10k" @@ -157,6 +158,28 @@ def py(): # v6-bhv20-11gb-f32-bs15k-accgrad1-mgpu4-pavg100-wd1e_2-lrlin1e_5_295k-speedpertV2 # with spm_bpe10k and enable_sampling, alpha in {0.3, 0.7} was both very bad (90% WER). + # But actually, alpha for BPE has a very different effect, and it causes the seq len to be much longer. + # The higher the alpha, the longer (the reverse as for SPM Unigram). + # See archive/returnn-spm_bpe10-sample.config. 
+ for alpha in [ + 0.005, + 0.01, + # 0.3, # broken + # 0.7, # broken + ]: + train_exp( + "v6-bhv20-11gb-f32-bs15k-accgrad1-mgpu4-pavg100-wd1e_2-lrlin1e_5_295k-speedpertV2-spm_bpe10k" + f"-spmSample{str(alpha).replace('.', '')}", + config_11gb_v6_f32_accgrad1_mgpu4_pavg100_wd1e_4, + config_updates={ + **_get_cfg_lrlin_oclr_by_bs_nep(15_000, 500), + "optimizer.weight_decay": 1e-2, + "__train_audio_preprocess": speed_pert_librosa_config, + "speed_pert_discrete_values": [0.7, 0.8, 0.9, 1.0, 1.1], + }, + vocab="spm_bpe10k", + train_vocab_opts={"other_opts": {"enable_sampling": True, "alpha": alpha}}, + ) train_exp( "v6-bhv20-11gb-f32-bs15k-accgrad1-mgpu4-pavg100-wd1e_2-lrlin1e_5_295k-speedpertV2-spm10k-eos-spmSample07", From 1a640e8b7f66967efda4ce7003237be1701da93b Mon Sep 17 00:00:00 2001 From: Albert Zeyer Date: Fri, 7 Jun 2024 15:49:57 +0200 Subject: [PATCH 122/227] cleanup --- .../exp2024_04_23_baselines/ctc.py | 49 +++++++------------ 1 file changed, 18 insertions(+), 31 deletions(-) diff --git a/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py b/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py index e79993e77..a2022eb34 100644 --- a/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py +++ b/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py @@ -136,14 +136,23 @@ def py(): vocab="bpe10k", ) - for alpha in [ - 0.3, # 7.88 - 0.5, # 7.13 - 0.7, # 6.99 - 0.8, + # Testing different vocabs together with sampling. + for vocab, alpha in [ + # See archive/returnn-spm10-sample.config for playing around with alpha and checking avg seq len. + ("spm10k", 0.3), # 7.88 + ("spm10k", 0.5), # 7.13 + ("spm10k", 0.7), # 6.99 + ("spm10k", 0.8), + # alpha for BPE has a very different effect, and it causes the seq len to be much longer. + # The higher the alpha, the longer (the reverse as for SPM Unigram). + # See archive/returnn-spm_bpe10-sample.config. 
+ ("spm_bpe10k", 0.005), + ("spm_bpe10k", 0.01), + # ("spm_bpe10k", 0.3), # broken + # ("spm_bpe10k", 0.7), # broken ]: train_exp( - "v6-bhv20-11gb-f32-bs15k-accgrad1-mgpu4-pavg100-wd1e_2-lrlin1e_5_295k-speedpertV2-spm10k" + f"v6-bhv20-11gb-f32-bs15k-accgrad1-mgpu4-pavg100-wd1e_2-lrlin1e_5_295k-speedpertV2-{vocab}" f"-spmSample{str(alpha).replace('.', '')}", config_11gb_v6_f32_accgrad1_mgpu4_pavg100_wd1e_4, config_updates={ @@ -152,34 +161,12 @@ def py(): "__train_audio_preprocess": speed_pert_librosa_config, "speed_pert_discrete_values": [0.7, 0.8, 0.9, 1.0, 1.1], }, - vocab="spm10k", + vocab=vocab, train_vocab_opts={"other_opts": {"enable_sampling": True, "alpha": alpha}}, ) - # v6-bhv20-11gb-f32-bs15k-accgrad1-mgpu4-pavg100-wd1e_2-lrlin1e_5_295k-speedpertV2 - # with spm_bpe10k and enable_sampling, alpha in {0.3, 0.7} was both very bad (90% WER). - # But actually, alpha for BPE has a very different effect, and it causes the seq len to be much longer. - # The higher the alpha, the longer (the reverse as for SPM Unigram). - # See archive/returnn-spm_bpe10-sample.config. - for alpha in [ - 0.005, - 0.01, - # 0.3, # broken - # 0.7, # broken - ]: - train_exp( - "v6-bhv20-11gb-f32-bs15k-accgrad1-mgpu4-pavg100-wd1e_2-lrlin1e_5_295k-speedpertV2-spm_bpe10k" - f"-spmSample{str(alpha).replace('.', '')}", - config_11gb_v6_f32_accgrad1_mgpu4_pavg100_wd1e_4, - config_updates={ - **_get_cfg_lrlin_oclr_by_bs_nep(15_000, 500), - "optimizer.weight_decay": 1e-2, - "__train_audio_preprocess": speed_pert_librosa_config, - "speed_pert_discrete_values": [0.7, 0.8, 0.9, 1.0, 1.1], - }, - vocab="spm_bpe10k", - train_vocab_opts={"other_opts": {"enable_sampling": True, "alpha": alpha}}, - ) + # TODO SamplingBytePairEncoding for orig bpe10k, breadth_prob=0.01 + # See archive/returnn-bpe10-sample.config. 
train_exp( "v6-bhv20-11gb-f32-bs15k-accgrad1-mgpu4-pavg100-wd1e_2-lrlin1e_5_295k-speedpertV2-spm10k-eos-spmSample07", From 1725cded5edf583361132691410700a124a5c4a7 Mon Sep 17 00:00:00 2001 From: Albert Zeyer Date: Fri, 7 Jun 2024 16:14:43 +0200 Subject: [PATCH 123/227] more --- .../experiments/exp2024_04_23_baselines/ctc.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py b/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py index a2022eb34..c65c7ffde 100644 --- a/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py +++ b/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py @@ -143,17 +143,20 @@ def py(): ("spm10k", 0.5), # 7.13 ("spm10k", 0.7), # 6.99 ("spm10k", 0.8), - # alpha for BPE has a very different effect, and it causes the seq len to be much longer. + # alpha for SPM-BPE has a very different effect, and it causes the seq len to be much longer. # The higher the alpha, the longer (the reverse as for SPM Unigram). # See archive/returnn-spm_bpe10-sample.config. ("spm_bpe10k", 0.005), ("spm_bpe10k", 0.01), # ("spm_bpe10k", 0.3), # broken # ("spm_bpe10k", 0.7), # broken + # alpha for BPE is again a bit different, but more similar to SPM-BPE than SPM-Unigram. + # See archive/returnn-bpe10-sample.config. 
+ ("bpe10k", 0.01), ]: train_exp( f"v6-bhv20-11gb-f32-bs15k-accgrad1-mgpu4-pavg100-wd1e_2-lrlin1e_5_295k-speedpertV2-{vocab}" - f"-spmSample{str(alpha).replace('.', '')}", + f"-{'spmSample' if vocab.startswith('spm') else 'bpeSample'}{str(alpha).replace('.', '')}", config_11gb_v6_f32_accgrad1_mgpu4_pavg100_wd1e_4, config_updates={ **_get_cfg_lrlin_oclr_by_bs_nep(15_000, 500), @@ -162,12 +165,15 @@ def py(): "speed_pert_discrete_values": [0.7, 0.8, 0.9, 1.0, 1.1], }, vocab=vocab, - train_vocab_opts={"other_opts": {"enable_sampling": True, "alpha": alpha}}, + train_vocab_opts={ + "other_opts": ( + {"enable_sampling": True, "alpha": alpha} + if vocab.startswith("spm") + else {"class": "SamplingBytePairEncoding", "breadth_prob": alpha} + ) + }, ) - # TODO SamplingBytePairEncoding for orig bpe10k, breadth_prob=0.01 - # See archive/returnn-bpe10-sample.config. - train_exp( "v6-bhv20-11gb-f32-bs15k-accgrad1-mgpu4-pavg100-wd1e_2-lrlin1e_5_295k-speedpertV2-spm10k-eos-spmSample07", config_11gb_v6_f32_accgrad1_mgpu4_pavg100_wd1e_4, From 56543ca18a3a3eb9ae3595716cf00ad6005c9c62 Mon Sep 17 00:00:00 2001 From: Mohammad Zeineldeen Date: Fri, 7 Jun 2024 17:13:41 +0200 Subject: [PATCH 124/227] add canary 1b recog sis prepare config --- .../canary_aed/configs/canary_1b_recog.py | 27 +++++++++++++++++++ 1 file changed, 27 insertions(+) create mode 100644 users/zeineldeen/experiments/canary_aed/configs/canary_1b_recog.py diff --git a/users/zeineldeen/experiments/canary_aed/configs/canary_1b_recog.py b/users/zeineldeen/experiments/canary_aed/configs/canary_1b_recog.py new file mode 100644 index 000000000..e50468767 --- /dev/null +++ b/users/zeineldeen/experiments/canary_aed/configs/canary_1b_recog.py @@ -0,0 +1,27 @@ +from sisyphus import * + +from i6_core.datasets.huggingface import DownloadAndPrepareHuggingFaceDatasetJob + +test_sets = ["ami", "earnings22", "gigaspeech"] + + +def download_test_datasets(): + # for downloading gigaspeech, a token is required. 
I login to huggingface and generate a token and then + # run the command `huggingface-cli login` and paste the token + + for test_set in test_sets: + j = DownloadAndPrepareHuggingFaceDatasetJob( + path="open-asr-leaderboard/datasets-test-only", + name=test_set, + split="test", + time_rqmt=24, + mem_rqmt=4, + cpu_rqmt=4, + mini_task=True, + token=True, + ) + tk.register_output(f"datasets/{test_set}", j.out_dir) + + +def py(): + download_test_datasets() From 0beee4cef76badb04aef6a442be94a9a1e774e64 Mon Sep 17 00:00:00 2001 From: Judyxujj Date: Mon, 10 Jun 2024 01:06:53 +0800 Subject: [PATCH 125/227] add config (#223) Co-authored-by: Jingjing Xu --- .../jointly_train_simple_top_k_layerwise.py | 238 ++++++++++++++ .../jointly_train_simple_top_k_layerwise.py | 300 ++++++++++++++++++ 2 files changed, 538 insertions(+) create mode 100644 users/jxu/experiments/ctc/tedlium2/configs/dynamic_encoder_size/simple_topk_refactored/jointly_train_simple_top_k_layerwise.py create mode 100644 users/jxu/experiments/ctc/tedlium2/pytorch_networks/dynamic_encoder_size/simple_topk_refactored/jointly_train_simple_top_k_layerwise.py diff --git a/users/jxu/experiments/ctc/tedlium2/configs/dynamic_encoder_size/simple_topk_refactored/jointly_train_simple_top_k_layerwise.py b/users/jxu/experiments/ctc/tedlium2/configs/dynamic_encoder_size/simple_topk_refactored/jointly_train_simple_top_k_layerwise.py new file mode 100644 index 000000000..4dff7cf8b --- /dev/null +++ b/users/jxu/experiments/ctc/tedlium2/configs/dynamic_encoder_size/simple_topk_refactored/jointly_train_simple_top_k_layerwise.py @@ -0,0 +1,238 @@ +import functools +from typing import Any, Dict, List, Optional, Union +import copy + +import i6_core.returnn as returnn +import i6_experiments.users.jxu.experiments.ctc.tedlium2.configs.configs_helper as configs_helper +from i6_experiments.users.berger.systems.dataclasses import ReturnnConfigs +from i6_experiments.common.setups.returnn_pytorch.serialization import Collection +from 
i6_experiments.users.berger.systems.dataclasses import ConfigVariant + +# ********** Constant values ********** + +num_outputs = 79 +num_subepochs = 250 + + +# ********** Settings ********** + +def get_returnn_config( + network: Optional[Dict] = None, + *, + target: Optional[str] = "classes", + num_inputs: Optional[int] = None, + num_outputs: Optional[int] = None, + python_prolog: Optional[Union[List, Dict]] = None, + extern_data_config: bool = False, + extra_python: Optional[List] = None, + extra_config: Optional[Dict] = None, + hash_full_python_code: bool = False, + **kwargs, +) -> returnn.ReturnnConfig: + python_prolog = python_prolog or ["import numpy as np"] + extra_python = extra_python or [] + config_dict: dict[str, Any] = {"target": target} + if num_inputs is not None: + config_dict["num_inputs"] = num_inputs + if num_outputs is not None: + config_dict["num_outputs"] = {target: num_outputs} + if extern_data_config: + config_dict.update( + configs_helper.get_extern_data_config(num_inputs=num_inputs, num_outputs=num_outputs, target=target, + **kwargs) + ) + config_dict.update(configs_helper.get_base_config()) + + if network: + config_dict.update({"network:": network}) + + lrate_config = configs_helper.get_oclr_config(**kwargs) + config_dict.update(lrate_config) + + config_dict.update(configs_helper.get_base_regularization_config(**kwargs)) + + if extra_config: + config_dict.update(extra_config) + + post_config_dict = {} + post_config_dict.update(configs_helper.get_base_post_config(**kwargs)) + + return returnn.ReturnnConfig( + config=config_dict, + post_config=post_config_dict, + hash_full_python_code=hash_full_python_code, + python_prolog=python_prolog, + python_epilog=extra_python, + pprint_kwargs={"sort_dicts": False}, + ) + + +def get_serializer(model_config, variant: ConfigVariant, in_dim: int = 1) -> Collection: + from 
i6_experiments.users.jxu.experiments.ctc.tedlium2.pytorch_networks.dynamic_encoder_size.simple_topk_with_new_i6_models.joint_train_two_model_simple_top_k_modwise import \ + get_train_serializer, get_recog_serializer, get_prior_serializer + if variant == ConfigVariant.TRAIN: + return get_train_serializer(model_config) + if variant == ConfigVariant.PRIOR: + return get_prior_serializer(model_config) + if variant == ConfigVariant.RECOG: + return get_recog_serializer(model_config) + raise NotImplementedError + + +def returnn_config_generator(train_data_config: dict, dev_data_config: dict, peak_lr: float) -> dict: + from i6_experiments.users.jxu.experiments.ctc.tedlium2.pytorch_networks.dynamic_encoder_size.simple_topk_with_new_i6_models.joint_train_two_model_simple_top_k_modwise import \ + get_default_config_v1 as get_train_config + from i6_experiments.users.jxu.experiments.ctc.tedlium2.pytorch_networks.dynamic_encoder_size.simple_topk_with_new_i6_models.joint_train_two_model_simple_top_k_modwise import \ + get_default_config_v1 as get_recog_config + + extra_config = { + "train": train_data_config, + "dev": dev_data_config, + } + recog_extra_config = copy.deepcopy(extra_config) + recog_extra_config["model_outputs"] = {"classes": {"dim": num_outputs}} + + config_partial = functools.partial( + get_returnn_config, + num_epochs=num_subepochs, + num_inputs=50, + num_outputs=num_outputs, + target="targets", + extern_data_config=True, + grad_noise=0.0, + grad_clip=0.0, + cycle_epoch=110, + initial_lr=peak_lr / 100, + peak_lr=peak_lr, + final_lr=1e-08, + batch_size=15000, + extra_config=extra_config, + ) + + def get_returnn_configs(train_config, recog_config): + return ReturnnConfigs( + train_config=config_partial( + extra_python=[get_serializer(train_config, ConfigVariant.TRAIN)], + extra_config=extra_config), + prior_config=config_partial(extra_python=[get_serializer(recog_config, ConfigVariant.PRIOR)], + extra_config=extra_config), + recog_configs={ + "recog": 
config_partial(extra_python=[get_serializer(recog_config, ConfigVariant.RECOG)], + extra_config=recog_extra_config)}, + ) + + # ----------------------------- gumbel_scale 0.05 gumble_top_k_dropout_0_3 init tau 0.5 k annealing every 20 sub-epoch --------------------------------------- + + experiments = {} + for k_annealing_step in [25]: + for layer_dropout_stage_1, layer_dropout_stage_2 in [(0, 0.3)]: + layer_dropout_kwargs = {"layer_dropout_stage_1": layer_dropout_stage_1, + "layer_dropout_stage_2": layer_dropout_stage_2} + k_anneal_kwargs = {"k_anneal_num_steps_per_iter": 1400 * k_annealing_step, "k_reduction_per_iter": 4} + network_args = {"layer_dropout_kwargs": layer_dropout_kwargs, + "k_anneal_kwargs": k_anneal_kwargs, "num_layers_set": [24, 48], "recog_num_layers": 48} + + train_config = get_train_config(num_inputs=50, num_outputs=num_outputs, + network_args=network_args) + recog_network_args = copy.deepcopy(network_args) + recog_network_args["recog_num_layers"] = 48 + num_recog_mods_48_recog_config = get_recog_config(num_inputs=50, num_outputs=num_outputs, + network_args=recog_network_args) + recog_network_args["recog_num_layers"] = 24 + num_recog_mods_24_recog_config = get_recog_config(num_inputs=50, num_outputs=num_outputs, + network_args=recog_network_args) + + num_recog_mods_48_experiment_name = f"simple_topk_annealing_step_{k_annealing_step}_6_dropout_{layer_dropout_stage_1}_{layer_dropout_stage_2}_recog_num_mods_48_peak_lr_{peak_lr}" + num_recog_mods_48_experiment_name = num_recog_mods_48_experiment_name.replace(".", "_") + experiments[num_recog_mods_48_experiment_name] = get_returnn_configs(train_config, + num_recog_mods_48_recog_config) + + num_recog_mods_24_experiment_name = f"simple_topk_annealing_step_{k_annealing_step}_6_dropout_{layer_dropout_stage_1}_{layer_dropout_stage_2}_recog_num_mods_24_peak_lr_{peak_lr}" + num_recog_mods_24_experiment_name = num_recog_mods_24_experiment_name.replace(".", "_") + 
experiments[num_recog_mods_24_experiment_name] = get_returnn_configs(train_config, + num_recog_mods_24_recog_config) + + for k_annealing_step, k_reduction_per_iter in [(19, 4)]: + for layer_dropout_stage_1, layer_dropout_stage_2 in [(0, 0.4)]: + layer_dropout_kwargs = {"layer_dropout_stage_1": layer_dropout_stage_1, + "layer_dropout_stage_2": layer_dropout_stage_2} + k_anneal_kwargs = {"k_anneal_num_steps_per_iter": 1400 * k_annealing_step, + "k_reduction_per_iter": k_reduction_per_iter} + network_args = {"layer_dropout_kwargs": layer_dropout_kwargs, + "k_anneal_kwargs": k_anneal_kwargs, "recog_num_layers": 48, "num_layers_set": [16, 32, 48]} + + train_config = get_train_config(num_inputs=50, num_outputs=num_outputs, + network_args=network_args) + recog_network_args = copy.deepcopy(network_args) + recog_network_args["recog_num_layers"] = 48 + num_recog_mods_48_recog_config = get_recog_config(num_inputs=50, num_outputs=num_outputs, + network_args=recog_network_args) + + recog_network_args["recog_num_layers"] = 32 + num_recog_mods_32_recog_config = get_recog_config(num_inputs=50, num_outputs=num_outputs, + network_args=recog_network_args) + + recog_network_args["recog_num_layers"] = 16 + num_recog_mods_16_recog_config = get_recog_config(num_inputs=50, num_outputs=num_outputs, + network_args=recog_network_args) + + num_recog_mods_48_experiment_name = f"three_models_k_annealing_step_{k_annealing_step}_{32 // k_reduction_per_iter}_dropout_{layer_dropout_stage_1}_{layer_dropout_stage_2}_recog_num_mods_48_peak_lr_{peak_lr}" + num_recog_mods_48_experiment_name = num_recog_mods_48_experiment_name.replace(".", "_") + experiments[num_recog_mods_48_experiment_name] = get_returnn_configs(train_config, + num_recog_mods_48_recog_config) + + num_recog_mods_32_experiment_name = f"three_models_k_annealing_step_{k_annealing_step}_{32 // k_reduction_per_iter}_dropout_{layer_dropout_stage_1}_{layer_dropout_stage_2}_recog_num_mods_32_peak_lr_{peak_lr}" + 
num_recog_mods_32_experiment_name = num_recog_mods_32_experiment_name.replace(".", "_") + experiments[num_recog_mods_32_experiment_name] = get_returnn_configs(train_config, + num_recog_mods_32_recog_config) + + num_recog_mods_16_experiment_name = f"three_models_k_annealing_step_{k_annealing_step}_{32 // k_reduction_per_iter}_dropout_{layer_dropout_stage_1}_{layer_dropout_stage_2}_recog_num_mods_16_peak_lr_{peak_lr}" + num_recog_mods_16_experiment_name = num_recog_mods_16_experiment_name.replace(".", "_") + experiments[num_recog_mods_16_experiment_name] = get_returnn_configs(train_config, + num_recog_mods_16_recog_config) + + + for k_annealing_step, k_reduction_per_iter in [(50, 12), (4.1, 1)]: + for layer_dropout_stage_1, layer_dropout_stage_2 in [(0, 0.3), (0.1, 0.3)]: + layer_dropout_kwargs = {"layer_dropout_stage_1": layer_dropout_stage_1, + "layer_dropout_stage_2": layer_dropout_stage_2} + k_anneal_kwargs = {"k_anneal_num_steps_per_iter": 1400 * k_annealing_step, "k_reduction_per_iter": k_reduction_per_iter} + network_args = {"layer_dropout_kwargs": layer_dropout_kwargs, + "k_anneal_kwargs": k_anneal_kwargs, "recog_num_layers": 48, "num_layers_set": [12, 24, 36, 48]} + + train_config = get_train_config(num_inputs=50, num_outputs=num_outputs, + network_args=network_args) + recog_network_args = copy.deepcopy(network_args) + recog_network_args["recog_num_layers"] = 48 + num_recog_mods_48_recog_config = get_recog_config(num_inputs=50, num_outputs=num_outputs, + network_args=recog_network_args) + + recog_network_args["recog_num_layers"] = 36 + num_recog_mods_36_recog_config = get_recog_config(num_inputs=50, num_outputs=num_outputs, + network_args=recog_network_args) + + recog_network_args["recog_num_layers"] = 24 + num_recog_mods_24_recog_config = get_recog_config(num_inputs=50, num_outputs=num_outputs, + network_args=recog_network_args) + + recog_network_args["recog_num_layers"] = 12 + num_recog_mods_12_recog_config = get_recog_config(num_inputs=50, 
num_outputs=num_outputs, + network_args=recog_network_args) + + num_recog_mods_48_experiment_name = f"four_models_k_annealing_step_{k_annealing_step}_{36//k_reduction_per_iter}_dropout_{layer_dropout_stage_1}_{layer_dropout_stage_2}_recog_num_mods_48_peak_lr_{peak_lr}" + num_recog_mods_48_experiment_name = num_recog_mods_48_experiment_name.replace(".", "_") + experiments[num_recog_mods_48_experiment_name] = get_returnn_configs(train_config, num_recog_mods_48_recog_config) + + num_recog_mods_36_experiment_name = f"four_models_k_annealing_step_{k_annealing_step}_{36//k_reduction_per_iter}_dropout_{layer_dropout_stage_1}_{layer_dropout_stage_2}_recog_num_mods_36_peak_lr_{peak_lr}" + num_recog_mods_36_experiment_name = num_recog_mods_36_experiment_name.replace(".", "_") + experiments[num_recog_mods_36_experiment_name] = get_returnn_configs(train_config, num_recog_mods_36_recog_config) + + num_recog_mods_24_experiment_name = f"four_models_k_annealing_step_{k_annealing_step}_{36//k_reduction_per_iter}_dropout_{layer_dropout_stage_1}_{layer_dropout_stage_2}_recog_num_mods_24_peak_lr_{peak_lr}" + num_recog_mods_24_experiment_name = num_recog_mods_24_experiment_name.replace(".", "_") + experiments[num_recog_mods_24_experiment_name] = get_returnn_configs(train_config, num_recog_mods_24_recog_config) + + num_recog_mods_12_experiment_name = f"four_models_k_annealing_step_{k_annealing_step}_{36//k_reduction_per_iter}_dropout_{layer_dropout_stage_1}_{layer_dropout_stage_2}_recog_num_mods_12_peak_lr_{peak_lr}" + num_recog_mods_12_experiment_name = num_recog_mods_12_experiment_name.replace(".", "_") + experiments[num_recog_mods_12_experiment_name] = get_returnn_configs(train_config, num_recog_mods_12_recog_config) + + return experiments diff --git a/users/jxu/experiments/ctc/tedlium2/pytorch_networks/dynamic_encoder_size/simple_topk_refactored/jointly_train_simple_top_k_layerwise.py 
b/users/jxu/experiments/ctc/tedlium2/pytorch_networks/dynamic_encoder_size/simple_topk_refactored/jointly_train_simple_top_k_layerwise.py new file mode 100644 index 000000000..af58af0a4 --- /dev/null +++ b/users/jxu/experiments/ctc/tedlium2/pytorch_networks/dynamic_encoder_size/simple_topk_refactored/jointly_train_simple_top_k_layerwise.py @@ -0,0 +1,300 @@ +from dataclasses import dataclass +from typing import Optional +from collections import OrderedDict + +import torch +from torch import nn +from typing import Tuple +from returnn.tensor.tensor_dict import TensorDict +import returnn.frontend as rf +import numpy as np + +from i6_models.parts.frontend.vgg_act import VGG4LayerActFrontendV1, VGG4LayerActFrontendV1Config +from i6_models.parts.conformer.convolution import ConformerConvolutionV1Config +from i6_models.parts.conformer.norm import LayerNormNC +from i6_models.parts.conformer.mhsa import ConformerMHSAV1Config +from i6_models.parts.conformer.feedforward import ConformerPositionwiseFeedForwardV1Config +from i6_models.assemblies.conformer_with_dynamic_model_size.selection_with_simple_top_k import ( + ConformerBlockConfig, + ConformerEncoderConfig, + ConformerEncoder +) +from i6_models.primitives.specaugment import specaugment_v1_by_length +from i6_models.config import ModelConfiguration, ModuleFactoryV1 + +from i6_experiments.users.berger.pytorch.models.util import lengths_to_padding_mask +from i6_experiments.common.setups.returnn_pytorch.serialization import Collection +from i6_experiments.users.berger.pytorch.serializers.basic import ( + get_basic_pt_network_serializer, +) + +from i6_experiments.common.setups.serialization import Import, CodeFromFunction, NonhashedCode + + +@dataclass +class ConformerCTCConfig(ModelConfiguration): + conformer_cfg: ConformerEncoderConfig + target_size: int + recog_num_layers: int + k_anneal_kwargs: dict + + +class ConformerCTCModel(torch.nn.Module): + def __init__(self, step: int, cfg: ConformerCTCConfig, **kwargs): + 
super().__init__() + self.conformer = ConformerEncoder(cfg.conformer_cfg) + self.final_linear_list = torch.nn.ModuleList( + [nn.Linear(cfg.conformer_cfg.block_cfg.ff_cfg.input_dim, cfg.target_size) for _ in + range(len(self.conformer.num_layers_set))]) + self.small_model_layers = None + self.recog_num_layers = cfg.recog_num_layers + self.conformer.recog_num_layers = self.recog_num_layers + self.k_anneal_kwargs = cfg.k_anneal_kwargs + self.export_mode = False + + def forward( + self, + audio_features: torch.Tensor, + audio_features_len: Optional[torch.Tensor] = None, + global_train_step: int = 0 + ): + if self.training: + x = specaugment_v1_by_length(audio_features, + time_min_num_masks=2, + time_max_mask_per_n_frames=25, + time_mask_max_size=20, + freq_min_num_masks=2, + freq_mask_max_size=5, + freq_max_num_masks=10) # [B, T, F] + else: + x = audio_features + sequence_mask = lengths_to_padding_mask(audio_features_len) + conformer_out_list, sequence_mask = self.conformer(x, sequence_mask, global_train_step, + self.k_anneal_kwargs) # [B, T, F] + + log_probs_list = [] + for i in range(len(conformer_out_list)): + logits = self.final_linear_list[i](conformer_out_list[i]) # [B, T, F] + log_probs = torch.log_softmax(logits, dim=2) + log_probs_list.append(log_probs) + + if self.training: + return log_probs_list, sequence_mask + + idx = self.conformer.num_layers_set.index(self.recog_num_layers) + return log_probs_list[idx] + +def get_default_config_v1(num_inputs: int, num_outputs: int, network_args: dict) -> ConformerCTCConfig: + dropout = 0.2 if "dropout" not in network_args else network_args["dropout"] + num_att_heads = 6 if "num_att_heads" not in network_args else network_args["num_att_heads"] + att_weights_dropout = 0.1 if "att_weights_dropout" not in network_args else network_args["att_weights_dropout"] + num_layers = 12 if "num_layers" not in network_args else network_args["num_layers"] + kernel_size = 31 if "kernel_size" not in network_args else 
network_args["kernel_size"] + num_layers_set = network_args["num_layers_set"] + layer_dropout_kwargs = network_args["layer_dropout_kwargs"] + recog_num_layers = network_args["recog_num_layers"] + k_anneal_kwargs = network_args["k_anneal_kwargs"] + + frontend_cfg = VGG4LayerActFrontendV1Config( + in_features=num_inputs, + conv1_channels=32, + conv2_channels=64, + conv3_channels=64, + conv4_channels=32, + conv_kernel_size=(3, 3), + conv_padding=None, + pool1_kernel_size=(2, 1), + pool1_stride=(2, 1), + pool1_padding=None, + pool2_kernel_size=(2, 1), + pool2_stride=(2, 1), + pool2_padding=None, + activation=torch.nn.ReLU(), + out_features=384, + ) + + frontend = ModuleFactoryV1(VGG4LayerActFrontendV1, frontend_cfg) + + ff_cfg = ConformerPositionwiseFeedForwardV1Config( + input_dim=384, + hidden_dim=1536, + dropout=dropout, + activation=torch.nn.SiLU(), + ) + + mhsa_cfg = ConformerMHSAV1Config( + input_dim=384, + num_att_heads=num_att_heads, + att_weights_dropout=att_weights_dropout, + dropout=dropout, + ) + + conv_cfg = ConformerConvolutionV1Config( + channels=384, + kernel_size=kernel_size, + dropout=dropout, + activation=torch.nn.SiLU(), + norm=LayerNormNC(384), + ) + + block_cfg = ConformerBlockConfig( + ff_cfg=ff_cfg, + mhsa_cfg=mhsa_cfg, + conv_cfg=conv_cfg, + layer_dropout=layer_dropout_kwargs["layer_dropout_stage_1"], + modules=["ff", "conv", "mhsa", "ff"], + scales=[0.5, 1.0, 1.0, 0.5] + ) + + + conformer_cfg = ConformerEncoderConfig( + num_layers=num_layers, + frontend=frontend, + block_cfg=block_cfg, + num_layers_set=num_layers_set, + layer_dropout_kwargs=layer_dropout_kwargs + ) + + return ConformerCTCConfig( + conformer_cfg=conformer_cfg, + target_size=num_outputs, + recog_num_layers=recog_num_layers, + k_anneal_kwargs=k_anneal_kwargs + ) + + +def train_step(*, model: torch.nn.Module, extern_data: TensorDict, global_train_step, **kwargs): + audio_features = extern_data["data"].raw_tensor + audio_features_len = 
extern_data["data"].dims[1].dyn_size_ext.raw_tensor + + targets = extern_data["targets"].raw_tensor.long() + targets_len = extern_data["targets"].dims[1].dyn_size_ext.raw_tensor + + model.train() + + log_probs_list, sequence_mask = model( + audio_features=audio_features, + audio_features_len=audio_features_len.to("cuda"), + global_train_step=global_train_step + ) + sequence_lengths = torch.sum(sequence_mask.type(torch.int32), dim=1) + + k_anneal_num_steps_per_iter = model.k_anneal_kwargs["k_anneal_num_steps_per_iter"] + k_reduction_per_iter = model.k_anneal_kwargs["k_reduction_per_iter"] + k_anneal_num_iters = (4 * len(model.conformer.module_list) - model.conformer.min_k) / k_reduction_per_iter + + loss_layers = model.conformer.num_layers_set + + # stage 1 : jointly train the largest and smallest model + if global_train_step <= k_anneal_num_steps_per_iter * k_anneal_num_iters: + loss_scales = [0.3, 1] + for i in [0, -1]: + log_probs = torch.transpose(log_probs_list[i], 0, 1) # [T, B, F] + loss = torch.nn.functional.ctc_loss( + log_probs=log_probs, + targets=targets, + input_lengths=sequence_lengths, + target_lengths=targets_len, + blank=0, + reduction="sum", + zero_infinity=True, + ) + + loss /= torch.sum(sequence_lengths) + rf.get_run_ctx().mark_as_loss(name=f"CTC_{loss_layers[i]}", loss=loss, scale=loss_scales[i]) + + # stage 2 : jointly train all models efficiently with sandwich rules + else: + if len(model.conformer.num_layers_set) <= 3: + loss_scales = [0.3]*(len(model.conformer.num_layers_set)-1) + [1] + else: + loss_scales = [0.3] + [0]*(len(model.conformer.num_layers_set)-2) + [1] + loss_scales[model.conformer.random_idx] = 0.3 + print(f"random_idx {model.conformer.random_idx}") + + for i in range(len(log_probs_list)): + log_probs = torch.transpose(log_probs_list[i], 0, 1) # [T, B, F] + + loss = torch.nn.functional.ctc_loss( + log_probs=log_probs, + targets=targets, + input_lengths=sequence_lengths, + target_lengths=targets_len, + blank=0, + 
reduction="sum", + zero_infinity=True, + ) + + loss /= torch.sum(sequence_lengths) + rf.get_run_ctx().mark_as_loss(name=f"CTC_{loss_layers[i]}", loss=loss, scale=loss_scales[i]) + + +def forward_step(*, model: torch.nn.Module, extern_data: TensorDict, **kwargs): + audio_features = extern_data["data"].raw_tensor + audio_features_len = extern_data["data"].dims[1].dyn_size_ext.raw_tensor + log_probs = model( + audio_features=audio_features, + audio_features_len=audio_features_len.to("cuda"), + ) # [B, T, F] + rf.get_run_ctx().mark_as_output(log_probs, name="log_probs") + + +def get_prior_serializer( + model_config: ConformerCTCConfig, +) -> Collection: + pytorch_package = "i6_experiments.users.berger.pytorch" + + return get_basic_pt_network_serializer( + module_import_path=f"{__name__}.{ConformerCTCModel.__name__}", + model_config=model_config, + additional_serializer_objects=[ + Import(f"{__name__}.forward_step"), + Import(f"{pytorch_package}.forward.prior_callback.ComputePriorCallback", import_as="forward_callback"), + ], + ) + + +def export(*, model: torch.nn.Module, model_filename: str): + model.export_mode = True + dummy_data = torch.randn(1, 30, 50, device="cpu") + dummy_data_len = torch.tensor([30], dtype=torch.int32) + torch.onnx.export( + model=model.eval(), + args=(dummy_data, dummy_data_len), + f=model_filename, + verbose=True, + input_names=["data", "data_len"], + output_names=["classes"], + opset_version=17, + dynamic_axes={ + # dict value: manually named axes + "data": {0: "batch", 1: "time"}, + "data_len": {0: "batch"}, + "targets": {0: "batch", 1: "time"}, + }, + ) + + +def get_recog_serializer( + model_config: ConformerCTCConfig, +) -> Collection: + pytorch_package = __package__.rpartition(".")[0] + return get_basic_pt_network_serializer( + module_import_path=f"{__name__}.{ConformerCTCModel.__name__}", + model_config=model_config, + additional_serializer_objects=[ + Import(f"{__name__}.export"), + ], + ) + + +def get_train_serializer( + model_config: 
ConformerCTCConfig, +) -> Collection: + return get_basic_pt_network_serializer( + module_import_path=f"{__name__}.{ConformerCTCModel.__name__}", + model_config=model_config, + additional_serializer_objects=[ + Import(f"{__name__}.train_step"), + ], + ) From 4cc3c40d0c7e3880c77cacb994f978090a632804 Mon Sep 17 00:00:00 2001 From: Albert Zeyer Date: Mon, 10 Jun 2024 09:17:26 +0200 Subject: [PATCH 126/227] more --- users/zeyer/experiments/exp2024_04_23_baselines/ctc.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py b/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py index c65c7ffde..e4ef939e8 100644 --- a/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py +++ b/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py @@ -193,8 +193,10 @@ def py(): feature_stats = get_librispeech_log_mel_stats(_log_mel_feature_dim) # Test different feature normalization schemes. + # Note: It seems the diff between dev-other and test-other is less here, probably du to the normalization. 
for name, model_opts in { - None: None, + None: None, # {"dev-clean": 3.69, "dev-other": 6.99, "test-clean": 3.83, "test-other": 7.32} + # featBN: {"dev-clean": 3.63, "dev-other": 6.96, "test-clean": 3.82, "test-other": 7.15} "featBN": {"feature_batch_norm": True}, # batch norm "featNorm": {"feature_norm": True}, # normalize (on sequence level) "featGN": {"feature_stats": {"mean": feature_stats.mean, "std_dev": feature_stats.std_dev}}, # global norm From 3e688ba34cc734c21553e745c3c3194af61fc1b0 Mon Sep 17 00:00:00 2001 From: Mohammad Zeineldeen Date: Mon, 10 Jun 2024 11:58:42 +0000 Subject: [PATCH 127/227] add nemo model download job --- .../canary_aed/configs/canary_1b_recog.py | 26 ++++++++++++--- .../experiments/canary_aed/nemo/download.py | 33 +++++++++++++++++++ 2 files changed, 54 insertions(+), 5 deletions(-) create mode 100644 users/zeineldeen/experiments/canary_aed/nemo/download.py diff --git a/users/zeineldeen/experiments/canary_aed/configs/canary_1b_recog.py b/users/zeineldeen/experiments/canary_aed/configs/canary_1b_recog.py index e50468767..1ffce34a6 100644 --- a/users/zeineldeen/experiments/canary_aed/configs/canary_1b_recog.py +++ b/users/zeineldeen/experiments/canary_aed/configs/canary_1b_recog.py @@ -1,18 +1,24 @@ +from typing import Dict + from sisyphus import * from i6_core.datasets.huggingface import DownloadAndPrepareHuggingFaceDatasetJob +from i6_experiments.users.zeineldeen.experiments.canary_aed.nemo.download import DownloadNemoModel -test_sets = ["ami", "earnings22", "gigaspeech"] +TEST_DATASETS = ["ami", "earnings22", "gigaspeech"] +MODEL_ID = "nvidia/canary-1b" -def download_test_datasets(): +def download_test_datasets() -> Dict[str, tk.Path]: # for downloading gigaspeech, a token is required. 
I login to huggingface and generate a token and then # run the command `huggingface-cli login` and paste the token - for test_set in test_sets: + out_dirs = {} + + for test_dataset in TEST_DATASETS: j = DownloadAndPrepareHuggingFaceDatasetJob( path="open-asr-leaderboard/datasets-test-only", - name=test_set, + name=test_dataset, split="test", time_rqmt=24, mem_rqmt=4, @@ -20,8 +26,18 @@ def download_test_datasets(): mini_task=True, token=True, ) - tk.register_output(f"datasets/{test_set}", j.out_dir) + out_dirs[test_dataset] = j.out_dir + tk.register_output(f"datasets/{test_dataset}", j.out_dir) + + return out_dirs + + +def download_canary_1b_model(): + j = DownloadNemoModel(model_id=MODEL_ID, device=-1) + tk.register_output("canary_1b_nemo_model", j.out_model_dir) + return j.out_model_dir def py(): download_test_datasets() + download_canary_1b_model() diff --git a/users/zeineldeen/experiments/canary_aed/nemo/download.py b/users/zeineldeen/experiments/canary_aed/nemo/download.py new file mode 100644 index 000000000..058991135 --- /dev/null +++ b/users/zeineldeen/experiments/canary_aed/nemo/download.py @@ -0,0 +1,33 @@ +from sisyphus import * + +import os + + +class DownloadNemoModel(Job): + def __init__(self, model_id: str, device: int): + self.model_id = model_id + + import torch + + if device >= 0: + self.device = torch.device(f"cuda:{device}") + else: + self.device = torch.device("cpu") + + self.out_model_dir = self.output_path("nemo_model", directory=True) + + def tasks(self): + yield Task("run", mini_task=True) + + def run(self): + # the model by default will be downloaded to huggingface cache + # this can be overridden by setting the HF_HUB_CACHE environment variable: + # https://github.com/huggingface/huggingface_hub/blob/main/src/huggingface_hub/file_download.py#L1171 + # https://github.com/huggingface/huggingface_hub/blob/main/src/huggingface_hub/constants.py#L123 + + for env_var in ["HUGGINGFACE_HUB_CACHE", "HF_HUB_CACHE", "NEMO_CACHE_DIR"]: + 
os.environ[env_var] = self.out_model_dir.get_path() + + from nemo.collections.asr.models import ASRModel + + ASRModel.from_pretrained(self.model_id, map_location=self.device) From f457f63df93de24dc49f2ac08cf89fc120ff404d Mon Sep 17 00:00:00 2001 From: Mohammad Zeineldeen Date: Mon, 10 Jun 2024 12:33:57 +0000 Subject: [PATCH 128/227] add nemo search job --- .../canary_aed/configs/canary_1b_recog.py | 22 +++++- .../experiments/canary_aed/nemo/search.py | 67 +++++++++++++++++++ 2 files changed, 86 insertions(+), 3 deletions(-) create mode 100644 users/zeineldeen/experiments/canary_aed/nemo/search.py diff --git a/users/zeineldeen/experiments/canary_aed/configs/canary_1b_recog.py b/users/zeineldeen/experiments/canary_aed/configs/canary_1b_recog.py index 1ffce34a6..f0b9570b0 100644 --- a/users/zeineldeen/experiments/canary_aed/configs/canary_1b_recog.py +++ b/users/zeineldeen/experiments/canary_aed/configs/canary_1b_recog.py @@ -4,6 +4,7 @@ from i6_core.datasets.huggingface import DownloadAndPrepareHuggingFaceDatasetJob from i6_experiments.users.zeineldeen.experiments.canary_aed.nemo.download import DownloadNemoModel +from i6_experiments.users.zeineldeen.experiments.canary_aed.nemo.search import SearchJob TEST_DATASETS = ["ami", "earnings22", "gigaspeech"] MODEL_ID = "nvidia/canary-1b" @@ -32,12 +33,27 @@ def download_test_datasets() -> Dict[str, tk.Path]: return out_dirs -def download_canary_1b_model(): +def download_canary_1b_model() -> tk.Path: j = DownloadNemoModel(model_id=MODEL_ID, device=-1) tk.register_output("canary_1b_nemo_model", j.out_model_dir) return j.out_model_dir def py(): - download_test_datasets() - download_canary_1b_model() + dataset_paths = download_test_datasets() + model_path = download_canary_1b_model() + + search_script = tk.Path( + "/u/zeineldeen/setups/ubuntu_22_setups/2024-06-07--canary-aed/recipe/i6_experiments/users/zeineldeen/experiments/canary_aed/nemo/run_eval.py", + hash_overwrite="run_eval_v1", + ) + + search_job = SearchJob( + 
model_path=model_path, + dataset_path=dataset_paths["ami"], + search_script=search_script, + device="gpu", + time_rqmt=4, + mem_rqmt=4, + cpu_rqmt=2, + ) diff --git a/users/zeineldeen/experiments/canary_aed/nemo/search.py b/users/zeineldeen/experiments/canary_aed/nemo/search.py new file mode 100644 index 000000000..e5a97ee15 --- /dev/null +++ b/users/zeineldeen/experiments/canary_aed/nemo/search.py @@ -0,0 +1,67 @@ +from sisyphus import * +import subprocess as sp +import os + +from typing import Dict, Any, Optional + +from i6_core.util import create_executable + + +class SearchJob(Job): + def __init__( + self, + model_path: tk.Path, + dataset_path: tk.Path, + search_script: tk.Path, + search_args: Optional[Dict[str, Any]] = None, + python_exe: Optional[tk.Path] = None, + device: str = "gpu", + time_rqmt: float = 4, + mem_rqmt: int = 4, + cpu_rqmt: int = 2, + ): + self.model_path = model_path + self.dataset_path = dataset_path + self.search_script = search_script + self.search_args = search_args + self.python_exe = python_exe if python_exe is not None else "python3" + self.device = device + self.rqmt = { + "gpu": 1 if device == "gpu" else 0, + "cpu": cpu_rqmt, + "mem": mem_rqmt, + "time": time_rqmt, + } + + self.out_wer = self.output_path("wer") + + def tasks(self): + yield Task("create_files", mini_task=True) + yield Task("run", rqmt=self.rqmt) + + def get_cmd(self): + cmd = [ + self.python_exe.get_path(), + self.search_script.get_path(), + "--model_path", + self.model_path.get_path(), + "--dataset_path", + self.dataset_path, + "--device", + 0 if self.device == "gpu" else -1, + ] + for k, v in self.search_args.items(): + if k == "device": + continue # ignored. 
this is only set via job parameter + cmd.append(f"--{k}") + cmd.append(str(v)) + return cmd + + def create_files(self): + create_executable("run.sh", self.get_cmd()) + + def run(self): + env = os.environ.copy() + env["OMP_NUM_THREADS"] = str(self.rqmt["cpu"]) + env["MKL_NUM_THREADS"] = str(self.rqmt["cpu"]) + sp.check_call(self.get_cmd(), env=env) From 1eebd00cbe5ed414af712509b2fd909683cf73d7 Mon Sep 17 00:00:00 2001 From: Mohammad Zeineldeen Date: Mon, 10 Jun 2024 12:36:13 +0000 Subject: [PATCH 129/227] add custom hash --- .../zeineldeen/experiments/canary_aed/nemo/search.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/users/zeineldeen/experiments/canary_aed/nemo/search.py b/users/zeineldeen/experiments/canary_aed/nemo/search.py index e5a97ee15..3e5705490 100644 --- a/users/zeineldeen/experiments/canary_aed/nemo/search.py +++ b/users/zeineldeen/experiments/canary_aed/nemo/search.py @@ -65,3 +65,14 @@ def run(self): env["OMP_NUM_THREADS"] = str(self.rqmt["cpu"]) env["MKL_NUM_THREADS"] = str(self.rqmt["cpu"]) sp.check_call(self.get_cmd(), env=env) + + @classmethod + def hash(cls, kwargs): + d = { + "model_path": kwargs["model_path"], + "dataset_path": kwargs["dataset_path"], + "search_script": kwargs["search_script"], + "search_args": kwargs["search_args"], + "python_exe": kwargs["python_exe"], + } + return super().hash(d) From 7db6937792225d643d88f4064ca1e81099de4023 Mon Sep 17 00:00:00 2001 From: Mohammad Zeineldeen Date: Mon, 10 Jun 2024 12:38:49 +0000 Subject: [PATCH 130/227] fix --- .../experiments/canary_aed/configs/canary_1b_recog.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/users/zeineldeen/experiments/canary_aed/configs/canary_1b_recog.py b/users/zeineldeen/experiments/canary_aed/configs/canary_1b_recog.py index f0b9570b0..2101e086c 100644 --- a/users/zeineldeen/experiments/canary_aed/configs/canary_1b_recog.py +++ b/users/zeineldeen/experiments/canary_aed/configs/canary_1b_recog.py @@ -47,13 +47,19 @@ def py(): 
"/u/zeineldeen/setups/ubuntu_22_setups/2024-06-07--canary-aed/recipe/i6_experiments/users/zeineldeen/experiments/canary_aed/nemo/run_eval.py", hash_overwrite="run_eval_v1", ) + python_exe = tk.Path( + "/work/asr4/zeineldeen/setups-data/ubuntu_22_setups/2024-06-07--canary-aed/nemo_venv/bin/python3" + ) search_job = SearchJob( model_path=model_path, dataset_path=dataset_paths["ami"], search_script=search_script, + python_exe=python_exe, device="gpu", time_rqmt=4, mem_rqmt=4, cpu_rqmt=2, ) + search_job.add_alias("canary_1b_ami") + tk.register_output("canary_1b_ami/wer", search_job.out_wer) From 88c4d966fe7c131b4180d7e40b821a5faae55823 Mon Sep 17 00:00:00 2001 From: Mohammad Zeineldeen Date: Mon, 10 Jun 2024 12:54:00 +0000 Subject: [PATCH 131/227] add nemo search --- .../canary_aed/configs/canary_1b_recog.py | 3 + .../experiments/canary_aed/nemo/run_eval.py | 198 ++++++++++++++++++ .../experiments/canary_aed/nemo/search.py | 15 ++ 3 files changed, 216 insertions(+) create mode 100644 users/zeineldeen/experiments/canary_aed/nemo/run_eval.py diff --git a/users/zeineldeen/experiments/canary_aed/configs/canary_1b_recog.py b/users/zeineldeen/experiments/canary_aed/configs/canary_1b_recog.py index 2101e086c..f218e5cd7 100644 --- a/users/zeineldeen/experiments/canary_aed/configs/canary_1b_recog.py +++ b/users/zeineldeen/experiments/canary_aed/configs/canary_1b_recog.py @@ -52,8 +52,11 @@ def py(): ) search_job = SearchJob( + model_id=MODEL_ID, model_path=model_path, dataset_path=dataset_paths["ami"], + dataset_name="ami", + split="test", search_script=search_script, python_exe=python_exe, device="gpu", diff --git a/users/zeineldeen/experiments/canary_aed/nemo/run_eval.py b/users/zeineldeen/experiments/canary_aed/nemo/run_eval.py new file mode 100644 index 000000000..a46ad6263 --- /dev/null +++ b/users/zeineldeen/experiments/canary_aed/nemo/run_eval.py @@ -0,0 +1,198 @@ +""" +Adapted from here: 
https://github.com/huggingface/open_asr_leaderboard/blob/5c03c1f85a84ab7a991dcc1b3f14905ec6d632c9/nemo_asr/run_eval.py +""" +import argparse + +import os +import sys + +sys.path.append("/u/zeineldeen/setups/ubuntu_22_setups/2024-06-07--canary-aed/recipe/open_asr_leaderboard/normalizer") + +import shutil +import torch +import evaluate +import soundfile + +from tqdm import tqdm +from normalizer import data_utils + +from nemo.collections.asr.models import ASRModel + +DATA_CACHE_DIR = "/var/tmp/audio_cache" + +wer_metric = evaluate.load("wer") + + +def dataset_iterator(dataset): + for i, item in enumerate(dataset): + yield { + **item["audio"], + "reference": item["norm_text"], + "audio_filename": f"file_{i}", + "sample_rate": 16_000, + "sample_id": i, + } + + +def write_audio(buffer, cache_prefix) -> list: + cache_dir = os.path.join(DATA_CACHE_DIR, cache_prefix) + + if os.path.exists(cache_dir): + shutil.rmtree(cache_dir, ignore_errors=True) + + os.makedirs(cache_dir) + + data_paths = [] + for idx, data in enumerate(buffer): + fn = os.path.basename(data["audio_filename"]) + fn = os.path.splitext(fn)[0] + path = os.path.join(cache_dir, f"{idx}_{fn}.wav") + data_paths.append(path) + + soundfile.write(path, data["array"], samplerate=data["sample_rate"]) + + return data_paths + + +def pack_results(results: list, buffer, transcriptions): + for sample, transcript in zip(buffer, transcriptions): + result = {"reference": sample["reference"], "pred_text": transcript} + results.append(result) + return results + + +def buffer_audio_and_transcribe( + model: ASRModel, dataset, batch_size: int, pnc: bool, cache_prefix: str, verbose: bool = True +): + buffer = [] + results = [] + for sample in tqdm(dataset_iterator(dataset), desc="Evaluating: Sample id", unit="", disable=not verbose): + buffer.append(sample) + + if len(buffer) == batch_size: + filepaths = write_audio(buffer, cache_prefix) + + if pnc is not None: + transcriptions = model.transcribe(filepaths, batch_size=batch_size, 
pnc=False, verbose=False) + else: + transcriptions = model.transcribe(filepaths, batch_size=batch_size, verbose=False) + # if transcriptions form a tuple (from RNNT), extract just "best" hypothesis + if type(transcriptions) == tuple and len(transcriptions) == 2: + transcriptions = transcriptions[0] + results = pack_results(results, buffer, transcriptions) + buffer.clear() + + if len(buffer) > 0: + filepaths = write_audio(buffer, cache_prefix) + if pnc is not None: + transcriptions = model.transcribe(filepaths, batch_size=batch_size, pnc=False, verbose=False) + else: + transcriptions = model.transcribe(filepaths, batch_size=batch_size, verbose=False) + # if transcriptions form a tuple (from RNNT), extract just "best" hypothesis + if type(transcriptions) == tuple and len(transcriptions) == 2: + transcriptions = transcriptions[0] + results = pack_results(results, buffer, transcriptions) + buffer.clear() + + # Delete temp cache dir + if os.path.exists(DATA_CACHE_DIR): + shutil.rmtree(DATA_CACHE_DIR) + + return results + + +def main(args): + if args.device >= 0: + device = torch.device(f"cuda:{args.device}") + else: + device = torch.device("cpu") + + asr_model = ASRModel.restore_from(args.model_path, map_location=device) + asr_model.freeze() + + from datasets import load_dataset + + # download model is defautl to REUSE_DATASET_IF_EXISTS which means it does not download data again + dataset = load_dataset(args.dataset_path) + + if args.max_eval_samples is not None and args.max_eval_samples > 0: + print(f"Subsampling dataset to first {args.max_eval_samples} samples !") + dataset = dataset.take(args.max_eval_samples) + + dataset = data_utils.prepare_data(dataset) + + predictions = [] + references = [] + + # run streamed inference + cache_prefix = ( + f"{args.model_id.replace('/', '-')}-{args.dataset_path.replace('/', '')}-" + f"{args.dataset.replace('/', '-')}-{args.split}" + ) + results = buffer_audio_and_transcribe(asr_model, dataset, args.batch_size, args.pnc, 
cache_prefix, verbose=True) + for sample in results: + predictions.append(data_utils.normalizer(sample["pred_text"])) + references.append(sample["reference"]) + + # Write manifest results + manifest_path = data_utils.write_manifest( + references, predictions, args.model_id, args.dataset_path, args.dataset, args.split + ) + print("Results saved at path:", os.path.abspath(manifest_path)) + + wer = wer_metric.compute(references=references, predictions=predictions) + wer = round(100 * wer, 2) + + print("WER:", wer, "%") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + + parser.add_argument("--model_id", type=str, required=True, help="Model ID.") + + parser.add_argument( + "--model_path", + type=str, + required=True, + help="Path to nemo model.", + ) + + parser.add_argument("--dataset_path", type=str, required=True, help="Dataset path.") + parser.add_argument("--dataset", type=str, required=True, help="Dataset name.") + parser.add_argument("--split", type=str, required=True, help="Dataset split.") + + parser.add_argument( + "--device", + type=int, + default=-1, + help="The device to run the pipeline on. -1 for CPU (default), 0 for the first GPU and so on.", + ) + parser.add_argument( + "--batch_size", + type=int, + default=32, + help="Number of samples to go through each streamed batch.", + ) + parser.add_argument( + "--max_eval_samples", + type=int, + default=None, + help="Number of samples to be evaluated. Put a lower number e.g. 
64 for testing this script.", + ) + parser.add_argument( + "--pnc", + type=bool, + default=None, + help="flag to indicate inferene in pnc mode for models that support punctuation and capitalization", + ) + parser.add_argument( + "--no-streaming", + dest="streaming", + action="store_false", + help="Choose whether you'd like to download the entire dataset or stream it during the evaluation.", + ) + args = parser.parse_args() + parser.set_defaults(streaming=True) + + main(args) diff --git a/users/zeineldeen/experiments/canary_aed/nemo/search.py b/users/zeineldeen/experiments/canary_aed/nemo/search.py index 3e5705490..5203b5c5b 100644 --- a/users/zeineldeen/experiments/canary_aed/nemo/search.py +++ b/users/zeineldeen/experiments/canary_aed/nemo/search.py @@ -10,8 +10,11 @@ class SearchJob(Job): def __init__( self, + model_id: str, model_path: tk.Path, dataset_path: tk.Path, + dataset_name: str, + split: str, search_script: tk.Path, search_args: Optional[Dict[str, Any]] = None, python_exe: Optional[tk.Path] = None, @@ -20,8 +23,11 @@ def __init__( mem_rqmt: int = 4, cpu_rqmt: int = 2, ): + self.model_id = model_id self.model_path = model_path self.dataset_path = dataset_path + self.dataset_name = dataset_name + self.split = split self.search_script = search_script self.search_args = search_args self.python_exe = python_exe if python_exe is not None else "python3" @@ -43,10 +49,16 @@ def get_cmd(self): cmd = [ self.python_exe.get_path(), self.search_script.get_path(), + "--model_id", + self.model_id, "--model_path", self.model_path.get_path(), "--dataset_path", self.dataset_path, + "--dataset", + self.dataset_name, + "--split", + self.split, "--device", 0 if self.device == "gpu" else -1, ] @@ -69,8 +81,11 @@ def run(self): @classmethod def hash(cls, kwargs): d = { + "model_id": kwargs["model_id"], "model_path": kwargs["model_path"], "dataset_path": kwargs["dataset_path"], + "dataset_name": kwargs["dataset_name"], + "split": kwargs["split"], "search_script": 
kwargs["search_script"], "search_args": kwargs["search_args"], "python_exe": kwargs["python_exe"], From a885383bc5ab2331ad79e14e83a356a054270098 Mon Sep 17 00:00:00 2001 From: Mohammad Zeineldeen Date: Mon, 10 Jun 2024 13:04:22 +0000 Subject: [PATCH 132/227] first version of nemo search --- .../experiments/canary_aed/configs/canary_1b_recog.py | 7 ++++++- users/zeineldeen/experiments/canary_aed/nemo/search.py | 6 +++--- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/users/zeineldeen/experiments/canary_aed/configs/canary_1b_recog.py b/users/zeineldeen/experiments/canary_aed/configs/canary_1b_recog.py index f218e5cd7..f4998cf9d 100644 --- a/users/zeineldeen/experiments/canary_aed/configs/canary_1b_recog.py +++ b/users/zeineldeen/experiments/canary_aed/configs/canary_1b_recog.py @@ -36,7 +36,12 @@ def download_test_datasets() -> Dict[str, tk.Path]: def download_canary_1b_model() -> tk.Path: j = DownloadNemoModel(model_id=MODEL_ID, device=-1) tk.register_output("canary_1b_nemo_model", j.out_model_dir) - return j.out_model_dir + # return j.out_model_dir + # TODO: let the job returns directly the model path instead + return tk.Path( + j.out_model_dir.get_path() + + "/models--nvidia--canary-1b/snapshots/dd32c0c709e2bfc79f583e16b9df4b3a160f7e86/canary-1b.nemo" + ) def py(): diff --git a/users/zeineldeen/experiments/canary_aed/nemo/search.py b/users/zeineldeen/experiments/canary_aed/nemo/search.py index 5203b5c5b..0e30dcb82 100644 --- a/users/zeineldeen/experiments/canary_aed/nemo/search.py +++ b/users/zeineldeen/experiments/canary_aed/nemo/search.py @@ -29,7 +29,7 @@ def __init__( self.dataset_name = dataset_name self.split = split self.search_script = search_script - self.search_args = search_args + self.search_args = search_args if search_args is not None else {} self.python_exe = python_exe if python_exe is not None else "python3" self.device = device self.rqmt = { @@ -54,13 +54,13 @@ def get_cmd(self): "--model_path", self.model_path.get_path(), 
"--dataset_path", - self.dataset_path, + self.dataset_path.get_path(), "--dataset", self.dataset_name, "--split", self.split, "--device", - 0 if self.device == "gpu" else -1, + "0" if self.device == "gpu" else "-1", ] for k, v in self.search_args.items(): if k == "device": From 20d50b9102e947972059510849db608e142f7918 Mon Sep 17 00:00:00 2001 From: Mohammad Zeineldeen Date: Mon, 10 Jun 2024 13:14:02 +0000 Subject: [PATCH 133/227] better --- users/zeineldeen/experiments/canary_aed/nemo/search.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/users/zeineldeen/experiments/canary_aed/nemo/search.py b/users/zeineldeen/experiments/canary_aed/nemo/search.py index 0e30dcb82..d1bba5633 100644 --- a/users/zeineldeen/experiments/canary_aed/nemo/search.py +++ b/users/zeineldeen/experiments/canary_aed/nemo/search.py @@ -30,7 +30,7 @@ def __init__( self.split = split self.search_script = search_script self.search_args = search_args if search_args is not None else {} - self.python_exe = python_exe if python_exe is not None else "python3" + self.python_exe = python_exe self.device = device self.rqmt = { "gpu": 1 if device == "gpu" else 0, From fbab0684d448848ad42194f6502585f819278d84 Mon Sep 17 00:00:00 2001 From: Mohammad Zeineldeen Date: Mon, 10 Jun 2024 13:26:28 +0000 Subject: [PATCH 134/227] fix bug --- users/zeineldeen/experiments/canary_aed/nemo/run_eval.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/users/zeineldeen/experiments/canary_aed/nemo/run_eval.py b/users/zeineldeen/experiments/canary_aed/nemo/run_eval.py index a46ad6263..06e869f63 100644 --- a/users/zeineldeen/experiments/canary_aed/nemo/run_eval.py +++ b/users/zeineldeen/experiments/canary_aed/nemo/run_eval.py @@ -20,8 +20,6 @@ DATA_CACHE_DIR = "/var/tmp/audio_cache" -wer_metric = evaluate.load("wer") - def dataset_iterator(dataset): for i, item in enumerate(dataset): @@ -110,10 +108,10 @@ def main(args): asr_model = ASRModel.restore_from(args.model_path, 
map_location=device) asr_model.freeze() - from datasets import load_dataset + from datasets import load_from_disk - # download model is defautl to REUSE_DATASET_IF_EXISTS which means it does not download data again - dataset = load_dataset(args.dataset_path) + print("Loading dataset...") + dataset = load_from_disk(args.dataset_path) if args.max_eval_samples is not None and args.max_eval_samples > 0: print(f"Subsampling dataset to first {args.max_eval_samples} samples !") @@ -140,6 +138,7 @@ def main(args): ) print("Results saved at path:", os.path.abspath(manifest_path)) + wer_metric = evaluate.load("wer") wer = wer_metric.compute(references=references, predictions=predictions) wer = round(100 * wer, 2) From 536fa29f41aa4705c3c647b6c95e07586a2f9840 Mon Sep 17 00:00:00 2001 From: Mohammad Zeineldeen Date: Mon, 10 Jun 2024 13:30:10 +0000 Subject: [PATCH 135/227] better --- users/zeineldeen/experiments/canary_aed/nemo/run_eval.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/users/zeineldeen/experiments/canary_aed/nemo/run_eval.py b/users/zeineldeen/experiments/canary_aed/nemo/run_eval.py index 06e869f63..b81225b43 100644 --- a/users/zeineldeen/experiments/canary_aed/nemo/run_eval.py +++ b/users/zeineldeen/experiments/canary_aed/nemo/run_eval.py @@ -16,6 +16,8 @@ from tqdm import tqdm from normalizer import data_utils +from datasets import load_from_disk + from nemo.collections.asr.models import ASRModel DATA_CACHE_DIR = "/var/tmp/audio_cache" @@ -108,9 +110,6 @@ def main(args): asr_model = ASRModel.restore_from(args.model_path, map_location=device) asr_model.freeze() - from datasets import load_from_disk - - print("Loading dataset...") dataset = load_from_disk(args.dataset_path) if args.max_eval_samples is not None and args.max_eval_samples > 0: From 5c21c8ac744b2eb61ab32a7848541b7d4f2b9298 Mon Sep 17 00:00:00 2001 From: Mohammad Zeineldeen Date: Mon, 10 Jun 2024 14:02:23 +0000 Subject: [PATCH 136/227] add missing search output path --- 
.../experiments/canary_aed/configs/canary_1b_recog.py | 2 +- users/zeineldeen/experiments/canary_aed/nemo/run_eval.py | 7 +++++-- users/zeineldeen/experiments/canary_aed/nemo/search.py | 3 +++ 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/users/zeineldeen/experiments/canary_aed/configs/canary_1b_recog.py b/users/zeineldeen/experiments/canary_aed/configs/canary_1b_recog.py index f4998cf9d..b31659c5d 100644 --- a/users/zeineldeen/experiments/canary_aed/configs/canary_1b_recog.py +++ b/users/zeineldeen/experiments/canary_aed/configs/canary_1b_recog.py @@ -70,4 +70,4 @@ def py(): cpu_rqmt=2, ) search_job.add_alias("canary_1b_ami") - tk.register_output("canary_1b_ami/wer", search_job.out_wer) + tk.register_output("canary_1b_ami/search_out", search_job.out_search_results) diff --git a/users/zeineldeen/experiments/canary_aed/nemo/run_eval.py b/users/zeineldeen/experiments/canary_aed/nemo/run_eval.py index b81225b43..ea2ac4519 100644 --- a/users/zeineldeen/experiments/canary_aed/nemo/run_eval.py +++ b/users/zeineldeen/experiments/canary_aed/nemo/run_eval.py @@ -22,6 +22,8 @@ DATA_CACHE_DIR = "/var/tmp/audio_cache" +wer_metric = evaluate.load("wer") + def dataset_iterator(dataset): for i, item in enumerate(dataset): @@ -133,11 +135,10 @@ def main(args): # Write manifest results manifest_path = data_utils.write_manifest( - references, predictions, args.model_id, args.dataset_path, args.dataset, args.split + args.manifest_path, references, predictions, args.model_id, args.dataset_path, args.dataset, args.split ) print("Results saved at path:", os.path.abspath(manifest_path)) - wer_metric = evaluate.load("wer") wer = wer_metric.compute(references=references, predictions=predictions) wer = round(100 * wer, 2) @@ -160,6 +161,8 @@ def main(args): parser.add_argument("--dataset", type=str, required=True, help="Dataset name.") parser.add_argument("--split", type=str, required=True, help="Dataset split.") + parser.add_argument("--manifest_path", type=str, 
required=True, help="Path to save the search output.") + parser.add_argument( "--device", type=int, diff --git a/users/zeineldeen/experiments/canary_aed/nemo/search.py b/users/zeineldeen/experiments/canary_aed/nemo/search.py index d1bba5633..c785ca8ff 100644 --- a/users/zeineldeen/experiments/canary_aed/nemo/search.py +++ b/users/zeineldeen/experiments/canary_aed/nemo/search.py @@ -39,6 +39,7 @@ def __init__( "time": time_rqmt, } + self.out_search_results = self.output_path("search_results") self.out_wer = self.output_path("wer") def tasks(self): @@ -59,6 +60,8 @@ def get_cmd(self): self.dataset_name, "--split", self.split, + "--manifest_path", + self.out_search_results.get_path(), "--device", "0" if self.device == "gpu" else "-1", ] From c462d05c1d96293812cb989bc175110204ec4d8b Mon Sep 17 00:00:00 2001 From: Mohammad Zeineldeen Date: Mon, 10 Jun 2024 15:02:22 +0000 Subject: [PATCH 137/227] add compute_wer func --- .../experiments/canary_aed/nemo/run_eval.py | 20 ++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/users/zeineldeen/experiments/canary_aed/nemo/run_eval.py b/users/zeineldeen/experiments/canary_aed/nemo/run_eval.py index ea2ac4519..1f2d37928 100644 --- a/users/zeineldeen/experiments/canary_aed/nemo/run_eval.py +++ b/users/zeineldeen/experiments/canary_aed/nemo/run_eval.py @@ -6,11 +6,11 @@ import os import sys -sys.path.append("/u/zeineldeen/setups/ubuntu_22_setups/2024-06-07--canary-aed/recipe/open_asr_leaderboard/normalizer") +# sys.path.append("/u/zeineldeen/setups/ubuntu_22_setups/2024-06-07--canary-aed/recipe/open_asr_leaderboard/normalizer") import shutil import torch -import evaluate + import soundfile from tqdm import tqdm @@ -22,7 +22,17 @@ DATA_CACHE_DIR = "/var/tmp/audio_cache" -wer_metric = evaluate.load("wer") + +def compute_wer(predictions, references): + from jiwer import compute_measures + + incorrect = 0 + total = 0 + for prediction, reference in zip(predictions, references): + measures = 
compute_measures(reference, prediction) + incorrect += measures["substitutions"] + measures["deletions"] + measures["insertions"] + total += measures["substitutions"] + measures["deletions"] + measures["hits"] + return incorrect / total def dataset_iterator(dataset): @@ -133,13 +143,13 @@ def main(args): predictions.append(data_utils.normalizer(sample["pred_text"])) references.append(sample["reference"]) - # Write manifest results + # Write manifest results to args.manifest_path manifest_path = data_utils.write_manifest( args.manifest_path, references, predictions, args.model_id, args.dataset_path, args.dataset, args.split ) print("Results saved at path:", os.path.abspath(manifest_path)) - wer = wer_metric.compute(references=references, predictions=predictions) + wer = compute_wer(references=references, predictions=predictions) wer = round(100 * wer, 2) print("WER:", wer, "%") From 3884115111c0ffd8ec386c2b16394d74dd61700b Mon Sep 17 00:00:00 2001 From: Mohammad Zeineldeen Date: Mon, 10 Jun 2024 15:11:51 +0000 Subject: [PATCH 138/227] add wer as output var --- users/zeineldeen/experiments/canary_aed/nemo/run_eval.py | 7 +++++++ users/zeineldeen/experiments/canary_aed/nemo/search.py | 4 +++- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/users/zeineldeen/experiments/canary_aed/nemo/run_eval.py b/users/zeineldeen/experiments/canary_aed/nemo/run_eval.py index 1f2d37928..43b467413 100644 --- a/users/zeineldeen/experiments/canary_aed/nemo/run_eval.py +++ b/users/zeineldeen/experiments/canary_aed/nemo/run_eval.py @@ -154,6 +154,11 @@ def main(args): print("WER:", wer, "%") + if args.wer_out_path: + with open(args.wer_out_path, "w") as f: + f.write(f"{wer}\n") + print(f"Wrote WER (%) to {args.wer_out_path}") + if __name__ == "__main__": parser = argparse.ArgumentParser() @@ -173,6 +178,8 @@ def main(args): parser.add_argument("--manifest_path", type=str, required=True, help="Path to save the search output.") + parser.add_argument("--wer_out_path", type=str, 
default=None, help="Path to save the WER output.") + parser.add_argument( "--device", type=int, diff --git a/users/zeineldeen/experiments/canary_aed/nemo/search.py b/users/zeineldeen/experiments/canary_aed/nemo/search.py index c785ca8ff..75c9cd98f 100644 --- a/users/zeineldeen/experiments/canary_aed/nemo/search.py +++ b/users/zeineldeen/experiments/canary_aed/nemo/search.py @@ -40,7 +40,7 @@ def __init__( } self.out_search_results = self.output_path("search_results") - self.out_wer = self.output_path("wer") + self.out_wer = self.output_var("wer") def tasks(self): yield Task("create_files", mini_task=True) @@ -64,6 +64,8 @@ def get_cmd(self): self.out_search_results.get_path(), "--device", "0" if self.device == "gpu" else "-1", + "--wer_out_path", + self.out_wer.get_path(), ] for k, v in self.search_args.items(): if k == "device": From 2ffa0d8de3bad7bc9df803e758023efafc3dde77 Mon Sep 17 00:00:00 2001 From: Mohammad Zeineldeen Date: Mon, 10 Jun 2024 15:14:14 +0000 Subject: [PATCH 139/227] run search for all test sets with canary 1b model --- .../canary_aed/configs/canary_1b_recog.py | 31 ++++++++++--------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/users/zeineldeen/experiments/canary_aed/configs/canary_1b_recog.py b/users/zeineldeen/experiments/canary_aed/configs/canary_1b_recog.py index b31659c5d..0399321c4 100644 --- a/users/zeineldeen/experiments/canary_aed/configs/canary_1b_recog.py +++ b/users/zeineldeen/experiments/canary_aed/configs/canary_1b_recog.py @@ -56,18 +56,19 @@ def py(): "/work/asr4/zeineldeen/setups-data/ubuntu_22_setups/2024-06-07--canary-aed/nemo_venv/bin/python3" ) - search_job = SearchJob( - model_id=MODEL_ID, - model_path=model_path, - dataset_path=dataset_paths["ami"], - dataset_name="ami", - split="test", - search_script=search_script, - python_exe=python_exe, - device="gpu", - time_rqmt=4, - mem_rqmt=4, - cpu_rqmt=2, - ) - search_job.add_alias("canary_1b_ami") - tk.register_output("canary_1b_ami/search_out", 
search_job.out_search_results) + for test_set in TEST_DATASETS: + search_job = SearchJob( + model_id=MODEL_ID, + model_path=model_path, + dataset_path=dataset_paths[test_set], + dataset_name=test_set, + split="test", + search_script=search_script, + python_exe=python_exe, + device="gpu", + time_rqmt=4, + mem_rqmt=4, + cpu_rqmt=2, + ) + search_job.add_alias(f"canary_1b/{test_set}") + tk.register_output(f"canary_1b/{test_set}/search_out", search_job.out_search_results) From b6a6b6581c84ffbd39d3e85b3be4cf494429b3d1 Mon Sep 17 00:00:00 2001 From: Judyxujj Date: Mon, 10 Jun 2024 23:48:34 +0800 Subject: [PATCH 140/227] add configs (#224) Co-authored-by: Jingjing Xu --- ...ntly_train_iterative_zero_out_layerwise.py | 245 ++++++++++++++ ...ntly_train_iterative_zero_out_layerwise.py | 316 ++++++++++++++++++ 2 files changed, 561 insertions(+) create mode 100644 users/jxu/experiments/ctc/tedlium2/configs/dynamic_encoder_size/iterative_zero_out_refactored/jointly_train_iterative_zero_out_layerwise.py create mode 100644 users/jxu/experiments/ctc/tedlium2/pytorch_networks/dynamic_encoder_size/iterative_zero_out_refactored/jointly_train_iterative_zero_out_layerwise.py diff --git a/users/jxu/experiments/ctc/tedlium2/configs/dynamic_encoder_size/iterative_zero_out_refactored/jointly_train_iterative_zero_out_layerwise.py b/users/jxu/experiments/ctc/tedlium2/configs/dynamic_encoder_size/iterative_zero_out_refactored/jointly_train_iterative_zero_out_layerwise.py new file mode 100644 index 000000000..713bb5ba5 --- /dev/null +++ b/users/jxu/experiments/ctc/tedlium2/configs/dynamic_encoder_size/iterative_zero_out_refactored/jointly_train_iterative_zero_out_layerwise.py @@ -0,0 +1,245 @@ +import functools +from typing import Any, Dict, List, Optional, Union +import copy + +import i6_core.returnn as returnn +import i6_experiments.users.jxu.experiments.ctc.tedlium2.configs.configs_helper as configs_helper +from i6_experiments.users.berger.systems.dataclasses import ReturnnConfigs +from 
i6_experiments.common.setups.returnn_pytorch.serialization import Collection +from i6_experiments.users.berger.systems.dataclasses import ConfigVariant + +# ********** Constant values ********** + +num_outputs = 79 +num_subepochs = 250 + + +# ********** Settings ********** + +def get_returnn_config( + network: Optional[Dict] = None, + *, + target: Optional[str] = "classes", + num_inputs: Optional[int] = None, + num_outputs: Optional[int] = None, + python_prolog: Optional[Union[List, Dict]] = None, + extern_data_config: bool = False, + extra_python: Optional[List] = None, + extra_config: Optional[Dict] = None, + hash_full_python_code: bool = False, + **kwargs, +) -> returnn.ReturnnConfig: + python_prolog = python_prolog or ["import numpy as np"] + extra_python = extra_python or [] + config_dict: dict[str, Any] = {"target": target} + if num_inputs is not None: + config_dict["num_inputs"] = num_inputs + if num_outputs is not None: + config_dict["num_outputs"] = {target: num_outputs} + if extern_data_config: + config_dict.update( + configs_helper.get_extern_data_config(num_inputs=num_inputs, num_outputs=num_outputs, target=target, + **kwargs) + ) + config_dict.update(configs_helper.get_base_config()) + + if network: + config_dict.update({"network:": network}) + + lrate_config = configs_helper.get_oclr_config(**kwargs) + config_dict.update(lrate_config) + + config_dict.update(configs_helper.get_base_regularization_config(**kwargs)) + + if extra_config: + config_dict.update(extra_config) + + post_config_dict = {} + post_config_dict.update(configs_helper.get_base_post_config(**kwargs)) + + return returnn.ReturnnConfig( + config=config_dict, + post_config=post_config_dict, + hash_full_python_code=hash_full_python_code, + python_prolog=python_prolog, + python_epilog=extra_python, + pprint_kwargs={"sort_dicts": False}, + ) + + +def get_serializer(model_config, variant: ConfigVariant, in_dim: int = 1) -> Collection: + from 
def returnn_config_generator(train_data_config: dict, dev_data_config: dict, peak_lr: float) -> dict:
    """Build the RETURNN configs for the jointly trained multi-size encoder experiments.

    Three setups are generated, with two, three and four entries in
    ``num_layers_set``. Each setup has a single training config and one
    recognition config per ``recog_num_layers`` value; every (setup,
    ``recog_num_layers``) pair becomes its own entry in the returned mapping.

    :param train_data_config: stored under the ``"train"`` key of the extra config
    :param dev_data_config: stored under the ``"dev"`` key of the extra config
    :param peak_lr: peak learning rate; the initial learning rate is ``peak_lr / 100``
    :return: mapping from experiment name to the ``ReturnnConfigs`` of that experiment
    """
    # The original imported this same function twice under two aliases.
    from i6_experiments.users.jxu.experiments.ctc.tedlium2.pytorch_networks.dynamic_encoder_size.zeroout_with_new_i6_models.joint_train_two_model_iterative_zero_out_modwise import (
        get_default_config_v1 as get_model_config,
    )

    extra_config = {
        "train": train_data_config,
        "dev": dev_data_config,
    }
    # Recognition additionally declares the model output.
    recog_extra_config = copy.deepcopy(extra_config)
    recog_extra_config["model_outputs"] = {"classes": {"dim": num_outputs}}

    config_partial = functools.partial(
        get_returnn_config,
        num_epochs=num_subepochs,
        num_inputs=50,
        num_outputs=num_outputs,
        target="targets",
        extern_data_config=True,
        grad_noise=0.0,
        grad_clip=0.0,
        cycle_epoch=110,
        initial_lr=peak_lr / 100,
        peak_lr=peak_lr,
        final_lr=1e-08,
        batch_size=15000,
        extra_config=extra_config,
    )

    def get_returnn_configs(train_config, recog_config):
        return ReturnnConfigs(
            train_config=config_partial(
                extra_python=[get_serializer(train_config, ConfigVariant.TRAIN)],
                extra_config=extra_config,
            ),
            prior_config=config_partial(
                extra_python=[get_serializer(recog_config, ConfigVariant.PRIOR)],
                extra_config=extra_config,
            ),
            recog_configs={
                "recog": config_partial(
                    extra_python=[get_serializer(recog_config, ConfigVariant.RECOG)],
                    extra_config=recog_extra_config,
                )
            },
        )

    # Defined once for all setups. Previously the three-model setup silently
    # relied on these names leaking out of the two-model setup's for-loop.
    layer_dropout_stage_1, layer_dropout_stage_2 = 0, 0.3

    # One tuple per setup:
    #   (num_layers_set, num_steps_per_iter, num_zeroout_elements_per_iter,
    #    {recog_num_layers: experiment-name prefix})
    # The prefix dict is ordered largest size first, which is the registration order.
    # NOTE(review): the four-model setup uses "four_models" (without "_dropout")
    # for sizes 36 and 24. This looks like a typo, but the names are kept
    # byte-identical so that existing experiment keys stay valid.
    setups = [
        (
            [24, 48],
            [45 * 1400] + [15 * 1400] * 7,
            [3, 6, 9, 12, 15, 18, 21, 24],
            {48: "two_models_dropout", 24: "two_models_dropout"},
        ),
        (
            [16, 32, 48],
            [42000, 25200, 25200, 25200, 25200, 25200, 25200, 25200],
            [4, 8, 12, 16, 20, 24, 28, 32],
            {48: "three_models_dropout", 32: "three_models_dropout", 16: "three_models_dropout"},
        ),
        (
            [12, 24, 36, 48],
            [10 * 1400] + [4 * 1400] * 35,
            list(range(1, 37)),
            {
                48: "four_models_dropout",
                36: "four_models",
                24: "four_models",
                12: "four_models_dropout",
            },
        ),
    ]

    experiments = {}
    for num_layers_set, num_steps_per_iter, num_zeroout_elements_per_iter, name_prefixes in setups:
        network_args = {
            "layer_dropout_kwargs": {
                "layer_dropout_stage_1": layer_dropout_stage_1,
                "layer_dropout_stage_2": layer_dropout_stage_2,
            },
            "zero_out_kwargs": {
                "num_steps_per_iter": num_steps_per_iter,
                "num_zeroout_elements_per_iter": num_zeroout_elements_per_iter,
                "zeroout_val": -5,
            },
            "num_layers_set": num_layers_set,
            "layer_gate_activation": "sigmoid",
            "recog_num_layers": 48,
        }
        train_config = get_model_config(num_inputs=50, num_outputs=num_outputs, network_args=network_args)

        # Build every recognition config before registering anything, which is
        # the call order of the original code.
        recog_network_args = copy.deepcopy(network_args)
        recog_configs = []
        for recog_num_layers in name_prefixes:
            recog_network_args["recog_num_layers"] = recog_num_layers
            recog_configs.append(
                get_model_config(num_inputs=50, num_outputs=num_outputs, network_args=recog_network_args)
            )

        for (recog_num_layers, name_prefix), recog_config in zip(name_prefixes.items(), recog_configs):
            experiment_name = (
                f"{name_prefix}_{layer_dropout_stage_1}_{layer_dropout_stage_2}"
                f"_recog_num_mods_{recog_num_layers}_peak_lr_{peak_lr}"
            ).replace(".", "_")
            experiments[experiment_name] = get_returnn_configs(train_config, recog_config)

    return experiments
class ConformerCTCModel(torch.nn.Module):
    """Conformer encoder with one linear CTC output head per entry of ``num_layers_set``.

    In training mode ``forward`` returns the log-probabilities of every head
    together with the encoder's utilised-layer count and sequence mask. In
    evaluation mode it returns only the log-probabilities of the head that
    belongs to ``recog_num_layers``.
    """

    def __init__(self, step: int, cfg: ConformerCTCConfig, **kwargs):
        # ``step`` and ``kwargs`` are accepted but not used here.
        super().__init__()
        self.conformer = ConformerEncoder(cfg.conformer_cfg)

        head_in_dim = cfg.conformer_cfg.block_cfg.ff_cfg.input_dim
        num_heads = len(self.conformer.num_layers_set)
        self.final_linear_list = torch.nn.ModuleList(
            [nn.Linear(head_in_dim, cfg.target_size) for _ in range(num_heads)]
        )

        # The encoder is told the recognition size as well.
        self.recog_num_layers = cfg.recog_num_layers
        self.conformer.recog_num_layers = cfg.recog_num_layers
        self.zero_out_kwargs = cfg.zero_out_kwargs
        self.export_mode = False

    def forward(
        self,
        audio_features: torch.Tensor,
        audio_features_len: Optional[torch.Tensor] = None,
        global_train_step: int = 0,
    ):
        # SpecAugment is applied only while training.
        x = audio_features
        if self.training:
            x = specaugment_v1_by_length(
                audio_features,
                time_min_num_masks=2,
                time_max_mask_per_n_frames=25,
                time_mask_max_size=20,
                freq_min_num_masks=2,
                freq_mask_max_size=5,
                freq_max_num_masks=10,
            )  # annotated as [B, T, F] by the original author

        # NOTE(review): audio_features_len defaults to None but is passed on
        # unconditionally - confirm that callers always provide it.
        sequence_mask = lengths_to_padding_mask(audio_features_len)
        conformer_out_list, total_utilised_layers, sequence_mask = self.conformer(
            x, sequence_mask, global_train_step, self.zero_out_kwargs
        )

        # The i-th encoder output is scored by the i-th linear head.
        log_probs_list = [
            torch.log_softmax(self.final_linear_list[i](encoder_out), dim=2)
            for i, encoder_out in enumerate(conformer_out_list)
        ]

        if not self.training:
            recog_idx = self.conformer.num_layers_set.index(self.recog_num_layers)
            return log_probs_list[recog_idx]

        return log_probs_list, total_utilised_layers, sequence_mask
layer_dropout_kwargs = network_args["layer_dropout_kwargs"] + recog_num_layers = network_args["recog_num_layers"] + zero_out_kwargs = network_args["zero_out_kwargs"] + layer_gate_activation = network_args["layer_gate_activation"] + + frontend_cfg = VGG4LayerActFrontendV1Config( + in_features=num_inputs, + conv1_channels=32, + conv2_channels=64, + conv3_channels=64, + conv4_channels=32, + conv_kernel_size=(3, 3), + conv_padding=None, + pool1_kernel_size=(2, 1), + pool1_stride=(2, 1), + pool1_padding=None, + pool2_kernel_size=(2, 1), + pool2_stride=(2, 1), + pool2_padding=None, + activation=torch.nn.ReLU(), + out_features=384, + ) + + frontend = ModuleFactoryV1(VGG4LayerActFrontendV1, frontend_cfg) + + ff_cfg = ConformerPositionwiseFeedForwardV1Config( + input_dim=384, + hidden_dim=1536, + dropout=dropout, + activation=torch.nn.SiLU(), + ) + + mhsa_cfg = ConformerMHSAV1Config( + input_dim=384, + num_att_heads=num_att_heads, + att_weights_dropout=att_weights_dropout, + dropout=dropout, + ) + + conv_cfg = ConformerConvolutionV1Config( + channels=384, + kernel_size=kernel_size, + dropout=dropout, + activation=torch.nn.SiLU(), + norm=LayerNormNC(384), + ) + + block_cfg = ConformerBlockConfig( + ff_cfg=ff_cfg, + mhsa_cfg=mhsa_cfg, + conv_cfg=conv_cfg, + layer_dropout=layer_dropout_kwargs["layer_dropout_stage_1"], + modules=["ff", "conv", "mhsa", "ff"], + scales=[0.5, 1.0, 1.0, 0.5] + ) + + + conformer_cfg = ConformerEncoderConfig( + num_layers=num_layers, + frontend=frontend, + block_cfg=block_cfg, + num_layers_set=num_layers_set, + layer_dropout_kwargs=layer_dropout_kwargs, + layer_gate_activation=torch.nn.Sigmoid() if layer_gate_activation=="sigmoid" else torch.nn.Identity() + ) + + return ConformerCTCConfig( + conformer_cfg=conformer_cfg, + target_size=num_outputs, + recog_num_layers=recog_num_layers, + zero_out_kwargs=zero_out_kwargs, + ) + + +def train_step(*, model: torch.nn.Module, extern_data: TensorDict, global_train_step, **kwargs): + audio_features = 
def export(*, model: torch.nn.Module, model_filename: str):
    """Export *model* to ONNX at *model_filename*.

    The model is switched to evaluation mode and traced with one dummy
    sequence of 30 frames and 50 features. Inputs are named ``data`` and
    ``data_len``, the output ``classes``.

    :param model: the module to export; its ``export_mode`` attribute is set to ``True``
    :param model_filename: target path of the ONNX file
    """
    model.export_mode = True
    dummy_data = torch.randn(1, 30, 50, device="cpu")
    dummy_data_len = torch.tensor([30], dtype=torch.int32)
    torch.onnx.export(
        model=model.eval(),
        args=(dummy_data, dummy_data_len),
        f=model_filename,
        verbose=True,
        input_names=["data", "data_len"],
        output_names=["classes"],
        opset_version=17,
        dynamic_axes={
            # dict value: manually named axes
            "data": {0: "batch", 1: "time"},
            "data_len": {0: "batch"},
            # Keys must be names from input_names/output_names. The previous
            # key "targets" matched neither, so the output had no dynamic axes.
            # The output gets its own time symbol because its length need not
            # equal the input length.
            "classes": {0: "batch", 1: "time_out"},
        },
    )
a/users/zeineldeen/experiments/canary_aed/nemo/run_eval.py +++ b/users/zeineldeen/experiments/canary_aed/nemo/run_eval.py @@ -122,6 +122,7 @@ def main(args): asr_model = ASRModel.restore_from(args.model_path, map_location=device) asr_model.freeze() + # TODO: how to set the num_workers? dataset = load_from_disk(args.dataset_path) if args.max_eval_samples is not None and args.max_eval_samples > 0: From 8a24d8fcef8c74f323178a9f0644821d0555a8f8 Mon Sep 17 00:00:00 2001 From: Mohammad Zeineldeen Date: Mon, 10 Jun 2024 15:52:15 +0000 Subject: [PATCH 142/227] update --- users/zeineldeen/experiments/canary_aed/nemo/run_eval.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/users/zeineldeen/experiments/canary_aed/nemo/run_eval.py b/users/zeineldeen/experiments/canary_aed/nemo/run_eval.py index 4c3d9d34e..3d65ae83f 100644 --- a/users/zeineldeen/experiments/canary_aed/nemo/run_eval.py +++ b/users/zeineldeen/experiments/canary_aed/nemo/run_eval.py @@ -144,7 +144,7 @@ def main(args): predictions.append(data_utils.normalizer(sample["pred_text"])) references.append(sample["reference"]) - # Write manifest results to args.manifest_path + # Write manifest results to args.manifest_path. 
This required modification in normalizer/eval_utils.py script manifest_path = data_utils.write_manifest( args.manifest_path, references, predictions, args.model_id, args.dataset_path, args.dataset, args.split ) From 01e4c4c927bc380635d709d479f49b10ecd82848 Mon Sep 17 00:00:00 2001 From: Mohammad Zeineldeen Date: Mon, 10 Jun 2024 19:09:28 +0200 Subject: [PATCH 143/227] register wer as out --- .../zeineldeen/experiments/canary_aed/configs/canary_1b_recog.py | 1 + 1 file changed, 1 insertion(+) diff --git a/users/zeineldeen/experiments/canary_aed/configs/canary_1b_recog.py b/users/zeineldeen/experiments/canary_aed/configs/canary_1b_recog.py index 02efab412..dbf8b786c 100644 --- a/users/zeineldeen/experiments/canary_aed/configs/canary_1b_recog.py +++ b/users/zeineldeen/experiments/canary_aed/configs/canary_1b_recog.py @@ -73,3 +73,4 @@ def py(): ) search_job.add_alias(f"canary_1b/{test_set}") tk.register_output(f"canary_1b/{test_set}/search_out", search_job.out_search_results) + tk.register_output(f"canary_1b/{test_set}/wer", search_job.out_wer) From 898609287c8268f25c60b688045e08c6dc25f126 Mon Sep 17 00:00:00 2001 From: Mohammad Zeineldeen Date: Tue, 11 Jun 2024 00:25:36 +0200 Subject: [PATCH 144/227] update --- .../experiments/canary_aed/configs/canary_1b_recog.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/users/zeineldeen/experiments/canary_aed/configs/canary_1b_recog.py b/users/zeineldeen/experiments/canary_aed/configs/canary_1b_recog.py index dbf8b786c..4f1c51c0f 100644 --- a/users/zeineldeen/experiments/canary_aed/configs/canary_1b_recog.py +++ b/users/zeineldeen/experiments/canary_aed/configs/canary_1b_recog.py @@ -64,13 +64,14 @@ def py(): dataset_name=test_set, split="test", search_script=search_script, - search_args={"batch_size": 1}, + search_args={"batch_size": 64}, python_exe=python_exe, device="gpu", time_rqmt=4, mem_rqmt=4, cpu_rqmt=2, ) - search_job.add_alias(f"canary_1b/{test_set}") - 
tk.register_output(f"canary_1b/{test_set}/search_out", search_job.out_search_results) - tk.register_output(f"canary_1b/{test_set}/wer", search_job.out_wer) + search_job.rqmt["sbatch_args"] = ["-p", "gpu_24gb"] + search_job.add_alias(f"canary_1b/{test_set}_bs64") + tk.register_output(f"canary_1b/{test_set}_bs64/search_out", search_job.out_search_results) + tk.register_output(f"canary_1b/{test_set}_bs64/wer", search_job.out_wer) From eddd357d7baacac30387f7ee58841128d3cc0ace Mon Sep 17 00:00:00 2001 From: Mohammad Zeineldeen Date: Tue, 11 Jun 2024 12:05:33 +0000 Subject: [PATCH 145/227] add libri test other test set --- .../canary_aed/configs/canary_1b_recog.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/users/zeineldeen/experiments/canary_aed/configs/canary_1b_recog.py b/users/zeineldeen/experiments/canary_aed/configs/canary_1b_recog.py index 4f1c51c0f..b6a601fac 100644 --- a/users/zeineldeen/experiments/canary_aed/configs/canary_1b_recog.py +++ b/users/zeineldeen/experiments/canary_aed/configs/canary_1b_recog.py @@ -6,7 +6,8 @@ from i6_experiments.users.zeineldeen.experiments.canary_aed.nemo.download import DownloadNemoModel from i6_experiments.users.zeineldeen.experiments.canary_aed.nemo.search import SearchJob -TEST_DATASETS = ["ami", "earnings22", "gigaspeech"] +TEST_DATASETS = {"ami": "test", "earnings22": "test", "gigaspeech": "test", "librispeech": "test.other"} + MODEL_ID = "nvidia/canary-1b" @@ -16,11 +17,11 @@ def download_test_datasets() -> Dict[str, tk.Path]: out_dirs = {} - for test_dataset in TEST_DATASETS: + for test_dataset, split in TEST_DATASETS.items(): j = DownloadAndPrepareHuggingFaceDatasetJob( path="open-asr-leaderboard/datasets-test-only", name=test_dataset, - split="test", + split=split, time_rqmt=24, mem_rqmt=4, cpu_rqmt=4, @@ -56,19 +57,19 @@ def py(): "/work/asr4/zeineldeen/setups-data/ubuntu_22_setups/2024-06-07--canary-aed/nemo_venv/bin/python3" ) - for test_set in TEST_DATASETS: + for test_set, split 
in TEST_DATASETS.items(): search_job = SearchJob( model_id=MODEL_ID, model_path=model_path, dataset_path=dataset_paths[test_set], dataset_name=test_set, - split="test", + split=split, search_script=search_script, search_args={"batch_size": 64}, python_exe=python_exe, device="gpu", time_rqmt=4, - mem_rqmt=4, + mem_rqmt=8, cpu_rqmt=2, ) search_job.rqmt["sbatch_args"] = ["-p", "gpu_24gb"] From bc8e182b56232fe8a37f4786db893565469b8742 Mon Sep 17 00:00:00 2001 From: Mohammad Zeineldeen Date: Tue, 11 Jun 2024 12:21:42 +0000 Subject: [PATCH 146/227] fix args --- .../experiments/canary_aed/configs/canary_1b_recog.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/users/zeineldeen/experiments/canary_aed/configs/canary_1b_recog.py b/users/zeineldeen/experiments/canary_aed/configs/canary_1b_recog.py index b6a601fac..27e55da77 100644 --- a/users/zeineldeen/experiments/canary_aed/configs/canary_1b_recog.py +++ b/users/zeineldeen/experiments/canary_aed/configs/canary_1b_recog.py @@ -65,14 +65,14 @@ def py(): dataset_name=test_set, split=split, search_script=search_script, - search_args={"batch_size": 64}, + search_args={"batch_size": 64, "pcn": False, "max_eval_samples": -1}, python_exe=python_exe, device="gpu", - time_rqmt=4, + time_rqmt=24, mem_rqmt=8, cpu_rqmt=2, ) search_job.rqmt["sbatch_args"] = ["-p", "gpu_24gb"] - search_job.add_alias(f"canary_1b/{test_set}_bs64") - tk.register_output(f"canary_1b/{test_set}_bs64/search_out", search_job.out_search_results) - tk.register_output(f"canary_1b/{test_set}_bs64/wer", search_job.out_wer) + search_job.add_alias(f"canary_1b/{test_set}_bs64_wo-pcn") + tk.register_output(f"canary_1b/{test_set}_bs64_wo-pcn/search_out", search_job.out_search_results) + tk.register_output(f"canary_1b/{test_set}_bs64_wo-pcn/wer", search_job.out_wer) From 108fa7dad8b9a34befb46fa0694d11dd8a8addaa Mon Sep 17 00:00:00 2001 From: Mohammad Zeineldeen Date: Tue, 11 Jun 2024 12:26:30 +0000 Subject: [PATCH 147/227] fix args --- 
.../experiments/canary_aed/configs/canary_1b_recog.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/users/zeineldeen/experiments/canary_aed/configs/canary_1b_recog.py b/users/zeineldeen/experiments/canary_aed/configs/canary_1b_recog.py index 27e55da77..8381a546c 100644 --- a/users/zeineldeen/experiments/canary_aed/configs/canary_1b_recog.py +++ b/users/zeineldeen/experiments/canary_aed/configs/canary_1b_recog.py @@ -65,7 +65,7 @@ def py(): dataset_name=test_set, split=split, search_script=search_script, - search_args={"batch_size": 64, "pcn": False, "max_eval_samples": -1}, + search_args={"batch_size": 64, "pnc": False, "max_eval_samples": -1}, python_exe=python_exe, device="gpu", time_rqmt=24, @@ -73,6 +73,6 @@ def py(): cpu_rqmt=2, ) search_job.rqmt["sbatch_args"] = ["-p", "gpu_24gb"] - search_job.add_alias(f"canary_1b/{test_set}_bs64_wo-pcn") - tk.register_output(f"canary_1b/{test_set}_bs64_wo-pcn/search_out", search_job.out_search_results) - tk.register_output(f"canary_1b/{test_set}_bs64_wo-pcn/wer", search_job.out_wer) + search_job.add_alias(f"canary_1b/{test_set}_bs64_wo-pnc") + tk.register_output(f"canary_1b/{test_set}_bs64_wo-pnc/search_out", search_job.out_search_results) + tk.register_output(f"canary_1b/{test_set}_bs64_wo-pnc/wer", search_job.out_wer) From 0b839fffc6150e709540a33551200ed7185f334c Mon Sep 17 00:00:00 2001 From: Mohammad Zeineldeen Date: Wed, 12 Jun 2024 17:54:12 +0200 Subject: [PATCH 148/227] update --- .../canary_aed/configs/canary_1b_recog.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/users/zeineldeen/experiments/canary_aed/configs/canary_1b_recog.py b/users/zeineldeen/experiments/canary_aed/configs/canary_1b_recog.py index 8381a546c..a01b0da88 100644 --- a/users/zeineldeen/experiments/canary_aed/configs/canary_1b_recog.py +++ b/users/zeineldeen/experiments/canary_aed/configs/canary_1b_recog.py @@ -53,10 +53,22 @@ def py(): 
"/u/zeineldeen/setups/ubuntu_22_setups/2024-06-07--canary-aed/recipe/i6_experiments/users/zeineldeen/experiments/canary_aed/nemo/run_eval.py", hash_overwrite="run_eval_v1", ) + # to run canary model, this env has installed nemo toolkit with: + # pip3 install git+https://github.com/NVIDIA/NeMo.git@r2.0.0rc0#egg=nemo_toolkit[all] + # related issue: https://github.com/huggingface/open_asr_leaderboard/issues/26 python_exe = tk.Path( "/work/asr4/zeineldeen/setups-data/ubuntu_22_setups/2024-06-07--canary-aed/nemo_venv/bin/python3" ) + # Greedy decoding: + # + # testset | ours | huggingface + # ----------------------------------- + # libri | 2.95 | 2.94 + # ami | 13.96 | 14.0 + # earnings22 | 12.23 | 12.25 + # gigaspeech | 10.14 | 10.19 + for test_set, split in TEST_DATASETS.items(): search_job = SearchJob( model_id=MODEL_ID, From a6a4368c81c000e298bce7d74811eb2ef46a123a Mon Sep 17 00:00:00 2001 From: Mohammad Zeineldeen Date: Wed, 12 Jun 2024 18:37:51 +0200 Subject: [PATCH 149/227] add modified normalized --- .../canary_aed/nemo/normalizer/__init__.py | 1 + .../canary_aed/nemo/normalizer/data_utils.py | 59 + .../nemo/normalizer/english_abbreviations.py | 1743 +++++++++++++++++ .../canary_aed/nemo/normalizer/eval_utils.py | 157 ++ .../canary_aed/nemo/normalizer/normalizer.py | 596 ++++++ 5 files changed, 2556 insertions(+) create mode 100644 users/zeineldeen/experiments/canary_aed/nemo/normalizer/__init__.py create mode 100644 users/zeineldeen/experiments/canary_aed/nemo/normalizer/data_utils.py create mode 100644 users/zeineldeen/experiments/canary_aed/nemo/normalizer/english_abbreviations.py create mode 100644 users/zeineldeen/experiments/canary_aed/nemo/normalizer/eval_utils.py create mode 100644 users/zeineldeen/experiments/canary_aed/nemo/normalizer/normalizer.py diff --git a/users/zeineldeen/experiments/canary_aed/nemo/normalizer/__init__.py b/users/zeineldeen/experiments/canary_aed/nemo/normalizer/__init__.py new file mode 100644 index 000000000..7b3024108 --- 
def get_text(sample):
    """Return the transcript of *sample* from the first known text column.

    The columns are tried in this order: ``text``, ``sentence``,
    ``normalized_text``, ``transcript``, ``transcription``.

    :param sample: mapping from column name to value
    :return: the value stored under the first matching column
    :raises ValueError: if none of the known text columns is present
    """
    text_columns = ("text", "sentence", "normalized_text", "transcript", "transcription")
    for column in text_columns:
        if column in sample:
            return sample[column]

    # The original message printed ".join{sample.keys()}" literally, because the
    # join sat in a plain string next to the f-string. It also did not mention
    # the accepted 'transcription' column.
    available_columns = ", ".join(map(str, sample.keys()))
    raise ValueError(
        "Expected transcript column of either 'text', 'sentence', 'normalized_text', "
        "'transcript' or 'transcription'. "
        f"Got sample with columns: {available_columns}. "
        "Ensure a text column name is present in the dataset."
    )
"amortizable", + "amortisation": "amortization", + "amortisations": "amortizations", + "amortise": "amortize", + "amortised": "amortized", + "amortises": "amortizes", + "amortising": "amortizing", + "amphitheatre": "amphitheater", + "amphitheatres": "amphitheaters", + "anaemia": "anemia", + "anaemic": "anemic", + "anaesthesia": "anesthesia", + "anaesthetic": "anesthetic", + "anaesthetics": "anesthetics", + "anaesthetise": "anesthetize", + "anaesthetised": "anesthetized", + "anaesthetises": "anesthetizes", + "anaesthetising": "anesthetizing", + "anaesthetist": "anesthetist", + "anaesthetists": "anesthetists", + "anaesthetize": "anesthetize", + "anaesthetized": "anesthetized", + "anaesthetizes": "anesthetizes", + "anaesthetizing": "anesthetizing", + "analogue": "analog", + "analogues": "analogs", + "analyse": "analyze", + "analysed": "analyzed", + "analyses": "analyzes", + "analysing": "analyzing", + "anglicise": "anglicize", + "anglicised": "anglicized", + "anglicises": "anglicizes", + "anglicising": "anglicizing", + "annualised": "annualized", + "antagonise": "antagonize", + "antagonised": "antagonized", + "antagonises": "antagonizes", + "antagonising": "antagonizing", + "apologise": "apologize", + "apologised": "apologized", + "apologises": "apologizes", + "apologising": "apologizing", + "appal": "appall", + "appals": "appalls", + "appetiser": "appetizer", + "appetisers": "appetizers", + "appetising": "appetizing", + "appetisingly": "appetizingly", + "arbour": "arbor", + "arbours": "arbors", + "archaeologically": "archeologically", + "archaeologist": "archeologist", + "archaeologists": "archeologists", + "archaeology": "archeology", + "archeological": "archaeological", + "ardour": "ardor", + "armour": "armor", + "armoured": "armored", + "armourer": "armorer", + "armourers": "armorers", + "armouries": "armories", + "armoury": "armory", + "artefact": "artifact", + "artefacts": "artifacts", + "authorise": "authorize", + "authorised": "authorized", + "authorises": 
"authorizes", + "authorising": "authorizing", + "axe": "ax", + "backpedalled": "backpedaled", + "backpedalling": "backpedaling", + "bannister": "banister", + "bannisters": "banisters", + "baptise": "baptize", + "baptised": "baptized", + "baptises": "baptizes", + "baptising": "baptizing", + "bastardise": "bastardize", + "bastardised": "bastardized", + "bastardises": "bastardizes", + "bastardising": "bastardizing", + "battleax": "battleaxe", + "baulk": "balk", + "baulked": "balked", + "baulking": "balking", + "baulks": "balks", + "bedevilled": "bedeviled", + "bedevilling": "bedeviling", + "behaviour": "behavior", + "behavioural": "behavioral", + "behaviourism": "behaviorism", + "behaviourist": "behaviorist", + "behaviourists": "behaviorists", + "behaviours": "behaviors", + "behove": "behoove", + "behoved": "behooved", + "behoves": "behooves", + "bejewelled": "bejeweled", + "belabour": "belabor", + "belaboured": "belabored", + "belabouring": "belaboring", + "belabours": "belabors", + "bevelled": "beveled", + "bevvies": "bevies", + "bevvy": "bevy", + "biassed": "biased", + "biassing": "biasing", + "bingeing": "binging", + "bougainvillaea": "bougainvillea", + "bougainvillaeas": "bougainvilleas", + "bowdlerise": "bowdlerize", + "bowdlerised": "bowdlerized", + "bowdlerises": "bowdlerizes", + "bowdlerising": "bowdlerizing", + "breathalyse": "breathalyze", + "breathalysed": "breathalyzed", + "breathalyser": "breathalyzer", + "breathalysers": "breathalyzers", + "breathalyses": "breathalyzes", + "breathalysing": "breathalyzing", + "brutalise": "brutalize", + "brutalised": "brutalized", + "brutalises": "brutalizes", + "brutalising": "brutalizing", + "busses": "buses", + "bussing": "busing", + "caesarean": "cesarean", + "caesareans": "cesareans", + "calibre": "caliber", + "calibres": "calibers", + "calliper": "caliper", + "callipers": "calipers", + "callisthenics": "calisthenics", + "canalise": "canalize", + "canalised": "canalized", + "canalises": "canalizes", + "canalising": 
"canalizing", + "cancelation": "cancellation", + "cancelations": "cancellations", + "cancelled": "canceled", + "cancelling": "canceling", + "candour": "candor", + "cannibalise": "cannibalize", + "cannibalised": "cannibalized", + "cannibalises": "cannibalizes", + "cannibalising": "cannibalizing", + "canonise": "canonize", + "canonised": "canonized", + "canonises": "canonizes", + "canonising": "canonizing", + "capitalise": "capitalize", + "capitalised": "capitalized", + "capitalises": "capitalizes", + "capitalising": "capitalizing", + "caramelise": "caramelize", + "caramelised": "caramelized", + "caramelises": "caramelizes", + "caramelising": "caramelizing", + "carbonise": "carbonize", + "carbonised": "carbonized", + "carbonises": "carbonizes", + "carbonising": "carbonizing", + "carolled": "caroled", + "carolling": "caroling", + "catalogue": "catalog", + "catalogued": "cataloged", + "catalogues": "catalogs", + "cataloguing": "cataloging", + "catalyse": "catalyze", + "catalysed": "catalyzed", + "catalyses": "catalyzes", + "catalysing": "catalyzing", + "categorise": "categorize", + "categorised": "categorized", + "categorises": "categorizes", + "categorising": "categorizing", + "cauterise": "cauterize", + "cauterised": "cauterized", + "cauterises": "cauterizes", + "cauterising": "cauterizing", + "cavilled": "caviled", + "cavilling": "caviling", + "centigramme": "centigram", + "centigrammes": "centigrams", + "centilitre": "centiliter", + "centilitres": "centiliters", + "centimetre": "centimeter", + "centimetres": "centimeters", + "centralise": "centralize", + "centralised": "centralized", + "centralises": "centralizes", + "centralising": "centralizing", + "centre": "center", + "centred": "centered", + "centrefold": "centerfold", + "centrefolds": "centerfolds", + "centrepiece": "centerpiece", + "centrepieces": "centerpieces", + "centres": "centers", + "channelled": "channeled", + "channelling": "channeling", + "characterise": "characterize", + "characterised": 
"characterized", + "characterises": "characterizes", + "characterising": "characterizing", + "cheque": "check", + "chequebook": "checkbook", + "chequebooks": "checkbooks", + "chequered": "checkered", + "cheques": "checks", + "chilli": "chili", + "chimaera": "chimera", + "chimaeras": "chimeras", + "chiselled": "chiseled", + "chiselling": "chiseling", + "circularise": "circularize", + "circularised": "circularized", + "circularises": "circularizes", + "circularising": "circularizing", + "civilise": "civilize", + "civilised": "civilized", + "civilises": "civilizes", + "civilising": "civilizing", + "clamour": "clamor", + "clamoured": "clamored", + "clamouring": "clamoring", + "clamours": "clamors", + "clangour": "clangor", + "clarinettist": "clarinetist", + "clarinettists": "clarinetists", + "collectivise": "collectivize", + "collectivised": "collectivized", + "collectivises": "collectivizes", + "collectivising": "collectivizing", + "colonisation": "colonization", + "colonise": "colonize", + "colonised": "colonized", + "coloniser": "colonizer", + "colonisers": "colonizers", + "colonises": "colonizes", + "colonising": "colonizing", + "colour": "color", + "colourant": "colorant", + "colourants": "colorants", + "coloured": "colored", + "coloureds": "coloreds", + "colourful": "colorful", + "colourfully": "colorfully", + "colouring": "coloring", + "colourize": "colorize", + "colourized": "colorized", + "colourizes": "colorizes", + "colourizing": "colorizing", + "colourless": "colorless", + "colours": "colors", + "commercialise": "commercialize", + "commercialised": "commercialized", + "commercialises": "commercializes", + "commercialising": "commercializing", + "compartmentalise": "compartmentalize", + "compartmentalised": "compartmentalized", + "compartmentalises": "compartmentalizes", + "compartmentalising": "compartmentalizing", + "computerise": "computerize", + "computerised": "computerized", + "computerises": "computerizes", + "computerising": "computerizing", + 
"conceptualise": "conceptualize", + "conceptualised": "conceptualized", + "conceptualises": "conceptualizes", + "conceptualising": "conceptualizing", + "connexion": "connection", + "connexions": "connections", + "contextualise": "contextualize", + "contextualised": "contextualized", + "contextualises": "contextualizes", + "contextualising": "contextualizing", + "cosier": "cozier", + "cosies": "cozies", + "cosiest": "coziest", + "cosily": "cozily", + "cosiness": "coziness", + "cosy": "cozy", + "councillor": "councilor", + "councillors": "councilors", + "counselled": "counseled", + "counselling": "counseling", + "counsellor": "counselor", + "counsellors": "counselors", + "crenelated": "crenellated", + "criminalise": "criminalize", + "criminalised": "criminalized", + "criminalises": "criminalizes", + "criminalising": "criminalizing", + "criticise": "criticize", + "criticised": "criticized", + "criticises": "criticizes", + "criticising": "criticizing", + "crueller": "crueler", + "cruellest": "cruelest", + "crystallisation": "crystallization", + "crystallise": "crystallize", + "crystallised": "crystallized", + "crystallises": "crystallizes", + "crystallising": "crystallizing", + "cudgelled": "cudgeled", + "cudgelling": "cudgeling", + "customise": "customize", + "customised": "customized", + "customises": "customizes", + "customising": "customizing", + "cypher": "cipher", + "cyphers": "ciphers", + "decentralisation": "decentralization", + "decentralise": "decentralize", + "decentralised": "decentralized", + "decentralises": "decentralizes", + "decentralising": "decentralizing", + "decriminalisation": "decriminalization", + "decriminalise": "decriminalize", + "decriminalised": "decriminalized", + "decriminalises": "decriminalizes", + "decriminalising": "decriminalizing", + "defence": "defense", + "defenceless": "defenseless", + "defences": "defenses", + "dehumanisation": "dehumanization", + "dehumanise": "dehumanize", + "dehumanised": "dehumanized", + "dehumanises": 
"dehumanizes", + "dehumanising": "dehumanizing", + "demeanour": "demeanor", + "demilitarisation": "demilitarization", + "demilitarise": "demilitarize", + "demilitarised": "demilitarized", + "demilitarises": "demilitarizes", + "demilitarising": "demilitarizing", + "demobilisation": "demobilization", + "demobilise": "demobilize", + "demobilised": "demobilized", + "demobilises": "demobilizes", + "demobilising": "demobilizing", + "democratisation": "democratization", + "democratise": "democratize", + "democratised": "democratized", + "democratises": "democratizes", + "democratising": "democratizing", + "demonise": "demonize", + "demonised": "demonized", + "demonises": "demonizes", + "demonising": "demonizing", + "demoralisation": "demoralization", + "demoralise": "demoralize", + "demoralised": "demoralized", + "demoralises": "demoralizes", + "demoralising": "demoralizing", + "denationalisation": "denationalization", + "denationalise": "denationalize", + "denationalised": "denationalized", + "denationalises": "denationalizes", + "denationalising": "denationalizing", + "deodorise": "deodorize", + "deodorised": "deodorized", + "deodorises": "deodorizes", + "deodorising": "deodorizing", + "depersonalise": "depersonalize", + "depersonalised": "depersonalized", + "depersonalises": "depersonalizes", + "depersonalising": "depersonalizing", + "deputise": "deputize", + "deputised": "deputized", + "deputises": "deputizes", + "deputising": "deputizing", + "desensitisation": "desensitization", + "desensitise": "desensitize", + "desensitised": "desensitized", + "desensitises": "desensitizes", + "desensitising": "desensitizing", + "destabilisation": "destabilization", + "destabilise": "destabilize", + "destabilised": "destabilized", + "destabilises": "destabilizes", + "destabilising": "destabilizing", + "dialled": "dialed", + "dialling": "dialing", + "dialogue": "dialog", + "dialogues": "dialogs", + "diarrhoea": "diarrhea", + "digitise": "digitize", + "digitised": "digitized", + 
"digitises": "digitizes", + "digitising": "digitizing", + "disc": "disk", + "discolour": "discolor", + "discoloured": "discolored", + "discolouring": "discoloring", + "discolours": "discolors", + "discs": "disks", + "disembowelled": "disemboweled", + "disembowelling": "disemboweling", + "disfavour": "disfavor", + "dishevelled": "disheveled", + "dishonour": "dishonor", + "dishonourable": "dishonorable", + "dishonourably": "dishonorably", + "dishonoured": "dishonored", + "dishonouring": "dishonoring", + "dishonours": "dishonors", + "disorganisation": "disorganization", + "disorganised": "disorganized", + "distil": "distill", + "distils": "distills", + "dramatisation": "dramatization", + "dramatisations": "dramatizations", + "dramatise": "dramatize", + "dramatised": "dramatized", + "dramatises": "dramatizes", + "dramatising": "dramatizing", + "draught": "draft", + "draughtboard": "draftboard", + "draughtboards": "draftboards", + "draughtier": "draftier", + "draughtiest": "draftiest", + "draughts": "drafts", + "draughtsman": "draftsman", + "draughtsmanship": "draftsmanship", + "draughtsmen": "draftsmen", + "draughtswoman": "draftswoman", + "draughtswomen": "draftswomen", + "draughty": "drafty", + "drivelled": "driveled", + "drivelling": "driveling", + "duelled": "dueled", + "duelling": "dueling", + "economise": "economize", + "economised": "economized", + "economises": "economizes", + "economising": "economizing", + "editorialise": "editorialize", + "editorialised": "editorialized", + "editorialises": "editorializes", + "editorialising": "editorializing", + "edoema": "edema", + "empathise": "empathize", + "empathised": "empathized", + "empathises": "empathizes", + "empathising": "empathizing", + "emphasise": "emphasize", + "emphasised": "emphasized", + "emphasises": "emphasizes", + "emphasising": "emphasizing", + "enamelled": "enameled", + "enamelling": "enameling", + "enamoured": "enamored", + "encyclopaedia": "encyclopedia", + "encyclopaedias": "encyclopedias", + 
"encyclopaedic": "encyclopedic", + "endeavour": "endeavor", + "endeavoured": "endeavored", + "endeavouring": "endeavoring", + "endeavours": "endeavors", + "energise": "energize", + "energised": "energized", + "energises": "energizes", + "energising": "energizing", + "enrol": "enroll", + "enrols": "enrolls", + "enthral": "enthrall", + "enthrals": "enthralls", + "epaulette": "epaulet", + "epaulettes": "epaulets", + "epicentre": "epicenter", + "epicentres": "epicenters", + "epilogue": "epilog", + "epilogues": "epilogs", + "epitomise": "epitomize", + "epitomised": "epitomized", + "epitomises": "epitomizes", + "epitomising": "epitomizing", + "equalisation": "equalization", + "equalise": "equalize", + "equalised": "equalized", + "equaliser": "equalizer", + "equalisers": "equalizers", + "equalises": "equalizes", + "equalising": "equalizing", + "eulogise": "eulogize", + "eulogised": "eulogized", + "eulogises": "eulogizes", + "eulogising": "eulogizing", + "evangelise": "evangelize", + "evangelised": "evangelized", + "evangelises": "evangelizes", + "evangelising": "evangelizing", + "exorcise": "exorcize", + "exorcised": "exorcized", + "exorcises": "exorcizes", + "exorcising": "exorcizing", + "extemporisation": "extemporization", + "extemporise": "extemporize", + "extemporised": "extemporized", + "extemporises": "extemporizes", + "extemporising": "extemporizing", + "externalisation": "externalization", + "externalisations": "externalizations", + "externalise": "externalize", + "externalised": "externalized", + "externalises": "externalizes", + "externalising": "externalizing", + "factorise": "factorize", + "factorised": "factorized", + "factorises": "factorizes", + "factorising": "factorizing", + "faecal": "fecal", + "faeces": "feces", + "familiarisation": "familiarization", + "familiarise": "familiarize", + "familiarised": "familiarized", + "familiarises": "familiarizes", + "familiarising": "familiarizing", + "fantasise": "fantasize", + "fantasised": "fantasized", + 
"fantasises": "fantasizes", + "fantasising": "fantasizing", + "favour": "favor", + "favourable": "favorable", + "favourably": "favorably", + "favoured": "favored", + "favouring": "favoring", + "favourite": "favorite", + "favourites": "favorites", + "favouritism": "favoritism", + "favours": "favors", + "feminise": "feminize", + "feminised": "feminized", + "feminises": "feminizes", + "feminising": "feminizing", + "fertilisation": "fertilization", + "fertilise": "fertilize", + "fertilised": "fertilized", + "fertiliser": "fertilizer", + "fertilisers": "fertilizers", + "fertilises": "fertilizes", + "fertilising": "fertilizing", + "fervour": "fervor", + "fibre": "fiber", + "fibreglass": "fiberglass", + "fibres": "fibers", + "fictionalisation": "fictionalization", + "fictionalisations": "fictionalizations", + "fictionalise": "fictionalize", + "fictionalised": "fictionalized", + "fictionalises": "fictionalizes", + "fictionalising": "fictionalizing", + "fillet": "filet", + "filleted": "fileted", + "filleting": "fileting", + "fillets": "filets", + "finalisation": "finalization", + "finalise": "finalize", + "finalised": "finalized", + "finalises": "finalizes", + "finalising": "finalizing", + "flautist": "flutist", + "flautists": "flutists", + "flavour": "flavor", + "flavoured": "flavored", + "flavouring": "flavoring", + "flavourings": "flavorings", + "flavourless": "flavorless", + "flavours": "flavors", + "flavoursome": "flavorsome", + "flyer / flier": "flier / flyer", + "foetal": "fetal", + "foetid": "fetid", + "foetus": "fetus", + "foetuses": "fetuses", + "formalisation": "formalization", + "formalise": "formalize", + "formalised": "formalized", + "formalises": "formalizes", + "formalising": "formalizing", + "fossilisation": "fossilization", + "fossilise": "fossilize", + "fossilised": "fossilized", + "fossilises": "fossilizes", + "fossilising": "fossilizing", + "fraternisation": "fraternization", + "fraternise": "fraternize", + "fraternised": "fraternized", + "fraternises": 
"fraternizes", + "fraternising": "fraternizing", + "fulfil": "fulfill", + "fulfilment": "fulfillment", + "fulfils": "fulfills", + "funnelled": "funneled", + "funnelling": "funneling", + "gage": "gauge", + "gaged": "gauged", + "gages": "gauges", + "gaging": "gauging", + "galvanise": "galvanize", + "galvanised": "galvanized", + "galvanises": "galvanizes", + "galvanising": "galvanizing", + "gambolled": "gamboled", + "gambolling": "gamboling", + "gaol": "jail", + "gaolbird": "jailbird", + "gaolbirds": "jailbirds", + "gaolbreak": "jailbreak", + "gaolbreaks": "jailbreaks", + "gaoled": "jailed", + "gaoler": "jailer", + "gaolers": "jailers", + "gaoling": "jailing", + "gaols": "jails", + "gasses": "gases", + "generalisation": "generalization", + "generalisations": "generalizations", + "generalise": "generalize", + "generalised": "generalized", + "generalises": "generalizes", + "generalising": "generalizing", + "ghettoise": "ghettoize", + "ghettoised": "ghettoized", + "ghettoises": "ghettoizes", + "ghettoising": "ghettoizing", + "gipsies": "gypsies", + "glamor": "glamour", + "glamorise": "glamorize", + "glamorised": "glamorized", + "glamorises": "glamorizes", + "glamorising": "glamorizing", + "globalisation": "globalization", + "globalise": "globalize", + "globalised": "globalized", + "globalises": "globalizes", + "globalising": "globalizing", + "glueing": "gluing", + "goitre": "goiter", + "goitres": "goiters", + "gonorrhoea": "gonorrhea", + "gramme": "gram", + "grammes": "grams", + "gravelled": "graveled", + "grey": "gray", + "greyed": "grayed", + "greying": "graying", + "greyish": "grayish", + "greyness": "grayness", + "greys": "grays", + "grovelled": "groveled", + "grovelling": "groveling", + "groyne": "groin", + "groynes": "groins", + "gruelling": "grueling", + "gruellingly": "gruelingly", + "gryphon": "griffin", + "gryphons": "griffins", + "gynaecological": "gynecological", + "gynaecologist": "gynecologist", + "gynaecologists": "gynecologists", + "gynaecology": 
"gynecology", + "haematological": "hematological", + "haematologist": "hematologist", + "haematologists": "hematologists", + "haematology": "hematology", + "haemoglobin": "hemoglobin", + "haemophilia": "hemophilia", + "haemophiliac": "hemophiliac", + "haemophiliacs": "hemophiliacs", + "haemorrhage": "hemorrhage", + "haemorrhaged": "hemorrhaged", + "haemorrhages": "hemorrhages", + "haemorrhaging": "hemorrhaging", + "haemorrhoids": "hemorrhoids", + "harbour": "harbor", + "harboured": "harbored", + "harbouring": "harboring", + "harbours": "harbors", + "harmonisation": "harmonization", + "harmonise": "harmonize", + "harmonised": "harmonized", + "harmonises": "harmonizes", + "harmonising": "harmonizing", + "homoeopath": "homeopath", + "homoeopathic": "homeopathic", + "homoeopaths": "homeopaths", + "homoeopathy": "homeopathy", + "homogenise": "homogenize", + "homogenised": "homogenized", + "homogenises": "homogenizes", + "homogenising": "homogenizing", + "honour": "honor", + "honourable": "honorable", + "honourably": "honorably", + "honoured": "honored", + "honouring": "honoring", + "honours": "honors", + "hospitalisation": "hospitalization", + "hospitalise": "hospitalize", + "hospitalised": "hospitalized", + "hospitalises": "hospitalizes", + "hospitalising": "hospitalizing", + "humanise": "humanize", + "humanised": "humanized", + "humanises": "humanizes", + "humanising": "humanizing", + "humour": "humor", + "humoured": "humored", + "humouring": "humoring", + "humourless": "humorless", + "humours": "humors", + "hybridise": "hybridize", + "hybridised": "hybridized", + "hybridises": "hybridizes", + "hybridising": "hybridizing", + "hypnotise": "hypnotize", + "hypnotised": "hypnotized", + "hypnotises": "hypnotizes", + "hypnotising": "hypnotizing", + "hypothesise": "hypothesize", + "hypothesised": "hypothesized", + "hypothesises": "hypothesizes", + "hypothesising": "hypothesizing", + "idealisation": "idealization", + "idealise": "idealize", + "idealised": "idealized", + 
"idealises": "idealizes", + "idealising": "idealizing", + "idolise": "idolize", + "idolised": "idolized", + "idolises": "idolizes", + "idolising": "idolizing", + "immobilisation": "immobilization", + "immobilise": "immobilize", + "immobilised": "immobilized", + "immobiliser": "immobilizer", + "immobilisers": "immobilizers", + "immobilises": "immobilizes", + "immobilising": "immobilizing", + "immortalise": "immortalize", + "immortalised": "immortalized", + "immortalises": "immortalizes", + "immortalising": "immortalizing", + "immunisation": "immunization", + "immunise": "immunize", + "immunised": "immunized", + "immunises": "immunizes", + "immunising": "immunizing", + "impanelled": "impaneled", + "impanelling": "impaneling", + "imperilled": "imperiled", + "imperilling": "imperiling", + "individualise": "individualize", + "individualised": "individualized", + "individualises": "individualizes", + "individualising": "individualizing", + "industrialise": "industrialize", + "industrialised": "industrialized", + "industrialises": "industrializes", + "industrialising": "industrializing", + "inflexion": "inflection", + "inflexions": "inflections", + "initialise": "initialize", + "initialised": "initialized", + "initialises": "initializes", + "initialising": "initializing", + "initialled": "initialed", + "initialling": "initialing", + "instal": "install", + "instalment": "installment", + "instalments": "installments", + "instals": "installs", + "instil": "instill", + "instils": "instills", + "institutionalisation": "institutionalization", + "institutionalise": "institutionalize", + "institutionalised": "institutionalized", + "institutionalises": "institutionalizes", + "institutionalising": "institutionalizing", + "intellectualise": "intellectualize", + "intellectualised": "intellectualized", + "intellectualises": "intellectualizes", + "intellectualising": "intellectualizing", + "internalisation": "internalization", + "internalise": "internalize", + "internalised": 
"internalized", + "internalises": "internalizes", + "internalising": "internalizing", + "internationalisation": "internationalization", + "internationalise": "internationalize", + "internationalised": "internationalized", + "internationalises": "internationalizes", + "internationalising": "internationalizing", + "ionisation": "ionization", + "ionise": "ionize", + "ionised": "ionized", + "ioniser": "ionizer", + "ionisers": "ionizers", + "ionises": "ionizes", + "ionising": "ionizing", + "italicise": "italicize", + "italicised": "italicized", + "italicises": "italicizes", + "italicising": "italicizing", + "itemise": "itemize", + "itemised": "itemized", + "itemises": "itemizes", + "itemising": "itemizing", + "jeopardise": "jeopardize", + "jeopardised": "jeopardized", + "jeopardises": "jeopardizes", + "jeopardising": "jeopardizing", + "jewelled": "jeweled", + "jeweller": "jeweler", + "jewellers": "jewelers", + "jewellery": "jewelry", + "judgement": "judgment", + "kilogramme": "kilogram", + "kilogrammes": "kilograms", + "kilometre": "kilometer", + "kilometres": "kilometers", + "labelled": "labeled", + "labelling": "labeling", + "labour": "labor", + "laboured": "labored", + "labourer": "laborer", + "labourers": "laborers", + "labouring": "laboring", + "labours": "labors", + "lacklustre": "lackluster", + "legalisation": "legalization", + "legalise": "legalize", + "legalised": "legalized", + "legalises": "legalizes", + "legalising": "legalizing", + "legitimise": "legitimize", + "legitimised": "legitimized", + "legitimises": "legitimizes", + "legitimising": "legitimizing", + "leukaemia": "leukemia", + "levelled": "leveled", + "leveller": "leveler", + "levellers": "levelers", + "levelling": "leveling", + "libelled": "libeled", + "libelling": "libeling", + "libellous": "libelous", + "liberalisation": "liberalization", + "liberalise": "liberalize", + "liberalised": "liberalized", + "liberalises": "liberalizes", + "liberalising": "liberalizing", + "licence": "license", + 
"licenced": "licensed", + "licences": "licenses", + "licencing": "licensing", + "likeable": "likable", + "lionisation": "lionization", + "lionise": "lionize", + "lionised": "lionized", + "lionises": "lionizes", + "lionising": "lionizing", + "liquidise": "liquidize", + "liquidised": "liquidized", + "liquidiser": "liquidizer", + "liquidisers": "liquidizers", + "liquidises": "liquidizes", + "liquidising": "liquidizing", + "litre": "liter", + "litres": "liters", + "localise": "localize", + "localised": "localized", + "localises": "localizes", + "localising": "localizing", + "louvre": "louver", + "louvred": "louvered", + "louvres": "louvers", + "lustre": "luster", + "magnetise": "magnetize", + "magnetised": "magnetized", + "magnetises": "magnetizes", + "magnetising": "magnetizing", + "manoeuvrability": "maneuverability", + "manoeuvrable": "maneuverable", + "manoeuvre": "maneuver", + "manoeuvred": "maneuvered", + "manoeuvres": "maneuvers", + "manoeuvring": "maneuvering", + "manoeuvrings": "maneuverings", + "marginalisation": "marginalization", + "marginalise": "marginalize", + "marginalised": "marginalized", + "marginalises": "marginalizes", + "marginalising": "marginalizing", + "marshalled": "marshaled", + "marshalling": "marshaling", + "marvelled": "marveled", + "marvelling": "marveling", + "marvellous": "marvelous", + "marvellously": "marvelously", + "materialisation": "materialization", + "materialise": "materialize", + "materialised": "materialized", + "materialises": "materializes", + "materialising": "materializing", + "maximisation": "maximization", + "maximise": "maximize", + "maximised": "maximized", + "maximises": "maximizes", + "maximising": "maximizing", + "meagre": "meager", + "mechanisation": "mechanization", + "mechanise": "mechanize", + "mechanised": "mechanized", + "mechanises": "mechanizes", + "mechanising": "mechanizing", + "mediaeval": "medieval", + "memorialise": "memorialize", + "memorialised": "memorialized", + "memorialises": "memorializes", + 
"memorialising": "memorializing", + "memorise": "memorize", + "memorised": "memorized", + "memorises": "memorizes", + "memorising": "memorizing", + "mesmerise": "mesmerize", + "mesmerised": "mesmerized", + "mesmerises": "mesmerizes", + "mesmerising": "mesmerizing", + "metabolise": "metabolize", + "metabolised": "metabolized", + "metabolises": "metabolizes", + "metabolising": "metabolizing", + "metre": "meter", + "metres": "meters", + "mhm": "hmm", + "micrometre": "micrometer", + "micrometres": "micrometers", + "militarise": "militarize", + "militarised": "militarized", + "militarises": "militarizes", + "militarising": "militarizing", + "milligramme": "milligram", + "milligrammes": "milligrams", + "millilitre": "milliliter", + "millilitres": "milliliters", + "millimetre": "millimeter", + "millimetres": "millimeters", + "miniaturisation": "miniaturization", + "miniaturise": "miniaturize", + "miniaturised": "miniaturized", + "miniaturises": "miniaturizes", + "miniaturising": "miniaturizing", + "minibusses": "minibuses", + "minimise": "minimize", + "minimised": "minimized", + "minimises": "minimizes", + "minimising": "minimizing", + "misbehaviour": "misbehavior", + "misdemeanour": "misdemeanor", + "misdemeanours": "misdemeanors", + "misspelt": "misspelled", + "mitre": "miter", + "mitres": "miters", + "mm": "hmm", + "mmm": "hmm", + "mobilisation": "mobilization", + "mobilise": "mobilize", + "mobilised": "mobilized", + "mobilises": "mobilizes", + "mobilising": "mobilizing", + "modelled": "modeled", + "modeller": "modeler", + "modellers": "modelers", + "modelling": "modeling", + "modernise": "modernize", + "modernised": "modernized", + "modernises": "modernizes", + "modernising": "modernizing", + "moisturise": "moisturize", + "moisturised": "moisturized", + "moisturiser": "moisturizer", + "moisturisers": "moisturizers", + "moisturises": "moisturizes", + "moisturising": "moisturizing", + "monologue": "monolog", + "monologues": "monologs", + "monopolisation": 
"monopolization", + "monopolise": "monopolize", + "monopolised": "monopolized", + "monopolises": "monopolizes", + "monopolising": "monopolizing", + "moralise": "moralize", + "moralised": "moralized", + "moralises": "moralizes", + "moralising": "moralizing", + "motorised": "motorized", + "mould": "mold", + "moulded": "molded", + "moulder": "molder", + "mouldered": "moldered", + "mouldering": "moldering", + "moulders": "molders", + "mouldier": "moldier", + "mouldiest": "moldiest", + "moulding": "molding", + "mouldings": "moldings", + "moulds": "molds", + "mouldy": "moldy", + "moult": "molt", + "moulted": "molted", + "moulting": "molting", + "moults": "molts", + "moustache": "mustache", + "moustached": "mustached", + "moustaches": "mustaches", + "moustachioed": "mustachioed", + "multicoloured": "multicolored", + "nationalisation": "nationalization", + "nationalisations": "nationalizations", + "nationalise": "nationalize", + "nationalised": "nationalized", + "nationalises": "nationalizes", + "nationalising": "nationalizing", + "naturalisation": "naturalization", + "naturalise": "naturalize", + "naturalised": "naturalized", + "naturalises": "naturalizes", + "naturalising": "naturalizing", + "neighbour": "neighbor", + "neighbourhood": "neighborhood", + "neighbourhoods": "neighborhoods", + "neighbouring": "neighboring", + "neighbourliness": "neighborliness", + "neighbourly": "neighborly", + "neighbours": "neighbors", + "neutralisation": "neutralization", + "neutralise": "neutralize", + "neutralised": "neutralized", + "neutralises": "neutralizes", + "neutralising": "neutralizing", + "normalisation": "normalization", + "normalise": "normalize", + "normalised": "normalized", + "normalises": "normalizes", + "normalising": "normalizing", + "odour": "odor", + "odourless": "odorless", + "odours": "odors", + "oesophagus": "esophagus", + "oesophaguses": "esophaguses", + "oestrogen": "estrogen", + "offence": "offense", + "offences": "offenses", + "omelette": "omelet", + 
"omelettes": "omelets", + "optimise": "optimize", + "optimised": "optimized", + "optimises": "optimizes", + "optimising": "optimizing", + "organisation": "organization", + "organisational": "organizational", + "organisations": "organizations", + "organise": "organize", + "organised": "organized", + "organiser": "organizer", + "organisers": "organizers", + "organises": "organizes", + "organising": "organizing", + "orthopaedic": "orthopedic", + "orthopaedics": "orthopedics", + "ostracise": "ostracize", + "ostracised": "ostracized", + "ostracises": "ostracizes", + "ostracising": "ostracizing", + "outmanoeuvre": "outmaneuver", + "outmanoeuvred": "outmaneuvered", + "outmanoeuvres": "outmaneuvers", + "outmanoeuvring": "outmaneuvering", + "overemphasise": "overemphasize", + "overemphasised": "overemphasized", + "overemphasises": "overemphasizes", + "overemphasising": "overemphasizing", + "oxidisation": "oxidization", + "oxidise": "oxidize", + "oxidised": "oxidized", + "oxidises": "oxidizes", + "oxidising": "oxidizing", + "paederast": "pederast", + "paederasts": "pederasts", + "paediatric": "pediatric", + "paediatrician": "pediatrician", + "paediatricians": "pediatricians", + "paediatrics": "pediatrics", + "paedophile": "pedophile", + "paedophiles": "pedophiles", + "paedophilia": "pedophilia", + "palaeolithic": "paleolithic", + "palaeontologist": "paleontologist", + "palaeontologists": "paleontologists", + "palaeontology": "paleontology", + "panelled": "paneled", + "panelling": "paneling", + "panellist": "panelist", + "panellists": "panelists", + "paralyse": "paralyze", + "paralysed": "paralyzed", + "paralyses": "paralyzes", + "paralysing": "paralyzing", + "parcelled": "parceled", + "parcelling": "parceling", + "parlour": "parlor", + "parlours": "parlors", + "particularise": "particularize", + "particularised": "particularized", + "particularises": "particularizes", + "particularising": "particularizing", + "passivisation": "passivization", + "passivise": "passivize", + 
"passivised": "passivized", + "passivises": "passivizes", + "passivising": "passivizing", + "pasteurisation": "pasteurization", + "pasteurise": "pasteurize", + "pasteurised": "pasteurized", + "pasteurises": "pasteurizes", + "pasteurising": "pasteurizing", + "patronise": "patronize", + "patronised": "patronized", + "patronises": "patronizes", + "patronising": "patronizing", + "patronisingly": "patronizingly", + "pedalled": "pedaled", + "pedalling": "pedaling", + "pedestrianisation": "pedestrianization", + "pedestrianise": "pedestrianize", + "pedestrianised": "pedestrianized", + "pedestrianises": "pedestrianizes", + "pedestrianising": "pedestrianizing", + "penalise": "penalize", + "penalised": "penalized", + "penalises": "penalizes", + "penalising": "penalizing", + "pencilled": "penciled", + "pencilling": "penciling", + "personalise": "personalize", + "personalised": "personalized", + "personalises": "personalizes", + "personalising": "personalizing", + "pharmacopoeia": "pharmacopeia", + "pharmacopoeias": "pharmacopeias", + "philosophise": "philosophize", + "philosophised": "philosophized", + "philosophises": "philosophizes", + "philosophising": "philosophizing", + "philtre": "filter", + "philtres": "filters", + "phoney": "phony", + "plagiarise": "plagiarize", + "plagiarised": "plagiarized", + "plagiarises": "plagiarizes", + "plagiarising": "plagiarizing", + "plough": "plow", + "ploughed": "plowed", + "ploughing": "plowing", + "ploughman": "plowman", + "ploughmen": "plowmen", + "ploughs": "plows", + "ploughshare": "plowshare", + "ploughshares": "plowshares", + "polarisation": "polarization", + "polarise": "polarize", + "polarised": "polarized", + "polarises": "polarizes", + "polarising": "polarizing", + "politicisation": "politicization", + "politicise": "politicize", + "politicised": "politicized", + "politicises": "politicizes", + "politicising": "politicizing", + "popularisation": "popularization", + "popularise": "popularize", + "popularised": "popularized", + 
"popularises": "popularizes", + "popularising": "popularizing", + "pouffe": "pouf", + "pouffes": "poufs", + "practise": "practice", + "practised": "practiced", + "practises": "practices", + "practising": "practicing", + "praesidium": "presidium", + "praesidiums": "presidiums", + "pressurisation": "pressurization", + "pressurise": "pressurize", + "pressurised": "pressurized", + "pressurises": "pressurizes", + "pressurising": "pressurizing", + "pretence": "pretense", + "pretences": "pretenses", + "primaeval": "primeval", + "prioritisation": "prioritization", + "prioritise": "prioritize", + "prioritised": "prioritized", + "prioritises": "prioritizes", + "prioritising": "prioritizing", + "privatisation": "privatization", + "privatisations": "privatizations", + "privatise": "privatize", + "privatised": "privatized", + "privatises": "privatizes", + "privatising": "privatizing", + "professionalisation": "professionalization", + "professionalise": "professionalize", + "professionalised": "professionalized", + "professionalises": "professionalizes", + "professionalising": "professionalizing", + "programme": "program", + "programmes": "programs", + "prologue": "prolog", + "prologues": "prologs", + "propagandise": "propagandize", + "propagandised": "propagandized", + "propagandises": "propagandizes", + "propagandising": "propagandizing", + "proselytise": "proselytize", + "proselytised": "proselytized", + "proselytiser": "proselytizer", + "proselytisers": "proselytizers", + "proselytises": "proselytizes", + "proselytising": "proselytizing", + "psychoanalyse": "psychoanalyze", + "psychoanalysed": "psychoanalyzed", + "psychoanalyses": "psychoanalyzes", + "psychoanalysing": "psychoanalyzing", + "publicise": "publicize", + "publicised": "publicized", + "publicises": "publicizes", + "publicising": "publicizing", + "pulverisation": "pulverization", + "pulverise": "pulverize", + "pulverised": "pulverized", + "pulverises": "pulverizes", + "pulverising": "pulverizing", + "pummelled": 
"pummel", + "pummelling": "pummeled", + "pyjama": "pajama", + "pyjamas": "pajamas", + "pzazz": "pizzazz", + "quarrelled": "quarreled", + "quarrelling": "quarreling", + "radicalise": "radicalize", + "radicalised": "radicalized", + "radicalises": "radicalizes", + "radicalising": "radicalizing", + "rancour": "rancor", + "randomise": "randomize", + "randomised": "randomized", + "randomises": "randomizes", + "randomising": "randomizing", + "rationalisation": "rationalization", + "rationalisations": "rationalizations", + "rationalise": "rationalize", + "rationalised": "rationalized", + "rationalises": "rationalizes", + "rationalising": "rationalizing", + "ravelled": "raveled", + "ravelling": "raveling", + "realisable": "realizable", + "realisation": "realization", + "realisations": "realizations", + "realise": "realize", + "realised": "realized", + "realises": "realizes", + "realising": "realizing", + "recognisable": "recognizable", + "recognisably": "recognizably", + "recognisance": "recognizance", + "recognise": "recognize", + "recognised": "recognized", + "recognises": "recognizes", + "recognising": "recognizing", + "reconnoitre": "reconnoiter", + "reconnoitred": "reconnoitered", + "reconnoitres": "reconnoiters", + "reconnoitring": "reconnoitering", + "refuelled": "refueled", + "refuelling": "refueling", + "regularisation": "regularization", + "regularise": "regularize", + "regularised": "regularized", + "regularises": "regularizes", + "regularising": "regularizing", + "remodelled": "remodeled", + "remodelling": "remodeling", + "remould": "remold", + "remoulded": "remolded", + "remoulding": "remolding", + "remoulds": "remolds", + "reorganisation": "reorganization", + "reorganisations": "reorganizations", + "reorganise": "reorganize", + "reorganised": "reorganized", + "reorganises": "reorganizes", + "reorganising": "reorganizing", + "revelled": "reveled", + "reveller": "reveler", + "revellers": "revelers", + "revelling": "reveling", + "revitalise": "revitalize", + 
"revitalised": "revitalized", + "revitalises": "revitalizes", + "revitalising": "revitalizing", + "revolutionise": "revolutionize", + "revolutionised": "revolutionized", + "revolutionises": "revolutionizes", + "revolutionising": "revolutionizing", + "rhapsodise": "rhapsodize", + "rhapsodised": "rhapsodized", + "rhapsodises": "rhapsodizes", + "rhapsodising": "rhapsodizing", + "rigour": "rigor", + "rigours": "rigors", + "ritualised": "ritualized", + "rivalled": "rivaled", + "rivalling": "rivaling", + "romanticise": "romanticize", + "romanticised": "romanticized", + "romanticises": "romanticizes", + "romanticising": "romanticizing", + "rumour": "rumor", + "rumoured": "rumored", + "rumours": "rumors", + "sabre": "saber", + "sabres": "sabers", + "saltpetre": "saltpeter", + "sanitise": "sanitize", + "sanitised": "sanitized", + "sanitises": "sanitizes", + "sanitising": "sanitizing", + "satirise": "satirize", + "satirised": "satirized", + "satirises": "satirizes", + "satirising": "satirizing", + "saviour": "savior", + "saviours": "saviors", + "savour": "savor", + "savoured": "savored", + "savouries": "savories", + "savouring": "savoring", + "savours": "savors", + "savoury": "savory", + "scandalise": "scandalize", + "scandalised": "scandalized", + "scandalises": "scandalizes", + "scandalising": "scandalizing", + "sceptic": "skeptic", + "sceptical": "skeptical", + "sceptically": "skeptically", + "scepticism": "skepticism", + "sceptics": "skeptics", + "sceptre": "scepter", + "sceptres": "scepters", + "scrutinise": "scrutinize", + "scrutinised": "scrutinized", + "scrutinises": "scrutinizes", + "scrutinising": "scrutinizing", + "secularisation": "secularization", + "secularise": "secularize", + "secularised": "secularized", + "secularises": "secularizes", + "secularising": "secularizing", + "sensationalise": "sensationalize", + "sensationalised": "sensationalized", + "sensationalises": "sensationalizes", + "sensationalising": "sensationalizing", + "sensitise": "sensitize", + 
"sensitised": "sensitized", + "sensitises": "sensitizes", + "sensitising": "sensitizing", + "sentimentalise": "sentimentalize", + "sentimentalised": "sentimentalized", + "sentimentalises": "sentimentalizes", + "sentimentalising": "sentimentalizing", + "sepulchre": "sepulcher", + "sepulchres": "sepulchers", + "serialisation": "serialization", + "serialisations": "serializations", + "serialise": "serialize", + "serialised": "serialized", + "serialises": "serializes", + "serialising": "serializing", + "sermonise": "sermonize", + "sermonised": "sermonized", + "sermonises": "sermonizes", + "sermonising": "sermonizing", + "sheikh": "sheik", + "shovelled": "shoveled", + "shovelling": "shoveling", + "shrivelled": "shriveled", + "shrivelling": "shriveling", + "signalise": "signalize", + "signalised": "signalized", + "signalises": "signalizes", + "signalising": "signalizing", + "signalled": "signaled", + "signalling": "signaling", + "smoulder": "smolder", + "smouldered": "smoldered", + "smouldering": "smoldering", + "smoulders": "smolders", + "snivelled": "sniveled", + "snivelling": "sniveling", + "snorkelled": "snorkeled", + "snorkelling": "snorkeling", + "snowplough": "snowplow", + "snowploughs": "snowplow", + "socialisation": "socialization", + "socialise": "socialize", + "socialised": "socialized", + "socialises": "socializes", + "socialising": "socializing", + "sodomise": "sodomize", + "sodomised": "sodomized", + "sodomises": "sodomizes", + "sodomising": "sodomizing", + "solemnise": "solemnize", + "solemnised": "solemnized", + "solemnises": "solemnizes", + "solemnising": "solemnizing", + "sombre": "somber", + "specialisation": "specialization", + "specialisations": "specializations", + "specialise": "specialize", + "specialised": "specialized", + "specialises": "specializes", + "specialising": "specializing", + "spectre": "specter", + "spectres": "specters", + "spiralled": "spiraled", + "spiralling": "spiraling", + "splendour": "splendor", + "splendours": "splendors", + 
"squirrelled": "squirreled", + "squirrelling": "squirreling", + "stabilisation": "stabilization", + "stabilise": "stabilize", + "stabilised": "stabilized", + "stabiliser": "stabilizer", + "stabilisers": "stabilizers", + "stabilises": "stabilizes", + "stabilising": "stabilizing", + "standardisation": "standardization", + "standardise": "standardize", + "standardised": "standardized", + "standardises": "standardizes", + "standardising": "standardizing", + "stencilled": "stenciled", + "stencilling": "stenciling", + "sterilisation": "sterilization", + "sterilisations": "sterilizations", + "sterilise": "sterilize", + "sterilised": "sterilized", + "steriliser": "sterilizer", + "sterilisers": "sterilizers", + "sterilises": "sterilizes", + "sterilising": "sterilizing", + "stigmatisation": "stigmatization", + "stigmatise": "stigmatize", + "stigmatised": "stigmatized", + "stigmatises": "stigmatizes", + "stigmatising": "stigmatizing", + "storey": "story", + "storeys": "stories", + "subsidisation": "subsidization", + "subsidise": "subsidize", + "subsidised": "subsidized", + "subsidiser": "subsidizer", + "subsidisers": "subsidizers", + "subsidises": "subsidizes", + "subsidising": "subsidizing", + "succour": "succor", + "succoured": "succored", + "succouring": "succoring", + "succours": "succors", + "sulphate": "sulfate", + "sulphates": "sulfates", + "sulphide": "sulfide", + "sulphides": "sulfides", + "sulphur": "sulfur", + "sulphurous": "sulfurous", + "summarise": "summarize", + "summarised": "summarized", + "summarises": "summarizes", + "summarising": "summarizing", + "swivelled": "swiveled", + "swivelling": "swiveling", + "symbolise": "symbolize", + "symbolised": "symbolized", + "symbolises": "symbolizes", + "symbolising": "symbolizing", + "sympathise": "sympathize", + "sympathised": "sympathized", + "sympathiser": "sympathizer", + "sympathisers": "sympathizers", + "sympathises": "sympathizes", + "sympathising": "sympathizing", + "synchronisation": "synchronization", + 
"synchronise": "synchronize", + "synchronised": "synchronized", + "synchronises": "synchronizes", + "synchronising": "synchronizing", + "synthesise": "synthesize", + "synthesised": "synthesized", + "synthesiser": "synthesizer", + "synthesisers": "synthesizers", + "synthesises": "synthesizes", + "synthesising": "synthesizing", + "syphon": "siphon", + "syphoned": "siphoned", + "syphoning": "siphoning", + "syphons": "siphons", + "systematisation": "systematization", + "systematise": "systematize", + "systematised": "systematized", + "systematises": "systematizes", + "systematising": "systematizing", + "tantalise": "tantalize", + "tantalised": "tantalized", + "tantalises": "tantalizes", + "tantalising": "tantalizing", + "tantalisingly": "tantalizingly", + "tasselled": "tasseled", + "technicolour": "technicolor", + "temporise": "temporize", + "temporised": "temporized", + "temporises": "temporizes", + "temporising": "temporizing", + "tenderise": "tenderize", + "tenderised": "tenderized", + "tenderises": "tenderizes", + "tenderising": "tenderizing", + "terrorise": "terrorize", + "terrorised": "terrorized", + "terrorises": "terrorizes", + "terrorising": "terrorizing", + "theatre": "theater", + "theatregoer": "theatergoer", + "theatregoers": "theatergoers", + "theatres": "theaters", + "theorise": "theorize", + "theorised": "theorized", + "theorises": "theorizes", + "theorising": "theorizing", + "tonne": "ton", + "tonnes": "tons", + "towelled": "toweled", + "towelling": "toweling", + "toxaemia": "toxemia", + "tranquillise": "tranquilize", + "tranquillised": "tranquilized", + "tranquilliser": "tranquilizer", + "tranquillisers": "tranquilizers", + "tranquillises": "tranquilizes", + "tranquillising": "tranquilizing", + "tranquillity": "tranquility", + "tranquillize": "tranquilize", + "tranquillized": "tranquilized", + "tranquillizer": "tranquilizer", + "tranquillizers": "tranquilizers", + "tranquillizes": "tranquilizes", + "tranquillizing": "tranquilizing", + "tranquilly": 
"tranquility", + "transistorised": "transistorized", + "traumatise": "traumatize", + "traumatised": "traumatized", + "traumatises": "traumatizes", + "traumatising": "traumatizing", + "travelled": "traveled", + "traveller": "traveler", + "travellers": "travelers", + "travelling": "traveling", + "travelog": "travelogue", + "travelogs": "travelogues", + "trialled": "trialed", + "trialling": "trialing", + "tricolour": "tricolor", + "tricolours": "tricolors", + "trivialise": "trivialize", + "trivialised": "trivialized", + "trivialises": "trivializes", + "trivialising": "trivializing", + "tumour": "tumor", + "tumours": "tumors", + "tunnelled": "tunneled", + "tunnelling": "tunneling", + "tyrannise": "tyrannize", + "tyrannised": "tyrannized", + "tyrannises": "tyrannizes", + "tyrannising": "tyrannizing", + "tyre": "tire", + "tyres": "tires", + "unauthorised": "unauthorized", + "uncivilised": "uncivilized", + "underutilised": "underutilized", + "unequalled": "unequaled", + "unfavourable": "unfavorable", + "unfavourably": "unfavorably", + "unionisation": "unionization", + "unionise": "unionize", + "unionised": "unionized", + "unionises": "unionizes", + "unionising": "unionizing", + "unorganised": "unorganized", + "unravelled": "unraveled", + "unravelling": "unraveling", + "unrecognisable": "unrecognizable", + "unrecognised": "unrecognized", + "unrivalled": "unrivaled", + "unsavoury": "unsavory", + "untrammelled": "untrammeled", + "urbanisation": "urbanization", + "urbanise": "urbanize", + "urbanised": "urbanized", + "urbanises": "urbanizes", + "urbanising": "urbanizing", + "utilisable": "utilizable", + "utilisation": "utilization", + "utilise": "utilize", + "utilised": "utilized", + "utilises": "utilizes", + "utilising": "utilizing", + "valour": "valor", + "vandalise": "vandalize", + "vandalised": "vandalized", + "vandalises": "vandalizes", + "vandalising": "vandalizing", + "vaporisation": "vaporization", + "vaporise": "vaporize", + "vaporised": "vaporized", + "vaporises": 
"vaporizes", + "vaporising": "vaporizing", + "vapour": "vapor", + "vapours": "vapors", + "verbalise": "verbalize", + "verbalised": "verbalized", + "verbalises": "verbalizes", + "verbalising": "verbalizing", + "victimisation": "victimization", + "victimise": "victimize", + "victimised": "victimized", + "victimises": "victimizes", + "victimising": "victimizing", + "videodisc": "videodisk", + "videodiscs": "videodisks", + "vigour": "vigor", + "visualisation": "visualization", + "visualisations": "visualizations", + "visualise": "visualize", + "visualised": "visualized", + "visualises": "visualizes", + "visualising": "visualizing", + "vocalisation": "vocalization", + "vocalisations": "vocalizations", + "vocalise": "vocalize", + "vocalised": "vocalized", + "vocalises": "vocalizes", + "vocalising": "vocalizing", + "vulcanised": "vulcanized", + "vulgarisation": "vulgarization", + "vulgarise": "vulgarize", + "vulgarised": "vulgarized", + "vulgarises": "vulgarizes", + "vulgarising": "vulgarizing", + "waggon": "wagon", + "waggons": "wagons", + "watercolour": "watercolor", + "watercolours": "watercolors", + "weaselled": "weaseled", + "weaselling": "weaseling", + "westernisation": "westernization", + "westernise": "westernize", + "westernised": "westernized", + "westernises": "westernizes", + "westernising": "westernizing", + "womanise": "womanize", + "womanised": "womanized", + "womaniser": "womanizer", + "womanisers": "womanizers", + "womanises": "womanizes", + "womanising": "womanizing", + "woollen": "woolen", + "woollens": "woolens", + "woollies": "woolies", + "woolly": "wooly", + "worshipped": "worshiped", + "worshipper": "worshiper", + "worshipping": "worshiping", + "yodelled": "yodeled", + "yodelling": "yodeling", + "yoghourt": "yogurt", + "yoghourts": "yogurts", + "yoghurt": "yogurt", + "yoghurts": "yogurts" +} + diff --git a/users/zeineldeen/experiments/canary_aed/nemo/normalizer/eval_utils.py b/users/zeineldeen/experiments/canary_aed/nemo/normalizer/eval_utils.py new 
# --- patch scaffolding carried over from the surrounding format-patch (kept as comments) ---
# file mode 100644
# index 000000000..4e9276b50
# --- /dev/null
# +++ b/users/zeineldeen/experiments/canary_aed/nemo/normalizer/eval_utils.py
# @@ -0,0 +1,157 @@
"""Helpers for writing, reading and scoring ASR result manifests (jsonl)."""

import glob
import json
import os
from collections import defaultdict
from typing import Optional

_MANIFEST_SUFFIX = ".jsonl"


def read_manifest(manifest_path: str) -> list:
    """
    Reads a manifest file (jsonl format) and returns a list of dictionaries containing samples.

    Blank (whitespace-only) lines are skipped instead of being passed to the JSON parser.

    Args:
        manifest_path: Path to the jsonl manifest.

    Returns:
        One decoded object per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    data = []
    with open(manifest_path, "r", encoding="utf-8") as f:
        for line in f:
            # Lines read from a file keep their "\n", so `len(line) > 0` is always true;
            # test the stripped content to really skip empty lines.
            if line.strip():
                data.append(json.loads(line))
    return data


def write_manifest(
    manifest_path,
    references: list,
    transcriptions: list,
    model_id: str,
    dataset_path: str,
    dataset_name: str,
    split: str,
):
    """
    Writes a manifest file (jsonl format) and returns the path to the file.

    The parent directory of `manifest_path` is created if it does not exist yet.

    Args:
        manifest_path: Destination path of the jsonl manifest.
        references: Ground truth reference texts.
        transcriptions: Model predicted transcriptions.
        model_id: String identifier for the model. Kept for interface compatibility;
            the output location is fully determined by `manifest_path`.
        dataset_path: Path to the dataset. Kept for interface compatibility, unused.
        dataset_name: Name of the dataset. Kept for interface compatibility, unused.
        split: Dataset split name. Kept for interface compatibility, unused.

    Returns:
        Path to the manifest file.

    Raises:
        ValueError: If `references` and `transcriptions` differ in length.
    """
    if len(references) != len(transcriptions):
        raise ValueError(
            f"The number of samples in `ground_truths` ({len(references)}) "
            f"must match `transcriptions` ({len(transcriptions)})."
        )

    # Create the directory we actually write into (abspath also covers bare file names,
    # whose dirname would otherwise be the empty string).
    parent_dir = os.path.dirname(os.path.abspath(manifest_path))
    os.makedirs(parent_dir, exist_ok=True)

    with open(manifest_path, "w", encoding="utf-8") as f:
        for idx, (text, transcript) in enumerate(zip(references, transcriptions)):
            datum = {
                "audio_filepath": f"sample_{idx}",  # dummy value for Speech Data Processor
                "duration": 0.0,  # dummy value for Speech Data Processor
                "text": text,
                "pred_text": transcript,
            }
            f.write(f"{json.dumps(datum, ensure_ascii=False)}\n")
    return manifest_path


def _strip_trailing_separators(directory: str) -> str:
    """Return `directory` without trailing path separators (a bare root such as "/" is kept)."""
    separators = os.sep + (os.altsep or "")
    stripped = directory.rstrip(separators)
    return stripped or directory


def _remove_manifest_suffix(name: str) -> str:
    """Return `name` without a trailing ".jsonl" (suffix removal, not character stripping)."""
    if name.endswith(_MANIFEST_SUFFIX):
        return name[: -len(_MANIFEST_SUFFIX)]
    return name


def _parse_result_filepath(fp: str):
    """
    Extract `(model_id, dataset_id)` from a `...MODEL_<id>_DATASET_<id>.jsonl` path.

    The first "-" of the model id is turned back into "/" (author/model). Paths that do not
    follow the naming scheme map to `("unknown", <file name without .jsonl>)` so that every
    file still gets a distinct result key.
    """
    model_index = fp.find("MODEL_")
    ds_index = fp.find("DATASET_", model_index + 1) if model_index != -1 else -1
    if model_index == -1 or ds_index == -1:
        return "unknown", _remove_manifest_suffix(os.path.basename(fp))

    model_id = fp[model_index:ds_index].replace("MODEL_", "").rstrip("_")
    author_index = model_id.find("-")
    if author_index != -1:  # only restore the author separator when there is one
        model_id = model_id[:author_index] + "/" + model_id[author_index + 1 :]

    dataset_id = _remove_manifest_suffix(fp[ds_index:].replace("DATASET_", ""))
    return model_id, dataset_id


def score_results(directory: str, model_id: Optional[str] = None):
    """
    Scores all result files in a directory and returns a composite score over all evaluated datasets.

    Args:
        directory: Path to the result directory, containing one or more jsonl files.
        model_id: Optional, model name to filter out result files based on model name.

    Returns:
        A tuple `(composite_wer, results)`: `composite_wer` maps each model id to its WER
        averaged over that model's datasets (in percent); `results` maps
        `"<model id> | <dataset id>"` to the WER of that file (in percent, 2 decimals).

    Raises:
        ValueError: If no result file is found (after optional filtering).
    """
    # Strip trailing slash
    directory = _strip_trailing_separators(directory)

    # Find all result files in the directory
    result_files = sorted(glob.glob(f"{directory}/**/*.jsonl", recursive=True))

    # Filter files belonging to a specific model id
    if model_id is not None and model_id != "":
        print("Filtering models by id:", model_id)
        model_id = model_id.replace("/", "-")
        result_files = [fp for fp in result_files if model_id in fp]

    # Check if any result files were found
    if len(result_files) == 0:
        raise ValueError(f"No result files found in {directory}")

    # Imported lazily: third-party dependency that is only needed for scoring, so the
    # manifest helpers above stay usable without it.
    import evaluate

    # Compute results per dataset
    results = {}
    wer_metric = evaluate.load("wer")

    for result_file in result_files:
        manifest = read_manifest(result_file)
        model_id_of_file, dataset_id = _parse_result_filepath(result_file)

        references = [datum["text"] for datum in manifest]
        predictions = [datum["pred_text"] for datum in manifest]

        wer = wer_metric.compute(references=references, predictions=predictions)
        wer = round(100 * wer, 2)

        result_key = f"{model_id_of_file} | {dataset_id}"
        results[result_key] = wer

    print("*" * 80)
    print("Results per dataset:")
    print("*" * 80)

    for k, v in results.items():
        print(f"{k}: WER = {v:0.2f} %")

    # composite WER should be computed over all datasets and with the same key
    wer_sums = defaultdict(float)
    count_entries = defaultdict(int)
    for k, v in results.items():
        key = k.split("|")[0].strip()
        wer_sums[key] += v
        count_entries[key] += 1

    # normalize scores & print; the returned mapping holds the same averages that are printed
    composite_wer = defaultdict(float)
    print()
    print("*" * 80)
    print("Composite WER:")
    print("*" * 80)
    for k, total in wer_sums.items():
        composite_wer[k] = total / count_entries[k]
        print(f"{k}: WER = {composite_wer[k]:0.2f} %")
    print("*" * 80)
    return composite_wer, results


# --- patch scaffolding carried over from the surrounding format-patch (kept as comments) ---
# diff --git a/users/zeineldeen/experiments/canary_aed/nemo/normalizer/normalizer.py b/users/zeineldeen/experiments/canary_aed/nemo/normalizer/normalizer.py
# new file mode 100644
# index 000000000..6fc418b93
# --- /dev/null
# +++ b/users/zeineldeen/experiments/canary_aed/nemo/normalizer/normalizer.py
# @@ -0,0 +1,596 @@
# Copyright 2022 The OpenAI team and The HuggingFace Team. All rights reserved.
# Most of the code is copy pasted from the original whisper repository
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
+ +import re +import unicodedata +from fractions import Fraction +from typing import Iterator, List, Match, Optional, Union +from .english_abbreviations import english_spelling_normalizer + +import regex + + +# non-ASCII letters that are not separated by "NFKD" normalization +ADDITIONAL_DIACRITICS = { + "œ": "oe", + "Œ": "OE", + "ø": "o", + "Ø": "O", + "æ": "ae", + "Æ": "AE", + "ß": "ss", + "ẞ": "SS", + "đ": "d", + "Đ": "D", + "ð": "d", + "Ð": "D", + "þ": "th", + "Þ": "th", + "ł": "l", + "Ł": "L", +} + + +def remove_symbols_and_diacritics(s: str, keep=""): + """ + Replace any other markers, symbols, and punctuations with a space, and drop any diacritics (category 'Mn' and some + manual mappings) + """ + + def replace_character(char): + if char in keep: + return char + elif char in ADDITIONAL_DIACRITICS: + return ADDITIONAL_DIACRITICS[char] + + elif unicodedata.category(char) == "Mn": + return "" + + elif unicodedata.category(char)[0] in "MSP": + return " " + + return char + + return "".join(replace_character(c) for c in unicodedata.normalize("NFKD", s)) + + +def remove_symbols(s: str): + """ + Replace any other markers, symbols, punctuations with a space, keeping diacritics + """ + return "".join(" " if unicodedata.category(c)[0] in "MSP" else c for c in unicodedata.normalize("NFKC", s)) + + +class BasicTextNormalizer: + def __init__(self, remove_diacritics: bool = False, split_letters: bool = False): + self.clean = remove_symbols_and_diacritics if remove_diacritics else remove_symbols + self.split_letters = split_letters + + def __call__(self, s: str): + s = s.lower() + s = re.sub(r"[<\[][^>\]]*[>\]]", "", s) # remove words between brackets + s = re.sub(r"\(([^)]+?)\)", "", s) # remove words between parenthesis + s = self.clean(s).lower() + + if self.split_letters: + s = " ".join(regex.findall(r"\X", s, regex.U)) + + s = re.sub(r"\s+", " ", s) # replace any successive whitespace characters with a space + + return s + + +class EnglishNumberNormalizer: + """ + 
Convert any spelled-out numbers into arabic numbers, while handling: + + - remove any commas + - keep the suffixes such as: `1960s`, `274th`, `32nd`, etc. + - spell out currency symbols after the number. e.g. `$20 million` -> `20000000 dollars` + - spell out `one` and `ones` + - interpret successive single-digit numbers as nominal: `one oh one` -> `101` + """ + + def __init__(self): + super().__init__() + + self.zeros = {"o", "oh", "zero"} + # fmt: off + self.ones = { + name: i + for i, name in enumerate( + ["one", "two", "three", "four", "five", "six", "seven", "eight", "nine", "ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen", "sixteen", "seventeen", "eighteen", "nineteen"], + start=1, + ) + } + # fmt: on + self.ones_plural = { + "sixes" if name == "six" else name + "s": (value, "s") for name, value in self.ones.items() + } + self.ones_ordinal = { + "zeroth": (0, "th"), + "first": (1, "st"), + "second": (2, "nd"), + "third": (3, "rd"), + "fifth": (5, "th"), + "twelfth": (12, "th"), + **{ + name + ("h" if name.endswith("t") else "th"): (value, "th") + for name, value in self.ones.items() + if value > 3 and value != 5 and value != 12 + }, + } + self.ones_suffixed = {**self.ones_plural, **self.ones_ordinal} + + self.tens = { + "twenty": 20, + "thirty": 30, + "forty": 40, + "fifty": 50, + "sixty": 60, + "seventy": 70, + "eighty": 80, + "ninety": 90, + } + self.tens_plural = {name.replace("y", "ies"): (value, "s") for name, value in self.tens.items()} + self.tens_ordinal = {name.replace("y", "ieth"): (value, "th") for name, value in self.tens.items()} + self.tens_suffixed = {**self.tens_plural, **self.tens_ordinal} + + self.multipliers = { + "hundred": 100, + "thousand": 1_000, + "million": 1_000_000, + "billion": 1_000_000_000, + "trillion": 1_000_000_000_000, + "quadrillion": 1_000_000_000_000_000, + "quintillion": 1_000_000_000_000_000_000, + "sextillion": 1_000_000_000_000_000_000_000, + "septillion": 1_000_000_000_000_000_000_000_000, + "octillion": 
1_000_000_000_000_000_000_000_000_000, + "nonillion": 1_000_000_000_000_000_000_000_000_000_000, + "decillion": 1_000_000_000_000_000_000_000_000_000_000_000, + } + self.multipliers_plural = {name + "s": (value, "s") for name, value in self.multipliers.items()} + self.multipliers_ordinal = {name + "th": (value, "th") for name, value in self.multipliers.items()} + self.multipliers_suffixed = {**self.multipliers_plural, **self.multipliers_ordinal} + self.decimals = {*self.ones, *self.tens, *self.zeros} + + self.preceding_prefixers = { + "minus": "-", + "negative": "-", + "plus": "+", + "positive": "+", + } + self.following_prefixers = { + "pound": "£", + "pounds": "£", + "euro": "€", + "euros": "€", + "dollar": "$", + "dollars": "$", + "cent": "¢", + "cents": "¢", + } + self.prefixes = set(list(self.preceding_prefixers.values()) + list(self.following_prefixers.values())) + self.suffixers = { + "per": {"cent": "%"}, + "percent": "%", + } + self.specials = {"and", "double", "triple", "point"} + + self.words = { + key + for mapping in [ + self.zeros, + self.ones, + self.ones_suffixed, + self.tens, + self.tens_suffixed, + self.multipliers, + self.multipliers_suffixed, + self.preceding_prefixers, + self.following_prefixers, + self.suffixers, + self.specials, + ] + for key in mapping + } + self.literal_words = {"one", "ones"} + + def process_words(self, words: List[str]) -> Iterator[str]: + prefix: Optional[str] = None + value: Optional[Union[str, int]] = None + skip = False + + def to_fraction(s: str): + try: + return Fraction(s) + except ValueError: + return None + + def output(result: Union[str, int]): + nonlocal prefix, value + result = str(result) + if prefix is not None: + result = prefix + result + value = None + prefix = None + return result + + if len(words) == 0: + return + + for i, current in enumerate(words): + prev = words[i - 1] if i != 0 else None + next = words[i + 1] if i != len(words) - 1 else None + if skip: + skip = False + continue + + next_is_numeric 
= next is not None and re.match(r"^\d+(\.\d+)?$", next) + has_prefix = current[0] in self.prefixes + current_without_prefix = current[1:] if has_prefix else current + if re.match(r"^\d+(\.\d+)?$", current_without_prefix): + # arabic numbers (potentially with signs and fractions) + f = to_fraction(current_without_prefix) + if f is None: + raise ValueError("Converting the fraction failed") + + if value is not None: + if isinstance(value, str) and value.endswith("."): + # concatenate decimals / ip address components + value = str(value) + str(current) + continue + else: + yield output(value) + + prefix = current[0] if has_prefix else prefix + if f.denominator == 1: + value = f.numerator # store integers as int + else: + value = current_without_prefix + elif current not in self.words: + # non-numeric words + if value is not None: + yield output(value) + yield output(current) + elif current in self.zeros: + value = str(value or "") + "0" + elif current in self.ones: + ones = self.ones[current] + + if value is None: + value = ones + elif isinstance(value, str) or prev in self.ones: + if prev in self.tens and ones < 10: # replace the last zero with the digit + value = value[:-1] + str(ones) + else: + value = str(value) + str(ones) + elif ones < 10: + if value % 10 == 0: + value += ones + else: + value = str(value) + str(ones) + else: # eleven to nineteen + if value % 100 == 0: + value += ones + else: + value = str(value) + str(ones) + elif current in self.ones_suffixed: + # ordinal or cardinal; yield the number right away + ones, suffix = self.ones_suffixed[current] + if value is None: + yield output(str(ones) + suffix) + elif isinstance(value, str) or prev in self.ones: + if prev in self.tens and ones < 10: + yield output(value[:-1] + str(ones) + suffix) + else: + yield output(str(value) + str(ones) + suffix) + elif ones < 10: + if value % 10 == 0: + yield output(str(value + ones) + suffix) + else: + yield output(str(value) + str(ones) + suffix) + else: # eleven to 
nineteen + if value % 100 == 0: + yield output(str(value + ones) + suffix) + else: + yield output(str(value) + str(ones) + suffix) + value = None + elif current in self.tens: + tens = self.tens[current] + if value is None: + value = tens + elif isinstance(value, str): + value = str(value) + str(tens) + else: + if value % 100 == 0: + value += tens + else: + value = str(value) + str(tens) + elif current in self.tens_suffixed: + # ordinal or cardinal; yield the number right away + tens, suffix = self.tens_suffixed[current] + if value is None: + yield output(str(tens) + suffix) + elif isinstance(value, str): + yield output(str(value) + str(tens) + suffix) + else: + if value % 100 == 0: + yield output(str(value + tens) + suffix) + else: + yield output(str(value) + str(tens) + suffix) + elif current in self.multipliers: + multiplier = self.multipliers[current] + if value is None: + value = multiplier + elif isinstance(value, str) or value == 0: + f = to_fraction(value) + p = f * multiplier if f is not None else None + if f is not None and p.denominator == 1: + value = p.numerator + else: + yield output(value) + value = multiplier + else: + before = value // 1000 * 1000 + residual = value % 1000 + value = before + residual * multiplier + elif current in self.multipliers_suffixed: + multiplier, suffix = self.multipliers_suffixed[current] + if value is None: + yield output(str(multiplier) + suffix) + elif isinstance(value, str): + f = to_fraction(value) + p = f * multiplier if f is not None else None + if f is not None and p.denominator == 1: + yield output(str(p.numerator) + suffix) + else: + yield output(value) + yield output(str(multiplier) + suffix) + else: # int + before = value // 1000 * 1000 + residual = value % 1000 + value = before + residual * multiplier + yield output(str(value) + suffix) + value = None + elif current in self.preceding_prefixers: + # apply prefix (positive, minus, etc.) 
if it precedes a number + if value is not None: + yield output(value) + + if next in self.words or next_is_numeric: + prefix = self.preceding_prefixers[current] + else: + yield output(current) + elif current in self.following_prefixers: + # apply prefix (dollars, cents, etc.) only after a number + if value is not None: + prefix = self.following_prefixers[current] + yield output(value) + else: + yield output(current) + elif current in self.suffixers: + # apply suffix symbols (percent -> '%') + if value is not None: + suffix = self.suffixers[current] + if isinstance(suffix, dict): + if next in suffix: + yield output(str(value) + suffix[next]) + skip = True + else: + yield output(value) + yield output(current) + else: + yield output(str(value) + suffix) + else: + yield output(current) + elif current in self.specials: + if next not in self.words and not next_is_numeric: + # apply special handling only if the next word can be numeric + if value is not None: + yield output(value) + yield output(current) + elif current == "and": + # ignore "and" after hundreds, thousands, etc. + if prev not in self.multipliers: + if value is not None: + yield output(value) + yield output(current) + elif current == "double" or current == "triple": + if next in self.ones or next in self.zeros: + repeats = 2 if current == "double" else 3 + ones = self.ones.get(next, 0) + value = str(value or "") + str(ones) * repeats + skip = True + else: + if value is not None: + yield output(value) + yield output(current) + elif current == "point": + if next in self.decimals or next_is_numeric: + value = str(value or "") + "." 
+ else: + # should all have been covered at this point + raise ValueError(f"Unexpected token: {current}") + else: + # all should have been covered at this point + raise ValueError(f"Unexpected token: {current}") + + if value is not None: + yield output(value) + + def preprocess(self, s: str): + # replace " and a half" with " point five" + results = [] + + segments = re.split(r"\band\s+a\s+half\b", s) + for i, segment in enumerate(segments): + if len(segment.strip()) == 0: + continue + if i == len(segments) - 1: + results.append(segment) + else: + results.append(segment) + last_word = segment.rsplit(maxsplit=2)[-1] + if last_word in self.decimals or last_word in self.multipliers: + results.append("point five") + else: + results.append("and a half") + + s = " ".join(results) + + # put a space at number/letter boundary + s = re.sub(r"([a-z])([0-9])", r"\1 \2", s) + s = re.sub(r"([0-9])([a-z])", r"\1 \2", s) + + # but remove spaces which could be a suffix + s = re.sub(r"([0-9])\s+(st|nd|rd|th|s)\b", r"\1\2", s) + + return s + + def postprocess(self, s: str): + def combine_cents(m: Match): + try: + currency = m.group(1) + integer = m.group(2) + cents = int(m.group(3)) + return f"{currency}{integer}.{cents:02d}" + except ValueError: + return m.string + + def extract_cents(m: Match): + try: + return f"¢{int(m.group(1))}" + except ValueError: + return m.string + + # apply currency postprocessing; "$2 and ¢7" -> "$2.07" + s = re.sub(r"([€£$])([0-9]+) (?:and )?¢([0-9]{1,2})\b", combine_cents, s) + s = re.sub(r"[€£$]0.([0-9]{1,2})\b", extract_cents, s) + + # write "one(s)" instead of "1(s)", just for the readability + s = re.sub(r"\b1(s?)\b", r"one\1", s) + + return s + + def __call__(self, s: str): + s = self.preprocess(s) + s = " ".join(word for word in self.process_words(s.split()) if word is not None) + s = self.postprocess(s) + + return s + + +class EnglishSpellingNormalizer: + """ + Applies British-American spelling mappings as listed in [1]. 
+ + [1] https://www.tysto.com/uk-us-spelling-list.html + """ + + def __init__(self, english_spelling_mapping): + self.mapping = english_spelling_mapping + + def __call__(self, s: str): + return " ".join(self.mapping.get(word, word) for word in s.split()) + + +class EnglishTextNormalizer: + def __init__(self, english_spelling_mapping=english_spelling_normalizer): + self.ignore_patterns = r"\b(hmm|mm|mhm|mmm|uh|um)\b" + self.replacers = { + # common contractions + r"\bwon't\b": "will not", + r"\bcan't\b": "can not", + r"\blet's\b": "let us", + r"\bain't\b": "aint", + r"\by'all\b": "you all", + r"\bwanna\b": "want to", + r"\bgotta\b": "got to", + r"\bgonna\b": "going to", + r"\bi'ma\b": "i am going to", + r"\bimma\b": "i am going to", + r"\bwoulda\b": "would have", + r"\bcoulda\b": "could have", + r"\bshoulda\b": "should have", + r"\bma'am\b": "madam", + # contractions in titles/prefixes + r"\bmr\b": "mister ", + r"\bmrs\b": "missus ", + r"\bst\b": "saint ", + r"\bdr\b": "doctor ", + r"\bprof\b": "professor ", + r"\bcapt\b": "captain ", + r"\bgov\b": "governor ", + r"\bald\b": "alderman ", + r"\bgen\b": "general ", + r"\bsen\b": "senator ", + r"\brep\b": "representative ", + r"\bpres\b": "president ", + r"\brev\b": "reverend ", + r"\bhon\b": "honorable ", + r"\basst\b": "assistant ", + r"\bassoc\b": "associate ", + r"\blt\b": "lieutenant ", + r"\bcol\b": "colonel ", + r"\bjr\b": "junior ", + r"\bsr\b": "senior ", + r"\besq\b": "esquire ", + # prefect tenses, ideally it should be any past participles, but it's harder.. 
+ r"'d been\b": " had been", + r"'s been\b": " has been", + r"'d gone\b": " had gone", + r"'s gone\b": " has gone", + r"'d done\b": " had done", # "'s done" is ambiguous + r"'s got\b": " has got", + # general contractions + r"n't\b": " not", + r"'re\b": " are", + r"'s\b": " is", + r"'d\b": " would", + r"'ll\b": " will", + r"'t\b": " not", + r"'ve\b": " have", + r"'m\b": " am", + } + self.standardize_numbers = EnglishNumberNormalizer() + self.standardize_spellings = EnglishSpellingNormalizer(english_spelling_mapping) + + def __call__(self, s: str): + s = s.lower() + + s = re.sub(r"[<\[][^>\]]*[>\]]", "", s) # remove words between brackets + s = re.sub(r"\(([^)]+?)\)", "", s) # remove words between parenthesis + s = re.sub(self.ignore_patterns, "", s) + s = re.sub(r"\s+'", "'", s) # standardize when there's a space before an apostrophe + + for pattern, replacement in self.replacers.items(): + s = re.sub(pattern, replacement, s) + + s = re.sub(r"(\d),(\d)", r"\1\2", s) # remove commas between digits + s = re.sub(r"\.([^0-9]|$)", r" \1", s) # remove periods not followed by numbers + s = remove_symbols_and_diacritics(s, keep=".%$¢€£") # keep some symbols for numerics + + s = self.standardize_numbers(s) + s = self.standardize_spellings(s) + + # now remove prefix/suffix symbols that are not preceded/followed by numbers + s = re.sub(r"[.$¢€£]([^0-9])", r" \1", s) + s = re.sub(r"([^0-9])%", r"\1 ", s) + + s = re.sub(r"\s+", " ", s) # replace any successive whitespace characters with a space + + return s From a17ad44c1186de28d1f46a710f824c59d63a14d2 Mon Sep 17 00:00:00 2001 From: Mohammad Zeineldeen Date: Wed, 12 Jun 2024 18:39:24 +0200 Subject: [PATCH 150/227] Create README.md --- users/zeineldeen/experiments/canary_aed/nemo/README.md | 1 + 1 file changed, 1 insertion(+) create mode 100644 users/zeineldeen/experiments/canary_aed/nemo/README.md diff --git a/users/zeineldeen/experiments/canary_aed/nemo/README.md b/users/zeineldeen/experiments/canary_aed/nemo/README.md new 
file mode 100644 index 000000000..601598cef --- /dev/null +++ b/users/zeineldeen/experiments/canary_aed/nemo/README.md @@ -0,0 +1 @@ +The `normalizer` folder is taken from here: https://github.com/huggingface/open_asr_leaderboard/tree/main/normalizer. The `write_manifest` function was modified in order to pass the manifest output path as parameter to the function. From aafdf607e0062b8f70d95b7a571653d74863ee23 Mon Sep 17 00:00:00 2001 From: Mohammad Zeineldeen Date: Wed, 12 Jun 2024 18:42:15 +0200 Subject: [PATCH 151/227] Update README.md --- users/zeineldeen/experiments/canary_aed/nemo/README.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/users/zeineldeen/experiments/canary_aed/nemo/README.md b/users/zeineldeen/experiments/canary_aed/nemo/README.md index 601598cef..1757e566a 100644 --- a/users/zeineldeen/experiments/canary_aed/nemo/README.md +++ b/users/zeineldeen/experiments/canary_aed/nemo/README.md @@ -1 +1,4 @@ -The `normalizer` folder is taken from here: https://github.com/huggingface/open_asr_leaderboard/tree/main/normalizer. The `write_manifest` function was modified in order to pass the manifest output path as parameter to the function. +Some code here is based on: https://huggingface.co/spaces/hf-audio/open_asr_leaderboard + +- The `normalizer` folder is taken from here: https://github.com/huggingface/open_asr_leaderboard/tree/main/normalizer. The `write_manifest` function was modified in order to pass the manifest output path as parameter to the function. +- `run_eval.py` reads the dataset path and model path from input instead. 
From 0e1d235a42b1def90576c967578d4608f75e5cdb Mon Sep 17 00:00:00 2001 From: Simon Berger Date: Wed, 12 Jun 2024 19:31:16 +0200 Subject: [PATCH 152/227] Update users/berger --- users/berger/args/experiments/transducer.py | 2 +- users/berger/args/returnn/learning_rates.py | 1 - .../config_02b_transducer_rasr_features.py | 40 +++-- ...e_transducer_rasr_features_tinaconf_rtf.py | 148 +++++++++++++++--- ...ig_03b_transducer_fullsum_rasr_features.py | 13 +- .../models/context_1_transducer_tinaconf.py | 37 +++++ .../recognition/generic_seq2seq_search_v2.py | 5 +- users/berger/recipe/recognition/statistics.py | 33 +++- 8 files changed, 226 insertions(+), 53 deletions(-) diff --git a/users/berger/args/experiments/transducer.py b/users/berger/args/experiments/transducer.py index c5c37aa21..67b032755 100644 --- a/users/berger/args/experiments/transducer.py +++ b/users/berger/args/experiments/transducer.py @@ -68,7 +68,7 @@ def get_transducer_recog_step_args( "mem_rqmt": 16, }, "rtf": 50, - "mem": 4, + "mem": 8, } return recursive_update(default_args, kwargs) diff --git a/users/berger/args/returnn/learning_rates.py b/users/berger/args/returnn/learning_rates.py index 9845dce64..176129393 100644 --- a/users/berger/args/returnn/learning_rates.py +++ b/users/berger/args/returnn/learning_rates.py @@ -156,7 +156,6 @@ def get_oclr_function( **kwargs, ) -> str: initial_lr = initial_lr or peak_lr / 10 - decayed_lr = decayed_lr or initial_lr final_lr = final_lr or initial_lr / 5 cycle_epoch = cycle_epoch or (num_epochs * 9) // 20 # 45% of the training diff --git a/users/berger/configs/librispeech/20230210_baselines/config_02b_transducer_rasr_features.py b/users/berger/configs/librispeech/20230210_baselines/config_02b_transducer_rasr_features.py index 9970ae508..603fd7e8d 100644 --- a/users/berger/configs/librispeech/20230210_baselines/config_02b_transducer_rasr_features.py +++ b/users/berger/configs/librispeech/20230210_baselines/config_02b_transducer_rasr_features.py @@ -66,7 +66,10 
@@ def generate_returnn_config( } if train: - (network_dict, extra_python,) = transducer_model.make_context_1_conformer_transducer( + ( + network_dict, + extra_python, + ) = transducer_model.make_context_1_conformer_transducer( num_outputs=num_classes, specaug_args=specaug_args, conformer_args={ @@ -408,18 +411,35 @@ def run_exp( recog_exp_names=["recog_ilm-0.3"], **recog_args, ) + + # switch to seq2seq v2 and run rescale experiments + + for data_input in data.data_inputs.values(): + data_input.create_lm_images(tools.rasr_binary_path) + system.init_corpora( + dev_keys=data.dev_keys, + test_keys=data.test_keys, + align_keys=data.align_keys, + corpus_data=data.data_inputs, + am_args=exp_args.transducer_recog_am_args, + ) + system.setup_scoring() recog_args["search_parameters"].update( { - "label-pruning": 11.0, + "label-pruning": 10.5, "label-pruning-limit": 300, "word-end-pruning": 0.5, "word-end-pruning-limit": 200, } ) + recog_args["seq2seq_v2"] = True + recog_args["rqmt_update"] = {"sbatch_args": ["-A", "rescale_speed", "-p", "rescale_amd"], "cpu": 2} + system.run_recog_step_for_corpora( exp_names=[f"Conformer_Transducer_Viterbi_specaug-v2_{name_suffix}"], + recog_exp_names=["recog_ilm-0.3"], corpora=["dev-other_4gram"], - recog_descriptor="lp-11_lpl-300_wep-0.5_wepl-200", + recog_descriptor="lp-10.5_lpl-300_wep-0.5_wepl-200", **recog_args, ) @@ -450,20 +470,8 @@ def run_exp( ), ) - for data_input in data.data_inputs.values(): - data_input.create_lm_images(tools.rasr_binary_path) - system.init_corpora( - dev_keys=data.dev_keys, - test_keys=data.test_keys, - align_keys=data.align_keys, - corpus_data=data.data_inputs, - am_args=exp_args.transducer_recog_am_args, - ) - system.setup_scoring() - recog_args.update( { - "seq2seq_v2": True, "label_scorer_type": "precomputed-log-posterior", "model_flow_args": {"output_layer_name": "output_precompute"}, } @@ -473,7 +481,7 @@ def run_exp( system.run_recog_step_for_corpora( 
exp_names=[f"Conformer_Transducer_Viterbi_specaug-v2_{name_suffix}"], corpora=["dev-other_4gram"], - recog_descriptor="lp-11_lpl-300_wep-0.5_wepl-200", + recog_descriptor="lp-10.5_lpl-300_wep-0.5_wepl-200", **recog_args, ) diff --git a/users/berger/configs/librispeech/20230210_baselines/config_02e_transducer_rasr_features_tinaconf_rtf.py b/users/berger/configs/librispeech/20230210_baselines/config_02e_transducer_rasr_features_tinaconf_rtf.py index a396a92d2..9a2ff9916 100644 --- a/users/berger/configs/librispeech/20230210_baselines/config_02e_transducer_rasr_features_tinaconf_rtf.py +++ b/users/berger/configs/librispeech/20230210_baselines/config_02e_transducer_rasr_features_tinaconf_rtf.py @@ -48,6 +48,7 @@ def generate_returnn_config( *, train_data_config: dict, dev_data_config: dict, + precompute: bool = False, **kwargs, ) -> ReturnnConfig: if train: @@ -84,24 +85,44 @@ def generate_returnn_config( loss_boost_v2=kwargs.get("loss_boost_v2", False), ) else: - network_dict, extra_python = transducer_model.make_context_1_conformer_transducer_recog( - num_inputs=50, - num_outputs=num_classes, - decoder_args={ - "dec_mlp_args": { - "num_layers": 2, - "size": 640, - "activation": "tanh", + if precompute: + network_dict, extra_python = transducer_model.make_context_1_conformer_transducer_precomputed_recog( + num_inputs=50, + num_outputs=num_classes, + decoder_args={ + "dec_mlp_args": { + "num_layers": 2, + "size": 640, + "activation": "tanh", + }, + "combination_mode": "concat", + "joint_mlp_args": { + "num_layers": 1, + "size": 1024, + "activation": "tanh", + }, + "ilm_scale": kwargs.get("ilm_scale", 0.0), }, - "combination_mode": "concat", - "joint_mlp_args": { - "num_layers": 1, - "size": 1024, - "activation": "tanh", + ) + else: + network_dict, extra_python = transducer_model.make_context_1_conformer_transducer_recog( + num_inputs=50, + num_outputs=num_classes, + decoder_args={ + "dec_mlp_args": { + "num_layers": 2, + "size": 640, + "activation": "tanh", + }, + 
"combination_mode": "concat", + "joint_mlp_args": { + "num_layers": 1, + "size": 1024, + "activation": "tanh", + }, + "ilm_scale": kwargs.get("ilm_scale", 0.0), }, - "ilm_scale": kwargs.get("ilm_scale", 0.0), - }, - ) + ) extra_config = { "train": train_data_config, @@ -134,7 +155,8 @@ def generate_returnn_config( python_prolog=[ "import sys", "sys.setrecursionlimit(10 ** 6)", - ], + ] + + (["from returnn.tf.util.data import FeatureDim"] if precompute else []), extra_python=extra_python, num_inputs=50, num_outputs=num_classes, @@ -281,13 +303,19 @@ def run_exp( returnn_configs = ReturnnConfigs( train_config=train_config, recog_configs={ - f"recog_ilm-{ilm_scale}": generate_returnn_config( + "recog_ilm-0.2": generate_returnn_config( train=False, - ilm_scale=ilm_scale, + ilm_scale=0.2, train_data_config=data.train_data_config, dev_data_config=data.cv_data_config, - ) - for ilm_scale in [0.2] + ), + "recog_ilm-0.2_precompute": generate_returnn_config( + train=False, + precompute=True, + ilm_scale=0.2, + train_data_config=data.train_data_config, + dev_data_config=data.cv_data_config, + ), }, ) @@ -311,7 +339,7 @@ def run_exp( ) recog_args["search_parameters"] = search_params descr = f"lp-{lp:.2f}_lpl-{lpl}_wep-{wep:.2f}_wepl-{wepl}" - # system.run_recog_step_for_corpora(corpora=["dev-other_4gram"], recog_descriptor=descr, **recog_args) + # system.run_recog_step_for_corpora(recog_exp_names=["recog_ilm-0.2"], corpora=["dev-other_4gram"], recog_descriptor=descr, **recog_args) for _ in range(10): lp = np.random.uniform(10.0, 13.0) @@ -328,7 +356,7 @@ def run_exp( ) recog_args["search_parameters"] = search_params descr = f"lp-{lp:.2f}_lpl-{lpl}_wep-{wep:.2f}_wepl-{wepl}" - # system.run_recog_step_for_corpora(corpora=["dev-other_4gram"], recog_descriptor=descr, **recog_args) + # system.run_recog_step_for_corpora(recog_exp_names=["recog_ilm-0.2"], corpora=["dev-other_4gram"], recog_descriptor=descr, **recog_args) for _ in range(20): lp = 12.0 @@ -345,7 +373,7 @@ def run_exp( 
) recog_args["search_parameters"] = search_params descr = f"lp-{lp:.2f}_lpl-{lpl}_wep-{wep:.2f}_wepl-{wepl}" - # system.run_recog_step_for_corpora(corpora=["dev-other_4gram"], recog_descriptor=descr, **recog_args) + # system.run_recog_step_for_corpora(recog_exp_names=["recog_ilm-0.2"], corpora=["dev-other_4gram"], recog_descriptor=descr, **recog_args) for lp in [ 2.0, @@ -382,7 +410,9 @@ def run_exp( ) recog_args["search_parameters"] = search_params descr = f"lp-{lp:.2f}_lpl-300_wep-{wep:.2f}_wepl-200" - system.run_recog_step_for_corpora(corpora=["dev-other_4gram"], recog_descriptor=descr, **recog_args) + system.run_recog_step_for_corpora( + recog_exp_names=["recog_ilm-0.2"], corpora=["dev-other_4gram"], recog_descriptor=descr, **recog_args + ) for lp in [ 2.0, @@ -419,7 +449,73 @@ def run_exp( ) recog_args["search_parameters"] = search_params descr = f"lp-{lp:.2f}_lpl-1000_wep-{wep:.2f}_wepl-500" - system.run_recog_step_for_corpora(corpora=["dev-other_4gram"], recog_descriptor=descr, **recog_args) + system.run_recog_step_for_corpora( + recog_exp_names=["recog_ilm-0.2"], corpora=["dev-other_4gram"], recog_descriptor=descr, **recog_args + ) + + recog_args.update( + { + "seq2seq_v2": True, + "label_scorer_type": "precomputed-log-posterior", + "model_flow_args": {"output_layer_name": "output_precompute"}, + } + ) + for lp in [ + 9.0, + 10.0, + 10.5, + 11.0, + 11.5, + 12.0, + 12.5, + 13.0, + ]: + # for wep in [0.3, 0.5, 0.7]: + for wep in [0.5]: + search_params = get_seq2seq_search_parameters( + lp=lp, + lpl=300, + wep=wep, + wepl=200, + allow_blank=True, + allow_loop=False, + ) + recog_args["search_parameters"] = search_params + descr = f"lp-{lp:.2f}_lpl-300_wep-{wep:.2f}_wepl-200" + system.run_recog_step_for_corpora( + recog_exp_names=["recog_ilm-0.2_precompute"], + corpora=["dev-other_4gram"], + recog_descriptor=descr, + **recog_args, + ) + + for lp in [ + 9.0, + 10.0, + 10.5, + 11.0, + 11.5, + 12.0, + 12.5, + 13.0, + ]: + for wep in [0.5, 0.6]: + search_params = 
get_seq2seq_search_parameters( + lp=lp, + lpl=1000, + wep=wep, + wepl=500, + allow_blank=True, + allow_loop=False, + ) + recog_args["search_parameters"] = search_params + descr = f"lp-{lp:.2f}_lpl-1000_wep-{wep:.2f}_wepl-500" + # system.run_recog_step_for_corpora( + # recog_exp_names=["recog_ilm-0.2_precompute"], + # corpora=["dev-other_4gram"], + # recog_descriptor=descr, + # **recog_args, + # ) # system.run_dev_recog_step(**recog_args) # system.run_test_recog_step(**recog_args) diff --git a/users/berger/configs/librispeech/20230210_baselines/config_03b_transducer_fullsum_rasr_features.py b/users/berger/configs/librispeech/20230210_baselines/config_03b_transducer_fullsum_rasr_features.py index f0cf7ca0d..db0c6d156 100644 --- a/users/berger/configs/librispeech/20230210_baselines/config_03b_transducer_fullsum_rasr_features.py +++ b/users/berger/configs/librispeech/20230210_baselines/config_03b_transducer_fullsum_rasr_features.py @@ -49,7 +49,10 @@ def generate_returnn_config( **kwargs, ) -> ReturnnConfig: if train: - (network_dict, extra_python,) = transducer_model.make_context_1_conformer_transducer_fullsum( + ( + network_dict, + extra_python, + ) = transducer_model.make_context_1_conformer_transducer_fullsum( num_outputs=num_classes, specaug_args={ "max_time_num": 1, @@ -84,7 +87,10 @@ def generate_returnn_config( fullsum_v2=True, ) else: - (network_dict, extra_python,) = transducer_model.make_context_1_conformer_transducer_recog( + ( + network_dict, + extra_python, + ) = transducer_model.make_context_1_conformer_transducer_recog( num_outputs=num_classes, conformer_args={ "num_blocks": 12, @@ -286,7 +292,8 @@ def run_exp(alignments: Dict[str, AlignmentData], viterbi_model_checkpoint: Chec # recog_args["lm_scales"] = [0.8, 0.9] # for lm_lookahead_scale in [0.3, 0.4, 0.45, 0.5, 0.6]: recog_args["lm_scales"] = [0.9] - for lm_lookahead_scale in [0.3, 0.4, 0.45, 0.5, 0.6]: + # for lm_lookahead_scale in [0.3, 0.4, 0.45, 0.5, 0.6]: + for lm_lookahead_scale in [0.45]: 
recog_args["lookahead_options"].update({"lm_lookahead_scale": lm_lookahead_scale}) system.run_recog_step_for_corpora( diff --git a/users/berger/network/models/context_1_transducer_tinaconf.py b/users/berger/network/models/context_1_transducer_tinaconf.py index d4bd41e11..56390c480 100644 --- a/users/berger/network/models/context_1_transducer_tinaconf.py +++ b/users/berger/network/models/context_1_transducer_tinaconf.py @@ -155,3 +155,40 @@ def make_context_1_conformer_transducer_recog( ) return network, python_code + + +def make_context_1_conformer_transducer_precomputed_recog( + num_inputs: int, + num_outputs: int, + decoder_args: Dict = {}, +) -> Tuple[Dict, List]: + network = {} + python_code = [] + + network.update( + encoder.get_best_conformer_network( + size=512, + num_classes=num_outputs, + num_input_feature=num_inputs, + time_tag_name=None, + upsample_by_transposed_conv=False, + chunking="400:200", + label_smoothing=0.0, + additional_args={ + "feature_stacking": False, + "reduction_factor": (1, 4), + "use_spec_augment": False, + }, + ).network + ) + + network["encoder-output"] = { + "class": "copy", + "from": "encoder", + } + + label_context.add_precomputed_context_1_decoder_recog( + network, num_outputs=num_outputs, encoder="encoder-output", **decoder_args + ) + + return network, python_code diff --git a/users/berger/recipe/recognition/generic_seq2seq_search_v2.py b/users/berger/recipe/recognition/generic_seq2seq_search_v2.py index 134b9c0d9..c08d6b69c 100644 --- a/users/berger/recipe/recognition/generic_seq2seq_search_v2.py +++ b/users/berger/recipe/recognition/generic_seq2seq_search_v2.py @@ -49,7 +49,7 @@ def __init__( self.out_log_file = self.log_file_output_path("build_global_cache", crp, False) self.out_global_cache = self.output_path("global.cache", cached=True) - self.rqmt = {"time": 1, "cpu": 1, "mem": 4} + self.rqmt = {"time": 1, "cpu": 1, "mem": 8} def tasks(self): yield Task("create_files", mini_task=True) @@ -356,7 +356,8 @@ def create_config( 
if "minimum_representation" in la_opts: la_config.minimum_representation = la_opts["minimum_representation"] if "lm_lookahead_scale" in la_opts: - la_config.lm_lookahead_scale = la_opts["lm_lookahead_scale"] + # la_config.lm_lookahead_scale = la_opts["lm_lookahead_scale"] + la_config.scale = la_opts["lm_lookahead_scale"] if "cache_low" in la_opts: post_config.flf_lattice_tool.network.recognizer.recognizer.lm_lookahead.cache_size_low = la_opts[ "cache_low" diff --git a/users/berger/recipe/recognition/statistics.py b/users/berger/recipe/recognition/statistics.py index a0fc9e50a..a822bc391 100644 --- a/users/berger/recipe/recognition/statistics.py +++ b/users/berger/recipe/recognition/statistics.py @@ -108,13 +108,31 @@ def run(self): features = seg.find('./layer[@name="recognizer"]/statistics/frames[@port="features"]') seq_ss_statistics[full_name]["frames"] = int(features.attrib["number"]) + seq_ss_statistics[full_name]["encoder_fwd"] = 0.0 + for element in seg.iterfind( + './layer[@name="recognizer"]/information[@component="flf-lattice-tool.network.recognizer.label-scorer"]' + ): + if element.text.strip().startswith("encoder fwd time"): + seq_ss_statistics[full_name]["encoder_fwd"] = float(element.text.strip().split()[-1]) + break + + seq_ss_statistics[full_name]["decoder_fwd"] = 0.0 + for element in seg.iterfind( + './layer[@name="recognizer"]/information[@component="flf-lattice-tool.network.recognizer.label-scorer"]' + ): + if element.text.strip().startswith("decoder fwd time"): + seq_ss_statistics[full_name]["decoder_fwd"] = float(element.text.strip().split()[-1]) + break + tf_fwd = seg.find( './layer[@name="recognizer"]/information[@component="flf-lattice-tool.network.recognizer.feature-extraction.tf-fwd"]' ) if tf_fwd is not None: seq_ss_statistics[full_name]["tf_fwd"] = float(tf_fwd.text.strip().split()[-1]) else: - seq_ss_statistics[full_name]["tf_fwd"] = 0.0 + seq_ss_statistics[full_name]["tf_fwd"] = ( + seq_ss_statistics[full_name]["encoder_fwd"] + 
seq_ss_statistics[full_name]["decoder_fwd"] + ) eval_statistics[full_name] = {} for evaluation in seg.findall(".//evaluation"): @@ -131,7 +149,7 @@ def run(self): for stat, val in s.items(): if stat == "frames": pass - elif stat == "tf_fwd": + elif stat == "tf_fwd" or stat == "encoder_fwd" or stat == "decoder_fwd": prev_count, prev_frames = ss_statistics[stat] ss_statistics[stat] = ( prev_count + val, @@ -162,11 +180,18 @@ def run(self): self.rescoring_rtf.set(rescoring_time / (3600.0 * 1000.0 * self.corpus_duration)) self.tf_lm_time.set(lm_time / (3600.0 * 1000.0)) self.tf_lm_rtf.set(lm_time / (3600.0 * 1000.0 * self.corpus_duration)) - self.decoding_rtf.set((recognizer_time + rescoring_time) / (3600.0 * 1000.0 * self.corpus_duration)) + self.decoding_rtf.set( + (recognizer_time + rescoring_time) / (3600.0 * 1000.0 * self.corpus_duration) + - ss_statistics["encoder_fwd"] + - ss_statistics["decoder_fwd"] + ) self.ss_statistics.set(dict(ss_statistics.items())) self.seq_ss_statistics.set(seq_ss_statistics) self.eval_statistics.set(eval_statistics) - self.overall_rtf.set(self.recognizer_rtf + self.ss_statistics["tf_fwd"]) + if ss_statistics["encoder_fwd"] != 0 and ss_statistics["decoder_fwd"] != 0: + self.overall_rtf.set(self.recognizer_rtf.get()) + else: + self.overall_rtf.set(self.recognizer_rtf + ss_statistics["tf_fwd"]) class ExtractSearchStatisticsJobWeiV2(Job): From 47ba45c149177c47d350788b408793a01851c03f Mon Sep 17 00:00:00 2001 From: Simon Berger Date: Wed, 12 Jun 2024 19:33:27 +0200 Subject: [PATCH 153/227] Update users/berger --- users/berger/args/experiments/ctc.py | 2 +- users/berger/args/returnn/regularization.py | 2 +- .../20230602_rescale_baselines/__init__.py | 27 +- .../config_01_conformer_ctc.py | 47 +- ...ig_02_conformer_transducer_phon_viterbi.py | 298 +++++++++ ...onformer_transducer_phon_viterbi_tuning.py | 533 +++++++++++++++ .../config_03_blstm_ctc.py | 141 ---- ...ig_03_conformer_transducer_phon_fullsum.py | 283 ++++++++ 
...onformer_transducer_phon_align_restrict.py | 275 ++++++++ users/berger/corpus/general/hdf.py | 34 +- .../corpus/tedlium2/phon_transducer_data.py | 21 +- .../tedlium2/viterbi_transducer_data.py | 113 ++++ users/berger/helpers/hdf.py | 9 +- .../pytorch/custom_parts/vgg_frontend.py | 327 +++++++-- users/berger/pytorch/forward/transducer.py | 23 +- .../pytorch/forward/transducer_beam_search.py | 7 +- .../pytorch/models/conformer_transducer_v2.py | 633 +++++++++++++----- .../berger/pytorch/train_steps/transducer.py | 263 +++++++- users/berger/recipe/returnn/onnx.py | 70 ++ .../recipe/returnn/optuna_returnn_training.py | 294 ++++---- .../alignment/optuna_legacy_alignment.py | 7 +- .../alignment/optuna_seq2seq_alignment.py | 8 +- .../functors/alignment/seq2seq_alignment.py | 47 +- .../systems/functors/optuna_rasr_base.py | 55 +- .../recognition/optuna_legacy_search.py | 14 +- .../recognition/optuna_seq2seq_search.py | 145 +++- .../functors/recognition/seq2seq_search.py | 8 +- users/berger/systems/functors/seq2seq_base.py | 9 +- .../training/optuna_returnn_training.py | 3 +- users/berger/systems/types.py | 1 - 30 files changed, 2964 insertions(+), 735 deletions(-) create mode 100644 users/berger/configs/tedlium2/20230602_rescale_baselines/config_02_conformer_transducer_phon_viterbi.py create mode 100644 users/berger/configs/tedlium2/20230602_rescale_baselines/config_02a_conformer_transducer_phon_viterbi_tuning.py delete mode 100644 users/berger/configs/tedlium2/20230602_rescale_baselines/config_03_blstm_ctc.py create mode 100644 users/berger/configs/tedlium2/20230602_rescale_baselines/config_03_conformer_transducer_phon_fullsum.py create mode 100644 users/berger/configs/tedlium2/20230602_rescale_baselines/config_05_conformer_transducer_phon_align_restrict.py create mode 100644 users/berger/corpus/tedlium2/viterbi_transducer_data.py diff --git a/users/berger/args/experiments/ctc.py b/users/berger/args/experiments/ctc.py index 32c0c90d9..b50ba678d 100644 --- 
a/users/berger/args/experiments/ctc.py +++ b/users/berger/args/experiments/ctc.py @@ -67,7 +67,7 @@ def get_ctc_recog_step_args(num_classes: int, reduction_factor: int = 4, **kwarg "mem_rqmt": 16, }, "rtf": 20, - "mem": 4, + "mem": 8, } return recursive_update(default_args, kwargs) diff --git a/users/berger/args/returnn/regularization.py b/users/berger/args/returnn/regularization.py index eda6ff7bf..afde3196b 100644 --- a/users/berger/args/returnn/regularization.py +++ b/users/berger/args/returnn/regularization.py @@ -13,7 +13,7 @@ def get_chunking_config( if isinstance(chunking_factors, list): chunking_factors = {key: 1 for key in chunking_factors} - assert isinstance(chunking_factors, Dict) + assert isinstance(chunking_factors, dict) return { "chunking": ( {key: base_chunk_size // factor for key, factor in chunking_factors.items()}, diff --git a/users/berger/configs/tedlium2/20230602_rescale_baselines/__init__.py b/users/berger/configs/tedlium2/20230602_rescale_baselines/__init__.py index a6c351e5a..5cebc69b1 100644 --- a/users/berger/configs/tedlium2/20230602_rescale_baselines/__init__.py +++ b/users/berger/configs/tedlium2/20230602_rescale_baselines/__init__.py @@ -1,13 +1,17 @@ +import getpass import copy from i6_experiments.users.berger.recipe.summary.report import SummaryReport from i6_experiments.users.berger.systems.dataclasses import SummaryKey from sisyphus import tk, gs - from .config_01_conformer_ctc import py as py_01 +from .config_02_conformer_transducer_phon_viterbi import py as py_02 +from .config_02a_conformer_transducer_phon_viterbi_tuning import py as py_02a +from .config_03_conformer_transducer_phon_fullsum import py as py_03 +from .config_05_conformer_transducer_phon_align_restrict import py as py_05 -from .config_04a_conformer_transducer_bpe import py as py_04a -from .config_04a_conformer_transducer_bpe_rasr import py as py_04a_rasr -from .config_04b_conformer_transducer_phon import py as py_04b +# from .config_04a_conformer_transducer_bpe 
import py as py_04a +# from .config_04a_conformer_transducer_bpe_rasr import py as py_04a_rasr +# from .config_04b_conformer_transducer_phon import py as py_04b def main() -> SummaryReport: @@ -46,11 +50,14 @@ def worker_wrapper(job, task_name, call): "ReturnnForwardComputePriorJob", "OptunaReturnnForwardComputePriorJob", "CompileKenLMJob", + "OptunaReportIntermediateScoreJob", + "OptunaReportFinalScoreJob", } onnx_jobs = { "ExportPyTorchModelToOnnxJob", "TorchOnnxExportJob", "OptunaExportPyTorchModelToOnnxJob", + "OptunaTorchOnnxExportJob", } jobclass = type(job).__name__ if jobclass in rasr_jobs: @@ -71,6 +78,9 @@ def worker_wrapper(job, task_name, call): "apptainer", "exec", ] + + app_call += ["--env", f"NUMBA_CACHE_DIR=/var/tmp/numba_cache_{getpass.getuser()}"] + if t._rqmt.get("gpu", 0) > 0: app_call += ["--nv"] @@ -88,10 +98,11 @@ def worker_wrapper(job, task_name, call): summary_report = SummaryReport() for subreport in [ - copy.deepcopy(py_01()), - copy.deepcopy(py_04a()), - copy.deepcopy(py_04a_rasr()), - copy.deepcopy(py_04b()), + copy.deepcopy(py_01()[0]), + copy.deepcopy(py_02()[0]), + copy.deepcopy(py_02a()), + copy.deepcopy(py_03()), + copy.deepcopy(py_05()), ]: subreport.collapse([SummaryKey.CORPUS.value], best_selector_key=SummaryKey.ERR.value) summary_report.merge_report(subreport, update_structure=True) diff --git a/users/berger/configs/tedlium2/20230602_rescale_baselines/config_01_conformer_ctc.py b/users/berger/configs/tedlium2/20230602_rescale_baselines/config_01_conformer_ctc.py index 7420c202f..ca75695d1 100644 --- a/users/berger/configs/tedlium2/20230602_rescale_baselines/config_01_conformer_ctc.py +++ b/users/berger/configs/tedlium2/20230602_rescale_baselines/config_01_conformer_ctc.py @@ -1,5 +1,6 @@ import copy import os +from typing import Dict, Tuple from i6_models.config import ModuleFactoryV1 from i6_core.returnn.config import ReturnnConfig @@ -12,7 +13,7 @@ from i6_experiments.users.berger.corpus.tedlium2.ctc_data import 
get_tedlium2_data_dumped_labels from i6_experiments.users.berger.pytorch.models import conformer_ctc from i6_experiments.users.berger.recipe.summary.report import SummaryReport -from i6_experiments.users.berger.systems.dataclasses import ConfigVariant, FeatureType, ReturnnConfigs +from i6_experiments.users.berger.systems.dataclasses import AlignmentData, ConfigVariant, FeatureType, ReturnnConfigs from i6_experiments.users.berger.systems.returnn_seq2seq_system import ( ReturnnSeq2SeqSystem, ) @@ -63,7 +64,7 @@ def returnn_config_generator( extra_python=[conformer_ctc.get_serializer(model_config, variant=variant)], extern_data_config=True, backend=Backend.PYTORCH, - grad_noise=kwargs.get("grad_noise", 0.0), + grad_noise=0.0, grad_clip=0.0, optimizer=Optimizers.AdamW, schedule=LearningRateSchedules.OCLR_STEP_TORCH, @@ -112,7 +113,7 @@ def get_returnn_config_collection( ) -def run_exp(num_subepochs: int = 250) -> SummaryReport: +def run_exp(num_subepochs: int = 250) -> Tuple[SummaryReport, Dict[str, AlignmentData]]: assert tools.returnn_root assert tools.returnn_python_exe assert tools.rasr_binary_path @@ -140,6 +141,13 @@ def run_exp(num_subepochs: int = 250) -> SummaryReport: search_stats=True, seq2seq_v2=True, ) + align_args = exp_args.get_ctc_align_step_args( + num_classes=num_outputs, + feature_type=FeatureType.LOGMEL_16K, + prior_scale=0.3, + epoch=num_subepochs, + register_output=True, + ) # ********** System ********** @@ -148,6 +156,7 @@ def run_exp(num_subepochs: int = 250) -> SummaryReport: system.init_corpora( dev_keys=data.dev_keys, test_keys=data.test_keys, + align_keys=data.align_keys, corpus_data=data.data_inputs, am_args=exp_args.ctc_recog_am_args, ) @@ -155,34 +164,34 @@ def run_exp(num_subepochs: int = 250) -> SummaryReport: # ********** Returnn Configs ********** - for grad_noise in [0.0, 0.1]: - system.add_experiment_configs( - f"Conformer_CTC_{num_subepochs}-epochs_gn-{grad_noise}", - get_returnn_config_collection( - 
train_data_config=data.train_data_config, - dev_data_config=data.cv_data_config, - num_subepochs=num_subepochs, - grad_noise=grad_noise, - ), - ) + system.add_experiment_configs( + f"Conformer_CTC_{num_subepochs}-epochs", + get_returnn_config_collection( + train_data_config=data.train_data_config, + dev_data_config=data.cv_data_config, + num_subepochs=num_subepochs, + ), + ) system.run_train_step(**train_args) system.run_dev_recog_step(**recog_args) + align_data = next(iter(system.run_align_step(**align_args).values())) assert system.summary_report - return system.summary_report + return system.summary_report, align_data -def py() -> SummaryReport: +def py() -> Tuple[SummaryReport, Dict[str, AlignmentData]]: filename_handle = os.path.splitext(os.path.basename(__file__))[0][len("config_") :] gs.ALIAS_AND_OUTPUT_SUBDIR = f"{filename_handle}/" summary_report = SummaryReport() - summary_report.merge_report(run_exp(num_subepochs=250), update_structure=True) - summary_report.merge_report(run_exp(num_subepochs=500), update_structure=True) - summary_report.merge_report(run_exp(num_subepochs=1000), update_structure=True) + summary_report.merge_report(run_exp(num_subepochs=250)[0], update_structure=True) + summary_report.merge_report(run_exp(num_subepochs=500)[0], update_structure=True) + report, align_data = run_exp(num_subepochs=1000) + summary_report.merge_report(report, update_structure=True) tk.register_report(f"{gs.ALIAS_AND_OUTPUT_SUBDIR}/summary.report", summary_report) - return summary_report + return summary_report, align_data diff --git a/users/berger/configs/tedlium2/20230602_rescale_baselines/config_02_conformer_transducer_phon_viterbi.py b/users/berger/configs/tedlium2/20230602_rescale_baselines/config_02_conformer_transducer_phon_viterbi.py new file mode 100644 index 000000000..65e6ddbde --- /dev/null +++ b/users/berger/configs/tedlium2/20230602_rescale_baselines/config_02_conformer_transducer_phon_viterbi.py @@ -0,0 +1,298 @@ +import copy +import os +from 
typing import Dict, List, Tuple + +import i6_core.rasr as rasr +from i6_core.returnn import PtCheckpoint +from i6_core.returnn.config import ReturnnConfig +from i6_experiments.users.berger.args.experiments import transducer as exp_args +from i6_experiments.users.berger.args.returnn.config import Backend, get_returnn_config +from i6_experiments.users.berger.args.returnn.learning_rates import LearningRateSchedules, Optimizers +from i6_experiments.users.berger.corpus.tedlium2.viterbi_transducer_data import get_tedlium2_data +from i6_experiments.users.berger.pytorch.custom_parts.identity import IdentityConfig, IdentityModule +from i6_experiments.users.berger.pytorch.models import conformer_transducer_v2 as model +from i6_experiments.users.berger.recipe.summary.report import SummaryReport +from i6_experiments.users.berger.systems.dataclasses import ( + AlignmentData, + EncDecConfig, + FeatureType, + ReturnnConfigs, + SummaryKey, +) +from i6_experiments.users.berger.systems.returnn_seq2seq_system import ReturnnSeq2SeqSystem +from i6_experiments.users.berger.util import default_tools_v2 +from i6_models.config import ModuleFactoryV1 +from sisyphus import gs, tk + +from .config_01_conformer_ctc import py as py_ctc + +# ********** Settings ********** + +rasr.flow.FlowNetwork.default_flags = {"cache_mode": "task_dependent"} + +num_outputs = 79 +num_subepochs = 500 + +tools = copy.deepcopy(default_tools_v2) +tools.rasr_binary_path = tk.Path("/u/berger/repositories/rasr_versions/gen_seq2seq_dev/arch/linux-x86_64-standard") + + +# ********** Return Config generators ********** + + +def returnn_config_generator( + train_data_config: dict, + dev_data_config: dict, + **kwargs, +) -> ReturnnConfig: + model_config = model.get_default_config_v1(num_outputs=num_outputs) + + extra_config = { + "train": train_data_config, + "dev": dev_data_config, + "max_seq_length": {"audio_features": 560000}, + "torch_amp": {"dtype": "bfloat16"}, + } + serializer = 
model.get_viterbi_train_serializer(model_config, enc_loss_scales={5: 0.3, 11: 1.0}, **kwargs) + + return get_returnn_config( + num_epochs=num_subepochs, + num_inputs=1, + num_outputs=num_outputs, + target="classes", + extra_python=[serializer], + extern_data_config=True, + backend=Backend.PYTORCH, + grad_noise=0.0, + grad_clip=0.0, + optimizer=Optimizers.AdamW, + weight_decay=5e-06, + schedule=LearningRateSchedules.OCLR, + initial_lr=8e-05, + peak_lr=kwargs.get("peak_lr", 8e-04), + decayed_lr=1e-05, + final_lr=1e-07, + batch_size=30000 * 160, + use_chunking=False, + extra_config=extra_config, + ) + + +def recog_returnn_configs_generator( + ilm_scale: float = 0.0, + **kwargs, +) -> EncDecConfig[ReturnnConfig]: + model_config = model.get_default_config_v1(num_outputs=num_outputs) + model_config.transcriber_cfg.feature_extraction = ModuleFactoryV1( + IdentityModule, + IdentityConfig(), + ) + if ilm_scale != 0: + model_config = model.FFNNTransducerWithIlmConfig( + transcriber_cfg=model_config.transcriber_cfg, + predictor_cfg=model_config.predictor_cfg, + joiner_cfg=model_config.joiner_cfg, + ilm_scale=ilm_scale, + ) + + enc_extra_config = { + "extern_data": { + "sources": {"dim": 80, "dtype": "float32"}, + }, + "model_outputs": { + "source_encodings": { + "dim": 384, + "dtype": "float32", + }, + }, + } + dec_extra_config = { + "extern_data": { + "source_encodings": { + "dim": 384, + "time_dim_axis": None, + "dtype": "float32", + }, + "history": { + "dim": num_outputs, + "time_dim_axis": None, + "sparse": True, + "shape": (1,), + "dtype": "int32", + }, + }, + "model_outputs": { + "log_probs": { + "dim": num_outputs, + "time_dim_axis": None, + "dtype": "float32", + } + }, + } + enc_serializer = model.get_encoder_recog_serializer(model_config, **kwargs) + dec_serializer = model.get_decoder_recog_serializer(model_config, **kwargs) + + return EncDecConfig( + encoder_config=get_returnn_config( + num_inputs=80, + num_outputs=num_outputs, + target=None, + 
extra_python=[enc_serializer], + extern_data_config=False, + backend=Backend.PYTORCH, + extra_config=enc_extra_config, + ), + decoder_config=get_returnn_config( + num_inputs=1, + num_outputs=num_outputs, + target=None, + extra_python=[dec_serializer], + extern_data_config=False, + backend=Backend.PYTORCH, + extra_config=dec_extra_config, + ), + ) + + +def get_returnn_config_collection( + train_data_config: dict, + dev_data_config: dict, + ilm_scales: List[float] = [0.0, 0.2], + **kwargs, +) -> ReturnnConfigs[ReturnnConfig]: + return ReturnnConfigs( + train_config=returnn_config_generator( + train_data_config=train_data_config, + dev_data_config=dev_data_config, + blank_id=0, + **kwargs, + ), + recog_configs={ + f"recog_ilm-{ilm_scale}": recog_returnn_configs_generator(ilm_scale=ilm_scale, **kwargs) + for ilm_scale in ilm_scales + }, + ) + + +def run_exp(alignments: Dict[str, AlignmentData]) -> Tuple[SummaryReport, PtCheckpoint]: + assert tools.returnn_root + assert tools.returnn_python_exe + assert tools.rasr_binary_path + data = get_tedlium2_data( + alignments=alignments, + returnn_root=tools.returnn_root, + returnn_python_exe=tools.returnn_python_exe, + rasr_binary_path=tools.rasr_binary_path, + augmented_lexicon=True, + feature_type=FeatureType.SAMPLES, + ) + + for data_input in data.data_inputs.values(): + data_input.create_lm_images(tools.rasr_binary_path) + + # ********** Step args ********** + + train_args = exp_args.get_transducer_train_step_args(num_epochs=num_subepochs, gpu_mem_rqmt=24) + recog_args = exp_args.get_transducer_recog_step_args( + num_classes=num_outputs, + epochs=[20, 40, 80, 160, 320, 480, num_subepochs], + label_scorer_type="onnx-ffnn-transducer", + label_scorer_args={"extra_args": {"start_label_index": 0}}, + reduction_subtrahend=3, + reduction_factor=4, + feature_type=FeatureType.LOGMEL_16K, + seq2seq_v2=True, + ) + + # ********** System ********** + + system = ReturnnSeq2SeqSystem( + tool_paths=tools, + summary_keys=[ + 
SummaryKey.TRAIN_NAME, + SummaryKey.RECOG_NAME, + SummaryKey.CORPUS, + SummaryKey.EPOCH, + SummaryKey.LM, + SummaryKey.WER, + SummaryKey.SUB, + SummaryKey.INS, + SummaryKey.DEL, + SummaryKey.ERR, + ], + summary_sort_keys=[SummaryKey.ERR, SummaryKey.CORPUS], + ) + + system.init_corpora( + dev_keys=data.dev_keys, + test_keys=data.test_keys, + corpus_data=data.data_inputs, + am_args=exp_args.transducer_recog_am_args, + ) + system.setup_scoring() + + # ********** Returnn Configs ********** + + system.add_experiment_configs( + "Conformer_Transducer_Viterbi", + get_returnn_config_collection( + data.train_data_config, + data.cv_data_config, + ), + ) + + system.run_train_step(**train_args) + system.run_dev_recog_step(**recog_args) + + system.add_experiment_configs( + "Conformer_Transducer_Viterbi", + get_returnn_config_collection( + data.train_data_config, + data.cv_data_config, + ilm_scales=[0.0, 0.1, 0.2, 0.3, 0.4, 0.5], + ), + ) + + recog_args.update( + { + "epochs": [320], + "lm_scales": [0.5, 0.7, 0.9, 1.1, 1.3, 1.5], + } + ) + for bp in [0.0, 0.5, 1.0, 2.0, 3.0]: + if bp != 0: + recog_args["search_parameters"]["blank-label-penalty"] = bp + system.run_dev_recog_step(recog_descriptor=f"bp-{bp}", **recog_args) + + recog_args.update( + { + "epochs": [num_subepochs], + "lm_scales": [0.5, 0.6, 0.7], + } + ) + for bp in [0.0, 0.5, 1.0]: + if bp != 0: + recog_args["search_parameters"]["blank-label-penalty"] = bp + system.run_dev_recog_step(recog_descriptor=f"bp-{bp}", **recog_args) + + model = system.get_train_job().out_checkpoints[num_subepochs] + assert isinstance(model, PtCheckpoint) + + assert system.summary_report + return system.summary_report, model + + +def py() -> Tuple[SummaryReport, PtCheckpoint]: + _, alignments = py_ctc() + filename_handle = os.path.splitext(os.path.basename(__file__))[0][len("config_") :] + gs.ALIAS_AND_OUTPUT_SUBDIR = f"{filename_handle}/" + + summary_report = SummaryReport() + + report, model = run_exp(alignments) + + 
summary_report.merge_report(report, update_structure=True) + + tk.register_report(f"{gs.ALIAS_AND_OUTPUT_SUBDIR}/summary.report", summary_report) + + return summary_report, model diff --git a/users/berger/configs/tedlium2/20230602_rescale_baselines/config_02a_conformer_transducer_phon_viterbi_tuning.py b/users/berger/configs/tedlium2/20230602_rescale_baselines/config_02a_conformer_transducer_phon_viterbi_tuning.py new file mode 100644 index 000000000..f1f69e84d --- /dev/null +++ b/users/berger/configs/tedlium2/20230602_rescale_baselines/config_02a_conformer_transducer_phon_viterbi_tuning.py @@ -0,0 +1,533 @@ +import copy +from enum import Enum, auto +import torch +import os +from pathlib import Path +from typing import Callable, Dict, List + +import optuna + +import i6_core.rasr as rasr +from i6_core.returnn.config import ReturnnConfig +from i6_experiments.users.berger.args.experiments import transducer as exp_args +from i6_experiments.users.berger.args.returnn.config import Backend, get_returnn_config +from i6_experiments.users.berger.args.returnn.learning_rates import LearningRateSchedules, Optimizers +from i6_experiments.users.berger.corpus.tedlium2.viterbi_transducer_data import get_tedlium2_data +from i6_experiments.users.berger.pytorch.custom_parts.identity import IdentityConfig, IdentityModule +from i6_experiments.users.berger.pytorch.models import conformer_transducer_v2 as model +from i6_experiments.users.berger.recipe.summary.report import SummaryReport +from i6_experiments.users.berger.systems.dataclasses import ( + AlignmentData, + EncDecConfig, + FeatureType, + ReturnnConfigs, + SummaryKey, +) +from i6_experiments.users.berger.util import default_tools_v2 +from i6_models.config import ModuleFactoryV1 +from sisyphus import gs, tk +from i6_experiments.users.berger.recipe.returnn.optuna_config import OptunaReturnnConfig +from i6_experiments.users.berger.systems.optuna_returnn_seq2seq_system import OptunaReturnnSeq2SeqSystem +from 
i6_experiments.users.berger.recipe.returnn.hdf import MatchLengthsJob +from i6_experiments.users.berger.pytorch.custom_parts.vgg_frontend import ( + VGG4LayerActFrontendCeilPoolV1, + VGG4LayerActFrontendCeilPoolV1Config, +) + +from .config_01_conformer_ctc import py as py_ctc + +# ********** Settings ********** + +rasr.flow.FlowNetwork.default_flags = {"cache_mode": "task_dependent"} + +storage_path = Path(__file__).parent.parent / "optuna_studies" / "storage.db" +storage = f"sqlite:///{storage_path.as_posix()}" + +num_outputs = 79 +num_subepochs = 300 +sub_checkpoints = [50, 100, 150, 200, 220, 240, 260, 280, 290, 300] + +tools = copy.deepcopy(default_tools_v2) +tools.rasr_binary_path = tk.Path("/u/berger/repositories/rasr_versions/gen_seq2seq_dev/arch/linux-x86_64-standard") + + +# ********** Return Config generators ********** + + +def subsample_by_4_ceil(x: int) -> int: + return -(-x // 4) + + +def tune_specaugment(trial: optuna.Trial, model_config: model.FFNNTransducerConfig) -> dict: + model_config.transcriber_cfg.specaugment.cfg.time_max_mask_per_n_frames = trial.suggest_int( + "time_max_mask_per_n_frames", 20, 40, step=5 + ) + model_config.transcriber_cfg.specaugment.cfg.time_max_mask_size = trial.suggest_int( + "time_max_mask_size", 10, 30, step=5 + ) + freq_max_num_masks = trial.suggest_categorical("freq_max_num_masks", [8, 10, 16, 20]) + model_config.transcriber_cfg.specaugment.cfg.freq_max_num_masks = freq_max_num_masks + model_config.transcriber_cfg.specaugment.cfg.freq_mask_max_size = 80 // freq_max_num_masks + + return {} + + +def tune_model(trial: optuna.Trial, model_config: model.FFNNTransducerConfig) -> dict: + num_att_heads = trial.suggest_int("att_heads", 6, 8, step=2) + dim_per_head = trial.suggest_int("dim_per_head", 64, 96, step=32) + + total_dim = num_att_heads * dim_per_head + + model_config.transcriber_cfg.encoder.cfg.frontend.cfg.out_features = total_dim + model_config.transcriber_cfg.layer_size = total_dim + 
model_config.transcriber_cfg.encoder.cfg.block_cfg.ff_cfg.input_dim = total_dim + model_config.transcriber_cfg.encoder.cfg.block_cfg.ff_cfg.hidden_dim = 4 * total_dim + model_config.transcriber_cfg.encoder.cfg.block_cfg.mhsa_cfg.input_dim = total_dim + model_config.transcriber_cfg.encoder.cfg.block_cfg.mhsa_cfg.num_att_heads = num_att_heads + model_config.transcriber_cfg.encoder.cfg.block_cfg.conv_cfg.channels = total_dim + model_config.transcriber_cfg.encoder.cfg.block_cfg.conv_cfg.norm = torch.nn.BatchNorm1d( + num_features=total_dim, affine=False + ) + + model_config.transcriber_cfg.encoder.cfg.block_cfg.conv_cfg.kernel_size = trial.suggest_categorical( + "conv_kernel_size", [7, 15, 31] + ) + + model_config.predictor_cfg.layers = trial.suggest_int("predictor_layers", 1, 2) + + join_combination_mode = trial.suggest_categorical("joiner_combination", ["add", "concat"]) + if join_combination_mode == "add": + predictor_layer_size = total_dim + model_config.joiner_cfg.combination_mode = model.CombinationMode.SUM + model_config.joiner_cfg.input_size = total_dim + else: + predictor_layer_size = trial.suggest_int("predictor_layer_size", 384, 640, step=128) + model_config.joiner_cfg.combination_mode = model.CombinationMode.CONCAT + model_config.joiner_cfg.input_size = total_dim + predictor_layer_size + + model_config.predictor_cfg.layer_size = predictor_layer_size + + dropout = trial.suggest_categorical("dropout", [0.1, 0.2, 0.3]) + + model_config.transcriber_cfg.encoder.cfg.block_cfg.ff_cfg.dropout = dropout + model_config.transcriber_cfg.encoder.cfg.block_cfg.mhsa_cfg.dropout = dropout + model_config.transcriber_cfg.encoder.cfg.block_cfg.mhsa_cfg.att_weights_dropout = dropout + model_config.transcriber_cfg.encoder.cfg.block_cfg.conv_cfg.dropout = dropout + + layer_order = trial.suggest_categorical("layer_order", ["conv_first", "mhsa_first"]) + if layer_order == "conv_first": + model_config.transcriber_cfg.encoder.cfg.block_cfg.modules = ["ff", "conv", "mhsa", "ff"] + 
else: + model_config.transcriber_cfg.encoder.cfg.block_cfg.modules = ["ff", "mhsa", "conv", "ff"] + + return {} + + +def tune_model_broad(trial: optuna.Trial, model_config: model.FFNNTransducerConfig) -> dict: + size = trial.suggest_categorical("size", ["small", "medium", "large"]) + + def build_model( + att_heads: int, dim_per_head: int, conv_kernel_size: int, predictor_layers: int, predictor_layer_size: int + ) -> None: + total_dim = att_heads * dim_per_head + model_config.transcriber_cfg.encoder.cfg.frontend = ModuleFactoryV1( + VGG4LayerActFrontendCeilPoolV1, + VGG4LayerActFrontendCeilPoolV1Config( + in_features=80, + conv1_channels=32, + conv2_channels=64, + conv3_channels=32, + conv4_channels=32, + conv_kernel_size=(3, 3), + conv_padding=None, + pool1_kernel_size=(2, 1), + pool1_stride=(2, 1), + pool1_padding=None, + pool2_kernel_size=(2, 1), + pool2_stride=(2, 1), + pool2_padding=None, + activation=torch.nn.ReLU(), + out_features=total_dim, + ), + ) + + model_config.transcriber_cfg.encoder.cfg.frontend.cfg.out_features = total_dim + model_config.transcriber_cfg.layer_size = total_dim + model_config.transcriber_cfg.encoder.cfg.block_cfg.ff_cfg.input_dim = total_dim + model_config.transcriber_cfg.encoder.cfg.block_cfg.ff_cfg.hidden_dim = 4 * total_dim + model_config.transcriber_cfg.encoder.cfg.block_cfg.mhsa_cfg.input_dim = total_dim + model_config.transcriber_cfg.encoder.cfg.block_cfg.mhsa_cfg.num_att_heads = att_heads + model_config.transcriber_cfg.encoder.cfg.block_cfg.conv_cfg.channels = total_dim + model_config.transcriber_cfg.encoder.cfg.block_cfg.conv_cfg.norm = torch.nn.BatchNorm1d( + num_features=total_dim, affine=False + ) + + model_config.transcriber_cfg.encoder.cfg.block_cfg.conv_cfg.kernel_size = conv_kernel_size + + model_config.predictor_cfg.layers = predictor_layers + + model_config.joiner_cfg.combination_mode = model.CombinationMode.CONCAT + model_config.joiner_cfg.input_size = total_dim + predictor_layer_size + + 
model_config.predictor_cfg.layer_size = predictor_layer_size + + model_config.transcriber_cfg.encoder.cfg.block_cfg.ff_cfg.dropout = 0.3 + model_config.transcriber_cfg.encoder.cfg.block_cfg.mhsa_cfg.dropout = 0.3 + model_config.transcriber_cfg.encoder.cfg.block_cfg.mhsa_cfg.att_weights_dropout = 0.3 + model_config.transcriber_cfg.encoder.cfg.block_cfg.conv_cfg.dropout = 0.3 + + model_config.transcriber_cfg.encoder.cfg.block_cfg.modules = ["ff", "conv", "mhsa", "ff"] + + if size == "small": + build_model( + att_heads=6, + dim_per_head=64, + conv_kernel_size=7, + predictor_layers=2, + predictor_layer_size=384, + ) + elif size == "medium": + build_model( + att_heads=8, + dim_per_head=64, + conv_kernel_size=7, + predictor_layers=2, + predictor_layer_size=384, + ) + elif size == "large": + build_model( + att_heads=8, + dim_per_head=96, + conv_kernel_size=7, + predictor_layers=2, + predictor_layer_size=384, + ) + + return {} + + +def tune_learn_schedule(trial: optuna.Trial, _: model.FFNNTransducerConfig) -> dict: + batch_size = trial.suggest_int("batch_size", 10000, 30000, step=5000) + peak_lr = trial.suggest_float("peak_lr", 4e-04, 1e-03, log=True) + initial_lr = peak_lr / 10 + return {"initial_lr": initial_lr, "peak_lr": peak_lr, "batch_size": batch_size * 160} + + +class TuningOption(Enum): + SPECAUGMENT = auto() + MODEL = auto() + MODEL_BROAD = auto() + LEARN_SCHEDULE = auto() + + +def map_tuning_option(tuning_option: TuningOption) -> Callable[[optuna.Trial, model.FFNNTransducerConfig], dict]: + if tuning_option == TuningOption.SPECAUGMENT: + return tune_specaugment + if tuning_option == TuningOption.MODEL: + return tune_model + if tuning_option == TuningOption.MODEL_BROAD: + return tune_model_broad + if tuning_option == TuningOption.LEARN_SCHEDULE: + return tune_learn_schedule + + +def returnn_config_generator( + trial: optuna.Trial, + tuning_options: List[TuningOption], + train_data_config: dict, + dev_data_config: dict, +) -> ReturnnConfig: + model_config = 
model.get_default_config_v1(num_outputs=num_outputs) + model_config.transcriber_cfg.feature_extraction = ModuleFactoryV1( + IdentityModule, + IdentityConfig(), + ) + + tuning_kwargs = {} + for tuning_option in tuning_options: + tuning_kwargs.update(map_tuning_option(tuning_option)(trial, model_config)) + + extra_config = { + "train": train_data_config, + "dev": dev_data_config, + "max_seq_length": {"audio_features": 560000}, + "torch_amp": {"dtype": "bfloat16"}, + "chunking": ( + { + "data": 256, + "classes": 64, + }, + { + "data": 128, + "classes": 32, + }, + ), + } + serializer = model.get_viterbi_train_serializer(model_config, enc_loss_scales={5: 0.3, 11: 0.7}) + + return get_returnn_config( + num_epochs=num_subepochs, + num_inputs=80, + num_outputs=num_outputs, + target="classes", + extra_python=[serializer], + extern_data_config=True, + backend=Backend.PYTORCH, + grad_noise=0.0, + grad_clip=0.0, + keep_last_n=1, + keep_best_n=0, + keep=sub_checkpoints, + optimizer=Optimizers.AdamW, + weight_decay=5e-06, + schedule=LearningRateSchedules.OCLR, + initial_lr=8e-05, + peak_lr=8e-04, + decayed_lr=1e-05, + final_lr=1e-07, + batch_size=30000 * 160, + use_chunking=False, + extra_config=extra_config, + ) + + +def recog_enc_returnn_config_generator( + trial: optuna.Trial, + tuning_options: List[TuningOption], + ilm_scale: float = 0.0, +) -> ReturnnConfig: + model_config = model.get_default_config_v1(num_outputs=num_outputs) + model_config.transcriber_cfg.feature_extraction = ModuleFactoryV1( + IdentityModule, + IdentityConfig(), + ) + for tuning_option in tuning_options: + map_tuning_option(tuning_option)(trial, model_config) + if ilm_scale != 0: + model_config = model.FFNNTransducerWithIlmConfig( + transcriber_cfg=model_config.transcriber_cfg, + predictor_cfg=model_config.predictor_cfg, + joiner_cfg=model_config.joiner_cfg, + ilm_scale=ilm_scale, + ) + + enc_extra_config = { + "extern_data": { + "sources": {"dim": 80, "dtype": "float32"}, + }, + "model_outputs": { + 
"source_encodings": { + "dim": model_config.transcriber_cfg.layer_size, + "dtype": "float32", + }, + }, + } + enc_serializer = model.get_encoder_recog_serializer(model_config) + + return get_returnn_config( + num_inputs=80, + num_outputs=num_outputs, + target=None, + extra_python=[enc_serializer], + extern_data_config=False, + backend=Backend.PYTORCH, + extra_config=enc_extra_config, + ) + + +def recog_dec_returnn_config_generator( + trial: optuna.Trial, + tuning_options: List[TuningOption], + ilm_scale: float = 0.0, +) -> ReturnnConfig: + model_config = model.get_default_config_v1(num_outputs=num_outputs) + for tuning_option in tuning_options: + map_tuning_option(tuning_option)(trial, model_config) + model_config.transcriber_cfg.feature_extraction = ModuleFactoryV1( + IdentityModule, + IdentityConfig(), + ) + if ilm_scale != 0: + model_config = model.FFNNTransducerWithIlmConfig( + transcriber_cfg=model_config.transcriber_cfg, + predictor_cfg=model_config.predictor_cfg, + joiner_cfg=model_config.joiner_cfg, + ilm_scale=ilm_scale, + ) + + dec_extra_config = { + "extern_data": { + "source_encodings": { + "dim": model_config.transcriber_cfg.layer_size, + "time_dim_axis": None, + "dtype": "float32", + }, + "history": { + "dim": num_outputs, + "time_dim_axis": None, + "sparse": True, + "shape": (1,), + "dtype": "int32", + }, + }, + "model_outputs": { + "log_probs": { + "dim": num_outputs, + "time_dim_axis": None, + "dtype": "float32", + } + }, + } + dec_serializer = model.get_decoder_recog_serializer(model_config) + + return get_returnn_config( + num_inputs=80, + num_outputs=num_outputs, + target=None, + extra_python=[dec_serializer], + extern_data_config=False, + backend=Backend.PYTORCH, + extra_config=dec_extra_config, + ) + + +def get_returnn_config_collection( + tuning_options: List[TuningOption], + train_data_config: dict, + dev_data_config: dict, + ilm_scales: List[float] = [0.2], +) -> ReturnnConfigs[OptunaReturnnConfig]: + return ReturnnConfigs( + 
train_config=OptunaReturnnConfig( + returnn_config_generator, + { + "train_data_config": train_data_config, + "dev_data_config": dev_data_config, + "tuning_options": tuning_options, + }, + ), + recog_configs={ + f"recog_ilm-{ilm_scale}": EncDecConfig( + encoder_config=OptunaReturnnConfig( + recog_enc_returnn_config_generator, {"ilm_scale": ilm_scale, "tuning_options": tuning_options} + ), + decoder_config=OptunaReturnnConfig( + recog_dec_returnn_config_generator, {"ilm_scale": ilm_scale, "tuning_options": tuning_options} + ), + ) + for ilm_scale in ilm_scales + }, + ) + + +def run_exp(alignments: Dict[str, AlignmentData]) -> SummaryReport: + assert tools.returnn_root + assert tools.returnn_python_exe + assert tools.rasr_binary_path + data = get_tedlium2_data( + alignments=alignments, + returnn_root=tools.returnn_root, + returnn_python_exe=tools.returnn_python_exe, + rasr_binary_path=tools.rasr_binary_path, + augmented_lexicon=True, + feature_type=FeatureType.LOGMEL_16K, + ) + + for data_input in data.data_inputs.values(): + data_input.create_lm_images(tools.rasr_binary_path) + + for data_config in [data.train_data_config, data.cv_data_config]: + data_config["datasets"]["classes"]["files"] = [ + MatchLengthsJob( + hdf_file, + match_hdfs=data_config["datasets"]["data"]["files"], + match_len_transform_func=subsample_by_4_ceil, + ).out_hdf + for hdf_file in data_config["datasets"]["classes"]["files"] + ] + + # ********** Step args ********** + + train_args = exp_args.get_transducer_train_step_args( + num_epochs=num_subepochs, + study_storage=storage, + num_parallel=5, + gpu_mem_rqmt=24, + backend=Backend.PYTORCH, + ) + recog_args = exp_args.get_transducer_recog_step_args( + num_classes=num_outputs, + epochs=sub_checkpoints, + trial_nums=list(range(30)), + label_scorer_type="onnx-ffnn-transducer", + label_scorer_args={"extra_args": {"start_label_index": 0}}, + search_parameters={"blank-label-penalty": 1.0}, + reduction_subtrahend=3, + reduction_factor=4, + 
feature_type=FeatureType.LOGMEL_16K, + seq2seq_v2=True, + backend=Backend.PYTORCH, + ) + + # ********** System ********** + + system = OptunaReturnnSeq2SeqSystem( + tool_paths=tools, + summary_keys=[ + SummaryKey.TRAIN_NAME, + SummaryKey.RECOG_NAME, + SummaryKey.CORPUS, + SummaryKey.TRIAL, + SummaryKey.EPOCH, + SummaryKey.LM, + SummaryKey.WER, + SummaryKey.SUB, + SummaryKey.INS, + SummaryKey.DEL, + SummaryKey.ERR, + ], + summary_sort_keys=[SummaryKey.ERR, SummaryKey.CORPUS], + ) + + system.init_corpora( + dev_keys=data.dev_keys, + test_keys=data.test_keys, + corpus_data=data.data_inputs, + am_args=exp_args.transducer_recog_am_args, + ) + system.setup_scoring() + + # ********** Returnn Configs ********** + + train_args["num_trials"] = 30 + recog_args["trial_nums"] = list(range(30)) + system.add_experiment_configs( + "Conformer_Transducer_Viterbi_tuning", + get_returnn_config_collection( + tuning_options=[TuningOption.MODEL_BROAD, TuningOption.LEARN_SCHEDULE, TuningOption.SPECAUGMENT], + train_data_config=data.train_data_config, + dev_data_config=data.cv_data_config, + ), + ) + + system.run_train_step(**train_args) + system.run_dev_recog_step(**recog_args) + + assert system.summary_report + return system.summary_report + + +def py() -> SummaryReport: + _, alignments = py_ctc() + filename_handle = os.path.splitext(os.path.basename(__file__))[0][len("config_") :] + gs.ALIAS_AND_OUTPUT_SUBDIR = f"{filename_handle}/" + + summary_report = SummaryReport() + + report = run_exp(alignments) + + summary_report.merge_report(report, update_structure=True) + + tk.register_report(f"{gs.ALIAS_AND_OUTPUT_SUBDIR}/summary.report", summary_report) + + return summary_report diff --git a/users/berger/configs/tedlium2/20230602_rescale_baselines/config_03_blstm_ctc.py b/users/berger/configs/tedlium2/20230602_rescale_baselines/config_03_blstm_ctc.py deleted file mode 100644 index 9a4788cbc..000000000 --- a/users/berger/configs/tedlium2/20230602_rescale_baselines/config_03_blstm_ctc.py +++ 
/dev/null @@ -1,141 +0,0 @@ -import copy -import os -from i6_core.returnn.config import ReturnnConfig - -from sisyphus import gs, tk - -import i6_core.rasr as rasr -from i6_experiments.users.berger.args.experiments import ctc as exp_args -from i6_experiments.users.berger.args.returnn.config import get_returnn_config, Backend -from i6_experiments.users.berger.args.returnn.learning_rates import ( - LearningRateSchedules, -) -from i6_experiments.users.berger.corpus.tedlium2.ctc_data import get_tedlium2_pytorch_data -from i6_experiments.users.berger.pytorch.models import conformer_ctc -from i6_experiments.users.berger.recipe.summary.report import SummaryReport -from i6_experiments.users.berger.systems.dataclasses import ConfigVariant, FeatureType, ReturnnConfigs -from i6_experiments.users.berger.systems.returnn_seq2seq_system import ( - ReturnnSeq2SeqSystem, -) -from i6_experiments.users.berger.util import default_tools_v2 - -# ********** Settings ********** - -rasr.flow.FlowNetwork.default_flags = {"cache_mode": "task_dependent"} - -num_outputs = 79 -num_subepochs = 150 - -tools = copy.deepcopy(default_tools_v2) - -tools.rasr_binary_path = tk.Path("/u/berger/repositories/rasr_versions/onnx/arch/linux-x86_64-standard") -# tools.returnn_root = tk.Path("/u/berger/repositories/MiniReturnn") - - -# ********** Return Config generators ********** - - -def returnn_config_generator(variant: ConfigVariant, train_data_config: dict, dev_data_config: dict) -> ReturnnConfig: - model_config = conformer_ctc.get_default_config_v1(num_inputs=50, num_outputs=num_outputs) - - extra_config = { - "train": train_data_config, - "dev": dev_data_config, - } - if variant == ConfigVariant.RECOG: - extra_config["model_outputs"] = {"classes": {"dim": num_outputs}} - - return get_returnn_config( - num_epochs=num_subepochs, - num_inputs=50, - num_outputs=num_outputs, - target="targets", - extra_python=[conformer_ctc.get_serializer(model_config, variant=variant)], - extern_data_config=True, - 
backend=Backend.PYTORCH, - grad_noise=0.0, - grad_clip=0.0, - schedule=LearningRateSchedules.OCLR, - initial_lr=1e-05, - peak_lr=3e-04, - final_lr=1e-05, - batch_size=10000, - use_chunking=False, - extra_config=extra_config, - ) - - -def get_returnn_config_collection( - train_data_config: dict, - dev_data_config: dict, -) -> ReturnnConfigs[ReturnnConfig]: - generator_kwargs = {"train_data_config": train_data_config, "dev_data_config": dev_data_config} - return ReturnnConfigs( - train_config=returnn_config_generator(variant=ConfigVariant.TRAIN, **generator_kwargs), - prior_config=returnn_config_generator(variant=ConfigVariant.PRIOR, **generator_kwargs), - recog_configs={"recog": returnn_config_generator(variant=ConfigVariant.RECOG, **generator_kwargs)}, - ) - - -def run_exp() -> SummaryReport: - assert tools.returnn_root - assert tools.returnn_python_exe - assert tools.rasr_binary_path - data = get_tedlium2_pytorch_data( - returnn_root=tools.returnn_root, - returnn_python_exe=tools.returnn_python_exe, - rasr_binary_path=tools.rasr_binary_path, - augmented_lexicon=True, - ) - - # ********** Step args ********** - - train_args = exp_args.get_ctc_train_step_args(num_epochs=num_subepochs) - recog_args = exp_args.get_ctc_recog_step_args( - num_classes=num_outputs, - epochs=[40, 80, num_subepochs], - prior_scales=[0.4], - lm_scales=[0.7], - feature_type=FeatureType.GAMMATONE_16K, - ) - - # ********** System ********** - - # tools.returnn_root = tk.Path("/u/berger/repositories/MiniReturnn") - tools.rasr_binary_path = tk.Path( - "/u/berger/repositories/rasr_versions/gen_seq2seq_onnx_apptainer/arch/linux-x86_64-standard" - ) - system = ReturnnSeq2SeqSystem(tools) - - system.init_corpora( - dev_keys=data.dev_keys, - test_keys=data.test_keys, - corpus_data=data.data_inputs, - am_args=exp_args.ctc_recog_am_args, - ) - system.setup_scoring() - - # ********** Returnn Configs ********** - - system.add_experiment_configs( - f"Conformer_CTC", 
get_returnn_config_collection(data.train_data_config, data.cv_data_config) - ) - - system.run_train_step(**train_args) - system.run_dev_recog_step(**recog_args) - - assert system.summary_report - return system.summary_report - - -def py() -> SummaryReport: - filename_handle = os.path.splitext(os.path.basename(__file__))[0][len("config_") :] - gs.ALIAS_AND_OUTPUT_SUBDIR = f"{filename_handle}/" - - summary_report = SummaryReport() - - summary_report.merge_report(run_exp(), update_structure=True) - - tk.register_report(f"{gs.ALIAS_AND_OUTPUT_SUBDIR}/summary.report", summary_report) - - return summary_report diff --git a/users/berger/configs/tedlium2/20230602_rescale_baselines/config_03_conformer_transducer_phon_fullsum.py b/users/berger/configs/tedlium2/20230602_rescale_baselines/config_03_conformer_transducer_phon_fullsum.py new file mode 100644 index 000000000..5ae641c0f --- /dev/null +++ b/users/berger/configs/tedlium2/20230602_rescale_baselines/config_03_conformer_transducer_phon_fullsum.py @@ -0,0 +1,283 @@ +import copy +import os +from typing import List + +from i6_models.config import ModuleFactoryV1 +import i6_core.rasr as rasr +from i6_core.returnn import PtCheckpoint +from i6_core.returnn.config import ReturnnConfig +from i6_experiments.users.berger.args.experiments import transducer as exp_args +from i6_experiments.users.berger.args.returnn.config import Backend, get_returnn_config +from i6_experiments.users.berger.args.returnn.learning_rates import LearningRateSchedules, Optimizers +from i6_experiments.users.berger.corpus.tedlium2.phon_transducer_data import get_tedlium2_data_dumped_labels +from i6_experiments.users.berger.pytorch.models import conformer_transducer_v2 as model +from i6_experiments.users.berger.recipe.summary.report import SummaryReport +from i6_experiments.users.berger.systems.dataclasses import ( + EncDecConfig, + FeatureType, + ReturnnConfigs, + SummaryKey, +) +from i6_experiments.users.berger.systems.returnn_seq2seq_system import 
ReturnnSeq2SeqSystem +from i6_experiments.users.berger.util import default_tools_v2 +from sisyphus import gs, tk +from i6_experiments.users.berger.pytorch.custom_parts.identity import IdentityConfig, IdentityModule + +from .config_02_conformer_transducer_phon_viterbi import py as py_viterbi + +# ********** Settings ********** + +rasr.flow.FlowNetwork.default_flags = {"cache_mode": "task_dependent"} + +num_outputs = 79 +num_subepochs = 500 + +tools = copy.deepcopy(default_tools_v2) +tools.rasr_binary_path = tk.Path("/u/berger/repositories/rasr_versions/gen_seq2seq_dev/arch/linux-x86_64-standard") + + +# ********** Return Config generators ********** + + +def returnn_config_generator( + train_data_config: dict, + dev_data_config: dict, + pretrained_checkpoint: PtCheckpoint, + **kwargs, +) -> ReturnnConfig: + model_config = model.get_default_config_v1(num_outputs=num_outputs) + + extra_config = { + "train": train_data_config, + "dev": dev_data_config, + "max_seq_length": {"audio_features": 560000}, + "torch_amp": {"dtype": "bfloat16"}, + "preload_from_files": { + "pretrained_model": { + "filename": pretrained_checkpoint, + "init_for_train": True, + }, + }, + } + serializer = model.get_train_serializer(model_config, **kwargs) + + return get_returnn_config( + num_epochs=num_subepochs, + num_inputs=1, + num_outputs=num_outputs, + target="classes", + extra_python=[serializer], + extern_data_config=True, + backend=Backend.PYTORCH, + grad_noise=0.0, + grad_clip=0.0, + optimizer=Optimizers.AdamW, + schedule=LearningRateSchedules.CONST_DECAY, + const_lr=8e-05, + decayed_lr=1e-05, + final_lr=1e-06, + batch_size=15000 * 160, + accum_grad=2, + use_chunking=False, + extra_config=extra_config, + ) + + +def recog_returnn_configs_generator( + ilm_scale: float = 0.0, + **kwargs, +) -> EncDecConfig[ReturnnConfig]: + model_config = model.get_default_config_v1(num_outputs=num_outputs) + model_config.transcriber_cfg.feature_extraction = ModuleFactoryV1( + IdentityModule, + 
IdentityConfig(), + ) + + if ilm_scale != 0: + model_config = model.FFNNTransducerWithIlmConfig( + transcriber_cfg=model_config.transcriber_cfg, + predictor_cfg=model_config.predictor_cfg, + joiner_cfg=model_config.joiner_cfg, + ilm_scale=ilm_scale, + ) + + enc_extra_config = { + "extern_data": { + "sources": {"dim": 80, "dtype": "float32"}, + }, + "model_outputs": { + "source_encodings": { + "dim": 384, + "dtype": "float32", + }, + }, + } + dec_extra_config = { + "extern_data": { + "source_encodings": { + "dim": 384, + "time_dim_axis": None, + "dtype": "float32", + }, + "history": { + "dim": num_outputs, + "time_dim_axis": None, + "sparse": True, + "shape": (1,), + "dtype": "int32", + }, + }, + "model_outputs": { + "log_probs": { + "dim": num_outputs, + "time_dim_axis": None, + "dtype": "float32", + } + }, + } + enc_serializer = model.get_encoder_recog_serializer(model_config, **kwargs) + dec_serializer = model.get_decoder_recog_serializer(model_config, **kwargs) + + return EncDecConfig( + encoder_config=get_returnn_config( + num_inputs=80, + num_outputs=num_outputs, + target=None, + extra_python=[enc_serializer], + extern_data_config=False, + backend=Backend.PYTORCH, + extra_config=enc_extra_config, + ), + decoder_config=get_returnn_config( + num_inputs=1, + num_outputs=num_outputs, + target=None, + # python_prolog=["from returnn.tensor.dim import Dim, batch_dim"], + extra_python=[dec_serializer], + extern_data_config=False, + backend=Backend.PYTORCH, + extra_config=dec_extra_config, + ), + ) + + +def get_returnn_config_collection( + train_data_config: dict, + dev_data_config: dict, + ilm_scales: List[float] = [0.2], + **kwargs, +) -> ReturnnConfigs[ReturnnConfig]: + return ReturnnConfigs( + train_config=returnn_config_generator( + train_data_config=train_data_config, + dev_data_config=dev_data_config, + blank_id=0, + **kwargs, + ), + recog_configs={ + f"recog_ilm-{ilm_scale}": recog_returnn_configs_generator(ilm_scale=ilm_scale, **kwargs) + for ilm_scale in 
ilm_scales + }, + ) + + +def run_exp(pretrained_checkpoint: PtCheckpoint) -> SummaryReport: + assert tools.returnn_root + assert tools.returnn_python_exe + assert tools.rasr_binary_path + data = get_tedlium2_data_dumped_labels( + num_classes=num_outputs, + returnn_root=tools.returnn_root, + returnn_python_exe=tools.returnn_python_exe, + rasr_binary_path=tools.rasr_binary_path, + augmented_lexicon=True, + feature_type=FeatureType.SAMPLES, + ) + + # ********** Step args ********** + + train_args = exp_args.get_transducer_train_step_args(num_epochs=num_subepochs, gpu_mem_rqmt=24) + recog_args = exp_args.get_transducer_recog_step_args( + num_classes=num_outputs, + lm_scales=[0.7], + epochs=[40, 80, 160, 320, 480, num_subepochs], + search_parameters={"blank-label-penalty": 1.0}, + label_scorer_type="onnx-ffnn-transducer", + label_scorer_args={"extra_args": {"start_label_index": 0}}, + reduction_subtrahend=3, + # reduction_subtrahend=0, + reduction_factor=4, + feature_type=FeatureType.LOGMEL_16K, + ) + + # ********** System ********** + + system = ReturnnSeq2SeqSystem( + tool_paths=tools, + summary_keys=[ + SummaryKey.TRAIN_NAME, + SummaryKey.RECOG_NAME, + SummaryKey.CORPUS, + SummaryKey.EPOCH, + SummaryKey.LM, + SummaryKey.WER, + SummaryKey.SUB, + SummaryKey.INS, + SummaryKey.DEL, + SummaryKey.ERR, + ], + summary_sort_keys=[SummaryKey.ERR, SummaryKey.CORPUS], + ) + + system.init_corpora( + dev_keys=data.dev_keys, + test_keys=data.test_keys, + corpus_data=data.data_inputs, + am_args=exp_args.transducer_recog_am_args, + ) + system.setup_scoring() + + # ********** Returnn Configs ********** + + system.add_experiment_configs( + "Conformer_Transducer_Fullsum", + get_returnn_config_collection( + data.train_data_config, + data.cv_data_config, + pretrained_checkpoint=pretrained_checkpoint, + ), + ) + + system.run_train_step(**train_args) + system.run_dev_recog_step(recog_descriptor="bp-1.0", **recog_args) + + system.add_experiment_configs( + "Conformer_Transducer_Fullsum", + 
get_returnn_config_collection( + data.train_data_config, + data.cv_data_config, + ilm_scales=[0.2, 0.4, 0.6], + pretrained_checkpoint=pretrained_checkpoint, + ), + ) + recog_args["lm_scales"] = [0.7, 0.9, 1.1, 1.3, 1.5] + recog_args["epochs"] = [num_subepochs] + recog_args["search_parameters"]["blank-label-penalty"] = 0.0 + system.run_dev_recog_step(**recog_args) + + assert system.summary_report + return system.summary_report + + +def py() -> SummaryReport: + _, model = py_viterbi() + filename_handle = os.path.splitext(os.path.basename(__file__))[0][len("config_") :] + gs.ALIAS_AND_OUTPUT_SUBDIR = f"{filename_handle}/" + + summary_report = SummaryReport() + + summary_report.merge_report(run_exp(model), update_structure=True) + + tk.register_report(f"{gs.ALIAS_AND_OUTPUT_SUBDIR}/summary.report", summary_report) + + return summary_report diff --git a/users/berger/configs/tedlium2/20230602_rescale_baselines/config_05_conformer_transducer_phon_align_restrict.py b/users/berger/configs/tedlium2/20230602_rescale_baselines/config_05_conformer_transducer_phon_align_restrict.py new file mode 100644 index 000000000..d5a805069 --- /dev/null +++ b/users/berger/configs/tedlium2/20230602_rescale_baselines/config_05_conformer_transducer_phon_align_restrict.py @@ -0,0 +1,275 @@ +import copy +import os +from typing import Dict, List, Tuple + +import i6_core.rasr as rasr +from i6_core.returnn import PtCheckpoint +from i6_core.returnn.config import ReturnnConfig +from i6_experiments.users.berger.args.experiments import transducer as exp_args +from i6_experiments.users.berger.args.returnn.config import Backend, get_returnn_config +from i6_experiments.users.berger.args.returnn.learning_rates import LearningRateSchedules, Optimizers +from i6_experiments.users.berger.corpus.tedlium2.viterbi_transducer_data import get_tedlium2_data +from i6_experiments.users.berger.pytorch.custom_parts.identity import IdentityConfig, IdentityModule +from i6_experiments.users.berger.pytorch.models import 
conformer_transducer_v2 as model +from i6_experiments.users.berger.recipe.summary.report import SummaryReport +from i6_experiments.users.berger.systems.dataclasses import ( + AlignmentData, + EncDecConfig, + FeatureType, + ReturnnConfigs, + SummaryKey, +) +from i6_experiments.users.berger.systems.returnn_seq2seq_system import ReturnnSeq2SeqSystem +from i6_experiments.users.berger.util import default_tools_v2 +from i6_models.config import ModuleFactoryV1 +from sisyphus import gs, tk + +from .config_01_conformer_ctc import py as py_ctc + +# ********** Settings ********** + +rasr.flow.FlowNetwork.default_flags = {"cache_mode": "task_dependent"} + +num_outputs = 79 +num_subepochs = 500 + +tools = copy.deepcopy(default_tools_v2) +tools.rasr_binary_path = tk.Path("/u/berger/repositories/rasr_versions/gen_seq2seq_dev/arch/linux-x86_64-standard") + + +# ********** Return Config generators ********** + + +def returnn_config_generator( + train_data_config: dict, + dev_data_config: dict, + **kwargs, +) -> ReturnnConfig: + model_config = model.get_default_config_v2(num_outputs=num_outputs) + + extra_config = { + "train": train_data_config, + "dev": dev_data_config, + "max_seqs": 60, + "max_seq_length": {"audio_features": 560000}, + "torch_amp": {"dtype": "bfloat16"}, + } + serializer = model.get_align_restrict_train_serializer( + model_config, + max_distance_from_alignment=kwargs.get("max_shift", 0), + enc_loss_scales={5: 0.2, 11: 0.5}, + blank_idx=0, + ) + + return get_returnn_config( + num_epochs=num_subepochs, + num_inputs=1, + num_outputs=num_outputs, + target="classes", + extra_python=[serializer], + extern_data_config=True, + backend=Backend.PYTORCH, + grad_noise=0.0, + grad_clip=0.0, + optimizer=Optimizers.AdamW, + weight_decay=5e-06, + schedule=LearningRateSchedules.OCLR, + initial_lr=8e-05, + peak_lr=kwargs.get("peak_lr", 8e-04), + decayed_lr=1e-05, + final_lr=1e-07, + batch_size=10000 * 160, + accum_grad=3, + use_chunking=False, + extra_config=extra_config, + ) + + 
+def recog_returnn_configs_generator( + ilm_scale: float = 0.0, + **kwargs, +) -> EncDecConfig[ReturnnConfig]: + model_config = model.get_default_config_v2(num_outputs=num_outputs) + model_config.transcriber_cfg.feature_extraction = ModuleFactoryV1( + IdentityModule, + IdentityConfig(), + ) + if ilm_scale != 0: + model_config = model.FFNNTransducerWithIlmConfig( + transcriber_cfg=model_config.transcriber_cfg, + predictor_cfg=model_config.predictor_cfg, + joiner_cfg=model_config.joiner_cfg, + ilm_scale=ilm_scale, + ) + + enc_extra_config = { + "extern_data": { + "sources": {"dim": 80, "dtype": "float32"}, + }, + "model_outputs": { + "source_encodings": { + "dim": 768, + "dtype": "float32", + }, + }, + } + dec_extra_config = { + "extern_data": { + "source_encodings": { + "dim": 768, + "time_dim_axis": None, + "dtype": "float32", + }, + "history": { + "dim": num_outputs, + "time_dim_axis": None, + "sparse": True, + "shape": (1,), + "dtype": "int32", + }, + }, + "model_outputs": { + "log_probs": { + "dim": num_outputs, + "time_dim_axis": None, + "dtype": "float32", + } + }, + } + enc_serializer = model.get_encoder_recog_serializer(model_config, **kwargs) + dec_serializer = model.get_decoder_recog_serializer(model_config, **kwargs) + + return EncDecConfig( + encoder_config=get_returnn_config( + num_inputs=80, + num_outputs=num_outputs, + target=None, + extra_python=[enc_serializer], + extern_data_config=False, + backend=Backend.PYTORCH, + extra_config=enc_extra_config, + ), + decoder_config=get_returnn_config( + num_inputs=1, + num_outputs=num_outputs, + target=None, + extra_python=[dec_serializer], + extern_data_config=False, + backend=Backend.PYTORCH, + extra_config=dec_extra_config, + ), + ) + + +def get_returnn_config_collection( + train_data_config: dict, + dev_data_config: dict, + ilm_scales: List[float] = [0.0, 0.2], + **kwargs, +) -> ReturnnConfigs[ReturnnConfig]: + return ReturnnConfigs( + train_config=returnn_config_generator( + 
train_data_config=train_data_config, + dev_data_config=dev_data_config, + **kwargs, + ), + recog_configs={ + f"recog_ilm-{ilm_scale}": recog_returnn_configs_generator(ilm_scale=ilm_scale, **kwargs) + for ilm_scale in ilm_scales + }, + ) + + +def run_exp(alignments: Dict[str, AlignmentData]) -> SummaryReport: + assert tools.returnn_root + assert tools.returnn_python_exe + assert tools.rasr_binary_path + data = get_tedlium2_data( + alignments=alignments, + returnn_root=tools.returnn_root, + returnn_python_exe=tools.returnn_python_exe, + rasr_binary_path=tools.rasr_binary_path, + augmented_lexicon=True, + feature_type=FeatureType.SAMPLES, + ) + + for data_input in data.data_inputs.values(): + data_input.create_lm_images(tools.rasr_binary_path) + + # ********** Step args ********** + + train_args = exp_args.get_transducer_train_step_args(num_epochs=num_subepochs, gpu_mem_rqmt=24) + recog_args = exp_args.get_transducer_recog_step_args( + num_classes=num_outputs, + epochs=[20, 40, 80, 160, 320, num_subepochs], + lm_scales=[0.7], + label_scorer_type="onnx-ffnn-transducer", + label_scorer_args={"extra_args": {"start_label_index": 0}}, + search_parameters={"blank-label-penalty": 1.0}, + reduction_factor=4, + reduction_subtrahend=3, + feature_type=FeatureType.LOGMEL_16K, + seq2seq_v2=True, + ) + + # ********** System ********** + + system = ReturnnSeq2SeqSystem( + tool_paths=tools, + summary_keys=[ + SummaryKey.TRAIN_NAME, + SummaryKey.RECOG_NAME, + SummaryKey.CORPUS, + SummaryKey.EPOCH, + SummaryKey.LM, + SummaryKey.WER, + SummaryKey.SUB, + SummaryKey.INS, + SummaryKey.DEL, + SummaryKey.ERR, + ], + summary_sort_keys=[SummaryKey.ERR, SummaryKey.CORPUS], + ) + + system.init_corpora( + dev_keys=data.dev_keys, + test_keys=data.test_keys, + corpus_data=data.data_inputs, + am_args=exp_args.transducer_recog_am_args, + ) + system.setup_scoring() + + # ********** Returnn Configs ********** + + for max_shift in [0, 1, 5, 10, 10000]: + system.add_experiment_configs( + 
f"Conformer_Transducer_Viterbi_shift-{max_shift}", + get_returnn_config_collection( + data.train_data_config, + data.cv_data_config, + max_shift=max_shift, + ilm_scales=[0.2], + ), + ) + + system.run_train_step(**train_args) + system.run_dev_recog_step(**recog_args) + + assert system.summary_report + return system.summary_report + + +def py() -> SummaryReport: + _, alignments = py_ctc() + filename_handle = os.path.splitext(os.path.basename(__file__))[0][len("config_") :] + gs.ALIAS_AND_OUTPUT_SUBDIR = f"{filename_handle}/" + + summary_report = SummaryReport() + + report = run_exp(alignments) + + summary_report.merge_report(report, update_structure=True) + + tk.register_report(f"{gs.ALIAS_AND_OUTPUT_SUBDIR}/summary.report", summary_report) + + return summary_report diff --git a/users/berger/corpus/general/hdf.py b/users/berger/corpus/general/hdf.py index 41df1e7e8..08754d422 100644 --- a/users/berger/corpus/general/hdf.py +++ b/users/berger/corpus/general/hdf.py @@ -1,17 +1,18 @@ -from typing import Optional, List -from i6_core.corpus import SegmentCorpusJob +from typing import List, Optional +from i6_core.corpus import SegmentCorpusJob from i6_core.returnn.hdf import BlissToPcmHDFJob -from i6_experiments.users.berger.recipe.returnn.hdf import BlissCorpusToTargetHdfJob -from i6_experiments.users.berger.args.returnn.dataset import MetaDatasetBuilder, hdf_config_dict_for_files -from i6_experiments.users.berger.systems.dataclasses import AlignmentData, FeatureType from sisyphus import tk + from i6_experiments.users.berger.args.jobs.rasr_init_args import ( - get_feature_extraction_args_16kHz, get_feature_extraction_args_8kHz, + get_feature_extraction_args_16kHz, ) -from i6_experiments.users.berger.helpers import build_rasr_feature_hdfs, RasrDataInput, SeparatedCorpusObject +from i6_experiments.users.berger.args.returnn.dataset import MetaDatasetBuilder, hdf_config_dict_for_files +from i6_experiments.users.berger.helpers import RasrDataInput, SeparatedCorpusObject, 
build_rasr_feature_hdfs from i6_experiments.users.berger.recipe.corpus.transform import ReplaceUnknownWordsJob +from i6_experiments.users.berger.recipe.returnn.hdf import BlissCorpusToTargetHdfJob +from i6_experiments.users.berger.systems.dataclasses import AlignmentData, FeatureType def build_feature_hdf_dataset_config( @@ -52,7 +53,20 @@ def build_feature_hdf_dataset_config( rasr_arch=rasr_arch, single_hdf=single_hdf, ) - + elif feature_type == FeatureType.LOGMEL_16K: + logmel_args = get_feature_extraction_args_16kHz(dc_detection=dc_detection)["filterbank"] + for data_input in data_inputs: + feature_hdfs += build_rasr_feature_hdfs( + data_input.corpus_object, + split=data_input.concurrent, + feature_type="fb", + feature_extraction_args=logmel_args, + returnn_python_exe=returnn_python_exe, + returnn_root=returnn_root, + rasr_binary_path=rasr_binary_path, + rasr_arch=rasr_arch, + single_hdf=single_hdf, + ) elif feature_type == FeatureType.SAMPLES: for data_input in data_inputs: if single_hdf: @@ -80,6 +94,10 @@ def build_feature_hdf_dataset_config( return hdf_config_dict_for_files(files=feature_hdfs, extra_config=extra_config) +def subsample_by_4(x): + return -(-x // 4) + + def build_feature_alignment_meta_dataset_config( data_inputs: List[RasrDataInput], feature_type: FeatureType, diff --git a/users/berger/corpus/tedlium2/phon_transducer_data.py b/users/berger/corpus/tedlium2/phon_transducer_data.py index f914d1b7b..6589e0a0b 100644 --- a/users/berger/corpus/tedlium2/phon_transducer_data.py +++ b/users/berger/corpus/tedlium2/phon_transducer_data.py @@ -7,7 +7,7 @@ from i6_experiments.users.berger.systems.dataclasses import FeatureType from i6_experiments.users.berger.corpus.general.experiment_data import BasicSetupData -from ..general import build_feature_hdf_dataset_config, build_feature_label_meta_dataset_config +from ..general import build_feature_label_meta_dataset_config from . 
import data @@ -81,25 +81,6 @@ def get_tedlium2_data_dumped_labels( }, ) - # ********** forward data ********** - - forward_data_config = { - key: build_feature_hdf_dataset_config( - data_inputs=[data_input], - feature_type=feature_type, - returnn_root=returnn_root, - returnn_python_exe=returnn_python_exe, - rasr_binary_path=rasr_binary_path, - rasr_arch=rasr_arch, - single_hdf=True, - extra_config={ - "partition_epoch": 1, - "seq_ordering": "sorted", - }, - ) - for key, data_input in {**dev_data_inputs, **test_data_inputs}.items() - } - # ********** Recog lexicon ********** for rasr_input in {**dev_data_inputs, **test_data_inputs}.values(): diff --git a/users/berger/corpus/tedlium2/viterbi_transducer_data.py b/users/berger/corpus/tedlium2/viterbi_transducer_data.py new file mode 100644 index 000000000..a7a4784dc --- /dev/null +++ b/users/berger/corpus/tedlium2/viterbi_transducer_data.py @@ -0,0 +1,113 @@ +import copy +from typing import Dict, List, Optional + +from i6_core.lexicon.modification import AddEowPhonemesToLexiconJob +from sisyphus import tk + +from i6_experiments.users.berger.systems.dataclasses import AlignmentData, FeatureType +from i6_experiments.users.berger.corpus.general.experiment_data import BasicSetupData +from i6_experiments.users.berger.corpus.general.hdf import build_feature_alignment_meta_dataset_config + +from . 
import data + + +def get_tedlium2_data( + alignments: Dict[str, AlignmentData], + returnn_root: tk.Path, + returnn_python_exe: tk.Path, + rasr_binary_path: tk.Path, + rasr_arch: str = "linux-x86_64-standard", + train_key: str = "train", + cv_keys: Optional[List[str]] = None, + dev_keys: Optional[List[str]] = None, + test_keys: Optional[List[str]] = None, + add_unknown: bool = False, + augmented_lexicon: bool = True, + feature_type: FeatureType = FeatureType.GAMMATONE_16K, + dc_detection: bool = False, + **kwargs, +) -> BasicSetupData: + if cv_keys is None: + cv_keys = ["dev"] + if dev_keys is None: + dev_keys = ["dev"] + if test_keys is None: + test_keys = ["test"] + + # ********** Data inputs ********** + train_data_inputs, cv_data_inputs, dev_data_inputs, test_data_inputs = copy.deepcopy( + data.get_data_inputs( + ctc_lexicon=True, + use_augmented_lexicon=augmented_lexicon, + add_all_allophones=True, + add_unknown_phoneme_and_mapping=add_unknown, + filter_unk_from_corpus=True, + **kwargs, + ) + ) + + # ********** Train data ********** + + train_lexicon = train_data_inputs[train_key].lexicon.filename + eow_lexicon = AddEowPhonemesToLexiconJob(train_lexicon).out_lexicon + + train_data_config = build_feature_alignment_meta_dataset_config( + data_inputs=[train_data_inputs[train_key]], + feature_type=feature_type, + alignments=[alignments[f"{train_key}_align"]], + returnn_root=returnn_root, + returnn_python_exe=returnn_python_exe, + rasr_binary_path=rasr_binary_path, + rasr_arch=rasr_arch, + dc_detection=dc_detection, + extra_config={ + "partition_epoch": 5, + "seq_ordering": "laplace:.1000", + }, + ) + + # ********** CV data ********** + + cv_data_config = build_feature_alignment_meta_dataset_config( + data_inputs=[cv_data_inputs[cv_key] for cv_key in cv_keys], + feature_type=feature_type, + alignments=[alignments[f"{cv_key}_align"] for cv_key in cv_keys], + returnn_root=returnn_root, + returnn_python_exe=returnn_python_exe, + rasr_binary_path=rasr_binary_path, + 
rasr_arch=rasr_arch, + dc_detection=dc_detection, + single_hdf=True, + extra_config={ + "partition_epoch": 1, + "seq_ordering": "sorted", + }, + ) + + # ********** Recog lexicon ********** + + for rasr_input in {**dev_data_inputs, **test_data_inputs}.values(): + rasr_input.lexicon.filename = eow_lexicon + + # ********** Align data ********** + + align_data_inputs = { + f"{key}_align": copy.deepcopy(data_input) for key, data_input in {**train_data_inputs, **cv_data_inputs}.items() + } + for data_input in align_data_inputs.values(): + data_input.lexicon.filename = eow_lexicon + + return BasicSetupData( + train_key=train_key, + dev_keys=list(dev_data_inputs.keys()), + test_keys=list(test_data_inputs.keys()), + align_keys=[f"{train_key}_align", *[f"{key}_align" for key in cv_keys]], + train_data_config=train_data_config, + cv_data_config=cv_data_config, + data_inputs={ + **train_data_inputs, + **dev_data_inputs, + **test_data_inputs, + **align_data_inputs, + }, + ) diff --git a/users/berger/helpers/hdf.py b/users/berger/helpers/hdf.py index 523768d50..038a174d3 100644 --- a/users/berger/helpers/hdf.py +++ b/users/berger/helpers/hdf.py @@ -56,9 +56,12 @@ def build_rasr_feature_hdfs( rasr.crp_set_corpus(base_crp, corpus) base_crp.concurrent = split - feature_job = {"mfcc": features.MfccJob, "gt": features.GammatoneJob, "energy": features.EnergyJob}[feature_type]( - crp=base_crp, **feature_extraction_args - ) + feature_job = { + "mfcc": features.MfccJob, + "gt": features.GammatoneJob, + "fb": features.FilterbankJob, + "energy": features.EnergyJob, + }[feature_type](crp=base_crp, **feature_extraction_args) feature_job.set_keep_value(gs.JOB_DEFAULT_KEEP_VALUE - 20) hdf_files = [] diff --git a/users/berger/pytorch/custom_parts/vgg_frontend.py b/users/berger/pytorch/custom_parts/vgg_frontend.py index d42e8ac9a..c8412c787 100644 --- a/users/berger/pytorch/custom_parts/vgg_frontend.py +++ b/users/berger/pytorch/custom_parts/vgg_frontend.py @@ -1,105 +1,294 @@ from dataclasses 
import dataclass -from typing import Optional, Tuple +from typing import Callable, Optional, Tuple, Union import torch from i6_models.config import ModelConfiguration +from i6_models.parts.frontend.common import get_same_padding, calculate_output_dim + + +def mask_pool(seq_mask: torch.Tensor, *, kernel_size: int, stride: int, padding: int, ceil: bool) -> torch.Tensor: + """ + Apply the striding of a pooling/conv layer to the sequence mask. + + :param seq_mask: [B,T] + :param kernel_size: window size of the max-pooling applied along T + :param stride: stride of the max-pooling applied along T + :param padding: padding passed to the max-pooling along T + :return: [B,T'] using maxpool, with ceil_mode set from ``ceil`` + """ + if stride == 1 and 2 * padding == kernel_size - 1: + return seq_mask + + seq_mask = seq_mask.float() + seq_mask = torch.unsqueeze(seq_mask, 1) # [B,1,T] + seq_mask = torch.nn.functional.max_pool1d(seq_mask, kernel_size, stride, padding, ceil_mode=ceil) # [B,1,T'] + seq_mask = torch.squeeze(seq_mask, 1) # [B,T'] + seq_mask = seq_mask.bool() + return seq_mask @dataclass -class VGGFrontendConfigV1(ModelConfiguration): - num_inputs: int +class VGG4LayerActFrontendCeilPoolV1Config(ModelConfiguration): + """ + Attributes: + in_features: number of input features to module + conv1_channels: number of channels for first conv layer + conv2_channels: number of channels for second conv layer + conv3_channels: number of channels for third conv layer + conv4_channels: number of channels for fourth conv layer + conv_kernel_size: kernel size of conv layers + conv_padding: padding for the convolution + pool1_kernel_size: kernel size of first pooling layer + pool1_stride: stride of first pooling layer + pool1_padding: padding for first pooling layer + pool2_kernel_size: kernel size of second pooling layer + pool2_stride: stride of second pooling layer + pool2_padding: padding for second pooling layer + activation: activation function at the end + out_features: output size of the final linear layer + """ + + in_features: int conv1_channels: int conv2_channels: int conv3_channels: int - conv_kernel_size: int - conv1_stride: int - conv2_stride: int - conv3_stride: 
int - pool_size: int - linear_size: int - dropout: float + conv4_channels: int + conv_kernel_size: Tuple[int, int] + conv_padding: Optional[Tuple[int, int]] + pool1_kernel_size: Tuple[int, int] + pool1_stride: Optional[Tuple[int, int]] + pool1_padding: Optional[Tuple[int, int]] + pool2_kernel_size: Tuple[int, int] + pool2_stride: Optional[Tuple[int, int]] + pool2_padding: Optional[Tuple[int, int]] + activation: Union[torch.nn.Module, Callable[[torch.Tensor], torch.Tensor]] + out_features: int + + def check_valid(self): + if isinstance(self.conv_kernel_size, int): + assert self.conv_kernel_size % 2 == 1, "ConformerVGGFrontendV1 only supports odd kernel sizes" + if isinstance(self.pool1_kernel_size, int): + assert self.pool1_kernel_size % 2 == 1, "ConformerVGGFrontendV1 only supports odd kernel sizes" + if isinstance(self.pool2_kernel_size, int): + assert self.pool2_kernel_size % 2 == 1, "ConformerVGGFrontendV1 only supports odd kernel sizes" def __post__init__(self): super().__post_init__() - assert self.conv_kernel_size % 2 == 1, "Conv kernel size must be odd." + self.check_valid() -def subsample_mask(sequence_mask: torch.Tensor, subsampling_factor: int): - if subsampling_factor == 1: - return sequence_mask +class VGG4LayerActFrontendCeilPoolV1(torch.nn.Module): + """ + Convolutional Front-End - max_len = sequence_mask.shape[1] + The frond-end utilizes convolutional and pooling layers, as well as activation functions + to transform a feature vector, typically Log-Mel or Gammatone for audio, into an intermediate + representation. 
- padding = 0 - if (overhang := max_len % subsampling_factor) != 0: - padding = subsampling_factor - overhang + Structure of the front-end: + - Conv + - Conv + - Activation + - Pool + - Conv + - Conv + - Activation + - Pool - padded_mask = torch.nn.functional.pad(sequence_mask, pad=(0, padding), value=0) + Uses explicit padding for ONNX exportability, see: + https://github.com/pytorch/pytorch/issues/68880 + """ - reshaped_mask = padded_mask.reshape(padded_mask.shape[0], -1, subsampling_factor) + def __init__(self, model_cfg: VGG4LayerActFrontendCeilPoolV1Config): + """ + :param model_cfg: model configuration for this module + """ + super().__init__() - subsampled_mask = torch.all(reshaped_mask == 1, dim=2) - subsampled_mask = subsampled_mask.type(sequence_mask.dtype) + model_cfg.check_valid() - return subsampled_mask + self.cfg = model_cfg + conv_padding = ( + model_cfg.conv_padding + if model_cfg.conv_padding is not None + else get_same_padding(model_cfg.conv_kernel_size) + ) + pool1_padding = model_cfg.pool1_padding if model_cfg.pool1_padding is not None else (0, 0) + pool2_padding = model_cfg.pool2_padding if model_cfg.pool2_padding is not None else (0, 0) -class VGGFrontendV1(torch.nn.Module): - def __init__(self, config: VGGFrontendConfigV1): - super().__init__() self.conv1 = torch.nn.Conv2d( in_channels=1, - out_channels=config.conv1_channels, - kernel_size=(config.conv_kernel_size, 1), - padding=(config.conv_kernel_size // 2, 0), - stride=(config.conv1_stride, 1), - ) - self.pool = torch.nn.MaxPool2d( - kernel_size=(1, config.pool_size), - stride=(1, config.pool_size), + out_channels=model_cfg.conv1_channels, + kernel_size=model_cfg.conv_kernel_size, + padding=conv_padding, ) self.conv2 = torch.nn.Conv2d( - in_channels=config.conv1_channels, - out_channels=config.conv2_channels, - kernel_size=config.conv_kernel_size, - padding=config.conv_kernel_size // 2, - stride=(config.conv2_stride, 1), + in_channels=model_cfg.conv1_channels, + 
out_channels=model_cfg.conv2_channels, + kernel_size=model_cfg.conv_kernel_size, + padding=conv_padding, + ) + self.pool1 = torch.nn.MaxPool2d( + kernel_size=model_cfg.pool1_kernel_size, + stride=model_cfg.pool1_stride, + padding=pool1_padding, + ceil_mode=True, ) self.conv3 = torch.nn.Conv2d( - in_channels=config.conv2_channels, - out_channels=config.conv3_channels, - kernel_size=config.conv_kernel_size, - padding=config.conv_kernel_size // 2, - stride=(config.conv3_stride, 1), + in_channels=model_cfg.conv2_channels, + out_channels=model_cfg.conv3_channels, + kernel_size=model_cfg.conv_kernel_size, + padding=conv_padding, + ) + self.conv4 = torch.nn.Conv2d( + in_channels=model_cfg.conv3_channels, + out_channels=model_cfg.conv4_channels, + kernel_size=model_cfg.conv_kernel_size, + padding=conv_padding, ) + self.pool2 = torch.nn.MaxPool2d( + kernel_size=model_cfg.pool2_kernel_size, + stride=model_cfg.pool2_stride, + padding=pool2_padding, + ceil_mode=True, + ) + self.activation = model_cfg.activation self.linear = torch.nn.Linear( - config.conv3_channels * (config.num_inputs // config.pool_size), config.linear_size + in_features=self._calculate_dim(), + out_features=model_cfg.out_features, + bias=True, ) - self.subsample_factor = config.conv1_stride * config.conv2_stride * config.conv3_stride - self.dropout = config.dropout - self.layer_norm = torch.nn.LayerNorm(config.linear_size) def forward( - self, x: torch.Tensor, sequence_mask: Optional[torch.Tensor] + self, tensor: torch.Tensor, sequence_mask: Optional[torch.Tensor] = None ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: - x = x[:, None, :, :] # [B, 1, T, F] - x = self.conv1(x) # [B, C_1, T', F] - x = torch.nn.functional.silu(x) # [B, C_1, T', F] - x = self.pool(x) # [B, C_1, T', F'] - x = self.conv2(x) # [B, C_2, T'', F'] - x = torch.nn.functional.silu(x) # [B, C_2, T'', F'] - x = self.conv3(x) # [B, C_3, T''', F'] - x = torch.nn.functional.silu(x) # [B, C_3, T''', F'] - x = torch.transpose(x, 1, 2) # [B, 
T''', C_3, F'] - x = torch.flatten(x, start_dim=2) # [B, T''', C_3 * F'] - x = self.linear(x) # [B, T''', F''] - x = torch.nn.functional.dropout(x, p=self.dropout, training=self.training) # [B, T''', F''] - x = self.layer_norm(x) # [B, T''', F''] - - if sequence_mask is None: - subsampled_mask = None - else: - subsampled_mask = subsample_mask(sequence_mask, self.subsample_factor) - - return x, subsampled_mask + """ + T might be reduced to T' or T'' depending on stride of the layers + + stride is only allowed for the pool1 and pool2 operation. + other ops do not have stride configurable -> no update of mask sequence required but added anyway + + :param tensor: input tensor of shape [B,T,F] + :param sequence_mask: the sequence mask for the tensor + :return: torch.Tensor of shape [B,T",F'] and the shape of the sequence mask + """ + assert tensor.shape[-1] == self.cfg.in_features, f"shape {tensor.shape} vs in features {self.cfg.in_features}" + # and add a dim + tensor = tensor[:, None, :, :] # [B,C=1,T,F] + + tensor = self.conv1(tensor) + if sequence_mask is not None: + sequence_mask = mask_pool( + seq_mask=sequence_mask, + kernel_size=self.conv1.kernel_size[0], + stride=self.conv1.stride[0], + padding=self.conv1.padding[0], + ceil=False, + ) + + tensor = self.conv2(tensor) + if sequence_mask is not None: + sequence_mask = mask_pool( + sequence_mask, + kernel_size=self.conv2.kernel_size[0], + stride=self.conv2.stride[0], + padding=self.conv2.padding[0], + ceil=False, + ) + + tensor = self.activation(tensor) + tensor = self.pool1(tensor) # [B,C,T',F'] + if sequence_mask is not None: + sequence_mask = mask_pool( + sequence_mask, + kernel_size=self.pool1.kernel_size[0], + stride=self.pool1.stride[0], + padding=self.pool1.padding[0], + ceil=True, + ) + + tensor = self.conv3(tensor) + if sequence_mask is not None: + sequence_mask = mask_pool( + sequence_mask, + kernel_size=self.conv3.kernel_size[0], + stride=self.conv3.stride[0], + padding=self.conv3.padding[0], + 
ceil=False, + ) + + tensor = self.conv4(tensor) + if sequence_mask is not None: + sequence_mask = mask_pool( + sequence_mask, + kernel_size=self.conv4.kernel_size[0], + stride=self.conv4.stride[0], + padding=self.conv4.padding[0], + ceil=False, + ) + + tensor = self.activation(tensor) + tensor = self.pool2(tensor) # [B,C,T",F"] + if sequence_mask is not None: + sequence_mask = mask_pool( + sequence_mask, + kernel_size=self.pool2.kernel_size[0], + stride=self.pool2.stride[0], + padding=self.pool2.padding[0], + ceil=True, + ) + + tensor = torch.transpose(tensor, 1, 2) # transpose to [B,T",C,F"] + tensor = torch.flatten(tensor, start_dim=2, end_dim=-1) # [B,T",C*F"] + + tensor = self.linear(tensor) + + return tensor, sequence_mask + + def _calculate_dim(self) -> int: + # conv1 + out_dim = calculate_output_dim( + in_dim=self.cfg.in_features, + filter_size=self.conv1.kernel_size[1], + stride=self.conv1.stride[1], + padding=self.conv1.padding[1], + ) + # conv2 + out_dim = calculate_output_dim( + in_dim=out_dim, + filter_size=self.conv2.kernel_size[1], + stride=self.conv2.stride[1], + padding=self.conv2.padding[1], + ) + # pool1 + out_dim = calculate_output_dim( + in_dim=out_dim, + filter_size=self.pool1.kernel_size[1], + stride=self.pool1.stride[1], + padding=self.pool1.padding[1], + ) + # conv3 + out_dim = calculate_output_dim( + in_dim=out_dim, + filter_size=self.conv3.kernel_size[1], + stride=self.conv3.stride[1], + padding=self.conv3.padding[1], + ) + # conv4 + out_dim = calculate_output_dim( + in_dim=out_dim, + filter_size=self.conv4.kernel_size[1], + stride=self.conv4.stride[1], + padding=self.conv4.padding[1], + ) + # pool2 + out_dim = calculate_output_dim( + in_dim=out_dim, + filter_size=self.pool2.kernel_size[1], + stride=self.pool2.stride[1], + padding=self.pool2.padding[1], + ) + out_dim *= self.conv4.out_channels + return out_dim diff --git a/users/berger/pytorch/forward/transducer.py b/users/berger/pytorch/forward/transducer.py index b9b1caaf4..e6b4fcbbc 
100644 --- a/users/berger/pytorch/forward/transducer.py +++ b/users/berger/pytorch/forward/transducer.py @@ -1,16 +1,15 @@ -import torch import numpy as np -from torchaudio.models.rnnt import RNNT -from returnn.frontend import Tensor -from returnn.tensor.tensor_dict import TensorDict -from i6_experiments.users.berger.pytorch.forward.transducer_beam_search import monotonic_timesync_beam_search -from sisyphus import tk +import torch from i6_core.lib.lexicon import Lexicon -from i6_experiments.users.berger.pytorch.helper_functions import map_tensor_to_minus1_plus1_interval +from i6_experiments.users.berger.pytorch.forward.transducer_beam_search import monotonic_timesync_beam_search from i6_experiments.users.berger.pytorch.models.conformer_transducer_v2 import ( FFNNTransducerDecoderOnly, FFNNTransducerEncoderOnly, ) +from returnn.frontend import Tensor +from returnn.tensor.tensor_dict import TensorDict +from sisyphus import tk +from torchaudio.models.rnnt import RNNT def encoder_forward_step(*, model: FFNNTransducerEncoderOnly, extern_data: TensorDict, **_): @@ -22,7 +21,7 @@ def encoder_forward_step(*, model: FFNNTransducerEncoderOnly, extern_data: Tenso device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - source_encodings, source_lengths = model( + source_encodings, source_lengths = model.forward( sources=sources.to(device), source_lengths=source_lengths.to(device), ) # [B, T, E], [B] @@ -39,14 +38,14 @@ def decoder_forward_step(*, model: FFNNTransducerDecoderOnly, extern_data: Tenso source_encodings = extern_data["source_encodings"].raw_tensor assert source_encodings is not None - targets = extern_data["targets"].raw_tensor - assert targets is not None + history = extern_data["history"].raw_tensor + assert history is not None device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - log_probs = model( + log_probs = model.forward( source_encodings=source_encodings.to(device), - targets=targets.to(device), + 
history=history.to(device), ) # [B, C] import returnn.frontend as rf diff --git a/users/berger/pytorch/forward/transducer_beam_search.py b/users/berger/pytorch/forward/transducer_beam_search.py index 0b8de6cb2..dd275cb0d 100644 --- a/users/berger/pytorch/forward/transducer_beam_search.py +++ b/users/berger/pytorch/forward/transducer_beam_search.py @@ -2,7 +2,6 @@ from dataclasses import dataclass import torch -from torchaudio.models.rnnt import RNNT @dataclass @@ -31,7 +30,7 @@ def extended_hypothesis( def monotonic_timesync_beam_search( - *, model: RNNT, features: torch.Tensor, feature_lengths: torch.Tensor, blank_id: int, beam_size: int = 10 + *, model: torch.nn.Module, features: torch.Tensor, feature_lengths: torch.Tensor, blank_id: int, beam_size: int = 10 ) -> Tuple[List[int], float]: # Some dimension checks if features.dim() == 2: # [T, F] @@ -49,8 +48,8 @@ def predict_next( token: int, history_state: Optional[List[List[torch.Tensor]]] ) -> Tuple[torch.Tensor, List[List[torch.Tensor]]]: new_pred_state, _, new_pred_history_state = model.predict( # [1, P] - targets=torch.tensor([[token]], device=enc.device), - target_lengths=torch.tensor([1], device=enc.device), + labels=torch.tensor([[token]], device=enc.device), + label_lengths=torch.tensor([1], device=enc.device), state=history_state, ) diff --git a/users/berger/pytorch/models/conformer_transducer_v2.py b/users/berger/pytorch/models/conformer_transducer_v2.py index 9df3af6c5..dc87638b7 100644 --- a/users/berger/pytorch/models/conformer_transducer_v2.py +++ b/users/berger/pytorch/models/conformer_transducer_v2.py @@ -1,18 +1,19 @@ -from dataclasses import dataclass +from dataclasses import dataclass, field from enum import Enum, auto -from typing import Callable, List, Optional, Tuple, Union +from typing import Callable, Dict, List, Optional, Tuple, Union import i6_models.assemblies.conformer as conformer_i6 import i6_models.parts.conformer as conformer_parts_i6 import torch from i6_core.returnn.config import 
CodeWrapper +from i6_core.tools.git import CloneGitRepositoryJob from i6_experiments.common.setups.returnn_pytorch.serialization import Collection -from i6_experiments.common.setups.serialization import Import, PartialImport +from i6_experiments.common.setups.serialization import Import, PartialImport, ExternalImport from i6_experiments.users.berger.pytorch.serializers.basic import ( get_basic_pt_network_serializer, ) from i6_models.config import ModelConfiguration, ModuleFactoryV1 -from i6_models.parts.frontend.generic_frontend import FrontendLayerType, GenericFrontendV1, GenericFrontendV1Config +from i6_models.parts.frontend.vgg_act import VGG4LayerActFrontendV1, VGG4LayerActFrontendV1Config from i6_models.primitives.feature_extraction import ( RasrCompatibleLogMelFeatureExtractionV1, RasrCompatibleLogMelFeatureExtractionV1Config, @@ -22,6 +23,7 @@ SpecaugmentByLengthConfigV1, SpecaugmentByLengthModuleV1, ) +from ..custom_parts.vgg_frontend import VGG4LayerActFrontendCeilPoolV1, VGG4LayerActFrontendCeilPoolV1Config from .util import lengths_to_padding_mask @@ -30,6 +32,9 @@ class TransducerTranscriberConfig(ModelConfiguration): feature_extraction: ModuleFactoryV1 specaugment: ModuleFactoryV1 encoder: ModuleFactoryV1 + layer_size: int + target_size: int + enc_loss_layers: List[int] = field(default_factory=list) class TransducerTranscriber(torch.nn.Module): @@ -39,11 +44,19 @@ def __init__(self, cfg: TransducerTranscriberConfig, **_) -> None: self.specaugment = cfg.specaugment() self.encoder = cfg.encoder() + self.enc_loss_layers = cfg.enc_loss_layers + self.output_layers = torch.nn.ModuleDict( + { + f"output_{layer_idx}": torch.nn.Linear(cfg.layer_size, cfg.target_size) + for layer_idx in cfg.enc_loss_layers + } + ) + def forward( self, sources: torch.Tensor, # [B, T, F] source_lengths: torch.Tensor, # [B] - ) -> Tuple[torch.Tensor, torch.Tensor]: # [B, T, C], [B] + ) -> Tuple[torch.Tensor, Dict[int, torch.Tensor], torch.Tensor]: # [B, T, C], Dict(l: [B, T, C]), 
[B] with torch.no_grad(): sources = sources.squeeze(-1) x, source_lengths = self.feature_extraction(sources, source_lengths) @@ -51,30 +64,46 @@ def forward( x = self.specaugment(x) # [B, T, F] - x, sequence_mask = self.encoder(x, sequence_mask) # [B, T, E], [B, T] - - return x, torch.sum(sequence_mask, dim=1).to(torch.int32) # [B, T, C], [B] - - -class TransducerTranscriberNoFeatExtr(torch.nn.Module): - def __init__(self, cfg: TransducerTranscriberConfig, **_) -> None: - super().__init__() - self.specaugment = cfg.specaugment() - self.encoder = cfg.encoder() - - def forward( - self, - sources: torch.Tensor, # [B, T, F] - source_lengths: torch.Tensor, # [B] - ) -> Tuple[torch.Tensor, torch.Tensor]: # [B, T, C], [B] - with torch.no_grad(): - sequence_mask = lengths_to_padding_mask(source_lengths) - - x = self.specaugment(sources) # [B, T, F] - - x, sequence_mask = self.encoder(x, sequence_mask) # [B, T, E], [B, T] - - return x, torch.sum(sequence_mask, dim=1).to(torch.int32) # [B, T, C], [B] + return_layers = self.enc_loss_layers.copy() + return_layers.append(len(self.encoder.module_list) - 1) + + intermediate_encodings, sequence_mask = self.encoder( + x, sequence_mask=sequence_mask, return_layers=return_layers + ) # List([B, T, E]), [B, T] + + source_encodings = intermediate_encodings[-1] + + intermediate_logits = { + layer_idx: self.output_layers[f"output_{layer_idx}"](encoding) + for layer_idx, encoding in zip(self.enc_loss_layers, intermediate_encodings) + } # Dict(l: [B, T, C]) + + return ( + source_encodings, + intermediate_logits, + torch.sum(sequence_mask, dim=1).to(torch.int32), + ) # [B, T, E], Dict(l: [B, T, C]), [B] + + +# class TransducerTranscriberNoFeatExtr(torch.nn.Module): +# def __init__(self, cfg: TransducerTranscriberConfig, **_) -> None: +# super().__init__() +# self.specaugment = cfg.specaugment() +# self.encoder = cfg.encoder() +# +# def forward( +# self, +# sources: torch.Tensor, # [B, T, F] +# source_lengths: torch.Tensor, # [B] +# ) -> 
Tuple[torch.Tensor, torch.Tensor]: # [B, T, C], [B] +# with torch.no_grad(): +# sequence_mask = lengths_to_padding_mask(source_lengths) +# +# x = self.specaugment(sources) # [B, T, F] +# +# x, sequence_mask = self.encoder(x, sequence_mask) # [B, T, E], [B, T] +# +# return x, torch.sum(sequence_mask, dim=1).to(torch.int32) # [B, T, C], [B] @dataclass @@ -111,62 +140,72 @@ def forward( self, targets: torch.Tensor, # [B, S], target_lengths: torch.Tensor, # [B], - state: Optional[ - List[List[torch.Tensor]] - ] = None, # Most recently fed inputs, used for higher order context, shape [[[B, H-1]]]; list of lists for compatibility with torchaudio - ) -> Tuple[torch.Tensor, torch.Tensor, List[List[torch.Tensor]]]: # [B, S, C], [B], [[[B, H-1]]] - # print("") - # print("Enter predictor forward") - # print("Predictor received input", input.deeper()) - + history: Optional[torch.Tensor] = None, # [B, H], use all blanks if None + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: # [B, S + 1, C], [B], [B, H] # extend input by prepending either the state if it's given or some history consisting of blanks - if state is None: - prepend = torch.full( - (targets.size(0), self.context_history_size - 1), + if history is None: + history = torch.full( + (targets.size(0), self.context_history_size), fill_value=self.blank_id, dtype=targets.dtype, device=targets.device, - ) # [B, H-1] - # print("Predictor received no state. 
Use", prepend.deeper()) - else: - prepend = state[0][0] # [B, H-1] - # print("Predictor received state", prepend.deeper()) - extended_input = torch.concat([prepend, targets], dim=1) # [B, S+H-1] - # print("extended input", extended_input.deeper()) + ) # [B, H] - if self.context_history_size > 1: - return_state = extended_input[:, -(self.context_history_size - 1) :] # [B, H-1] - else: - return_state = torch.empty( - size=(targets.size(0), 0), dtype=targets.dtype, device=targets.device - ) # [B, 0] = [B, H-1] - # print("New state is ", return_state.deeper()) + extended_targets = torch.concat([history, targets], dim=1) # [B, S+H] + + return_history = extended_targets[:, -(self.context_history_size) :] # [B, H] # Build context at each position by shifting and cutting label sequence. - # E.g. for history size 2 and label sequence a_1, ..., a_S we have context - # a_2 a_3 a_4 ... a_S - # a_1 a_2 a_3 ... a_{S-1} + # E.g. for history size 2 and extended targets 0, 0, a_1, ..., a_S we have context + # 0, a_1, a_2 a_3 a_4 ... a_S + # 0, 0, a_1 a_2 a_3 ... 
a_{S-1} context = torch.stack( [ - extended_input[:, self.context_history_size - 1 - i : (-i if i != 0 else None)] # [B, S] + extended_targets[:, self.context_history_size - 1 - i : (-i if i != 0 else None)] # [B, S+1] for i in range(self.context_history_size) ], dim=-1, - ) # [B, S, H] - - # print("Predict based on context", context) - a = self.embedding(context) # [B, S, H, E] - a = torch.reshape(a, shape=[*(a.shape[:-2]), a.shape[-2] * a.shape[-1]]) # [B, S, H*E] - a = self.network(a) # [B, S, P] - # topk = torch.topk(torch.nn.functional.softmax(a, dim=-1), k=4) - # print("Result probabilities:") - # print(topk.indices.deeper()) - # print(topk.values.deeper()) - # print( - # "Repeat probabilities:", - # torch.gather(torch.nn.functional.softmax(a, dim=-1), dim=-1, index=context[:, :, -2:-1]).deeper(), - # ) - return a, target_lengths, [[return_state]] + ) # [B, S+1, H] + + a = self.embedding(context) # [B, S+1, H, E] + a = torch.reshape(a, shape=[*(a.shape[:-2]), a.shape[-2] * a.shape[-1]]) # [B, S+1, H*E] + a = self.network(a) # [B, S+1, P] + return a, target_lengths, return_history + + def forward_viterbi( + self, + targets: torch.Tensor, # [B, T], + target_lengths: torch.Tensor, # [B], + ) -> Tuple[torch.Tensor, torch.Tensor]: # [B, T, C], [B] + # Get alignment sequence including blanks as labels, e.g. 
+ # [0, 0, 1, 0, 2, 0, 0, 3, 0] + + B, T = targets.shape + + history = torch.zeros( + (B, T, self.context_history_size), dtype=targets.dtype, device=targets.device + ) # [B, T, H] + + # start with all-blank history + recent_labels = torch.full( + (B, self.context_history_size), fill_value=self.blank_id, dtype=targets.dtype, device=targets.device + ) # [B, H] + + for t in range(T): + # set context at frame t + history[:, t, :] = recent_labels + + current_labels = targets[:, t] # [B] + non_blank_positions = current_labels != self.blank_id # [B] + + # shift recent_labels and append next one if we see a non-blank + recent_labels[non_blank_positions, 1:] = recent_labels[non_blank_positions, :-1] + recent_labels[non_blank_positions, 0] = current_labels[non_blank_positions] + + a = self.embedding(history) # [B, T, H, E] + a = torch.reshape(a, shape=[*(a.shape[:-2]), a.shape[-2] * a.shape[-1]]) # [B, T, H*E] + a = self.network(a) # [B, T, P] + return a, target_lengths class CombinationMode(Enum): @@ -195,25 +234,44 @@ def __init__(self, cfg: TransducerJoinerConfig, **_) -> None: def forward( self, - source_encodings: torch.Tensor, # [B, T, C_1], + source_encodings: torch.Tensor, # [B, T, E], source_lengths: torch.Tensor, # [B], - target_encodings: torch.Tensor, # [B, S, C_2], + target_encodings: torch.Tensor, # [B, S+1, P], target_lengths: torch.Tensor, # [B] - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: # [B, T, S, F], [B], [B], + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: # [B, T, S+1, C], [B], [B], source_encodings = source_encodings.unsqueeze(2).expand( target_encodings.size(0), -1, target_encodings.size(1), -1 - ) # [B, T, S, C_1] - target_encodings = target_encodings.unsqueeze(1).expand(-1, source_encodings.size(1), -1, -1) # [B, T, S, C_2] + ) # [B, T, S+1, E] + target_encodings = target_encodings.unsqueeze(1).expand(-1, source_encodings.size(1), -1, -1) # [B, T, S+1, P] + + if self.combination_mode == CombinationMode.CONCAT: + joiner_inputs = 
torch.concat([source_encodings, target_encodings], dim=-1) # [B, T, S+1, E + P] + elif self.combination_mode == CombinationMode.SUM: + joiner_inputs = source_encodings + target_encodings # [B, T, S+1, E=P] + output = self.network(joiner_inputs) # [B, T, S+1, C] + + if not self.training: + output = torch.log_softmax(output, dim=-1) # [B, T, S+1, C] + + return output, source_lengths, target_lengths + + def forward_viterbi( + self, + source_encodings: torch.Tensor, # [B, T, E], + source_lengths: torch.Tensor, # [B], + target_encodings: torch.Tensor, # [B, T, P], + target_lengths: torch.Tensor, # [B] + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: # [B, T, C], [B], [B], if self.combination_mode == CombinationMode.CONCAT: - joiner_inputs = torch.concat([source_encodings, target_encodings], dim=-1) # [B, T, S, C_1 + C_2] + joiner_inputs = torch.concat([source_encodings, target_encodings], dim=-1) # [B, T, E + P] elif self.combination_mode == CombinationMode.SUM: - joiner_inputs = source_encodings + target_encodings # [B, T, S, C_1=C_2] + joiner_inputs = source_encodings + target_encodings # [B, T, E=P] - output = self.network(joiner_inputs) # [B, T, S, C] + output = self.network(joiner_inputs) # [B, T, C] if not self.training: - output = torch.log_softmax(output, dim=-1) + output = torch.log_softmax(output, dim=-1) # [B, T, C] return output, source_lengths, target_lengths @@ -230,34 +288,40 @@ def __init__(self, cfg: TransducerJoinerConfig, **_) -> None: def forward( self, - source_encodings: torch.Tensor, # [B, T, C_1], + source_encodings: torch.Tensor, # [B, T, E], source_lengths: torch.Tensor, # [B], - target_encodings: torch.Tensor, # [B, S, C_2], + target_encodings: torch.Tensor, # [B, S+1, P], target_lengths: torch.Tensor, # [B] - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: # [T_1 * S_1 + T_2 * S_2 + ... + T_B * S_B, F], [B], [B], + ) -> Tuple[ + torch.Tensor, torch.Tensor, torch.Tensor + ]: # [T_1 * (S_1+1) + T_2 * (S_2+1) + ... 
+ T_B * (S_B+1), C], [B], [B], batch_tensors = [] for b in range(source_encodings.size(0)): - valid_source = source_encodings[b, : source_lengths[b], :] # [T_b, C_1] - valid_target = target_encodings[b, : target_lengths[b], :] # [S_b, C_2] + valid_source = source_encodings[b, : source_lengths[b], :] # [T_b, E] + valid_target = target_encodings[b, : target_lengths[b] + 1, :] # [S_b+1, P] - expanded_source = valid_source.unsqueeze(1).expand(-1, int(target_lengths[b].item()), -1) # [T_b, S_b, C_1] - expanded_target = valid_target.unsqueeze(0).expand(int(source_lengths[b].item()), -1, -1) # [T_b, S_b, C_2] + expanded_source = valid_source.unsqueeze(1).expand( + -1, int(target_lengths[b].item()) + 1, -1 + ) # [T_b, S_b+1, E] + expanded_target = valid_target.unsqueeze(0).expand(int(source_lengths[b].item()), -1, -1) # [T_b, S_b+1, P] if self.combination_mode == CombinationMode.CONCAT: - combination = torch.concat([expanded_source, expanded_target], dim=-1) # [T_b, S_b, C_1 + C_2] + combination = torch.concat([expanded_source, expanded_target], dim=-1) # [T_b, S_b+1, E + P] elif self.combination_mode == CombinationMode.SUM: - combination = expanded_source + expanded_target # [T_b, S_b, C_1 (=C_2)] + combination = expanded_source + expanded_target # [T_b, S_b+1, E (=P)] else: raise NotImplementedError - batch_tensors.append(combination.reshape(-1, combination.size(2))) # [T_b * S_b, C'] + batch_tensors.append(combination.reshape(-1, combination.size(2))) # [T_b * (S_b+1), E(+P)] - joint_encodings = torch.concat(batch_tensors, dim=0) # [T_1 * S_1 + T_2 * S_2 + ... + T_B * S_B, C'] - output = self.network(joint_encodings) # [T_1 * S_1 + T_2 * S_2 + ... + T_B * S_B, F] + joint_encodings = torch.concat( + batch_tensors, dim=0 + ) # [T_1 * (S_1+1) + T_2 * (S_2+1) + ... + T_B * (S_B+1), E(+P)] + output = self.network(joint_encodings) # [T_1 * (S_1+1) + T_2 * (S_2+1) + ... 
+ T_B * (S_B+1), C] if not self.training: - output = torch.log_softmax(output, dim=-1) + output = torch.log_softmax(output, dim=-1) # [T_1 * (S_1+1) + T_2 * (S_2+1) + ... + T_B * (S_B+1), C] return output, source_lengths, target_lengths @@ -269,6 +333,11 @@ class FFNNTransducerConfig(ModelConfiguration): joiner_cfg: TransducerJoinerConfig +@dataclass +class FFNNTransducerWithIlmConfig(FFNNTransducerConfig): + ilm_scale: float + + class FFNNTransducer(torch.nn.Module): def __init__(self, step: int, cfg: FFNNTransducerConfig, **_): super().__init__() @@ -276,22 +345,31 @@ def __init__(self, step: int, cfg: FFNNTransducerConfig, **_): self.predictor = FFNNTransducerPredictor(cfg.predictor_cfg) self.joiner = PackedTransducerJoiner(cfg.joiner_cfg) - def transcribe(self, sources: torch.Tensor, source_lengths: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: - return self.transcriber(sources=sources, source_lengths=source_lengths) + def transcribe( + self, + sources: torch.Tensor, # [B, T, F] + source_lengths: torch.Tensor, # [B] + ) -> Tuple[torch.Tensor, Dict[int, torch.Tensor], torch.Tensor]: # [B, T, E], Dict(l: [B, T, E]), [B] + return self.transcriber.forward(sources=sources, source_lengths=source_lengths) def predict( - self, targets: torch.Tensor, target_lengths: torch.Tensor, state: Optional[List[List[torch.Tensor]]] - ) -> Tuple[torch.Tensor, torch.Tensor, List[List[torch.Tensor]]]: - return self.predictor(targets=targets, target_lengths=target_lengths, state=state) + self, + targets: torch.Tensor, # [B, S] + target_lengths: torch.Tensor, # [B] + history: Optional[torch.Tensor] = None, # [B, H] + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: # [B, S+1, P], [B], [B, H] + return self.predictor.forward(targets=targets, target_lengths=target_lengths, history=history) def join( self, - source_encodings: torch.Tensor, - source_lengths: torch.Tensor, - target_encodings: torch.Tensor, - target_lengths: torch.Tensor, - ) -> Tuple[torch.Tensor, torch.Tensor, 
torch.Tensor]: - return self.joiner( + source_encodings: torch.Tensor, # [B, T, E] + source_lengths: torch.Tensor, # [B] + target_encodings: torch.Tensor, # [B, S+1, P] + target_lengths: torch.Tensor, # [B] + ) -> Tuple[ + torch.Tensor, torch.Tensor, torch.Tensor + ]: # [T_1 * (S_1+1) + T_2 * (S_2+1) + ... + T_B * (S_B+1), C], [B], [B], + return self.joiner.forward( source_encodings=source_encodings, source_lengths=source_lengths, target_encodings=target_encodings, @@ -300,38 +378,102 @@ def join( def forward( self, - sources: torch.Tensor, - source_lengths: torch.Tensor, - targets: torch.Tensor, - target_lengths: torch.Tensor, - predictor_state: Optional[List[List[torch.Tensor]]] = None, - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, List[List[torch.Tensor]]]: - source_encodings, source_lengths = self.transcribe(sources=sources, source_lengths=source_lengths) - target_encodings, target_lengths, state = self.predict( - targets=targets, target_lengths=target_lengths, state=predictor_state + sources: torch.Tensor, # [B, T, F] + source_lengths: torch.Tensor, # [B] + targets: torch.Tensor, # [B, S] + target_lengths: torch.Tensor, # [B] + history: Optional[torch.Tensor] = None, # [B, H] + ) -> Tuple[ + torch.Tensor, Dict[int, torch.Tensor], torch.Tensor, torch.Tensor, torch.Tensor + ]: # [T_1 * (S_1+1) + T_2 * (S_2+1) + ... 
+ T_B * (S_B+1), C], Dict(l: [B, T, C]), [B], [B], [B, H] + source_encodings, intermediate_logits, source_lengths = self.transcribe( + sources=sources, source_lengths=source_lengths + ) + label_encodings, target_lengths, history = self.predict( + targets=targets, target_lengths=target_lengths, history=history ) output, source_lengths, target_lengths = self.join( + source_encodings=source_encodings, + source_lengths=source_lengths, + target_encodings=label_encodings, + target_lengths=target_lengths, + ) + + return output, intermediate_logits, source_lengths, target_lengths, history + + +class FFNNTransducerViterbi(torch.nn.Module): + def __init__(self, step: int, cfg: FFNNTransducerConfig, **_): + super().__init__() + self.transcriber = TransducerTranscriber(cfg.transcriber_cfg) + self.predictor = FFNNTransducerPredictor(cfg.predictor_cfg) + self.joiner = TransducerJoiner(cfg.joiner_cfg) + + def transcribe( + self, + sources: torch.Tensor, # [B, T, F] + source_lengths: torch.Tensor, # [B] + ) -> Tuple[torch.Tensor, Dict[int, torch.Tensor], torch.Tensor]: # [B, T, E], Dict(l: [B, T, C]), [B] + return self.transcriber.forward(sources=sources, source_lengths=source_lengths) + + def predict( + self, + targets: torch.Tensor, # [B, T] + target_lengths: torch.Tensor, # [B] + ) -> Tuple[torch.Tensor, torch.Tensor]: # [B, T, P], [B] + return self.predictor.forward_viterbi(targets=targets, target_lengths=target_lengths) + + def join( + self, + source_encodings: torch.Tensor, # [B, T, E] + source_lengths: torch.Tensor, # [B] + target_encodings: torch.Tensor, # [B, T, P] + target_lengths: torch.Tensor, # [B] + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: # [B, T, C], [B], [B] + return self.joiner.forward_viterbi( source_encodings=source_encodings, source_lengths=source_lengths, target_encodings=target_encodings, target_lengths=target_lengths, ) - return output, source_lengths, target_lengths, state + def forward( + self, + sources: torch.Tensor, # [B, T, F] + 
source_lengths: torch.Tensor, # [B] + targets: torch.Tensor, # [B, T] + target_lengths: torch.Tensor, # [B] + ) -> Tuple[ + torch.Tensor, Dict[int, torch.Tensor], torch.Tensor, torch.Tensor + ]: # [B, T, C], Dict(l: [B, T, C]), [B], [B] + source_encodings, intermediate_logits, source_lengths = self.transcribe( + sources=sources, source_lengths=source_lengths + ) + target_encodings, target_lengths = self.predict(targets=targets, target_lengths=target_lengths) + + output, source_lengths, target_lengths = self.join( + source_encodings=source_encodings, + source_lengths=source_lengths, + target_encodings=target_encodings, + target_lengths=target_lengths, + ) + + return output, intermediate_logits, source_lengths, target_lengths class FFNNTransducerEncoderOnly(torch.nn.Module): def __init__(self, step: int, cfg: FFNNTransducerConfig, **_): super().__init__() - self.transcriber = TransducerTranscriberNoFeatExtr(cfg.transcriber_cfg) + self.transcriber = TransducerTranscriber(cfg.transcriber_cfg) def forward( self, sources: torch.Tensor, # [B, T, F] source_lengths: torch.Tensor, # [B] ) -> Tuple[torch.Tensor, torch.Tensor]: # [B, T', E] - return self.transcriber(sources=sources, source_lengths=source_lengths) + source_encodings, _, source_lengths = self.transcriber.forward(sources=sources, source_lengths=source_lengths) + return source_encodings, source_lengths class FFNNTransducerDecoderOnly(torch.nn.Module): @@ -340,38 +482,72 @@ def __init__(self, step: int, cfg: FFNNTransducerConfig, **_): self.predictor = FFNNTransducerPredictor(cfg.predictor_cfg) self.joiner = TransducerJoiner(cfg.joiner_cfg) + if isinstance(cfg, FFNNTransducerWithIlmConfig): + self.ilm_scale = cfg.ilm_scale + else: + self.ilm_scale = 0.0 + def forward( self, source_encodings: torch.Tensor, # [B, E] - targets: torch.Tensor, # [B, H] + history: torch.Tensor, # [B, H] ) -> torch.Tensor: # [B, C] - dec_state = [[targets[:, :-1]]] # [[[B, H-1]]] - dec_current_label = targets[:, -1:] # [B, 1] - dec_length 
= torch.tensor([1] * targets.size(0), device=targets.device) # [B] + device = source_encodings.device + B = history.size(0) + source_lengths = torch.tensor([1] * B, device=device) # [B] + target_lengths = torch.tensor([0] * B, device=device) # [B] - decoder, _, _ = self.predictor( - targets=dec_current_label, target_lengths=dec_length, state=dec_state + history_encoding, _, _ = self.predictor.forward( + targets=torch.empty((B, 0), dtype=history.dtype, device=device), + target_lengths=target_lengths, + history=history, ) # [B, 1, P] source_encodings = source_encodings.unsqueeze(1) # [B, 1, E] - joint_output, _, _ = self.joiner( + joint_output, _, _ = self.joiner.forward( source_encodings=source_encodings, - source_lengths=dec_length, - target_encodings=decoder, - target_lengths=dec_length, + source_lengths=source_lengths, # [B] + target_encodings=history_encoding, + target_lengths=target_lengths, ) # [B, 1, 1, C] + joint_output = joint_output.squeeze(2).squeeze(1) # [B, C] + + if self.ilm_scale != 0: + assert self.predictor.blank_id == 0 + joint_output_ilm, _, _ = self.joiner.forward( + source_encodings=torch.zeros_like(source_encodings), + source_lengths=source_lengths, + target_encodings=history_encoding, + target_lengths=target_lengths, + ) # [B, 1, 1, C] + joint_output_ilm = joint_output_ilm.squeeze(2).squeeze(1) # [B, C] + + blank_log_probs = joint_output_ilm[:, :1] # [B, 1] + non_blank_log_probs = joint_output_ilm[:, 1:] # [B, C-1] + + joint_output_ilm = torch.concat( + [ + torch.zeros_like(blank_log_probs), + non_blank_log_probs - torch.log(1.0 - torch.exp(blank_log_probs)), + ], + dim=-1, + ) # [B, C] - return joint_output.squeeze(2).squeeze(1) # [B, C] + joint_output -= self.ilm_scale * joint_output_ilm + + return joint_output # [B, C] def get_train_serializer(model_config: FFNNTransducerConfig, **kwargs) -> Collection: assert __package__ is not None pytorch_package = __package__.rpartition(".")[0] + loss_repo = 
CloneGitRepositoryJob("https://github.com/SimBe195/monotonic-rnnt.git").out_repository return get_basic_pt_network_serializer( module_import_path=f"{__name__}.{FFNNTransducer.__name__}", model_config=model_config, additional_serializer_objects=[ + ExternalImport(import_path=loss_repo), PartialImport( code_object_path=f"{pytorch_package}.train_steps.transducer.train_step", hashed_arguments=kwargs, @@ -382,6 +558,46 @@ def get_train_serializer(model_config: FFNNTransducerConfig, **kwargs) -> Collec ) +def get_viterbi_train_serializer(model_config: FFNNTransducerConfig, **kwargs) -> Collection: + assert __package__ is not None + pytorch_package = __package__.rpartition(".")[0] + return get_basic_pt_network_serializer( + module_import_path=f"{__name__}.{FFNNTransducerViterbi.__name__}", + model_config=model_config, + additional_serializer_objects=[ + PartialImport( + code_object_path=f"{pytorch_package}.train_steps.transducer.train_step_viterbi", + import_as="train_step", + hashed_arguments=kwargs, + unhashed_package_root="", + unhashed_arguments={}, + ), + ], + ) + + +def get_align_restrict_train_serializer(model_config: FFNNTransducerConfig, **kwargs) -> Collection: + assert __package__ is not None + pytorch_package = __package__.rpartition(".")[0] + loss_repo = CloneGitRepositoryJob( + "https://github.com/SimBe195/monotonic-rnnt.git", commit="3fbb480107f7379347b25953be0727dcc4d0e57b" + ).out_repository + return get_basic_pt_network_serializer( + module_import_path=f"{__name__}.{FFNNTransducer.__name__}", + model_config=model_config, + additional_serializer_objects=[ + ExternalImport(import_path=loss_repo), + PartialImport( + code_object_path=f"{pytorch_package}.train_steps.transducer.train_step_align_restrict", + import_as="train_step", + hashed_arguments=kwargs, + unhashed_package_root="", + unhashed_arguments={}, + ), + ], + ) + + def get_torchaudio_train_serializer(model_config: FFNNTransducerConfig, **kwargs) -> Collection: assert __package__ is not None 
pytorch_package = __package__.rpartition(".")[0] @@ -505,26 +721,22 @@ def get_default_config_v1(num_outputs: int) -> FFNNTransducerConfig: ) frontend = ModuleFactoryV1( - GenericFrontendV1, - GenericFrontendV1Config( + VGG4LayerActFrontendV1, + VGG4LayerActFrontendV1Config( in_features=80, - layer_ordering=[ - FrontendLayerType.Conv2d, - FrontendLayerType.Conv2d, - FrontendLayerType.Pool2d, - FrontendLayerType.Conv2d, - FrontendLayerType.Conv2d, - FrontendLayerType.Pool2d, - FrontendLayerType.Activation, - ], - conv_kernel_sizes=[(3, 3), (3, 3), (3, 3), (3, 3)], - conv_paddings=None, - conv_out_dims=[32, 64, 64, 32], - conv_strides=[(1, 1), (1, 1), (1, 1), (1, 1)], - pool_kernel_sizes=[(2, 1), (2, 1)], - pool_strides=None, - pool_paddings=None, - activations=[torch.nn.ReLU()], + conv1_channels=32, + conv2_channels=64, + conv3_channels=64, + conv4_channels=32, + conv_kernel_size=(3, 3), + conv_padding=None, + pool1_kernel_size=(2, 1), + pool1_stride=(2, 1), + pool1_padding=None, + pool2_kernel_size=(2, 1), + pool2_stride=(2, 1), + pool2_padding=None, + activation=torch.nn.ReLU(), out_features=384, ), ) @@ -551,13 +763,13 @@ def get_default_config_v1(num_outputs: int) -> FFNNTransducerConfig: norm=torch.nn.BatchNorm1d(num_features=384, affine=False), ) - block_cfg = conformer_i6.ConformerBlockV1Config( + block_cfg = conformer_i6.ConformerBlockV2Config( ff_cfg=ff_cfg, mhsa_cfg=mhsa_cfg, conv_cfg=conv_cfg, ) - conformer_cfg = conformer_i6.ConformerEncoderV1Config( + conformer_cfg = conformer_i6.ConformerEncoderV2Config( num_layers=12, frontend=frontend, block_cfg=block_cfg, @@ -566,7 +778,10 @@ def get_default_config_v1(num_outputs: int) -> FFNNTransducerConfig: transcriber_cfg = TransducerTranscriberConfig( feature_extraction=feature_extraction, specaugment=specaugment, - encoder=ModuleFactoryV1(module_class=conformer_i6.ConformerEncoderV1, cfg=conformer_cfg), + encoder=ModuleFactoryV1(module_class=conformer_i6.ConformerEncoderV2, cfg=conformer_cfg), + 
layer_size=384, + target_size=num_outputs, + enc_loss_layers=[5, 11], ) predictor_cfg = FFNNTransducerPredictorConfig( @@ -593,3 +808,119 @@ def get_default_config_v1(num_outputs: int) -> FFNNTransducerConfig: predictor_cfg=predictor_cfg, joiner_cfg=joiner_cfg, ) + + +def get_default_config_v2(num_outputs: int) -> FFNNTransducerConfig: + feature_extraction = ModuleFactoryV1( + module_class=RasrCompatibleLogMelFeatureExtractionV1, + cfg=RasrCompatibleLogMelFeatureExtractionV1Config( + sample_rate=16000, + win_size=0.025, + hop_size=0.01, + min_amp=1.175494e-38, + num_filters=80, + alpha=0.97, + ), + ) + + specaugment = ModuleFactoryV1( + module_class=SpecaugmentByLengthModuleV1, + cfg=SpecaugmentByLengthConfigV1( + time_min_num_masks=2, + time_max_mask_per_n_frames=20, + time_mask_max_size=30, + freq_min_num_masks=2, + freq_max_num_masks=16, + freq_mask_max_size=5, + ), + ) + + frontend = ModuleFactoryV1( + VGG4LayerActFrontendV1, + VGG4LayerActFrontendV1Config( + in_features=80, + conv1_channels=32, + conv2_channels=64, + conv3_channels=64, + conv4_channels=32, + conv_kernel_size=(3, 3), + conv_padding=None, + pool1_kernel_size=(2, 1), + pool1_stride=(2, 1), + pool1_padding=None, + pool2_kernel_size=(2, 1), + pool2_stride=(2, 1), + pool2_padding=None, + activation=torch.nn.ReLU(), + out_features=768, + ), + ) + + ff_cfg = conformer_parts_i6.ConformerPositionwiseFeedForwardV1Config( + input_dim=768, + hidden_dim=3072, + dropout=0.3, + activation=torch.nn.SiLU(), + ) + + mhsa_cfg = conformer_parts_i6.ConformerMHSAV1Config( + input_dim=768, + num_att_heads=8, + att_weights_dropout=0.3, + dropout=0.3, + ) + + conv_cfg = conformer_parts_i6.ConformerConvolutionV1Config( + channels=768, + kernel_size=7, + dropout=0.3, + activation=torch.nn.SiLU(), + norm=torch.nn.BatchNorm1d(num_features=768, affine=False), + ) + + block_cfg = conformer_i6.ConformerBlockV2Config( + ff_cfg=ff_cfg, + mhsa_cfg=mhsa_cfg, + conv_cfg=conv_cfg, + modules=["ff", "conv", "mhsa", "ff"], + ) + + 
conformer_cfg = conformer_i6.ConformerEncoderV2Config( + num_layers=12, + frontend=frontend, + block_cfg=block_cfg, + ) + + transcriber_cfg = TransducerTranscriberConfig( + feature_extraction=feature_extraction, + specaugment=specaugment, + encoder=ModuleFactoryV1(module_class=conformer_i6.ConformerEncoderV2, cfg=conformer_cfg), + layer_size=768, + target_size=num_outputs, + enc_loss_layers=[5, 11], + ) + + predictor_cfg = FFNNTransducerPredictorConfig( + layers=2, + layer_size=384, + activation=torch.nn.Tanh(), + dropout=0.3, + context_history_size=1, + context_embedding_size=256, + blank_id=0, + target_size=num_outputs, + ) + + joiner_cfg = TransducerJoinerConfig( + input_size=1152, + layer_size=1024, + act=torch.nn.Tanh(), + target_size=num_outputs, + combination_mode=CombinationMode.CONCAT, + ) + + return FFNNTransducerConfig( + transcriber_cfg=transcriber_cfg, + predictor_cfg=predictor_cfg, + joiner_cfg=joiner_cfg, + ) diff --git a/users/berger/pytorch/train_steps/transducer.py b/users/berger/pytorch/train_steps/transducer.py index ceaae3809..e23a41d23 100644 --- a/users/berger/pytorch/train_steps/transducer.py +++ b/users/berger/pytorch/train_steps/transducer.py @@ -1,47 +1,268 @@ +from typing import Dict, Optional + import torch from returnn.tensor.tensor_dict import TensorDict -from i6_experiments.users.berger.pytorch.helper_functions import map_tensor_to_minus1_plus1_interval +from i6_experiments.users.berger.pytorch.models.conformer_transducer_v2 import FFNNTransducer, FFNNTransducerViterbi + + +def train_step( + *, + model: FFNNTransducer, + extern_data: TensorDict, + blank_idx: int = 0, + enc_loss_scales: Optional[Dict[int, float]] = None, + **_, +): + import returnn.frontend as rf + from pytorch_binding.monotonic_rnnt_op import monotonic_rnnt_loss + from returnn.tensor import batch_dim -def train_step(*, model: torch.nn.Module, extern_data: TensorDict, blank_idx: int = 0, **kwargs): - from 
returnn.extern_private.BergerMonotonicRNNT.monotonic_rnnt.pytorch_binding import monotonic_rnnt_loss + if enc_loss_scales is None: + enc_loss_scales = {} - audio_features = extern_data["data"].raw_tensor - assert audio_features is not None - audio_features = audio_features.float() - # audio_features = map_tensor_to_minus1_plus1_interval(audio_features) + sources = extern_data["data"].raw_tensor + assert sources is not None + sources = sources.float() assert extern_data["data"].dims[1].dyn_size_ext is not None - audio_feature_lengths = extern_data["data"].dims[1].dyn_size_ext.raw_tensor - assert audio_feature_lengths is not None - audio_feature_lengths = audio_feature_lengths.to(device="cuda") + source_lengths = extern_data["data"].dims[1].dyn_size_ext.raw_tensor + assert source_lengths is not None + source_lengths = source_lengths.to(device="cuda") assert extern_data["classes"].raw_tensor is not None targets = extern_data["classes"].raw_tensor.to(dtype=torch.int32) - assert extern_data["classes"].dims[1].dyn_size_ext is not None - target_lengths = extern_data["classes"].dims[1].dyn_size_ext.raw_tensor + target_lengths_rf = extern_data["classes"].dims[1].dyn_size_ext + assert target_lengths_rf is not None + + target_lengths = target_lengths_rf.raw_tensor assert target_lengths is not None target_lengths = target_lengths.to(device="cuda") - targets_padded = torch.nn.functional.pad(targets, (1, 0), mode="constant", value=blank_idx) - targets_padded_lengths = target_lengths + 1 + loss_norm_factor = rf.reduce_sum(target_lengths_rf, axis=batch_dim) - model_logits, input_lengths, _, _ = model( - sources=audio_features, - source_lengths=audio_feature_lengths, - targets=targets_padded, - target_lengths=targets_padded_lengths, + model_logits, intermediate_logits, source_lengths, _, _ = model.forward( + sources=sources, + source_lengths=source_lengths, + targets=targets, + target_lengths=target_lengths, ) loss = monotonic_rnnt_loss( acts=model_logits.to(dtype=torch.float32), 
labels=targets, - input_lengths=input_lengths, + input_lengths=source_lengths, label_lengths=target_lengths, blank_label=blank_idx, ) + rf.get_run_ctx().mark_as_loss(name="monotonic_rnnt", loss=loss, custom_inv_norm_factor=loss_norm_factor) + + targets_packed = torch.nn.utils.rnn.pack_padded_sequence( + targets, target_lengths.cpu(), batch_first=True, enforce_sorted=False + ) + + for layer_idx, scale in enc_loss_scales.items(): + logits = intermediate_logits[layer_idx] + log_probs = torch.nn.functional.log_softmax(logits, dim=-1) # [B, T, C] + log_probs = torch.transpose(log_probs, 0, 1) # [T, B, C] + + loss = torch.nn.functional.ctc_loss( + log_probs=log_probs, + targets=targets, + input_lengths=source_lengths, + target_lengths=target_lengths, + blank=blank_idx, + reduction="sum", + zero_infinity=True, + ) + + rf.get_run_ctx().mark_as_loss( + name=f"CTC_enc-{layer_idx}", + loss=loss, + scale=scale, + custom_inv_norm_factor=loss_norm_factor, + ) + + predictions = torch.argmax(log_probs, dim=-1) + predictions_packed = torch.nn.utils.rnn.pack_padded_sequence( + predictions, target_lengths.cpu(), batch_first=True, enforce_sorted=False + ) + num_incorrect_frames = torch.sum((targets_packed.data != predictions_packed.data)) + + rf.get_run_ctx().mark_as_loss( + name=f"FER_enc-{layer_idx}", + loss=num_incorrect_frames, + custom_inv_norm_factor=loss_norm_factor, + as_error=True, + ) + + +def train_step_viterbi( + *, + model: FFNNTransducerViterbi, + extern_data: TensorDict, + enc_loss_scales: Optional[Dict[int, float]] = None, + **_, +): + import returnn.frontend as rf + from returnn.tensor import batch_dim + + if enc_loss_scales is None: + enc_loss_scales = {} + + sources = extern_data["data"].raw_tensor + assert sources is not None + sources = sources.float() + + assert extern_data["classes"].raw_tensor is not None + targets = extern_data["classes"].raw_tensor.to(dtype=torch.int32) + + assert extern_data["data"].dims[1].dyn_size_ext is not None + source_lengths = 
extern_data["data"].dims[1].dyn_size_ext.raw_tensor + assert source_lengths is not None + source_lengths = source_lengths.to(device="cuda") + + target_lengths_rf = extern_data["classes"].dims[1].dyn_size_ext + assert target_lengths_rf is not None + + target_lengths = target_lengths_rf.raw_tensor + assert target_lengths is not None + target_lengths = target_lengths.to(device="cuda") + + loss_norm_factor = rf.reduce_sum(target_lengths_rf, axis=batch_dim) + + model_logits, intermediate_logits, _, _ = model.forward( + sources=sources, + source_lengths=source_lengths, + targets=targets, + target_lengths=target_lengths, + ) + + targets_packed = torch.nn.utils.rnn.pack_padded_sequence( + targets.long(), target_lengths.cpu(), batch_first=True, enforce_sorted=False + ) + targets_masked, _ = torch.nn.utils.rnn.pad_packed_sequence(targets_packed, batch_first=True, padding_value=-100) + for logits, scale, suffix in [(model_logits, 1.0, "final")] + [ + (intermediate_logits[layer_idx], scale, f"enc-{layer_idx}") for layer_idx, scale in enc_loss_scales.items() + ]: + log_probs = torch.log_softmax(logits, dim=-1) # [B, T, F] + log_probs = torch.transpose(log_probs, 1, 2) # [B, F, T] + + loss = torch.nn.functional.cross_entropy( + input=log_probs, target=targets_masked, ignore_index=-100, reduction="sum" + ) + + predictions = torch.argmax(logits, dim=-1) + predictions_packed = torch.nn.utils.rnn.pack_padded_sequence( + predictions.long(), target_lengths.cpu(), batch_first=True, enforce_sorted=False + ) + num_incorrect_frames = torch.sum((targets_packed.data != predictions_packed.data)) + + rf.get_run_ctx().mark_as_loss( + name=f"CE_{suffix}", loss=loss, scale=scale, custom_inv_norm_factor=loss_norm_factor + ) + rf.get_run_ctx().mark_as_loss( + name=f"FER_{suffix}", loss=num_incorrect_frames, custom_inv_norm_factor=loss_norm_factor, as_error=True + ) + + +def train_step_align_restrict( + *, + model: FFNNTransducer, + extern_data: TensorDict, + max_distance_from_alignment: int, + 
blank_idx: int = 0, + enc_loss_scales: Optional[Dict[int, float]] = None, + **_, +): import returnn.frontend as rf + from pytorch_binding.monotonic_rnnt_op import monotonic_rnnt_loss + from returnn.tensor import batch_dim + + if enc_loss_scales is None: + enc_loss_scales = {} + + sources = extern_data["data"].raw_tensor + assert sources is not None + sources = sources.float() + + assert extern_data["data"].dims[1].dyn_size_ext is not None + source_lengths = extern_data["data"].dims[1].dyn_size_ext.raw_tensor + assert source_lengths is not None + source_lengths = source_lengths.to(device="cuda") + + assert extern_data["classes"].raw_tensor is not None + alignments = extern_data["classes"].raw_tensor.to(torch.int32) # [B, T] + alignment_lengths_rf = extern_data["classes"].dims[1].dyn_size_ext + assert alignment_lengths_rf is not None + + target_lengths = (alignments != blank_idx).sum(dim=1) # [B] + target_lengths = target_lengths.to(device="cuda", dtype=torch.int32) + target_lengths_rf = rf.Tensor("target_lengths", dims=[batch_dim], dtype="int32", raw_tensor=target_lengths) + + max_target_length = target_lengths.max().item() + targets = torch.zeros((target_lengths.size(0), max_target_length), dtype=alignments.dtype, device=alignments.device) + + for i in range(targets.size(0)): + non_blanks = alignments[i][alignments[i] != blank_idx] + targets[i, : non_blanks.size(0)] = non_blanks + + target_loss_norm_factor = rf.reduce_sum(target_lengths_rf, axis=batch_dim) + alignment_loss_norm_factor = rf.reduce_sum(alignment_lengths_rf, axis=batch_dim) + + model_logits, intermediate_logits, source_lengths, _, _ = model.forward( + sources=sources, + source_lengths=source_lengths, + targets=targets, + target_lengths=target_lengths, + ) + + loss = monotonic_rnnt_loss( + acts=model_logits.to(dtype=torch.float32), + labels=targets, + input_lengths=source_lengths, + label_lengths=target_lengths, + alignment=alignments, + max_distance_from_alignment=max_distance_from_alignment, + 
blank_label=blank_idx, + ) + rf.get_run_ctx().mark_as_loss( + name=f"monotonic_rnnt_restrict-{max_distance_from_alignment}", + loss=loss, + custom_inv_norm_factor=target_loss_norm_factor, + ) + + alignments_packed = torch.nn.utils.rnn.pack_padded_sequence( + alignments, source_lengths.cpu(), batch_first=True, enforce_sorted=False + ) + alignments_masked, _ = torch.nn.utils.rnn.pad_packed_sequence( + alignments_packed, batch_first=True, padding_value=-100 + ) + + for logits, scale, suffix in [ + (intermediate_logits[layer_idx], scale, f"enc-{layer_idx}") for layer_idx, scale in enc_loss_scales.items() + ]: + log_probs = torch.log_softmax(logits, dim=-1) # [B, T, F] + log_probs = torch.transpose(log_probs, 1, 2) # [B, F, T] + + loss = torch.nn.functional.cross_entropy( + input=log_probs, target=alignments_masked.long(), ignore_index=-100, reduction="sum" + ) + + predictions = torch.argmax(logits, dim=-1) + predictions_packed = torch.nn.utils.rnn.pack_padded_sequence( + predictions.long(), source_lengths.cpu(), batch_first=True, enforce_sorted=False + ) + num_incorrect_frames = torch.sum((alignments_packed.data != predictions_packed.data)) - rf.get_run_ctx().mark_as_loss(name="MonoRNNT", loss=loss) + rf.get_run_ctx().mark_as_loss( + name=f"CE_{suffix}", loss=loss, scale=scale, custom_inv_norm_factor=alignment_loss_norm_factor + ) + rf.get_run_ctx().mark_as_loss( + name=f"FER_{suffix}", + loss=num_incorrect_frames, + custom_inv_norm_factor=alignment_loss_norm_factor, + as_error=True, + ) diff --git a/users/berger/recipe/returnn/onnx.py b/users/berger/recipe/returnn/onnx.py index 4e5794020..6e43b1726 100644 --- a/users/berger/recipe/returnn/onnx.py +++ b/users/berger/recipe/returnn/onnx.py @@ -1,6 +1,9 @@ import shutil import subprocess as sp import sys +from typing import Optional + +from optuna import Trial from i6_core import util from i6_core.returnn.config import ReturnnConfig @@ -8,6 +11,8 @@ from i6_experiments.users.berger.recipe.returnn.optuna_config import 
OptunaReturnnConfig from sisyphus import Job, Task, tk +from returnn.config import Config + class ExportPyTorchModelToOnnxJob(Job): """ @@ -34,6 +39,7 @@ def tasks(self): def run(self): sys.path.insert(0, self.returnn_root.get()) import torch + from returnn.config import Config config = Config() @@ -135,6 +141,7 @@ def tasks(self): def run(self): sys.path.insert(0, self.returnn_root.get()) import torch + from returnn.config import Config config = Config() @@ -160,3 +167,66 @@ def run(self): export_func = config.typed_value("export") assert export_func export_func(model=model, model_filename=self.out_onnx_model.get()) + + +class OptunaTorchOnnxExportJob(Job): + """ + Export an ONNX model using the appropriate RETURNN tool script. + + Currently only supports PyTorch via tools/torch_export_to_onnx.py + """ + + def __init__( + self, + *, + returnn_config: OptunaReturnnConfig, + checkpoint: PtCheckpoint, + trial: tk.Variable, + returnn_python_exe: tk.Path, + returnn_root: tk.Path, + device: str = "cpu", + ): + """ + + :param returnn_config: RETURNN config object + :param checkpoint: Path to the checkpoint for export + :param device: target device for graph creation + :param returnn_python_exe: file path to the executable for running returnn (python binary or .sh) + :param returnn_root: file path to the RETURNN repository root folder + """ + + self.returnn_config = returnn_config + self.trial = trial + self.checkpoint = checkpoint + self.device = device + self.returnn_python_exe = returnn_python_exe + self.returnn_root = returnn_root + + self.out_returnn_config = self.output_path("returnn.config") + self.out_onnx_model = self.output_path("model.onnx") + + def tasks(self): + yield Task("run", mini_task=True) + + def run(self): + returnn_config = self.returnn_config.generate_config(self.trial.get()) + returnn_config_path = self.out_returnn_config.get_path() + returnn_config.write(returnn_config_path) + + config = Config() + config.load_file(returnn_config_path) + + cmd = [ 
+ self.returnn_python_exe.get_path(), + self.returnn_root.join_right("tools/torch_export_to_onnx.py").get_path(), + returnn_config_path, + str(self.checkpoint), + self.out_onnx_model.get_path(), + "--device", + self.device, + "--verbosity", + "5", + ] + + util.create_executable("compile.sh", cmd) # convenience file for manual execution + sp.run(cmd, check=True) diff --git a/users/berger/recipe/returnn/optuna_returnn_training.py b/users/berger/recipe/returnn/optuna_returnn_training.py index 11fb101ad..2334661fa 100644 --- a/users/berger/recipe/returnn/optuna_returnn_training.py +++ b/users/berger/recipe/returnn/optuna_returnn_training.py @@ -1,6 +1,5 @@ -__all__ = ["OptunaReturnnTrainingJob"] +__all__ = ["OptunaReturnnTrainingJob", "OptunaReportIntermediateScoreJob", "OptunaReportFinalScoreJob"] -import glob import inspect import logging import os @@ -28,7 +27,6 @@ def __init__( study_name: Optional[str] = None, study_storage: Optional[str] = None, sampler_seed: int = 42, - score_key: str = "dev_score", num_trials: int = 15, num_parallel: int = 3, *, @@ -41,6 +39,7 @@ def __init__( time_rqmt: int = 4, mem_rqmt: int = 4, cpu_rqmt: int = 2, + gpu_mem_rqmt: int = 11, horovod_num_processes: Optional[int] = None, multi_node_slots: Optional[int] = None, returnn_python_exe: Optional[tk.Path] = None, @@ -52,9 +51,8 @@ def __init__( self.optuna_returnn_config = optuna_returnn_config self.study_name = study_name or "optuna_study" - self.study_storage = study_storage or f"sqlite:///study_storage.db" + self.study_storage = study_storage or "sqlite:///study_storage.db" self.sampler_seed = sampler_seed - self.score_key = score_key self.num_trials = num_trials self.num_parallel = num_parallel @@ -93,11 +91,6 @@ def __init__( } for i in range(self.num_trials) } - self.out_checkpoints = { - k: Checkpoint(self.output_path(f"models/epoch.{k:03d}.index")) - for k in stored_epochs - if k in self.keep_epochs - } elif backend == Backend.PYTORCH: self.out_trial_checkpoints = { i: { @@ 
-107,31 +100,19 @@ def __init__( } for i in range(self.num_trials) } - self.out_checkpoints = { - k: PtCheckpoint(self.output_path(f"models/epoch.{k:03d}.pt")) - for k in stored_epochs - if k in self.keep_epochs - } else: raise NotImplementedError - self.out_trial_nums = {i: self.output_var(f"trial-{i:03d}/trial_num") for i in range(self.num_trials)} + self.out_task_id_to_trial_num = {i: self.output_var(f"task-{i:03d}-trial") for i in range(self.num_trials)} self.out_trials = {i: self.output_var(f"trial-{i:03d}/trial", pickle=True) for i in range(self.num_trials)} self.out_trial_params = {i: self.output_var(f"trial-{i:03d}/params") for i in range(self.num_trials)} - self.out_trial_scores = {i: self.output_var(f"trial-{i:03d}/score") for i in range(self.num_trials)} - self.out_best_trial_num = self.output_var("best_trial_num") - self.out_best_trial = self.output_var("best_trial", pickle=True) - self.out_best_params = self.output_var("best_params") - self.out_best_score = self.output_var("best_score") - - self.out_plot_se = self.output_path(f"score_and_error.png") - self.out_plot_lr = self.output_path(f"learning_rate.png") self.rqmt = { "gpu": 1 if device == "gpu" else 0, "cpu": cpu_rqmt, "mem": mem_rqmt, "time": time_rqmt, + "gpu_mem": gpu_mem_rqmt, } if self.multi_node_slots: @@ -203,33 +184,25 @@ def path_available(self, path: tk.Path) -> bool: return False - def get_returnn_config(self, trial: "optuna.Trial", task_id: int) -> ReturnnConfig: + def get_returnn_config(self, trial: "optuna.Trial") -> ReturnnConfig: returnn_config = self.optuna_returnn_config.generate_config(trial) - returnn_config.post_config["model"] = os.path.join(self.out_trial_model_dir[task_id].get_path(), "epoch") + trial_num = trial.number + returnn_config.post_config["model"] = os.path.join(self.out_trial_model_dir[trial_num].get_path(), "epoch") returnn_config.post_config.pop("learning_rate_file", None) - returnn_config.config["learning_rate_file"] = f"trial-{task_id:03d}/learning_rates" + 
returnn_config.config["learning_rate_file"] = f"trial-{trial_num:03d}/learning_rates" - returnn_config.post_config["log"] = f"./trial-{task_id:03d}/returnn.log" + returnn_config.post_config["log"] = f"./trial-{trial_num:03d}/returnn.log" ReturnnTrainingJob.check_blacklisted_parameters(returnn_config) returnn_config = ReturnnTrainingJob.create_returnn_config(returnn_config, **self.kwargs) return returnn_config - def prepare_trial_files(self, returnn_config: ReturnnConfig, task_id: int) -> None: - config_file = self.out_trial_returnn_config_files[task_id] + def prepare_trial_files(self, returnn_config: ReturnnConfig, trial_num: int) -> None: + config_file = self.out_trial_returnn_config_files[trial_num] returnn_config.write(config_file.get_path()) - os.mkdir(f"trial-{task_id:03d}") - util.create_executable(f"trial-{task_id:03d}/rnn.sh", self._get_run_cmd(config_file)) - - # Additional import packages that are created by returnn common - for f in glob.glob("../output/*"): - f_name = os.path.basename(f) - if f_name.startswith("trial-"): - continue - if f_name == "models": - continue - os.symlink(f"../{f_name}", f"../output/trial-{task_id:03d}/{f_name}") + os.mkdir(f"trial-{trial_num:03d}") + util.create_executable(f"trial-{trial_num:03d}/rnn.sh", self._get_run_cmd(config_file)) def prepare_env(self) -> None: if not self.multi_node_slots: @@ -246,31 +219,15 @@ def prepare_env(self) -> None: print("Cannot read:", exc) sys.stdout.flush() - def parse_lr_file(self, task_id: Optional[int] = None) -> dict: + def parse_lr_file(self, trial_num: int) -> dict: def EpochData(learningRate, error): return {"learning_rate": learningRate, "error": error} - if task_id is None: - filename = self.out_learning_rates - else: - filename = f"trial-{task_id:03d}/learning_rates" + filename = f"trial-{trial_num:03d}/learning_rates" with open(filename, "rt") as f: lr_text = f.read() return eval(lr_text) - def link_to_final_output(self, task_id: int) -> None: - os.link( - 
self.out_trial_returnn_config_files[task_id], - self.out_returnn_config_file, - ) - os.link(self.out_trial_learning_rates[task_id], self.out_learning_rates) - for k in self.out_checkpoints: - for suffix in ["index", "meta", "data-00000-of-00001", "pt"]: - orig_file = f"{self.out_trial_checkpoints[task_id][k]}.{suffix}" - if not os.path.exists(orig_file): - continue - os.link(orig_file, f"{self.out_checkpoints[k]}.{suffix}") - # ------------------ Tasks ------------------ def tasks(self) -> Generator[Task, None, None]: @@ -282,8 +239,6 @@ def tasks(self) -> Generator[Task, None, None]: parallel=self.num_parallel, args=range(self.num_trials), ) - yield Task("select_best_trial", mini_task=True) - yield Task("plot", resume="plot", mini_task=True) def create_study(self) -> None: import optuna @@ -320,17 +275,14 @@ def run(self, task_id: int) -> None: study = optuna.load_study( study_name=self.study_name, storage=storage, - pruner=optuna.pruners.PatientPruner( - wrapped_pruner=optuna.pruners.MedianPruner( - n_startup_trials=max(5, self.num_parallel), - ), - patience=10, - min_delta=0.1, + pruner=optuna.pruners.MedianPruner( + n_startup_trials=max(5, self.num_parallel), + n_warmup_steps=self.num_epochs // 2, ), ) - if self.out_trials[task_id].is_set(): - trial_num = int(self.out_trial_nums[task_id].get()) + if self.out_task_id_to_trial_num[task_id].is_set(): + trial_num = int(self.out_task_id_to_trial_num[task_id].get()) logging.info(f"Found existing trial with number {trial_num}") if self._check_trial_finished(study, trial_num): @@ -345,56 +297,40 @@ def run(self, task_id: int) -> None: else: trial = study.ask() trial_num = trial.number + self.out_task_id_to_trial_num[task_id].set(trial_num) logging.info(f"Start new trial with number {trial_num}") - returnn_config = self.get_returnn_config(trial, task_id) - self.prepare_trial_files(returnn_config, task_id) - self.out_trial_nums[task_id].set(trial_num) - self.out_trial_params[task_id].set(trial.params) - 
self.out_trials[task_id].set(optuna.trial.FixedTrial(trial.params, trial_num)) + returnn_config = self.get_returnn_config(trial) + self.prepare_trial_files(returnn_config, trial_num) + self.out_trial_params[trial_num].set(trial.params) + self.out_trials[trial_num].set(optuna.trial.FixedTrial(trial.params, trial_num)) - config_file = self.out_trial_returnn_config_files[task_id] + config_file = self.out_trial_returnn_config_files[trial_num] run_cmd = self._get_run_cmd(config_file) training_process = sp.Popen(run_cmd) - max_epoch = 0 - best_score = float("inf") trial_pruned = False while training_process.poll() is None: time.sleep(30) - try: - lr_data = self.parse_lr_file(task_id) - except (FileNotFoundError, SyntaxError): - continue - epochs = list(sorted(lr_data.keys())) - new_epochs = [e for e in epochs if e > max_epoch] - for e in new_epochs: - if self.score_key not in lr_data[e]["error"]: - continue - max_epoch = e - score = lr_data[e]["error"][self.score_key] - if score < best_score: - best_score = score - - trial.report(score, e) if trial.should_prune(): trial_pruned = True training_process.terminate() - study.tell(trial_num, state=optuna.trial.TrialState.PRUNED) + study.tell(trial, state=optuna.trial.TrialState.PRUNED) break if trial_pruned: logging.info("Pruned trial run") - self.out_trial_scores[task_id].set(best_score) os.link( - f"trial-{task_id:03d}/learning_rates", - self.out_trial_learning_rates[task_id].get_path(), + f"trial-{trial_num:03d}/learning_rates", + self.out_trial_learning_rates[trial_num].get_path(), ) + lr_data = self.parse_lr_file(trial_num) + max_epoch = max([ep for ep, ep_data in lr_data.items() if ep_data["error"] != {}]) + if not trial_pruned and max_epoch == self.num_epochs: logging.info("Finished trial run normally") - study.tell(trial_num, best_score, state=optuna.trial.TrialState.COMPLETE) os.link( f"trial-{task_id:03d}/learning_rates", self.out_trial_learning_rates[task_id].get_path(), @@ -404,95 +340,125 @@ def run(self, task_id: 
int) -> None: logging.info("Training had an error") raise sp.CalledProcessError(-1, cmd=run_cmd) - def select_best_trial(self) -> None: + @classmethod + def hash(cls, kwargs): + d = { + "returnn_config_generator": inspect.getsource(kwargs["optuna_returnn_config"].config_generator), + "returnn_config_generator_kwargs": list(sorted(kwargs["optuna_returnn_config"].config_kwargs)), + "sampler_seed": kwargs["sampler_seed"], + "num_trials": kwargs["num_trials"], + "num_parallel": kwargs["num_parallel"], + "returnn_python_exe": kwargs["returnn_python_exe"], + "returnn_root": kwargs["returnn_root"], + } + + if kwargs["study_name"] is not None: + d["study_name"] = kwargs["study_name"] + if kwargs["study_storage"] is not None: + d["study_storage"] = kwargs["study_storage"] + if kwargs["horovod_num_processes"] is not None: + d["horovod_num_processes"] = kwargs["horovod_num_processes"] + if kwargs["multi_node_slots"] is not None: + d["multi_node_slots"] = kwargs["multi_node_slots"] + + return super().hash(d) + + +class OptunaReportIntermediateScoreJob(Job): + def __init__( + self, + trial_num: int, + step: int, + score: tk.Variable, + study_name: Optional[str] = None, + study_storage: Optional[str] = None, + ) -> None: + self.study_name = study_name or "optuna_study" + self.study_storage = study_storage or "sqlite:///study_storage.db" + self.trial_num = trial_num + self.step = step + self.score = score + + self.out_reported_score = self.output_var("reported_score") + + def tasks(self) -> Generator[Task, None, None]: + yield Task("run", mini_task=True) + + def run(self) -> None: import optuna - study = optuna.load_study(study_name=self.study_name, storage=self.study_storage) - self.out_best_params.set(study.best_params) - self.out_best_trial.set(study.best_trial) - self.out_best_score.set(study.best_value) - for task_id, trial_num in self.out_trial_nums.items(): - if trial_num.get() == study.best_trial.number: - self.out_best_trial_num.set(task_id) - 
self.link_to_final_output(task_id=task_id) - break + storage = optuna.storages.get_storage(self.study_storage) + study = optuna.load_study( + study_name=self.study_name, + storage=storage, + ) - def plot(self): - data = self.parse_lr_file() + study_id = storage.get_study_id_from_name(self.study_name) + trial_id = storage.get_trial_id_from_study_id_trial_number(study_id, self.trial_num) + trial = optuna.Trial(study, trial_id) - epochs = list(sorted(data.keys())) - train_score_keys = [k for k in data[epochs[0]]["error"] if k.startswith("train_score")] - dev_score_keys = [k for k in data[epochs[0]]["error"] if k.startswith("dev_score")] - dev_error_keys = [k for k in data[epochs[0]]["error"] if k.startswith("dev_error")] + trial.report(value=self.score.get(), step=self.step) + self.out_reported_score.set(self.score.get()) - train_scores = [ - [(epoch, data[epoch]["error"][tsk]) for epoch in epochs if tsk in data[epoch]["error"]] - for tsk in train_score_keys - ] - dev_scores = [ - [(epoch, data[epoch]["error"][dsk]) for epoch in epochs if dsk in data[epoch]["error"]] - for dsk in dev_score_keys - ] - dev_errors = [ - [(epoch, data[epoch]["error"][dek]) for epoch in epochs if dek in data[epoch]["error"]] - for dek in dev_error_keys - ] - learing_rates = [data[epoch]["learning_rate"] for epoch in epochs] + @classmethod + def hash(cls, kwargs): + d = { + "trial_num": kwargs["trial_num"], + "step": kwargs["step"], + "score": kwargs["score"], + } - colors = ["#2A4D6E", "#AA3C39", "#93A537"] # blue red yellowgreen + if kwargs["study_name"] is not None: + d["study_name"] = kwargs["study_name"] + if kwargs["study_storage"] is not None: + d["study_storage"] = kwargs["study_storage"] - import matplotlib + return super().hash(d) + + +class OptunaReportFinalScoreJob(Job): + def __init__( + self, + trial_num: int, + scores: List[tk.Variable], + study_name: Optional[str] = None, + study_storage: Optional[str] = None, + ) -> None: + self.study_name = study_name or "optuna_study" + 
self.study_storage = study_storage or "sqlite:///study_storage.db" + self.trial_num = trial_num + self.scores = scores - matplotlib.use("Agg") - import matplotlib.pyplot as plt + self.out_reported_score = self.output_var("reported_score") - fig, ax1 = plt.subplots() - for ts in train_scores: - ax1.plot([d[0] for d in ts], [d[1] for d in ts], "o-", color=colors[0]) - for ds in dev_scores: - ax1.plot([d[0] for d in ds], [d[1] for d in ds], "o-", color=colors[1]) - ax1.set_xlabel("epoch") - ax1.set_ylabel("scores", color=colors[0]) - for tl in ax1.get_yticklabels(): - tl.set_color(colors[0]) + def tasks(self) -> Generator[Task, None, None]: + yield Task("run", mini_task=True) - if len(dev_errors) > 0 and any(len(de) > 0 for de in dev_errors): - ax2 = ax1.twinx() - ax2.set_ylabel("dev error", color=colors[2]) - for de in dev_errors: - ax2.plot([d[0] for d in de], [d[1] for d in de], "o-", color=colors[2]) - for tl in ax2.get_yticklabels(): - tl.set_color(colors[2]) + def run(self) -> None: + import optuna - fig.savefig(fname=self.out_plot_se.get_path()) + storage = optuna.storages.get_storage(self.study_storage) + study = optuna.load_study( + study_name=self.study_name, + storage=storage, + ) - fig, ax1 = plt.subplots() - ax1.semilogy(epochs, learing_rates, "ro-") - ax1.set_xlabel("epoch") - ax1.set_ylabel("learning_rate") + best_score = min([score.get() for score in self.scores]) - fig.savefig(fname=self.out_plot_lr.get_path()) + study.tell( + trial=self.trial_num, values=best_score, state=optuna.trial.TrialState.COMPLETE, skip_if_finished=True + ) @classmethod def hash(cls, kwargs): d = { - "returnn_config_generator": inspect.getsource(kwargs["optuna_returnn_config"].config_generator), - "returnn_config_generator_kwargs": list(sorted(kwargs["optuna_returnn_config"].config_kwargs)), - "sampler_seed": kwargs["sampler_seed"], - "score_key": kwargs["score_key"], - "num_trials": kwargs["num_trials"], - "num_parallel": kwargs["num_parallel"], - "returnn_python_exe": 
kwargs["returnn_python_exe"], - "returnn_root": kwargs["returnn_root"], + "trial_num": kwargs["trial_num"], + "scores": kwargs["scores"], } if kwargs["study_name"] is not None: d["study_name"] = kwargs["study_name"] if kwargs["study_storage"] is not None: d["study_storage"] = kwargs["study_storage"] - if kwargs["horovod_num_processes"] is not None: - d["horovod_num_processes"] = kwargs["horovod_num_processes"] - if kwargs["multi_node_slots"] is not None: - d["multi_node_slots"] = kwargs["multi_node_slots"] return super().hash(d) diff --git a/users/berger/systems/functors/alignment/optuna_legacy_alignment.py b/users/berger/systems/functors/alignment/optuna_legacy_alignment.py index 0dc2b6e53..af0b76a78 100644 --- a/users/berger/systems/functors/alignment/optuna_legacy_alignment.py +++ b/users/berger/systems/functors/alignment/optuna_legacy_alignment.py @@ -25,7 +25,7 @@ def __call__( num_inputs: int, num_classes: int, epoch: types.EpochType, - trial_num: types.TrialType = "best", + trial_num: int, prior_scale: float = 0, prior_args: Dict = {}, feature_type: dataclasses.FeatureType = dataclasses.FeatureType.SAMPLES, @@ -78,10 +78,7 @@ def __call__( ) exp_full = f"align_e-{self._get_epoch_string(epoch)}_prior-{prior_scale:02.2f}" - if trial_num is None: - path = f"nn_align/{align_corpus.name}/{train_job.name}/{exp_full}" - else: - path = f"nn_align/{align_corpus.name}/{train_job.name}/trial-{trial_num:03d}/{exp_full}" + path = f"nn_align/{align_corpus.name}/{train_job.name}/trial-{trial_num:03d}/{exp_full}" align.set_vis_name(f"Alignment {path}") align.add_alias(path) diff --git a/users/berger/systems/functors/alignment/optuna_seq2seq_alignment.py b/users/berger/systems/functors/alignment/optuna_seq2seq_alignment.py index c9ad662e6..8f32f7f54 100644 --- a/users/berger/systems/functors/alignment/optuna_seq2seq_alignment.py +++ b/users/berger/systems/functors/alignment/optuna_seq2seq_alignment.py @@ -26,7 +26,7 @@ def __call__( align_config: returnn.OptunaReturnnConfig, 
align_corpus: dataclasses.NamedCorpusInfo, epoch: types.EpochType, - trial_num: types.TrialType = "best", + trial_num: int, prior_scale: float = 0, prior_args: Dict = {}, label_unit: str = "phoneme", @@ -88,11 +88,7 @@ def __call__( ) exp_full = f"align_e-{self._get_epoch_string(epoch)}_prior-{prior_scale:02.2f}" - - if trial_num == "best": - path = f"nn_align/{align_corpus.name}/{train_job.name}/trial-{trial_num}/{exp_full}" - else: - path = f"nn_align/{align_corpus.name}/{train_job.name}/trial-{trial_num:03d}/{exp_full}" + path = f"nn_align/{align_corpus.name}/{train_job.name}/trial-{trial_num:03d}/{exp_full}" align.set_vis_name(f"Alignment {path}") align.add_alias(path) diff --git a/users/berger/systems/functors/alignment/seq2seq_alignment.py b/users/berger/systems/functors/alignment/seq2seq_alignment.py index 389dccfe6..8b99aaf35 100644 --- a/users/berger/systems/functors/alignment/seq2seq_alignment.py +++ b/users/berger/systems/functors/alignment/seq2seq_alignment.py @@ -6,6 +6,7 @@ from i6_experiments.users.berger.recipe import mm from i6_core.lexicon.allophones import DumpStateTyingJob, StoreAllophonesJob from sisyphus import tk +from i6_experiments.users.berger.recipe.returnn.training import Backend, get_backend from ... import dataclasses from ... 
import types @@ -32,6 +33,7 @@ def __call__( label_scorer_args: Dict = {}, feature_type: dataclasses.FeatureType = dataclasses.FeatureType.SAMPLES, flow_args: Dict = {}, + model_flow_args: Dict = {}, silence_phone: str = "", register_output: bool = False, **kwargs, @@ -46,15 +48,9 @@ def __call__( align_corpus.corpus_info, feature_type=feature_type, **flow_args ) - tf_graph = self._make_tf_graph( - train_job=train_job.job, - returnn_config=align_config, - epoch=epoch, - label_scorer_type=label_scorer_type, - ) + backend = get_backend(align_config) checkpoint = self._get_checkpoint(train_job=train_job.job, epoch=epoch) - assert isinstance(checkpoint, returnn.Checkpoint) if label_scorer_args.get("use_prior", False) and prior_scale: prior_file = self._get_prior_file(prior_config=prior_config, checkpoint=checkpoint, **prior_args) @@ -65,12 +61,37 @@ def __call__( label_scorer = custom_rasr.LabelScorer(label_scorer_type, **mod_label_scorer_args) - feature_flow = self._get_tf_feature_flow_for_label_scorer( - label_scorer=label_scorer, - base_feature_flow=base_feature_flow, - tf_graph=tf_graph, - checkpoint=checkpoint, - ) + if backend == Backend.TENSORFLOW: + tf_graph = self._make_tf_graph( + train_job=train_job.job, + returnn_config=align_config, + epoch=epoch, + label_scorer_type=label_scorer_type, + ) + assert isinstance(checkpoint, returnn.Checkpoint) + + feature_flow = self._get_tf_feature_flow_for_label_scorer( + label_scorer=label_scorer, + base_feature_flow=base_feature_flow, + tf_graph=tf_graph, + checkpoint=checkpoint, + **model_flow_args, + ) + elif backend == Backend.PYTORCH: + assert isinstance(checkpoint, returnn.PtCheckpoint) + onnx_model = self._make_onnx_model( + returnn_config=align_config, + checkpoint=checkpoint, + ) + feature_flow = self._get_onnx_feature_flow_for_label_scorer( + label_scorer=label_scorer, + base_feature_flow=base_feature_flow, + onnx_model=onnx_model, + feature_type=feature_type, + **model_flow_args, + ) + else: + raise 
NotImplementedError align = mm.Seq2SeqAlignmentJob( crp=crp, diff --git a/users/berger/systems/functors/optuna_rasr_base.py b/users/berger/systems/functors/optuna_rasr_base.py index a84edf950..c80120031 100644 --- a/users/berger/systems/functors/optuna_rasr_base.py +++ b/users/berger/systems/functors/optuna_rasr_base.py @@ -17,15 +17,12 @@ def _get_epoch_value( self, train_job: returnn.OptunaReturnnTrainingJob, epoch: types.EpochType, - trial_num: types.TrialType, + trial_num: int, ) -> Union[int, tk.Variable]: if epoch != "best": return epoch - if trial_num == "best": - lr = train_job.out_learning_rates - else: - lr = train_job.out_trial_learning_rates[trial_num] + lr = train_job.out_trial_learning_rates[trial_num] return returnn.GetBestEpochJob(lr).out_epoch def _make_tf_graph( @@ -33,13 +30,10 @@ def _make_tf_graph( train_job: returnn.OptunaReturnnTrainingJob, returnn_config: returnn.OptunaReturnnConfig, epoch: types.EpochType, + trial_num: int, label_scorer_type: str = "precomputed-log-posterior", - trial_num: types.TrialType = "best", ) -> tk.Path: - if trial_num == "best": - trial = train_job.out_best_trial - else: - trial = train_job.out_trials[trial_num] + trial = train_job.out_trials[trial_num] rec_step_by_step = "output" if self._is_autoregressive_decoding(label_scorer_type) else None graph_compile_job = returnn.OptunaCompileTFGraphJob( returnn_config, @@ -57,17 +51,15 @@ def _make_onnx_model( train_job: returnn.OptunaReturnnTrainingJob, returnn_config: returnn.OptunaReturnnConfig, checkpoint: returnn.PtCheckpoint, - trial_num: types.TrialType = "best", + trial_num: int, ) -> tk.Path: - if trial_num == "best": - trial = train_job.out_best_trial - else: - trial = train_job.out_trials[trial_num] + trial = train_job.out_trials[trial_num] - onnx_export_job = returnn.OptunaExportPyTorchModelToOnnxJob( - pytorch_checkpoint=checkpoint, + onnx_export_job = returnn.OptunaTorchOnnxExportJob( returnn_config=returnn_config, + checkpoint=checkpoint, trial=trial, + 
returnn_python_exe=self.returnn_python_exe, returnn_root=self.returnn_root, ) return onnx_export_job.out_onnx_model @@ -77,18 +69,9 @@ def _get_checkpoint( self, train_job: returnn.OptunaReturnnTrainingJob, epoch: types.EpochType, - trial_num: types.TrialType = "best", + trial_num: int, backend: Backend = Backend.TENSORFLOW, ) -> Checkpoint: - if trial_num == "best": - if epoch == "best": - return returnn.GetBestCheckpointJob( - model_dir=train_job.out_model_dir, - learning_rates=train_job.out_learning_rates, - backend=backend, - ).out_checkpoint - return train_job.out_checkpoints[epoch] - if epoch == "best": return returnn.GetBestCheckpointJob( model_dir=train_job.out_trial_model_dir[trial_num], @@ -102,14 +85,11 @@ def _get_prior_file( train_job: returnn.OptunaReturnnTrainingJob, prior_config: returnn.OptunaReturnnConfig, checkpoint: Checkpoint, - trial_num: types.TrialType = "best", + trial_num: int, backend: Backend = Backend.TENSORFLOW, **kwargs, ) -> tk.Path: - if trial_num == "best": - trial = train_job.out_best_trial - else: - trial = train_job.out_trials[trial_num] + trial = train_job.out_trials[trial_num] if backend == backend.TENSORFLOW: prior_job = returnn.OptunaReturnnComputePriorJob( model_checkpoint=checkpoint, @@ -134,14 +114,3 @@ def _get_prior_file( return forward_job.out_prior_xml_file else: raise NotImplementedError - - @lru_cache_with_signature - def _get_trial_value( - self, - train_job: returnn.OptunaReturnnTrainingJob, - trial_num: types.TrialType = "best", - ) -> Union[int, tk.Variable]: - if trial_num != "best": - return trial_num - - return train_job.out_best_trial_num diff --git a/users/berger/systems/functors/recognition/optuna_legacy_search.py b/users/berger/systems/functors/recognition/optuna_legacy_search.py index 9d3e9343b..ae367cc10 100644 --- a/users/berger/systems/functors/recognition/optuna_legacy_search.py +++ b/users/berger/systems/functors/recognition/optuna_legacy_search.py @@ -1,13 +1,13 @@ import copy import itertools 
-from typing import Dict, List, Optional +from typing import Dict, List from i6_core import mm, rasr, recognition -from i6_experiments.users.berger.recipe import returnn from sisyphus import tk -from ... import dataclasses -from ... import types +from i6_experiments.users.berger.recipe import returnn + +from ... import dataclasses, types from ..base import RecognitionFunctor from ..optuna_rasr_base import OptunaRasrFunctor @@ -25,7 +25,7 @@ def __call__( num_classes: int, epochs: List[types.EpochType], lm_scales: List[float], - trial_nums: List[Optional[int]] = [None], + trial_nums: List[int], prior_scales: List[float] = [0], pronunciation_scales: List[float] = [0], prior_args: Dict = {}, @@ -117,8 +117,8 @@ def __call__( dataclasses.SummaryKey.TRAIN_NAME.value: train_job.name, dataclasses.SummaryKey.RECOG_NAME.value: recog_config.name, dataclasses.SummaryKey.CORPUS.value: recog_corpus.name, - dataclasses.SummaryKey.TRIAL.value: self._get_trial_value(train_job.job, trial_num), - dataclasses.SummaryKey.EPOCH.value: self._get_epoch_value(train_job.job, epoch), + dataclasses.SummaryKey.TRIAL.value: trial_num, + dataclasses.SummaryKey.EPOCH.value: self._get_epoch_value(train_job.job, epoch, trial_num), dataclasses.SummaryKey.PRON.value: pronunciation_scale, dataclasses.SummaryKey.PRIOR.value: prior_scale, dataclasses.SummaryKey.LM.value: lm_scale, diff --git a/users/berger/systems/functors/recognition/optuna_seq2seq_search.py b/users/berger/systems/functors/recognition/optuna_seq2seq_search.py index c7ba7fc90..90d3ad5b6 100644 --- a/users/berger/systems/functors/recognition/optuna_seq2seq_search.py +++ b/users/berger/systems/functors/recognition/optuna_seq2seq_search.py @@ -1,11 +1,12 @@ import copy import itertools -from typing import Dict, List +from typing import Dict, List, Optional, Union from i6_experiments.users.berger.recipe import rasr as custom_rasr from i6_experiments.users.berger.recipe import recognition, returnn from 
i6_experiments.users.berger.recipe.returnn.training import Backend from sisyphus import tk +from i6_experiments.users.berger.systems.functors.rasr_base import RecognitionScoringType from ..base import RecognitionFunctor from ..optuna_rasr_base import OptunaRasrFunctor from ..seq2seq_base import Seq2SeqFunctor @@ -22,12 +23,14 @@ def __call__( self, train_job: dataclasses.NamedTrainJob[returnn.OptunaReturnnTrainingJob], prior_config: returnn.OptunaReturnnConfig, - recog_config: dataclasses.NamedConfig[returnn.OptunaReturnnConfig], + recog_config: dataclasses.NamedConfig[ + Union[returnn.OptunaReturnnConfig, dataclasses.EncDecConfig[returnn.OptunaReturnnConfig]] + ], recog_corpus: dataclasses.NamedCorpusInfo, lookahead_options: Dict, epochs: List[types.EpochType], + trial_nums: List[int], lm_scales: List[float] = [0], - trial_nums: List[types.TrialType] = ["best"], prior_scales: List[float] = [0], prior_args: Dict = {}, lattice_to_ctm_kwargs: Dict = {}, @@ -37,7 +40,12 @@ def __call__( label_scorer_args: Dict = {}, feature_type: dataclasses.FeatureType = dataclasses.FeatureType.SAMPLES, flow_args: Dict = {}, + model_flow_args: Dict = {}, backend: Backend = Backend.TENSORFLOW, + recognition_scoring_type=RecognitionScoringType.Lattice, + rqmt_update: Optional[dict] = None, + search_stats: bool = False, + seq2seq_v2: bool = False, **kwargs, ) -> List[Dict]: assert recog_corpus is not None @@ -60,6 +68,8 @@ def __call__( recog_results = [] + out_scores = {trial_num: [] for trial_num in trial_nums} + for lm_scale, prior_scale, epoch, trial_num in itertools.product(lm_scales, prior_scales, epochs, trial_nums): checkpoint = self._get_checkpoint(train_job.job, epoch, trial_num=trial_num, backend=backend) @@ -82,6 +92,7 @@ def __call__( label_scorer = custom_rasr.LabelScorer(label_scorer_type, **mod_label_scorer_args) if backend == Backend.TENSORFLOW: + assert isinstance(recog_config.config, returnn.OptunaReturnnConfig) tf_graph = self._make_tf_graph( train_job=train_job.job, 
returnn_config=recog_config.config, @@ -96,45 +107,84 @@ def __call__( base_feature_flow=base_feature_flow, tf_graph=tf_graph, checkpoint=checkpoint, + feature_type=feature_type, + **model_flow_args, ) elif backend == Backend.PYTORCH: assert isinstance(checkpoint, returnn.PtCheckpoint) - onnx_model = self._make_onnx_model( - train_job=train_job.job, - returnn_config=recog_config.config, - checkpoint=checkpoint, - trial_num=trial_num, - ) - feature_flow = self._get_onnx_feature_flow_for_label_scorer( + if isinstance(recog_config.config, returnn.OptunaReturnnConfig): + onnx_model = self._make_onnx_model( + train_job=train_job.job, + returnn_config=recog_config.config, + checkpoint=checkpoint, + trial_num=trial_num, + ) + feature_flow = self._get_onnx_feature_flow_for_label_scorer( + label_scorer=label_scorer, + base_feature_flow=base_feature_flow, + onnx_model=onnx_model, + feature_type=feature_type, + **model_flow_args, + ) + else: + enc_model = self._make_onnx_model( + train_job=train_job.job, + returnn_config=recog_config.config.encoder_config, + checkpoint=checkpoint, + trial_num=trial_num, + ) + dec_model = self._make_onnx_model( + train_job=train_job.job, + returnn_config=recog_config.config.decoder_config, + checkpoint=checkpoint, + trial_num=trial_num, + ) + feature_flow = self._get_onnx_feature_flow_for_label_scorer( + label_scorer=label_scorer, + base_feature_flow=base_feature_flow, + enc_onnx_model=enc_model, + dec_onnx_model=dec_model, + feature_type=feature_type, + **model_flow_args, + ) + else: + raise NotImplementedError + + if seq2seq_v2: + rec = recognition.GenericSeq2SeqSearchJobV2( + crp=crp, + feature_flow=feature_flow, label_scorer=label_scorer, - base_feature_flow=base_feature_flow, - onnx_model=onnx_model, + label_tree=label_tree, + lookahead_options=lookahead_options, + **kwargs, ) else: - raise NotImplementedError + rec = recognition.GenericSeq2SeqSearchJob( + crp=crp, + feature_flow=feature_flow, + label_scorer=label_scorer, + 
label_tree=label_tree, + lookahead_options=lookahead_options, + **kwargs, + ) - rec = recognition.GenericSeq2SeqSearchJob( - crp=crp, - feature_flow=feature_flow, - label_scorer=label_scorer, - label_tree=label_tree, - lookahead_options=lookahead_options, - **kwargs, - ) + if rqmt_update is not None: + rec.rqmt.update(rqmt_update) - exp_full = ( - f"{recog_config.name}_e-{self._get_epoch_string(epoch)}_prior-{prior_scale:02.2f}_lm-{lm_scale:02.2f}" - ) + exp_full = f"{recog_config.name}_e-{self._get_epoch_string(epoch)}" + if prior_scale != 0: + exp_full += f"_prior-{prior_scale:02.2f}" + if lm_scale != 0: + exp_full += f"_lm-{lm_scale:02.2f}" - if trial_num == "best": - path = f"nn_recog/{recog_corpus.name}/{train_job.name}/trial-{trial_num}/{exp_full}" - else: - path = f"nn_recog/{recog_corpus.name}/{train_job.name}/trial-{trial_num:03d}/{exp_full}" + path = f"nn_recog/{recog_corpus.name}/{train_job.name}/trial-{trial_num:03d}/{exp_full}" rec.set_vis_name(f"Recog {path}") rec.add_alias(path) - scorer_job = self._lattice_scoring( + scorer_job = self._score_recognition_output( + recognition_scoring_type=recognition_scoring_type, crp=crp, lattice_bundle=rec.out_lattice_bundle, scorer=recog_corpus.corpus_info.scorer, @@ -145,15 +195,36 @@ def __call__( scorer_job.out_report_dir, ) + out_scores[trial_num].append( + returnn.OptunaReportIntermediateScoreJob( + trial_num=trial_num, + step=epoch, + score=scorer_job.out_wer, + study_name=train_job.job.study_name, + study_storage=train_job.job.study_storage, + ).out_reported_score + ) + + rtf = None + if search_stats: + stats_job = recognition.ExtractSeq2SeqSearchStatisticsJob( + search_logs=list(rec.out_log_file.values()), + corpus_duration_hours=recog_corpus.corpus_info.data.corpus_object.duration, + ) + rtf = stats_job.overall_rtf + + tk.register_output(f"{path}.rtf", rtf) + recog_results.append( { dataclasses.SummaryKey.TRAIN_NAME.value: train_job.name, dataclasses.SummaryKey.RECOG_NAME.value: recog_config.name, 
dataclasses.SummaryKey.CORPUS.value: recog_corpus.name, - dataclasses.SummaryKey.TRIAL.value: self._get_trial_value(train_job.job, trial_num), + dataclasses.SummaryKey.TRIAL.value: trial_num, dataclasses.SummaryKey.EPOCH.value: self._get_epoch_value(train_job.job, epoch, trial_num), dataclasses.SummaryKey.PRIOR.value: prior_scale, dataclasses.SummaryKey.LM.value: lm_scale, + dataclasses.SummaryKey.RTF.value: rtf, dataclasses.SummaryKey.WER.value: scorer_job.out_wer, dataclasses.SummaryKey.SUB.value: scorer_job.out_percent_substitution, dataclasses.SummaryKey.DEL.value: scorer_job.out_percent_deletions, @@ -162,4 +233,18 @@ def __call__( } ) + path = f"nn_recog/{recog_corpus.name}/{train_job.name}/trial-{trial_num:03d}/{exp_full}" + + for trial_num in trial_nums: + final_score = returnn.OptunaReportFinalScoreJob( + trial_num=trial_num, + scores=out_scores[trial_num], + study_name=train_job.job.study_name, + study_storage=train_job.job.study_storage, + ).out_reported_score + tk.register_output( + f"optuna/{recog_corpus.name}/{train_job.name}/trial-{trial_num:03d}/best_wer", + value=final_score, + ) + return recog_results diff --git a/users/berger/systems/functors/recognition/seq2seq_search.py b/users/berger/systems/functors/recognition/seq2seq_search.py index 0667834c4..5a806b0d0 100644 --- a/users/berger/systems/functors/recognition/seq2seq_search.py +++ b/users/berger/systems/functors/recognition/seq2seq_search.py @@ -158,9 +158,11 @@ def __call__( if rqmt_update is not None: rec.rqmt.update(rqmt_update) - exp_full = ( - f"{recog_config.name}_e-{self._get_epoch_string(epoch)}_prior-{prior_scale:02.2f}_lm-{lm_scale:02.2f}" - ) + exp_full = f"{recog_config.name}_e-{self._get_epoch_string(epoch)}" + if prior_scale != 0: + exp_full += f"_prior-{prior_scale:02.2f}" + if lm_scale != 0: + exp_full += f"_lm-{lm_scale:02.2f}" path = f"nn_recog/{recog_corpus.name}/{train_job.name}/{exp_full}" diff --git a/users/berger/systems/functors/seq2seq_base.py 
b/users/berger/systems/functors/seq2seq_base.py index 46f95b3a2..dfa521efa 100644 --- a/users/berger/systems/functors/seq2seq_base.py +++ b/users/berger/systems/functors/seq2seq_base.py @@ -1,13 +1,14 @@ -from abc import ABC import copy +from abc import ABC from i6_core import lexicon, rasr, returnn +from sisyphus import tk + from i6_experiments.users.berger.recipe import rasr as custom_rasr from i6_experiments.users.berger.util import lru_cache_with_signature -from sisyphus import tk -from .rasr_base import RasrFunctor from ..dataclasses import FeatureType +from .rasr_base import RasrFunctor class Seq2SeqFunctor(RasrFunctor, ABC): @@ -38,7 +39,7 @@ def _make_onnx_enc_dec_config_for_label_scorer( enc_features_size: str = "sources:size1", enc_output_name: str = "source_encodings", dec_features_name: str = "source_encodings", - dec_history_name: str = "targets", + dec_history_name: str = "history", dec_output_name: str = "log_probs", ) -> None: encoder_io_map = rasr.RasrConfig() diff --git a/users/berger/systems/functors/training/optuna_returnn_training.py b/users/berger/systems/functors/training/optuna_returnn_training.py index 820ee813b..b8aeca6d7 100644 --- a/users/berger/systems/functors/training/optuna_returnn_training.py +++ b/users/berger/systems/functors/training/optuna_returnn_training.py @@ -28,6 +28,7 @@ def __call__( ) train_job.add_alias(f"train_nn/{train_config.name}") - tk.register_output(f"train_nn/{train_config.name}/learning_rate.png", train_job.out_plot_lr) + for trial_num, learning_rate_file in train_job.out_trial_learning_rates.items(): + tk.register_output(f"train_nn/{train_config.name}/trial-{trial_num:03d}/learning_rates", learning_rate_file) return train_job diff --git a/users/berger/systems/types.py b/users/berger/systems/types.py index 10601be47..e4ca32f4d 100644 --- a/users/berger/systems/types.py +++ b/users/berger/systems/types.py @@ -6,7 +6,6 @@ ScoreJobType = Union[Type[recognition.ScliteJob], Type[recognition.Hub5ScoreJob], 
Type[MeetEvalJob]] ScoreJob = Union[recognition.ScliteJob, recognition.Hub5ScoreJob] EpochType = Union[int, Literal["best"]] -TrialType = Union[int, Literal["best"]] CheckpointType = Union[returnn.Checkpoint, returnn.PtCheckpoint] ConfigType = TypeVar("ConfigType") From 747e8a45acf746312e97d36272b4a2c3a17ac2f3 Mon Sep 17 00:00:00 2001 From: Albert Zeyer Date: Tue, 11 Jun 2024 09:19:12 +0200 Subject: [PATCH 154/227] more --- users/zeyer/experiments/exp2024_04_23_baselines/ctc.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py b/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py index e4ef939e8..2abdb5b21 100644 --- a/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py +++ b/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py @@ -78,7 +78,7 @@ def py(): (5, 1e-2), # (1, 1e-4), # 9.24 (1, 1e-3), - (1, 1e-2), + (1, 1e-2), # 8.16 ]: train_exp( f"v6-bhv20-11gb-f32-bs15k-accgrad{acc}" @@ -174,7 +174,7 @@ def py(): }, ) - train_exp( + train_exp( # 7.36 (vs without EOS 6.99), so EOS made it worse "v6-bhv20-11gb-f32-bs15k-accgrad1-mgpu4-pavg100-wd1e_2-lrlin1e_5_295k-speedpertV2-spm10k-eos-spmSample07", config_11gb_v6_f32_accgrad1_mgpu4_pavg100_wd1e_4, config_updates={ @@ -199,6 +199,7 @@ def py(): # featBN: {"dev-clean": 3.63, "dev-other": 6.96, "test-clean": 3.82, "test-other": 7.15} "featBN": {"feature_batch_norm": True}, # batch norm "featNorm": {"feature_norm": True}, # normalize (on sequence level) + # featGN: {"dev-clean": 3.65, "dev-other": 7.04, "test-clean": 3.82, "test-other": 7.27} "featGN": {"feature_stats": {"mean": feature_stats.mean, "std_dev": feature_stats.std_dev}}, # global norm }.items(): train_exp( From 411b58a8397bc48cee8b6fb0a4c3f95796b4bb23 Mon Sep 17 00:00:00 2001 From: Albert Zeyer Date: Tue, 11 Jun 2024 21:12:05 +0200 Subject: [PATCH 155/227] more --- users/zeyer/experiments/exp2024_04_23_baselines/ctc.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) 
diff --git a/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py b/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py index 2abdb5b21..f0882934c 100644 --- a/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py +++ b/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py @@ -77,7 +77,7 @@ def py(): (5, 1e-3), (5, 1e-2), # (1, 1e-4), # 9.24 - (1, 1e-3), + (1, 1e-3), # 9.19 (1, 1e-2), # 8.16 ]: train_exp( @@ -198,6 +198,7 @@ def py(): None: None, # {"dev-clean": 3.69, "dev-other": 6.99, "test-clean": 3.83, "test-other": 7.32} # featBN: {"dev-clean": 3.63, "dev-other": 6.96, "test-clean": 3.82, "test-other": 7.15} "featBN": {"feature_batch_norm": True}, # batch norm + # featNorm: {"dev-clean": 3.65, "dev-other": 6.97, "test-clean": 3.74, "test-other": 7.34} "featNorm": {"feature_norm": True}, # normalize (on sequence level) # featGN: {"dev-clean": 3.65, "dev-other": 7.04, "test-clean": 3.82, "test-other": 7.27} "featGN": {"feature_stats": {"mean": feature_stats.mean, "std_dev": feature_stats.std_dev}}, # global norm From 365c3b22771022844762ff3953046c4b63ce810e Mon Sep 17 00:00:00 2001 From: Albert Zeyer Date: Wed, 12 Jun 2024 09:18:21 +0200 Subject: [PATCH 156/227] more --- users/zeyer/experiments/exp2024_04_23_baselines/ctc.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py b/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py index f0882934c..1a50460e1 100644 --- a/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py +++ b/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py @@ -74,8 +74,8 @@ def py(): for acc, wd in [ # (5, 1e-5), # 9.90 - (5, 1e-3), - (5, 1e-2), + (5, 1e-3), # 9.53 + (5, 1e-2), # 9.23 # (1, 1e-4), # 9.24 (1, 1e-3), # 9.19 (1, 1e-2), # 8.16 From c55fefaa53dcdf9b7c4806733dc839d848671d26 Mon Sep 17 00:00:00 2001 From: Albert Zeyer Date: Thu, 13 Jun 2024 00:36:01 +0200 Subject: [PATCH 157/227] more --- .../exp2024_04_23_baselines/ctc.py | 20 
+++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py b/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py index 1a50460e1..2963c7ec6 100644 --- a/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py +++ b/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py @@ -139,20 +139,28 @@ def py(): # Testing different vocabs together with sampling. for vocab, alpha in [ # See archive/returnn-spm10-sample.config for playing around with alpha and checking avg seq len. - ("spm10k", 0.3), # 7.88 - ("spm10k", 0.5), # 7.13 + # The lower the alpha, the longer the seq len, i.e. the more aggressive the sampling. + # spm10k no sampling: 8.12 + ("spm10k", 0.8), # 7.08 ("spm10k", 0.7), # 6.99 - ("spm10k", 0.8), + ("spm10k", 0.5), # 7.13 + ("spm10k", 0.3), # 7.88 # alpha for SPM-BPE has a very different effect, and it causes the seq len to be much longer. # The higher the alpha, the longer (the reverse as for SPM Unigram). # See archive/returnn-spm_bpe10-sample.config. - ("spm_bpe10k", 0.005), - ("spm_bpe10k", 0.01), + # spm_bpe10k no sampling: 7.97 + ("spm_bpe10k", 0.001), + ("spm_bpe10k", 0.005), # 8.66 + ("spm_bpe10k", 0.01), # 8.99 # ("spm_bpe10k", 0.3), # broken # ("spm_bpe10k", 0.7), # broken # alpha for BPE is again a bit different, but more similar to SPM-BPE than SPM-Unigram. # See archive/returnn-bpe10-sample.config. - ("bpe10k", 0.01), + # The higher the alpha, the longer the sequence, i.e. the more aggressive the sampling. 
+ # bpe10k no sampling: 8.23 + ("bpe10k", 0.005), + ("bpe10k", 0.01), # 7.10 + ("bpe10k", 0.02), ]: train_exp( f"v6-bhv20-11gb-f32-bs15k-accgrad1-mgpu4-pavg100-wd1e_2-lrlin1e_5_295k-speedpertV2-{vocab}" From c35b50a8f05cb854c3c94d1ba256cd26322a12df Mon Sep 17 00:00:00 2001 From: Albert Zeyer Date: Thu, 13 Jun 2024 00:41:51 +0200 Subject: [PATCH 158/227] more --- .../exp2024_04_23_baselines/aed.py | 43 ++++++++++++------- 1 file changed, 28 insertions(+), 15 deletions(-) diff --git a/users/zeyer/experiments/exp2024_04_23_baselines/aed.py b/users/zeyer/experiments/exp2024_04_23_baselines/aed.py index 8099b81df..7a3354ec3 100644 --- a/users/zeyer/experiments/exp2024_04_23_baselines/aed.py +++ b/users/zeyer/experiments/exp2024_04_23_baselines/aed.py @@ -85,21 +85,28 @@ def py(): vocab=vocab, ) - # Testing sampling in SPM. Baseline without sampling: 5.24 dev-other. - # The lower the alpha, the more aggressive the sampling. - # alpha=0.1 seems too aggressive for AED, bad convergence - for alpha in [ - 0.3, # 5.26 - 0.5, # 5.13 - 0.6, # 5.13 - 0.7, # 4.98 (!!) - 0.8, # 5.14 - 0.9, # 5.18 - 1.0, # 5.35. sanity check, should be like baseline (5.16), could be attributed to randomness? + for vocab, alpha in [ + # Testing sampling in SPM. + # The lower the alpha, the more aggressive the sampling. + # See archive/returnn-spm10-sample.config for playing around with alpha and checking avg seq len. + # spm10k without sampling: 5.24 dev-other + ("spm10k", 1.0), # 5.35. sanity check, should be like baseline (5.16), could be attributed to randomness? + ("spm10k", 0.9), # 5.18 + ("spm10k", 0.8), # 5.14 + ("spm10k", 0.7), # 4.98 (!!) + ("spm10k", 0.6), # 5.13 + ("spm10k", 0.5), # 5.13 + ("spm10k", 0.3), # 5.26 + # spm10k, alpha=0.1: seems too aggressive for AED, bad convergence + # alpha for BPE is again a bit different, but more similar to SPM-BPE than SPM-Unigram. + # See archive/returnn-bpe10-sample.config. + # The higher the alpha, the longer the sequence, i.e. 
the more aggressive the sampling. + # bpe10k without sampling: 5.32 + ("bpe10k", 0.01), ]: train_exp( - f"v6-bhv20-11gb-f32-bs15k-accgrad1-mgpu4-pavg100-wd1e_2-lrlin1e_5_295k-speedpertV2-spm10k" - f"-spmSample{str(alpha).replace('.', '')}", + f"v6-bhv20-11gb-f32-bs15k-accgrad1-mgpu4-pavg100-wd1e_2-lrlin1e_5_295k-speedpertV2-{vocab}" + f"-{'spmSample' if vocab.startswith('spm') else 'bpeSample'}{str(alpha).replace('.', '')}", config_11gb_v6_f32_accgrad1_mgpu4_pavg100_wd1e_4, config_updates={ **_get_cfg_lrlin_oclr_by_bs_nep(15_000, 500), @@ -107,8 +114,14 @@ def py(): "__train_audio_preprocess": speed_pert_librosa_config, "speed_pert_discrete_values": [0.7, 0.8, 0.9, 1.0, 1.1], }, - vocab="spm10k", - train_vocab_opts={"other_opts": {"enable_sampling": True, "alpha": alpha}}, + vocab=vocab, + train_vocab_opts={ + "other_opts": ( + {"enable_sampling": True, "alpha": alpha} + if vocab.startswith("spm") + else {"class": "SamplingBytePairEncoding", "breadth_prob": alpha} + ) + }, ) From 69d746504520b1535b9a5c0fd0f236aa78aee740 Mon Sep 17 00:00:00 2001 From: Albert Zeyer Date: Thu, 13 Jun 2024 09:14:34 +0200 Subject: [PATCH 159/227] more --- users/zeyer/experiments/exp2024_04_23_baselines/aed.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/users/zeyer/experiments/exp2024_04_23_baselines/aed.py b/users/zeyer/experiments/exp2024_04_23_baselines/aed.py index 7a3354ec3..17e26e500 100644 --- a/users/zeyer/experiments/exp2024_04_23_baselines/aed.py +++ b/users/zeyer/experiments/exp2024_04_23_baselines/aed.py @@ -69,9 +69,12 @@ def py(): ) for vocab in [ + "spm20k", "bpe10k", # 5.32 "spm10k", # 5.16 "spm_bpe10k", # 5.21 + "spm4k", + "spm1k", ]: train_exp( # 5.16 f"v6-bhv20-11gb-f32-bs15k-accgrad1-mgpu4-pavg100-wd1e_2-lrlin1e_5_295k-speedpertV2-{vocab}", From 3a71ddc004cee23accba5a66e7d68f2e24788f57 Mon Sep 17 00:00:00 2001 From: Albert Zeyer Date: Thu, 13 Jun 2024 10:09:22 +0200 Subject: [PATCH 160/227] prepare for some more modeling code --- users/zeyer/nn_rf/README.md | 
5 ++++ users/zeyer/nn_rf/__init__.py | 0 users/zeyer/nn_rf/frontend.py | 36 ++++++++++++++++++++++++++++ users/zeyer/returnn/models/README.md | 3 +++ 4 files changed, 44 insertions(+) create mode 100644 users/zeyer/nn_rf/README.md create mode 100644 users/zeyer/nn_rf/__init__.py create mode 100644 users/zeyer/nn_rf/frontend.py create mode 100644 users/zeyer/returnn/models/README.md diff --git a/users/zeyer/nn_rf/README.md b/users/zeyer/nn_rf/README.md new file mode 100644 index 000000000..7e963a8cd --- /dev/null +++ b/users/zeyer/nn_rf/README.md @@ -0,0 +1,5 @@ +Putting some custom neural network modeling code here, +specifically using RETURNN frontend (RF) +(although the code could also be used outside of RETURNN). + +Some more modeling code is in `..returnn.models`. diff --git a/users/zeyer/nn_rf/__init__.py b/users/zeyer/nn_rf/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/users/zeyer/nn_rf/frontend.py b/users/zeyer/nn_rf/frontend.py new file mode 100644 index 000000000..5c6328b43 --- /dev/null +++ b/users/zeyer/nn_rf/frontend.py @@ -0,0 +1,36 @@ +""" +Custom frontends. + +In RF `ConformerEncoder`, that would be an argument for `input_layer`. + +Default we used in the past:: + + input_layer=ConformerConvSubsample( + in_dim, + out_dims=[Dim(32, name="conv1"), Dim(64, name="conv2"), Dim(64, name="conv3")], + filter_sizes=[(3, 3), (3, 3), (3, 3)], + pool_sizes=[(1, 2)], + strides=[(1, 1), (3, 1), (2, 1)], + ), + +This uses a downsampling factor of 6. +""" + +from typing import Tuple +from returnn.tensor import Dim +from returnn.frontend.encoder.conformer import ConformerConvSubsample +from returnn.frontend.encoder.base import ISeqDownsamplingEncoder + + +# TODO how to serialize variants? 
+ + +def get_default(*, in_dim: Dim, time_strides: Tuple[int, int, int] = (1, 3, 2)) -> ISeqDownsamplingEncoder: + assert len(time_strides) == 3 + return ConformerConvSubsample( + in_dim, + out_dims=[Dim(32, name="conv1"), Dim(64, name="conv2"), Dim(64, name="conv3")], + filter_sizes=[(3, 3), (3, 3), (3, 3)], + strides=[(s, 1) for s in time_strides], + pool_sizes=[(1, 2)], + ) diff --git a/users/zeyer/returnn/models/README.md b/users/zeyer/returnn/models/README.md new file mode 100644 index 000000000..b0222aa37 --- /dev/null +++ b/users/zeyer/returnn/models/README.md @@ -0,0 +1,3 @@ +Some modeling code. + +Note, more is in module `...nn_rf` and `...nn_tf`. From 8c4d63f3012ae9abdbc3b8d426177a33232afcb4 Mon Sep 17 00:00:00 2001 From: Albert Zeyer Date: Thu, 13 Jun 2024 10:12:15 +0200 Subject: [PATCH 161/227] move SequentialLayerDrop --- users/zeyer/nn_rf/layerdrop.py | 37 ++++++++++++++++++++++ users/zeyer/returnn/models/rf_layerdrop.py | 35 ++------------------ 2 files changed, 40 insertions(+), 32 deletions(-) create mode 100644 users/zeyer/nn_rf/layerdrop.py diff --git a/users/zeyer/nn_rf/layerdrop.py b/users/zeyer/nn_rf/layerdrop.py new file mode 100644 index 000000000..5eeb0fb57 --- /dev/null +++ b/users/zeyer/nn_rf/layerdrop.py @@ -0,0 +1,37 @@ +""" +Layerdrop, the most simple way, like ESPnet + +https://github.com/espnet/espnet/blob/7c140c2ac9b4f642acb36131217dd984d4601681/espnet2/asr/encoder/conformer_encoder.py#L278 +https://github.com/espnet/espnet/blob/7c140c2ac9b4f642acb36131217dd984d4601681/espnet/nets/pytorch_backend/transformer/repeat.py#L29 +""" + +from __future__ import annotations +from typing import Optional, Dict +from returnn.tensor import Tensor, Dim +import returnn.frontend as rf + + +class SequentialLayerDrop(rf.Sequential): + def __init__(self, *args, layer_drop: float): + super().__init__(*args) + self.layer_drop = layer_drop + + def __call__(self, inp, *, collected_outputs: Optional[Dict[str, Tensor]] = None, **kwargs) -> Tensor: + def 
_layer_drop_call(): + x = inp + num_layers_dim = Dim(len(self), name="num_layers") + drop_probs = rf.random_uniform([num_layers_dim], device="cpu") + for i, (name, module) in enumerate(self.items()): + x = rf.cond( + rf.gather(drop_probs, indices=i, axis=num_layers_dim) >= self.layer_drop, + lambda: module(x, **kwargs), + lambda: x, + ) + if collected_outputs is not None: + collected_outputs[name] = x + return x + + def _no_layer_drop_call(): + return rf.Sequential.__call__(self, inp, collected_outputs=collected_outputs, **kwargs) + + return rf.cond(rf.get_run_ctx().train_flag, _layer_drop_call, _no_layer_drop_call) diff --git a/users/zeyer/returnn/models/rf_layerdrop.py b/users/zeyer/returnn/models/rf_layerdrop.py index 5eeb0fb57..3e47805dd 100644 --- a/users/zeyer/returnn/models/rf_layerdrop.py +++ b/users/zeyer/returnn/models/rf_layerdrop.py @@ -1,37 +1,8 @@ """ -Layerdrop, the most simple way, like ESPnet - -https://github.com/espnet/espnet/blob/7c140c2ac9b4f642acb36131217dd984d4601681/espnet2/asr/encoder/conformer_encoder.py#L278 -https://github.com/espnet/espnet/blob/7c140c2ac9b4f642acb36131217dd984d4601681/espnet/nets/pytorch_backend/transformer/repeat.py#L29 +Layerdrop """ -from __future__ import annotations -from typing import Optional, Dict -from returnn.tensor import Tensor, Dim -import returnn.frontend as rf - - -class SequentialLayerDrop(rf.Sequential): - def __init__(self, *args, layer_drop: float): - super().__init__(*args) - self.layer_drop = layer_drop - - def __call__(self, inp, *, collected_outputs: Optional[Dict[str, Tensor]] = None, **kwargs) -> Tensor: - def _layer_drop_call(): - x = inp - num_layers_dim = Dim(len(self), name="num_layers") - drop_probs = rf.random_uniform([num_layers_dim], device="cpu") - for i, (name, module) in enumerate(self.items()): - x = rf.cond( - rf.gather(drop_probs, indices=i, axis=num_layers_dim) >= self.layer_drop, - lambda: module(x, **kwargs), - lambda: x, - ) - if collected_outputs is not None: - 
collected_outputs[name] = x - return x +from ...nn_rf.layerdrop import * - def _no_layer_drop_call(): - return rf.Sequential.__call__(self, inp, collected_outputs=collected_outputs, **kwargs) - return rf.cond(rf.get_run_ctx().train_flag, _layer_drop_call, _no_layer_drop_call) +SequentialLayerDrop # noqa From bb6bb4295b53b4a6b03b225c413d82b3f2e1d8e0 Mon Sep 17 00:00:00 2001 From: Albert Zeyer Date: Thu, 13 Jun 2024 10:12:39 +0200 Subject: [PATCH 162/227] better --- users/zeyer/returnn/models/rf_layerdrop.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/users/zeyer/returnn/models/rf_layerdrop.py b/users/zeyer/returnn/models/rf_layerdrop.py index 3e47805dd..282957223 100644 --- a/users/zeyer/returnn/models/rf_layerdrop.py +++ b/users/zeyer/returnn/models/rf_layerdrop.py @@ -5,4 +5,4 @@ from ...nn_rf.layerdrop import * -SequentialLayerDrop # noqa +__all__ = ["SequentialLayerDrop"] From 468537a19fdd4451b309d3c45f6b4af4024db4aa Mon Sep 17 00:00:00 2001 From: Albert Zeyer Date: Thu, 13 Jun 2024 10:14:27 +0200 Subject: [PATCH 163/227] move mixup --- users/zeyer/nn_rf/mixup.py | 177 +++++++++++++++++++++++++ users/zeyer/returnn/models/rf_mixup.py | 175 +----------------------- 2 files changed, 180 insertions(+), 172 deletions(-) create mode 100644 users/zeyer/nn_rf/mixup.py diff --git a/users/zeyer/nn_rf/mixup.py b/users/zeyer/nn_rf/mixup.py new file mode 100644 index 000000000..a0c8cc375 --- /dev/null +++ b/users/zeyer/nn_rf/mixup.py @@ -0,0 +1,177 @@ +""" +Mixup with RF +""" + +from __future__ import annotations +from dataclasses import dataclass +from returnn.tensor import Tensor, Dim +import returnn.frontend as rf + + +@dataclass +class MixupOpts: + """ + Arguments: + buffer_size: number of frames. 
+ apply_prob: probability to apply mixup at all + max_num_mix: maximum number of mixups (random int in [1, max_num_mix]) + lambda_min: minimum lambda value + lambda_max: maximum lambda value + """ + + buffer_size: int = 1_000_000 + apply_prob: float = 1.0 + max_num_mix: int = 4 + lambda_min: float = 0.1 + lambda_max: float = 0.4 + + +class Mixup(rf.Module): + """ + Mixup + """ + + def __init__(self, *, feature_dim: Dim, opts: MixupOpts): + super().__init__() + self.feature_dim = feature_dim + self.opts = opts + self.buffer_size_dim = Dim(opts.buffer_size, name="buffer_size") + self.buffer = rf.Parameter([self.buffer_size_dim, feature_dim], auxiliary=True) + self.buffer_pos = rf.Parameter( + [], dtype="int32", sparse_dim=self.buffer_size_dim, initial=0, auxiliary=True, device="cpu" + ) + self.buffer_filled = rf.Parameter([], dtype="bool", initial=False, auxiliary=True, device="cpu") + + def __call__(self, src: Tensor, *, spatial_dim: Dim) -> Tensor: + if not rf.get_run_ctx().train_flag: + return src + + assert spatial_dim in src.dims and self.feature_dim in src.dims + + # Apply mixup before we add the new data to the buffer. 
+ src_ = self._maybe_apply_mixup(src, spatial_dim=spatial_dim) + + self._append_to_buffer(src, spatial_dim=spatial_dim) + + return src_ + + def _append_to_buffer(self, src: Tensor, *, spatial_dim: Dim): + batch_dims = src.remaining_dims((spatial_dim, self.feature_dim)) + opts = self.opts + + # Fill buffer with new data: + src_flat, src_flat_dim = rf.pack_padded(src, dims=batch_dims + [spatial_dim]) + new_pos = rf.minimum(self.buffer_pos + src_flat_dim.get_size_tensor(), opts.buffer_size) + part_fill_len = new_pos - self.buffer_pos + src_flat_part, src_flat_part_dim = rf.slice(src_flat, axis=src_flat_dim, end=part_fill_len) + self.buffer.assign_key( + axis=self.buffer_size_dim, + key=slice(self.buffer_pos, new_pos), + key_dim=src_flat_part_dim, + value=src_flat_part, + ) + if (self.buffer_pos + src_flat_dim.get_size_tensor() >= opts.buffer_size).raw_tensor: + self.buffer_filled.assign(True) + part_fill_len_ = rf.minimum(src_flat_dim.get_size_tensor() - part_fill_len, opts.buffer_size) + src_flat_part, src_flat_part_dim = rf.slice( + src_flat, axis=src_flat_dim, start=part_fill_len, end=part_fill_len + part_fill_len_ + ) + self.buffer.assign_key( + axis=self.buffer_size_dim, key=slice(0, part_fill_len_), key_dim=src_flat_part_dim, value=src_flat_part + ) + new_pos = part_fill_len_ + self.buffer_pos.assign(new_pos) + + def _maybe_apply_mixup(self, src: Tensor, *, spatial_dim: Dim) -> Tensor: + batch_dims = src.remaining_dims((spatial_dim, self.feature_dim)) + opts = self.opts + + if (rf.random_uniform((), device="cpu") >= opts.apply_prob).raw_tensor: + return src + + buffer_filled_size = rf.where(self.buffer_filled, opts.buffer_size, self.buffer_pos) + if (buffer_filled_size < spatial_dim.get_dim_value_tensor()).raw_tensor: + return src + + # Apply Mixup. Collect all data we are going to add for each sequence. 
+ num_mixup = rf.random_uniform( + batch_dims, minval=1, maxval=opts.max_num_mix + 1, dtype="int32", device="cpu" + ) # [B] + num_mixup_dim = Dim(num_mixup, name="num_mixup") + + buffer_start = rf.random_uniform( + batch_dims + [num_mixup_dim], + maxval=buffer_filled_size - spatial_dim.get_dim_value_tensor() + 1, + dtype="int32", + sparse_dim=self.buffer_size_dim, + ) # [B, N] + n_mask = rf.sequence_mask(num_mixup_dim) # [B, N] + buffer_start_flat, num_mixup_flat_dim = rf.masked_select( + buffer_start, mask=n_mask, dims=batch_dims + [num_mixup_dim] + ) # [B_N'] + + idx = rf.range_over_dim(spatial_dim) # [T] + idx = rf.combine_bc(idx, "+", buffer_start_flat) # [B_N', T] + + mixup_values = rf.gather(self.buffer, indices=idx, axis=self.buffer_size_dim) # [B_N', T, F] + + # Scale the mixup values. + lambda_ = rf.random_uniform( + batch_dims + [num_mixup_dim], minval=opts.lambda_min, maxval=opts.lambda_max, dtype=src.dtype + ) + mixup_scales = rf.random_uniform(batch_dims + [num_mixup_dim], minval=0.001, maxval=1.0, dtype=src.dtype) + mixup_scales *= lambda_ / rf.reduce_sum(mixup_scales, axis=num_mixup_dim) # [B,N] + mixup_scales_flat, _ = rf.masked_select( + mixup_scales, mask=n_mask, dims=batch_dims + [num_mixup_dim], out_dim=num_mixup_flat_dim + ) # [B_N'] + mixup_values *= mixup_scales_flat # [B_N', T, F] + + idx_b = rf.range_over_merged_dims(batch_dims) # [B] -> B + idx_b, _ = rf.masked_select( + idx_b, mask=n_mask, dims=batch_dims + [num_mixup_dim], out_dim=num_mixup_flat_dim + ) # [B_N'] -> B + + mixup_value = rf.scatter( + mixup_values, indices=idx_b, indices_dim=num_mixup_flat_dim, out_dim=batch_dims + ) # [B,T,F] + + src = src + rf.stop_gradient(mixup_value) + return src + + +def _test_mixup(): + import numpy as np + + rf.select_backend_torch() + rf.init_train_step_run_ctx(train_flag=True, step=0) + + batch_dim = Dim(2, name="batch") + time_dim = Dim(rf.convert_to_tensor(np.array([7, 8], dtype="int32"), dims=[batch_dim]), name="time") + feature_dim = Dim(5, 
name="feature") + data = rf.convert_to_tensor( + np.arange(2 * 8 * 5).reshape(2, 8, 5).astype("float32"), dims=[batch_dim, time_dim, feature_dim] + ) + data /= rf.reduce_max(data, axis=time_dim) + print("data:", data, data.raw_tensor) + + mixup = Mixup(feature_dim=feature_dim, opts=MixupOpts(buffer_size=30, lambda_min=1.0, lambda_max=1.0)) + + x = mixup(data, spatial_dim=time_dim) + print("x:", x, x.raw_tensor) + print("buffer:", mixup.buffer, mixup.buffer.raw_tensor) + + batch_dim = Dim(3, name="batch") + time_dim = Dim(rf.convert_to_tensor(np.array([3, 4, 2], dtype="int32"), dims=[batch_dim]), name="time") + data = rf.ones([batch_dim, time_dim, feature_dim]) + x = mixup(data, spatial_dim=time_dim) + print("x':", x, x.raw_tensor) + print("buffer':", mixup.buffer, mixup.buffer.raw_tensor) + + data = -rf.ones([batch_dim, time_dim, feature_dim]) + x = mixup(data, spatial_dim=time_dim) + print("x'':", x, x.raw_tensor) + print("buffer'':", mixup.buffer, mixup.buffer.raw_tensor) + + +if __name__ == "__main__": + _test_mixup() diff --git a/users/zeyer/returnn/models/rf_mixup.py b/users/zeyer/returnn/models/rf_mixup.py index a0c8cc375..2e5272349 100644 --- a/users/zeyer/returnn/models/rf_mixup.py +++ b/users/zeyer/returnn/models/rf_mixup.py @@ -1,177 +1,8 @@ """ -Mixup with RF +Mixup """ -from __future__ import annotations -from dataclasses import dataclass -from returnn.tensor import Tensor, Dim -import returnn.frontend as rf +from ...nn_rf.mixup import * -@dataclass -class MixupOpts: - """ - Arguments: - buffer_size: number of frames. 
- apply_prob: probability to apply mixup at all - max_num_mix: maximum number of mixups (random int in [1, max_num_mix]) - lambda_min: minimum lambda value - lambda_max: maximum lambda value - """ - - buffer_size: int = 1_000_000 - apply_prob: float = 1.0 - max_num_mix: int = 4 - lambda_min: float = 0.1 - lambda_max: float = 0.4 - - -class Mixup(rf.Module): - """ - Mixup - """ - - def __init__(self, *, feature_dim: Dim, opts: MixupOpts): - super().__init__() - self.feature_dim = feature_dim - self.opts = opts - self.buffer_size_dim = Dim(opts.buffer_size, name="buffer_size") - self.buffer = rf.Parameter([self.buffer_size_dim, feature_dim], auxiliary=True) - self.buffer_pos = rf.Parameter( - [], dtype="int32", sparse_dim=self.buffer_size_dim, initial=0, auxiliary=True, device="cpu" - ) - self.buffer_filled = rf.Parameter([], dtype="bool", initial=False, auxiliary=True, device="cpu") - - def __call__(self, src: Tensor, *, spatial_dim: Dim) -> Tensor: - if not rf.get_run_ctx().train_flag: - return src - - assert spatial_dim in src.dims and self.feature_dim in src.dims - - # Apply mixup before we add the new data to the buffer. 
- src_ = self._maybe_apply_mixup(src, spatial_dim=spatial_dim) - - self._append_to_buffer(src, spatial_dim=spatial_dim) - - return src_ - - def _append_to_buffer(self, src: Tensor, *, spatial_dim: Dim): - batch_dims = src.remaining_dims((spatial_dim, self.feature_dim)) - opts = self.opts - - # Fill buffer with new data: - src_flat, src_flat_dim = rf.pack_padded(src, dims=batch_dims + [spatial_dim]) - new_pos = rf.minimum(self.buffer_pos + src_flat_dim.get_size_tensor(), opts.buffer_size) - part_fill_len = new_pos - self.buffer_pos - src_flat_part, src_flat_part_dim = rf.slice(src_flat, axis=src_flat_dim, end=part_fill_len) - self.buffer.assign_key( - axis=self.buffer_size_dim, - key=slice(self.buffer_pos, new_pos), - key_dim=src_flat_part_dim, - value=src_flat_part, - ) - if (self.buffer_pos + src_flat_dim.get_size_tensor() >= opts.buffer_size).raw_tensor: - self.buffer_filled.assign(True) - part_fill_len_ = rf.minimum(src_flat_dim.get_size_tensor() - part_fill_len, opts.buffer_size) - src_flat_part, src_flat_part_dim = rf.slice( - src_flat, axis=src_flat_dim, start=part_fill_len, end=part_fill_len + part_fill_len_ - ) - self.buffer.assign_key( - axis=self.buffer_size_dim, key=slice(0, part_fill_len_), key_dim=src_flat_part_dim, value=src_flat_part - ) - new_pos = part_fill_len_ - self.buffer_pos.assign(new_pos) - - def _maybe_apply_mixup(self, src: Tensor, *, spatial_dim: Dim) -> Tensor: - batch_dims = src.remaining_dims((spatial_dim, self.feature_dim)) - opts = self.opts - - if (rf.random_uniform((), device="cpu") >= opts.apply_prob).raw_tensor: - return src - - buffer_filled_size = rf.where(self.buffer_filled, opts.buffer_size, self.buffer_pos) - if (buffer_filled_size < spatial_dim.get_dim_value_tensor()).raw_tensor: - return src - - # Apply Mixup. Collect all data we are going to add for each sequence. 
- num_mixup = rf.random_uniform( - batch_dims, minval=1, maxval=opts.max_num_mix + 1, dtype="int32", device="cpu" - ) # [B] - num_mixup_dim = Dim(num_mixup, name="num_mixup") - - buffer_start = rf.random_uniform( - batch_dims + [num_mixup_dim], - maxval=buffer_filled_size - spatial_dim.get_dim_value_tensor() + 1, - dtype="int32", - sparse_dim=self.buffer_size_dim, - ) # [B, N] - n_mask = rf.sequence_mask(num_mixup_dim) # [B, N] - buffer_start_flat, num_mixup_flat_dim = rf.masked_select( - buffer_start, mask=n_mask, dims=batch_dims + [num_mixup_dim] - ) # [B_N'] - - idx = rf.range_over_dim(spatial_dim) # [T] - idx = rf.combine_bc(idx, "+", buffer_start_flat) # [B_N', T] - - mixup_values = rf.gather(self.buffer, indices=idx, axis=self.buffer_size_dim) # [B_N', T, F] - - # Scale the mixup values. - lambda_ = rf.random_uniform( - batch_dims + [num_mixup_dim], minval=opts.lambda_min, maxval=opts.lambda_max, dtype=src.dtype - ) - mixup_scales = rf.random_uniform(batch_dims + [num_mixup_dim], minval=0.001, maxval=1.0, dtype=src.dtype) - mixup_scales *= lambda_ / rf.reduce_sum(mixup_scales, axis=num_mixup_dim) # [B,N] - mixup_scales_flat, _ = rf.masked_select( - mixup_scales, mask=n_mask, dims=batch_dims + [num_mixup_dim], out_dim=num_mixup_flat_dim - ) # [B_N'] - mixup_values *= mixup_scales_flat # [B_N', T, F] - - idx_b = rf.range_over_merged_dims(batch_dims) # [B] -> B - idx_b, _ = rf.masked_select( - idx_b, mask=n_mask, dims=batch_dims + [num_mixup_dim], out_dim=num_mixup_flat_dim - ) # [B_N'] -> B - - mixup_value = rf.scatter( - mixup_values, indices=idx_b, indices_dim=num_mixup_flat_dim, out_dim=batch_dims - ) # [B,T,F] - - src = src + rf.stop_gradient(mixup_value) - return src - - -def _test_mixup(): - import numpy as np - - rf.select_backend_torch() - rf.init_train_step_run_ctx(train_flag=True, step=0) - - batch_dim = Dim(2, name="batch") - time_dim = Dim(rf.convert_to_tensor(np.array([7, 8], dtype="int32"), dims=[batch_dim]), name="time") - feature_dim = Dim(5, 
name="feature") - data = rf.convert_to_tensor( - np.arange(2 * 8 * 5).reshape(2, 8, 5).astype("float32"), dims=[batch_dim, time_dim, feature_dim] - ) - data /= rf.reduce_max(data, axis=time_dim) - print("data:", data, data.raw_tensor) - - mixup = Mixup(feature_dim=feature_dim, opts=MixupOpts(buffer_size=30, lambda_min=1.0, lambda_max=1.0)) - - x = mixup(data, spatial_dim=time_dim) - print("x:", x, x.raw_tensor) - print("buffer:", mixup.buffer, mixup.buffer.raw_tensor) - - batch_dim = Dim(3, name="batch") - time_dim = Dim(rf.convert_to_tensor(np.array([3, 4, 2], dtype="int32"), dims=[batch_dim]), name="time") - data = rf.ones([batch_dim, time_dim, feature_dim]) - x = mixup(data, spatial_dim=time_dim) - print("x':", x, x.raw_tensor) - print("buffer':", mixup.buffer, mixup.buffer.raw_tensor) - - data = -rf.ones([batch_dim, time_dim, feature_dim]) - x = mixup(data, spatial_dim=time_dim) - print("x'':", x, x.raw_tensor) - print("buffer'':", mixup.buffer, mixup.buffer.raw_tensor) - - -if __name__ == "__main__": - _test_mixup() +__all__ = ["MixupOpts", "Mixup"] From 87aae2537a09b4ecaaae5bdbea41a0aa9345569c Mon Sep 17 00:00:00 2001 From: "luca.gaudino" Date: Thu, 13 Jun 2024 11:54:11 +0200 Subject: [PATCH 164/227] rnnt dec rf WIP --- .../experiments/rnnt_bpe/baseline.py | 6 +- .../conformer_import_moh_att_2023_06_30.py | 6 +- .../conformer_import_moh_att_train.py | 2 +- .../librispeech_960/_import_model_nick.py | 333 +++++++++++++ .../librispeech_960/conformer_ctc_train.py | 47 +- .../librispeech_960/conformer_rnnt_train.py | 336 +++++++++++++ .../tedlium2/conformer_rnnt_train.py | 23 +- .../rf/conformer_rnnt/model_conformer_rnnt.py | 76 ++- .../asr/rf/conformer_rnnt/model_recog_rnnt.py | 441 +++++++++--------- 9 files changed, 1005 insertions(+), 265 deletions(-) create mode 100644 users/gaudino/experiments/rf_conformer_rnnt_2024/librispeech_960/_import_model_nick.py create mode 100644 
users/gaudino/experiments/rf_conformer_rnnt_2024/librispeech_960/conformer_rnnt_train.py diff --git a/users/gaudino/experiments/ctc_rnnt_standalone_2024/librispeech_960/experiments/rnnt_bpe/baseline.py b/users/gaudino/experiments/ctc_rnnt_standalone_2024/librispeech_960/experiments/rnnt_bpe/baseline.py index abb40af5a..2dba0efab 100644 --- a/users/gaudino/experiments/ctc_rnnt_standalone_2024/librispeech_960/experiments/rnnt_bpe/baseline.py +++ b/users/gaudino/experiments/ctc_rnnt_standalone_2024/librispeech_960/experiments/rnnt_bpe/baseline.py @@ -149,7 +149,7 @@ def evaluate_helper( predictor_config = PredictorConfig( symbol_embedding_dim=256, emebdding_dropout=0.2, - num_lstm_layers=2, + num_lstm_layers=1, # 2 lstm_hidden_dim=512, lstm_dropout=0.1, ) @@ -222,8 +222,8 @@ def evaluate_helper( } training_name = ( - prefix_name + "/" + network_module + ".512dim_sub6_24gbgpu_25eps_accum2_fullspec1_continue_from_ctc50eps_numlstm2" - # prefix_name + "/" + network_module + ".512dim_sub6_24gbgpu_25eps_accum2_fullspec1_continue_from_ctc50eps" + # prefix_name + "/" + network_module + ".512dim_sub6_24gbgpu_25eps_accum2_fullspec1_continue_from_ctc50eps_numlstm2" + prefix_name + "/" + network_module + ".512dim_sub6_24gbgpu_25eps_accum2_fullspec1_continue_from_ctc50eps" ) train_job = training( training_name, train_data_bpe5000, train_args_warprnnt_accum2_fullspec1, num_epochs=250, **default_returnn diff --git a/users/gaudino/experiments/rf_conformer_att_2023/librispeech_960/conformer_import_moh_att_2023_06_30.py b/users/gaudino/experiments/rf_conformer_att_2023/librispeech_960/conformer_import_moh_att_2023_06_30.py index 4b0b70543..faa8edbb8 100644 --- a/users/gaudino/experiments/rf_conformer_att_2023/librispeech_960/conformer_import_moh_att_2023_06_30.py +++ b/users/gaudino/experiments/rf_conformer_att_2023/librispeech_960/conformer_import_moh_att_2023_06_30.py @@ -627,7 +627,7 @@ def sis_run_with_prefix(prefix_name: str = None): task, model_with_checkpoint, model_recog, - 
dev_sets=["dev-other"], + dev_sets=["dev-clean", "dev-other", "test-clean", "test-other"], model_args=model_args, search_args=search_args, prefix_name=name, @@ -639,7 +639,7 @@ def sis_run_with_prefix(prefix_name: str = None): # opls att + ctc + trafo lm + ilm for scales, prior_scale, lm_scale, ilm_scale, beam_size in product( - [(0.8, 0.2)], [0.05, 0.07], [0.65], [0.4], [32, 48, 64] + [(0.8, 0.2)], [0.05, 0.07], [0.65], [0.4], [32, 40] ): att_scale, ctc_scale = scales recog_name = ( @@ -668,7 +668,7 @@ def sis_run_with_prefix(prefix_name: str = None): task, model_with_checkpoint, model_recog, - dev_sets=["dev-clean", "dev-other"], + dev_sets=["dev-clean", "dev-other", "test-clean", "test-other"], model_args=model_args, search_args=search_args, prefix_name=name, diff --git a/users/gaudino/experiments/rf_conformer_att_2023/librispeech_960/conformer_import_moh_att_train.py b/users/gaudino/experiments/rf_conformer_att_2023/librispeech_960/conformer_import_moh_att_train.py index 5bdc5eff2..475f6d6d7 100644 --- a/users/gaudino/experiments/rf_conformer_att_2023/librispeech_960/conformer_import_moh_att_train.py +++ b/users/gaudino/experiments/rf_conformer_att_2023/librispeech_960/conformer_import_moh_att_train.py @@ -334,7 +334,7 @@ def sis_run_with_prefix(prefix_name: Optional[str] = None): "learning_rate_piecewise_values": [1e-5, 1e-3, 1e-5, 1e-6], }, ) - model = train_exp( + model = train_exp( # 5.42 "base-24gb-v6-lrlin1e_5_600k_noCTC", config_24gb_v6, config_updates={ diff --git a/users/gaudino/experiments/rf_conformer_rnnt_2024/librispeech_960/_import_model_nick.py b/users/gaudino/experiments/rf_conformer_rnnt_2024/librispeech_960/_import_model_nick.py new file mode 100644 index 000000000..0a4835e40 --- /dev/null +++ b/users/gaudino/experiments/rf_conformer_rnnt_2024/librispeech_960/_import_model_nick.py @@ -0,0 +1,333 @@ +from __future__ import annotations + +from typing import Dict, Optional + +import os +import torch +import numpy + +from sisyphus import tk + +# 
from i6_core.returnn.training import Checkpoint +# from i6_experiments.users.zeyer.utils.generic_job_output import generic_job_output +# from i6_experiments.users.gaudino.returnn.convert_ckpt_rf import ( +# ConvertTfCheckpointToRfPtJob, +# ) + +from i6_experiments.users.gaudino.experiments.rf_conformer_att_2023.librispeech_960.conformer_import_moh_att_2023_06_30 import ( + MakeModel, +) + +from i6_experiments.users.gaudino.models.asr.rf.conformer_rnnt.model_conformer_rnnt import MakeModel as MakeModelRNNT + +from i6_experiments.users.gaudino.models.asr.rf.nn_lm.lm_import_2023_11_09 import ( + MakeModel as MakeModelLM, +) + +from i6_experiments.users.gaudino.models.asr.rf.nn_lm.lm_import_2023_09_03 import ( + MakeModel as MakeModelLSTMLM, +) + +from i6_experiments.users.gaudino.models.asr.rf.ilm_import_2024_04_17 import ( + MakeModel as MakeModelILM, +) + +import returnn.frontend as rf + +from itertools import product + +_nick_pure_torch_rnnt_ckpt_path = "/work/asr4/rossenbach/sisyphus_work_folders/tts_decoder_asr_work/i6_core/returnn/training/ReturnnTrainingJob.6lwn4XuFkhkI/output/models/epoch.250.pt" + +from i6_experiments.users.gaudino.experiments.conformer_att_2023.tedlium2.model_ckpt_info import models + + +def convert_checkpoint( + *, + ckpt_path: str, + ckpt_path_lm: Optional[str] = None, + ckpt_path_sep: Optional[str] = None, + out_dir: str, + print_params: bool = False, + save_model: bool = True, +): + """run""" + import returnn.frontend as rf + from returnn.torch.frontend.bridge import rf_module_to_pt_module + from returnn.util.basic import model_epoch_from_filename + + print("Input checkpoint:" + ckpt_path) + ckpt = torch.load(ckpt_path, map_location="cpu") + + if print_params: + for k, v in ckpt["model"].items(): + print(f"{k}: {v.shape if hasattr(v, 'shape') else v}") + # print(reader.debug_string().decode("utf-8")) + + + print() + + + print("Creating model...") + rf.select_backend_torch() + model = MakeModelRNNT(80, 1_057)() + + print("Created model:", 
model) + print("Model parameters:") + for name, param in model.named_parameters(): + assert isinstance(name, str) + assert isinstance(param, rf.Parameter) + if print_params: + print(f"{name}: {param}") + print() + + print("Create ParamMapping...") + param_mapping = {} + _add_params_predictor_joiner(param_mapping) + # _add_params_conformer(param_mapping, prefix="") + # if not ctc_only: + # _add_params_att_decoder(param_mapping) + # _add_params_trafo_lm(param_mapping) + # if model_args.get("encoder_ctc", False): + # _add_params_conformer(param_mapping, prefix="sep_enc_ctc_") + + for name, param in model.named_parameters(): + if name in param_mapping: + assert isinstance(name, str) + assert isinstance(param, rf.Parameter) + + value = map_param_func(ckpt, name, param, param_mapping) + assert isinstance(value, numpy.ndarray) + # noinspection PyProtectedMember + param._raw_backend.set_parameter_initial_value(param, value) + + epoch = 1 + if epoch is None: + epoch = model_epoch_from_filename(tk.Path(ckpt_path)) + pass + + step = 0 + # if step is None: + # assert reader.has_tensor("global_step") + # step = int(reader.get_tensor("global_step")) + + ckpt_name = os.path.basename(ckpt_path) + + pt_model = rf_module_to_pt_module(model) + + breakpoint() + + if save_model: + os.makedirs(out_dir, exist_ok=True) + filename = out_dir + "/" + ckpt_name + ".pt" + print(f"*** saving PyTorch model checkpoint: {filename}") + torch.save( + {"model": pt_model.state_dict(), "epoch": epoch, "step": step}, filename + ) + + if ckpt_name != "checkpoint": + symlink_filename = out_dir + "/checkpoint.pt" + print( + f"*** creating symlink {symlink_filename} -> {os.path.basename(filename)}" + ) + os.symlink(os.path.basename(filename), symlink_filename) + # create meta information + meta_filename = out_dir + "/" + ckpt_name + "." + str(epoch) + ".meta" + open(meta_filename, "w").close() + symlink_filename_1 = out_dir + "/checkpoint." 
+ str(epoch) + ".meta" + symlink_filename_2 = out_dir + "/" + ckpt_name + ".meta" + os.symlink(os.path.basename(meta_filename), symlink_filename_1) + os.symlink(os.path.basename(meta_filename), symlink_filename_2) + # assert os.path.exists(self.out_checkpoint.get_path()) + +def _add_params_conformer(param_mapping: Dict[str, str], prefix: str): + # rf -> pt + # frontend + for layer_idx in [0, 1, 2]: + orig_name = "conv0" if layer_idx == 0 else f"subsample_conv{layer_idx - 1}" + param_mapping.update( + { + prefix + + f"encoder.input_layer.conv_layers.{layer_idx}.filter": f"{orig_name}/W", + prefix + + f"encoder.input_layer.conv_layers.{layer_idx}.bias": f"{orig_name}/bias", + } + ) + param_mapping.update( + { + prefix + "encoder.input_projection.weight": "source_linear/W", + # prefix + "ctc.weight": "ctc/W", + # prefix + "ctc.bias": "ctc/b", + prefix + "enc_aux_logits_12.weight": "ctc/W", + prefix + "enc_aux_logits_12.bias": "ctc/b", + } + ) + # conformer + for layer_idx in range(12): + # FF + for sub in [1, 2]: + param_mapping[ + prefix + f"encoder.layers.{layer_idx}.ffn{sub}.linear_ff.weight" + ] = f"conformer_block_{layer_idx + 1:02d}_ffmod_{sub}_ff1/W" + param_mapping[ + prefix + f"encoder.layers.{layer_idx}.ffn{sub}.linear_ff.bias" + ] = f"conformer_block_{layer_idx + 1:02d}_ffmod_{sub}_ff1/b" + param_mapping[ + prefix + f"encoder.layers.{layer_idx}.ffn{sub}.linear_out.weight" + ] = f"conformer_block_{layer_idx + 1:02d}_ffmod_{sub}_ff2/W" + param_mapping[ + prefix + f"encoder.layers.{layer_idx}.ffn{sub}.linear_out.bias" + ] = f"conformer_block_{layer_idx + 1:02d}_ffmod_{sub}_ff2/b" + param_mapping[ + prefix + f"encoder.layers.{layer_idx}.ffn{sub}_layer_norm.scale" + ] = f"conformer_block_{layer_idx + 1:02d}_ffmod_{sub}_ln/scale" + param_mapping[ + prefix + f"encoder.layers.{layer_idx}.ffn{sub}_layer_norm.bias" + ] = f"conformer_block_{layer_idx + 1:02d}_ffmod_{sub}_ln/bias" + # conv + param_mapping[ + prefix + 
f"encoder.layers.{layer_idx}.conv_block.positionwise_conv1.weight" + ] = f"conformer_block_{layer_idx + 1:02d}_conv_mod_pointwise_conv1/W" + param_mapping[ + prefix + f"encoder.layers.{layer_idx}.conv_block.positionwise_conv1.bias" + ] = f"conformer_block_{layer_idx + 1:02d}_conv_mod_pointwise_conv1/b" + param_mapping[ + prefix + f"encoder.layers.{layer_idx}.conv_block.depthwise_conv.filter" + ] = f"conformer_block_{layer_idx + 1:02d}_conv_mod_depthwise_conv2/W" + param_mapping[ + prefix + f"encoder.layers.{layer_idx}.conv_block.depthwise_conv.bias" + ] = f"conformer_block_{layer_idx + 1:02d}_conv_mod_depthwise_conv2/bias" + param_mapping[ + prefix + f"encoder.layers.{layer_idx}.conv_block.positionwise_conv2.weight" + ] = f"conformer_block_{layer_idx + 1:02d}_conv_mod_pointwise_conv2/W" + param_mapping[ + prefix + f"encoder.layers.{layer_idx}.conv_block.positionwise_conv2.bias" + ] = f"conformer_block_{layer_idx + 1:02d}_conv_mod_pointwise_conv2/b" + param_mapping[ + prefix + f"encoder.layers.{layer_idx}.conv_layer_norm.scale" + ] = f"conformer_block_{layer_idx + 1:02d}_conv_mod_ln/scale" + param_mapping[ + prefix + f"encoder.layers.{layer_idx}.conv_layer_norm.bias" + ] = f"conformer_block_{layer_idx + 1:02d}_conv_mod_ln/bias" + # self-att + param_mapping[ + prefix + f"encoder.layers.{layer_idx}.self_att.qkv.weight" + ] = f"conformer_block_{layer_idx + 1:02d}_self_att/QKV" + param_mapping[ + prefix + f"encoder.layers.{layer_idx}.self_att.proj.weight" + ] = f"conformer_block_{layer_idx + 1:02d}_self_att_linear/W" + param_mapping[ + prefix + f"encoder.layers.{layer_idx}.self_att_layer_norm.scale" + ] = f"conformer_block_{layer_idx + 1:02d}_self_att_ln/scale" + param_mapping[ + prefix + f"encoder.layers.{layer_idx}.self_att_layer_norm.bias" + ] = f"conformer_block_{layer_idx + 1:02d}_self_att_ln/bias" + param_mapping[ + prefix + f"encoder.layers.{layer_idx}.self_att.learned_pos_emb.pos_emb" + ] = f"conformer_block_{layer_idx + 
1:02d}_self_att_ln_rel_pos_enc/encoding_matrix" + # final layer norm + param_mapping[ + prefix + f"encoder.layers.{layer_idx}.final_layer_norm.scale" + ] = f"conformer_block_{layer_idx + 1:02d}_ln/scale" + param_mapping[ + prefix + f"encoder.layers.{layer_idx}.final_layer_norm.bias" + ] = f"conformer_block_{layer_idx + 1:02d}_ln/bias" + +def _add_params_predictor_joiner(param_mapping: Dict[str, str]): + # add params of trafo lm + for layer_idx in range(1): + param_mapping.update( + { + f"predictor.layers.{layer_idx}.ff_weight": f"predictor.lstm_layers.{layer_idx}.weight_ih_l0", + + f"predictor.layers.{layer_idx}.rec_weight": f"predictor.lstm_layers.{layer_idx}.weight_hh_l0", + f"predictor.layers.{layer_idx}.bias": f"predictor.lstm_layers.{layer_idx}.bias_ih_l0", + } + ) + + param_mapping.update( + { + "predictor.embedding.weight": "predictor.embedding.weight", + "predictor.input_layer_norm.scale": "predictor.input_layer_norm.weight", + "predictor.input_layer_norm.bias": "predictor.input_layer_norm.bias", + "predictor.linear.weight": "predictor.linear.weight", + "predictor.linear.bias": "predictor.linear.bias", + "predictor.output_layer_norm.scale": "predictor.output_layer_norm.weight", + "predictor.output_layer_norm.bias": "predictor.output_layer_norm.bias", + # joiner + "joiner.linear.weight": "joiner.linear.weight", + "joiner.linear.bias": "joiner.linear.bias", + } + ) + + +def map_param_func( + ckpt, name: str, var: rf.Parameter, param_mapping: Dict[str, str] +) -> numpy.ndarray: + """map params, TF to RF""" + from i6_experiments.users.zeyer.returnn.convert.params import ( + numpy as convert_params_np, + ) + from i6_experiments.users.zeyer.returnn.convert.params import ( + tf_to_rf_np as convert_params_tf_to_rf_np, + ) + + assert isinstance(var, rf.Parameter) + + if name in param_mapping: + breakpoint() + var_name = param_mapping[name] + assert name in ckpt["model"].keys(), f"missing {var_name}" + value = ckpt["model"][name].numpy() + assert isinstance(value, 
numpy.ndarray) + + assert ( + value.shape == var.batch_shape + ), f"new param {name} {var.batch_shape} vs ckpt param {var_name} {value.shape}" + assert ( + value.dtype.name == var.dtype + ), f"new param {name} {var.dtype} vs ckpt param {var_name} {value.dtype}" + return value + + # if name == "s.ff_weight": + # value = reader.get_tensor("output/rec/s/rec/lstm_cell/kernel") + # value = convert_params_np.convert_tf_lstm_to_native_lstm_ff(value) + # assert value.shape == var.batch_shape, name + # assert value.dtype.name == var.dtype, name + # return value + # + # if name == "s.rec_weight": + # value = reader.get_tensor("output/rec/s/rec/lstm_cell/kernel") + # value = convert_params_np.convert_tf_lstm_to_native_lstm_rec(value) + # assert value.shape == var.batch_shape, name + # assert value.dtype.name == var.dtype, name + # return value + # + # if name == "s.bias": + # value = reader.get_tensor("output/rec/s/rec/lstm_cell/bias") + # value = convert_params_np.convert_tf_lstm_to_native_lstm_bias( + # value, forget_gate_bias=1.0 + # ) + # assert value.shape == var.batch_shape, name + # assert value.dtype.name == var.dtype, name + # return value + # + # if ".conv_block.norm." 
in name: + # assert name.startswith("encoder.layers.") + # layer_idx = int(name.split(".")[2]) + # value = convert_params_tf_to_rf_np.convert_tf_batch_norm_to_rf( + # reader=reader, + # rf_name=name, + # rf_prefix_name=f"encoder.layers.{layer_idx}.conv_block.norm.", + # tf_prefix_name=f"conformer_block_{layer_idx + 1:02d}_conv_mod_bn/batch_norm/", + # var=var, + # ) + # assert value.shape == var.batch_shape, name + # assert value.dtype.name == var.dtype, name + # return value + + raise NotImplementedError(f"cannot map {name!r} {var}") + + + +if __name__ == "__main__": + convert_checkpoint(ckpt_path=_nick_pure_torch_rnnt_ckpt_path, print_params=True, out_dir="", save_model=False) \ No newline at end of file diff --git a/users/gaudino/experiments/rf_conformer_rnnt_2024/librispeech_960/conformer_ctc_train.py b/users/gaudino/experiments/rf_conformer_rnnt_2024/librispeech_960/conformer_ctc_train.py index 66fe36592..29ff6ef1f 100644 --- a/users/gaudino/experiments/rf_conformer_rnnt_2024/librispeech_960/conformer_ctc_train.py +++ b/users/gaudino/experiments/rf_conformer_rnnt_2024/librispeech_960/conformer_ctc_train.py @@ -183,6 +183,46 @@ def sis_run_with_prefix(prefix_name: Optional[str] = None): }, ) + # no eos + train_exp( # dev-other + "base-24gb-lrlin1e_5_600k_ctc_only_no_eos", + config_24gb_v6, + config_updates={ + "learning_rate": 1.0, + "dynamic_learning_rate": dyn_lr_piecewise_linear, + # total steps after 2000 epochs: 982.312 + "learning_rate_piecewise_steps": [600_000, 900_000, 982_000], + "learning_rate_piecewise_values": [1e-5, 1e-3, 1e-5, 1e-6], + "aux_loss_layers":[], + "mel_normalization_ted2": False, + "hash_override": 1, + }, + search_config={ + "mel_normalization_ted2": False, + }, + with_eos_postfix=False, + ) + + train_exp( # dev-other + "base-24gb-lrlin1e_5_600k_ctc_only_aux4_8_no_eos", + config_24gb_v6, + config_updates={ + "learning_rate": 1.0, + "dynamic_learning_rate": dyn_lr_piecewise_linear, + # total steps after 2000 epochs: 982.312 + 
"learning_rate_piecewise_steps": [600_000, 900_000, 982_000], + "learning_rate_piecewise_values": [1e-5, 1e-3, 1e-5, 1e-6], + "mel_normalization_ted2": False, + "hash_override": 1, + }, + search_config = { + "mel_normalization_ted2": False, + }, + with_eos_postfix=False, + + ) + + _torch_ckpt_path = "/u/luca.gaudino/setups/2023-08-10--rf-librispeech/work/i6_core/returnn/training/ReturnnTrainingJob.AWwVft0oGy8e/output/models/epoch.1981.pt" new_ckpt_path = tk.Path( @@ -262,6 +302,7 @@ def train_exp( time_rqmt: Optional[int] = None, model_avg: bool = False, search_config: Optional[Dict[str, Any]] = None, + with_eos_postfix: bool = True, ) -> ModelWithCheckpoints: """ Train experiment @@ -275,7 +316,7 @@ def train_exp( _sis_setup_global_prefix() prefix = _sis_prefix + "/" + name - task = _get_ls_task() + task = _get_ls_task(with_eos_postfix=with_eos_postfix) config = config.copy() config = dict_update_deep(config, config_updates, config_deletes) if "__num_epochs" in config: @@ -362,7 +403,7 @@ def train_exp( _ls_task = None -def _get_ls_task(): +def _get_ls_task(with_eos_postfix=True): global _ls_task if _ls_task: return _ls_task @@ -371,7 +412,7 @@ def _get_ls_task(): get_librispeech_task_bpe10k_raw, ) - _ls_task = get_librispeech_task_bpe10k_raw(with_eos_postfix=True) + _ls_task = get_librispeech_task_bpe10k_raw(with_eos_postfix=with_eos_postfix) return _ls_task diff --git a/users/gaudino/experiments/rf_conformer_rnnt_2024/librispeech_960/conformer_rnnt_train.py b/users/gaudino/experiments/rf_conformer_rnnt_2024/librispeech_960/conformer_rnnt_train.py new file mode 100644 index 000000000..3ce450f69 --- /dev/null +++ b/users/gaudino/experiments/rf_conformer_rnnt_2024/librispeech_960/conformer_rnnt_train.py @@ -0,0 +1,336 @@ +"""Copied from Albert Zeyer 25.03.2024, then modified +""" + +from __future__ import annotations + +from typing import TYPE_CHECKING, Optional, Union, Tuple, Sequence, List, Collection +import tree +import math +import numpy as np +import torch 
+import hashlib +import contextlib +import functools +from sisyphus import tk + +from returnn.tensor import Tensor, Dim, single_step_dim +import returnn.frontend as rf +from returnn.frontend.tensor_array import TensorArray +from returnn.frontend.encoder.conformer import ConformerEncoder, ConformerConvSubsample + +from i6_experiments.users.gaudino.model_interfaces.supports_label_scorer_torch import ( + RFModelWithMakeLabelScorer, +) +from i6_experiments.users.gaudino.experiments.rf_conformer_att_2023.tedlium2.configs import * +from i6_experiments.users.gaudino.experiments.rf_conformer_att_2023.tedlium2.configs import ( + _batch_size_factor, + _cfg_lrlin1e_5_295k, + _get_cfg_lrlin_oclr_by_bs_nep, +) +from i6_experiments.users.gaudino.experiments.rf_conformer_att_2023.librispeech_960.trafo_lm import trafo_lm_kazuki_import + +if TYPE_CHECKING: + from i6_experiments.users.gaudino.model_interfaces import ModelDef, RecogDef, TrainDef + +from i6_experiments.users.gaudino.model_with_checkpoints import ( + ModelWithCheckpoints, + ModelWithCheckpoint, +) + +from i6_experiments.users.gaudino.models.asr.rf.conformer_rnnt.model_conformer_rnnt import from_scratch_model_def, from_scratch_training +from i6_experiments.users.gaudino.models.asr.rf.conformer_rnnt.model_recog_rnnt import model_recog + + +# The model gets raw features (16khz) and does feature extraction internally. 
+_log_mel_feature_dim = 80 + + +def sis_run_with_prefix(prefix_name: Optional[str] = None): + """run the exp""" + + from i6_core.returnn.training import PtCheckpoint + + _sis_setup_global_prefix(prefix_name) + + # Moh: dev-clean 2.27, dev-other 5.39, test-clean 2.41, test-other 5.51 + # RF recog: {"dev-clean": 2.25, "dev-other": 5.34, "test-clean": 2.42, "test-other": 5.56} + # _recog_imported() + + rnnt_train_config = dict( + batching="laplace:.1000", + batch_size=15_000 * _batch_size_factor, + max_seqs=200, + # max_seq_length_default_target=75, + # specaugment_steps=(10_000, 20_000, 40_000), + specaugment_steps=(5_900, 18_000, 36_000), + # gradient_clip=0, + # gradient_clip_global_norm = 1.0 + optimizer={ + "class": "adamw", + "epsilon": 1e-8, + "weight_decay": 1e-6, + }, + # accum_grad_multiple_step=4, + # gradient_noise=0.0, + learning_rate=2.5e-3, + dynamic_learning_rate=dyn_lr_piecewise_linear, + # learning_rate_piecewise_steps= [261_000, 522_000, 580_000], # 45% 45 % 10% # 11gb + learning_rate_piecewise_steps = [85_500, 171_000, 190_000], # 45% 45 % 10% # 24gb + # aux_loss_layers=[4, 8], + max_seq_length_default_target=None, + # gradient_clip_global_norm=5.0, + accum_grad_multiple_step=2, + # aux_loss_layers=[12], +) + + # train_exp("base-11gb", config_11gb, gpu_mem=11) + # train_exp("base-11gb-v1", my_config_11gb, num_epochs=400, gpu_mem=11) + + train_exp( + "from-scratch-24gb_aux4_8", + config_24gb_v6, + config_updates={ + "batch_size": 8_000 * _batch_size_factor, + "learning_rate": 1.0, + "dynamic_learning_rate": dyn_lr_piecewise_linear, + # total steps after 2000 epochs: 982.312 + "learning_rate_piecewise_steps": [600_000, 900_000, 982_000], + "learning_rate_piecewise_values": [1e-5, 1e-3, 1e-5, 1e-6], + "mel_normalization_ted2": False, + }, + config_deletes=["torch_amp"], + search_config={ + "mel_normalization_ted2": False, + }, + num_epochs=400, + gpu_mem=24, + ) + + + # some recog for debugging + _torch_ckpt_path = 
"/work/asr3/zeineldeen/hiwis/luca.gaudino/setups-data/2023-08-10--rf-librispeech/work/i6_core/returnn/training/ReturnnTrainingJob.J6Uj9xtt1v5J/output/models/epoch.003.pt" + + new_ckpt_path = tk.Path( + _torch_ckpt_path, + hash_overwrite= "rnnt" + "_torch_ckpt", + ) + new_ckpt = PtCheckpoint(new_ckpt_path) + + # _recog( + # "model_recogs/from-scratch-24gb/rnnt_beam_search/recog_results", + # ModelWithCheckpoint( + # definition=from_scratch_model_def, checkpoint=new_ckpt + # ), + # model_recog, + # dev_sets=["dev"] + # ) + + +_sis_prefix: Optional[str] = None + + +def _sis_setup_global_prefix(prefix_name: Optional[str] = None): + if not prefix_name: + from .sis_setup import get_prefix_for_config + + prefix_name = get_prefix_for_config(__file__) + global _sis_prefix + _sis_prefix = prefix_name + + +def _recog( + name: str, + model_with_checkpoint: ModelWithCheckpoint, + recog_def: RecogDef = None, + recog_config: Optional[Dict[str, Any]] = None, + *, + search_rqmt: Optional[Dict[str, Any]] = None, + dev_sets: Optional[Collection[str]] = None, +): + from sisyphus import tk + from i6_experiments.users.zeyer.recog import recog_model + + if recog_def is None: + recog_def = model_recog + + task = _get_ted2_task() + + res = recog_model( + task, + model_with_checkpoint, + recog_def=recog_def, + config=recog_config, + search_rqmt=search_rqmt, + dev_sets=dev_sets, + ) + tk.register_output(_sis_prefix + "/" + name, res.output) + + +# noinspection PyShadowingNames +def train_exp( + name: str, + config: Dict[str, Any], + *, + config_updates: Optional[Dict[str, Any]] = None, + config_deletes: Optional[Sequence[str]] = None, + post_config_updates: Optional[Dict[str, Any]] = None, + num_epochs: int = 2000, + gpu_mem: Optional[int] = 24, + num_processes: Optional[int] = None, + fine_tune: Optional[Union[int, List[Tuple[int, Dict[str, Any]]]]] = None, + time_rqmt: Optional[int] = None, + model_avg: bool = False, + search_config: Optional[Dict[str, Any]] = None, +) -> 
ModelWithCheckpoints: + """ + Train experiment + """ + from i6_experiments.users.gaudino.experiments.rf_conformer_att_2023.train import ( + train, + ) + from i6_experiments.users.zeyer.recog import recog_training_exp + + if _sis_prefix is None: + _sis_setup_global_prefix() + + prefix = _sis_prefix + "/" + name + task = _get_ls_task() + config = config.copy() + config = dict_update_deep(config, config_updates, config_deletes) + if "__num_epochs" in config: + num_epochs = config.pop("__num_epochs") + if "__gpu_mem" in config: + gpu_mem = config.pop("__gpu_mem") + if "__num_processes" in config: + num_processes = config.pop("__num_processes") + + model_with_checkpoint = train( + prefix, + task=task, + config=config, + post_config=dict_update_deep(post_config, post_config_updates), + model_def=from_scratch_model_def, + train_def=from_scratch_training, + num_epochs=num_epochs, + gpu_mem=gpu_mem, + num_processes=num_processes, + distributed_launch_cmd="torchrun" if num_processes else "mpirun", + time_rqmt=time_rqmt, + include_native_ops=True, + ) + recog_training_exp( + prefix, task, model_with_checkpoint, recog_def=model_recog, model_avg=model_avg, search_config=search_config + ) + + if fine_tune: + if isinstance(fine_tune, int): + fine_tune = [(fine_tune, {})] + for ep, opts in fine_tune: + assert isinstance(ep, int) and isinstance(opts, dict) + suffix = f"/finetune/{ep}" + opts = opts.copy() + if opts: + for k, v in sorted(opts.items()): + k: str + suffix += "-" + k.lstrip("_") + v = str(v).replace("-", "_") + if len(v) > 16 and not k.startswith("_"): + suffix += "_" + hashlib.md5(v.encode("utf8")).hexdigest()[:8] + else: + suffix += v + num_epochs_ = opts.pop("num_epochs", 50) + config_ = config.copy() + config_["import_model_train_epoch1"] = model_with_checkpoint.get_epoch( + ep + ).checkpoint + config_.pop("dynamic_learning_rate") + lrs = opts.pop("learning_rates", None) + if lrs is None: + lr_decay_type = opts.pop( + "lr_decay_type", "geomspace" + ) # geomspace or 
linspace + lr_decay_func = getattr(np, lr_decay_type) + lr = config_["learning_rate"] + final_lr = opts.pop("final_lr", 1e-7) + lrs = list(lr_decay_func(lr, final_lr, num=num_epochs_)) + else: + assert isinstance(lrs, (list, tuple)) + assert len(lrs) == num_epochs_ + config_["learning_rates"] = lrs + config_["learning_rate"] = float(lrs[-1]) + config_["specaugment_steps"] = (0, 0, 0) + config_.update({k: v for k, v in opts.items() if not k.startswith("_")}) + + finetune_model_with_ckpt = train( + prefix + suffix, + task=task, + config=config_, + post_config=post_config, + model_def=from_scratch_model_def, + train_def=from_scratch_training, + num_epochs=num_epochs_, + gpu_mem=gpu_mem, + ) + # _recog(name + suffix + "/recog/last", finetune_model_with_ckpt.get_last_fixed_epoch()) + recog_training_exp( + prefix + suffix, task, finetune_model_with_ckpt, recog_def=model_recog + ) + + return model_with_checkpoint + + +_ls_task = None +_ted2_task = None + + +def _get_ls_task(): + global _ls_task + if _ls_task: + return _ls_task + + from i6_experiments.users.zeyer.datasets.librispeech import ( + get_librispeech_task_bpe10k_raw, + ) + + _ls_task = get_librispeech_task_bpe10k_raw(with_eos_postfix=True) + return _ls_task + + +def _get_ted2_task(): + global _ted2_task + if _ted2_task: + return _ted2_task + + from i6_experiments.users.gaudino.datasets.tedlium2 import ( + get_tedlium2_task_bpe1k_raw, + ) + + _ted2_task = get_tedlium2_task_bpe1k_raw(with_eos_postfix=True, train_epoch_wise_filter=None) + return _ted2_task + + +py = sis_run_with_prefix # if run directly via `sis m ... 
+ + +def model_warmup(*, model: Model, **_kwargs): + """warmup, for more reliable timing measures""" + import torch + import time + from returnn.config import get_global_config + from returnn.tensor import Dim + import returnn.frontend as rf + + config = get_global_config() + start_time = time.monotonic() + limit = start_time + config.float("model_warmup_time", 10.0) + + print("*** warming up...") + while time.monotonic() < limit: + batch_dim = Dim(10, name="dummy_batch") + time_dim = Dim(rf.full(dims=[batch_dim], fill_value=16_000), name="dummy_time") + feat_dim = Dim(1, name="dummy_feat") + source = rf.zeros([batch_dim, time_dim, feat_dim]) + res = model.encode(source=source, in_spatial_dim=time_dim) + if source.raw_tensor.device.type == "cuda": + torch.cuda.synchronize(source.raw_tensor.device) + res # noqa # keep ref to make sure it is calculated diff --git a/users/gaudino/experiments/rf_conformer_rnnt_2024/tedlium2/conformer_rnnt_train.py b/users/gaudino/experiments/rf_conformer_rnnt_2024/tedlium2/conformer_rnnt_train.py index d075dec27..f552ed0e4 100644 --- a/users/gaudino/experiments/rf_conformer_rnnt_2024/tedlium2/conformer_rnnt_train.py +++ b/users/gaudino/experiments/rf_conformer_rnnt_2024/tedlium2/conformer_rnnt_train.py @@ -56,7 +56,7 @@ def sis_run_with_prefix(prefix_name: Optional[str] = None): # RF recog: {"dev-clean": 2.25, "dev-other": 5.34, "test-clean": 2.42, "test-other": 5.56} # _recog_imported() - rnnt_train_config = dict( + rnnt_train_config_24gb = dict( batching="laplace:.1000", batch_size=15_000 * _batch_size_factor, max_seqs=200, @@ -86,16 +86,17 @@ def sis_run_with_prefix(prefix_name: Optional[str] = None): # train_exp("base-11gb", config_11gb, gpu_mem=11) # train_exp("base-11gb-v1", my_config_11gb, num_epochs=400, gpu_mem=11) - # train_exp( # TODO: runs in loss nan - # "from-scratch-24gb", - # rnnt_train_config, - # config_updates={ - # "learning_rate": 1.0, - # "learning_rate_piecewise_values": [8e-5, 8e-4, 8e-5, 1e-6], - # }, - # 
num_epochs=400, - # gpu_mem=24, - # ) + train_exp( # TODO: runs in loss nan + "from-scratch-24gb_norm_loss", + rnnt_train_config_24gb, + config_updates={ + "learning_rate": 1.0, + "learning_rate_piecewise_values": [8e-5, 8e-4, 8e-5, 1e-6], + "hash_override": 1, + }, + num_epochs=400, + gpu_mem=24, + ) _torch_ckpt_path = "/work/asr3/zeineldeen/hiwis/luca.gaudino/setups-data/2023-08-10--rf-librispeech/work/i6_core/returnn/training/ReturnnTrainingJob.J6Uj9xtt1v5J/output/models/epoch.003.pt" diff --git a/users/gaudino/models/asr/rf/conformer_rnnt/model_conformer_rnnt.py b/users/gaudino/models/asr/rf/conformer_rnnt/model_conformer_rnnt.py index 0bdac51aa..8198baeac 100644 --- a/users/gaudino/models/asr/rf/conformer_rnnt/model_conformer_rnnt.py +++ b/users/gaudino/models/asr/rf/conformer_rnnt/model_conformer_rnnt.py @@ -1,6 +1,15 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Optional, Union, Tuple, Sequence, List, Collection, Dict +from typing import ( + TYPE_CHECKING, + Optional, + Union, + Tuple, + Sequence, + List, + Collection, + Dict, +) import tree import math import numpy as np @@ -145,6 +154,7 @@ def __init__( self.output_dim = output_dim self.embedding_dropout = emebdding_dropout self.lstm_dropout = lstm_dropout + self.num_lstm_layers = num_lstm_layers self.symbol_embedding_dim = Dim( name="symbol_embedding", dimension=symbol_embedding_dim @@ -159,7 +169,7 @@ def __init__( self.symbol_embedding_dim if idx == 0 else self.lstm_hidden_dim, self.lstm_hidden_dim, ) - for idx in range(num_lstm_layers) + for idx in range(self.num_lstm_layers) ) # self.lstm_layers = torch.nn.ModuleList( @@ -291,6 +301,7 @@ def __call__( # source_lengths: rf.Tensor, target_encodings: rf.Tensor, # target_lengths: rf.Tensor, + batch_dims: Sequence[Dim], ) -> Tuple[rf.Tensor, rf.Tensor, rf.Tensor]: r"""Forward pass for training. @@ -320,19 +331,21 @@ def __call__( number of valid elements along dim 2 for i-th batch element in joint network output. 
""" + time_axis = len(batch_dims) + joint_encodings_raw = ( - source_encodings.raw_tensor.unsqueeze(2).contiguous() - + target_encodings.raw_tensor.unsqueeze(1).contiguous() + source_encodings.raw_tensor.unsqueeze(time_axis + 1).contiguous() + + target_encodings.raw_tensor.unsqueeze(time_axis).contiguous() ) joint_encodings = rf.Tensor( name="joint_encodings", raw_tensor=joint_encodings_raw, - dims=[ - source_encodings.dims[0], - source_encodings.dims[1], - target_encodings.dims[1], - source_encodings.dims[2], + dims=batch_dims + + [ + source_encodings.dims[time_axis], # T + target_encodings.dims[time_axis], # U + source_encodings.dims[-1], # F ], dtype=source_encodings.dtype, ) @@ -372,7 +385,6 @@ def __init__( enc_att_dropout: float = 0.1, l2: float = 0.0001, language_model: Optional[RFModelWithMakeLabelScorer] = None, - mel_normalization: bool = True, joiner_dim: int = 640, ): super(Model, self).__init__() @@ -381,7 +393,7 @@ def __init__( config = get_global_config(return_empty_if_none=True) - self.mel_normalization = mel_normalization + self.mel_normalization = config.typed_value("mel_normalization_ted2", True) self.in_dim = in_dim self.encoder = ConformerEncoder( @@ -603,12 +615,13 @@ def loop_step( state: Optional[rf.State] = None, ) -> Tuple[Dict[str, rf.Tensor], rf.State]: """step of the inner loop""" + batch_dims = enc.remaining_dims( + remove=(enc.feature_dim, enc_spatial_dim) + if enc_spatial_dim != single_step_dim + else (enc.feature_dim,) + ) + if state is None: - batch_dims = enc.remaining_dims( - remove=(enc.feature_dim, enc_spatial_dim) - if enc_spatial_dim != single_step_dim - else (enc.feature_dim,) - ) state = self.decoder_default_initial_state( batch_dims=batch_dims, enc_spatial_dim=enc_spatial_dim ) @@ -619,9 +632,9 @@ def loop_step( target, state.predictor, spatial_dim=target_spatial_dim ) - pred_out = pred_lstm.copy_swap_axes(0,1) + pred_out = pred_lstm.copy_swap_axes(0, 1) - joiner = self.joiner(enc_lin, pred_out) + joiner = 
self.joiner(enc_lin, pred_out, batch_dims=batch_dims) return {"output": joiner}, state_ @@ -746,12 +759,26 @@ def from_scratch_training( targets_mod = targets.copy() targets_mod.sparse_dim = model.target_dim_w_blank - blanks = rf.expand_dim(rf.full(dims=targets_mod.dims[:-1], fill_value=model.blank_idx, dtype=targets_mod.dtype), Dim(1)) + blanks = rf.expand_dim( + rf.full( + dims=targets_mod.dims[:-1], + fill_value=model.blank_idx, + dtype=targets_mod.dtype, + ), + Dim(1), + ) blanks.sparse_dim = model.target_dim_w_blank - targets_mod, targets_spatial_dim = rf.concat((blanks, blanks.dims[1]), (targets_mod, targets.dims[1])) + targets_mod, targets_spatial_dim = rf.concat( + (blanks, blanks.dims[1]), (targets_mod, targets.dims[1]) + ) - step_out, _ = model.loop_step(**enc_args, enc_spatial_dim=enc_spatial_dim, target=targets_mod, target_spatial_dim=targets_spatial_dim) + step_out, _ = model.loop_step( + **enc_args, + enc_spatial_dim=enc_spatial_dim, + target=targets_mod, + target_spatial_dim=targets_spatial_dim, + ) logits = step_out["output"] @@ -783,10 +810,10 @@ def from_scratch_training( dtype=logprobs.dtype, ) - num_phonemes = rf.reduce_sum(labels_len, axis=labels_len.dims[0]) - rnnt_loss.mark_as_loss( - name="rnnt", custom_inv_norm_factor=num_phonemes + name="rnnt", + custom_inv_norm_factor=targets_spatial_dim.get_size_tensor(), + use_normalized_loss=use_normalized_loss, ) # def _body(input_embed: Tensor, state: rf.State): @@ -842,6 +869,7 @@ def from_scratch_training( from_scratch_training: TrainDef[Model] from_scratch_training.learning_rate_control_error_measure = "dev_score_full_sum" + @contextlib.contextmanager def _opt_apply_pretrain_to_encoder( encoder: ConformerEncoder, diff --git a/users/gaudino/models/asr/rf/conformer_rnnt/model_recog_rnnt.py b/users/gaudino/models/asr/rf/conformer_rnnt/model_recog_rnnt.py index e47145a67..7c512166f 100644 --- a/users/gaudino/models/asr/rf/conformer_rnnt/model_recog_rnnt.py +++ 
b/users/gaudino/models/asr/rf/conformer_rnnt/model_recog_rnnt.py @@ -51,28 +51,13 @@ def _get_hypo_key(hypo: Hypothesis) -> str: return str(hypo[0]) -def _batch_state(hypos: List[Hypothesis]) -> List[List[torch.Tensor]]: # TODO - states: List[List[torch.Tensor]] = [] - for i in range(len(_get_hypo_state(hypos[0]))): - batched_state_components: List[torch.Tensor] = [] - for j in range(len(_get_hypo_state(hypos[0])[i])): - batched_state_components.append(torch.cat([_get_hypo_state(hypo)[i][j] for hypo in hypos])) - states.append(batched_state_components) - return states - - -def _slice_state(states: List[List[torch.Tensor]], idx: int, device: torch.device) -> List[List[torch.Tensor]]: # TODO - idx_tensor = torch.tensor([idx], device=device) - return [[state.index_select(0, idx_tensor) for state in state_tuple] for state_tuple in states] - - def _default_hypo_sort_key(hypo: Hypothesis) -> float: return _get_hypo_score(hypo) / (len(_get_hypo_tokens(hypo)) + 1) # is this doing length normalization ? 
def _compute_updated_scores( hypos: List[Hypothesis], - next_token_probs: Tensor, + next_token_probs: torch.Tensor, beam_width: int, ) -> Tuple[Tensor, Tensor, Tensor]: # TODO hypo_scores = torch.tensor([_get_hypo_score(h) for h in hypos]).unsqueeze(1) @@ -135,24 +120,61 @@ def model_recog( ended = rf.constant(False, dims=batch_dims_) out_seq_len = rf.constant(0, dims=batch_dims_) seq_log_prob = rf.constant(0.0, dims=batch_dims_) + batch_dim = batch_dims[0] + batch_size = batch_dim.get_dim_value() blank_idx = model.target_dim.get_dim_value() - enc_out = enc_args["enc"] + enc_out = model.encoder_out_linear(enc_args["enc"]) # TODO implement rnnt search temperature = 1.0 step_max_tokens = 100 + def _batch_state(hypos: List[Hypothesis], beam_dim: Dim) -> List[List[torch.Tensor]]: # TODO + + accum_state = rf.State() + # might be improved by not hardcoding lstm predictor + for i in range(model.predictor.num_lstm_layers): + state_h_arr = TensorArray(_get_hypo_state(hypos[0])[str(i)]["h"]) + state_c_arr = TensorArray(_get_hypo_state(hypos[0])[str(i)]["c"]) + for hypo in hypos: + state_h_arr = state_h_arr.push_back(_get_hypo_state(hypo)[str(i)]["h"]) + state_c_arr = state_c_arr.push_back(_get_hypo_state(hypo)[str(i)]["c"]) + state_lay_i = rf.State() + state_lay_i["h"] = state_h_arr.stack(axis=beam_dim) + state_lay_i["c"] = state_c_arr.stack(axis=beam_dim) + accum_state[str(i)] = state_lay_i + + return accum_state + + # states: List[List[torch.Tensor]] = [] + # for i in range(len(_get_hypo_state(hypos[0]))): + # batched_state_components: List[torch.Tensor] = [] + # for j in range(len(_get_hypo_state(hypos[0])[i])): + # batched_state_components.append(torch.cat([_get_hypo_state(hypo)[i][j] for hypo in hypos])) + # states.append(batched_state_components) + # return states + + def _slice_state(states: List[List[torch.Tensor]], idx: int, beam_dim: Dim, device: torch.device) -> List[ + List[torch.Tensor]]: # TODO + sliced_state = rf.State() + for i in 
range(model.predictor.num_lstm_layers): + sliced_state[str(i)] = rf.State() + sliced_state[str(i)]["h"] = rf.gather(states[str(i)]["h"], indices=idx, axis=beam_dim) + sliced_state[str(i)]["c"] = rf.gather(states[str(i)]["c"], indices=idx, axis=beam_dim) + return sliced_state + # return [[state.index_select(0, idx_tensor) for state in state_tuple] for state_tuple in states] + def _init_b_hypos(device: torch.device) -> List[Hypothesis]: token = blank_idx decoder_state = model.decoder_default_initial_state( - batch_dims=batch_dims_, enc_spatial_dim=enc_spatial_dim + batch_dims=[], enc_spatial_dim=enc_spatial_dim ) - blank_tensor = rf.constant(blank_idx, dims=Dim(1), sparse_dim=model.target_dim_w_blank) - pred_out, _, pred_state = model.predictor(blank_tensor, decoder_state.predictor) + blank_tensor = rf.constant(blank_idx, dims=[], sparse_dim=model.target_dim_w_blank) + pred_out, pred_state = model.predictor(blank_tensor, decoder_state.predictor) init_hypo = ( [token], pred_out, # pred_out[0].detach(), TODO: what is this doing? 
@@ -164,16 +186,28 @@ def _init_b_hypos(device: torch.device) -> List[Hypothesis]: def _gen_next_token_probs( enc_out: Tensor, hypos: List[Hypothesis], device: torch.device ) -> torch.Tensor: - one_tensor = torch.tensor([1], device=device) - predictor_out = torch.stack([_get_hypo_predictor_out(h) for h in hypos], dim=0) - joined_out, _, _ = model.join( + pred_out_template = _get_hypo_predictor_out(hypos[0]) + state_arr = TensorArray(pred_out_template) + for hypo in hypos: + state_arr = state_arr.push_back(_get_hypo_predictor_out(hypo)) + + step_beam_dim = Dim(len(hypos), name="beam") + step_u_dim = Dim(1, name="u") + predictor_out = state_arr.stack(axis=step_beam_dim) + predictor_out = rf.expand_dim(predictor_out, dim=step_u_dim) + enc_out = rf.expand_dim(enc_out, dim=step_beam_dim) + enc_out = rf.expand_dim(enc_out, dim=single_step_dim) + + joined_out = model.joiner( enc_out, - one_tensor, predictor_out, - torch.tensor([1] * len(hypos), device=device), - ) # [beam_width, 1, 1, num_tokens] - joined_out = torch.nn.functional.log_softmax(joined_out / temperature, dim=3) - return joined_out[:, 0, 0] + batch_dims=[step_beam_dim], + ) # [beam_width,1, 1, num_tokens] + joined_out = rf.log_softmax(joined_out / temperature, axis=model.target_dim_w_blank) + joined_out = rf.squeeze(joined_out, axis=single_step_dim) + joined_out = rf.squeeze(joined_out, axis=step_u_dim) + + return joined_out def _gen_b_hypos( b_hypos: List[Hypothesis], @@ -245,147 +279,114 @@ def _gen_new_hypos( t: int, device: torch.device, ) -> List[Hypothesis]: - tgt_tokens = torch.tensor([[token] for token in tokens], device=device) - states = _batch_state(base_hypos) - pred_out, _, pred_states = model.predict( + beam_dim=Dim(len(base_hypos), name="beam") + tgt_tokens_raw = torch.tensor(tokens, device=device) + tgt_tokens = rf.Tensor( + name="tgt_tokens", + dims=[beam_dim], + raw_tensor=tgt_tokens_raw, + sparse_dim=model.target_dim_w_blank, + dtype="int64", + ) + + states = _batch_state(base_hypos, 
beam_dim) + pred_out, pred_state = model.predictor( tgt_tokens, - torch.tensor([1] * len(base_hypos), device=device), states, ) new_hypos: List[Hypothesis] = [] for i, h_a in enumerate(base_hypos): new_tokens = _get_hypo_tokens(h_a) + [tokens[i]] - new_hypos.append((new_tokens, pred_out[i].detach(), _slice_state(pred_states, i, device), scores[i])) + new_hypos.append((new_tokens, rf.gather(pred_out, indices=i, axis=beam_dim), _slice_state(pred_state, i, beam_dim, device), scores[i])) # detach? return new_hypos - # TODO: call for every seq - # for enc_out in enc_out_batched: - - # from _search function - n_time_steps = enc_out.get_dim(1) device = enc_out.device + beam_width = beam_size - breakpoint() - a_hypos: List[Hypothesis] = [] - b_hypos = _init_b_hypos(device) # used for streaming: if hypo is None else hypo - for t in range(n_time_steps): - a_hypos = b_hypos - b_hypos = torch.jit.annotate(List[Hypothesis], []) - key_to_b_hypo: Dict[str, Hypothesis] = {} - symbols_current_t = 0 - - while a_hypos: - next_token_probs = _gen_next_token_probs(enc_out[:, t: t + 1], a_hypos, device) - next_token_probs = next_token_probs.cpu() - b_hypos = _gen_b_hypos(b_hypos, a_hypos, next_token_probs, key_to_b_hypo) - - if symbols_current_t == step_max_tokens: - break - - a_hypos = _gen_a_hypos( - a_hypos, - b_hypos, - next_token_probs, - t, - beam_width, - device, - ) - if a_hypos: - symbols_current_t += 1 - _, sorted_idx = torch.tensor([hypo_sort_key(hyp) for hyp in b_hypos]).topk(beam_width) - b_hypos = [b_hypos[idx] for idx in sorted_idx] + seq_log_scores_raw = torch.full((batch_size, beam_width), fill_value= 1e-30, device=device) + seq_targets_raw = [[[] for _ in range(beam_width)] for _ in range(batch_size)] - # return b_hypos - # results is in b_hypoes - # TODO: extract results + # non-batched search + for i in range(batch_size): + enc_out_i = rf.gather(enc_out, indices=i, axis=batch_dim) - return seq_targets, seq_log_prob, out_spatial_dim, beam_dim + # from _search function 
+ n_time_steps = enc_out_i.get_dim(0) - # old search code - i = 0 - seq_targets = [] - seq_backrefs = [] - while True: - # if i == 0: - # input_embed = rf.zeros( - # batch_dims_ + [model.target_embed.out_dim], - # feature_dim=model.target_embed.out_dim, - # ) - # else: - # input_embed = model.target_embed(target) - step_out, decoder_state = model.loop_step( - **enc_args, - enc_spatial_dim=enc_spatial_dim, - input_embed=target, - state=decoder_state, - ) - # logits = model.decode_logits(input_embed=input_embed, **step_out) - label_log_prob = rf.log_softmax(step_out["output"], axis=model.target_dim) - - # Filter out finished beams - label_log_prob = rf.where( - ended, - rf.sparse_to_dense( - model.eos_idx, - axis=model.target_dim, - label_value=0.0, - other_value=-1.0e30, - ), - label_log_prob, - ) - seq_log_prob = seq_log_prob + label_log_prob # Batch, InBeam, Vocab - seq_log_prob, (backrefs, target), beam_dim = rf.top_k( - seq_log_prob, - k_dim=Dim(beam_size, name=f"dec-step{i}-beam"), - axis=[beam_dim, model.target_dim], - ) # seq_log_prob, backrefs, target: Batch, Beam - seq_targets.append(target) - seq_backrefs.append(backrefs) - decoder_state = tree.map_structure( - lambda s: rf.gather(s, indices=backrefs), decoder_state - ) - ended = rf.gather(ended, indices=backrefs) - out_seq_len = rf.gather(out_seq_len, indices=backrefs) - i += 1 + a_hypos: List[Hypothesis] = [] + b_hypos = _init_b_hypos(device) # used for streaming: if hypo is None else hypo + for t in range(n_time_steps): + a_hypos = b_hypos + b_hypos = torch.jit.annotate(List[Hypothesis], []) + key_to_b_hypo: Dict[str, Hypothesis] = {} + symbols_current_t = 0 - ended = rf.logical_or(ended, target == model.eos_idx) - ended = rf.logical_or(ended, rf.copy_to_device(i >= max_seq_len)) - if bool(rf.reduce_all(ended, axis=ended.dims).raw_tensor): - break - out_seq_len = out_seq_len + rf.where(ended, 0, 1) - - if i > 1 and length_normalization_exponent != 0: - # Length-normalized scores, so we evaluate 
score_t/len. - # If seq ended, score_i/i == score_{i-1}/(i-1), thus score_i = score_{i-1}*(i/(i-1)) - # Because we count with EOS symbol, shifted by one. - seq_log_prob *= rf.where( - ended, - (i / (i - 1)) ** length_normalization_exponent, - 1.0, - ) + while a_hypos: + next_token_probs = _gen_next_token_probs(rf.gather(enc_out_i, indices=t, axis=enc_out_i.dims[0]), a_hypos, device) + next_token_probs = rf.copy_to_device(next_token_probs, "cpu") + b_hypos = _gen_b_hypos(b_hypos, a_hypos, next_token_probs.raw_tensor, key_to_b_hypo) - if i > 0 and length_normalization_exponent != 0: - seq_log_prob *= (1 / i) ** length_normalization_exponent + if symbols_current_t == step_max_tokens: + break - # Backtrack via backrefs, resolve beams. - seq_targets_ = [] - indices = rf.range_over_dim(beam_dim) # FinalBeam -> FinalBeam - for backrefs, target in zip(seq_backrefs[::-1], seq_targets[::-1]): - # indices: FinalBeam -> Beam - # backrefs: Beam -> PrevBeam - seq_targets_.insert(0, rf.gather(target, indices=indices)) - indices = rf.gather(backrefs, indices=indices) # FinalBeam -> PrevBeam + a_hypos = _gen_a_hypos( + a_hypos, + b_hypos, + next_token_probs.raw_tensor, + t, + beam_width, + device, + ) + if a_hypos: + symbols_current_t += 1 + + _, sorted_idx = torch.tensor([_default_hypo_sort_key(hyp) for hyp in b_hypos]).topk(beam_width) + b_hypos = [b_hypos[idx] for idx in sorted_idx] - seq_targets__ = TensorArray(seq_targets_[0]) - for target in seq_targets_: - seq_targets__ = seq_targets__.push_back(target) - out_spatial_dim = Dim(out_seq_len, name="out-spatial") - seq_targets = seq_targets__.stack(axis=out_spatial_dim) + seq_log_scores_raw[i] = torch.tensor([_get_hypo_score(hypo) for hypo in b_hypos]) + for j, hypo in enumerate(b_hypos): + seq_targets_raw[i][j] = _get_hypo_tokens(hypo) + + # pad targets to max length + # how to create a dynamic tensor? 
+ lens = [] + for i in range(batch_size): + for j in range(beam_width): + lens.append(len(seq_targets_raw[i][j])-1) + max_hyp_len = max(lens) # first blank token will be removed + + seq_targets_raw_padded = torch.full((batch_size, beam_width, max_hyp_len), fill_value=model.eos_idx, device=device) + for i in range(batch_size): + for j in range(beam_width): + seq_targets_raw_padded[i, j, :len(seq_targets_raw[i][j])-1] = torch.tensor(seq_targets_raw[i][j][1:]) # remove first blank token + + out_spatial_dim = Dim(max_hyp_len, name="out-spatial") + beam_dim = Dim(beam_width, name="beam") + out_spatial_dim.dyn_size_ext = rf.Tensor( + name="out_seq_lens", + dims=[batch_dim, beam_dim], + raw_tensor=torch.tensor(lens, dtype=torch.int32).view(batch_size, beam_width), + dtype="int32", + ) + + seq_targets = rf.Tensor( + name="seq_targets", + dims=[batch_dim, beam_dim, out_spatial_dim], + raw_tensor=seq_targets_raw_padded, + dtype="int64", + sparse_dim=model.target_dim, + ) + + seq_log_prob = rf.Tensor( + name="seq_log_prob", + dims=[batch_dim, beam_dim], + raw_tensor=seq_log_scores_raw, + dtype="float32", + ) return seq_targets, seq_log_prob, out_spatial_dim, beam_dim - # RecogDef API model_recog: RecogDef[Model] model_recog.output_with_beam = True @@ -399,78 +400,78 @@ def _gen_new_hypos( ### Copied from torchaudio # TODO: Adapt to rf -from typing import Callable, Dict, List, Optional, Tuple - -import torch -from torchaudio.models import RNNT - - -__all__ = ["Hypothesis", "RNNTBeamSearch"] - - -Hypothesis = Tuple[List[int], torch.Tensor, List[List[torch.Tensor]], float] -Hypothesis.__doc__ = """Hypothesis generated by RNN-T beam search decoder, - represented as tuple of (tokens, prediction network output, prediction network state, score). 
- """ - - -def _get_hypo_tokens(hypo: Hypothesis) -> List[int]: - return hypo[0] - - -def _get_hypo_predictor_out(hypo: Hypothesis) -> torch.Tensor: - return hypo[1] - - -def _get_hypo_state(hypo: Hypothesis) -> List[List[torch.Tensor]]: - return hypo[2] - - -def _get_hypo_score(hypo: Hypothesis) -> float: - return hypo[3] - - -def _get_hypo_key(hypo: Hypothesis) -> str: - return str(hypo[0]) - - -def _batch_state(hypos: List[Hypothesis]) -> List[List[torch.Tensor]]: - states: List[List[torch.Tensor]] = [] - for i in range(len(_get_hypo_state(hypos[0]))): - batched_state_components: List[torch.Tensor] = [] - for j in range(len(_get_hypo_state(hypos[0])[i])): - batched_state_components.append(torch.cat([_get_hypo_state(hypo)[i][j] for hypo in hypos])) - states.append(batched_state_components) - return states - - -def _slice_state(states: List[List[torch.Tensor]], idx: int, device: torch.device) -> List[List[torch.Tensor]]: - idx_tensor = torch.tensor([idx], device=device) - return [[state.index_select(0, idx_tensor) for state in state_tuple] for state_tuple in states] - - -def _default_hypo_sort_key(hypo: Hypothesis) -> float: - return _get_hypo_score(hypo) / (len(_get_hypo_tokens(hypo)) + 1) - - -def _compute_updated_scores( - hypos: List[Hypothesis], - next_token_probs: torch.Tensor, - beam_width: int, -) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - hypo_scores = torch.tensor([_get_hypo_score(h) for h in hypos]).unsqueeze(1) - nonblank_scores = hypo_scores + next_token_probs[:, :-1] # [beam_width, num_tokens - 1] - nonblank_nbest_scores, nonblank_nbest_idx = nonblank_scores.reshape(-1).topk(beam_width) - nonblank_nbest_hypo_idx = nonblank_nbest_idx.div(nonblank_scores.shape[1], rounding_mode="trunc") - nonblank_nbest_token = nonblank_nbest_idx % nonblank_scores.shape[1] - return nonblank_nbest_scores, nonblank_nbest_hypo_idx, nonblank_nbest_token - - -def _remove_hypo(hypo: Hypothesis, hypo_list: List[Hypothesis]) -> None: - for i, elem in 
enumerate(hypo_list): - if _get_hypo_key(hypo) == _get_hypo_key(elem): - del hypo_list[i] - break +# from typing import Callable, Dict, List, Optional, Tuple +# +# import torch +# from torchaudio.models import RNNT +# +# +# __all__ = ["Hypothesis", "RNNTBeamSearch"] +# +# +# Hypothesis = Tuple[List[int], torch.Tensor, List[List[torch.Tensor]], float] +# Hypothesis.__doc__ = """Hypothesis generated by RNN-T beam search decoder, +# represented as tuple of (tokens, prediction network output, prediction network state, score). +# """ +# +# +# def _get_hypo_tokens(hypo: Hypothesis) -> List[int]: +# return hypo[0] +# +# +# def _get_hypo_predictor_out(hypo: Hypothesis) -> torch.Tensor: +# return hypo[1] +# +# +# def _get_hypo_state(hypo: Hypothesis) -> List[List[torch.Tensor]]: +# return hypo[2] +# +# +# def _get_hypo_score(hypo: Hypothesis) -> float: +# return hypo[3] +# +# +# def _get_hypo_key(hypo: Hypothesis) -> str: +# return str(hypo[0]) +# +# +# def _batch_state(hypos: List[Hypothesis]) -> List[List[torch.Tensor]]: +# states: List[List[torch.Tensor]] = [] +# for i in range(len(_get_hypo_state(hypos[0]))): +# batched_state_components: List[torch.Tensor] = [] +# for j in range(len(_get_hypo_state(hypos[0])[i])): +# batched_state_components.append(torch.cat([_get_hypo_state(hypo)[i][j] for hypo in hypos])) +# states.append(batched_state_components) +# return states +# +# +# def _slice_state(states: List[List[torch.Tensor]], idx: int, device: torch.device) -> List[List[torch.Tensor]]: +# idx_tensor = torch.tensor([idx], device=device) +# return [[state.index_select(0, idx_tensor) for state in state_tuple] for state_tuple in states] +# +# +# def _default_hypo_sort_key(hypo: Hypothesis) -> float: +# return _get_hypo_score(hypo) / (len(_get_hypo_tokens(hypo)) + 1) +# +# +# def _compute_updated_scores( +# hypos: List[Hypothesis], +# next_token_probs: torch.Tensor, +# beam_width: int, +# ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: +# hypo_scores = 
torch.tensor([_get_hypo_score(h) for h in hypos]).unsqueeze(1) +# nonblank_scores = hypo_scores + next_token_probs[:, :-1] # [beam_width, num_tokens - 1] +# nonblank_nbest_scores, nonblank_nbest_idx = nonblank_scores.reshape(-1).topk(beam_width) +# nonblank_nbest_hypo_idx = nonblank_nbest_idx.div(nonblank_scores.shape[1], rounding_mode="trunc") +# nonblank_nbest_token = nonblank_nbest_idx % nonblank_scores.shape[1] +# return nonblank_nbest_scores, nonblank_nbest_hypo_idx, nonblank_nbest_token +# +# +# def _remove_hypo(hypo: Hypothesis, hypo_list: List[Hypothesis]) -> None: +# for i, elem in enumerate(hypo_list): +# if _get_hypo_key(hypo) == _get_hypo_key(elem): +# del hypo_list[i] +# break class RNNTBeamSearch(torch.nn.Module): From 00219c564f91d73111e91eebbe36e518e5844ac7 Mon Sep 17 00:00:00 2001 From: Simon Berger Date: Thu, 13 Jun 2024 16:21:09 +0200 Subject: [PATCH 165/227] Update users/berger --- .../__init__.py | 2 + .../config_02b_transducer_wei_data.py | 301 +++++++++----- ...config_02c_transducer_wei_data_tinaconf.py | 56 ++- ...onfig_02d_transducer_wei_data_am_scales.py | 379 ++++++++++++++++++ .../config_03b_transducer_fullsum_wei_data.py | 28 +- users/berger/corpus/switchboard/lm_data.py | 10 +- 6 files changed, 650 insertions(+), 126 deletions(-) create mode 100644 users/berger/configs/switchboard/20240202_phoneme_transducer_pipeline/config_02d_transducer_wei_data_am_scales.py diff --git a/users/berger/configs/switchboard/20240202_phoneme_transducer_pipeline/__init__.py b/users/berger/configs/switchboard/20240202_phoneme_transducer_pipeline/__init__.py index 4715643f8..4a6c7734c 100644 --- a/users/berger/configs/switchboard/20240202_phoneme_transducer_pipeline/__init__.py +++ b/users/berger/configs/switchboard/20240202_phoneme_transducer_pipeline/__init__.py @@ -9,6 +9,7 @@ from .config_01c_ctc_blstm_wei_data import py as py_01c from .config_02b_transducer_wei_data import py as py_02b from .config_02c_transducer_wei_data_tinaconf import py as py_02c 
+from .config_02d_transducer_wei_data_am_scales import py as py_02d from .config_03b_transducer_fullsum_wei_data import py as py_03b @@ -21,6 +22,7 @@ def main() -> SummaryReport: py_01c()[0], py_02b()[0], py_02c()[0], + py_02d()[0], py_03b(), ]: subreport = copy.deepcopy(subreport) diff --git a/users/berger/configs/switchboard/20240202_phoneme_transducer_pipeline/config_02b_transducer_wei_data.py b/users/berger/configs/switchboard/20240202_phoneme_transducer_pipeline/config_02b_transducer_wei_data.py index 9ec52b39d..d90590e69 100644 --- a/users/berger/configs/switchboard/20240202_phoneme_transducer_pipeline/config_02b_transducer_wei_data.py +++ b/users/berger/configs/switchboard/20240202_phoneme_transducer_pipeline/config_02b_transducer_wei_data.py @@ -180,7 +180,9 @@ def generate_returnn_config( return returnn_config -def run_exp(alignments: Dict[str, AlignmentData], name_suffix: str = "") -> Tuple[SummaryReport, Checkpoint]: +def run_exp( + alignments: Dict[str, AlignmentData], name_suffix: str = "" +) -> Tuple[SummaryReport, Checkpoint]: assert tools.returnn_root is not None assert tools.returnn_python_exe is not None assert tools.rasr_binary_path is not None @@ -306,134 +308,225 @@ def run_exp(alignments: Dict[str, AlignmentData], name_suffix: str = "") -> Tupl train_data_config=data.train_data_config, dev_data_config=data.cv_data_config, ) - for ilm_scale in [0.0, 0.2] + for ilm_scale in [0.0, 0.1, 0.15, 0.2, 0.25, 0.3] }, ), ) system.run_train_step(**train_args) - system.run_dev_recog_step(**recog_args) - recog_args.update( - { - "lm_scales": [0.6, 0.7], - "epochs": [ - 213, - 249, - 261, - 279, - 283, - 283, - 284, - 285, - 286, - 289, - 291, - 297, - 298, - 299, - 300, + if False: + recog_args.update( + { + "lm_scales": [0.6, 0.7], + "epochs": [ + 213, + 249, + 261, + 279, + 283, + 283, + 284, + 285, + 286, + 289, + 291, + 297, + 298, + 299, + 300, + ], + } + ) + system.run_dev_recog_step( + exp_names=[ + 
f"Conformer_Transducer_Viterbi_wei-data_{name_suffix}_lr-0.0008" ], - } - ) + recog_exp_names=["recog_ilm-0.1", "recog_ilm-0.2"], + **recog_args, + ) + + recog_args.update({"lm_scales": [0.6], "epochs": [298]}) system.run_dev_recog_step( exp_names=[f"Conformer_Transducer_Viterbi_wei-data_{name_suffix}_lr-0.0008"], - recog_exp_names=["recog_ilm-0.1", "recog_ilm-0.2"], + recog_exp_names=["recog_ilm-0.2"], **recog_args, ) - recog_args.update( - { - "epochs": [ - 213, - 249, - 261, - 267, - 273, - 276, - 279, - 280, - 281, - 282, - 283, - 284, - 285, - 286, - 289, - 291, - 298, - 298, - 299, - 300, - ] - } - ) + if False: + recog_args.update( + { + "epochs": [ + 213, + 249, + 261, + 267, + 273, + 276, + 279, + 280, + 281, + 282, + 283, + 284, + 285, + 286, + 289, + 291, + 298, + 298, + 299, + 300, + ] + } + ) + system.run_dev_recog_step( + exp_names=[ + f"Conformer_Transducer_Viterbi_wei-data_{name_suffix}_lr-0.0008_loss-boost" + ], + recog_exp_names=["recog_ilm-0.1", "recog_ilm-0.2"], + **recog_args, + ) + + recog_args.update({"epochs": [300], "lm_scales": [0.7]}) system.run_dev_recog_step( - exp_names=[f"Conformer_Transducer_Viterbi_wei-data_{name_suffix}_lr-0.0008_loss-boost"], - recog_exp_names=["recog_ilm-0.1", "recog_ilm-0.2"], + exp_names=[ + f"Conformer_Transducer_Viterbi_wei-data_{name_suffix}_lr-0.0008_loss-boost" + ], + recog_exp_names=["recog_ilm-0.1"], **recog_args, ) - recog_args.update( - { - "epochs": [ - 213, - 249, - 261, - 267, - 273, - 279, - 284, - 285, - 289, - 291, - 297, - 298, - 299, - 300, - ] - } - ) + if False: + recog_args.update( + { + "epochs": [ + 213, + 249, + 261, + 267, + 273, + 279, + 284, + 285, + 289, + 291, + 297, + 298, + 299, + 300, + ] + } + ) + system.run_dev_recog_step( + exp_names=[ + f"Conformer_Transducer_Viterbi_wei-data_{name_suffix}_lr-0.0008_ls-0.2" + ], + recog_exp_names=["recog_ilm-0.1", "recog_ilm-0.2"], + **recog_args, + ) + + recog_args.update({"epochs": [300], "lm_scales": [0.4]}) system.run_dev_recog_step( - 
exp_names=[f"Conformer_Transducer_Viterbi_wei-data_{name_suffix}_lr-0.0008_ls-0.2"], - recog_exp_names=["recog_ilm-0.1", "recog_ilm-0.2"], + exp_names=[ + f"Conformer_Transducer_Viterbi_wei-data_{name_suffix}_lr-0.0008_ls-0.2" + ], + recog_exp_names=["recog_ilm-0.1"], **recog_args, ) - recog_args.update( - { - "epochs": [ - 213, - 249, - 261, - 267, - 273, - 274, - 279, - 280, - 281, - 282, - 283, - 284, - 285, - 286, - 289, - 291, - 297, - 298, - 299, - 300, - ] - } + if False: + recog_args.update( + { + "epochs": [ + 213, + 249, + 261, + 267, + 273, + 274, + 279, + 280, + 281, + 282, + 283, + 284, + 285, + 286, + 289, + 291, + 297, + 298, + 299, + 300, + ] + } + ) + system.run_dev_recog_step( + exp_names=[ + f"Conformer_Transducer_Viterbi_wei-data_{name_suffix}_lr-0.0008_ls-0.2_loss-boost" + ], + recog_exp_names=["recog_ilm-0.1", "recog_ilm-0.2"], + **recog_args, + ) + + recog_args.update({"epochs": [291], "lm_scales": [0.7]}) + system.run_dev_recog_step( + exp_names=[ + f"Conformer_Transducer_Viterbi_wei-data_{name_suffix}_lr-0.0008_ls-0.2_loss-boost" + ], + recog_exp_names=["recog_ilm-0.1"], + **recog_args, ) + + if False: + recog_args.update( + { + "epochs": [ + 213, + 225, + 249, + 261, + 280, + 281, + 285, + 286, + 289, + 292, + 297, + 298, + 299, + 300, + ], + "lm_scales": [0.4, 0.45, 0.5, 0.55, 0.6, 0.65, 0.7], + } + ) + system.run_dev_recog_step( + exp_names=[ + f"Conformer_Transducer_Viterbi_wei-data_specaug-v2_{name_suffix}" + ], + recog_exp_names=[ + "recog_ilm-0.0", + "recog_ilm-0.1", + "recog_ilm-0.15", + "recog_ilm-0.2", + "recog_ilm-0.25", + "recog_ilm-0.3", + ], + **recog_args, + ) + + recog_args.update({"epochs": [292], "lm_scales": [0.45]}) system.run_dev_recog_step( - exp_names=[f"Conformer_Transducer_Viterbi_wei-data_{name_suffix}_lr-0.0008_ls-0.2_loss-boost"], - recog_exp_names=["recog_ilm-0.1", "recog_ilm-0.2"], + exp_names=[f"Conformer_Transducer_Viterbi_wei-data_specaug-v2_{name_suffix}"], + recog_exp_names=["recog_ilm-0.15"], 
**recog_args, ) - train_job = system.get_train_job(f"Conformer_Transducer_Viterbi_wei-data_{name_suffix}_lr-0.0008") - model = train_job.out_checkpoints[298] + train_job = system.get_train_job( + f"Conformer_Transducer_Viterbi_wei-data_specaug-v2_{name_suffix}" + ) + model = train_job.out_checkpoints[292] assert isinstance(model, Checkpoint) assert system.summary_report diff --git a/users/berger/configs/switchboard/20240202_phoneme_transducer_pipeline/config_02c_transducer_wei_data_tinaconf.py b/users/berger/configs/switchboard/20240202_phoneme_transducer_pipeline/config_02c_transducer_wei_data_tinaconf.py index 5ad7a0e0e..5e304575b 100644 --- a/users/berger/configs/switchboard/20240202_phoneme_transducer_pipeline/config_02c_transducer_wei_data_tinaconf.py +++ b/users/berger/configs/switchboard/20240202_phoneme_transducer_pipeline/config_02c_transducer_wei_data_tinaconf.py @@ -16,11 +16,17 @@ from i6_experiments.users.berger.systems.returnn_seq2seq_system import ( ReturnnSeq2SeqSystem, ) -from i6_experiments.users.berger.systems.dataclasses import ReturnnConfigs, FeatureType, SummaryKey +from i6_experiments.users.berger.systems.dataclasses import ( + ReturnnConfigs, + FeatureType, + SummaryKey, +) from i6_experiments.users.berger.util import default_tools, recursive_update from i6_private.users.vieting.helpers.returnn import serialize_dim_tags from i6_experiments.users.berger.systems.dataclasses import AlignmentData -from i6_experiments.users.berger.corpus.switchboard.viterbi_transducer_data import get_switchboard_data +from i6_experiments.users.berger.corpus.switchboard.viterbi_transducer_data import ( + get_switchboard_data, +) from .config_01c_ctc_blstm_wei_data import py as py_ctc_blstm from sisyphus import gs, tk @@ -151,7 +157,9 @@ def generate_returnn_config( return returnn_config -def run_exp(alignments: Dict[str, AlignmentData], name_suffix: str = "") -> Tuple[SummaryReport, Checkpoint]: +def run_exp( + alignments: Dict[str, AlignmentData], name_suffix: 
str = "" +) -> Tuple[SummaryReport, Checkpoint]: assert tools.returnn_root is not None assert tools.returnn_python_exe is not None assert tools.rasr_binary_path is not None @@ -277,7 +285,9 @@ def run_exp(alignments: Dict[str, AlignmentData], name_suffix: str = "") -> Tupl ) system.run_dev_recog_step( - exp_names=["Conformer_Transducer_Viterbi_wei-data_tinaconf_align-blstm-am-1.0_lr-0.0008"], + exp_names=[ + "Conformer_Transducer_Viterbi_wei-data_tinaconf_align-blstm-am-1.0_lr-0.0008" + ], recog_descriptor=f"bp-{bp}", **recog_args, ) @@ -288,26 +298,46 @@ def run_exp(alignments: Dict[str, AlignmentData], name_suffix: str = "") -> Tupl { "epochs": [300], "lm_scales": [0.8], - "search_parameters": {"blank-label-penalty": 1.0, "label-pruning": lp}, + "search_parameters": { + "blank-label-penalty": 1.0, + "label-pruning": lp, + }, }, ) system.run_dev_recog_step( - exp_names=["Conformer_Transducer_Viterbi_wei-data_tinaconf_align-blstm-am-1.0_lr-0.0008"], + exp_names=[ + "Conformer_Transducer_Viterbi_wei-data_tinaconf_align-blstm-am-1.0_lr-0.0008" + ], recog_exp_names={ - "Conformer_Transducer_Viterbi_wei-data_tinaconf_align-blstm-am-1.0_lr-0.0008": ["recog_ilm-0.1"] + "Conformer_Transducer_Viterbi_wei-data_tinaconf_align-blstm-am-1.0_lr-0.0008": [ + "recog_ilm-0.1" + ] }, recog_descriptor=f"lp-{lp}", **recog_args, ) recursive_update( - recog_args, {"epochs": [300], "lm_scales": [0.8], "search_parameters": {"blank-label-penalty": 1.0}} + recog_args, + { + "epochs": [300], + "lm_scales": [0.8], + "search_parameters": {"blank-label-penalty": 1.0}, + }, + ) + system.run_dev_recog_step( + recog_exp_names={key: ["recog_ilm-0.1"] for key in system.get_exp_names()}, + **recog_args, + ) + system.run_test_recog_step( + recog_exp_names={key: ["recog_ilm-0.1"] for key in system.get_exp_names()}, + **recog_args, ) - system.run_dev_recog_step(recog_exp_names={key: ["recog_ilm-0.1"] for key in system.get_exp_names()}, **recog_args) - system.run_test_recog_step(recog_exp_names={key: 
["recog_ilm-0.1"] for key in system.get_exp_names()}, **recog_args) - train_job = system.get_train_job(f"Conformer_Transducer_Viterbi_wei-data_tinaconf_{name_suffix}_lr-0.0008") + train_job = system.get_train_job( + f"Conformer_Transducer_Viterbi_wei-data_tinaconf_{name_suffix}_lr-0.0008" + ) model = train_job.out_checkpoints[300] assert isinstance(model, Checkpoint) @@ -326,7 +356,9 @@ def py() -> Tuple[SummaryReport, Dict[str, Checkpoint]]: for align_model_name, alignments in alignments_blstm.items(): am_scale_pos = align_model_name.find("am-") - align_model_name = "blstm-" + align_model_name[am_scale_pos : am_scale_pos + len("am-1.0")] + align_model_name = ( + "blstm-" + align_model_name[am_scale_pos : am_scale_pos + len("am-1.0")] + ) sub_report, model = run_exp(alignments, name_suffix=f"align-{align_model_name}") models[align_model_name] = model summary_report.merge_report(sub_report, update_structure=True) diff --git a/users/berger/configs/switchboard/20240202_phoneme_transducer_pipeline/config_02d_transducer_wei_data_am_scales.py b/users/berger/configs/switchboard/20240202_phoneme_transducer_pipeline/config_02d_transducer_wei_data_am_scales.py new file mode 100644 index 000000000..2fe0411fe --- /dev/null +++ b/users/berger/configs/switchboard/20240202_phoneme_transducer_pipeline/config_02d_transducer_wei_data_am_scales.py @@ -0,0 +1,379 @@ +import copy +import os +from typing import Dict, Tuple + +import i6_core.rasr as rasr +from i6_core.recognition import Hub5ScoreJob +from i6_core.returnn import Checkpoint +from i6_core.returnn.config import ReturnnConfig +from i6_experiments.users.berger.args.experiments import transducer as exp_args +from i6_experiments.users.berger.args.returnn.config import get_returnn_config +from i6_experiments.users.berger.args.returnn.learning_rates import ( + LearningRateSchedules, +) +import i6_experiments.users.berger.network.models.context_1_transducer as transducer_model +from i6_experiments.users.berger.recipe.summary.report 
import SummaryReport +from i6_experiments.users.berger.systems.returnn_seq2seq_system import ( + ReturnnSeq2SeqSystem, +) +from i6_experiments.users.berger.systems.dataclasses import ( + ReturnnConfigs, + FeatureType, + SummaryKey, +) +from i6_experiments.users.berger.util import default_tools +from i6_private.users.vieting.helpers.returnn import serialize_dim_tags +from i6_experiments.users.berger.systems.dataclasses import AlignmentData +from i6_experiments.users.berger.corpus.switchboard.viterbi_transducer_data import ( + get_switchboard_data, +) +from .config_01c_ctc_blstm_wei_data import py as py_ctc_blstm +from sisyphus import gs, tk + +tools = copy.deepcopy(default_tools) + +# ********** Settings ********** + +rasr.flow.FlowNetwork.default_flags = {"cache_mode": "task_dependent"} + + +num_classes = 88 + + +# ********** Return Config ********** + + +def generate_returnn_config( + train: bool, + *, + train_data_config: dict, + dev_data_config: dict, + **kwargs, +) -> ReturnnConfig: + specaug_v2 = kwargs.get("specaug_v2", False) + + if specaug_v2: + specaug_args = { + "min_reps_time": 0, + "max_reps_time": 20, + "max_len_time": 20, + "min_reps_feature": 0, + "max_reps_feature": 1, + "max_len_feature": 15, + } + else: + specaug_args = { + "max_time_num": 1, + "max_time": 15, + "max_feature_num": 5, + "max_feature": 4, + } + + if train: + ( + network_dict, + extra_python, + ) = transducer_model.make_context_1_conformer_transducer( + num_outputs=num_classes, + specaug_args=specaug_args, + conformer_args={ + "num_blocks": 12, + "size": 512, + "dropout": 0.1, + "l2": 5e-06, + }, + decoder_args={ + "dec_mlp_args": { + "num_layers": 2, + "size": 640, + "activation": "tanh", + "dropout": 0.1, + "l2": 5e-06, + }, + "combination_mode": "concat", + "joint_mlp_args": { + "num_layers": 1, + "size": 1024, + "dropout": 0.1, + "l2": 5e-06, + "activation": "tanh", + }, + }, + output_args={ + "label_smoothing": kwargs.get("label_smoothing", None), + }, + 
loss_boost_scale=kwargs.get("loss_boost_scale", 5.0), + loss_boost_v2=kwargs.get("loss_boost_v2", False), + specaug_v2=specaug_v2, + ) + else: + ( + network_dict, + extra_python, + ) = transducer_model.make_context_1_conformer_transducer_recog( + num_outputs=num_classes, + conformer_args={ + "num_blocks": 12, + "size": 512, + "dropout": 0.1, + "l2": 5e-06, + }, + decoder_args={ + "dec_mlp_args": { + "num_layers": 2, + "size": 640, + "activation": "tanh", + }, + "combination_mode": "concat", + "joint_mlp_args": { + "num_layers": 1, + "size": 1024, + "activation": "tanh", + }, + "ilm_scale": kwargs.get("ilm_scale", 0.0), + }, + ) + + extra_config = { + "train": train_data_config, + "dev": dev_data_config, + "chunking": ( + { + "data": 256, + "classes": 64, + }, + { + "data": 128, + "classes": 32, + }, + ), + } + + if kwargs.get("model_preload", None) is not None: + extra_config["preload_from_files"] = { + "base": { + "init_for_train": True, + "ignore_missing": True, + "filename": kwargs.get("model_preload", None), + } + } + + returnn_config = get_returnn_config( + network=network_dict, + target="classes", + num_epochs=300, + python_prolog=[ + "import sys", + "sys.setrecursionlimit(10 ** 6)", + ], + extra_python=extra_python, + num_inputs=40, + num_outputs=num_classes, + extern_target_kwargs={"dtype": "int8" if train else "int32"}, + extern_data_config=True, + grad_noise=0.0, + grad_clip=0.0, + schedule=LearningRateSchedules.OCLR, + initial_lr=1e-05, + peak_lr=kwargs.get("peak_lr", 8e-04), + final_lr=1e-06, + n_steps_per_epoch=3210, + batch_size=12500, + extra_config=extra_config, + ) + returnn_config = serialize_dim_tags(returnn_config) + + return returnn_config + + +def run_exp( + alignments: Dict[str, AlignmentData], name_suffix: str = "" +) -> Tuple[SummaryReport, Checkpoint]: + assert tools.returnn_root is not None + assert tools.returnn_python_exe is not None + assert tools.rasr_binary_path is not None + + data = get_switchboard_data( + tools.returnn_root, + 
tools.returnn_python_exe, + rasr_binary_path=tools.rasr_binary_path, + alignments=alignments, + use_wei_data=True, + test_keys=["hub5e01"], + feature_type=FeatureType.GAMMATONE_8K, + dc_detection=True, + ) + + # ********** System ********** + + system = ReturnnSeq2SeqSystem( + tools, + summary_keys=[ + SummaryKey.TRAIN_NAME, + SummaryKey.CORPUS, + SummaryKey.RECOG_NAME, + SummaryKey.EPOCH, + SummaryKey.LM, + SummaryKey.WER, + SummaryKey.SUB, + SummaryKey.INS, + SummaryKey.DEL, + SummaryKey.ERR, + ], + summary_sort_keys=[SummaryKey.ERR, SummaryKey.CORPUS], + ) + + # ********** Step args ********** + + train_args = exp_args.get_transducer_train_step_args( + num_epochs=300, + ) + + recog_args = exp_args.get_transducer_recog_step_args( + num_classes, + lm_scales=[0.4, 0.45, 0.5], + epochs=list(range(290, 301)), + search_parameters={"label-pruning": 14.4}, + feature_type=FeatureType.GAMMATONE_8K, + reduction_factor=4, + reduction_subtrahend=0, + flow_args={"dc_detection": True}, + ) + recog_am_args = copy.deepcopy(exp_args.transducer_recog_am_args) + recog_am_args.update( + { + # "state_tying": "lookup", + # "state_tying_file": tk.Path("/work/asr4/berger/dependencies/switchboard/state_tying/wei_mono-eow"), + "tying_type": "global-and-nonword", + "nonword_phones": ["[NOISE]", "[VOCALIZEDNOISE]", "[LAUGHTER]"], + } + ) + + system.init_corpora( + dev_keys=data.dev_keys, + test_keys=data.test_keys, + align_keys=data.align_keys, + corpus_data=data.data_inputs, + am_args=recog_am_args, + ) + system.setup_scoring(scorer_type=Hub5ScoreJob) + + # ********** Returnn Configs ********** + + for lr in [8e-04]: + for label_smoothing in [None]: + for loss_boost_scale in [0.0]: + train_config = generate_returnn_config( + train=True, + train_data_config=data.train_data_config, + dev_data_config=data.cv_data_config, + peak_lr=lr, + label_smoothing=label_smoothing, + loss_boost_v2=True, + loss_boost_scale=loss_boost_scale, + model_preload=None, + specaug_v2=True, + ) + + returnn_configs = 
ReturnnConfigs( + train_config=train_config, + recog_configs={ + f"recog_ilm-{ilm_scale}": generate_returnn_config( + train=False, + ilm_scale=ilm_scale, + train_data_config=data.train_data_config, + dev_data_config=data.cv_data_config, + ) + for ilm_scale in [0.0, 0.1, 0.15, 0.2, 0.3] + }, + ) + name = f"Conformer_Transducer_Viterbi_wei-data_{name_suffix}_lr-{lr}" + if label_smoothing: + name += f"_ls-{label_smoothing}" + if loss_boost_scale: + name += "_loss-boost" + + system.add_experiment_configs(name, returnn_configs) + + system.run_train_step(**train_args) + + if False: + system.run_dev_recog_step(**recog_args) + + if "am-1.0" in name_suffix: + recog_args.update( + { + "lm_scales": [0.5], + "epochs": [291], + } + ) + system.run_dev_recog_step(recog_exp_names=["recog_ilm-0.2"], **recog_args) + system.run_test_recog_step(recog_exp_names=["recog_ilm-0.2"], **recog_args) + elif "am-0.7" in name_suffix: + recog_args.update( + { + "lm_scales": [0.5], + "epochs": [291], + } + ) + system.run_dev_recog_step(recog_exp_names=["recog_ilm-0.3"], **recog_args) + system.run_test_recog_step(recog_exp_names=["recog_ilm-0.3"], **recog_args) + elif "am-0.5" in name_suffix: + recog_args.update( + { + "lm_scales": [0.4], + "epochs": [297], + } + ) + system.run_dev_recog_step(recog_exp_names=["recog_ilm-0.3"], **recog_args) + system.run_test_recog_step(recog_exp_names=["recog_ilm-0.3"], **recog_args) + elif "am-0.3" in name_suffix: + recog_args.update( + { + "lm_scales": [0.4], + "epochs": [297], + } + ) + system.run_dev_recog_step(recog_exp_names=["recog_ilm-0.3"], **recog_args) + system.run_test_recog_step(recog_exp_names=["recog_ilm-0.3"], **recog_args) + elif "am-0.1" in name_suffix: + recog_args.update( + { + "lm_scales": [0.4], + "epochs": [297], + } + ) + system.run_dev_recog_step(recog_exp_names=["recog_ilm-0.3"], **recog_args) + system.run_test_recog_step(recog_exp_names=["recog_ilm-0.3"], **recog_args) + + train_job = system.get_train_job( + 
f"Conformer_Transducer_Viterbi_wei-data_{name_suffix}_lr-0.0008" + ) + model = train_job.out_checkpoints[300] + assert isinstance(model, Checkpoint) + + assert system.summary_report + return system.summary_report, model + + +def py() -> Tuple[SummaryReport, Dict[str, Checkpoint]]: + _, alignments_blstm = py_ctc_blstm() + + filename_handle = os.path.splitext(os.path.basename(__file__))[0][len("config_") :] + gs.ALIAS_AND_OUTPUT_SUBDIR = f"{filename_handle}/" + + summary_report = SummaryReport() + models = {} + + for align_model_name, alignments in alignments_blstm.items(): + am_scale_pos = align_model_name.find("am-") + align_model_name = ( + "blstm-" + align_model_name[am_scale_pos : am_scale_pos + len("am-1.0")] + ) + sub_report, model = run_exp(alignments, name_suffix=f"align-{align_model_name}") + models[align_model_name] = model + summary_report.merge_report(sub_report, update_structure=True) + + tk.register_report(f"{gs.ALIAS_AND_OUTPUT_SUBDIR}/summary.report", summary_report) + + return summary_report, models diff --git a/users/berger/configs/switchboard/20240202_phoneme_transducer_pipeline/config_03b_transducer_fullsum_wei_data.py b/users/berger/configs/switchboard/20240202_phoneme_transducer_pipeline/config_03b_transducer_fullsum_wei_data.py index 6e103088c..e9487f3cc 100644 --- a/users/berger/configs/switchboard/20240202_phoneme_transducer_pipeline/config_03b_transducer_fullsum_wei_data.py +++ b/users/berger/configs/switchboard/20240202_phoneme_transducer_pipeline/config_03b_transducer_fullsum_wei_data.py @@ -11,13 +11,19 @@ from i6_experiments.users.berger.args.returnn.learning_rates import ( LearningRateSchedules, ) -from i6_experiments.users.berger.corpus.switchboard.viterbi_transducer_data import get_switchboard_data +from i6_experiments.users.berger.corpus.switchboard.viterbi_transducer_data import ( + get_switchboard_data, +) import i6_experiments.users.berger.network.models.context_1_transducer as transducer_model from 
i6_experiments.users.berger.recipe.summary.report import SummaryReport from i6_experiments.users.berger.systems.returnn_seq2seq_system import ( ReturnnSeq2SeqSystem, ) -from i6_experiments.users.berger.systems.dataclasses import ReturnnConfigs, FeatureType, SummaryKey +from i6_experiments.users.berger.systems.dataclasses import ( + ReturnnConfigs, + FeatureType, + SummaryKey, +) from i6_experiments.users.berger.util import default_tools from i6_private.users.vieting.helpers.returnn import serialize_dim_tags from i6_experiments.users.berger.systems.dataclasses import AlignmentData @@ -47,7 +53,7 @@ def generate_returnn_config( **kwargs, ) -> ReturnnConfig: if train: - (network_dict, extra_python,) = transducer_model.make_context_1_conformer_transducer_fullsum( + network_dict, extra_python = transducer_model.make_context_1_conformer_transducer_fullsum( num_outputs=num_classes, specaug_args={ "max_time_num": 1, @@ -79,9 +85,10 @@ def generate_returnn_config( "activation": "tanh", }, }, + fullsum_v2=True, ) else: - (network_dict, extra_python,) = transducer_model.make_context_1_conformer_transducer_recog( + network_dict, extra_python = transducer_model.make_context_1_conformer_transducer_recog( num_outputs=num_classes, conformer_args={ "num_blocks": 12, @@ -164,7 +171,7 @@ def run_exp(alignments: Dict[str, AlignmentData], viterbi_model_checkpoint: Chec recog_args = exp_args.get_transducer_recog_step_args( num_classes, - lm_scales=[0.5, 0.6, 0.7], + lm_scales=[0.45, 0.5, 0.6, 0.7], epochs=[300], search_parameters={"label-pruning": 14.4}, feature_type=FeatureType.GAMMATONE_8K, @@ -220,13 +227,17 @@ def run_exp(alignments: Dict[str, AlignmentData], viterbi_model_checkpoint: Chec for lr, batch_size, accum_grad in [(8e-05, 3000, 3)]: train_config = generate_returnn_config( - train=True, lr=lr, batch_size=batch_size, accum_grad=accum_grad, **config_generator_kwargs + train=True, + lr=lr, + batch_size=batch_size, + accum_grad=accum_grad, + **config_generator_kwargs, ) 
recog_configs = { f"recog_ilm-{ilm_scale}": generate_returnn_config( train=False, ilm_scale=ilm_scale, **config_generator_kwargs ) - for ilm_scale in [0.0, 0.25] + for ilm_scale in [0.0, 0.15, 0.25] } returnn_configs = ReturnnConfigs( @@ -234,7 +245,8 @@ def run_exp(alignments: Dict[str, AlignmentData], viterbi_model_checkpoint: Chec recog_configs=recog_configs, ) system.add_experiment_configs( - f"Conformer_Transducer_Fullsum_lr-{lr}_bs-{batch_size*accum_grad}", returnn_configs + f"Conformer_Transducer_Fullsum_lr-{lr}_bs-{batch_size*accum_grad}", + returnn_configs, ) system.run_train_step(**train_args) diff --git a/users/berger/corpus/switchboard/lm_data.py b/users/berger/corpus/switchboard/lm_data.py index cd566bd73..79a586991 100644 --- a/users/berger/corpus/switchboard/lm_data.py +++ b/users/berger/corpus/switchboard/lm_data.py @@ -5,9 +5,15 @@ def get_lm(name: str) -> rasr_lm_config.LMData: if name == "zoltan_4gram": - return rasr_lm_config.ArpaLMData(10, tk.Path("/work/asr4/berger/dependencies/switchboard/lm/zoltan_4gram.gz")) + return rasr_lm_config.ArpaLMData( + scale=10, + filename=tk.Path("/work/asr4/berger/dependencies/switchboard/lm/zoltan_4gram.gz"), + lookahead_lm=None, + ) elif name == "fisher_4gram": return rasr_lm_config.ArpaLMData( - 10, tk.Path("/home/tuske/work/ASR/switchboard/corpus/lm/data/mylm/swb.fsh.4gr.voc30k.LM.gz") + scale=10, + filename=tk.Path("/home/tuske/work/ASR/switchboard/corpus/lm/data/mylm/swb.fsh.4gr.voc30k.LM.gz"), + lookahead_lm=None, ) raise ValueError From 7a97565677255e933c9116ae6eed60b108999051 Mon Sep 17 00:00:00 2001 From: marvin84 Date: Thu, 13 Jun 2024 16:21:05 +0200 Subject: [PATCH 166/227] update users/raissi monofactored --- .../swb/legacy/data_preparation/lm_data.py | 10 +- .../data_preparation/pipeline_base_args_v2.py | 144 +++++ .../raissi/setups/common/analysis/__init__.py | 6 +- .../common/analysis/frame_statistics.py | 289 +++++++++ users/raissi/setups/common/analysis/labels.py | 610 ++++++++++++++++++ 
.../analysis/phoneme_length_statistics.py | 124 ++++ .../decoder/BASE_factored_hybrid_search.py | 2 + .../common/helpers/train/network_params.py | 20 +- .../legacy/SWB_TF_factored_hybrid_system.py | 3 + 9 files changed, 1199 insertions(+), 9 deletions(-) create mode 100644 users/raissi/experiments/tedlium/data_preparation/pipeline_base_args_v2.py create mode 100755 users/raissi/setups/common/analysis/frame_statistics.py create mode 100644 users/raissi/setups/common/analysis/labels.py create mode 100755 users/raissi/setups/common/analysis/phoneme_length_statistics.py diff --git a/users/raissi/experiments/swb/legacy/data_preparation/lm_data.py b/users/raissi/experiments/swb/legacy/data_preparation/lm_data.py index 81631b7e1..bb02038b0 100644 --- a/users/raissi/experiments/swb/legacy/data_preparation/lm_data.py +++ b/users/raissi/experiments/swb/legacy/data_preparation/lm_data.py @@ -6,16 +6,18 @@ def get_lm(name: str) -> rasr_lm_config.LMData: if name == "zoltan_4gram": return rasr_lm_config.ArpaLMData( - 10, - tk.Path( + scale=10, + lookahead_lm=None, + filename=tk.Path( "/work/asr4/berger/dependencies/switchboard/lm/zoltan_4gram.gz", hash_overwrite="/work/asr4/berger/dependencies", ), ) elif name == "fisher_4gram": return rasr_lm_config.ArpaLMData( - 10, - tk.Path( + scale=10, + lookahead_lm=None, + filename=tk.Path( "/work/asr4/vieting/setups/swb/dependencies/swb.fsh.4gr.voc30k.LM.gz", hash_overwrite="/home/tuske/work/ASR/switchboard/corpus/lm/data/mylm/swb.fsh.4gr.voc30k.LM.gz", ), diff --git a/users/raissi/experiments/tedlium/data_preparation/pipeline_base_args_v2.py b/users/raissi/experiments/tedlium/data_preparation/pipeline_base_args_v2.py new file mode 100644 index 000000000..4adace3cc --- /dev/null +++ b/users/raissi/experiments/tedlium/data_preparation/pipeline_base_args_v2.py @@ -0,0 +1,144 @@ +__all__ = [ + "get_init_args", + "get_corpus_data_inputs", + "get_final_output", +] + +from typing import Dict +from collections import defaultdict + 
+#----------recipes-------------------- +from i6_core.features.filterbank import filter_width_from_channels + +from i6_experiments.common.baselines.librispeech.default_tools import SCTK_BINARY_PATH +from i6_experiments.common.baselines.librispeech.data import CorpusData +import i6_experiments.common.datasets.tedlium2_v2 as ted_dataset +import i6_experiments.common.setups.rasr as rasr_util +from i6_experiments.common.setups.rasr.config.lex_config import ( + LexiconRasrConfig, +) +from i6_experiments.common.setups.rasr.config.lm_config import ArpaLmRasrConfig + +#TED specific +from i6_experiments.common.datasets.tedlium2.constants import CONCURRENT +from i6_experiments.common.datasets.tedlium2_v2.corpus import get_corpus_object_dict +from i6_experiments.common.datasets.tedlium2_v2.lexicon import ( + get_g2p_augmented_bliss_lexicon, +) +from i6_experiments.common.baselines.tedlium2.lm.ngram_config import run_tedlium2_ngram_lm + +from i6_experiments.users.raissi.setups.common.data.pipeline_helpers import ( + InputKey +) + +def get_init_args(): + am_args = { + "state_tying": "monophone", + "states_per_phone": 3, + "state_repetitions": 1, + "across_word_model": True, + "early_recombination": False, + "tdp_scale": 1.0, + "tdp_transition": (3.0, 0.0, "infinity", 0.0), + "tdp_silence": (0.0, 3.0, "infinity", 20.0), + "tying_type": "global", + "nonword_phones": "", + "tdp_nonword": ( + 0.0, + 3.0, + "infinity", + 6.0, + ), # only used when tying_type = global-and-nonword + } + + costa_args = {"eval_recordings": True, "eval_lm": False} + + feature_extraction_args = { + "fb": { + "filterbank_options": { + "warping_function": "mel", + "filter_width": filter_width_from_channels(channels=80, warping_function="mel", f_max=8000), + "normalize": True, + "normalization_options": None, + "without_samples": False, + "samples_options": { + "audio_format": "wav", + "dc_detection": False, + }, + "fft_options": None, + "add_features_output": True, + "apply_log": True, + "add_epsilon": 
True, + } + } + } + + scorer_args = {"sctk_binary_path": SCTK_BINARY_PATH} + + return rasr_util.RasrInitArgs( + costa_args=costa_args, + am_args=am_args, + feature_extraction_args=feature_extraction_args, + scorer_args=scorer_args, + ) + + + +def get_corpus_data_inputs(add_unknown_phoneme_and_mapping: bool = True) -> Dict[str, Dict[str, rasr_util.RasrDataInput]]: + + corpus_object_dict = get_corpus_object_dict(audio_format="wav", output_prefix="corpora") + + train_lexicon = LexiconRasrConfig( + get_g2p_augmented_bliss_lexicon( + add_unknown_phoneme_and_mapping=add_unknown_phoneme_and_mapping, output_prefix="lexicon" + ), + False, + ) + + lms_system = run_tedlium2_ngram_lm(add_unknown_phoneme_and_mapping=False) + lm = lms_system.interpolated_lms["dev-pruned"]["4gram"] + comb_lm = ArpaLmRasrConfig(lm_path=lm.ngram_lm) + + train_data_inputs = {} + dev_data_inputs = {} + test_data_inputs = {} + + train_data_inputs["train"] = rasr_util.RasrDataInput( + corpus_object=corpus_object_dict["train"], + lexicon=train_lexicon.get_dict(), + concurrent=CONCURRENT["train"], + lm=None, + ) + dev_data_inputs["dev"] = rasr_util.RasrDataInput( + corpus_object=corpus_object_dict["dev"], + lexicon=train_lexicon.get_dict(), + concurrent=CONCURRENT["dev"], + lm=comb_lm.get_dict(), + ) + test_data_inputs["test"] = rasr_util.RasrDataInput( + corpus_object=corpus_object_dict["test"], + lexicon=train_lexicon.get_dict(), + concurrent=CONCURRENT["test"], + lm=comb_lm.get_dict(), + ) + + return CorpusData( + train_data=train_data_inputs, + dev_data=dev_data_inputs, + test_data=test_data_inputs, + ) + + + + +# -------------------- helpers -------------------- +def get_final_output(name=InputKey.BASE): + output_args = rasr_util.OutputArgs(name) + + output_args.define_corpus_type("train", "train") + output_args.define_corpus_type("dev", "dev") + output_args.define_corpus_type("test", "test") + + output_args.add_feature_to_extract("fb") + + return output_args diff --git 
a/users/raissi/setups/common/analysis/__init__.py b/users/raissi/setups/common/analysis/__init__.py index 401678888..4809f11e9 100644 --- a/users/raissi/setups/common/analysis/__init__.py +++ b/users/raissi/setups/common/analysis/__init__.py @@ -1,4 +1,2 @@ -from .plot import PlotPhonemeDurationsJob, PlotViterbiAlignmentsJob -from .sample_alignment import ComputeAlignmentSamplingStatisticsJob -from .statistics import ComputeSilencePercentageJob -from .tse import ComputeTimestampErrorJob, ComputeWordLevelTimestampErrorJob +from .plot import PlotViterbiAlignmentsJob +from .tse import ComputeWordLevelTimestampErrorJob diff --git a/users/raissi/setups/common/analysis/frame_statistics.py b/users/raissi/setups/common/analysis/frame_statistics.py new file mode 100755 index 000000000..96b8e0239 --- /dev/null +++ b/users/raissi/setups/common/analysis/frame_statistics.py @@ -0,0 +1,289 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +import argparse +import collections +import itertools as it +import matplotlib.pyplot as plt +import os +import pandas as pd +import pickle +import sys + +import sprint_cache as sc + + +def load_alignment(allophones_path, alignments_path): + alignments = [] + f = sc.FileArchive(alignments_path) + f.setAllophones(allophones_path) + for i, k in enumerate(f.ft): + finfo = f.ft[k] + if 'attrib' not in finfo.name: + alignment = [(f.allophones[mix], state) for time, mix, state, _ in f.read(finfo.name, 'align')] + alignments.append(alignment) + return alignments + + +def count_hmm_states(alignment): + label_lengths = list() + for cur_label, val in it.groupby(alignment): + label_lengths.append(tuple([cur_label, len(list(val))])) + return label_lengths + + +def count_merge_hmm_states(alignment): + label_lengths = list() + for cur_label, val in it.groupby(alignment, key=lambda t : t[0]): + label_lengths.append(tuple([cur_label, len(list(val))])) + return label_lengths + + +def count_cart_lengths(count, silence_symbol): + cart_lengths = [] + 
hmm0_lengths = [] + hmm1_lengths = [] + hmm2_lengths = [] + for label, length in count: + if label[0].strip().find(silence_symbol) == 0: + continue + cart_lengths.append(length) + if label[1] == 0: + hmm0_lengths.append(length) + if label[1] == 1: + hmm1_lengths.append(length) + if label[1] == 2: + hmm2_lengths.append(length) + return cart_lengths, hmm0_lengths, hmm1_lengths, hmm2_lengths + + +def count_phon_lengths(count, silence_symbol): + phon_lengths = [] + for label, length in count: + if label[0].strip().find(silence_symbol) == 0: + continue + phon_lengths.append(length) + return phon_lengths + + +def count_silence(count, silence_symbol): + if count[0][0][0].strip().find(silence_symbol) == 0: + sil_begin_length = count.pop(0)[1] + else: + sil_begin_length = 0 + if count[-1][0][0].strip().find(silence_symbol) == 0: + sil_end_length = count.pop(-1)[1] + else: + sil_end_length = 0 + sil_middle_length = [] + for label, length in count: + if label[0].strip().find(silence_symbol) == 0: + sil_middle_length.append(length) + return [sil_begin_length], sil_middle_length, [sil_end_length] + + +def main(allophones_path, alignments_path, output_dir, silence_symbol, non_speech_symbol): + assert isinstance(silence_symbol, str), (silence_symbol, "is not str type") + pickle_path = f"{output_dir}/data.pickle" + + alignments = collections.defaultdict(list) # dict(int:list) + alignments_merge = collections.defaultdict(list) + silence_begin = list() + silence_end = list() + silence_intra = list() + phoneme_total_lengths = list() + cart_total_lengths = list() + hmm0_total_lengths = list() + hmm1_total_lengths = list() + hmm2_total_lengths = list() + sil_hist_begin = collections.defaultdict(int) + sil_hist_end = collections.defaultdict(int) + sil_hist_intra = collections.defaultdict(int) + cart_hist = collections.defaultdict(int) + hmm0_hist = collections.defaultdict(int) + hmm1_hist = collections.defaultdict(int) + hmm2_hist = collections.defaultdict(int) + phon_hist = 
collections.defaultdict(int) + if not os.path.isfile(pickle_path): + print("collecting statistics") + num_alignments = len(alignments_path) + idx = 0 + for i, ap in enumerate(alignments_path): + print(f"alignment: {i+1}/{num_alignments}") + assert os.path.isfile(ap), ap + alignment = load_alignment(allophones_path, ap) + for align in alignment: + alignments[idx] = count = count_hmm_states(align) + alignments_merge[idx] = count_merge = count_merge_hmm_states(align) + + sil_begin, sil_intra, sil_end = count_silence(count, silence_symbol) + + silence_begin.extend(sil_begin) + silence_intra.extend(sil_intra) + silence_end.extend(sil_end) + sil_hist_begin[sil_begin[0]] += 1 + sil_hist_end[sil_end[0]] += 1 + for i in sil_intra: + sil_hist_intra[i] += 1 + + cart_lengths, hmm0_lengths, hmm1_lengths, hmm2_lengths = count_cart_lengths(count, non_speech_symbol) + for cart in cart_lengths: + cart_hist[cart] += 1 + for h0 in hmm0_lengths: + hmm0_hist[h0] += 1 + for h1 in hmm1_lengths: + hmm1_hist[h1] += 1 + for h2 in hmm2_lengths: + hmm2_hist[h2] += 1 + + phon_lengths = count_phon_lengths(count_merge, non_speech_symbol) + for phon in phon_lengths: + phon_hist[phon] += 1 + + phoneme_total_lengths.extend(phon_lengths) + cart_total_lengths.extend(cart_lengths) + hmm0_total_lengths.extend(hmm0_lengths) + hmm1_total_lengths.extend(hmm1_lengths) + hmm2_total_lengths.extend(hmm2_lengths) + + idx += 1 + + results = [alignments, alignments_merge, silence_begin, silence_intra, silence_end, sil_hist_begin, sil_hist_intra, sil_hist_end, cart_total_lengths, hmm0_total_lengths, hmm1_total_lengths, hmm2_total_lengths, phoneme_total_lengths, cart_hist, hmm0_hist, hmm1_hist, hmm2_hist, phon_hist] + + with open(pickle_path, "wb") as out_pickle: + data_dump = tuple(results) + pickle.dump(data_dump, out_pickle, protocol=4) # protocol version4 for python 3.4+ support + else: + print("found pickled statistics") + with open(pickle_path, "rb") as in_pickle: + results = pickle.load(in_pickle) + + # 
alignments : dict[int] = list[cart labels, cart lengths] + # alignments_merge : dict[int] = list[phon labels, phon lengths] + # silence_begin : list[sil lengths] + # silence_intra : list[sil lengths] + # silence_end : list[sil lengths] + # sil_hist_begin : dict[sil lengths] = occurences + # sil_hist_intra : dict[sil lengths] = occurences + # sil_hist_end : dict[sil lengths] = occurences + # cart_lengths : list[cart labels lengths] + # hmm0_lengths : list[hmm state 0 lengths] + # hmm1_lengths : list[hmm state 1 lengths] + # hmm2_lengths : list[hmm state 2 lengths] + # phon_lengths : list[phon labels lengths] + # cart_hist : dict[cart labels] = occurences + # hmm0_hist : dict[hmm state 0] = occurences + # hmm1_hist : dict[hmm state 1] = occurences + # hmm2_hist : dict[hmm state 2] = occurences + # phon_hist : dict[phon labels] = occurences + return results + + +def hist_data_to_dataframe(x_label, y_label, data_dict): + d_t = collections.defaultdict(list) + for k, v in sorted(data_dict.items()): + d_t[x_label].append(k) + d_t[y_label].append(v) + + df = pd.DataFrame(data=d_t) + + return df + + +def plot(output_dir, plot_dir, inputs): + print("calculating averages") + alignments, alignments_merge, silence_begin, silence_intra, silence_end, sil_hist_begin, sil_hist_intra, sil_hist_end, cart_lengths, hmm0_lengths, hmm1_lengths, hmm2_lengths, phon_lengths, cart_hist, hmm0_hist, hmm1_hist, hmm2_hist, phon_hist = inputs + # *** stat calculation *** + num_seqs = len(alignments.keys()) + assert num_seqs == len(silence_begin) + assert num_seqs == len(silence_end) + total_num_sil = sum(silence_begin) + sum(silence_intra) + sum(silence_end) + avg_sil_begin = sum(silence_begin) / num_seqs + avg_sil_intra = sum(silence_intra) / len(silence_intra) if len(silence_intra) > 0 else 0 + avg_sil_end = sum(silence_end) / num_seqs + avg_cart = sum(cart_lengths) / len(cart_lengths) + avg_hmm0 = sum(hmm0_lengths) / len(hmm0_lengths) + avg_hmm1 = sum(hmm1_lengths) / len(hmm1_lengths) + 
avg_hmm2 = sum(hmm2_lengths) / len(hmm2_lengths) + avg_phon = sum(phon_lengths) / len(phon_lengths) + + total_num_frames = 0 + for _, v in alignments.items(): + for label, length in v: + total_num_frames += int(length) + + + with open(f"{output_dir}/statistics.txt", "wt") as out_stats: + out_stats.write(f"average silence length at sequence begin: {avg_sil_begin:.2f}\n") + out_stats.write(f"average silence length intra sequence : {avg_sil_intra:.2f}\n") + out_stats.write(f"average silence length at sequence end : {avg_sil_end:.2f}\n") + out_stats.write(f"average cart label length : {avg_cart:.2f}\n") + out_stats.write(f"average 0. hmm state length : {avg_hmm0:.2f}\n") + out_stats.write(f"average 1. hmm state length : {avg_hmm1:.2f}\n") + out_stats.write(f"average 2. hmm state length : {avg_hmm2:.2f}\n") + out_stats.write(f"average phoneme label length : {avg_phon:.2f}\n") + out_stats.write(f"average number of frames per sequence : {total_num_frames/num_seqs:.2f}\n") + out_stats.write(f"total number of silence frames : {total_num_sil:.0f}\n") + out_stats.write(f"total number of frames : {total_num_frames:.0f}\n") + out_stats.write(f"number of sequences : {num_seqs:.0f}\n") + + + print("creating plots") + # *** data to pandas dataframe *** + sil_begin_dataframe = hist_data_to_dataframe("begin silence lengths", "occurences", sil_hist_begin) + sil_intra_dataframe = hist_data_to_dataframe("intra silence lengths", "occurences", sil_hist_intra) + sil_end_dataframe = hist_data_to_dataframe("end silence lengths", "occurences", sil_hist_end) + cart_dataframe = hist_data_to_dataframe("cart lengths", "occurences", cart_hist) + hmm0_dataframe = hist_data_to_dataframe("hmm 0 lengths", "occurences", hmm0_hist) + hmm1_dataframe = hist_data_to_dataframe("hmm 1 lengths", "occurences", hmm1_hist) + hmm2_dataframe = hist_data_to_dataframe("hmm 2 lengths", "occurences", hmm2_hist) + phon_dataframe = hist_data_to_dataframe("phon lengths", "occurences", phon_hist) + + # *** plot histogram 
*** + sil_begin_dataframe.plot(x="begin silence lengths", y="occurences", logy=True).get_figure().savefig(f"{plot_dir}/silence_begin_histogram.png") + sil_intra_dataframe.plot(x="intra silence lengths", y="occurences", logy=True).get_figure().savefig(f"{plot_dir}/silence_intra_histogram.png") + sil_end_dataframe.plot(x="end silence lengths", y="occurences", logy=True).get_figure().savefig(f"{plot_dir}/silence_end_histogram.png") + cart_dataframe.plot(x="cart lengths", y="occurences", logy=True).get_figure().savefig(f"{plot_dir}/cart_label_histogram.png") + hmm0_dataframe.plot(x="hmm 0 lengths", y="occurences", logy=True).get_figure().savefig(f"{plot_dir}/hmm0_histogram.png") + hmm1_dataframe.plot(x="hmm 1 lengths", y="occurences", logy=True).get_figure().savefig(f"{plot_dir}/hmm1_histogram.png") + hmm2_dataframe.plot(x="hmm 2 lengths", y="occurences", logy=True).get_figure().savefig(f"{plot_dir}/hmm2_histogram.png") + phon_dataframe.plot(x="phon lengths", y="occurences", logy=True).get_figure().savefig(f"{plot_dir}/phon_label_histogram.png") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="alignment statistics") + parser.add_argument("allophones", nargs=1, type=str, help="allophone file path") + parser.add_argument("alignments", nargs="*", type=str, help="alignment file paths") + parser.add_argument("--root_dir", nargs=1, type=str, help="output directory path") + parser.add_argument("--sub_dir", nargs=1, type=str, help="plot directory path") + parser.add_argument("--silence_symbol", nargs=1, type=str, help="which silence symbol to use", default=["[SILENCE]{#+#}@i@f"]) + parser.add_argument("--non_speech_symbol", nargs=1, type=str, help="which non speech symbol to use", default=["["]) + args = parser.parse_args() + + assert len(args.allophones) == 1 + allophones_path = args.allophones[0] + assert os.path.isfile(allophones_path) + + assert len(args.root_dir) == 1 + root_dir = args.root_dir[0] + assert len(args.sub_dir) == 1 + sub_dir = 
args.sub_dir[0] + output_dir = os.path.join(root_dir, "statistics", sub_dir) + if not os.path.exists(output_dir): + os.makedirs(output_dir) + plot_dir = os.path.join(root_dir, "plots", sub_dir) + if not os.path.exists(plot_dir): + os.makedirs(plot_dir) + + alignments_path = args.alignments + + assert len(args.silence_symbol) == 1 + silence_symbol = args.silence_symbol[0] + + assert len(args.non_speech_symbol) == 1 + non_speech_symbol = args.non_speech_symbol[0] + + intermediate = main(allophones_path, alignments_path, output_dir, silence_symbol, non_speech_symbol) + + plot(output_dir, plot_dir, intermediate) + diff --git a/users/raissi/setups/common/analysis/labels.py b/users/raissi/setups/common/analysis/labels.py new file mode 100644 index 000000000..99204a2f2 --- /dev/null +++ b/users/raissi/setups/common/analysis/labels.py @@ -0,0 +1,610 @@ +__all__ = ["ComputeSilenceRatioJob", "ComputeAveragePhonemeLengthJob", "ComputeLabelStatisticsJob"] + +from collections import Counter, defaultdict +import itertools +import logging +from typing import List, Union + +from sisyphus import tk, Job, Task + +import i6_core.lib.rasr_cache as rasr_cache + + +class ComputeSilenceRatioJob(Job): + def __init__( + self, + allophone_file: tk.Path, + alignment_files: Union[tk.Path, List[tk.Path]], + silence_label: str, + ): + self.allophone_file = allophone_file + self.alignment_files = alignment_files + self.silence_label = silence_label + + self.out_silence_frames = self.output_var("silence_frames.txt") + self.out_total_frames = self.output_var("total_frames.txt") + self.out_silence_ratio = self.output_var("silence_ratio.txt") + + self.rqmt = {"cpu": 1, "mem": 24, "time": 24, "sbatch_args": ["-p", "cpu_slow"]} + + def tasks(self): + yield Task("run", rqmt=self.rqmt) + + @staticmethod + def load_alignment(allophones_file: tk.Path, alignment_file: Union[tk.Path, rasr_cache.FileArchive]): + if isinstance(alignment_file, tk.Path): + f = rasr_cache.FileArchive(alignment_file.get_path()) + 
f.setAllophones(allophones_file.get_path()) + elif isinstance(alignment_file, rasr_cache.FileArchive): + f = alignment_file + else: + raise NotImplementedError + + alignments = [] + for i, k in enumerate(f.ft): + finfo = f.ft[k] + if "attrib" not in finfo.name: + alignment = [(f.allophones[mix], state) for time, mix, state, _ in f.read(finfo.name, "align")] + alignments.append(alignment) + return alignments + + @staticmethod + def count(alignment, silence_symbol): + silence_frames = list() + total_frames = list() + for cur_label, val in itertools.groupby(alignment): + v_len = len(list(val)) + if cur_label[0] == silence_symbol: + print("silence:", cur_label, v_len) + silence_frames.append(v_len) + + print("frame:", cur_label, v_len) + total_frames.append(v_len) + + return sum(silence_frames), sum(total_frames) + + def run(self): + logging.info("Collection statistics...") + if isinstance(self.alignment_files, List): + num_alignments = len(self.alignment_files) + alignment_files = self.alignment_files + elif isinstance(self.alignment_files, tk.Path): + if self.alignment_files.get_path().endswith(".bundle"): + bundle_file = rasr_cache.FileArchiveBundle(self.alignment_files.get_path()) + bundle_file.setAllophones(self.allophone_file.get_path()) + alignment_files = list(bundle_file.archives.values()) + num_alignments = len(alignment_files) + else: + alignment_files = [self.alignment_files] + num_alignments = 1 + else: + raise NotImplementedError + + silence_frames = [] + total_frames = [] + + for i, ap in enumerate(alignment_files, start=1): + logging.info(f"Alignment: {i}/{num_alignments}") + alignment_content = self.load_alignment(self.allophone_file, ap) + for align in alignment_content: + seg_silence_frames, seg_total_frames = self.count(align, self.silence_label) + silence_frames.append(seg_silence_frames) + total_frames.append(seg_total_frames) + + sum_silence_frames = sum(silence_frames) + sum_total_frames = sum(total_frames) + + 
self.out_silence_frames.set(sum_silence_frames) + self.out_total_frames.set(sum_total_frames) + self.out_silence_ratio.set(sum_silence_frames / sum_total_frames) + + +class ComputeAveragePhonemeLengthJob(Job): + def __init__( + self, + allophone_file: tk.Path, + alignment_files: Union[tk.Path, List[tk.Path]], + silence_label: str, + non_speech_labels: Union[str, List[str]], + ): + self.allophone_file = allophone_file + self.alignment_files = alignment_files + self.silence_label = silence_label + self.non_speech_labels = non_speech_labels + + self.out_average_phoneme_length = self.output_var("average_phoneme_length.txt") + self.out_average_phoneme_length_state_0 = self.output_var("average_phoneme_length_state_0.txt") + self.out_average_phoneme_length_state_1 = self.output_var("average_phoneme_length_state_1.txt") + self.out_average_phoneme_length_state_2 = self.output_var("average_phoneme_length_state_2.txt") + self.out_average_phoneme_length_with_non_speech = self.output_var("average_phoneme_length_with_non_speech.txt") + self.out_total_frames = self.output_var("total_frames.txt") + self.out_num_seqs = self.output_var("num_seqs.txt") + + self.rqmt = {"cpu": 1, "mem": 24, "time": 24, "sbatch_args": ["-p", "cpu_slow"]} + + def tasks(self): + yield Task("run", rqmt=self.rqmt) + + @staticmethod + def load_alignment(allophones_file: tk.Path, alignment_file: Union[tk.Path, rasr_cache.FileArchive]): + if isinstance(alignment_file, tk.Path): + f = rasr_cache.FileArchive(alignment_file.get_path()) + f.setAllophones(allophones_file.get_path()) + elif isinstance(alignment_file, rasr_cache.FileArchive): + f = alignment_file + else: + raise NotImplementedError + + alignments = [] + for i, k in enumerate(f.ft): + finfo = f.ft[k] + if "attrib" not in finfo.name: + alignment = [(f.allophones[mix], state) for time, mix, state, _ in f.read(finfo.name, "align")] + alignments.append(alignment) + return alignments + + @staticmethod + def count(alignment, silence_symbol, 
non_speech_symbols): + hmm_lengths = list() + hmm_unk_lengths = list() + hmm_0_lengths = list() + hmm_1_lengths = list() + hmm_2_lengths = list() + hmm_num = 0 + hmm_unk_num = 0 + hmm_0_num = 0 + hmm_1_num = 0 + hmm_2_num = 0 + total_frames = list() + + for cur_label, val in itertools.groupby(alignment): + v_len = len(list(val)) + total_frames.append(v_len) + + if cur_label[0].strip() == silence_symbol: + continue + + hmm_unk_lengths.append(v_len) + hmm_unk_num += 1 + + if cur_label[0].strip() in non_speech_symbols: + continue + + hmm_lengths.append(v_len) + hmm_num += 1 + + if cur_label[1] == 0: + hmm_0_lengths.append(v_len) + hmm_0_num += 1 + if cur_label[1] == 1: + hmm_1_lengths.append(v_len) + hmm_1_num += 1 + if cur_label[1] == 2: + hmm_2_lengths.append(v_len) + hmm_2_num += 1 + + return ( + hmm_lengths, + hmm_num, + hmm_0_lengths, + hmm_0_num, + hmm_1_lengths, + hmm_1_num, + hmm_2_lengths, + hmm_2_num, + total_frames, + hmm_unk_lengths, + hmm_unk_num, + ) + + def run(self): + logging.info("Collection statistics...") + if isinstance(self.alignment_files, List): + num_alignments = len(self.alignment_files) + alignment_files = self.alignment_files + elif isinstance(self.alignment_files, tk.Path): + if self.alignment_files.get_path().endswith(".bundle"): + bundle_file = rasr_cache.FileArchiveBundle(self.alignment_files.get_path()) + bundle_file.setAllophones(self.allophone_file.get_path()) + alignment_files = list(bundle_file.archives.values()) + num_alignments = len(alignment_files) + else: + alignment_files = [self.alignment_files] + num_alignments = 1 + else: + raise NotImplementedError + + num_seqs = 0 + hmm_lengths = 0 + hmm_unk_lengths = 0 + hmm_0_lengths = 0 + hmm_1_lengths = 0 + hmm_2_lengths = 0 + hmm_num = 0 + hmm_unk_num = 0 + hmm_0_num = 0 + hmm_1_num = 0 + hmm_2_num = 0 + total_frames = 0 + + for i, ap in enumerate(alignment_files, start=1): + logging.info(f"Alignment: {i}/{num_alignments}") + alignment_content = 
self.load_alignment(self.allophone_file, ap) + for align in alignment_content: + num_seqs += 1 + ( + seg_hmm_lengths, + seg_hmm_num, + seg_hmm_0_lengths, + seg_hmm_0_num, + seg_hmm_1_lengths, + seq_hmm_1_num, + seg_hmm_2_lengths, + seg_hmm_2_num, + seg_total_frames, + seg_hmm_unk_lengths, + seg_hmm_unk_num, + ) = self.count( + align, + self.silence_label, + self.non_speech_labels, + ) + hmm_lengths += sum(seg_hmm_lengths) + hmm_0_lengths += sum(seg_hmm_0_lengths) + hmm_1_lengths += sum(seg_hmm_1_lengths) + hmm_2_lengths += sum(seg_hmm_2_lengths) + hmm_unk_lengths += sum(seg_hmm_unk_lengths) + hmm_num += seg_hmm_num + hmm_0_num += seg_hmm_0_num + hmm_1_num += seq_hmm_1_num + hmm_2_num += seg_hmm_2_num + hmm_unk_num += seg_hmm_unk_num + total_frames += sum(seg_total_frames) + + self.out_average_phoneme_length.set(hmm_lengths / hmm_num) + self.out_average_phoneme_length_with_non_speech.set(hmm_unk_lengths / hmm_unk_num) + self.out_average_phoneme_length_state_0.set(hmm_0_lengths / hmm_0_num) + self.out_average_phoneme_length_state_1.set(hmm_1_lengths / hmm_1_num if hmm_1_num > 0 else 0) + self.out_average_phoneme_length_state_2.set(hmm_2_lengths / hmm_2_num if hmm_2_num > 0 else 0) + self.out_total_frames.set(total_frames) + self.out_num_seqs.set(num_seqs) + + +class ComputeLabelStatisticsJob(Job): + def __init__( + self, + allophone_file: tk.Path, + alignment_files: Union[tk.Path, List[tk.Path]], + silence_label: str, + non_speech_labels: Union[str, List[str]], + ): + self.allophone_file = allophone_file + self.alignment_files = alignment_files + self.silence_label = silence_label + self.non_speech_labels = non_speech_labels + + self.out_counts = self.output_var("counts.pickle", pickle=True) + self.out_label_lengths = self.output_var("label_lengths.txt") + self.out_statistics = self.output_path("statistics.txt") + self.out_silence_begin_histogram = self.output_path("silence_begin_histogram.png") + self.out_silence_intra_histogram = 
self.output_path("silence_intra_histogram.png") + self.out_silence_end_histogram = self.output_path("silence_end_histogram.png") + self.out_hmm_histogram = self.output_path("hmm_label_histogram.png") + self.out_hmm_0_histogram = self.output_path("hmm_0_histogram.png") + self.out_hmm_1_histogram = self.output_path("hmm_1_histogram.png") + self.out_hmm_2_histogram = self.output_path("hmm_2_histogram.png") + self.out_phoneme_histogram = self.output_path("label_histogram.png") + + self.rqmt = {"cpu": 1, "mem": 24, "time": 24, "sbatch_args": ["-p", "cpu_slow"]} + + def tasks(self): + yield Task("count", rqmt=self.rqmt) + yield Task("plot", rqmt=self.rqmt) + + @staticmethod + def load_alignment(allophones_file: tk.Path, alignment_file: Union[tk.Path, rasr_cache.FileArchive]): + if isinstance(alignment_file, tk.Path): + f = rasr_cache.FileArchive(alignment_file.get_path()) + f.setAllophones(allophones_file.get_path()) + elif isinstance(alignment_file, rasr_cache.FileArchive): + f = alignment_file + else: + raise NotImplementedError + + alignments = [] + for i, k in enumerate(f.ft): + finfo = f.ft[k] + if "attrib" not in finfo.name: + alignment = [(f.allophones[mix], state) for time, mix, state, _ in f.read(finfo.name, "align")] + alignments.append(alignment) + return alignments + + @staticmethod + def count_hmm_states(alignment): + label_lengths = list() + for cur_label, val in itertools.groupby(alignment): + label_lengths.append(tuple([cur_label, len(list(val))])) + return label_lengths + + @staticmethod + def count_merge_hmm_states(alignment): + label_lengths = list() + for cur_label, val in itertools.groupby(alignment, key=lambda t: t[0]): + label_lengths.append(tuple([cur_label, len(list(val))])) + return label_lengths + + @staticmethod + def count_hmm_lengths(count, silence_symbol): + hmm_lengths = [] + hmm0_lengths = [] + hmm1_lengths = [] + hmm2_lengths = [] + for label, length in count: + if isinstance(silence_symbol, str): + if 
label[0].strip().find(silence_symbol) == 0: + continue + elif isinstance(silence_symbol, List): + for s in silence_symbol: + if label[0].strip().find(s) == 0: + continue + else: + raise NotImplementedError + + hmm_lengths.append(length) + if label[1] == 0: + hmm0_lengths.append(length) + if label[1] == 1: + hmm1_lengths.append(length) + if label[1] == 2: + hmm2_lengths.append(length) + return hmm_lengths, hmm0_lengths, hmm1_lengths, hmm2_lengths + + @staticmethod + def count_phon_lengths(count, silence_symbol): + phon_lengths = [] + for label, length in count: + if isinstance(silence_symbol, str): + if label.strip().find(silence_symbol) == 0: + continue + elif isinstance(silence_symbol, List): + for s in silence_symbol: + if label.strip().find(s) == 0: + continue + else: + raise NotImplementedError + + phon_lengths.append(length) + return phon_lengths + + @staticmethod + def count_silence(count, silence_symbol): + if count[0][0][0].strip().find(silence_symbol) == 0: + sil_begin_length = count.pop(0)[1] + else: + sil_begin_length = 0 + if count[-1][0][0].strip().find(silence_symbol) == 0: + sil_end_length = count.pop(-1)[1] + else: + sil_end_length = 0 + sil_middle_length = [] + for label, length in count: + if label[0].strip().find(silence_symbol) == 0: + sil_middle_length.append(length) + return [sil_begin_length], sil_middle_length, [sil_end_length] + + @staticmethod + def hist_data_to_dataframe(x_label, y_label, data_dict): + import pandas + + d_t = defaultdict(list) + for k, v in sorted(data_dict.items()): + d_t[x_label].append(k) + d_t[y_label].append(v) + + df = pandas.DataFrame(data=d_t) + + return df + + def count(self): + logging.info("Collection statistics...") + if isinstance(self.alignment_files, List): + num_alignments = len(self.alignment_files) + alignment_files = self.alignment_files + elif isinstance(self.alignment_files, tk.Path): + if self.alignment_files.get_path().endswith(".bundle"): + bundle_file = 
rasr_cache.FileArchiveBundle(self.alignment_files.get_path()) + bundle_file.setAllophones(self.allophone_file.get_path()) + alignment_files = list(bundle_file.archives.values()) + num_alignments = len(alignment_files) + else: + alignment_files = [self.alignment_files] + num_alignments = 1 + else: + raise NotImplementedError + + labels_counter = defaultdict(list) + labels_merged_counter = defaultdict(list) + silence_begin_counter = list() + silence_intra_counter = list() + silence_end_counter = list() + silence_begin_histogram = Counter() + silence_end_histogram = Counter() + silence_intra_histogram = Counter() + hmm_histogram = Counter() + hmm_0_histogram = Counter() + hmm_1_histogram = Counter() + hmm_2_histogram = Counter() + phoneme_histogram = Counter() + phoneme_counter = list() + hmm_counter = list() + hmm_0_counter = list() + hmm_1_counter = list() + hmm_2_counter = list() + + idx = 0 + for i, ap in enumerate(alignment_files, start=1): + logging.info(f"Alignment: {i}/{num_alignments}") + alignment_content = self.load_alignment(self.allophone_file, ap) + for align in alignment_content: + labels_counter[idx] = count = self.count_hmm_states(align) + labels_merged_counter[idx] = count_merge = self.count_merge_hmm_states(align) + + sil_begin, sil_intra, sil_end = self.count_silence(count, self.silence_label) + + silence_begin_counter.extend(sil_begin) + silence_intra_counter.extend(sil_intra) + silence_end_counter.extend(sil_end) + silence_begin_histogram[sil_begin[0]] += 1 + silence_end_histogram[sil_end[0]] += 1 + for ii in sil_intra: + silence_intra_histogram[i] += 1 + + hmm_lengths, hmm0_lengths, hmm1_lengths, hmm2_lengths = self.count_hmm_lengths( + count, self.non_speech_labels + ) + for hmm in hmm_lengths: + hmm_histogram[hmm] += 1 + for h0 in hmm0_lengths: + hmm_0_histogram[h0] += 1 + for h1 in hmm1_lengths: + hmm_1_histogram[h1] += 1 + for h2 in hmm2_lengths: + hmm_2_histogram[h2] += 1 + + phon_lengths = self.count_phon_lengths(count_merge, 
self.non_speech_labels) + for phon in phon_lengths: + phoneme_histogram[phon] += 1 + + phoneme_counter.extend(phon_lengths) + hmm_counter.extend(hmm_lengths) + hmm_0_counter.extend(hmm0_lengths) + hmm_1_counter.extend(hmm1_lengths) + hmm_2_counter.extend(hmm2_lengths) + + idx += 1 + + del alignment_content, alignment_files + + results = [ + labels_counter, + labels_merged_counter, + silence_begin_counter, + silence_intra_counter, + silence_end_counter, + silence_begin_histogram, + silence_intra_histogram, + silence_end_histogram, + hmm_counter, + hmm_0_counter, + hmm_1_counter, + hmm_2_counter, + phoneme_counter, + hmm_histogram, + hmm_0_histogram, + hmm_1_histogram, + hmm_2_histogram, + phoneme_histogram, + ] + + logging.info("Pickling...") + self.out_counts.set(tuple(results)) + + def plot(self): + logging.info("Unpickling") + ( + labels_counter, + labels_merged_counter, + silence_begin_counter, + silence_intra_counter, + silence_end_counter, + silence_begin_histogram, + silence_intra_histogram, + silence_end_histogram, + hmm_counter, + hmm_0_counter, + hmm_1_counter, + hmm_2_counter, + phoneme_counter, + hmm_histogram, + hmm_0_histogram, + hmm_1_histogram, + hmm_2_histogram, + phoneme_histogram, + ) = self.out_counts.get() + + # *** stat calculation *** + logging.info("Calculating averages") + num_seqs = len(labels_counter.keys()) + assert num_seqs == len(silence_begin_counter), (num_seqs, len(silence_begin_counter)) + assert num_seqs == len(silence_end_counter), (num_seqs, len(silence_end_counter)) + total_num_sil = sum(silence_begin_counter) + sum(silence_intra_counter) + sum(silence_end_counter) + avg_sil_begin = sum(silence_begin_counter) / num_seqs + avg_sil_intra = sum(silence_intra_counter) / len(silence_intra_counter) if len(silence_intra_counter) > 0 else 0 + avg_sil_end = sum(silence_end_counter) / num_seqs + avg_hmm = sum(hmm_counter) / len(hmm_counter) + avg_hmm0 = sum(hmm_0_counter) / len(hmm_0_counter) + avg_hmm1 = sum(hmm_1_counter) / 
len(hmm_1_counter) if len(hmm_1_counter) > 0 else 0 + avg_hmm2 = sum(hmm_2_counter) / len(hmm_2_counter) if len(hmm_2_counter) > 0 else 0 + avg_phon = sum(phoneme_counter) / len(phoneme_counter) + + total_num_frames = 0 + for _, v in labels_counter.items(): + for label, length in v: + total_num_frames += int(length) + + with open(self.out_statistics.get_path(), "wt") as out_stats: + out_stats.write(f"average silence length at sequence begin: {avg_sil_begin:.2f}\n") + out_stats.write(f"average silence length intra sequence : {avg_sil_intra:.2f}\n") + out_stats.write(f"average silence length at sequence end : {avg_sil_end:.2f}\n") + out_stats.write(f"average cart label length : {avg_hmm:.2f}\n") + out_stats.write(f"average 0. hmm state length : {avg_hmm0:.2f}\n") + out_stats.write(f"average 1. hmm state length : {avg_hmm1:.2f}\n") + out_stats.write(f"average 2. hmm state length : {avg_hmm2:.2f}\n") + out_stats.write(f"average phoneme label length : {avg_phon:.2f}\n") + out_stats.write(f"average number of frames per sequence : {total_num_frames / num_seqs:.2f}\n") + out_stats.write(f"total number of silence frames : {total_num_sil:.0f}\n") + out_stats.write(f"total number of frames : {total_num_frames:.0f}\n") + out_stats.write(f"number of sequences : {num_seqs:.0f}\n") + + print("creating plots") + # *** data to pandas dataframe *** + sil_begin_dataframe = self.hist_data_to_dataframe( + "begin silence lengths", "occurences", silence_begin_histogram + ) + sil_intra_dataframe = self.hist_data_to_dataframe( + "intra silence lengths", "occurences", silence_intra_histogram + ) + sil_end_dataframe = self.hist_data_to_dataframe("end silence lengths", "occurences", silence_end_histogram) + cart_dataframe = self.hist_data_to_dataframe("hmm lengths", "occurences", hmm_histogram) + hmm0_dataframe = self.hist_data_to_dataframe("hmm 0 lengths", "occurences", hmm_0_histogram) + hmm1_dataframe = self.hist_data_to_dataframe("hmm 1 lengths", "occurences", hmm_1_histogram) + 
hmm2_dataframe = self.hist_data_to_dataframe("hmm 2 lengths", "occurences", hmm_2_histogram) + phon_dataframe = self.hist_data_to_dataframe("phon lengths", "occurences", phoneme_histogram) + + # *** plot histogram *** + sil_begin_dataframe.plot(x="begin silence lengths", y="occurences", logy=True).get_figure().savefig( + self.out_silence_begin_histogram.get_path() + ) + sil_intra_dataframe.plot(x="intra silence lengths", y="occurences", logy=True).get_figure().savefig( + self.out_silence_intra_histogram.get_path() + ) + sil_end_dataframe.plot(x="end silence lengths", y="occurences", logy=True).get_figure().savefig( + self.out_silence_end_histogram.get_path() + ) + cart_dataframe.plot(x="hmm lengths", y="occurences", logy=True).get_figure().savefig( + self.out_hmm_histogram.get_path() + ) + hmm0_dataframe.plot(x="hmm 0 lengths", y="occurences", logy=True).get_figure().savefig( + self.out_hmm_0_histogram.get_path() + ) + if len(hmm_1_counter) > 0: + hmm1_dataframe.plot(x="hmm 1 lengths", y="occurences", logy=True).get_figure().savefig( + self.out_hmm_1_histogram.get_path() + ) + if len(hmm_2_counter) > 0: + hmm2_dataframe.plot(x="hmm 2 lengths", y="occurences", logy=True).get_figure().savefig( + self.out_hmm_2_histogram.get_path() + ) + phon_dataframe.plot(x="phon lengths", y="occurences", logy=True).get_figure().savefig( + self.out_phoneme_histogram.get_path() + ) diff --git a/users/raissi/setups/common/analysis/phoneme_length_statistics.py b/users/raissi/setups/common/analysis/phoneme_length_statistics.py new file mode 100755 index 000000000..9426e326b --- /dev/null +++ b/users/raissi/setups/common/analysis/phoneme_length_statistics.py @@ -0,0 +1,124 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +import argparse +import collections +import itertools as it +import matplotlib.pyplot as plt +import os +import pandas as pd +import pickle +import sys + +import sprint_cache as sc + +from frame_statistics import load_alignment + + +def get_phoneme(label): + return 
label.strip().split("{")[0] + + +def main(allophones_path, alignments_path, output_dir): + pickle_path = f"{output_dir}/data2.pickle" + + single_phoneme_lengths = collections.defaultdict(list) + long_sequences = list() + + if not os.path.isfile(pickle_path): + print("collecting statistics") + num_alignments = len(alignments_path) + for i, ap in enumerate(alignments_path): + print(f"alignment: {i+1}/{num_alignments}") + assert os.path.isfile(ap), ap + alignment = load_alignment(allophones_path, ap) + for align in alignment: + for cur_label, val in it.groupby(align, key=lambda t : t[0]): + single_phon = get_phoneme(cur_label) + label_length = len(list(val)) + single_phoneme_lengths[single_phon].append(label_length) + if label_length >= 50: + long_sequences.append(tuple([single_phon, label_length, align])) + + results = [single_phoneme_lengths, long_sequences] + + with open(pickle_path, "wb") as out_pickle: + data_dump = tuple(results) + pickle.dump(data_dump, out_pickle, protocol=4) # protocol version4 for python 3.4+ support + else: + print("found pickled statistics") + with open(pickle_path, "rb") as in_pickle: + results = pickle.load(in_pickle) + + return results + + +def hist_data_to_dataframe(x_label, y_label, data_dict): + d_t = collections.defaultdict(list) + for k, v in sorted(data_dict.items()): + d_t[x_label].append(k) + d_t[y_label].append(v) + + df = pd.DataFrame(data=d_t) + + return df + + +def plot(output_dir, plot_dir, inputs): + long_sequences_path = f"{output_dir}/long_sequences.txt" + print("calculating averages") + single_phoneme_lengths, long_sequences = inputs + # *** dump stats *** + with open(long_sequences_path, "wt") as out_seqs: + for seq in long_sequences: + out_seqs.write(f"{seq}\n\n") + + with open(f"{output_dir}/phoneme_lengths.txt", "wt") as out_stats: + out_stats.write("average phoneme length") + for k, v in sorted(single_phoneme_lengths.items()): + avg = sum(v)/len(v) + out_stats.write(f"{k}: {avg:.2f}\n") + + + print("creating 
plots") + # *** data to pandas dataframe *** + for label, lengths in sorted(single_phoneme_lengths.items()): + hist = collections.defaultdict(int) + for l in lengths: + hist[l] += 1 + + phon_df = hist_data_to_dataframe(f"phoneme label lengths {label}", "occurences", hist) + + # *** plot histogram *** + phon_df.plot(x=f"phoneme label lengths {label}", y="occurences", logy=True).get_figure().savefig(f"{plot_dir}/phoneme_{label}.png") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="alignment statistics") + parser.add_argument("allophones", nargs=1, type=str, help="allophone file path") + parser.add_argument("alignments", nargs="*", type=str, help="alignment file paths") + parser.add_argument("--root_dir", nargs=1, type=str, help="output directory path") + parser.add_argument("--sub_dir", nargs=1, type=str, help="plot directory path") + args = parser.parse_args() + + assert len(args.allophones) == 1 + allophones_path = args.allophones[0] + assert os.path.isfile(allophones_path) + + assert len(args.root_dir) == 1 + root_dir = args.root_dir[0] + assert len(args.sub_dir) == 1 + sub_dir = args.sub_dir[0] + output_dir = os.path.join(root_dir, "statistics", sub_dir) + if not os.path.exists(output_dir): + os.makedirs(output_dir) + plot_dir = os.path.join(root_dir, "plots", sub_dir) + if not os.path.exists(plot_dir): + os.makedirs(plot_dir) + + alignments_path = args.alignments + + intermediate = main(allophones_path, alignments_path, output_dir) + + plot(output_dir, plot_dir, intermediate) + diff --git a/users/raissi/setups/common/decoder/BASE_factored_hybrid_search.py b/users/raissi/setups/common/decoder/BASE_factored_hybrid_search.py index c27141a02..61bfbeb1f 100644 --- a/users/raissi/setups/common/decoder/BASE_factored_hybrid_search.py +++ b/users/raissi/setups/common/decoder/BASE_factored_hybrid_search.py @@ -948,6 +948,8 @@ def recognize( name_after_rerun = re.sub(r"Lm[0-9]*.[0.9*]", f"Lm{rounded_lm_scale}", name) name_prefix_len = 
len(f"{name_prefix}{self.name}/") + #in order to have access afterwards to the lm scale mainly + self.tuned_params = params return self.recognize( add_sis_alias_and_output=add_sis_alias_and_output, diff --git a/users/raissi/setups/common/helpers/train/network_params.py b/users/raissi/setups/common/helpers/train/network_params.py index 03be19583..9e8a055c2 100644 --- a/users/raissi/setups/common/helpers/train/network_params.py +++ b/users/raissi/setups/common/helpers/train/network_params.py @@ -83,7 +83,25 @@ def get_sa_name(self): #Conformer from-scratch -frameshift40_conformer_fullsum_from_scratch = GeneralNetworkParams( +frameshift40_conformer_fullsum_from_scratch_multi_nomlp = GeneralNetworkParams( + l2=5e-6, + use_multi_task=True, + add_mlps=False, + specaug_args=asdict(default_sa_args), + frame_rate_reduction_ratio_factor=4, + auxilary_loss_layers=[], +) + +frameshift40_conformer_fullsum_from_scratch_multi_mlp = GeneralNetworkParams( + l2=5e-6, + use_multi_task=True, + add_mlps=True, + specaug_args=asdict(default_sa_args), + frame_rate_reduction_ratio_factor=4, + auxilary_loss_layers=[], +) + +frameshift40_conformer_fullsum_from_scratch= GeneralNetworkParams( l2=5e-6, use_multi_task=False, add_mlps=False, diff --git a/users/raissi/setups/swb/legacy/SWB_TF_factored_hybrid_system.py b/users/raissi/setups/swb/legacy/SWB_TF_factored_hybrid_system.py index b4916131a..ca1cdd8a2 100644 --- a/users/raissi/setups/swb/legacy/SWB_TF_factored_hybrid_system.py +++ b/users/raissi/setups/swb/legacy/SWB_TF_factored_hybrid_system.py @@ -197,6 +197,9 @@ def __init__( } # 1/9 for 3-state, same amount of silence } + self.transcript_prior_xml = {"monostate": ("/").join( + [self.dependencies_path, "haotian/monostate/monostate.we.transcript.prior.xml"] + ),} # -------------------- External helpers -------------------- From cb8687953c3341089a9ac4e502239be134ab9ad6 Mon Sep 17 00:00:00 2001 From: schmitt Date: Thu, 6 Jun 2024 11:14:48 +0200 Subject: [PATCH 167/227] update --- 
.../segmental/model_new/blank_model/train.py | 47 ++++++++++++++----- 1 file changed, 35 insertions(+), 12 deletions(-) diff --git a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/model_new/blank_model/train.py b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/model_new/blank_model/train.py index 0d961b411..7c03707fb 100644 --- a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/model_new/blank_model/train.py +++ b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/model_new/blank_model/train.py @@ -141,9 +141,18 @@ def viterbi_training_v4( emit_blank_target_dim: Dim, batch_dims: List[Dim], ): - enc_spatial_dim.declare_same_as(non_blank_mask_dim) + # using dim.declare_same_as() leads to an error after an epoch is finished + # (UnicodeDecodeError: 'ascii' codec can't decode byte 0xe2 in position 0: ordinal not in range(128)) + # therefore, we use the following workaround + enc = enc_args["enc"] # type: rf.Tensor + enc_raw = enc.raw_tensor + enc = enc.copy_template_replace_dim_tag( + enc.get_axis_from_description(enc_spatial_dim), non_blank_mask_dim + ) + enc.raw_tensor = enc_raw + am, _ = utils.get_masked( - input=enc_args["enc"], + input=enc, mask=non_blank_mask, mask_dim=non_blank_mask_dim, batch_dims=batch_dims, @@ -152,9 +161,9 @@ def viterbi_training_v4( singleton_dim = Dim(name="singleton", dimension=1) first_enc_frame = rf.gather( - enc_args["enc"], + enc, indices=rf.convert_to_tensor(0, dtype="int32"), - axis=enc_spatial_dim, + axis=non_blank_mask_dim, ) first_enc_frame = rf.expand_dim(first_enc_frame, singleton_dim) am, _ = rf.concat( @@ -178,7 +187,7 @@ def viterbi_training_v4( blank_logits_packed, pack_dim, emit_ground_truth_packed = 
get_packed_logits_and_emit_ground_truth( blank_logits=model.decode_logits(s_blank=s_unmasked), - align_targets_spatial_dim=enc_spatial_dim, + align_targets_spatial_dim=non_blank_mask_dim, emit_ground_truth=emit_ground_truth, batch_dims=batch_dims ) @@ -202,11 +211,18 @@ def viterbi_training_v5( emit_blank_target_dim: Dim, batch_dims: List[Dim], ): - enc_spatial_dim.declare_same_as(label_states_unmasked_spatial_dim) - blank_logits = model.emit_prob(rf.concat_features(enc_args["enc"], label_states_unmasked)) + # using dim.declare_same_as() leads to an error after an epoch is finished (see viterbi_training_v4) + enc = enc_args["enc"] # type: rf.Tensor + enc_raw = enc.raw_tensor + enc = enc.copy_template_replace_dim_tag( + enc.get_axis_from_description(enc_spatial_dim), label_states_unmasked_spatial_dim + ) + enc.raw_tensor = enc_raw + + blank_logits = model.emit_prob(rf.concat_features(enc, label_states_unmasked)) blank_logits_packed, pack_dim, emit_ground_truth_packed = get_packed_logits_and_emit_ground_truth( blank_logits=blank_logits, - align_targets_spatial_dim=enc_spatial_dim, + align_targets_spatial_dim=label_states_unmasked_spatial_dim, emit_ground_truth=emit_ground_truth, batch_dims=batch_dims ) @@ -230,16 +246,23 @@ def viterbi_training_v6( emit_blank_target_dim: Dim, batch_dims: List[Dim], ): - enc_spatial_dim.declare_same_as(label_states_unmasked_spatial_dim) + # using dim.declare_same_as() leads to an error after an epoch is finished (see viterbi_training_v4) + enc = enc_args["enc"] # type: rf.Tensor + enc_raw = enc.raw_tensor + enc = enc.copy_template_replace_dim_tag( + enc.get_axis_from_description(enc_spatial_dim), label_states_unmasked_spatial_dim + ) + enc.raw_tensor = enc_raw + s, _ = model.s( - enc_args["enc"], + enc, state=model.s.default_initial_state(batch_dims=batch_dims,), - spatial_dim=enc_spatial_dim + spatial_dim=label_states_unmasked_spatial_dim ) blank_logits = model.emit_prob(rf.concat_features(s, label_states_unmasked)) 
blank_logits_packed, pack_dim, emit_ground_truth_packed = get_packed_logits_and_emit_ground_truth( blank_logits=blank_logits, - align_targets_spatial_dim=enc_spatial_dim, + align_targets_spatial_dim=label_states_unmasked_spatial_dim, emit_ground_truth=emit_ground_truth, batch_dims=batch_dims ) From c0d85963f87064cd8c1113d729806d34bdfabc53 Mon Sep 17 00:00:00 2001 From: schmitt Date: Fri, 14 Jun 2024 10:33:58 +0200 Subject: [PATCH 168/227] update --- .../global_vs_segmental_2022_23/recog_new.py | 192 +++-- .../returnn/config_builder_rf/base.py | 189 ++++- .../returnn/network_builder_rf/base.py | 105 ++- .../network_builder_rf/global_/decoder.py | 31 +- .../network_builder_rf/global_/model.py | 17 +- .../network_builder_rf/global_/recog.py | 111 ++- .../network_builder_rf/global_/train.py | 4 + .../returnn/network_builder_rf/lm/__init__.py | 0 .../network_builder_rf/lm/lstm/__init__.py | 0 .../network_builder_rf/lm/lstm/model.py | 120 +++ .../lm/lstm/model_import.py | 126 +++ .../network_builder_rf/lm/trafo/__init__.py | 0 .../network_builder_rf/lm/trafo/model.py | 263 ++++++ .../lm/trafo/model_import.py | 127 +++ .../returnn/network_builder_rf/recog.py | 7 +- .../network_builder_rf/segmental/model.py | 22 +- .../segmental/model_new/blank_model/model.py | 12 + .../segmental/model_new/blank_model/train.py | 18 +- .../segmental/model_new/label_model/model.py | 15 +- .../segmental/model_new/label_model/train.py | 794 +----------------- .../segmental/realignment.py | 476 +++++++++++ .../network_builder_rf/segmental/recog.py | 474 ++++++----- .../segmental/recombination.py | 28 +- .../network_builder_rf/segmental/train.py | 298 +++++-- .../network_builder_rf/segmental/utils.py | 70 +- .../center_window_att/baseline_v3/__init__.py | 211 +++-- .../center_window_att/baseline_v3/baseline.py | 2 + .../center_window_att/baseline_v4/__init__.py | 166 ++-- .../center_window_att/config_builder.py | 11 +- .../center_window_att/realign.py | 80 ++ .../center_window_att/recog.py | 1 + 
.../center_window_att/train.py | 27 +- .../global_att/baseline_v1/__init__.py | 41 +- .../global_att/baseline_v1/baseline.py | 46 +- .../global_att/config_builder.py | 41 + .../pipeline_ls_conf/global_att/recog.py | 7 +- users/schmitt/hdf.py | 69 ++ .../model_interfaces/training.py | 4 + users/schmitt/visualization/visualization.py | 83 +- 39 files changed, 2899 insertions(+), 1389 deletions(-) create mode 100644 users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/lm/__init__.py create mode 100644 users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/lm/lstm/__init__.py create mode 100644 users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/lm/lstm/model.py create mode 100644 users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/lm/lstm/model_import.py create mode 100644 users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/lm/trafo/__init__.py create mode 100644 users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/lm/trafo/model.py create mode 100644 users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/lm/trafo/model_import.py create mode 100644 users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/realignment.py create mode 100644 users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/center_window_att/realign.py create mode 100644 users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/global_att/config_builder.py diff --git 
a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23/recog_new.py b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23/recog_new.py index 5e88e111e..c3b8704f3 100644 --- a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23/recog_new.py +++ b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23/recog_new.py @@ -18,7 +18,8 @@ from i6_experiments.users.schmitt.flow import get_raw_wav_feature_flow from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23.dependencies.general.rasr.config import RasrConfigBuilder -from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.config_builder_rf.base import ConfigBuilderRF +from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.config_builder_rf.base import ConfigBuilderRF, GlobalAttConfigBuilderRF, SegmentalAttConfigBuilderRF +from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.pipelines.pipeline_ls_conf.global_att.config_builder import get_global_att_config_builder_rf from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23.dependencies.returnn.config_builder.base import ConfigBuilder from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23.dependencies.returnn.config_builder.global_ import GlobalConfigBuilder from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23.dependencies.returnn.config_builder.segmental import SegmentalConfigBuilder @@ -41,6 +42,7 @@ TEDLIUM2BPE1057_CTC_ALIGNMENT, ) from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23.dependencies.corpora import tedlium2, librispeech +from 
i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.network_builder_rf.global_.train import _returnn_v2_train_step, from_scratch_training class DecodingExperiment(ABC): @@ -166,28 +168,60 @@ def __init__(self, config_builder: GlobalConfigBuilder, **kwargs): def get_mini_att_checkpoint(self, train_mini_lstm_opts: Dict) -> Checkpoint: assert train_mini_lstm_opts["use_eos"], "trivial for global att but set for clarity" - num_epochs = 10 - train_mini_lstm_exp = GlobalTrainExperiment( - config_builder=self.config_builder, - alias=self.alias, - num_epochs=num_epochs, - train_opts={ - "import_model_train_epoch1": self.checkpoint, - "lr_opts": { - "type": "newbob", - "learning_rate": 1e-4, - "learning_rate_control": "newbob_multi_epoch", - "learning_rate_control_min_num_epochs_per_new_lr": 3, - "learning_rate_control_relative_error_relative_lr": True, - "learning_rate_control_error_measure": "dev_error_label_model/output_prob" - }, - "train_mini_lstm_opts": train_mini_lstm_opts, - "tf_session_opts": {"gpu_options": {"per_process_gpu_memory_fraction": 0.95}}, - "max_seq_length": {"targets": 75} - } - ) - mini_att_checkpoints, model_dir, learning_rates = train_mini_lstm_exp.run_train() + + if isinstance(self.config_builder, GlobalConfigBuilder): + train_mini_lstm_exp = GlobalTrainExperiment( + config_builder=self.config_builder, + alias=self.alias, + num_epochs=num_epochs, + train_opts={ + "import_model_train_epoch1": self.checkpoint, + "lr_opts": { + "type": "newbob", + "learning_rate": 1e-4, + "learning_rate_control": "newbob_multi_epoch", + "learning_rate_control_min_num_epochs_per_new_lr": 3, + "learning_rate_control_relative_error_relative_lr": True, + "learning_rate_control_error_measure": "dev_error_label_model/output_prob" + }, + "train_mini_lstm_opts": train_mini_lstm_opts, + "tf_session_opts": {"gpu_options": {"per_process_gpu_memory_fraction": 0.95}}, + "max_seq_length": {"targets": 75} + } + ) + 
mini_att_checkpoints, model_dir, learning_rates = train_mini_lstm_exp.run_train() + else: + assert isinstance(self.config_builder, GlobalAttConfigBuilderRF) + train_mini_lstm_exp = GlobalTrainExperiment( + config_builder=self.config_builder, + alias=self.alias, + num_epochs=num_epochs, + train_rqmt={"time": 10}, + train_opts={ + "preload_from_files": { + "pretrained_global_att_params": { + "filename": self.checkpoint, + "init_for_train": True, + "ignore_missing": True, + } + }, + "train_def": from_scratch_training, + "train_step_func": _returnn_v2_train_step, + "batching": "random", + "aux_loss_layers": None, + "lr_opts": { + "type": "const_then_linear", + "const_lr": 1e-4, + "const_frac": 1 / 3, + "final_lr": 1e-6, + "num_epochs": num_epochs + }, + "train_mini_lstm_opts": train_mini_lstm_opts, + } + ) + mini_att_checkpoints, model_dir, learning_rates = train_mini_lstm_exp.run_train() + return mini_att_checkpoints[num_epochs] @@ -209,24 +243,61 @@ def get_mini_att_checkpoint(self, train_mini_lstm_opts: Dict) -> Checkpoint: train_opts = {"dataset_opts": {"hdf_targets": align_targets}} num_epochs = 10 - train_mini_lstm_exp = SegmentalTrainExperiment( - config_builder=self.config_builder, - alias=self.alias, - num_epochs=num_epochs, - train_opts={ - "import_model_train_epoch1": self.checkpoint, - "lr_opts": { - "type": "newbob", - "learning_rate": 1e-4, - "learning_rate_control": "newbob_multi_epoch", - "learning_rate_control_min_num_epochs_per_new_lr": 3, - "learning_rate_control_relative_error_relative_lr": True, - "learning_rate_control_error_measure": "dev_error_label_model/output_prob" - }, - "train_mini_lstm_opts": train_mini_lstm_opts, - **train_opts - } - ) + + if isinstance(self.config_builder, SegmentalConfigBuilder): + train_mini_lstm_exp = SegmentalTrainExperiment( + config_builder=self.config_builder, + alias=self.alias, + num_epochs=num_epochs, + train_opts={ + "import_model_train_epoch1": self.checkpoint, + "lr_opts": { + "type": "newbob", + 
"learning_rate": 1e-4, + "learning_rate_control": "newbob_multi_epoch", + "learning_rate_control_min_num_epochs_per_new_lr": 3, + "learning_rate_control_relative_error_relative_lr": True, + "learning_rate_control_error_measure": "dev_error_label_model/output_prob" + }, + "train_mini_lstm_opts": train_mini_lstm_opts, + **train_opts + } + ) + else: + assert isinstance(self.config_builder, SegmentalAttConfigBuilderRF) + _, config_builder_ = get_global_att_config_builder_rf( + use_weight_feedback=self.config_builder.use_weight_feedback, + use_att_ctx_in_state=self.config_builder.use_att_ctx_in_state + ) + + train_mini_lstm_exp = GlobalTrainExperiment( + config_builder=config_builder_, + alias=self.alias, + num_epochs=num_epochs, + train_rqmt={"time": 10}, + train_opts={ + "preload_from_files": { + "pretrained_global_att_params": { + "filename": self.checkpoint, + "init_for_train": True, + "ignore_missing": True, + } + }, + "train_def": from_scratch_training, + "train_step_func": _returnn_v2_train_step, + "batching": "random", + "aux_loss_layers": None, + "lr_opts": { + "type": "const_then_linear", + "const_lr": 1e-4, + "const_frac": 1 / 3, + "final_lr": 1e-6, + "num_epochs": num_epochs + }, + "train_mini_lstm_opts": train_mini_lstm_opts, + } + ) + mini_att_checkpoints, model_dir, learning_rates = train_mini_lstm_exp.run_train() return mini_att_checkpoints[num_epochs] @@ -733,12 +804,12 @@ def __init__( ilm_opts: Optional[Dict] = None, run_analysis: bool = False, search_rqmt: Optional[Dict] = None, - search_alias: Optional[str] = None + search_alias: Optional[str] = None, + corpus_keys: Tuple[str, ...] 
= ("dev-other",) ): self.recog_opts = recog_opts if recog_opts is not None else {} - assert "lm_opts" not in self.recog_opts, "lm_opts are set by the pipeline" - assert "ilm_correction_opts" not in self.recog_opts, "ilm_correction_opts are set by the pipeline" - assert "beam_size" not in self.recog_opts, "beam_size is set by the pipeline" + for key in ("lm_opts", "ilm_correction_opts", "beam_size", "search_corpus_key"): + assert key not in self.recog_opts, f"{key} is set by the pipeline" self.alias = alias self.config_builder = config_builder @@ -753,6 +824,7 @@ def __init__( self.run_analysis = run_analysis self.search_rqmt = search_rqmt self.search_alias = search_alias + self.corpus_keys = corpus_keys @abstractmethod def run_experiment( @@ -764,12 +836,20 @@ def run(self): for beam_size in self.beam_sizes: for lm_scale in self.lm_scales: for ilm_scale in self.ilm_scales: - self.run_experiment( - beam_size=beam_size, - lm_scale=lm_scale, - ilm_scale=ilm_scale, - checkpoint_alias=checkpoint_alias - ) + for corpus_key in self.corpus_keys: + self.recog_opts.update({ + "lm_opts": {"scale": lm_scale, **self.lm_opts} if lm_scale > 0 else None, + "ilm_correction_opts": { + "scale": ilm_scale, **self.ilm_opts} if ilm_scale > 0 and lm_scale > 0 else None, + "beam_size": beam_size, + "search_corpus_key": corpus_key + }) + self.run_experiment( + beam_size=beam_size, + lm_scale=lm_scale, + ilm_scale=ilm_scale, + checkpoint_alias=checkpoint_alias + ) class ReturnnGlobalAttDecodingPipeline(DecodingPipeline): @@ -778,12 +858,6 @@ def __init__(self, config_builder: GlobalConfigBuilder, **kwargs): self.config_builder = config_builder def run_experiment(self, beam_size: int, lm_scale: float, ilm_scale: float, checkpoint_alias: str): - self.recog_opts.update({ - "lm_opts": {"scale": lm_scale, **self.lm_opts} if lm_scale > 0 else None, - "ilm_correction_opts": {"scale": ilm_scale, **self.ilm_opts} if ilm_scale > 0 and lm_scale > 0 else None, - "beam_size": beam_size - }) - if 
lm_scale > 0 and beam_size in (50, 84) and "batch_size" not in self.recog_opts: self.recog_opts["batch_size"] = 4000 * 160 @@ -826,12 +900,6 @@ def __init__(self, config_builder: SegmentalConfigBuilder, **kwargs): self.analysis_opts["ground_truth_hdf"] = self.realignment def run_experiment(self, beam_size: int, lm_scale: float, ilm_scale: float, checkpoint_alias: str): - self.recog_opts.update({ - "lm_opts": {"scale": lm_scale, **self.lm_opts} if lm_scale > 0 else None, - "ilm_correction_opts": {"scale": ilm_scale, **self.ilm_opts} if ilm_scale > 0 else None, - "beam_size": beam_size - }) - if lm_scale > 0: if beam_size == 12 and "batch_size" not in self.recog_opts: self.recog_opts["batch_size"] = 7500 * 160 diff --git a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/config_builder_rf/base.py b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/config_builder_rf/base.py index 6130a6092..5c969dc99 100644 --- a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/config_builder_rf/base.py +++ b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/config_builder_rf/base.py @@ -1,5 +1,8 @@ from i6_experiments.users.schmitt.datasets.oggzip import get_dataset_dict as get_oggzip_dataset_dict from i6_experiments.users.schmitt.datasets.concat import get_concat_dataset_dict +from i6_experiments.users.schmitt.datasets.variable import ( + get_interpolation_alignment_dataset, get_interpolation_alignment_scores_dataset +) from i6_experiments.users.schmitt.datasets.extern_sprint import get_dataset_dict as get_extern_sprint_dataset_dict from i6_experiments.users.schmitt.specaugment import * from i6_experiments.users.schmitt import dynamic_lr @@ -68,6 +71,10 @@ def get_train_config(self, opts: Dict): python_epilog = copy.deepcopy(self.python_epilog) dataset_opts = opts.pop("dataset_opts", {}) + + if 
opts.get("full_sum_alignment_interpolation_factor", 0.0) > 0.0: + dataset_opts["add_alignment_interpolation_datasets"] = True + config_dict.update(self.get_train_datasets(dataset_opts=dataset_opts)) extern_data_raw = self.get_extern_data_dict(dataset_opts) extern_data_raw = instanciate_delayed(extern_data_raw) @@ -82,6 +89,9 @@ def get_train_config(self, opts: Dict): if opts.get("cleanup_old_models"): post_config_dict["cleanup_old_models"] = opts.pop("cleanup_old_models") + if opts.get("train_mini_lstm_opts") is not None: + config_dict["use_mini_att"] = True + config_dict.update(self.get_lr_settings(lr_opts=opts.pop("lr_opts"), python_epilog=python_epilog)) config_dict["batch_size"] = opts.pop("batch_size", 15_000) * self.batch_size_factor @@ -101,8 +111,7 @@ def get_train_config(self, opts: Dict): "gradient_clip_global_norm", # "specaugment_steps", "torch_amp", - "full_sum_training_beam_size", - # "max_seq_length" + # "max_seq_length", ] config_dict.update( {k: opts.pop(k) for k in remaining_opt_keys if k in opts} @@ -159,6 +168,62 @@ def get_recog_config(self, opts: Dict): config_dict["batch_size"] = opts.get("batch_size", 15_000) * self.batch_size_factor + config_dict["beam_search_opts"] = { + "beam_size": opts.get("beam_size", 12), + } + + lm_opts = opts.get("lm_opts", None) # type: Optional[Dict] + if lm_opts is not None: + assert lm_opts.get("type", "trafo") == "trafo" + + config_dict["external_lm"] = { + "class": "TransformerDecoder", + "vocab_dim": 10_025, + "model_dim": 1024, + "embed_dim": 128, + "num_layers": 24, + "decoder_layer_opts": {"self_att_opts": {"with_bias": False, "att_dropout_broadcast": False}}, + "input_embedding_scale": 1.0, + "share_embedding": False, + "logits_with_bias": True, + "input_dropout": 0.1, + } + + if "preload_from_files" not in config_dict: + config_dict["preload_from_files"] = {} + config_dict["preload_from_files"]["external_lm"] = { + "filename": 
"/work/asr3/zeyer/schmitt/sisyphus_work_dirs/segmental_models_2022_23_rf/i6_experiments/users/schmitt/returnn_frontend/convert/checkpoint/ConvertTfCheckpointToRfPtJob.7haAE0Cx93dA/output/model/network.023.pt", + "prefix": "language_model.", + "ignore_missing": False, + } + + config_dict["beam_search_opts"]["external_lm_scale"] = lm_opts["scale"] + + ilm_correction_opts = opts.get("ilm_correction_opts", None) + if ilm_correction_opts is not None: + assert ilm_correction_opts["type"] == "mini_att" + + config_dict["use_mini_att"] = True + + if "preload_from_files" not in config_dict: + config_dict["preload_from_files"] = {} + config_dict["preload_from_files"]["mini_lstm"] = { + "filename": ilm_correction_opts["mini_att_checkpoint"], + "prefix": "do_not_load_", + "var_name_mapping": {layer: f"do_not_load_{layer}" for layer in ( + "label_decoder.mini_att.bias", + "label_decoder.mini_att.weight", + "label_decoder.mini_att_lstm.bias", + "label_decoder.mini_att_lstm.rec_weight", + "label_decoder.mini_att_lstm.ff_weight", + )} + } + + config_dict["beam_search_opts"].update({ + "ilm_type": "mini_att", + "ilm_correction_scale": ilm_correction_opts["scale"], + }) + python_epilog.append( serialization.Collection( [ @@ -188,7 +253,6 @@ def get_recog_config(self, opts: Dict): # serialize remaining functions, e.g. dynamic learning rate return get_serializable_config(returnn_train_config, serialize_dim_tags=False) - def get_recog_checkpoints( self, model_dir: Path, learning_rates: Path, key: str, checkpoints: Dict[int, Checkpoint], n_epochs: int): # last checkpoint @@ -230,6 +294,16 @@ def get_lr_settings(self, lr_opts, python_epilog: Optional[List] = None): "learning_rates": [const_lr] * int((num_epochs*const_frac)) + list(np.linspace(const_lr, final_lr, num_epochs - int((num_epochs*const_frac)))), }) elif lr_opts["type"] == "dyn_lr_piecewise_linear": + # By batch size (in k) and num (sub)epochs. + # 500 subepochs is usually for multi-GPU with 4 GPUs, + # i.e. 
the same as single-GPU 2000 subepochs. + # If the dict is missing some entry, + # unfortunately there is currently no good automatic way to get the number. + # I need to log at the stats of some setup with this batch size. + # I just run some setup with some arbitrary LR scheduling (calling it "wrongLr"), + # or maybe with sqrt-decay, and then look at the stats (steps/ep, or total num steps), + # and give some estimates for the steps here, i.e. 45%, 90%, almost 100%, + # making sure the last number is slightly below the real total number of steps. _lrlin_oclr_steps_by_bs_nep = { (3, 125): [194_000, 388_000, 430_000], # ~3450steps/ep, 125 eps -> 430k steps in total (8, 125): [139_000, 279_000, 310_000], # ~2485steps/ep, 125 eps -> 310k steps in total @@ -415,9 +489,14 @@ def get_extern_data_dict(self, dataset_opts: Dict): extern_data_dict["data"] = {"dim_tags": [batch_dim, time_dim, audio_dim]} out_spatial_dim = Dim(description="out_spatial", dimension=None, kind=Dim.Types.Spatial) + + if isinstance(self, SegmentalAttConfigBuilderRF) and self.use_joint_model: + vocab_dimension = self.variant_params["dependencies"].model_hyperparameters.target_num_labels_wo_blank + else: + vocab_dimension = self.variant_params["dependencies"].model_hyperparameters.target_num_labels vocab_dim = Dim( description="vocab", - dimension=self.variant_params["dependencies"].model_hyperparameters.target_num_labels, + dimension=vocab_dimension, kind=Dim.Types.Spatial ) extern_data_dict["targets"] = { @@ -425,15 +504,49 @@ def get_extern_data_dict(self, dataset_opts: Dict): "sparse_dim": vocab_dim, } + if dataset_opts.get("add_alignment_interpolation_datasets"): + score_dim = Dim(description="interpolation_alignment_score", dimension=1, kind=Dim.Types.Feature) + interpolation_alignment_spatial_dim = Dim( + description="interpolation_alignment_spatial", dimension=None, kind=Dim.Types.Spatial) + extern_data_dict.update({ + "interpolation_alignment": { + "dim_tags": [batch_dim, 
interpolation_alignment_spatial_dim], "sparse_dim": vocab_dim}, + "interpolation_alignment_scores": {"dim_tags": [batch_dim, score_dim]} + }) + return extern_data_dict def get_train_datasets(self, dataset_opts: Dict): - return dict( + datasets = dict( train=self.get_train_dataset_dict(dataset_opts), dev=self.get_cv_dataset_dict(dataset_opts), eval_datasets={"devtrain": self.get_devtrain_dataset_dict(dataset_opts)} ) + if dataset_opts.get("add_alignment_interpolation_datasets"): + for corpus_key, dataset_dict in datasets.items(): + if corpus_key == "eval_datasets": + dataset_dict = dataset_dict["devtrain"] + + assert dataset_dict["class"] == "MetaDataset" + assert set(dataset_dict["data_map"].keys()) == {"data", "targets"} + dataset_dict["datasets"].update({ + "interpolation_alignment_dataset": { + "class": "VariableDataset", + "get_dataset": get_interpolation_alignment_dataset, + }, + "interpolation_alignment_scores_dataset": { + "class": "VariableDataset", + "get_dataset": get_interpolation_alignment_scores_dataset, + } + }) + dataset_dict["data_map"].update({ + "interpolation_alignment": ("interpolation_alignment_dataset", "data"), + "interpolation_alignment_scores": ("interpolation_alignment_scores_dataset", "data"), + }) + + return datasets + def get_search_dataset(self, search_corpus_key: str, dataset_opts: Dict): return dict( forward_data=self.get_search_dataset_dict(corpus_key=search_corpus_key, dataset_opts=dataset_opts) @@ -499,6 +612,9 @@ def __init__( self.config_dict.update(dict( center_window_size=center_window_size, )) + self.use_joint_model = use_joint_model + self.label_decoder_state = label_decoder_state + self.use_weight_feedback = use_weight_feedback if use_joint_model: assert not blank_decoder_version, "Either use joint model or separate label and blank model" @@ -518,6 +634,16 @@ def get_train_config(self, opts: Dict): if opts.get("alignment_augmentation_opts"): train_config.config["alignment_augmentation_opts"] = 
opts["alignment_augmentation_opts"] + remaining_opt_keys = [ + "full_sum_beam_size", + "full_sum_alignment_interpolation_factor", + "full_sum_lattice_downsampling", + "full_sum_train_on_viterbi_paths", + ] + train_config.config.update( + {k: opts.pop(k) for k in remaining_opt_keys if k in opts} + ) + return train_config def get_recog_config(self, opts: Dict): @@ -533,6 +659,57 @@ def get_recog_config(self, opts: Dict): use_recombination = opts.get("use_recombination") if use_recombination is not None: - recog_config.config["use_recombination"] = use_recombination + recog_config.config["beam_search_opts"]["use_recombination"] = use_recombination return recog_config + + def get_realign_config(self, opts: Dict): + config_dict = copy.deepcopy(self.config_dict) + post_config_dict = copy.deepcopy(self.post_config_dict) + python_prolog = copy.deepcopy(self.python_prolog) + python_epilog = copy.deepcopy(self.python_epilog) + + dataset_opts = opts.get("dataset_opts", {}) + config_dict.update(dict( + task="forward", + batching=opts.get("batching", "random") + )) + + config_dict.update( + self.get_search_dataset( + search_corpus_key=opts["corpus_key"], + dataset_opts=dataset_opts + )) + extern_data_raw = self.get_extern_data_dict(dataset_opts) + extern_data_raw = instanciate_delayed(extern_data_raw) + + config_dict["batch_size"] = opts.get("batch_size", 15_000) * self.batch_size_factor + + python_epilog.append( + serialization.Collection( + [ + serialization.NonhashedCode(get_import_py_code()), + serialization.NonhashedCode( + nn.ReturnnConfigSerializer.get_base_extern_data_py_code_str_direct(extern_data_raw) + ), + *serialize_model_def(self.model_def), + serialization.Import(self.get_model_func, import_as="get_model"), + serialization.Import(opts["realign_def"], import_as="_realign_def", ignore_import_as_for_hash=True), + serialization.Import(opts["forward_step_func"], import_as="forward_step"), + serialization.Import(opts["forward_callback"], import_as="forward_callback"), 
+ serialization.PythonEnlargeStackWorkaroundNonhashedCode, + serialization.PythonCacheManagerFunctionNonhashedCode, + serialization.PythonModelineNonhashedCode + ] + ) + ) + + returnn_train_config = ReturnnConfig( + config=config_dict, + post_config=post_config_dict, + python_prolog=python_prolog, + python_epilog=python_epilog, + ) + + # serialize remaining functions, e.g. dynamic learning rate + return get_serializable_config(returnn_train_config, serialize_dim_tags=False) diff --git a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/base.py b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/base.py index 4151392f2..b2f3e7baa 100644 --- a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/base.py +++ b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/base.py @@ -25,6 +25,8 @@ def __init__( l2: float = 0.0001, use_weight_feedback: bool = True, use_att_ctx_in_state: bool = True, + decoder_state: str = "nb-lstm", + use_mini_att: bool = False, ): super(BaseLabelDecoder, self).__init__() @@ -40,29 +42,53 @@ def __init__( self.target_embed = rf.Embedding(target_dim, Dim(name="target_embed", dimension=640)) - zoneout_lstm_opts = dict( - out_dim=Dim(name="lstm", dimension=1024), - zoneout_factor_cell=0.15, - zoneout_factor_output=0.05, - use_zoneout_output=False, # like RETURNN/TF ZoneoutLSTM old default - # parts_order="icfo", # like RETURNN/TF ZoneoutLSTM - # parts_order="ifco", - parts_order="jifo", # NativeLSTM (the code above converts it...) 
- forget_bias=0.0, # the code above already adds it during conversion - ) self.use_att_ctx_in_state = use_att_ctx_in_state - if use_att_ctx_in_state: - self.s = rf.ZoneoutLSTM( - self.target_embed.out_dim + att_num_heads * enc_out_dim, - **zoneout_lstm_opts, + self.use_weight_feedback = use_weight_feedback + + self.decoder_state = decoder_state + if "lstm" in decoder_state: + ilm_layer_class = rf.ZoneoutLSTM + ilm_layer_opts = dict( + out_dim=Dim(name="lstm", dimension=1024), + zoneout_factor_cell=0.15, + zoneout_factor_output=0.05, + use_zoneout_output=False, # like RETURNN/TF ZoneoutLSTM old default + # parts_order="icfo", # like RETURNN/TF ZoneoutLSTM + # parts_order="ifco", + parts_order="jifo", # NativeLSTM (the code above converts it...) + forget_bias=0.0, # the code above already adds it during conversion ) + if use_att_ctx_in_state: + self.s = ilm_layer_class( + self.target_embed.out_dim + att_num_heads * enc_out_dim, + **ilm_layer_opts, + ) + else: + self.s_wo_att = ilm_layer_class( + self.target_embed.out_dim, + **ilm_layer_opts, + ) else: - self.s_wo_att = rf.ZoneoutLSTM( - self.target_embed.out_dim, - **zoneout_lstm_opts, + ilm_layer_class = rf.Linear + ilm_layer_opts = dict( + out_dim=Dim(name="linear-ilm", dimension=1024), ) + if use_att_ctx_in_state: + self.s_linear = ilm_layer_class( + self.target_embed.out_dim + att_num_heads * enc_out_dim, + **ilm_layer_opts, + ) + else: + self.s_wo_att_linear = ilm_layer_class( + self.target_embed.out_dim, + **ilm_layer_opts, + ) + + self.use_mini_att = use_mini_att + if use_mini_att: + self.mini_att_lstm = rf.LSTM(self.target_embed.out_dim, Dim(name="mini-att-lstm", dimension=50)) + self.mini_att = rf.Linear(self.mini_att_lstm.out_dim, self.att_num_heads * self.enc_out_dim) - self.use_weight_feedback = use_weight_feedback if use_weight_feedback: self.weight_feedback = rf.Linear(att_num_heads, enc_key_total_dim, with_bias=False) @@ -86,15 +112,42 @@ def _update_state( self, input_embed: rf.Tensor, prev_att: 
rf.Tensor, - prev_s_state: rf.LstmState, - ): - if self.use_att_ctx_in_state: - return self.s(rf.concat_features(input_embed, prev_att), state=prev_s_state, spatial_dim=single_step_dim) + prev_s_state: Optional[rf.LstmState], + ) -> Tuple[rf.Tensor, Optional[rf.LstmState]]: + if "lstm" in self.decoder_state: + ilm_forward_opts = dict( + state=prev_s_state, + spatial_dim=single_step_dim, + ) + if self.use_att_ctx_in_state: + return self.s(rf.concat_features(input_embed, prev_att), **ilm_forward_opts) + else: + return self.s_wo_att(input_embed, **ilm_forward_opts) else: - return self.s_wo_att(input_embed, state=prev_s_state, spatial_dim=single_step_dim) + if self.use_att_ctx_in_state: + return self.s_linear(rf.concat_features(input_embed, prev_att)), None + else: + return self.s_wo_att_linear(input_embed), None def get_lstm(self): - if self.use_att_ctx_in_state: - return self.s + if "lstm" in self.decoder_state: + if self.use_att_ctx_in_state: + return self.s + else: + return self.s_wo_att else: - return self.s_wo_att + if self.use_att_ctx_in_state: + return self.s_linear + else: + return self.s_wo_att_linear + + def get_att( + self, + att_weights: rf.Tensor, + enc: rf.Tensor, + reduce_dim: Dim + ) -> rf.Tensor: + att0 = rf.dot(att_weights, enc, reduce=reduce_dim, use_mask=False) + att0.feature_dim = self.enc_out_dim + att, _ = rf.merge_dims(att0, dims=(self.att_num_heads, self.enc_out_dim)) + return att diff --git a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/global_/decoder.py b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/global_/decoder.py index 6aed0ba96..d6d5c95d5 100644 --- a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/global_/decoder.py +++ 
b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/global_/decoder.py @@ -22,6 +22,9 @@ def decoder_default_initial_state(self, *, batch_dims: Sequence[Dim], enc_spatia ) state.att.feature_dim_axis = len(state.att.dims) - 1 + if self.use_mini_att: + state.mini_att_lstm = self.mini_att_lstm.default_initial_state(batch_dims=batch_dims) + if self.use_weight_feedback: state.accum_att_weights = rf.zeros( list(batch_dims) + [enc_spatial_dim, self.att_num_heads], feature_dim=self.att_num_heads @@ -52,6 +55,7 @@ def loop_step( enc_spatial_dim: Dim, input_embed: rf.Tensor, state: Optional[rf.State] = None, + use_mini_att: bool = False, ) -> Tuple[Dict[str, rf.Tensor], rf.State]: """step of the inner loop""" if state is None: @@ -79,9 +83,12 @@ def loop_step( if self.use_weight_feedback: state_.accum_att_weights = state.accum_att_weights + att_weights * inv_fertility * 0.5 - att0 = rf.dot(att_weights, enc, reduce=enc_spatial_dim, use_mask=False) - att0.feature_dim = self.enc_out_dim - att, _ = rf.merge_dims(att0, dims=(self.att_num_heads, self.enc_out_dim)) + if use_mini_att: + att_lstm, state_.mini_att_lstm = self.mini_att_lstm( + input_embed, state=state.mini_att_lstm, spatial_dim=single_step_dim) + att = self.mini_att(att_lstm) + else: + att = self.get_att(att_weights, enc, enc_spatial_dim) state_.att = att return {"s": s, "att": att}, state_ @@ -110,6 +117,9 @@ def __call__( enc_ctx: rf.Tensor, enc_spatial_dim: Dim, s: rf.Tensor, + input_embed: rf.Tensor, + input_embed_spatial_dim: Dim, + use_mini_att: bool = False, ) -> rf.Tensor: s_transformed = self.s_transformed(s) @@ -118,10 +128,17 @@ def __call__( energy_in = enc_ctx + weight_feedback + s_transformed energy = self.energy(rf.tanh(energy_in)) att_weights = rf.softmax(energy, axis=enc_spatial_dim) - # we do not need use_mask because the softmax output is already padded with zeros - att0 = rf.dot(att_weights, enc, reduce=enc_spatial_dim, use_mask=False) 
- att0.feature_dim = self.enc_out_dim - att, _ = rf.merge_dims(att0, dims=(self.att_num_heads, self.enc_out_dim)) + if use_mini_att: + att_lstm, _ = self.mini_att_lstm( + input_embed, + state=self.mini_att_lstm.default_initial_state( + batch_dims=input_embed.remaining_dims([input_embed_spatial_dim, input_embed.feature_dim])), + spatial_dim=input_embed_spatial_dim + ) + att = self.mini_att(att_lstm) + else: + # we do not need use_mask because the softmax output is already padded with zeros + att = self.get_att(att_weights, enc, enc_spatial_dim) return att diff --git a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/global_/model.py b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/global_/model.py index 8af71daf3..d8f595ed2 100644 --- a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/global_/model.py +++ b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/global_/model.py @@ -23,7 +23,7 @@ def __init__( enc_key_total_dim: Dim = Dim(name="enc_key_total_dim", dimension=1024), att_dropout: float = 0.1, l2: float = 0.0001, - language_model: Optional[RFModelWithMakeLabelScorer] = None, + language_model: Optional[rf.Module] = None, enc_in_dim: Dim, enc_out_dim: Dim = Dim(name="enc", dimension=512), enc_num_layers: int = 12, @@ -36,6 +36,7 @@ def __init__( eos_idx: int, use_weight_feedback: bool = True, use_att_ctx_in_state: bool = True, + use_mini_att: bool = False, ): super(GlobalAttentionModel, self).__init__() @@ -72,17 +73,22 @@ def __init__( eos_idx=eos_idx, use_weight_feedback=use_weight_feedback, use_att_ctx_in_state=use_att_ctx_in_state, + use_mini_att=use_mini_att, ) if language_model: - self.language_model, self.language_model_make_label_scorer = language_model + self.language_model = language_model else: 
self.language_model = None - self.language_model_make_label_scorer = None self.blank_idx = blank_idx self.target_dim = target_dim + if use_mini_att: + for name, param in self.named_parameters(): + if "mini_att" not in name: + param.trainable = False + class MakeModel: """for import""" @@ -131,6 +137,7 @@ def make_model( language_model: Optional[Dict[str, Any]] = None, use_weight_feedback: bool = True, use_att_ctx_in_state: bool = True, + use_mini_att: bool = False, **extra, ) -> GlobalAttentionModel: """make""" @@ -145,7 +152,6 @@ def make_model( from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.network_builder_rf.lm.trafo import model as trafo_lm lm = trafo_lm.MakeModel(vocab_dim=target_dim, **language_model)() - lm = (lm, functools.partial(trafo_lm.make_label_scorer_torch, model=lm)) return GlobalAttentionModel( enc_in_dim=in_dim, @@ -172,6 +178,7 @@ def make_model( language_model=lm, use_weight_feedback=use_weight_feedback, use_att_ctx_in_state=use_att_ctx_in_state, + use_mini_att=use_mini_att, **extra, ) @@ -213,6 +220,7 @@ def from_scratch_model_def(*, epoch: int, in_dim: Dim, target_dim: Dim) -> Globa lm_opts = config.typed_value("external_lm") use_weight_feedback = config.bool("use_weight_feedback", True) use_att_ctx_in_state = config.bool("use_att_ctx_in_state", True) + use_mini_att = config.bool("use_mini_att", False) return MakeModel.make_model( in_dim, @@ -222,6 +230,7 @@ def from_scratch_model_def(*, epoch: int, in_dim: Dim, target_dim: Dim) -> Globa language_model=lm_opts, use_weight_feedback=use_weight_feedback, use_att_ctx_in_state=use_att_ctx_in_state, + use_mini_att=use_mini_att, ) diff --git a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/global_/recog.py b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/global_/recog.py index 8448a5744..084d0b401 
100644 --- a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/global_/recog.py +++ b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/global_/recog.py @@ -1,7 +1,7 @@ from typing import Optional, Dict, Any, Tuple import tree -from returnn.tensor import Tensor, Dim +from returnn.tensor import Tensor, Dim, single_step_dim import returnn.frontend as rf from returnn.frontend.tensor_array import TensorArray @@ -15,7 +15,11 @@ def model_recog( model: GlobalAttentionModel, data: Tensor, data_spatial_dim: Dim, + beam_size: int, max_seq_len: Optional[int] = None, + external_lm_scale: Optional[float] = None, + ilm_type: Optional[str] = None, + ilm_correction_scale: Optional[float] = None, ) -> Tuple[Tensor, Tensor, Dim, Dim]: """ Function is run within RETURNN. @@ -30,37 +34,70 @@ def model_recog( out_spatial_dim, final beam_dim """ - assert not model.label_decoder.language_model # not implemented here. use the pure PyTorch search instead - batch_dims = data.remaining_dims((data_spatial_dim, data.feature_dim)) + if ilm_type is not None: + assert ilm_type in ("mini_att",) + assert ilm_correction_scale is not None + + # --------------------------------- init encoder, dims, etc --------------------------------- + enc_args, enc_spatial_dim = model.encoder.encode(data, in_spatial_dim=data_spatial_dim) - beam_size = 12 - length_normalization_exponent = 1.0 + if max_seq_len is None: max_seq_len = enc_spatial_dim.get_size_tensor() else: max_seq_len = rf.convert_to_tensor(max_seq_len, dtype="int32") - print("** max seq len:", max_seq_len.raw_tensor) - # Eager-mode implementation of beam search. - # Initial state. 
+ batch_dims = data.remaining_dims((data_spatial_dim, data.feature_dim)) beam_dim = Dim(1, name="initial-beam") batch_dims_ = [beam_dim] + batch_dims - decoder_state = model.label_decoder.decoder_default_initial_state(batch_dims=batch_dims_, enc_spatial_dim=enc_spatial_dim) - target = rf.constant(model.label_decoder.bos_idx, dims=batch_dims_, sparse_dim=model.target_dim) + + length_normalization_exponent = 1.0 + ended = rf.constant(False, dims=batch_dims_) out_seq_len = rf.constant(0, dims=batch_dims_) seq_log_prob = rf.constant(0.0, dims=batch_dims_) - i = 0 + # lists of [B, beam] tensors seq_targets = [] seq_backrefs = [] + + # --------------------------------- init states --------------------------------- + + decoder_state = model.label_decoder.decoder_default_initial_state( + batch_dims=batch_dims_, enc_spatial_dim=enc_spatial_dim) + + # external LM + if model.language_model: + lm_state = model.language_model.default_initial_state(batch_dims=batch_dims_) + else: + lm_state = None + + # ILM + if ilm_type is not None: + ilm_state = model.label_decoder.decoder_default_initial_state( + batch_dims=batch_dims_, enc_spatial_dim=enc_spatial_dim) + else: + ilm_state = None + + # --------------------------------- init targets --------------------------------- + + target = rf.constant(model.label_decoder.bos_idx, dims=batch_dims_, sparse_dim=model.target_dim) + + # --------------------------------- main loop --------------------------------- + + i = 0 while True: + # --------------------------------- get embeddings --------------------------------- + if i == 0: input_embed = rf.zeros( batch_dims_ + [model.label_decoder.target_embed.out_dim], feature_dim=model.label_decoder.target_embed.out_dim) else: input_embed = model.label_decoder.target_embed(target) + + # --------------------------------- decoder step --------------------------------- + step_out, decoder_state = model.label_decoder.loop_step( **enc_args, enc_spatial_dim=enc_spatial_dim, @@ -69,19 +106,71 @@ def 
model_recog( ) logits = model.label_decoder.decode_logits(input_embed=input_embed, **step_out) label_log_prob = rf.log_softmax(logits, axis=model.target_dim) + + # --------------------------------- external LM step --------------------------------- + + if lm_state is not None: + lm_logits, lm_state = model.language_model( + target, + spatial_dim=single_step_dim, + state=lm_state, + ) + lm_label_log_prob = rf.log_softmax(lm_logits, axis=model.target_dim) + label_log_prob += external_lm_scale * lm_label_log_prob + + # --------------------------------- ILM step --------------------------------- + + if ilm_state is not None: + ilm_step_out, ilm_state = model.label_decoder.loop_step( + **enc_args, + enc_spatial_dim=enc_spatial_dim, + input_embed=input_embed, + state=ilm_state, + use_mini_att=True + ) + ilm_logits = model.label_decoder.decode_logits(input_embed=input_embed, **ilm_step_out) + ilm_label_log_prob = rf.log_softmax(ilm_logits, axis=model.target_dim) + label_log_prob -= ilm_correction_scale * ilm_label_log_prob + + # --------------------------------- filter finished beams, pick top-k --------------------------------- + # Filter out finished beams label_log_prob = rf.where( ended, rf.sparse_to_dense(model.label_decoder.eos_idx, axis=model.target_dim, label_value=0.0, other_value=-1.0e30), label_log_prob, ) + seq_log_prob = seq_log_prob + label_log_prob # Batch, InBeam, Vocab seq_log_prob, (backrefs, target), beam_dim = rf.top_k( seq_log_prob, k_dim=Dim(beam_size, name=f"dec-step{i}-beam"), axis=[beam_dim, model.target_dim] ) # seq_log_prob, backrefs, target: Batch, Beam seq_targets.append(target) seq_backrefs.append(backrefs) + + # --------------------------------- update states --------------------------------- + + # decoder decoder_state = tree.map_structure(lambda s: rf.gather(s, indices=backrefs), decoder_state) + + # external LM + if lm_state is not None: + def _get_lm_state(state): + if isinstance(state, Dim): + return state + + assert isinstance(state, 
Tensor) + if len(state.dims) == 0: + return state + + return rf.gather(state, indices=backrefs) + + lm_state = tree.map_structure(lambda state: _get_lm_state(state), lm_state) + + # ILM + if ilm_state is not None: + ilm_state = tree.map_structure(lambda s: rf.gather(s, indices=backrefs), ilm_state) + ended = rf.gather(ended, indices=backrefs) out_seq_len = rf.gather(out_seq_len, indices=backrefs) i += 1 diff --git a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/global_/train.py b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/global_/train.py index b068dc565..ccc65e321 100644 --- a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/global_/train.py +++ b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/global_/train.py @@ -48,6 +48,7 @@ def _body(input_embed: Tensor, state: rf.State): enc_spatial_dim=enc_spatial_dim, input_embed=input_embed, state=state.decoder, + use_mini_att=model.use_mini_att, ) return loop_out_, new_state @@ -84,6 +85,9 @@ def get_s_and_att_efficient( enc_ctx=enc_args["enc_ctx"], enc_spatial_dim=enc_spatial_dim, s=s, + input_embed=input_embeddings, + input_embed_spatial_dim=targets_spatial_dim, + use_mini_att=model.use_mini_att, ) return s, att diff --git a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/lm/__init__.py b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/lm/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/lm/lstm/__init__.py 
b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/lm/lstm/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/lm/lstm/model.py b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/lm/lstm/model.py new file mode 100644 index 000000000..9e0ba43bc --- /dev/null +++ b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/lm/lstm/model.py @@ -0,0 +1,120 @@ +from typing import Optional, Any, Tuple, Dict, Sequence, List +import tree +from returnn.tensor import Tensor, Dim, single_step_dim +import returnn.frontend as rf + + +class LSTM_LM_Model(rf.Module): + """Model definition""" + + def __init__( + self, + in_dim: Dim, + target_dim: Dim, + *, + num_enc_layers: int = 12, + + lstm_input_dim: Dim = Dim(name="lstm-input", dimension=128), + lstm_model_dim: Dim = Dim(name="lstm-model", dimension=2048), + # enc_att_num_heads: int = 4, + # enc_conformer_layer_opts: Optional[Dict[str, Any]] = None, + # enc_key_total_dim: Dim = Dim(name="enc_key_total_dim", dimension=1024), + # att_num_heads: Dim = Dim(name="att_num_heads", dimension=1), + # att_dropout: float = 0.1, + # enc_dropout: float = 0.1, + # enc_att_dropout: float = 0.1, + # l2: float = 0.0001, + search_args: Optional[Dict[str, Any]] = None, + ): + super(LSTM_LM_Model, self).__init__() + self.in_dim = in_dim + + self.input = rf.Embedding(in_dim, lstm_input_dim) + self.input_bias = rf.Parameter((lstm_input_dim,)) + + self.lstm_0 = rf.LSTM(lstm_input_dim, lstm_model_dim, with_bias=True) + self.lstm_1 = rf.LSTM(lstm_model_dim, lstm_model_dim, with_bias=True) + self.lstm_2 = rf.LSTM(lstm_model_dim, lstm_model_dim, with_bias=True) + self.lstm_3 = rf.LSTM(lstm_model_dim, lstm_model_dim, with_bias=True) + + 
self.output = rf.Linear(lstm_model_dim, target_dim) + + def loop_step(self, prev_target, prev_state): + """loop step""" + lm_state = rf.State() + input = self.input(prev_target) + input += self.input_bias + # breakpoint() + lstm_0, lstm_0_state = self.lstm_0(input, state=prev_state.lstm_0, spatial_dim=single_step_dim) + lm_state.lstm_0 = lstm_0_state + lstm_1, lstm_1_state = self.lstm_1(lstm_0, state=prev_state.lstm_1, spatial_dim=single_step_dim) + lm_state.lstm_1 = lstm_1_state + lstm_2, lstm_2_state = self.lstm_2(lstm_1, state=prev_state.lstm_2, spatial_dim=single_step_dim) + lm_state.lstm_2 = lstm_2_state + lstm_3, lstm_3_state = self.lstm_3(lstm_2, state=prev_state.lstm_3, spatial_dim=single_step_dim) + lm_state.lstm_3 = lstm_3_state + output = self.output(lstm_3) + return {"output": output}, lm_state + + def lm_default_initial_state(self, *, batch_dims: Sequence[Dim] + ) -> rf.State: + """Default initial state""" + state = rf.State( + lstm_0=self.lstm_0.default_initial_state(batch_dims=batch_dims), + lstm_1=self.lstm_1.default_initial_state(batch_dims=batch_dims), + lstm_2=self.lstm_2.default_initial_state(batch_dims=batch_dims), + lstm_3=self.lstm_3.default_initial_state(batch_dims=batch_dims), + ) + return state + + def select_state(self, state: rf.State, backrefs) -> rf.State: + state = tree.map_structure( + lambda s: rf.gather(s, indices=backrefs), state + ) + return state + +class MakeModel: + """for import""" + + def __init__( + self, + in_dim: int, + target_dim: int, + *, + eos_label: int = 0, + # num_enc_layers: int = 12, + ): + self.in_dim = in_dim + self.target_dim = target_dim + + self.eos_label = eos_label + + + def __call__(self) -> LSTM_LM_Model: + from returnn.datasets.util.vocabulary import Vocabulary + + in_dim = Dim(name="in", dimension=self.in_dim, kind=Dim.Types.Feature) + target_dim = Dim( + name="target", dimension=self.target_dim, kind=Dim.Types.Feature + ) + target_dim.vocab = Vocabulary.create_vocab_from_labels( + [str(i) for i in 
range(target_dim.dimension)], eos_label=self.eos_label + ) + + return self.make_model(in_dim, target_dim) + + @classmethod + def make_model( + cls, + in_dim: Dim, + target_dim: Dim, + # *, + # search_args: Optional[Dict[str, Any]], + # num_enc_layers: int = 12, + ) -> LSTM_LM_Model: + """make""" + return LSTM_LM_Model( + in_dim, + # num_enc_layers=num_enc_layers, + target_dim=target_dim, + ) \ No newline at end of file diff --git a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/lm/lstm/model_import.py b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/lm/lstm/model_import.py new file mode 100644 index 000000000..68f78cf12 --- /dev/null +++ b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/lm/lstm/model_import.py @@ -0,0 +1,126 @@ +""" +https://arxiv.org/abs/1905.04226 +Reference: https://github.com/rwth-i6/returnn-experiments/blob/master/2019-lm-transformers/librispeech/bpe_10k/transfo_24_d00.4096_1024.sgd.lr1.8_heads.config +""" + +from __future__ import annotations + +from typing import Dict + +import numpy + +from sisyphus import tk + +from i6_core.returnn.training import Checkpoint +from i6_experiments.users.schmitt.returnn_frontend.convert.checkpoint import ConvertTfCheckpointToRfPtJob + +import returnn.frontend as rf + +from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.network_builder_rf.lm.trafo import model as trafo_lm + +_returnn_tf_ckpt_filename = "/u/zhou/asr-exps/librispeech/dependencies/kazuki_lstmlm_27062019/network.040" +TrafoLmOpts = { + "vocab_dim": 10_025, + "model_dim": 1024, + "embed_dim": 128, + "num_layers": 24, + "decoder_layer_opts": {"self_att_opts": {"with_bias": False, "att_dropout_broadcast": False}}, + "input_embedding_scale": 1.0, + "share_embedding": False, + 
"logits_with_bias": True, + "input_dropout": 0.1, +} + +_ParamMapping = {} # type: Dict[str,str] + + +def get_tf_checkpoint_path() -> tk.Path: + """ + :return: Sisyphus tk.Path to the original TF checkpoint file + + https://arxiv.org/abs/1905.04226 + Reference: https://github.com/rwth-i6/returnn-experiments/blob/master/2019-lm-transformers/librispeech/bpe_10k/transfo_24_d00.4096_1024.sgd.lr1.8_heads.config + """ + return tk.Path( + _returnn_tf_ckpt_filename, hash_overwrite="librispeech-2018-kazuki-transfo_24_d00.4096_1024.sgd.lr1.8_heads" + ) + + +def get_pt_checkpoint_path() -> tk.Path: + """ + :return: Sisyphus tk.Path to the PyTorch checkpoint file + + https://arxiv.org/abs/1905.04226 + Reference: https://github.com/rwth-i6/returnn-experiments/blob/master/2019-lm-transformers/librispeech/bpe_10k/transfo_24_d00.4096_1024.sgd.lr1.8_heads.config + """ + old_tf_ckpt_path = get_tf_checkpoint_path() + old_tf_ckpt = Checkpoint(index_path=old_tf_ckpt_path) + make_model_func = trafo_lm.MakeModel(**TrafoLmOpts) # eos_label=0 + converter = ConvertTfCheckpointToRfPtJob( + checkpoint=old_tf_ckpt, + make_model_func=make_model_func, + map_func=map_param_func_v2, + epoch=1, + step=0, + ) + return converter.out_checkpoint + + +def _add_params(): + _ParamMapping.update( + { + "input_embedding.weight": "output/rec/target_embed_raw/W", + "input_embedding_proj.weight": "output/rec/target_embed_lin/W", + "final_layer_norm.scale": "output/rec/decoder/scale", + "final_layer_norm.bias": "output/rec/decoder/bias", + "logits.weight": "output/rec/output/W", + "logits.bias": "output/rec/output/b", + } + ) + + for layer_idx in range(TrafoLmOpts["num_layers"]): + # FF + _ParamMapping[f"layers.{layer_idx}.ff.linear_ff.weight"] = f"output/rec/dec_{layer_idx}_ff_conv1/W" + _ParamMapping[f"layers.{layer_idx}.ff.linear_ff.bias"] = f"output/rec/dec_{layer_idx}_ff_conv1/b" + _ParamMapping[f"layers.{layer_idx}.ff.linear_out.weight"] = f"output/rec/dec_{layer_idx}_ff_conv2/W" + 
_ParamMapping[f"layers.{layer_idx}.ff.linear_out.bias"] = f"output/rec/dec_{layer_idx}_ff_conv2/b" + _ParamMapping[f"layers.{layer_idx}.ff_layer_norm.scale"] = f"output/rec/dec_{layer_idx}_ff_laynorm/scale" + _ParamMapping[f"layers.{layer_idx}.ff_layer_norm.bias"] = f"output/rec/dec_{layer_idx}_ff_laynorm/bias" + + # self-att + _ParamMapping[f"layers.{layer_idx}.self_att.qkv.weight"] = f"output/rec/dec_{layer_idx}_self_att_att/QKV" + _ParamMapping[f"layers.{layer_idx}.self_att.proj.weight"] = f"output/rec/dec_{layer_idx}_self_att_lin/W" + _ParamMapping[f"layers.{layer_idx}.self_att_layer_norm.scale"] = ( + f"output/rec/dec_{layer_idx}_self_att_laynorm/scale" + ) + _ParamMapping[f"layers.{layer_idx}.self_att_layer_norm.bias"] = ( + f"output/rec/dec_{layer_idx}_self_att_laynorm/bias" + ) + + +_add_params() + + +def map_param_func_v2(reader, name: str, var: rf.Parameter) -> numpy.ndarray: + """map params, TF to RF""" + from tensorflow.python.training.py_checkpoint_reader import CheckpointReader + + assert isinstance(reader, CheckpointReader) + assert isinstance(var, rf.Parameter) + + tf_var_name = name.replace(".", "/") + if reader.has_tensor(tf_var_name): + return reader.get_tensor(tf_var_name) + + if name in _ParamMapping: + var_name = _ParamMapping[name] + assert reader.has_tensor(var_name) + value = reader.get_tensor(var_name) + assert isinstance(value, numpy.ndarray) + assert ( + value.shape == var.batch_shape + ), f"new param {name} {var.batch_shape} vs ckpt param {var_name} {value.shape}" + assert value.dtype.name == var.dtype, f"new param {name} {var.dtype} vs ckpt param {var_name} {value.dtype}" + return value + + raise NotImplementedError(f"cannot map {name!r} {var}") diff --git a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/lm/trafo/__init__.py b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/lm/trafo/__init__.py new file 
mode 100644 index 000000000..e69de29bb diff --git a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/lm/trafo/model.py b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/lm/trafo/model.py new file mode 100644 index 000000000..863e2ddac --- /dev/null +++ b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/lm/trafo/model.py @@ -0,0 +1,263 @@ +""" +Transformer LM + +First: import our existing TF model + +checkpoint: /work/asr3/irie/experiments/lm/librispeech/2018-03-05--lmbpe-zeyer/data-train/transfo_24_d00.4096_1024.sgd.lr1.8_heads/bk-net-model/network.023 +config example: /work/asr4/zeineldeen/setups-data/ubuntu_22_setups/2023-04-17--conformer-att/work/i6_core/returnn/search/ReturnnSearchJobV2.i6YlJ7HAXfGs/output/returnn.config +""" + +from __future__ import annotations +from typing import Union, Any, Tuple, Optional +from returnn.tensor import Tensor, Dim, single_step_dim +import returnn.frontend as rf +from returnn.frontend.decoder.transformer import TransformerDecoder +from i6_experiments.users.schmitt.returnn_frontend.model_interfaces.label_scorer import LabelScorerIntf + + +class MakeModel: + """for import""" + + def __init__(self, vocab_dim: Union[int, Dim], model_dim: Union[int, Dim], *, num_layers: int, **extra): + self.vocab_dim = vocab_dim + self.model_dim = model_dim + self.num_layers = num_layers + self.extra = extra + + def __call__(self) -> TransformerDecoder: + if isinstance(self.vocab_dim, int): + vocab_dim = Dim(self.vocab_dim, name="vocab") + elif isinstance(self.vocab_dim, Dim): + vocab_dim = self.vocab_dim + else: + raise TypeError(f"vocab dim type {type(self.vocab_dim).__name__}") + + if isinstance(self.model_dim, int): + model_dim = Dim(self.model_dim, name="model") + elif isinstance(self.model_dim, Dim): + model_dim = self.model_dim + else: + raise 
TypeError(f"model dim type {type(self.model_dim).__name__}") + + if vocab_dim.vocab is None: + from returnn.datasets.util.vocabulary import Vocabulary + + vocab_dim.vocab = Vocabulary.create_vocab_from_labels( + [str(i) for i in range(vocab_dim.dimension)], + ) + + opts = self.extra.copy() + for k, v in list(opts.items()): + if k.endswith("_dim") and isinstance(v, int): + opts[k] = Dim(v, name=k[: -len("_dim")]) + + return self.make_model(vocab_dim=vocab_dim, model_dim=model_dim, num_layers=self.num_layers, **opts) + + @classmethod + def make_model(cls, vocab_dim: Dim, model_dim: Dim, *, num_layers: int, **extra) -> TransformerDecoder: + """make""" + return TransformerDecoder( + encoder_dim=None, + vocab_dim=vocab_dim, + model_dim=model_dim, + num_layers=num_layers, + **extra, + ) + + +def make_label_sync_label_scorer_torch( + model: TransformerDecoder, +) -> LabelScorerIntf: + """ + Make label scorer + """ + import torch + import tree + import functools + from i6_experiments.users.schmitt.returnn_frontend.model_interfaces.label_scorer import ( + StateObjTensorExt, + StateObjIgnored, + ) + + class LabelScorer(LabelScorerIntf): + """TransformerDecoder label scorer""" + + def get_initial_state(self, *, batch_size: int, device: torch.device) -> Any: + """Initial state.""" + batch_dim = Dim(batch_size, name="batch") + beam_dim = Dim(1, name="initial-beam") + batch_dims_ = [batch_dim, beam_dim] + decoder_state = model.default_initial_state(batch_dims=batch_dims_) + return tree.map_structure( + functools.partial(self._map_tensor_to_raw, batch_dim=batch_dim, beam_dim=beam_dim), decoder_state + ) + + def max_remaining_seq_score( + self, *, state: Any, max_remaining_steps: torch.Tensor, device: torch.device + ) -> torch.Tensor: + """max remaining""" + return torch.zeros((1, 1), device=device) + + def score_and_update_state( + self, + *, + prev_state: Any, + prev_label: torch.Tensor, + prev_align_label: Optional[torch.Tensor] = None, # not used + t: Optional[int] = None, # 
not used + ) -> Tuple[torch.Tensor, Any]: + """update state""" + batch_dim = Dim(prev_label.shape[0], name="batch") + beam_dim = Dim(prev_label.shape[1], name="beam") + + def _map_raw_to_tensor(v): + if isinstance(v, StateObjTensorExt): + tensor: Tensor = v.extra + tensor = tensor.copy_template_new_dim_tags( + (batch_dim, beam_dim) + tensor.dims[2:], keep_special_axes=True + ) + tensor.raw_tensor = v.tensor + return tensor + elif isinstance(v, StateObjIgnored): + return v.content + else: + raise TypeError(f"_map_raw_to_tensor: unexpected {v} ({type(v).__name__})") + + logits, decoder_state = model( + rf.convert_to_tensor(prev_label, dims=[batch_dim, beam_dim], sparse_dim=model.vocab_dim), + spatial_dim=single_step_dim, + state=tree.map_structure(_map_raw_to_tensor, prev_state), + ) + label_log_prob = rf.log_softmax(logits, axis=model.vocab_dim) + assert set(label_log_prob.dims) == {batch_dim, beam_dim, model.vocab_dim} + + return ( + self._map_tensor_to_raw(label_log_prob, batch_dim=batch_dim, beam_dim=beam_dim).tensor, + tree.map_structure( + functools.partial(self._map_tensor_to_raw, batch_dim=batch_dim, beam_dim=beam_dim), decoder_state + ), + ) + + @staticmethod + def _map_tensor_to_raw(v, *, batch_dim: Dim, beam_dim: Dim): + if isinstance(v, Tensor): + if beam_dim not in v.dims: + return StateObjIgnored(v) + batch_dims_ = [batch_dim, beam_dim] + v = v.copy_transpose(batch_dims_ + [dim for dim in v.dims if dim not in batch_dims_]) + raw = v.raw_tensor + return StateObjTensorExt(raw, v.copy_template()) + elif isinstance(v, Dim): + return StateObjIgnored(v) + else: + raise TypeError(f"_map_tensor_to_raw: unexpected {v} ({type(v).__name__})") + + return LabelScorer() + + +def make_time_sync_label_scorer_torch( + model: TransformerDecoder, + align_target_dim: Dim, +) -> LabelScorerIntf: + """ + Make label scorer + """ + import torch + import tree + import functools + from i6_experiments.users.schmitt.returnn_frontend.model_interfaces.label_scorer import ( + 
StateObjTensorExt, + StateObjIgnored, + ) + + class LabelScorer(LabelScorerIntf): + """TransformerDecoder label scorer""" + + def get_initial_state(self, *, batch_size: int, device: torch.device) -> Any: + """Initial state.""" + batch_dim = Dim(batch_size, name="batch") + beam_dim = Dim(1, name="initial-beam") + batch_dims_ = [batch_dim, beam_dim] + decoder_state = model.default_initial_state(batch_dims=batch_dims_) + return tree.map_structure( + functools.partial(self._map_tensor_to_raw, batch_dim=batch_dim, beam_dim=beam_dim), decoder_state + ) + + def max_remaining_seq_score( + self, *, state: Any, max_remaining_steps: torch.Tensor, device: torch.device + ) -> torch.Tensor: + """max remaining""" + return torch.zeros((1, 1), device=device) + + def score_and_update_state( + self, + *, + prev_state: Any, + prev_label: torch.Tensor, + prev_align_label: Optional[torch.Tensor] = None, + t: Optional[int] = None, + ) -> Tuple[torch.Tensor, Any]: + """update state""" + batch_dim = Dim(prev_label.shape[0], name="batch") + beam_dim = Dim(prev_label.shape[1], name="beam") + + def _map_raw_to_tensor(v): + if isinstance(v, StateObjTensorExt): + tensor: Tensor = v.extra + tensor = tensor.copy_template_new_dim_tags( + (batch_dim, beam_dim) + tensor.dims[2:], keep_special_axes=True + ) + tensor.raw_tensor = v.tensor + return tensor + elif isinstance(v, StateObjIgnored): + return v.content + else: + raise TypeError(f"_map_raw_to_tensor: unexpected {v} ({type(v).__name__})") + + initial_output_mask = rf.convert_to_tensor(prev_label == -1, dims=[batch_dim, beam_dim]) + prev_label = rf.convert_to_tensor(prev_label, dims=[batch_dim, beam_dim], sparse_dim=model.vocab_dim) + # replace -1 by 0 (assuming 0 is the BOS symbol) + prev_label = rf.where( + initial_output_mask, + rf.zeros_like(prev_label), + prev_label + ) + logits, decoder_state = model( + prev_label, + spatial_dim=single_step_dim, + state=tree.map_structure(_map_raw_to_tensor, prev_state), + ) + label_log_prob = 
rf.log_softmax(logits, axis=model.vocab_dim) + blank_log_prob = rf.zeros( + [Dim(1, name="blank_log_prob_label_scorer")], + dtype="float32" + ) + output_log_prob, _ = rf.concat( + (label_log_prob, model.vocab_dim), (blank_log_prob, blank_log_prob.dims[0]), + out_dim=align_target_dim, + allow_broadcast=True + ) + assert set(output_log_prob.dims) == {batch_dim, beam_dim, align_target_dim} + + return ( + self._map_tensor_to_raw(output_log_prob, batch_dim=batch_dim, beam_dim=beam_dim).tensor, + tree.map_structure( + functools.partial(self._map_tensor_to_raw, batch_dim=batch_dim, beam_dim=beam_dim), decoder_state + ), + ) + + @staticmethod + def _map_tensor_to_raw(v, *, batch_dim: Dim, beam_dim: Dim): + if isinstance(v, Tensor): + if beam_dim not in v.dims: + return StateObjIgnored(v) + batch_dims_ = [batch_dim, beam_dim] + v = v.copy_transpose(batch_dims_ + [dim for dim in v.dims if dim not in batch_dims_]) + raw = v.raw_tensor + return StateObjTensorExt(raw, v.copy_template()) + elif isinstance(v, Dim): + return StateObjIgnored(v) + else: + raise TypeError(f"_map_tensor_to_raw: unexpected {v} ({type(v).__name__})") + + return LabelScorer() diff --git a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/lm/trafo/model_import.py b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/lm/trafo/model_import.py new file mode 100644 index 000000000..b1371afd1 --- /dev/null +++ b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/lm/trafo/model_import.py @@ -0,0 +1,127 @@ +""" +https://arxiv.org/abs/1905.04226 +Reference: https://github.com/rwth-i6/returnn-experiments/blob/master/2019-lm-transformers/librispeech/bpe_10k/transfo_24_d00.4096_1024.sgd.lr1.8_heads.config +""" + +from __future__ import annotations + +from typing import Dict + +import numpy + +from sisyphus import tk + +from 
i6_core.returnn.training import Checkpoint +from i6_experiments.users.schmitt.returnn_frontend.convert.checkpoint import ConvertTfCheckpointToRfPtJob + +import returnn.frontend as rf + +from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.network_builder_rf.lm.trafo import model as trafo_lm + +_returnn_tf_ckpt_filename = "/work/asr3/irie/experiments/lm/librispeech/2018-03-05--lmbpe-zeyer/data-train/transfo_24_d00.4096_1024.sgd.lr1.8_heads/bk-net-model/network.023.index" +_load_existing_ckpt_in_test = True +TrafoLmOpts = { + "vocab_dim": 10_025, + "model_dim": 1024, + "embed_dim": 128, + "num_layers": 24, + "decoder_layer_opts": {"self_att_opts": {"with_bias": False, "att_dropout_broadcast": False}}, + "input_embedding_scale": 1.0, + "share_embedding": False, + "logits_with_bias": True, + "input_dropout": 0.1, +} + +_ParamMapping = {} # type: Dict[str,str] + + +def get_tf_checkpoint_path() -> tk.Path: + """ + :return: Sisyphus tk.Path to the original TF checkpoint file + + https://arxiv.org/abs/1905.04226 + Reference: https://github.com/rwth-i6/returnn-experiments/blob/master/2019-lm-transformers/librispeech/bpe_10k/transfo_24_d00.4096_1024.sgd.lr1.8_heads.config + """ + return tk.Path( + _returnn_tf_ckpt_filename, hash_overwrite="librispeech-2018-kazuki-transfo_24_d00.4096_1024.sgd.lr1.8_heads" + ) + + +def get_pt_checkpoint_path() -> tk.Path: + """ + :return: Sisyphus tk.Path to the PyTorch checkpoint file + + https://arxiv.org/abs/1905.04226 + Reference: https://github.com/rwth-i6/returnn-experiments/blob/master/2019-lm-transformers/librispeech/bpe_10k/transfo_24_d00.4096_1024.sgd.lr1.8_heads.config + """ + old_tf_ckpt_path = get_tf_checkpoint_path() + old_tf_ckpt = Checkpoint(index_path=old_tf_ckpt_path) + make_model_func = trafo_lm.MakeModel(**TrafoLmOpts) # eos_label=0 + converter = ConvertTfCheckpointToRfPtJob( + checkpoint=old_tf_ckpt, + make_model_func=make_model_func, + 
map_func=map_param_func_v2, + epoch=1, + step=0, + ) + return converter.out_checkpoint + + +def _add_params(): + _ParamMapping.update( + { + "input_embedding.weight": "output/rec/target_embed_raw/W", + "input_embedding_proj.weight": "output/rec/target_embed_lin/W", + "final_layer_norm.scale": "output/rec/decoder/scale", + "final_layer_norm.bias": "output/rec/decoder/bias", + "logits.weight": "output/rec/output/W", + "logits.bias": "output/rec/output/b", + } + ) + + for layer_idx in range(TrafoLmOpts["num_layers"]): + # FF + _ParamMapping[f"layers.{layer_idx}.ff.linear_ff.weight"] = f"output/rec/dec_{layer_idx}_ff_conv1/W" + _ParamMapping[f"layers.{layer_idx}.ff.linear_ff.bias"] = f"output/rec/dec_{layer_idx}_ff_conv1/b" + _ParamMapping[f"layers.{layer_idx}.ff.linear_out.weight"] = f"output/rec/dec_{layer_idx}_ff_conv2/W" + _ParamMapping[f"layers.{layer_idx}.ff.linear_out.bias"] = f"output/rec/dec_{layer_idx}_ff_conv2/b" + _ParamMapping[f"layers.{layer_idx}.ff_layer_norm.scale"] = f"output/rec/dec_{layer_idx}_ff_laynorm/scale" + _ParamMapping[f"layers.{layer_idx}.ff_layer_norm.bias"] = f"output/rec/dec_{layer_idx}_ff_laynorm/bias" + + # self-att + _ParamMapping[f"layers.{layer_idx}.self_att.qkv.weight"] = f"output/rec/dec_{layer_idx}_self_att_att/QKV" + _ParamMapping[f"layers.{layer_idx}.self_att.proj.weight"] = f"output/rec/dec_{layer_idx}_self_att_lin/W" + _ParamMapping[f"layers.{layer_idx}.self_att_layer_norm.scale"] = ( + f"output/rec/dec_{layer_idx}_self_att_laynorm/scale" + ) + _ParamMapping[f"layers.{layer_idx}.self_att_layer_norm.bias"] = ( + f"output/rec/dec_{layer_idx}_self_att_laynorm/bias" + ) + + +_add_params() + + +def map_param_func_v2(reader, name: str, var: rf.Parameter) -> numpy.ndarray: + """map params, TF to RF""" + from tensorflow.python.training.py_checkpoint_reader import CheckpointReader + + assert isinstance(reader, CheckpointReader) + assert isinstance(var, rf.Parameter) + + tf_var_name = name.replace(".", "/") + if 
reader.has_tensor(tf_var_name): + return reader.get_tensor(tf_var_name) + + if name in _ParamMapping: + var_name = _ParamMapping[name] + assert reader.has_tensor(var_name) + value = reader.get_tensor(var_name) + assert isinstance(value, numpy.ndarray) + assert ( + value.shape == var.batch_shape + ), f"new param {name} {var.batch_shape} vs ckpt param {var_name} {value.shape}" + assert value.dtype.name == var.dtype, f"new param {name} {var.dtype} vs ckpt param {var_name} {value.dtype}" + return value + + raise NotImplementedError(f"cannot map {name!r} {var}") diff --git a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/recog.py b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/recog.py index 902ee0481..c2ff922b3 100644 --- a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/recog.py +++ b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/recog.py @@ -1,4 +1,4 @@ -from typing import Optional +from typing import Optional, Dict from returnn.tensor import TensorDict @@ -25,9 +25,8 @@ def _returnn_v2_forward_step(*, model, extern_data: TensorDict, **_kwargs_unused targets = extern_data[default_target_key] extra.update(dict(targets=targets, targets_spatial_dim=targets.get_time_dim_tag())) - use_recombination = config.typed_value("use_recombination", None) - if use_recombination: - extra.update(dict(use_recombination=use_recombination)) + beam_search_opts = config.typed_value("beam_search_opts", {}) # type: Dict + extra.update(beam_search_opts) recog_out = recog_def(model=model, data=data, data_spatial_dim=data_spatial_dim, **extra) if len(recog_out) == 5: diff --git a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/model.py 
b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/model.py index 7e441184c..f87653f90 100644 --- a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/model.py +++ b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/model.py @@ -68,9 +68,9 @@ def __init__( ) assert blank_decoder_version in {1, 3, 4, 5, 6} - assert label_decoder_state in {"nb-lstm", "joint-lstm"} + assert label_decoder_state in {"nb-lstm", "joint-lstm", "nb-linear1"} if not use_joint_model: - assert label_decoder_state == "nb-lstm" + assert label_decoder_state in ("nb-lstm", "nb-linear1") if not use_weight_feedback and not use_att_ctx_in_state: label_decoder_cls = SegmentalAttEfficientLabelDecoder @@ -88,6 +88,7 @@ def __init__( center_window_size=center_window_size, use_weight_feedback=use_weight_feedback, use_att_ctx_in_state=use_att_ctx_in_state, + decoder_state=label_decoder_state, ) if not use_joint_model: @@ -280,6 +281,10 @@ def from_scratch_model_def( def _returnn_v2_get_model(*, epoch: int, **_kwargs_unused): + """ + Here, we use a separate blank model and define the blank_index=len(target_vocab). In this case, the target_dim + is one smaller than the align_target_dim and the EOS label is unused. + """ from returnn.tensor import Tensor, Dim from returnn.config import get_global_config @@ -302,9 +307,14 @@ def _returnn_v2_get_model(*, epoch: int, **_kwargs_unused): return model -def _returnn_v2_get_model_for_full_sum_training(*, epoch: int, **_kwargs_unused): - from returnn.tensor import Tensor, Dim +def _returnn_v2_get_joint_model(*, epoch: int, **_kwargs_unused): + """ + Here, we reinterpret the EOS label as a blank label and use a single softmax for both blank and non-blank labels. + Therefore, we assume align_target_dim and target_dim to be the same. 
+ """ + from returnn.tensor import Tensor from returnn.config import get_global_config + from returnn.datasets.util.vocabulary import BytePairEncoding config = get_global_config() default_input_key = config.typed_value("default_input") @@ -313,6 +323,10 @@ def _returnn_v2_get_model_for_full_sum_training(*, epoch: int, **_kwargs_unused) data = Tensor(name=default_input_key, **extern_data_dict[default_input_key]) targets = Tensor(name=default_target_key, **extern_data_dict[default_target_key]) + non_blank_vocab = config.typed_value("non_blank_vocab") + if non_blank_vocab is not None: + targets.sparse_dim.vocab = BytePairEncoding(**non_blank_vocab) + model_def = config.typed_value("_model_def") model = model_def( epoch=epoch, in_dim=data.feature_dim, align_target_dim=targets.sparse_dim, target_dim=targets.sparse_dim) diff --git a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/model_new/blank_model/model.py b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/model_new/blank_model/model.py index 3954b544f..af571e7bf 100644 --- a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/model_new/blank_model/model.py +++ b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/model_new/blank_model/model.py @@ -229,6 +229,11 @@ def __init__( self.emit_prob = rf.Linear(encoder_out_dim + label_state_dim, self.emit_prob_dim) + def default_initial_state(self, *, batch_dims: Sequence[Dim]) -> rf.State: + """Default initial state""" + state = rf.State() + return state + def get_label_decoder_deps(self) -> Optional[List[str]]: return ["s"] @@ -249,6 +254,13 @@ def __init__( ) self.emit_prob = rf.Linear(self.length_model_state_dim + label_state_dim, self.emit_prob_dim) + def 
default_initial_state(self, *, batch_dims: Sequence[Dim]) -> rf.State: + """Default initial state""" + state = rf.State( + s_blank=self._s.default_initial_state(batch_dims=batch_dims), + ) + return state + @property def _s(self) -> rf.LSTM: return self.s diff --git a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/model_new/blank_model/train.py b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/model_new/blank_model/train.py index 7c03707fb..40c4c019b 100644 --- a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/model_new/blank_model/train.py +++ b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/model_new/blank_model/train.py @@ -145,11 +145,7 @@ def viterbi_training_v4( # (UnicodeDecodeError: 'ascii' codec can't decode byte 0xe2 in position 0: ordinal not in range(128)) # therefore, we use the following workaround enc = enc_args["enc"] # type: rf.Tensor - enc_raw = enc.raw_tensor - enc = enc.copy_template_replace_dim_tag( - enc.get_axis_from_description(enc_spatial_dim), non_blank_mask_dim - ) - enc.raw_tensor = enc_raw + enc = utils.copy_tensor_replace_dim_tag(enc, enc_spatial_dim, non_blank_mask_dim) am, _ = utils.get_masked( input=enc, @@ -213,11 +209,7 @@ def viterbi_training_v5( ): # using dim.declare_same_as() leads to an error after an epoch is finished (see viterbi_training_v4) enc = enc_args["enc"] # type: rf.Tensor - enc_raw = enc.raw_tensor - enc = enc.copy_template_replace_dim_tag( - enc.get_axis_from_description(enc_spatial_dim), label_states_unmasked_spatial_dim - ) - enc.raw_tensor = enc_raw + enc = utils.copy_tensor_replace_dim_tag(enc, enc_spatial_dim, label_states_unmasked_spatial_dim) blank_logits = model.emit_prob(rf.concat_features(enc, 
label_states_unmasked)) blank_logits_packed, pack_dim, emit_ground_truth_packed = get_packed_logits_and_emit_ground_truth( @@ -248,11 +240,7 @@ def viterbi_training_v6( ): # using dim.declare_same_as() leads to an error after an epoch is finished (see viterbi_training_v4) enc = enc_args["enc"] # type: rf.Tensor - enc_raw = enc.raw_tensor - enc = enc.copy_template_replace_dim_tag( - enc.get_axis_from_description(enc_spatial_dim), label_states_unmasked_spatial_dim - ) - enc.raw_tensor = enc_raw + enc = utils.copy_tensor_replace_dim_tag(enc, enc_spatial_dim, label_states_unmasked_spatial_dim) s, _ = model.s( enc, diff --git a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/model_new/label_model/model.py b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/model_new/label_model/model.py index be2c24c36..9de6a3894 100644 --- a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/model_new/label_model/model.py +++ b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/model_new/label_model/model.py @@ -23,13 +23,15 @@ def default_initial_state( ) -> rf.State: """Default initial state""" state = rf.State( - s=self.get_lstm().default_initial_state(batch_dims=batch_dims), att=rf.zeros(list(batch_dims) + [self.att_num_heads * self.enc_out_dim]), segment_starts=rf.zeros(batch_dims, sparse_dim=segment_starts_sparse_dim, dtype="int32"), segment_lens=rf.zeros(batch_dims, sparse_dim=segment_lens_sparse_dim, dtype="int32"), ) state.att.feature_dim_axis = len(state.att.dims) - 1 + if "lstm" in self.decoder_state: + state.s = self.get_lstm().default_initial_state(batch_dims=batch_dims) + if self.use_weight_feedback: state.accum_att_weights = rf.zeros( list(batch_dims) + 
[self.accum_att_weights_dim, self.att_num_heads], feature_dim=self.att_num_heads @@ -144,11 +146,14 @@ def loop_step( # during search, these need to be the values from the previous "emit" step (not necessarily the previous time step) prev_att = state.att - prev_s_state = state.s + prev_s_state = state.s if "lstm" in self.decoder_state else None prev_segment_starts = state.segment_starts prev_segment_lens = state.segment_lens - s, state_.s = self._update_state(input_embed, prev_att, prev_s_state) + s, s_state = self._update_state(input_embed, prev_att, prev_s_state) + if "lstm" in self.decoder_state: + state_.s = s_state + s_transformed = self.s_transformed(s) slice_dim = Dim(name="slice", dimension=segment_lens) @@ -177,9 +182,7 @@ def loop_step( energy = self.energy(rf.tanh(energy_in)) att_weights = rf.softmax(energy, axis=slice_dim) # we do not need use_mask because the softmax output is already padded with zeros - att0 = rf.dot(att_weights, enc_sliced, reduce=slice_dim, use_mask=False) - att0.feature_dim = self.enc_out_dim - att, _ = rf.merge_dims(att0, dims=(self.att_num_heads, self.enc_out_dim)) + att = self.get_att(att_weights, enc_sliced, reduce_dim=slice_dim) state_.att = att if self.use_weight_feedback: diff --git a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/model_new/label_model/train.py b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/model_new/label_model/train.py index 52fb27d5b..42c66a89b 100644 --- a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/model_new/label_model/train.py +++ b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/model_new/label_model/train.py @@ -23,8 +23,12 @@ def _calc_ce_loss_and_fer( batch_dims: List[Dim], 
targets_spatial_dim: Dim, target_dim: Dim, + beam_dim: Optional[Dim] = None, ): + if beam_dim is not None: + batch_dims = logits.remaining_dims([beam_dim, targets_spatial_dim, target_dim]) logits_packed, pack_dim = rf.pack_padded(logits, dims=batch_dims + [targets_spatial_dim], enforce_sorted=False) + non_blank_targets_packed, _ = rf.pack_padded( targets, dims=batch_dims + [targets_spatial_dim], enforce_sorted=False, out_dim=pack_dim ) @@ -133,20 +137,25 @@ def viterbi_training_efficient( non_blank_mask: Optional[rf.Tensor] = None, non_blank_mask_spatial_dim: Optional[Dim] = None, return_label_model_states: bool = False, + beam_dim: Optional[Dim] = None, ) -> Optional[Tuple[rf.Tensor, Dim]]: input_embeddings = model.target_embed(targets) input_embeddings_shifted = rf.shift_right( input_embeddings, axis=targets_spatial_dim, pad_value=0.0) - label_lstm_out, final_state = model.s_wo_att( - input_embeddings_shifted, - state=model.s_wo_att.default_initial_state(batch_dims=batch_dims), - spatial_dim=targets_spatial_dim, - ) + if "lstm" in model.decoder_state: + s_out, s_final_state = model.s_wo_att( + input_embeddings_shifted, + state=model.s_wo_att.default_initial_state(batch_dims=batch_dims), + spatial_dim=targets_spatial_dim, + ) + else: + s_out = model.s_wo_att_linear(input_embeddings_shifted) + s_final_state = None if non_blank_mask is not None: - label_lstm_out = utils.get_unmasked( - input=label_lstm_out, + s_out = utils.get_unmasked( + input=s_out, input_spatial_dim=targets_spatial_dim, mask=non_blank_mask, mask_spatial_dim=non_blank_mask_spatial_dim, @@ -161,14 +170,14 @@ def viterbi_training_efficient( # need to move size tensor to GPU since otherwise there is an error in some merge_dims call inside rf.gather # because two tensors have different devices # TODO: fix properly in the gather implementation - targets_spatial_dim.dyn_size_ext = rf.copy_to_device(targets_spatial_dim.dyn_size_ext, label_lstm_out.device) + targets_spatial_dim.dyn_size_ext = 
rf.copy_to_device(targets_spatial_dim.dyn_size_ext, s_out.device) if non_blank_mask_spatial_dim is not None: - non_blank_mask_spatial_dim.dyn_size_ext = rf.copy_to_device(non_blank_mask_spatial_dim.dyn_size_ext, label_lstm_out.device) + non_blank_mask_spatial_dim.dyn_size_ext = rf.copy_to_device(non_blank_mask_spatial_dim.dyn_size_ext, s_out.device) att = model( enc=enc_args["enc"], enc_ctx=enc_args["enc_ctx"], enc_spatial_dim=enc_spatial_dim, - s=label_lstm_out, + s=s_out, segment_starts=segment_starts, segment_lens=segment_lens, ) @@ -179,10 +188,10 @@ def viterbi_training_efficient( logits = model.decode_logits( input_embed=input_embeddings_shifted, att=att, - s=label_lstm_out, + s=s_out, ) - _calc_ce_loss_and_fer(logits, ce_targets, batch_dims, ce_spatial_dim, model.target_dim) + _calc_ce_loss_and_fer(logits, ce_targets, batch_dims, ce_spatial_dim, model.target_dim, beam_dim=beam_dim) if return_label_model_states: # need to run the lstm one more time to get the last output (which is not needed for the loss computation) @@ -194,14 +203,18 @@ def viterbi_training_efficient( clip_to_valid=True, ) singleton_dim = Dim(name="singleton", dimension=1) - last_lstm_out, _ = model.s_wo_att( - last_embedding, - state=final_state, - spatial_dim=singleton_dim, - ) + if "lstm" in model.decoder_state: + last_s_out, _ = model.s_wo_att( + last_embedding, + state=s_final_state, + spatial_dim=single_step_dim, + ) + else: + last_s_out = model.s_wo_att_linear(last_embedding) + return rf.concat( - (label_lstm_out, targets_spatial_dim), - (rf.expand_dim(last_lstm_out, singleton_dim), singleton_dim), + (s_out, targets_spatial_dim), + (rf.expand_dim(last_s_out, singleton_dim), singleton_dim), ) return None @@ -270,746 +283,3 @@ def full_sum_training( loss.mark_as_loss("full_sum_loss", scale=1.0, use_normalized_loss=True) return None - - -def full_sum_training_w_beam( - *, - model: SegmentalAttEfficientLabelDecoder, - enc_args: Dict, - enc_spatial_dim: Dim, - non_blank_targets: 
rf.Tensor, # [B, S, V] - non_blank_targets_spatial_dim: Dim, - # segment_starts: rf.Tensor, # [B, T] - # segment_lens: rf.Tensor, # [B, T] - batch_dims: List[Dim], - beam_size: int, -) -> Optional[Dict[str, Tuple[rf.Tensor, Dim]]]: - assert len(batch_dims) == 1, "not supported yet" - - non_blank_input_embeddings = model.target_embed(non_blank_targets) # [B, S, D] - non_blank_input_embeddings, non_blank_targets_padded_spatial_dim = rf.pad( - non_blank_input_embeddings, - axes=[non_blank_targets_spatial_dim], - padding=[(1, 0)], - value=0.0, - ) - non_blank_targets_padded_spatial_dim = non_blank_targets_padded_spatial_dim[0] - - # add blank idx on the right - # this way, when the label index for gathering reached the last non-blank index, it will gather blank after that - # which then only allows corresponding hypotheses to be extended by blank - non_blank_targets_padded, _ = rf.pad( - non_blank_targets, - axes=[non_blank_targets_spatial_dim], - padding=[(0, 1)], - value=model.blank_idx, - out_dims=[non_blank_targets_padded_spatial_dim] - ) - - non_blank_targets_padded_sizes = rf.copy_to_device( - non_blank_targets_padded_spatial_dim.dyn_size_ext, non_blank_targets.device - ) - non_blank_targets_spatial_sizes = rf.copy_to_device( - non_blank_targets_spatial_dim.dyn_size_ext, non_blank_targets.device) - enc_spatial_sizes = rf.copy_to_device(enc_spatial_dim.dyn_size_ext, non_blank_targets.device) - - linear_label_positions = enc_spatial_sizes / non_blank_targets_spatial_sizes - linear_label_positions = linear_label_positions * rf.range_over_dim(non_blank_targets_spatial_dim) - # print("linear_label_positions", linear_label_positions.raw_tensor) - # exit() - - # print("non_blank_targets_padded", non_blank_targets_padded.raw_tensor) - - beam_dim = Dim(1, name="initial-beam") - batch_dims_ = [beam_dim] + batch_dims - bos_idx = 0 - seq_log_prob = rf.constant(0.0, dims=batch_dims_) - - max_seq_len = enc_spatial_dim.get_size_tensor() - max_seq_len = 
rf.reduce_max(max_seq_len, axis=max_seq_len.dims) - - label_lstm_state = model.s_wo_att.default_initial_state(batch_dims=batch_dims_) - target = rf.constant(bos_idx, dims=batch_dims_, sparse_dim=model.target_dim) - # target_non_blank = target.copy() - update_state_mask = rf.convert_to_tensor(target != model.blank_idx) - label_indices = rf.zeros(batch_dims_, dtype="int64") - - # input_embed = rf.zeros( - # batch_dims_ + [model.target_embed.out_dim], - # feature_dim=model.target_embed.out_dim, - # dtype="float32" - # ) - - vocab_range = rf.range_over_dim(model.target_dim) - blank_tensor = rf.convert_to_tensor(model.blank_idx, dtype=vocab_range.dtype) - log_lambda = rf.log(rf.convert_to_tensor(0.004)) * rf.ones([model.target_dim], dtype="float32") - # lambda_ = rf.shift_right(lambda_, axis=model.target_dim, pad_value=0.0) - log_lambda = rf.where( - vocab_range == model.blank_idx, - rf.constant(0.0, dims=[model.target_dim], dtype="float32"), - log_lambda - ) - - old_beam_dim = beam_dim.copy() - backrefs = rf.zeros(batch_dims_, dtype="int32") - - i = 0 - seq_targets = [] - seq_backrefs = [] - while i < max_seq_len.raw_tensor: - if i > 0: - # target_non_blank = rf.where(update_state_mask, target, rf.gather(target_non_blank, indices=backrefs)) - # input_embed = rf.where( - # update_state_mask, - # model.target_embed(target_non_blank), - # rf.gather(input_embed, indices=backrefs) - # ) - prev_label_indices = rf.gather(label_indices, indices=backrefs) - label_indices = rf.where( - update_state_mask, - rf.where( - prev_label_indices == non_blank_targets_padded_sizes - 1, - prev_label_indices, - prev_label_indices + 1 - ), - prev_label_indices - ) - - ground_truth = rf.gather( - non_blank_targets_padded, - indices=label_indices, - axis=non_blank_targets_padded_spatial_dim, - clip_to_valid=True - ) - input_embed = rf.gather( - non_blank_input_embeddings, - indices=label_indices, - axis=non_blank_targets_padded_spatial_dim, - clip_to_valid=True - ) - - label_lstm_out, 
label_lstm_state_updated = model.s_wo_att( - input_embed, - state=label_lstm_state, - spatial_dim=single_step_dim, - ) - - center_position = rf.minimum( - rf.full(dims=[beam_dim] + batch_dims, fill_value=i, dtype="int32"), - rf.copy_to_device(enc_spatial_dim.get_size_tensor() - 1, label_lstm_out.device) - ) - segment_starts = rf.maximum( - rf.convert_to_tensor(0, dtype="int32"), center_position - model.center_window_size // 2) - segment_ends = rf.minimum( - rf.copy_to_device(enc_spatial_dim.get_size_tensor() - 1, label_lstm_out.device), - center_position + model.center_window_size // 2 - ) - segment_lens = segment_ends - segment_starts + 1 - - att = model( - enc=enc_args["enc"], - enc_ctx=enc_args["enc_ctx"], - enc_spatial_dim=enc_spatial_dim, - s=label_lstm_out, - segment_starts=segment_starts, - segment_lens=segment_lens, - ) # [B, S+1, T, D] - # print("att", att) - - logits = model.decode_logits( - input_embed=input_embed, - att=att, - s=label_lstm_out, - ) # [B, S+1, T, D] - # print("logits", logits) - - label_log_prob = rf.log_softmax(logits, axis=model.target_dim) - - def custom_backward(grad): - grad[:, :, 0] *= 0.001 - return grad - - if rf.get_run_ctx().train_flag: - label_log_prob.raw_tensor.register_hook(custom_backward) - - # log prob needs to correspond to the next non-blank label... - log_prob_mask = vocab_range == ground_truth - rem_frames = enc_spatial_sizes - i - rem_labels = non_blank_targets_spatial_sizes - label_indices - # ... 
or to blank if there are more frames than labels left - log_prob_mask = rf.logical_or( - log_prob_mask, - rf.logical_and( - vocab_range == blank_tensor, - rem_frames > rem_labels - ) - ) - label_log_prob = rf.where( - log_prob_mask, - label_log_prob, - rf.constant(-float("inf"), dims=batch_dims + [beam_dim, model.target_dim]) - ) - - seq_log_prob = seq_log_prob + label_log_prob # Batch, InBeam, Vocab - old_beam_dim = beam_dim.copy() - seq_log_prob, (backrefs, target), beam_dim = rf.top_k( - seq_log_prob, - k_dim=Dim(beam_size, name=f"dec-step{i}-beam"), - axis=[beam_dim, model.target_dim] - ) # seq_log_prob, backrefs, target: Batch, Beam - seq_targets.append(target) - seq_backrefs.append(backrefs) - - update_state_mask = rf.logical_and( - rf.convert_to_tensor(target != model.blank_idx), - seq_log_prob != rf.convert_to_tensor(-float("inf"), dtype="float32") - ) - - def _get_masked_state(old, new, mask): - old = rf.gather(old, indices=backrefs, axis=old_beam_dim) - new = rf.gather(new, indices=backrefs, axis=old_beam_dim) - return rf.where(mask, new, old) - - label_lstm_state = tree.map_structure( - lambda old_state, new_state: _get_masked_state(old_state, new_state, update_state_mask), - label_lstm_state, label_lstm_state_updated - ) - - i += 1 - - # # Backtrack via backrefs, resolve beams. 
- # seq_targets_ = [] - # indices = rf.range_over_dim(beam_dim) # FinalBeam -> FinalBeam - # for backrefs, target in zip(seq_backrefs[::-1], seq_targets[::-1]): - # # indices: FinalBeam -> Beam - # # backrefs: Beam -> PrevBeam - # seq_targets_.insert(0, rf.gather(target, indices=indices)) - # indices = rf.gather(backrefs, indices=indices) # FinalBeam -> PrevBeam - # - # seq_targets__ = TensorArray(seq_targets_[0]) - # for target in seq_targets_: - # seq_targets__ = seq_targets__.push_back(target) - # seq_targets = seq_targets__.stack(axis=enc_spatial_dim) - - # torch.set_printoptions(threshold=10_000) - # print("seq_targets", seq_targets.copy_transpose(batch_dims + [beam_dim, enc_spatial_dim]).raw_tensor[0, 0]) - # print("seq_log_prob", seq_log_prob.raw_tensor[0]) - - # calculate full-sum loss using the log-sum-exp trick - max_log_prob = rf.reduce_max(seq_log_prob, axis=beam_dim) - loss = -1 * (max_log_prob + rf.log(rf.reduce_sum(rf.exp(seq_log_prob - max_log_prob), axis=beam_dim))) - - # loss = -rf.log(rf.reduce_sum(rf.exp(seq_log_prob), axis=beam_dim)) - loss.mark_as_loss("full_sum_loss", scale=1.0, use_normalized_loss=True) - - return None - - -def full_sum_training_w_beam_eff( - *, - model: SegmentalAttEfficientLabelDecoder, - enc_args: Dict, - enc_spatial_dim: Dim, - non_blank_targets: rf.Tensor, # [B, S, V] - non_blank_targets_spatial_dim: Dim, - segment_starts: rf.Tensor, # [B, T] - segment_lens: rf.Tensor, # [B, T] - batch_dims: List[Dim], - beam_size: int, -) -> Optional[Dict[str, Tuple[rf.Tensor, Dim]]]: - assert len(batch_dims) == 1, "not supported yet" - assert model.blank_idx == 0, "blank idx needs to be zero because of the way the gradient is scaled" - - # ------------------------ init some variables ------------------------ - beam_dim = Dim(1, name="initial-beam") - batch_dims_ = [beam_dim] + batch_dims - bos_idx = 0 - seq_log_prob = rf.constant(0.0, dims=batch_dims_) - max_seq_len = enc_spatial_dim.get_size_tensor() - max_seq_len = 
rf.reduce_max(max_seq_len, axis=max_seq_len.dims) - label_lstm_state = model.s_wo_att.default_initial_state(batch_dims=batch_dims) - target = rf.constant(bos_idx, dims=batch_dims_, sparse_dim=model.target_dim) - update_state_mask = rf.convert_to_tensor(target != model.blank_idx) - label_indices = rf.zeros(batch_dims_, dtype="int32") - vocab_range = rf.range_over_dim(model.target_dim) - blank_tensor = rf.convert_to_tensor(model.blank_idx, dtype=vocab_range.dtype) - backrefs = rf.zeros(batch_dims_, dtype="int32") - seq_hash = rf.constant(0, dims=batch_dims_, dtype="int64") - - # ------------------------ targets/embeddings ------------------------ - - non_blank_input_embeddings = model.target_embed(non_blank_targets) # [B, S, D] - non_blank_input_embeddings, non_blank_targets_padded_spatial_dim = rf.pad( - non_blank_input_embeddings, - axes=[non_blank_targets_spatial_dim], - padding=[(1, 0)], - value=0.0, - ) # [B, S+1, D] - non_blank_targets_padded_spatial_dim = non_blank_targets_padded_spatial_dim[0] - - # add blank idx on the right - # this way, when the label index for gathering reached the last non-blank index, it will gather blank after that - # which then only allows corresponding hypotheses to be extended by blank - non_blank_targets_padded, _ = rf.pad( - non_blank_targets, - axes=[non_blank_targets_spatial_dim], - padding=[(0, 1)], - value=model.blank_idx, - out_dims=[non_blank_targets_padded_spatial_dim] - ) - - # ------------------------ sizes ------------------------ - - non_blank_targets_padded_spatial_sizes = rf.copy_to_device( - non_blank_targets_padded_spatial_dim.dyn_size_ext, non_blank_targets.device - ) - non_blank_targets_spatial_sizes = rf.copy_to_device( - non_blank_targets_spatial_dim.dyn_size_ext, non_blank_targets.device) - max_num_labels = rf.reduce_max( - non_blank_targets_spatial_sizes, axis=non_blank_targets_spatial_sizes.dims - ).raw_tensor.item() - enc_spatial_sizes = rf.copy_to_device(enc_spatial_dim.dyn_size_ext, 
non_blank_targets.device) - - # ------------------------ compute LSTM sequence ------------------------ - - label_lstm_out_seq, _ = model.s_wo_att( - non_blank_input_embeddings, - state=label_lstm_state, - spatial_dim=non_blank_targets_padded_spatial_dim, - ) - - # ------------------------ chunk dim ------------------------ - - chunk_size = 20 - chunk_dim = Dim(chunk_size, name="chunk") - chunk_range = rf.expand_dim(rf.range_over_dim(chunk_dim), batch_dims[0]) - - i = 0 - seq_targets = [] - seq_backrefs = [] - while i < max_seq_len.raw_tensor: - # get current number of labels for each hypothesis - if i > 0: - prev_label_indices = rf.gather(label_indices, indices=backrefs) - label_indices = rf.where( - update_state_mask, - rf.where( - prev_label_indices == non_blank_targets_padded_spatial_sizes - 1, - prev_label_indices, - prev_label_indices + 1 - ), - prev_label_indices - ) - - # gather ground truth, input embeddings and LSTM output for current label index - ground_truth = rf.gather( - non_blank_targets_padded, - indices=label_indices, - axis=non_blank_targets_padded_spatial_dim, - clip_to_valid=True - ) - input_embed = rf.gather( - non_blank_input_embeddings, - indices=label_indices, - axis=non_blank_targets_padded_spatial_dim, - clip_to_valid=True - ) - label_lstm_out = rf.gather( - label_lstm_out_seq, - indices=label_indices, - axis=non_blank_targets_padded_spatial_dim, - clip_to_valid=True - ) - - # precompute attention for the current chunk (more efficient than computing it individually for each label index) - if i % chunk_size == 0: - seg_starts = rf.gather( - segment_starts, - indices=chunk_range, - axis=enc_spatial_dim, - clip_to_valid=True - ) - seg_lens = rf.gather( - segment_lens, - indices=chunk_range, - axis=enc_spatial_dim, - clip_to_valid=True - ) - att = model( - enc=enc_args["enc"], - enc_ctx=enc_args["enc_ctx"], - enc_spatial_dim=enc_spatial_dim, - s=label_lstm_out_seq, - segment_starts=seg_starts, - segment_lens=seg_lens, - ) # [B, S+1, T, D] - 
chunk_range += chunk_size - - # gather attention for the current label index - att_step = rf.gather( - att, - indices=label_indices, - axis=non_blank_targets_padded_spatial_dim, - clip_to_valid=True - ) - att_step = rf.gather( - att_step, - indices=rf.constant(i % chunk_size, dims=batch_dims, device=att_step.device), - axis=chunk_dim, - clip_to_valid=True - ) - - logits = model.decode_logits( - input_embed=input_embed, - att=att_step, - s=label_lstm_out, - ) # [B, S+1, T, D] - - label_log_prob = rf.log_softmax(logits, axis=model.target_dim) - - # alpha = 0. - # label_log_prob = label_log_prob + rf.stop_gradient(label_log_prob * (alpha - 1)) - - # # scale down blank gradient to avoid outputting only blanks in the beginning - # # and then all other labels in the end - # def custom_backward(grad): - # grad[:, :, 0] *= 0.00005 - # return grad - # - # if rf.get_run_ctx().train_flag: - # label_log_prob.raw_tensor.register_hook(custom_backward) - - # mask label log prob in order to only allow hypotheses corresponding to the ground truth: - # log prob needs to correspond to the next non-blank label... - log_prob_mask = vocab_range == ground_truth - rem_frames = enc_spatial_sizes - i - rem_labels = non_blank_targets_spatial_sizes - label_indices - # ... 
or to blank if there are more frames than labels left - log_prob_mask = rf.logical_or( - log_prob_mask, - rf.logical_and( - vocab_range == blank_tensor, - rem_frames > rem_labels - ) - ) - label_log_prob = rf.where( - log_prob_mask, - label_log_prob, - rf.constant(-1.0e30, dims=batch_dims + [beam_dim, model.target_dim]) - ) - - # recombine hypotheses corresponding to the same node in the lattice (= same hash value -> same label history) - # do this by setting the log prob of all but the best hypothesis to -inf - # and setting the log prob of the best hypothesis to either the max or the sum of the equivalent hypotheses - seq_log_prob = recombination.recombine_seqs(seq_targets, seq_log_prob, seq_hash, beam_dim, batch_dims[0]) - - # set the beam size as low as possible according to the following rules (using recombination): - # 1) in frame i, there are i+1 nodes in the lattice and from each node, we can spawn 2 hypotheses - # 2) if T-i frames remain, only (T-i)*2 hypotheses can survive in order to reach the last node - # 3) in a frame, there are at most S+1 nodes, i.e. 
(S+1)*2 hypotheses can be spawned (see 1)) - # 4) the beam size should not exceed the given beam size - beam_size_ = min( - min((i + 1) * 2, rf.reduce_max(rem_frames, axis=rem_frames.dims).raw_tensor.item() * 2), - min((max_num_labels + 1) * 2 - 1, beam_size) - ) - - # update sequence log prob and beam indices - seq_log_prob = seq_log_prob + label_log_prob # Batch, InBeam, Vocab - seq_log_prob, (backrefs, target), beam_dim = rf.top_k( - seq_log_prob, - k_dim=Dim(beam_size_, name=f"dec-step{i}-beam"), - axis=[beam_dim, model.target_dim] - ) # seq_log_prob, backrefs, target: Batch, Beam - seq_targets.append(target) - seq_backrefs.append(backrefs) - - seq_hash = recombination.update_seq_hash(seq_hash, target, backrefs, model.blank_idx) - - # mask blank label - update_state_mask = rf.convert_to_tensor(target != model.blank_idx) - - i += 1 - - # last recombination - seq_log_prob = recombination.recombine_seqs(seq_targets, seq_log_prob, seq_hash, beam_dim, batch_dims[0]) - - # # Backtrack via backrefs, resolve beams. 
- # seq_targets_ = [] - # indices = rf.range_over_dim(beam_dim) # FinalBeam -> FinalBeam - # for backrefs, target in zip(seq_backrefs[::-1], seq_targets[::-1]): - # # indices: FinalBeam -> Beam - # # backrefs: Beam -> PrevBeam - # seq_targets_.insert(0, rf.gather(target, indices=indices)) - # indices = rf.gather(backrefs, indices=indices) # FinalBeam -> PrevBeam - # - # seq_targets__ = TensorArray(seq_targets_[0]) - # for target in seq_targets_: - # seq_targets__ = seq_targets__.push_back(target) - # seq_targets = seq_targets__.stack(axis=enc_spatial_dim) - # - # torch.set_printoptions(threshold=10_000) - # print("seq_targets", seq_targets.copy_transpose(batch_dims + [beam_dim, enc_spatial_dim]).raw_tensor[0, :]) - # print("seq_log_prob", seq_log_prob.raw_tensor[0]) - - # calculate full-sum loss using the log-sum-exp trick - max_log_prob = rf.reduce_max(seq_log_prob, axis=beam_dim) - loss = -1 * (max_log_prob + rf.log(rf.reduce_sum(rf.exp(seq_log_prob - max_log_prob), axis=beam_dim))) - - # loss = -rf.log(rf.reduce_sum(rf.exp(seq_log_prob), axis=beam_dim)) - loss.mark_as_loss("full_sum_loss", scale=1.0, use_normalized_loss=True) - - return None - - -def full_sum_training_w_beam_eff_w_recomb( - *, - model: SegmentalAttEfficientLabelDecoder, - enc_args: Dict, - enc_spatial_dim: Dim, - non_blank_targets: rf.Tensor, # [B, S, V] - non_blank_targets_spatial_dim: Dim, - segment_starts: rf.Tensor, # [B, T] - segment_lens: rf.Tensor, # [B, T] - batch_dims: List[Dim], - beam_size: int, -) -> Optional[Dict[str, Tuple[rf.Tensor, Dim]]]: - assert len(batch_dims) == 1, "not supported yet" - assert model.blank_idx == 0, "blank idx needs to be zero because of the way the gradient is scaled" - - # ------------------------ init some variables ------------------------ - beam_dim = Dim(1, name="initial-beam") - batch_dims_ = [beam_dim] + batch_dims - bos_idx = 0 - seq_log_prob = rf.constant(0.0, dims=batch_dims_) - max_seq_len = enc_spatial_dim.get_size_tensor() - max_seq_len = 
rf.reduce_max(max_seq_len, axis=max_seq_len.dims) - label_lstm_state = model.s_wo_att.default_initial_state(batch_dims=batch_dims) - target = rf.constant(bos_idx, dims=batch_dims_, sparse_dim=model.target_dim) - vocab_range = rf.range_over_dim(model.target_dim) - blank_tensor = rf.convert_to_tensor(model.blank_idx, dtype=vocab_range.dtype) - backrefs = rf.zeros(batch_dims_, dtype="int32") - - # ------------------------ targets/embeddings ------------------------ - - non_blank_input_embeddings = model.target_embed(non_blank_targets) # [B, S, D] - non_blank_input_embeddings, non_blank_targets_padded_spatial_dim = rf.pad( - non_blank_input_embeddings, - axes=[non_blank_targets_spatial_dim], - padding=[(1, 0)], - value=0.0, - ) # [B, S+1, D] - non_blank_targets_padded_spatial_dim = non_blank_targets_padded_spatial_dim[0] - - # add blank idx on the right - # this way, when the label index for gathering reached the last non-blank index, it will gather blank after that - # which then only allows corresponding hypotheses to be extended by blank - non_blank_targets_padded, _ = rf.pad( - non_blank_targets, - axes=[non_blank_targets_spatial_dim], - padding=[(0, 1)], - value=model.blank_idx, - out_dims=[non_blank_targets_padded_spatial_dim] - ) - - # ------------------------ sizes ------------------------ - - non_blank_targets_padded_spatial_sizes = rf.copy_to_device( - non_blank_targets_padded_spatial_dim.dyn_size_ext, non_blank_targets.device - ) - non_blank_targets_spatial_sizes = rf.copy_to_device( - non_blank_targets_spatial_dim.dyn_size_ext, non_blank_targets.device) - max_num_labels = rf.reduce_max( - non_blank_targets_spatial_sizes, axis=non_blank_targets_spatial_sizes.dims - ).raw_tensor.item() - single_col_dim = Dim(dimension=max_num_labels + 1, name="max-num-labels") - label_indices = rf.zeros(batch_dims_, dtype="int32", sparse_dim=single_col_dim) - - enc_spatial_sizes = rf.copy_to_device(enc_spatial_dim.dyn_size_ext, non_blank_targets.device) - - # 
------------------------ compute LSTM sequence ------------------------ - - label_lstm_out_seq, _ = model.s_wo_att( - non_blank_input_embeddings, - state=label_lstm_state, - spatial_dim=non_blank_targets_padded_spatial_dim, - ) - - # ------------------------ chunk dim ------------------------ - - chunk_size = 20 - chunk_dim = Dim(chunk_size, name="chunk") - chunk_range = rf.expand_dim(rf.range_over_dim(chunk_dim), batch_dims[0]) - - i = 0 - seq_targets = [] - seq_backrefs = [] - while i < max_seq_len.raw_tensor: - # get current number of labels for each hypothesis - if i > 0: - prev_label_indices = rf.gather(label_indices, indices=backrefs) - # mask blank label - update_state_mask = rf.convert_to_tensor(target != prev_label_indices) - label_indices = rf.where( - update_state_mask, - rf.where( - prev_label_indices == non_blank_targets_padded_spatial_sizes - 1, - prev_label_indices, - prev_label_indices + 1 - ), - prev_label_indices - ) - - # gather ground truth, input embeddings and LSTM output for current label index - ground_truth = rf.gather( - non_blank_targets_padded, - indices=label_indices, - axis=non_blank_targets_padded_spatial_dim, - clip_to_valid=True - ) - input_embed = rf.gather( - non_blank_input_embeddings, - indices=label_indices, - axis=non_blank_targets_padded_spatial_dim, - clip_to_valid=True - ) - label_lstm_out = rf.gather( - label_lstm_out_seq, - indices=label_indices, - axis=non_blank_targets_padded_spatial_dim, - clip_to_valid=True - ) - - # precompute attention for the current chunk (more efficient than computing it individually for each label index) - if i % chunk_size == 0: - seg_starts = rf.gather( - segment_starts, - indices=chunk_range, - axis=enc_spatial_dim, - clip_to_valid=True - ) - seg_lens = rf.gather( - segment_lens, - indices=chunk_range, - axis=enc_spatial_dim, - clip_to_valid=True - ) - att = model( - enc=enc_args["enc"], - enc_ctx=enc_args["enc_ctx"], - enc_spatial_dim=enc_spatial_dim, - s=label_lstm_out_seq, - 
segment_starts=seg_starts, - segment_lens=seg_lens, - ) # [B, S+1, T, D] - chunk_range += chunk_size - - # gather attention for the current label index - att_step = rf.gather( - att, - indices=label_indices, - axis=non_blank_targets_padded_spatial_dim, - clip_to_valid=True - ) - att_step = rf.gather( - att_step, - indices=rf.constant(i % chunk_size, dims=batch_dims, device=att_step.device), - axis=chunk_dim, - clip_to_valid=True - ) - - logits = model.decode_logits( - input_embed=input_embed, - att=att_step, - s=label_lstm_out, - ) # [B, S+1, T, D] - - label_log_prob = rf.log_softmax(logits, axis=model.target_dim) - - # mask label log prob in order to only allow hypotheses corresponding to the ground truth: - # log prob needs to correspond to the next non-blank label... - log_prob_mask = vocab_range == ground_truth - rem_frames = enc_spatial_sizes - i - rem_labels = non_blank_targets_spatial_sizes - label_indices - # ... or to blank if there are more frames than labels left - log_prob_mask = rf.logical_or( - log_prob_mask, - rf.logical_and( - vocab_range == blank_tensor, - rem_frames > rem_labels - ) - ) - label_log_prob = rf.where( - log_prob_mask, - label_log_prob, - rf.constant(-1.0e30, dims=batch_dims + [beam_dim, model.target_dim]) - ) - - label_log_prob = rf.where( - rf.convert_to_tensor(i >= rf.copy_to_device(enc_spatial_dim.get_size_tensor(), label_log_prob.device)), - rf.sparse_to_dense( - model.blank_idx, - axis=model.target_dim, - label_value=0.0, - other_value=-1.0e30 - ), - label_log_prob - ) - - seq_log_prob = recombination.recombine_seqs_train( - seq_log_prob=seq_log_prob, - label_log_prob=label_log_prob, - label_indices=label_indices, - ground_truth=ground_truth, - target_dim=model.target_dim, - single_col_dim=single_col_dim, - labels_padded_spatial_sizes=non_blank_targets_padded_spatial_sizes, - beam_dim=beam_dim, - batch_dims=batch_dims, - blank_idx=model.blank_idx, - ) - - beam_size_ = min( - min((i + 2), rf.reduce_max(rem_frames, 
axis=rem_frames.dims).raw_tensor.item()), - min((max_num_labels + 1), beam_size) - ) - - # update sequence log prob and beam indices - # seq_log_prob = seq_log_prob + label_log_prob # Batch, InBeam, Vocab - seq_log_prob, (backrefs, target), beam_dim = rf.top_k( - seq_log_prob, - k_dim=Dim(beam_size_, name=f"dec-step{i}-beam"), - axis=[beam_dim, single_col_dim] - ) # seq_log_prob, backrefs, target: Batch, Beam - seq_targets.append(target) - seq_backrefs.append(backrefs) - - i += 1 - - # Backtrack via backrefs, resolve beams. - seq_targets_ = [] - indices = rf.range_over_dim(beam_dim) # FinalBeam -> FinalBeam - for backrefs, target in zip(seq_backrefs[::-1], seq_targets[::-1]): - # indices: FinalBeam -> Beam - # backrefs: Beam -> PrevBeam - seq_targets_.insert(0, rf.gather(target, indices=indices)) - indices = rf.gather(backrefs, indices=indices) # FinalBeam -> PrevBeam - - seq_targets__ = TensorArray(seq_targets_[0]) - for target in seq_targets_: - seq_targets__ = seq_targets__.push_back(target) - seq_targets = seq_targets__.stack(axis=enc_spatial_dim) - - torch.set_printoptions(threshold=10_000) - print("seq_targets", seq_targets.copy_transpose(batch_dims + [beam_dim, enc_spatial_dim]).raw_tensor[0, 0]) - - loss = -1 * seq_log_prob - - # loss = -rf.log(rf.reduce_sum(rf.exp(seq_log_prob), axis=beam_dim)) - loss.mark_as_loss("full_sum_loss", scale=1.0, use_normalized_loss=True) - - # print("loss", loss.raw_tensor) - # print("single_col_dim", single_col_dim.dimension) - # exit() - - return None - diff --git a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/realignment.py b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/realignment.py new file mode 100644 index 000000000..7e4b1a0b3 --- /dev/null +++ 
b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/realignment.py @@ -0,0 +1,476 @@ +from typing import Optional, Dict, Any, Sequence, Tuple, List + +import torch + +from i6_experiments.users.schmitt import hdf +from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.network_builder_rf.segmental import utils +from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.network_builder_rf.segmental import recombination +from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.network_builder_rf.segmental.model import SegmentalAttentionModel +from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.network_builder_rf.segmental.model_new.label_model.model import ( + SegmentalAttLabelDecoder, + SegmentalAttEfficientLabelDecoder +) + +from returnn.tensor import Dim, single_step_dim, TensorDict +import returnn.frontend as rf +from returnn.frontend.tensor_array import TensorArray + + +def model_realign_( + *, + model: SegmentalAttentionModel, + data: rf.Tensor, + data_spatial_dim: Dim, + non_blank_targets: rf.Tensor, + non_blank_targets_spatial_dim: Dim, +): + assert model.use_joint_model + assert isinstance(model.label_decoder, SegmentalAttEfficientLabelDecoder) + assert model.label_decoder_state == "nb-lstm" + + if data.feature_dim and data.feature_dim.dimension == 1: + data = rf.squeeze(data, axis=data.feature_dim) + assert not data.feature_dim # raw audio + + batch_dims = data.remaining_dims(data_spatial_dim) + enc_args, enc_spatial_dim = model.encoder.encode(data, in_spatial_dim=data_spatial_dim) + + segment_starts, segment_lens = utils.get_segment_starts_and_lens( + rf.sequence_mask(batch_dims + [enc_spatial_dim]), # this way, every frame is interpreted as non-blank + 
enc_spatial_dim, + model, + batch_dims, + enc_spatial_dim + ) + + max_num_labels = rf.reduce_max( + non_blank_targets_spatial_dim.dyn_size_ext, + axis=batch_dims + ) + max_num_labels = max_num_labels.raw_tensor.item() + + seq_log_prob, viterbi_alignment, viterbi_alignment_spatial_dim = model_realign( + model=model.label_decoder, + enc=enc_args["enc"], + enc_ctx=enc_args["enc_ctx"], + enc_spatial_dim=enc_spatial_dim, + non_blank_targets=non_blank_targets, + non_blank_targets_spatial_dim=non_blank_targets_spatial_dim, + segment_starts=segment_starts, + segment_lens=segment_lens, + batch_dims=batch_dims, + beam_size=max_num_labels, + downsampling=1, + precompute_chunk_size=10, + interpolation_alignment=None, + interpolation_alignment_factor=0.0, + use_recombination="max", + return_realignment=True, + ) + + return viterbi_alignment, seq_log_prob, viterbi_alignment_spatial_dim + + +def model_realign( + *, + model: SegmentalAttEfficientLabelDecoder, + enc: rf.Tensor, + enc_ctx: rf.Tensor, + enc_spatial_dim: Dim, + non_blank_targets: rf.Tensor, # [B, S, V] + non_blank_targets_spatial_dim: Dim, + segment_starts: rf.Tensor, # [B, T] + segment_lens: rf.Tensor, # [B, T] + batch_dims: List[Dim], + beam_size: int, + downsampling: int, + precompute_chunk_size: int, + interpolation_alignment: Optional[rf.Tensor], + interpolation_alignment_factor: float, + use_recombination: Optional[str] = "sum", + return_realignment: bool = False, +) -> Tuple[rf.Tensor, Optional[rf.Tensor], Optional[Dim]]: + assert len(batch_dims) == 1, "not supported yet" + assert model.blank_idx == 0, "blank idx needs to be zero because of the way the gradient is scaled" + if interpolation_alignment_factor > 0.0: + assert interpolation_alignment is not None + + # ------------------------ downsample encoder ------------------------ + + if downsampling > 1: + enc_spatial_dim_downsampled = enc_spatial_dim // downsampling + downsample_indices = rf.range_over_dim(enc_spatial_dim_downsampled) * downsampling + 
downsample_indices = rf.expand_dim(downsample_indices, batch_dims[0]) + segment_starts_downsampled = rf.gather(segment_starts, indices=downsample_indices, axis=enc_spatial_dim, clip_to_valid=True) + segment_lens_downsampled = rf.gather(segment_lens, indices=downsample_indices, axis=enc_spatial_dim, clip_to_valid=True) + + segment_starts = segment_starts_downsampled + segment_lens = segment_lens_downsampled + else: + enc_spatial_dim_downsampled = enc_spatial_dim + + # ------------------------ init some variables ------------------------ + beam_dim = Dim(1, name="initial-beam") + batch_dims_ = [beam_dim] + batch_dims + bos_idx = 0 + seq_log_prob = rf.constant(0.0, dims=batch_dims_) + max_seq_len = enc_spatial_dim_downsampled.get_size_tensor() + max_seq_len = rf.reduce_max(max_seq_len, axis=max_seq_len.dims) + label_lstm_state = model.s_wo_att.default_initial_state(batch_dims=batch_dims) + target = rf.constant(bos_idx, dims=batch_dims_, sparse_dim=model.target_dim) + vocab_range = rf.range_over_dim(model.target_dim) + blank_tensor = rf.convert_to_tensor(model.blank_idx, dtype=vocab_range.dtype) + backrefs = rf.zeros(batch_dims_, dtype="int32") + update_state_mask = rf.ones(batch_dims_, dtype="bool") + + # ------------------------ targets/embeddings ------------------------ + + non_blank_input_embeddings = model.target_embed(non_blank_targets) # [B, S, D] + non_blank_input_embeddings, non_blank_targets_padded_spatial_dim = rf.pad( + non_blank_input_embeddings, + axes=[non_blank_targets_spatial_dim], + padding=[(1, 0)], + value=0.0, + ) # [B, S+1, D] + non_blank_targets_padded_spatial_dim = non_blank_targets_padded_spatial_dim[0] + + # add blank idx on the right + # this way, when the label index for gathering reached the last non-blank index, it will gather blank after that + # which then only allows corresponding hypotheses to be extended by blank + non_blank_targets_padded, _ = rf.pad( + non_blank_targets, + axes=[non_blank_targets_spatial_dim], + padding=[(0, 1)], + 
value=model.blank_idx, + out_dims=[non_blank_targets_padded_spatial_dim] + ) + + # ------------------------ sizes ------------------------ + + non_blank_targets_padded_spatial_sizes = rf.copy_to_device( + non_blank_targets_padded_spatial_dim.dyn_size_ext, non_blank_targets.device + ) + non_blank_targets_spatial_sizes = rf.copy_to_device( + non_blank_targets_spatial_dim.dyn_size_ext, non_blank_targets.device) + max_num_labels = rf.reduce_max( + non_blank_targets_spatial_sizes, axis=non_blank_targets_spatial_sizes.dims + ).raw_tensor.item() + single_col_dim = Dim(dimension=max_num_labels + 1, name="max-num-labels") + label_indices = rf.zeros(batch_dims_, dtype="int32", sparse_dim=single_col_dim) + prev_label_indices = label_indices.copy() + + enc_spatial_sizes = rf.copy_to_device(enc_spatial_dim_downsampled.dyn_size_ext, non_blank_targets.device) + + # ------------------------ compute LSTM sequence ------------------------ + + label_lstm_out_seq, _ = model.s_wo_att( + non_blank_input_embeddings, + state=label_lstm_state, + spatial_dim=non_blank_targets_padded_spatial_dim, + ) + + # ------------------------ chunk dim ------------------------ + + chunk_dim = Dim(precompute_chunk_size, name="chunk") + chunk_range = rf.expand_dim(rf.range_over_dim(chunk_dim), batch_dims[0]) + + i = 0 + seq_targets = [] + seq_backrefs = [] + while i < max_seq_len.raw_tensor: + # get current number of labels for each hypothesis + if i > 0: + label_indices = rf.where( + update_state_mask, + rf.where( + prev_label_indices == non_blank_targets_padded_spatial_sizes - 1, + prev_label_indices, + prev_label_indices + 1 + ), + prev_label_indices + ) + + # gather ground truth, input embeddings and LSTM output for current label index + label_ground_truth = rf.gather( + non_blank_targets_padded, + indices=label_indices, + axis=non_blank_targets_padded_spatial_dim, + clip_to_valid=True + ) + input_embed = rf.gather( + non_blank_input_embeddings, + indices=label_indices, + 
axis=non_blank_targets_padded_spatial_dim, + clip_to_valid=True + ) + label_lstm_out = rf.gather( + label_lstm_out_seq, + indices=label_indices, + axis=non_blank_targets_padded_spatial_dim, + clip_to_valid=True + ) + + # precompute attention for the current chunk (more efficient than computing it individually for each label index) + if i % precompute_chunk_size == 0: + seg_starts = rf.gather( + segment_starts, + indices=chunk_range, + axis=enc_spatial_dim_downsampled, + clip_to_valid=True + ) + seg_lens = rf.gather( + segment_lens, + indices=chunk_range, + axis=enc_spatial_dim_downsampled, + clip_to_valid=True + ) + att = model( + enc=enc, + enc_ctx=enc_ctx, + enc_spatial_dim=enc_spatial_dim, + s=label_lstm_out_seq, + segment_starts=seg_starts, + segment_lens=seg_lens, + ) # [B, S+1, T, D] + chunk_range += precompute_chunk_size + + # gather attention for the current label index + att_step = rf.gather( + att, + indices=label_indices, + axis=non_blank_targets_padded_spatial_dim, + clip_to_valid=True + ) + att_step = rf.gather( + att_step, + indices=rf.constant(i % precompute_chunk_size, dims=batch_dims, device=att_step.device), + axis=chunk_dim, + clip_to_valid=True + ) + + logits = model.decode_logits( + input_embed=input_embed, + att=att_step, + s=label_lstm_out, + ) # [B, S+1, T, D] + + label_log_prob = rf.log_softmax(logits, axis=model.target_dim) + + # mask label log prob in order to only allow hypotheses corresponding to the ground truth: + # log prob needs to correspond to the next non-blank label... + log_prob_mask = vocab_range == label_ground_truth + rem_frames = enc_spatial_sizes - i + rem_labels = non_blank_targets_spatial_sizes - label_indices + # ... 
or to blank if there are more frames than labels left + log_prob_mask = rf.logical_or( + log_prob_mask, + rf.logical_and( + vocab_range == blank_tensor, + rem_frames > rem_labels + ) + ) + label_log_prob = rf.where( + log_prob_mask, + label_log_prob, + rf.constant(-1.0e30, dims=batch_dims + [beam_dim, model.target_dim]) + ) + + # interpolate with given alignment: a * one_hot + (1 - a) * label_log_prob + if interpolation_alignment_factor > 0.0: + interpolation_alignment_ground_truth = rf.gather( + interpolation_alignment, + indices=i, + axis=enc_spatial_dim_downsampled, + ) + label_log_prob = interpolation_alignment_factor * rf.log(rf.sparse_to_dense( + interpolation_alignment_ground_truth, label_value=0.9, other_value=0.1 / model.target_dim.dimension + )) + (1 - interpolation_alignment_factor) * label_log_prob + + label_log_prob = rf.where( + rf.convert_to_tensor(i >= enc_spatial_sizes), + rf.sparse_to_dense( + model.blank_idx, + axis=model.target_dim, + label_value=0.0, + other_value=-1.0e30 + ), + label_log_prob + ) + + if use_recombination is not None: + seq_log_prob = recombination.recombine_seqs_train( + seq_log_prob=seq_log_prob, + label_log_prob=label_log_prob, + label_indices=label_indices, + ground_truth=label_ground_truth, + target_dim=model.target_dim, + single_col_dim=single_col_dim, + beam_dim=beam_dim, + batch_dims=batch_dims, + blank_idx=model.blank_idx, + use_sum_recombination=use_recombination == "sum" + ) + beam_size_ = min( + min((i + 2), rf.reduce_max(rem_frames, axis=rem_frames.dims).raw_tensor.item()), + min((max_num_labels + 1), beam_size) + ) + seq_log_prob, (backrefs, target), beam_dim = rf.top_k( + seq_log_prob, + k_dim=Dim(beam_size_, name=f"dec-step{i}-beam"), + axis=[beam_dim, single_col_dim] + ) + + prev_label_indices = rf.gather(label_indices, indices=backrefs) + # mask blank label + update_state_mask = rf.convert_to_tensor(target != prev_label_indices) + else: + beam_size_ = beam_size + + seq_log_prob = seq_log_prob + label_log_prob 
# Batch, InBeam, Vocab + seq_log_prob, (backrefs, target), beam_dim = rf.top_k( + seq_log_prob, + k_dim=Dim(beam_size_, name=f"dec-step{i}-beam"), + axis=[beam_dim, model.target_dim] + ) + + prev_label_indices = rf.gather(label_indices, indices=backrefs) + # mask blank label + update_state_mask = rf.convert_to_tensor(target != model.blank_idx) + + seq_targets.append(target) + seq_backrefs.append(backrefs) + + i += 1 + + if return_realignment: + # Backtrack via backrefs, resolve beams. + seq_targets_ = [] + indices = rf.range_over_dim(beam_dim) # FinalBeam -> FinalBeam + for backrefs, target in zip(seq_backrefs[::-1], seq_targets[::-1]): + # indices: FinalBeam -> Beam + # backrefs: Beam -> PrevBeam + seq_targets_.insert(0, rf.gather(target, indices=indices)) + indices = rf.gather(backrefs, indices=indices) # FinalBeam -> PrevBeam + + seq_targets__ = TensorArray(seq_targets_[0]) + for target in seq_targets_: + seq_targets__ = seq_targets__.push_back(target) + seq_targets = seq_targets__.stack(axis=enc_spatial_dim_downsampled) + + if use_recombination is not None: + seq_targets_padded = rf.shift_right(seq_targets, axis=enc_spatial_dim_downsampled, pad_value=0) + seq_targets = rf.where( + seq_targets == seq_targets_padded, + model.blank_idx, + rf.gather( + non_blank_targets, indices=rf.cast(seq_targets - 1, "int32"), axis=non_blank_targets_spatial_dim, clip_to_valid=True) + ) + realignment = rf.squeeze(seq_targets, beam_dim) + else: + realignment = seq_targets + + realignment_spatial_dim = enc_spatial_dim_downsampled + else: + realignment = None + realignment_spatial_dim = None + + if use_recombination is not None: + seq_log_prob = rf.squeeze(seq_log_prob, beam_dim) + + return seq_log_prob, realignment, realignment_spatial_dim + + +def _returnn_v2_forward_step(*, model, extern_data: TensorDict, **_kwargs_unused): + import returnn.frontend as rf + from returnn.tensor import Tensor, Dim, batch_dim + from returnn.config import get_global_config + + if 
rf.is_executing_eagerly(): + batch_size = int(batch_dim.get_dim_value()) + for batch_idx in range(batch_size): + seq_tag = extern_data["seq_tag"].raw_tensor[batch_idx].item() + print(f"batch {batch_idx + 1}/{batch_size} seq_tag: {seq_tag!r}") + + config = get_global_config() + default_input_key = config.typed_value("default_input") + data = extern_data[default_input_key] + data_spatial_dim = data.get_time_dim_tag() + realign_def = config.typed_value("_realign_def") + + default_target_key = config.typed_value("target") + targets = extern_data[default_target_key] + targets_spatial_dim = targets.get_time_dim_tag() + + realign_out = realign_def( + model=model, + data=data, + data_spatial_dim=data_spatial_dim, + non_blank_targets=targets, + non_blank_targets_spatial_dim=targets_spatial_dim, + ) + + if len(realign_out) == 3: + # realign results including viterbi_align, + # log probs {batch,}, + # out_spatial_dim, + viterbi_align, scores, out_spatial_dim = realign_out + else: + raise ValueError(f"unexpected num outputs {len(realign_out)} from recog_def") + assert isinstance(viterbi_align, Tensor) and isinstance(scores, Tensor) + assert isinstance(out_spatial_dim, Dim) + rf.get_run_ctx().mark_as_output(viterbi_align, "viterbi_align", dims=[batch_dim, out_spatial_dim]) + rf.get_run_ctx().mark_as_output(scores, "scores", dims=[batch_dim,]) + + +_v2_forward_out_scores_filename = "scores.py.gz" +_v2_forward_out_alignment_filename = "realignment.hdf" + + +def _returnn_v2_get_forward_callback(): + from typing import TextIO + import numpy as np + from returnn.tensor import Tensor, TensorDict + from returnn.forward_iface import ForwardCallbackIface + from returnn.config import get_global_config + from returnn.datasets.hdf import SimpleHDFWriter + + class _ReturnnRecogV2ForwardCallbackIface(ForwardCallbackIface): + def __init__(self): + self.score_file: Optional[TextIO] = None + self.alignment_file: Optional[SimpleHDFWriter] = None + + def init(self, *, model): + import gzip + + 
self.score_file = gzip.open(_v2_forward_out_scores_filename, "wt") + self.score_file.write("{\n") + + self.alignment_file = SimpleHDFWriter( + filename=_v2_forward_out_alignment_filename, dim=model.target_dim.dimension, ndim=1 + ) + + def process_seq(self, *, seq_tag: str, outputs: TensorDict): + viterbi_align: Tensor = outputs["viterbi_align"] # [T] + scores: Tensor = outputs["scores"] # [] + assert len(viterbi_align.dims) == 1, f"expected hyps to be 1D, but got {viterbi_align.dims}" + assert viterbi_align.dims[0].dyn_size_ext, f"viterbi_align {viterbi_align} does not define seq lengths" + self.score_file.write(f"{seq_tag!r}: ") + score = float(scores.raw_tensor) + self.score_file.write(f"{score!r},\n") + + seq_len = viterbi_align.dims[0].dyn_size_ext.raw_tensor.item() + viterbi_align_raw = viterbi_align.raw_tensor[:seq_len] + + hdf.dump_hdf_numpy( + hdf_dataset=self.alignment_file, + data=viterbi_align_raw[None], # [1, T] + seq_lens=np.array([seq_len]), # [1] + seq_tags=[seq_tag], + ) + + def finish(self): + self.score_file.write("}\n") + self.score_file.close() + self.alignment_file.close() + + return _ReturnnRecogV2ForwardCallbackIface() diff --git a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/recog.py b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/recog.py index 53e39d302..e070839d9 100644 --- a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/recog.py +++ b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/recog.py @@ -1,7 +1,8 @@ -from typing import Optional, Dict, Any, Tuple +from typing import Optional, Dict, Any, Tuple, Sequence import tree from returnn.tensor import Tensor, Dim, single_step_dim +from returnn.frontend.state import State import 
returnn.frontend as rf from returnn.frontend.tensor_array import TensorArray @@ -14,15 +15,223 @@ from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.network_builder_rf.segmental.model_new.blank_model.model import ( BlankDecoderV1, BlankDecoderV3, + BlankDecoderV5, + BlankDecoderV6, ) +def update_state( + model: SegmentalAttentionModel, + update_state_mask: Tensor, + backrefs: Tensor, + label_decoder_state: State, + label_decoder_state_updated: State, + blank_decoder_state: Optional[State], + blank_decoder_state_updated: Optional[State], + lm_state: Optional[State], + lm_state_updated: Optional[State], +) -> Tuple[State, Optional[State], Optional[State]]: + + # ------------------- update blank decoder state ------------------- + + if blank_decoder_state is not None: + blank_decoder_state = tree.map_structure( + lambda s: rf.gather(s, indices=backrefs), blank_decoder_state_updated) + + # ------------------- update label decoder state ------------------- + + if model.label_decoder_state == "joint-lstm": + label_decoder_state = tree.map_structure( + lambda s: rf.gather(s, indices=backrefs), label_decoder_state_updated) + else: + def _get_masked_state(old, new, mask): + old = rf.gather(old, indices=backrefs) + new = rf.gather(new, indices=backrefs) + return rf.where(mask, new, old) + + label_decoder_state = tree.map_structure( + lambda old_state, new_state: _get_masked_state(old_state, new_state, update_state_mask), + label_decoder_state, label_decoder_state_updated + ) + + # ------------------- update external LM state ------------------- + + if lm_state is not None: + for state in lm_state: + if state == "pos": + lm_state[state] = rf.where( + update_state_mask, + rf.gather(lm_state_updated[state], indices=backrefs), + rf.gather(lm_state[state], indices=backrefs) + ) + else: + updated_accum_axis = lm_state_updated[state].self_att.accum_axis + + updated_self_att_expand_dim_dyn_size_ext = 
rf.gather(updated_accum_axis.dyn_size_ext, indices=backrefs) + masked_self_att_expand_dim_dyn_size_ext = rf.where( + update_state_mask, + updated_self_att_expand_dim_dyn_size_ext, + updated_self_att_expand_dim_dyn_size_ext - 1 + ) + masked_self_att_expand_dim = Dim(masked_self_att_expand_dim_dyn_size_ext, name="self_att_expand_dim_init") + lm_state[state].self_att.accum_axis = masked_self_att_expand_dim + + def _mask_lm_state(tensor: rf.Tensor): + tensor = rf.gather(tensor, indices=backrefs) + tensor = tensor.copy_transpose( + [updated_accum_axis] + tensor.remaining_dims(updated_accum_axis)) + tensor_raw = tensor.raw_tensor + tensor_raw = tensor_raw[:rf.reduce_max( + masked_self_att_expand_dim_dyn_size_ext, + axis=masked_self_att_expand_dim_dyn_size_ext.dims + ).raw_tensor.item()] + tensor = tensor.copy_template_replace_dim_tag( + tensor.get_axis_from_description(updated_accum_axis), masked_self_att_expand_dim + ) + tensor.raw_tensor = tensor_raw + return tensor + + lm_state[state].self_att.k_accum = _mask_lm_state(lm_state_updated[state].self_att.k_accum) + lm_state[state].self_att.v_accum = _mask_lm_state(lm_state_updated[state].self_att.v_accum) + + return label_decoder_state, blank_decoder_state, lm_state + + +def get_score( + model: SegmentalAttentionModel, + i: int, + input_embed_label_model: Tensor, + input_embed_blank_model: Optional[Tensor], + nb_target: Tensor, + label_decoder_state: State, + blank_decoder_state: Optional[State], + lm_state: Optional[State], + enc_args: Dict[str, Tensor], + enc_spatial_dim: Dim, + beam_dim: Dim, + batch_dims: Sequence[Dim], + external_lm_scale: Optional[float] = None, +) -> Tuple[Tensor, State, Optional[State], Optional[State]]: + # ------------------- label step ------------------- + + center_position = rf.minimum( + rf.full(dims=[beam_dim] + batch_dims, fill_value=i, dtype="int32"), + rf.copy_to_device(enc_spatial_dim.get_size_tensor() - 1, input_embed_label_model.device) + ) + segment_starts = rf.maximum( + 
rf.convert_to_tensor(0, dtype="int32"), center_position - model.center_window_size // 2) + segment_ends = rf.minimum( + rf.copy_to_device(enc_spatial_dim.get_size_tensor() - 1, input_embed_label_model.device), + center_position + model.center_window_size // 2 + ) + segment_lens = segment_ends - segment_starts + 1 + + label_step_out, label_decoder_state = model.label_decoder.loop_step( + **enc_args, + enc_spatial_dim=enc_spatial_dim, + input_embed=input_embed_label_model, + segment_lens=segment_lens, + segment_starts=segment_starts, + state=label_decoder_state, + ) + label_logits = model.label_decoder.decode_logits(input_embed=input_embed_label_model, **label_step_out) + label_log_prob = rf.log_softmax(label_logits, axis=model.target_dim) + + # ------------------- external LM step ------------------- + + lm_eos_log_prob = rf.zeros(batch_dims, dtype="float32") + if lm_state is not None: + lm_logits, lm_state = model.language_model( + nb_target, + spatial_dim=single_step_dim, + state=lm_state, + ) + lm_label_log_prob = rf.log_softmax(lm_logits, axis=model.target_dim) + label_log_prob += external_lm_scale * lm_label_log_prob + + lm_eos_log_prob = rf.where( + rf.convert_to_tensor(i == rf.copy_to_device(enc_spatial_dim.get_size_tensor(), input_embed_label_model.device) - 1), + # TODO: change to non hard-coded BOS index + rf.gather(lm_label_log_prob, indices=rf.constant(0, dtype="int32", dims=batch_dims, sparse_dim=nb_target.sparse_dim)), + lm_eos_log_prob + ) + + # ------------------- blank step ------------------- + + if blank_decoder_state is not None: + if model.blank_decoder_version in (1, 3): + blank_loop_step_kwargs = dict( + enc=enc_args["enc"], + enc_spatial_dim=enc_spatial_dim, + state=blank_decoder_state, + ) + if isinstance(model.blank_decoder, BlankDecoderV1): + blank_loop_step_kwargs["input_embed"] = input_embed_blank_model + else: + blank_loop_step_kwargs["label_model_state"] = label_step_out["s"] + + blank_step_out, blank_decoder_state = 
model.blank_decoder.loop_step(**blank_loop_step_kwargs) + blank_logits = model.blank_decoder.decode_logits(**blank_step_out) + else: + assert isinstance(model.blank_decoder, BlankDecoderV5) or isinstance(model.blank_decoder, BlankDecoderV6) + enc_position = rf.minimum( + rf.full(dims=batch_dims, fill_value=i, dtype="int32"), + rf.copy_to_device(enc_spatial_dim.get_size_tensor() - 1, input_embed_label_model.device) + ) + enc_frame = rf.gather(enc_args["enc"], indices=enc_position, axis=enc_spatial_dim) + enc_frame = rf.expand_dim(enc_frame, beam_dim) + if isinstance(model.blank_decoder, BlankDecoderV5): + # no LSTM -> no state -> just leave (empty) state as is + blank_logits = model.blank_decoder.emit_prob( + rf.concat_features(enc_frame, label_step_out["s"])) + else: + prev_lstm_state = blank_decoder_state.s_blank + blank_decoder_state = rf.State() + s_blank, blank_decoder_state.s_blank = model.blank_decoder.s( + enc_frame, + state=prev_lstm_state, + spatial_dim=single_step_dim + ) + blank_logits = model.blank_decoder.emit_prob(rf.concat_features(s_blank, label_step_out["s"])) + + emit_log_prob = rf.log(rf.sigmoid(blank_logits)) + emit_log_prob = rf.squeeze(emit_log_prob, axis=emit_log_prob.feature_dim) + blank_log_prob = rf.log(rf.sigmoid(-blank_logits)) + blank_log_prob += lm_eos_log_prob + + # ------------------- combination ------------------- + + label_log_prob += emit_log_prob + output_log_prob, _ = rf.concat( + (label_log_prob, model.target_dim), (blank_log_prob, blank_log_prob.feature_dim), + out_dim=model.align_target_dim + ) + else: + output_log_prob = label_log_prob + + # for shorter seqs in the batch, set the blank score to zero and the others to ~-inf + output_log_prob = rf.where( + rf.convert_to_tensor(i >= rf.copy_to_device(enc_spatial_dim.get_size_tensor(), input_embed_label_model.device)), + rf.sparse_to_dense( + model.blank_idx, + axis=model.target_dim if model.use_joint_model else model.align_target_dim, + label_value=0.0, + other_value=-1.0e30 + 
), + output_log_prob + ) + + return output_log_prob, label_decoder_state, blank_decoder_state, lm_state + + def model_recog( *, model: SegmentalAttentionModel, data: Tensor, data_spatial_dim: Dim, + beam_size: int, use_recombination: Optional[str] = None, + external_lm_scale: Optional[float] = None, ) -> Tuple[Tensor, Tensor, Dim, Dim]: """ Function is run within RETURNN. @@ -37,32 +246,60 @@ def model_recog( out_spatial_dim, final beam_dim """ - # assert not model.language_model # not implemented here. use the pure PyTorch search instead assert any( - isinstance(model.blank_decoder, cls) for cls in (BlankDecoderV1, BlankDecoderV3) + isinstance(model.blank_decoder, cls) for cls in (BlankDecoderV1, BlankDecoderV3, BlankDecoderV5, BlankDecoderV6) ) or model.blank_decoder is None, "blank_decoder not supported" if model.blank_decoder is None: assert model.use_joint_model, "blank_decoder is None, so use_joint_model must be True" + if model.language_model: + assert external_lm_scale is not None, "external_lm_scale must be defined with LM" + assert model.label_decoder_state in {"nb-lstm", "joint-lstm", "nb-linear1"} + + # --------------------------------- init encoder, dims, etc --------------------------------- - batch_dims = data.remaining_dims((data_spatial_dim, data.feature_dim)) enc_args, enc_spatial_dim = model.encoder.encode(data, in_spatial_dim=data_spatial_dim) - beam_size = 12 + max_seq_len = enc_spatial_dim.get_size_tensor() - print("** max seq len:", max_seq_len.raw_tensor) max_seq_len = rf.reduce_max(max_seq_len, axis=max_seq_len.dims) - # Initial state. 
+ batch_dims = data.remaining_dims((data_spatial_dim, data.feature_dim)) beam_dim = Dim(1, name="initial-beam") batch_dims_ = [beam_dim] + batch_dims + backrefs = rf.zeros(batch_dims_, dtype="int32") + + bos_idx = 0 + + seq_log_prob = rf.constant(0.0, dims=batch_dims_) + + if use_recombination: + assert len(batch_dims) == 1 + assert use_recombination in {"sum", "max"} + seq_hash = rf.constant(0, dims=batch_dims_, dtype="int64") + else: + seq_hash = None + + # lists of [B, beam] tensors + seq_targets = [] + seq_backrefs = [] + + update_state_mask = rf.constant(True, dims=batch_dims_) + # --------------------------------- init states --------------------------------- + + # label decoder label_decoder_state = model.label_decoder.default_initial_state(batch_dims=batch_dims_, ) + + # blank decoder if model.blank_decoder is not None: blank_decoder_state = model.blank_decoder.default_initial_state(batch_dims=batch_dims_) + else: + blank_decoder_state = None + + # external LM if model.language_model: lm_state = model.language_model.default_initial_state(batch_dims=batch_dims_) for state in lm_state: if state == "pos": - # pass lm_state[state] = rf.zeros(batch_dims_, dtype="int32") else: self_att_expand_dim = Dim(rf.zeros(batch_dims_, dtype="int32"), name="self_att_expand_dim_init") @@ -81,24 +318,19 @@ def model_recog( v_accum.get_axis_from_description("stag:self_att_expand_dim_init"), self_att_expand_dim ) lm_state[state].self_att.v_accum.raw_tensor = v_accum_raw + else: + lm_state = None - bos_idx = 0 + # --------------------------------- init targets, embeddings --------------------------------- if model.use_joint_model: target = rf.constant(bos_idx, dims=batch_dims_, sparse_dim=model.target_dim) - if model.label_decoder_state == "nb-lstm": + if model.label_decoder_state in ("nb-lstm", "nb-linear1"): target_non_blank = target.copy() else: target = rf.constant(bos_idx, dims=batch_dims_, sparse_dim=model.align_target_dim) - update_state_mask = rf.convert_to_tensor(target 
!= model.blank_idx) target_non_blank = rf.constant(bos_idx, dims=batch_dims_, sparse_dim=model.target_dim) - seq_log_prob = rf.constant(0.0, dims=batch_dims_) - if use_recombination: - assert len(batch_dims) == 1 - assert use_recombination in {"sum", "max"} - seq_hash = rf.constant(0, dims=batch_dims_, dtype="int64") - input_embed = rf.zeros( batch_dims_ + [model.label_decoder.target_embed.out_dim], feature_dim=model.label_decoder.target_embed.out_dim, @@ -111,12 +343,9 @@ def model_recog( else: input_embed_length_model = None - old_beam_dim = beam_dim.copy() - backrefs = rf.zeros(batch_dims_, dtype="int32") + # --------------------------------- main loop --------------------------------- i = 0 - seq_targets = [] - seq_backrefs = [] while i < max_seq_len.raw_tensor: if i > 0: if model.label_decoder_state == "joint-lstm": @@ -127,94 +356,28 @@ def model_recog( input_embed = rf.where( update_state_mask, model.label_decoder.target_embed(target_non_blank), - rf.gather(input_embed, indices=backrefs, axis=old_beam_dim) + rf.gather(input_embed, indices=backrefs) ) if isinstance(model.blank_decoder, BlankDecoderV1): input_embed_length_model = model.blank_decoder.target_embed(target) - # ------------------- label step ------------------- - - center_position = rf.minimum( - rf.full(dims=[beam_dim] + batch_dims, fill_value=i, dtype="int32"), - rf.copy_to_device(enc_spatial_dim.get_size_tensor() - 1, data.device) - ) - segment_starts = rf.maximum( - rf.convert_to_tensor(0, dtype="int32"), center_position - model.center_window_size // 2) - segment_ends = rf.minimum( - rf.copy_to_device(enc_spatial_dim.get_size_tensor() - 1, data.device), - center_position + model.center_window_size // 2 - ) - segment_lens = segment_ends - segment_starts + 1 - - label_step_out, label_decoder_state_updated = model.label_decoder.loop_step( - **enc_args, + output_log_prob, label_decoder_state_updated, blank_decoder_state_updated, lm_state_updated = get_score( + model=model, + i=i, + 
input_embed_label_model=input_embed, + input_embed_blank_model=input_embed_length_model, + nb_target=target_non_blank, + label_decoder_state=label_decoder_state, + blank_decoder_state=blank_decoder_state, + lm_state=lm_state, + enc_args=enc_args, enc_spatial_dim=enc_spatial_dim, - input_embed=input_embed, - segment_lens=segment_lens, - segment_starts=segment_starts, - state=label_decoder_state, - ) - label_logits = model.label_decoder.decode_logits(input_embed=input_embed, **label_step_out) - label_log_prob = rf.log_softmax(label_logits, axis=model.target_dim) - - # ------------------- external LM step ------------------- - - if model.language_model: - lm_logits, lm_state_updated = model.language_model( - target_non_blank, - spatial_dim=single_step_dim, - state=lm_state, - ) - lm_label_log_prob = rf.log_softmax(lm_logits, axis=model.target_dim) - # print(i) - # print("lm_label_log_prob: ", lm_label_log_prob.copy_transpose(batch_dims + [beam_dim, model.target_dim]).raw_tensor[0, :, :6]) - # print() - # if i == 10: - # exit() - label_log_prob += 0.4 * lm_label_log_prob - - # ------------------- blank step ------------------- - - if not model.use_joint_model: - blank_loop_step_kwargs = dict( - enc=enc_args["enc"], - enc_spatial_dim=enc_spatial_dim, - state=blank_decoder_state, - ) - if isinstance(model.blank_decoder, BlankDecoderV1): - blank_loop_step_kwargs["input_embed"] = input_embed_length_model - else: - blank_loop_step_kwargs["label_model_state"] = label_step_out["s"] - - blank_step_out, blank_decoder_state = model.blank_decoder.loop_step(**blank_loop_step_kwargs) - blank_logits = model.blank_decoder.decode_logits(**blank_step_out) - emit_log_prob = rf.log(rf.sigmoid(blank_logits)) - emit_log_prob = rf.squeeze(emit_log_prob, axis=emit_log_prob.feature_dim) - blank_log_prob = rf.log(rf.sigmoid(-blank_logits)) - - # ------------------- combination ------------------- - - label_log_prob += emit_log_prob - output_log_prob, _ = rf.concat( - (label_log_prob, 
model.target_dim), (blank_log_prob, blank_log_prob.feature_dim), - out_dim=model.align_target_dim - ) - else: - output_log_prob = label_log_prob - - # for shorter seqs in the batch, set the blank score to zero and the others to ~-inf - output_log_prob = rf.where( - rf.convert_to_tensor(i >= rf.copy_to_device(enc_spatial_dim.get_size_tensor(), data.device)), - rf.sparse_to_dense( - model.blank_idx, - axis=model.target_dim if model.use_joint_model else model.align_target_dim, - label_value=0.0, - other_value=-1.0e30 - ), - output_log_prob + beam_dim=beam_dim, + batch_dims=batch_dims, + external_lm_scale=external_lm_scale, ) - # ------------------- top-k ------------------- + # ------------------- recombination ------------------- if use_recombination: seq_log_prob = recombination.recombine_seqs( @@ -226,108 +389,36 @@ def model_recog( use_sum=use_recombination == "sum" ) + # ------------------- top-k ------------------- + seq_log_prob = seq_log_prob + output_log_prob # Batch, InBeam, Vocab - old_beam_dim = beam_dim.copy() seq_log_prob, (backrefs, target), beam_dim = rf.top_k( seq_log_prob, k_dim=Dim(beam_size, name=f"dec-step{i}-beam"), axis=[beam_dim, model.target_dim if model.use_joint_model else model.align_target_dim] - ) # seq_log_prob, backrefs, target: Batch, Beam - # print("seq_log_prob: ", seq_log_prob.raw_tensor) + ) seq_targets.append(target) seq_backrefs.append(backrefs) + # ------------------- update hash for recombination ------------------- + if use_recombination: seq_hash = recombination.update_seq_hash(seq_hash, target, backrefs, model.blank_idx) # mask for updating label-sync states update_state_mask = rf.convert_to_tensor(target != model.blank_idx) - # ------------------- update blank decoder state ------------------- - - if not model.use_joint_model: - blank_decoder_state = tree.map_structure(lambda s: rf.gather(s, indices=backrefs), blank_decoder_state) - - # ------------------- update label decoder state ------------------- - - if 
model.label_decoder_state == "joint-lstm": - label_decoder_state = tree.map_structure(lambda s: rf.gather(s, indices=backrefs), label_decoder_state_updated) - else: - def _get_masked_state(old, new, mask): - old = rf.gather(old, indices=backrefs, axis=old_beam_dim) - new = rf.gather(new, indices=backrefs, axis=old_beam_dim) - return rf.where(mask, new, old) - - label_decoder_state = tree.map_structure( - lambda old_state, new_state: _get_masked_state(old_state, new_state, update_state_mask), - label_decoder_state, label_decoder_state_updated - ) - - # ------------------- update external LM state ------------------- - - if model.language_model: - for state in lm_state: - if state == "pos": - lm_state[state] = rf.where( - update_state_mask, - rf.gather(lm_state_updated[state], indices=backrefs), - rf.gather(lm_state[state], indices=backrefs) - ) - else: - updated_accum_axis = lm_state_updated[state].self_att.accum_axis - - updated_self_att_expand_dim_dyn_size_ext = rf.gather(updated_accum_axis.dyn_size_ext, indices=backrefs) - masked_self_att_expand_dim_dyn_size_ext = rf.where( - update_state_mask, - updated_self_att_expand_dim_dyn_size_ext, - updated_self_att_expand_dim_dyn_size_ext - 1 - ) - masked_self_att_expand_dim = Dim(masked_self_att_expand_dim_dyn_size_ext, name="self_att_expand_dim_init") - lm_state[state].self_att.accum_axis = masked_self_att_expand_dim - - def _mask_lm_state(tensor: rf.Tensor): - tensor = rf.gather(tensor, indices=backrefs) - tensor = tensor.copy_transpose( - [updated_accum_axis] + tensor.remaining_dims(updated_accum_axis)) - tensor_raw = tensor.raw_tensor - tensor_raw = tensor_raw[:rf.reduce_max( - masked_self_att_expand_dim_dyn_size_ext, - axis=masked_self_att_expand_dim_dyn_size_ext.dims - ).raw_tensor.item()] - tensor = tensor.copy_template_replace_dim_tag( - tensor.get_axis_from_description(updated_accum_axis), masked_self_att_expand_dim - ) - tensor.raw_tensor = tensor_raw - return tensor - - lm_state[state].self_att.k_accum = 
_mask_lm_state(lm_state_updated[state].self_att.k_accum) - lm_state[state].self_att.v_accum = _mask_lm_state(lm_state_updated[state].self_att.v_accum) - - # lm_state[state].self_att.k_accum = rf.gather(lm_state_updated[state].self_att.k_accum, indices=backrefs) - # lm_state[state].self_att.k_accum = lm_state[state].self_att.k_accum.copy_transpose( - # [updated_accum_axis] + lm_state[state].self_att.k_accum.remaining_dims(updated_accum_axis)) - # k_accum_raw = lm_state[state].self_att.k_accum.raw_tensor - # k_accum_raw = k_accum_raw[:rf.reduce_max( - # masked_self_att_expand_dim_dyn_size_ext, - # axis=masked_self_att_expand_dim_dyn_size_ext.dims - # ).raw_tensor.item()] - # lm_state[state].self_att.k_accum = lm_state[state].self_att.k_accum.copy_template_replace_dim_tag( - # lm_state[state].self_att.k_accum.get_axis_from_description(updated_accum_axis), masked_self_att_expand_dim - # ) - # lm_state[state].self_att.k_accum.raw_tensor = k_accum_raw - # - # lm_state[state].self_att.v_accum = rf.gather(lm_state_updated[state].self_att.v_accum, indices=backrefs) - # lm_state[state].self_att.v_accum = lm_state[state].self_att.v_accum.copy_transpose( - # [updated_accum_axis] + lm_state[state].self_att.v_accum.remaining_dims(updated_accum_axis)) - # v_accum_raw = lm_state[state].self_att.v_accum.raw_tensor - # v_accum_raw = v_accum_raw[:rf.reduce_max( - # masked_self_att_expand_dim_dyn_size_ext, - # axis=masked_self_att_expand_dim_dyn_size_ext.dims - # ).raw_tensor.item()] - # lm_state[state].self_att.v_accum = lm_state[state].self_att.v_accum.copy_template_replace_dim_tag( - # lm_state[state].self_att.v_accum.get_axis_from_description(updated_accum_axis), masked_self_att_expand_dim - # ) - # lm_state[state].self_att.v_accum.raw_tensor = v_accum_raw + label_decoder_state, blank_decoder_state, lm_state = update_state( + model=model, + update_state_mask=update_state_mask, + backrefs=backrefs, + label_decoder_state=label_decoder_state, + 
label_decoder_state_updated=label_decoder_state_updated, + blank_decoder_state=blank_decoder_state, + blank_decoder_state_updated=blank_decoder_state_updated, + lm_state=lm_state, + lm_state_updated=lm_state_updated, + ) i += 1 @@ -342,9 +433,6 @@ def _mask_lm_state(tensor: rf.Tensor): use_sum=use_recombination == "sum" ) - # print("seq_log_prob: ", seq_log_prob.raw_tensor) - # exit() - # Backtrack via backrefs, resolve beams. seq_targets_ = [] indices = rf.range_over_dim(beam_dim) # FinalBeam -> FinalBeam diff --git a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/recombination.py b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/recombination.py index b8017325c..20225e6a5 100644 --- a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/recombination.py +++ b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/recombination.py @@ -68,12 +68,12 @@ def recombine_seqs_train( label_log_prob: Tensor, label_indices: Tensor, ground_truth: Tensor, - labels_padded_spatial_sizes: Tensor, target_dim: Dim, single_col_dim: Dim, beam_dim: Dim, batch_dims: Sequence[Dim], blank_idx: int, + use_sum_recombination: bool = True, ) -> Tensor: # local horizontal scores for each hyp: [B, beam] horizontal_scores = rf.gather( @@ -176,22 +176,30 @@ def recombine_seqs_train( rf.constant(-1.0e30, dims=batch_dims) ) + # for each hyp, merge horizontal and diagonal scores into one column + # i.e. 
[-inf, ..., d, ..., -inf] + [-inf, ..., h, ..., -inf] -> [-inf, ..., d, h, ..., -inf] best_scores = rf.maximum(horizontal_scores, diagonal_scores) - sum_scores = rf.exp(horizontal_scores - best_scores) + rf.exp(diagonal_scores - best_scores) - sum_scores = best_scores + rf.safe_log(sum_scores) + merged_scores = rf.exp(horizontal_scores - best_scores) + rf.exp(diagonal_scores - best_scores) + merged_scores = best_scores + rf.safe_log(merged_scores) - best_scores = rf.reduce_max(sum_scores, axis=beam_dim) - is_max = sum_scores == best_scores - sum_scores = best_scores + rf.log(rf.reduce_sum(rf.exp(sum_scores - best_scores), axis=beam_dim)) + # recombine the scores of different hypotheses + best_scores = rf.reduce_max(merged_scores, axis=beam_dim) + is_max = merged_scores == best_scores - sum_scores = rf.expand_dim(sum_scores, beam_dim) - sum_scores = rf.where( + if use_sum_recombination: + recombined_score = best_scores + rf.log(rf.reduce_sum(rf.exp(merged_scores - best_scores), axis=beam_dim)) + else: + recombined_score = best_scores + + # add back the beam dimension and set the score of the worse hypotheses to -1.0e30 + recombined_score = rf.expand_dim(recombined_score, beam_dim) + recombined_score = rf.where( is_max, - sum_scores, + recombined_score, rf.constant(-1.0e30, dims=batch_dims) ) - return sum_scores + return recombined_score def update_seq_hash(seq_hash: Tensor, target: Tensor, backrefs: Tensor, blank_idx: int) -> Tensor: diff --git a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/train.py b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/train.py index bb5c56b40..1de1c0047 100644 --- a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/train.py +++ 
b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/train.py @@ -1,10 +1,14 @@ +from typing import Optional, Dict, List import torch +import os from i6_experiments.users.schmitt.returnn_frontend.model_interfaces.training import FramewiseTrainDef, FullSumTrainDef from returnn.tensor import TensorDict +from returnn.datasets.hdf import SimpleHDFWriter from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.network_builder_rf.segmental import utils +from i6_experiments.users.schmitt import hdf from i6_experiments.users.schmitt.augmentation.alignment import shift_alignment_boundaries_batched from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.network_builder_rf.segmental.model import SegmentalAttentionModel from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.network_builder_rf.segmental.model_new.label_model.train import ( @@ -16,14 +20,8 @@ from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.network_builder_rf.segmental.model_new.label_model.train import ( full_sum_training as label_model_full_sum_training ) -from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.network_builder_rf.segmental.model_new.label_model.train import ( - full_sum_training_w_beam as label_model_full_sum_training_w_beam -) -from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.network_builder_rf.segmental.model_new.label_model.train import ( - full_sum_training_w_beam_eff as label_model_full_sum_training_w_beam_eff -) -from 
i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.network_builder_rf.segmental.model_new.label_model.train import ( - full_sum_training_w_beam_eff_w_recomb as label_model_full_sum_training_w_beam_eff_w_recomb +from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.network_builder_rf.segmental.realignment import ( + model_realign ) from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.network_builder_rf.segmental.model_new.blank_model.train import ( viterbi_training as blank_model_viterbi_training @@ -87,6 +85,13 @@ def _returnn_v2_full_sum_train_step(*, model, extern_data: TensorDict, **_kwargs data_spatial_dim = data.get_time_dim_tag() targets = extern_data[default_target_key] targets_spatial_dim = targets.get_time_dim_tag() + interpolation_alignment = extern_data.data.get("interpolation_alignment") + if interpolation_alignment is not None: + interpolation_alignment_spatial_dim = interpolation_alignment.get_time_dim_tag() + else: + interpolation_alignment_spatial_dim = None + interpolation_alignment_scores = extern_data.data.get("interpolation_alignment_scores") + train_def: FullSumTrainDef = config.typed_value("_train_def") train_def( model=model, @@ -94,6 +99,10 @@ def _returnn_v2_full_sum_train_step(*, model, extern_data: TensorDict, **_kwargs data_spatial_dim=data_spatial_dim, non_blank_targets=targets, non_blank_targets_spatial_dim=targets_spatial_dim, + seq_tags=extern_data["seq_tag"], + interpolation_alignment=interpolation_alignment, + interpolation_alignment_spatial_dim=interpolation_alignment_spatial_dim, + interpolation_alignment_scores=interpolation_alignment_scores, ) @@ -103,9 +112,15 @@ def viterbi_training( data: rf.Tensor, data_spatial_dim: Dim, align_targets: rf.Tensor, - align_targets_spatial_dim: Dim + align_targets_spatial_dim: Dim, + enc_args: Optional[Dict[str, 
rf.Tensor]] = None, + enc_spatial_dim: Optional[Dim] = None, + batch_dims: Optional[List[Dim]] = None, + beam_dim: Optional[Dim] = None, ): - """Function is run within RETURNN.""" + if enc_args is not None: + assert enc_spatial_dim is not None + from returnn.config import get_global_config config = get_global_config() # noqa @@ -119,7 +134,8 @@ def viterbi_training( data = rf.squeeze(data, axis=data.feature_dim) assert not data.feature_dim # raw audio - batch_dims = data.remaining_dims(data_spatial_dim) + if batch_dims is None: + batch_dims = data.remaining_dims(data_spatial_dim) alignment_augmentation_opts = config.typed_value("alignment_augmentation_opts", None) if alignment_augmentation_opts is not None: @@ -165,53 +181,54 @@ def viterbi_training( non_blank_targets_spatial_dim ) - # ------------------- encoder aux loss ------------------- - - collected_outputs = {} - enc_args, enc_spatial_dim = model.encoder.encode( - data, in_spatial_dim=data_spatial_dim, collected_outputs=collected_outputs) - - if aux_loss_layers: - if use_ctc_loss: - for i, layer_idx in enumerate(aux_loss_layers): - if layer_idx > len(model.encoder.layers): - continue + if enc_args is None: + # ------------------- encoder aux loss ------------------- + + collected_outputs = {} + enc_args, enc_spatial_dim = model.encoder.encode( + data, in_spatial_dim=data_spatial_dim, collected_outputs=collected_outputs) + + if aux_loss_layers: + if use_ctc_loss: + for i, layer_idx in enumerate(aux_loss_layers): + if layer_idx > len(model.encoder.layers): + continue + linear = getattr(model.encoder, f"enc_aux_logits_{layer_idx}") + aux_logits = linear(collected_outputs[str(layer_idx - 1)]) + aux_loss = rf.ctc_loss( + logits=aux_logits, + targets=non_blank_targets, + input_spatial_dim=enc_spatial_dim, + targets_spatial_dim=non_blank_targets_spatial_dim, + blank_index=model.blank_idx, + ) + aux_loss.mark_as_loss( + f"ctc_{layer_idx}", + scale=aux_loss_scales[i], + 
custom_inv_norm_factor=align_targets_spatial_dim.get_size_tensor(), + use_normalized_loss=True, + ) + elif generate_ctc_alignments_on_the_fly: + assert len(aux_loss_layers) == 1 + assert len(batch_dims) == 1 + assert model.blank_idx == model.target_dim.dimension + print("Generating CTC alignments on the fly") + layer_idx = aux_loss_layers[0] linear = getattr(model.encoder, f"enc_aux_logits_{layer_idx}") - aux_logits = linear(collected_outputs[str(layer_idx - 1)]) - aux_loss = rf.ctc_loss( - logits=aux_logits, - targets=non_blank_targets, - input_spatial_dim=enc_spatial_dim, - targets_spatial_dim=non_blank_targets_spatial_dim, - blank_index=model.blank_idx, - ) - aux_loss.mark_as_loss( - f"ctc_{layer_idx}", - scale=aux_loss_scales[i], - custom_inv_norm_factor=align_targets_spatial_dim.get_size_tensor(), - use_normalized_loss=True, + aux_logits = linear(collected_outputs[str(layer_idx - 1)]) # type: rf.Tensor + print("aux_logits", aux_logits) + + from torchaudio.functional import forced_align + rem_dims = aux_logits.remaining_dims(batch_dims + [enc_spatial_dim]) + ctc_align = forced_align( + log_probs=aux_logits.copy_transpose(batch_dims + [enc_spatial_dim] + rem_dims).raw_tensor, + targets=non_blank_targets.copy_transpose(batch_dims + [non_blank_targets_spatial_dim]).raw_tensor.contiguous(), + input_lengths=enc_spatial_dim.get_size_tensor().raw_tensor, + target_lengths=non_blank_targets_spatial_dim.get_size_tensor().raw_tensor, + blank=model.blank_idx, ) - elif generate_ctc_alignments_on_the_fly: - assert len(aux_loss_layers) == 1 - assert len(batch_dims) == 1 - assert model.blank_idx == model.target_dim.dimension - print("Generating CTC alignments on the fly") - layer_idx = aux_loss_layers[0] - linear = getattr(model.encoder, f"enc_aux_logits_{layer_idx}") - aux_logits = linear(collected_outputs[str(layer_idx - 1)]) # type: rf.Tensor - print("aux_logits", aux_logits) - - from torchaudio.functional import forced_align - rem_dims = 
aux_logits.remaining_dims(batch_dims + [enc_spatial_dim]) - ctc_align = forced_align( - log_probs=aux_logits.copy_transpose(batch_dims + [enc_spatial_dim] + rem_dims).raw_tensor, - targets=non_blank_targets.copy_transpose(batch_dims + [non_blank_targets_spatial_dim]).raw_tensor.contiguous(), - input_lengths=enc_spatial_dim.get_size_tensor().raw_tensor, - target_lengths=non_blank_targets_spatial_dim.get_size_tensor().raw_tensor, - blank=model.blank_idx, - ) - print("ctc_align", ctc_align.shape) - exit() + print("ctc_align", ctc_align.shape) + exit() if model.use_joint_model: @@ -256,6 +273,7 @@ def viterbi_training( ce_targets=align_targets, ce_spatial_dim=align_targets_spatial_dim, batch_dims=batch_dims, + beam_dim=beam_dim, ) else: @@ -375,11 +393,17 @@ def full_sum_training( data: rf.Tensor, data_spatial_dim: Dim, non_blank_targets: rf.Tensor, - non_blank_targets_spatial_dim: Dim + non_blank_targets_spatial_dim: Dim, + seq_tags: rf.Tensor, + interpolation_alignment: Optional[rf.Tensor] = None, + interpolation_alignment_spatial_dim: Optional[Dim] = None, + interpolation_alignment_scores: Optional[rf.Tensor] = None, ): assert model.use_joint_model assert isinstance(model.label_decoder, SegmentalAttEfficientLabelDecoder) assert model.label_decoder_state == "nb-lstm" + if interpolation_alignment is not None: + assert interpolation_alignment_scores is not None from returnn.config import get_global_config @@ -387,13 +411,14 @@ def full_sum_training( aux_loss_layers = config.typed_value("aux_loss_layers") aux_loss_scales = config.typed_value("aux_loss_scales", ([1.0] * len(aux_loss_layers)) if aux_loss_layers else None) - full_sum_training_beam_size = config.int("full_sum_training_beam_size", None) + full_sum_beam_size = config.int("full_sum_beam_size", None) if data.feature_dim and data.feature_dim.dimension == 1: data = rf.squeeze(data, axis=data.feature_dim) assert not data.feature_dim # raw audio batch_dims = data.remaining_dims(data_spatial_dim) + assert 
len(batch_dims) == 1 # ------------------- encoder aux loss ------------------- @@ -431,20 +456,151 @@ def full_sum_training( enc_spatial_dim ) - # ------------------- joint loop ------------------- + if full_sum_beam_size: + downsampling = config.int("full_sum_lattice_downsampling", 1) + precompute_chunk_size = config.int("full_sum_precompute_chunk_size", 10) + interpolation_alignment_factor = config.float("full_sum_alignment_interpolation_factor", 0.0) + partition_epoch = config.int("train_partition_epoch", 20) + train_on_viterbi_paths = config.bool("full_sum_train_on_viterbi_paths", False) + + # do not use interpolation alignment for eval datasets + if not rf.get_run_ctx().train_flag or interpolation_alignment_factor == 0.0: + interpolation_alignment_factor = 0.0 + # in the first full epoch (20 subepochs), use linear alignment for interpolation + elif 1 <= rf.get_run_ctx().epoch <= partition_epoch: + interpolation_alignment = utils.get_linear_alignment( + non_blank_targets=non_blank_targets, + non_blank_targets_spatial_dim=non_blank_targets_spatial_dim, + enc_spatial_dim=enc_spatial_dim, + batch_dims=batch_dims, + blank_idx=model.blank_idx, + ) + # log(uniform) * T + interpolation_alignment_scores = rf.copy_to_device( + enc_spatial_dim.get_size_tensor(), device=data.device) * rf.log( + rf.convert_to_tensor(1 / model.target_dim.dimension)) + # otherwise, use given interpolation alignment + else: + # set spatial dim to enc_spatial_dim + interpolation_alignment = utils.copy_tensor_replace_dim_tag( + interpolation_alignment, interpolation_alignment_spatial_dim, enc_spatial_dim) + interpolation_alignment_scores = rf.squeeze( + interpolation_alignment_scores, axis=interpolation_alignment_scores.feature_dim) + + if train_on_viterbi_paths: + with torch.no_grad(): + viterbi_alignment_scores, viterbi_alignment, viterbi_alignment_spatial_dim = model_realign( + model=model.label_decoder, + enc=enc_args["enc"], + enc_ctx=enc_args["enc_ctx"], + 
enc_spatial_dim=enc_spatial_dim, + non_blank_targets=non_blank_targets, + non_blank_targets_spatial_dim=non_blank_targets_spatial_dim, + segment_starts=segment_starts, + segment_lens=segment_lens, + batch_dims=batch_dims, + beam_size=100 if full_sum_beam_size == 1 else full_sum_beam_size, + downsampling=downsampling if rf.get_run_ctx().train_flag else 1, + precompute_chunk_size=precompute_chunk_size, + interpolation_alignment=interpolation_alignment, + interpolation_alignment_factor=interpolation_alignment_factor, + use_recombination="max" if full_sum_beam_size == 1 else None, + return_realignment=True, + ) - if full_sum_training_beam_size: - label_model_full_sum_training_w_beam_eff_w_recomb( - model=model.label_decoder, - enc_args=enc_args, - enc_spatial_dim=enc_spatial_dim, - non_blank_targets=non_blank_targets, - non_blank_targets_spatial_dim=non_blank_targets_spatial_dim, - segment_starts=segment_starts, - segment_lens=segment_lens, - batch_dims=batch_dims, - beam_size=full_sum_training_beam_size - ) + if full_sum_beam_size > 1: + beam_dim = viterbi_alignment.remaining_dims(batch_dims + [viterbi_alignment_spatial_dim]) + assert len(beam_dim) == 1 + beam_dim = beam_dim[0] + else: + beam_dim = None + + viterbi_training( + model=model, + data=data, + data_spatial_dim=data_spatial_dim, + align_targets=viterbi_alignment, + align_targets_spatial_dim=viterbi_alignment_spatial_dim, + enc_args=enc_args, + enc_spatial_dim=enc_spatial_dim, + batch_dims=(batch_dims + [beam_dim]) if beam_dim is not None else batch_dims, + beam_dim=beam_dim, + ) + else: + # full-sum loss with beam search + seq_log_prob, _, _ = model_realign( + model=model.label_decoder, + enc=enc_args["enc"], + enc_ctx=enc_args["enc_ctx"], + enc_spatial_dim=enc_spatial_dim, + non_blank_targets=non_blank_targets, + non_blank_targets_spatial_dim=non_blank_targets_spatial_dim, + segment_starts=segment_starts, + segment_lens=segment_lens, + batch_dims=batch_dims, + beam_size=full_sum_beam_size, + 
downsampling=downsampling if rf.get_run_ctx().train_flag else 1, + precompute_chunk_size=precompute_chunk_size, + interpolation_alignment=interpolation_alignment, + interpolation_alignment_factor=interpolation_alignment_factor, + use_recombination=None, + ) + loss = -1 * seq_log_prob + loss.mark_as_loss("full_sum_loss", scale=1.0, use_normalized_loss=True) + + # in training, do realignment and update previous interpolation alignment if realignment is better + if rf.get_run_ctx().train_flag and interpolation_alignment_factor > 0.0: + with torch.no_grad(): + viterbi_alignment_scores, viterbi_alignment, viterbi_alignment_spatial_dim = model_realign( + model=model.label_decoder, + enc=enc_args["enc"], + enc_ctx=enc_args["enc_ctx"], + enc_spatial_dim=enc_spatial_dim, + non_blank_targets=non_blank_targets, + non_blank_targets_spatial_dim=non_blank_targets_spatial_dim, + segment_starts=segment_starts, + segment_lens=segment_lens, + batch_dims=batch_dims, + beam_size=100, + downsampling=1, + precompute_chunk_size=10, + interpolation_alignment=None, + interpolation_alignment_factor=0.0, + use_recombination="max", + return_realignment=True, + ) + + interpolation_alignment = rf.where( + viterbi_alignment_scores > interpolation_alignment_scores, + viterbi_alignment, + interpolation_alignment + ) + interpolation_alignment_scores = rf.where( + viterbi_alignment_scores > interpolation_alignment_scores, + viterbi_alignment_scores, + interpolation_alignment_scores + ) + + for name, tensor, dim, ndim in zip( + ("interpolation-alignment", "interpolation-alignment-scores"), + (interpolation_alignment, interpolation_alignment_scores), + (model.target_dim.dimension, 1), + (1, 0) + ): + # dump the new interpolation alignment to hdf + hdf_filename = f"{name}_full-epoch-{(rf.get_run_ctx().epoch - 1) // partition_epoch + 1}.hdf" + hdf_dataset = SimpleHDFWriter( + filename=hdf_filename, + dim=dim, + ndim=ndim, + extend_existing_file=os.path.exists(hdf_filename), + ) + hdf.dump_hdf_rf( + 
hdf_dataset=hdf_dataset, + data=tensor, + batch_dim=batch_dims[0], + seq_tags=seq_tags, + ) else: label_model_full_sum_training( model=model.label_decoder, diff --git a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/utils.py b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/utils.py index 73da27aa7..7654a10e6 100644 --- a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/utils.py +++ b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/utils.py @@ -21,7 +21,7 @@ def get_masked( else: new_lens = rf.copy_to_device(result_spatial_dim.get_size_tensor(), input.device) # max number of non-blank targets in the batch - result_spatial_size = rf.cast(rf.reduce_max(new_lens, axis=batch_dims), "int32") + result_spatial_size = rf.cast(rf.reduce_max(new_lens, axis=new_lens.dims), "int32") mask_axis = mask.get_axis_from_description(mask_dim) # scatter indices idxs = rf.cast(mask, "int32").copy_template() @@ -74,7 +74,8 @@ def get_segment_starts_and_lens( out_spatial_dim: Dim ): targets_range = rf.range_over_dim(align_targets_spatial_dim, dtype="int32") - targets_range = rf.expand_dim(targets_range, batch_dims[0]) + for batch_dim in batch_dims: + targets_range = rf.expand_dim(targets_range, batch_dim) non_blank_positions, _ = get_masked( targets_range, non_blank_mask, align_targets_spatial_dim, batch_dims, out_spatial_dim ) @@ -101,3 +102,68 @@ def get_emit_ground_truth( torch.set_printoptions(threshold=10000) return result, sparse_dim + + +def copy_tensor_replace_dim_tag(tensor: Tensor, old_dim_tag: Dim, new_dim_tag: Dim): + tensor_raw = tensor.raw_tensor + tensor = tensor.copy_template_replace_dim_tag( + tensor.get_axis_from_description(old_dim_tag), new_dim_tag + ) + 
tensor.raw_tensor = tensor_raw + return tensor + + +def get_linear_alignment( + non_blank_targets: Tensor, + non_blank_targets_spatial_dim: Dim, + enc_spatial_dim: Dim, + batch_dims: Sequence[Dim], + blank_idx: int, +): + enc_spatial_sizes = rf.copy_to_device(enc_spatial_dim.get_size_tensor(), non_blank_targets.device) + non_blank_targets_spatial_sizes = rf.copy_to_device( + non_blank_targets_spatial_dim.get_size_tensor(), non_blank_targets.device) + + # linearly distributed label positions over encoder dimension + linear_label_positions = rf.range_over_dim(non_blank_targets_spatial_dim) + linear_label_positions = rf.cast( + (linear_label_positions + 0.5) * (enc_spatial_sizes / non_blank_targets_spatial_sizes), + "int32" + ) + # set positions, which are too large, to T+1 (cut off this dummy frame later) + linear_label_positions = rf.where( + linear_label_positions < enc_spatial_sizes, + linear_label_positions, + enc_spatial_sizes + ) + enc_spatial_dim_ext = enc_spatial_dim + 1 + linear_label_positions.sparse_dim = enc_spatial_dim_ext + + # scatter non-blank targets into zero tensor + linear_alignment = rf.scatter( + non_blank_targets, + indices=linear_label_positions, + indices_dim=non_blank_targets_spatial_dim, + out_dim=enc_spatial_dim_ext, + ) + # replace all non-blank frames with blank + linear_label_positions_scattered = rf.scatter( + linear_label_positions, + indices=linear_label_positions, + indices_dim=non_blank_targets_spatial_dim, + out_dim=enc_spatial_dim_ext, + ) + linear_alignment = rf.where( + rf.range_over_dim(enc_spatial_dim_ext) != linear_label_positions_scattered, + blank_idx, + linear_alignment + ) + # cut off dummy frame + linear_alignment = linear_alignment.copy_transpose([enc_spatial_dim_ext] + batch_dims) + linear_alignment_raw = linear_alignment.raw_tensor + linear_alignment = linear_alignment.copy_template_replace_dim_tag( + linear_alignment.get_axis_from_description(enc_spatial_dim_ext), + enc_spatial_dim, + ) + linear_alignment.raw_tensor 
= linear_alignment_raw[:-1] + return linear_alignment diff --git a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/center_window_att/baseline_v3/__init__.py b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/center_window_att/baseline_v3/__init__.py index 8be9edbfb..dadf251bd 100644 --- a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/center_window_att/baseline_v3/__init__.py +++ b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/center_window_att/baseline_v3/__init__.py @@ -10,77 +10,73 @@ def run_exps(): + # ------------------------------------- ctx-1 models ------------------------------------- + for model_alias, config_builder in baseline.center_window_att_baseline_rf( - win_size_list=(5,), blank_decoder_version=4, + win_size_list=(5,), label_decoder_state="nb-linear1", blank_decoder_version=5 ): - for train_alias, checkpoint in train.train_center_window_att_viterbi_import_global_tf( - alias=model_alias, - config_builder=config_builder, - n_epochs_list=(10,), - const_lr_list=(1e-4,), - ): - recog.center_window_returnn_frame_wise_beam_search( - alias=train_alias, + for import_model_name in ("glob.conformer.mohammad.5.4",): + for train_alias, checkpoint in train.train_center_window_att_viterbi_import_global_tf( + alias=model_alias, config_builder=config_builder, - checkpoint=checkpoint, - ) + n_epochs_list=(100,), + const_lr_list=(1e-4,), + import_model_name=import_model_name, + ): + recog.center_window_returnn_frame_wise_beam_search( + alias=train_alias, + config_builder=config_builder, + checkpoint=checkpoint, + ) - # for model_alias, config_builder in baseline.center_window_att_baseline_rf( - # win_size_list=(5,), blank_decoder_version=5, - # ): - # for train_alias, checkpoint in train.train_center_window_att_viterbi_import_global_tf( - # 
alias=model_alias, - # config_builder=config_builder, - # n_epochs_list=(10,), - # const_lr_list=(1e-4,), - # ): - # pass - # recog.center_window_returnn_frame_wise_beam_search( - # alias=train_alias, - # config_builder=config_builder, - # checkpoint=checkpoint, - # ) + for model_alias, config_builder in baseline.center_window_att_baseline_rf( + win_size_list=(5,), label_decoder_state="nb-linear1", blank_decoder_version=3 + ): + for import_model_name in ("glob.conformer.mohammad.5.4",): + for train_alias, checkpoint in train.train_center_window_att_viterbi_import_global_tf( + alias=model_alias, + config_builder=config_builder, + n_epochs_list=(100,), + const_lr_list=(1e-4,), + import_model_name=import_model_name, + ): + recog.center_window_returnn_frame_wise_beam_search( + alias=train_alias, + config_builder=config_builder, + checkpoint=checkpoint, + ) - # for model_alias, config_builder in baseline.center_window_att_baseline_rf( - # win_size_list=(5,), blank_decoder_version=6, - # ): - # for train_alias, checkpoint in train.train_center_window_att_viterbi_import_global_tf( - # alias=model_alias, - # config_builder=config_builder, - # n_epochs_list=(10,), - # const_lr_list=(1e-4,), - # ): - # pass - # recog.center_window_returnn_frame_wise_beam_search( - # alias=train_alias, - # config_builder=config_builder, - # checkpoint=checkpoint, - # ) + for model_alias, config_builder in baseline.center_window_att_baseline_rf( + win_size_list=(5,), + label_decoder_state="nb-linear1", + blank_decoder_version=5, + use_weight_feedback=False, + use_att_ctx_in_state=False + ): + for import_model_name in ("glob.conformer.mohammad.5.4",): + for train_alias, checkpoint in train.train_center_window_att_viterbi_import_global_tf( + alias=model_alias, + config_builder=config_builder, + n_epochs_list=(100,), + const_lr_list=(1e-4,), + import_model_name=import_model_name, + ): + recog.center_window_returnn_frame_wise_beam_search( + alias=train_alias, + config_builder=config_builder, + 
checkpoint=checkpoint, + ) - # for model_alias, config_builder in baseline.center_window_att_baseline_rf( - # win_size_list=(5,) - # ): - # for max_shift, num_iterations in [(1, 1), (2, 1), (1, 2)]: - # for train_alias, checkpoint in train.train_center_window_att_viterbi_import_global_tf( - # alias=model_alias, - # config_builder=config_builder, - # n_epochs_list=(100,), - # const_lr_list=(1e-4,), - # alignment_augmentation_opts={"max_shift": max_shift, "num_iterations": num_iterations}, - # ): - # recog.center_window_returnn_frame_wise_beam_search( - # alias=train_alias, - # config_builder=config_builder, - # checkpoint=checkpoint, - # ) + # ------------------------------------- blank decoder variants ------------------------------------- for model_alias, config_builder in baseline.center_window_att_baseline_rf( - win_size_list=(5,), + win_size_list=(5,), blank_decoder_version=5, ): - for train_alias, checkpoint in train.train_center_window_att_viterbi_from_scratch( + for train_alias, checkpoint in train.train_center_window_att_viterbi_import_global_tf( alias=model_alias, config_builder=config_builder, - n_epochs_list=(500,), + n_epochs_list=(100,), + const_lr_list=(1e-4,), ): recog.center_window_returnn_frame_wise_beam_search( alias=train_alias, @@ -88,24 +84,8 @@ def run_exps(): checkpoint=checkpoint, ) - # for model_alias, config_builder in baseline.center_window_att_baseline_rf( - # win_size_list=(5,), - # ): - # for train_alias, checkpoint in train.train_center_window_att_viterbi_from_scratch( - # alias=model_alias, - # config_builder=config_builder, - # n_epochs_list=(500,), - # use_speed_pert=True, - # ): - # recog.center_window_returnn_frame_wise_beam_search( - # alias=train_alias, - # config_builder=config_builder, - # checkpoint=checkpoint, - # ) - - # ------------------------------------- best models: KEEP! 
------------------------------------- for model_alias, config_builder in baseline.center_window_att_baseline_rf( - win_size_list=(5, 129), + win_size_list=(5,), blank_decoder_version=6, ): for train_alias, checkpoint in train.train_center_window_att_viterbi_import_global_tf( alias=model_alias, @@ -119,20 +99,85 @@ def run_exps(): checkpoint=checkpoint, ) + # ------------------------------------- from-scratch Viterbi training ------------------------------------- + for model_alias, config_builder in baseline.center_window_att_baseline_rf( win_size_list=(5,), ): - for train_alias, checkpoint in train.train_center_window_att_viterbi_import_global_tf( + for train_alias, checkpoint in train.train_center_window_att_viterbi_from_scratch( alias=model_alias, config_builder=config_builder, - n_epochs_list=(100,), - const_lr_list=(1e-4,), + n_epochs_list=(500,), ): - for use_recombination in {"max", "sum"}: + recog.center_window_returnn_frame_wise_beam_search( + alias=train_alias, + config_builder=config_builder, + checkpoint=checkpoint, + ) + + # ------------------------------------- best models: KEEP! 
------------------------------------- + + for use_weight_feedback in (True, False): + for use_att_ctx_in_state in (True, False): + for model_alias, config_builder in baseline.center_window_att_baseline_rf( + win_size_list=(5,), use_weight_feedback=use_weight_feedback, use_att_ctx_in_state=use_att_ctx_in_state + ): + for import_model_name in ("glob.conformer.mohammad.5.4",): + for train_alias, checkpoint in train.train_center_window_att_viterbi_import_global_tf( + alias=model_alias, + config_builder=config_builder, + n_epochs_list=(100,), + const_lr_list=(1e-4,), + import_model_name=import_model_name, + ): + recog.center_window_returnn_frame_wise_beam_search( + alias=train_alias, + config_builder=config_builder, + checkpoint=checkpoint, + ) + if "129" not in model_alias: + recog.center_window_returnn_frame_wise_beam_search( + alias=train_alias, + config_builder=config_builder, + checkpoint=checkpoint, + checkpoint_aliases=("last",), + # lm_type="trafo", + # lm_scale_list=(0.4,), + ) + if use_weight_feedback and use_att_ctx_in_state: + recog.center_window_returnn_frame_wise_beam_search( + alias=train_alias, + config_builder=config_builder, + checkpoint=checkpoint, + checkpoint_aliases=("last",), + lm_type="trafo", + lm_scale_list=(0.54,), + ilm_type="mini_att", + ilm_scale_list=(0.4,), + ) + + for model_alias, config_builder in baseline.center_window_att_baseline_rf( + win_size_list=(129,), + ): + for import_model_name in ("glob.conformer.mohammad.5.4",): + for train_alias, checkpoint in train.train_center_window_att_viterbi_import_global_tf( + alias=model_alias, + config_builder=config_builder, + n_epochs_list=(100,), + const_lr_list=(1e-4,), + import_model_name=import_model_name, + ): recog.center_window_returnn_frame_wise_beam_search( alias=train_alias, config_builder=config_builder, checkpoint=checkpoint, - checkpoint_aliases=("best-4-avg",), - use_recombination=use_recombination, ) + if "129" not in model_alias: + 
recog.center_window_returnn_frame_wise_beam_search( + alias=train_alias, + config_builder=config_builder, + checkpoint=checkpoint, + checkpoint_aliases=("last",), + # lm_type="trafo", + # lm_scale_list=(0.4,), + ) diff --git a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/center_window_att/baseline_v3/baseline.py b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/center_window_att/baseline_v3/baseline.py index 8068a8061..e39569b0a 100644 --- a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/center_window_att/baseline_v3/baseline.py +++ b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/center_window_att/baseline_v3/baseline.py @@ -9,6 +9,7 @@ def center_window_att_baseline_rf( use_att_ctx_in_state: bool = True, use_weight_feedback: bool = True, blank_decoder_version: int = 3, + label_decoder_state: str = "nb-lstm", ): for win_size in win_size_list: alias, config_builder = get_center_window_att_config_builder_rf( @@ -17,6 +18,7 @@ def center_window_att_baseline_rf( blank_decoder_version=blank_decoder_version, use_joint_model=False, use_weight_feedback=use_weight_feedback, + label_decoder_state=label_decoder_state, ) alias = f"{base_alias}/baseline_rf/{alias}" yield alias, config_builder diff --git a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/center_window_att/baseline_v4/__init__.py b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/center_window_att/baseline_v4/__init__.py index 478e6957d..da72bdf7e 100644 --- a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/center_window_att/baseline_v4/__init__.py +++ 
b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/center_window_att/baseline_v4/__init__.py @@ -2,104 +2,166 @@ baseline, ) from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.pipelines.pipeline_ls_conf.center_window_att import ( - train, recog + train, recog, realign ) def run_exps(): for model_alias, config_builder in baseline.center_window_att_baseline_rf( - win_size_list=(5,), label_decoder_state="nb-lstm", use_att_ctx_in_state=False, use_weight_feedback=False, + win_size_list=(5,), label_decoder_state="nb-linear1", use_att_ctx_in_state=False, use_weight_feedback=False, ): - for train_alias, checkpoint in train.train_center_window_att_viterbi_import_global_tf( - alias=model_alias, - config_builder=config_builder, - n_epochs_list=(200, 300), - ): - recog.center_window_returnn_frame_wise_beam_search( - alias=train_alias, + for import_model_name in ("glob.conformer.mohammad.5.4",): + for train_alias, checkpoint in train.train_center_window_att_viterbi_import_global_tf( + alias=model_alias, config_builder=config_builder, - checkpoint=checkpoint, - ) + n_epochs_list=(100,), + import_model_name=import_model_name, + ): + for recombination in (None,): + recog.center_window_returnn_frame_wise_beam_search( + alias=train_alias, + config_builder=config_builder, + checkpoint=checkpoint, + # checkpoint_aliases=("last",), + use_recombination=recombination, + ) - for model_alias, config_builder in baseline.center_window_att_baseline_rf( - win_size_list=(5,), label_decoder_state="joint-lstm", use_att_ctx_in_state=False, use_weight_feedback=False, - ): - for train_alias, checkpoint in train.train_center_window_att_viterbi_import_global_tf( - alias=model_alias, - config_builder=config_builder, - n_epochs_list=(200, 300), - ): - recog.center_window_returnn_frame_wise_beam_search( - alias=train_alias, - config_builder=config_builder, - checkpoint=checkpoint, - ) + # for model_alias, 
config_builder in baseline.center_window_att_baseline_rf( + # win_size_list=(5,), label_decoder_state="joint-lstm", + # ): + # for import_model_name in ("glob.conformer.mohammad.5.4",): + # for train_alias, checkpoint in train.train_center_window_att_viterbi_import_global_tf( + # alias=model_alias, + # config_builder=config_builder, + # n_epochs_list=(100,), + # import_model_name=import_model_name, + # ): + # for recombination in (None,): + # recog.center_window_returnn_frame_wise_beam_search( + # alias=train_alias, + # config_builder=config_builder, + # checkpoint=checkpoint, + # # checkpoint_aliases=("last",), + # use_recombination=recombination, + # ) for model_alias, config_builder in baseline.center_window_att_baseline_rf( win_size_list=(5,), label_decoder_state="nb-lstm", use_att_ctx_in_state=False, use_weight_feedback=False, - bpe_vocab_size=1056, ): for train_alias, checkpoint in train.train_center_window_att_full_sum_from_scratch( alias=model_alias, config_builder=config_builder, n_epochs_list=(125,), use_speed_pert=True, - batch_size=3_000, + batch_size=8_000, + time_rqmt=80, + use_mgpu=False, + beam_size=4, + lattice_downsampling=1, + alignment_interpolation_factor=0.5, + ): + for epoch, chckpt in checkpoint["checkpoints"].items(): + realign.center_window_returnn_realignment( + alias=train_alias, + config_builder=config_builder, + checkpoint=chckpt, + checkpoint_alias=f"epoch-{epoch}", + plot=True, + ) + + for train_alias, checkpoint in train.train_center_window_att_full_sum_from_scratch( + alias=model_alias, + config_builder=config_builder, + n_epochs_list=(125,), + use_speed_pert=True, + batch_size=8_000, time_rqmt=80, use_mgpu=False, + beam_size=100, + lattice_downsampling=3, + alignment_interpolation_factor=0.0, ): - pass - # recog.center_window_returnn_frame_wise_beam_search( - # alias=train_alias, - # config_builder=config_builder, - # checkpoint=checkpoint, - # ) + for epoch, chckpt in checkpoint["checkpoints"].items(): + 
realign.center_window_returnn_realignment( + alias=train_alias, + config_builder=config_builder, + checkpoint=chckpt, + checkpoint_alias=f"epoch-{epoch}", + plot=True, + ) + + for train_alias, checkpoint in train.train_center_window_att_full_sum_from_scratch( + alias=model_alias, + config_builder=config_builder, + n_epochs_list=(125,), + use_speed_pert=True, + batch_size=8_000, + time_rqmt=80, + use_mgpu=False, + beam_size=1, + lattice_downsampling=1, + alignment_interpolation_factor=0.0, + train_on_viterbi_paths=True, + ): + for epoch, chckpt in checkpoint["checkpoints"].items(): + realign.center_window_returnn_realignment( + alias=train_alias, + config_builder=config_builder, + checkpoint=chckpt, + checkpoint_alias=f"epoch-{epoch}", + plot=True, + ) for model_alias, config_builder in baseline.center_window_att_baseline_rf( - win_size_list=(5,), + win_size_list=(15,), label_decoder_state="nb-lstm", use_att_ctx_in_state=False, use_weight_feedback=False, - bpe_vocab_size=1056, ): for train_alias, checkpoint in train.train_center_window_att_full_sum_from_scratch( alias=model_alias, config_builder=config_builder, n_epochs_list=(125,), use_speed_pert=True, - batch_size=3_000, - time_rqmt=1, - use_mgpu=True, + batch_size=8_000, + time_rqmt=80, + use_mgpu=False, + beam_size=100, + lattice_downsampling=8, + alignment_interpolation_factor=0.0, ): - pass - # recog.center_window_returnn_frame_wise_beam_search( - # alias=train_alias, - # config_builder=config_builder, - # checkpoint=checkpoint, - # ) + for epoch, chckpt in checkpoint["checkpoints"].items(): + realign.center_window_returnn_realignment( + alias=train_alias, + config_builder=config_builder, + checkpoint=chckpt, + checkpoint_alias=f"epoch-{epoch}", + plot=True, + ) for model_alias, config_builder in baseline.center_window_att_baseline_rf( - win_size_list=(1,), + win_size_list=(5,), label_decoder_state="nb-lstm", use_att_ctx_in_state=False, use_weight_feedback=False, + bpe_vocab_size=1056, ): for train_alias, 
checkpoint in train.train_center_window_att_full_sum_from_scratch( alias=model_alias, config_builder=config_builder, n_epochs_list=(125,), use_speed_pert=True, - batch_size=3_000, + batch_size=8_000, time_rqmt=80, use_mgpu=False, - beam_size=100 ): - pass - # recog.center_window_returnn_frame_wise_beam_search( - # alias=train_alias, - # config_builder=config_builder, - # checkpoint=checkpoint, - # ) + for epoch, chckpt in checkpoint["checkpoints"].items(): + realign.center_window_returnn_realignment( + alias=train_alias, + config_builder=config_builder, + checkpoint=chckpt, + checkpoint_alias=f"epoch-{epoch}", + ) diff --git a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/center_window_att/config_builder.py b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/center_window_att/config_builder.py index d43cc8813..8b876f61d 100644 --- a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/center_window_att/config_builder.py +++ b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/center_window_att/config_builder.py @@ -8,7 +8,7 @@ from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23.dependencies.general.returnn.exes import RETURNN_EXE_NEW, RETURNN_CURRENT_ROOT from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.config_builder_rf.base import SegmentalAttConfigBuilderRF -from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.network_builder_rf.segmental.model import from_scratch_model_def, _returnn_v2_get_model +from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.network_builder_rf.segmental.model import from_scratch_model_def, _returnn_v2_get_model, 
_returnn_v2_get_joint_model def get_center_window_att_config_builder_rf( @@ -35,16 +35,21 @@ def get_center_window_att_config_builder_rf( "returnn_root": RETURNN_CURRENT_ROOT } + if use_joint_model: + get_model_func = _returnn_v2_get_joint_model + else: + get_model_func = _returnn_v2_get_model + config_builder = SegmentalAttConfigBuilderRF( variant_params=variant_params, model_def=from_scratch_model_def, - get_model_func=_returnn_v2_get_model, + get_model_func=get_model_func, center_window_size=win_size, use_att_ctx_in_state=use_att_ctx_in_state, blank_decoder_version=blank_decoder_version, use_joint_model=use_joint_model, use_weight_feedback=use_weight_feedback, - label_decoder_state=label_decoder_state + label_decoder_state=label_decoder_state, ) alias = ( diff --git a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/center_window_att/realign.py b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/center_window_att/realign.py new file mode 100644 index 000000000..efa96d460 --- /dev/null +++ b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/center_window_att/realign.py @@ -0,0 +1,80 @@ +from typing import Tuple, Optional, List, Union, Dict +import copy + +from i6_core.returnn.training import PtCheckpoint +from i6_core.returnn.forward import ReturnnForwardJob, ReturnnForwardJobV2 +from sisyphus import Path, tk + +from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.config_builder_rf.base import SegmentalAttConfigBuilderRF +from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23.recog_new import ReturnnSegmentalAttDecodingPipeline, RasrSegmentalAttDecodingExperiment +from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23.realignment_new import RasrRealignmentExperiment +from 
i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.network_builder_rf.recog import _returnn_v2_forward_step, _returnn_v2_get_forward_callback +from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.network_builder_rf.segmental.realignment import model_realign_, _returnn_v2_forward_step, _returnn_v2_get_forward_callback +from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.network_builder_rf.segmental.model import _returnn_v2_get_joint_model +from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23.dependencies.general.returnn.exes import RETURNN_EXE_NEW, RETURNN_CURRENT_ROOT +from i6_experiments.users.schmitt.visualization.visualization import PlotAlignmentJob +from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23.dependencies.labels.v2.librispeech.label_singletons import LibrispeechBPE10025_CTC_ALIGNMENT + +def center_window_returnn_realignment( + alias: str, + config_builder: SegmentalAttConfigBuilderRF, + checkpoint: Union[PtCheckpoint, Dict], + checkpoint_alias: str, + plot: bool = False, +): + alias += ( + f"/returnn_realignment/{checkpoint_alias}-checkpoint" + ) + + if isinstance(checkpoint, PtCheckpoint): + checkpoint = checkpoint + else: + assert isinstance(checkpoint, dict) + checkpoint = config_builder.get_recog_checkpoints(**checkpoint)[checkpoint_alias] + + realign_config = config_builder.get_realign_config( + opts={ + "corpus_key": "dev-other", + "realign_def": model_realign_, + "forward_step_func": _returnn_v2_forward_step, + "forward_callback": _returnn_v2_get_forward_callback, + }) + + realign_job = ReturnnForwardJobV2( + model_checkpoint=checkpoint, + returnn_config=realign_config, + returnn_root=RETURNN_CURRENT_ROOT, + returnn_python_exe=RETURNN_EXE_NEW, + output_files=["scores.py.gz", 
"realignment.hdf"], + mem_rqmt=6, + time_rqmt=1, + ) + realign_job.add_alias(f"{alias}/realignment") + tk.register_output(realign_job.get_one_alias(), realign_job.out_files["realignment.hdf"]) + + if plot: + plot_alignment_job = PlotAlignmentJob( + alignment_hdf=realign_job.out_files["realignment.hdf"], + # ref_alignment_hdf=Path( + # "/u/schmitt/experiments/segmental_models_2022_23_rf/alias/models/ls_conformer/global_att/baseline_v1/baseline/no-finetuning/ctc_alignments/dev-other/output/alignments.hdf"), + ref_alignment_hdf=LibrispeechBPE10025_CTC_ALIGNMENT.alignment_paths["dev-other"], + json_vocab_path=Path( + "/u/zeineldeen/setups/librispeech/2022-11-28--conformer-att/work/i6_core/text/label/subword_nmt/train/ReturnnTrainBpeJob.vTq56NZ8STWt/output/bpe.vocab"), + target_blank_idx=0, + segment_list=[ + "dev-other/3660-6517-0005/3660-6517-0005", + "dev-other/6467-62797-0001/6467-62797-0001", + "dev-other/6467-62797-0002/6467-62797-0002", + "dev-other/7697-105815-0015/7697-105815-0015", + "dev-other/7697-105815-0051/7697-105815-0051", + # high ctc-cog error + "dev-other/6123-59150-0027/6123-59150-0027", + # non-monotonic att weights + "dev-other/1255-138279-0000/1255-138279-0000", + "dev-other/7601-291468-0006/7601-291468-0006", + "dev-other/7601-101619-0003/7601-101619-0003" + ], + ref_alignment_blank_idx=10025, + ) + plot_alignment_job.add_alias(f"{alias}/plot_realignment") + tk.register_output(plot_alignment_job.get_one_alias(), plot_alignment_job.out_plot_dir) diff --git a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/center_window_att/recog.py b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/center_window_att/recog.py index dc0ffa994..de6da6cad 100644 --- a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/center_window_att/recog.py +++ 
b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/center_window_att/recog.py @@ -48,6 +48,7 @@ def center_window_returnn_frame_wise_beam_search( "forward_step_func": _returnn_v2_forward_step, "forward_callback": _returnn_v2_get_forward_callback, "use_recombination": use_recombination, + "batch_size": 15_000 }, search_alias=f'returnn_decoding{"_pure_torch" if pure_torch else ""}' ).run() diff --git a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/center_window_att/train.py b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/center_window_att/train.py index 61e373208..35946f178 100644 --- a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/center_window_att/train.py +++ b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/center_window_att/train.py @@ -9,7 +9,6 @@ viterbi_training, full_sum_training, ) -from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.network_builder_rf.segmental.model import _returnn_v2_get_model_for_full_sum_training from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.pipelines.pipeline_ls_conf.checkpoints import ( external_checkpoints, default_import_model_name, @@ -27,7 +26,7 @@ def train_center_window_att_viterbi_from_scratch( use_mgpu: bool = True, ): for n_epochs in n_epochs_list: - alias += f"/viterbi-train_from_scratch/{n_epochs}-epochs_bs-{batch_size}_w-ctc-loss_{'w' if use_speed_pert else 'wo'}-speed-pert" + alias += f"/viterbi-train_from_scratch/{n_epochs}-epochs_bs-{batch_size}_wo-ctc-loss_{'w' if use_speed_pert else 'wo'}-speed-pert" train_opts = { "dataset_opts": { @@ -110,12 +109,19 @@ def train_center_window_att_full_sum_from_scratch( batch_size: int = 15_000, use_mgpu: 
bool = True, beam_size: Optional[int] = None, + lattice_downsampling: int = 1, + alignment_interpolation_factor: float = 0.5, + train_on_viterbi_paths: bool = False, ): - # TODO: do this in a nicer way - config_builder = copy.deepcopy(config_builder) - config_builder.get_model_func = _returnn_v2_get_model_for_full_sum_training + # # TODO: do this in a nicer way + # config_builder = copy.deepcopy(config_builder) + # config_builder.get_model_func = _returnn_v2_get_model_for_full_sum_training for n_epochs in n_epochs_list: - alias += f"/full-sum-train_from_scratch/{n_epochs}-epochs_bs-{batch_size}_w-ctc-loss_{'w' if use_speed_pert else 'wo'}-speed-pert" + alias += ( + f"/full-sum-train_from_scratch/{n_epochs}-epochs_bs-{batch_size}_{'w' if use_speed_pert else 'wo'}-sp" + f"_beams-{beam_size}_lat-down-{lattice_downsampling}_{alignment_interpolation_factor}-interp" + f"_{'ce' if train_on_viterbi_paths else 'sum'}-loss" + ) train_opts = { "dataset_opts": { @@ -152,10 +158,15 @@ def train_center_window_att_full_sum_from_scratch( # "max_seq_length": {"targets": 75}, "train_def": full_sum_training, "train_step_func": _returnn_v2_full_sum_train_step, + "full_sum_alignment_interpolation_factor": alignment_interpolation_factor, + "full_sum_lattice_downsampling": lattice_downsampling, } if beam_size is not None: - train_opts["full_sum_training_beam_size"] = beam_size + train_opts["full_sum_beam_size"] = beam_size + + if train_on_viterbi_paths: + train_opts["full_sum_train_on_viterbi_paths"] = True train_rqmt = { "time": time_rqmt, @@ -195,7 +206,7 @@ def train_center_window_att_viterbi_import_global_tf( alignment_augmentation_opts: Optional[Dict] = None, import_model_name: str = default_import_model_name, ): - if not config_builder.use_att_ctx_in_state: + if not config_builder.use_att_ctx_in_state and "lstm" in config_builder.label_decoder_state: # only randomly init FF weights, since only the input dim of the lstm layer is different custom_missing_load_func = 
load_missing_params else: diff --git a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/global_att/baseline_v1/__init__.py b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/global_att/baseline_v1/__init__.py index c6c6301de..c5d7a6835 100644 --- a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/global_att/baseline_v1/__init__.py +++ b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/global_att/baseline_v1/__init__.py @@ -8,27 +8,54 @@ def run_exps(): - for use_weight_feedback in (True,): - for model_alias, config_builder in baseline.global_att_baseline_rf(use_weight_feedback=use_weight_feedback): - for train_alias, checkpoint in ( - (f"{model_alias}/import_{default_import_model_name}", external_checkpoints[default_import_model_name]), - (f"{model_alias}/import_glob.conformer.mohammad.5.4", external_checkpoints["glob.conformer.mohammad.5.4"]), + for model_alias, config_builder in baseline.global_att_baseline_rf(use_weight_feedback=True): + for train_alias, checkpoint in ( + (f"{model_alias}/import_{default_import_model_name}", external_checkpoints[default_import_model_name]), + (f"{model_alias}/import_glob.conformer.mohammad.5.4", external_checkpoints["glob.conformer.mohammad.5.4"]), + ): + recog.global_att_returnn_label_sync_beam_search( + alias=train_alias, + config_builder=config_builder, + checkpoint=checkpoint, + checkpoint_aliases=("best-4-avg",), + ) + + for import_model_name in ("glob.conformer.mohammad.5.4",): + for train_alias, checkpoint in train.train_import_global_tf( + alias=model_alias, + config_builder=config_builder, + n_epochs_list=(100,), + const_lr_list=(1e-4,), + import_model_name=import_model_name, ): recog.global_att_returnn_label_sync_beam_search( alias=train_alias, config_builder=config_builder, checkpoint=checkpoint, - 
checkpoint_aliases=("best-4-avg",), + checkpoint_aliases=("best",), + lm_type="trafo", + lm_scale_list=(0.4, 0.54), + ilm_scale_list=(0.4,), + ilm_type="mini_att", + beam_size_list=(12,) ) + for model_alias, config_builder in baseline.global_att_baseline_rf( + use_weight_feedback=False, use_att_ctx_in_state=False + ): + for import_model_name in ("glob.conformer.mohammad.5.4",): for train_alias, checkpoint in train.train_import_global_tf( alias=model_alias, config_builder=config_builder, - n_epochs_list=(10, 100), + n_epochs_list=(100,), const_lr_list=(1e-4,), + import_model_name=import_model_name, ): recog.global_att_returnn_label_sync_beam_search( alias=train_alias, config_builder=config_builder, checkpoint=checkpoint, ) + + + diff --git a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/global_att/baseline_v1/baseline.py b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/global_att/baseline_v1/baseline.py index c9a3a3c95..7b50a3562 100644 --- a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/global_att/baseline_v1/baseline.py +++ b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/global_att/baseline_v1/baseline.py @@ -1,49 +1,5 @@ -import copy -from typing import Dict, List, Any, Optional, Tuple - -from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.config_builder_rf.base import GlobalAttConfigBuilderRF -from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.network_builder_rf.global_.model import from_scratch_model_def, _returnn_v2_get_model -from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23.model_variants.model_variants_ls_conf import models from 
i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23.pipelines.pipeline_ls_conf.global_att.baseline_v1.alias import alias as base_alias -from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23.dependencies.labels.v2.librispeech.label_singletons import ( - LibrispeechBPE10025_LABELS, -LIBRISPEECH_CORPUS -) -from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23.dependencies.general.returnn.exes import RETURNN_EXE_NEW, RETURNN_CURRENT_ROOT - - -def get_global_att_config_builder_rf( - use_weight_feedback: bool = True, - use_att_ctx_in_state: bool = True, -): - variant_params = { - "dependencies": LibrispeechBPE10025_LABELS, - "dataset": { - "feature_type": "raw", - "corpus": LIBRISPEECH_CORPUS - }, - "config": { - "train_seq_ordering": "laplace:.1000" - }, - "network": {"length_scale": 1.0}, - "returnn_python_exe": RETURNN_EXE_NEW, - "returnn_root": RETURNN_CURRENT_ROOT - } - - config_builder = GlobalAttConfigBuilderRF( - variant_params=variant_params, - model_def=from_scratch_model_def, - get_model_func=_returnn_v2_get_model, - use_weight_feedback=use_weight_feedback, - use_att_ctx_in_state=use_att_ctx_in_state, - ) - - alias = ( - f"{'w' if use_weight_feedback else 'wo'}-weight-feedback/" - f"{'w' if use_att_ctx_in_state else 'wo'}-att-ctx-in-state" - ) - - return alias, config_builder +from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.pipelines.pipeline_ls_conf.global_att.config_builder import get_global_att_config_builder_rf def global_att_baseline_rf(use_weight_feedback: bool = True, use_att_ctx_in_state: bool = True): diff --git a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/global_att/config_builder.py b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/global_att/config_builder.py new file mode 100644 index 
000000000..c79921858 --- /dev/null +++ b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/global_att/config_builder.py @@ -0,0 +1,41 @@ +from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.config_builder_rf.base import GlobalAttConfigBuilderRF +from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.network_builder_rf.global_.model import from_scratch_model_def, _returnn_v2_get_model +from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23.dependencies.labels.v2.librispeech.label_singletons import ( + LibrispeechBPE10025_LABELS, +LIBRISPEECH_CORPUS +) +from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23.dependencies.general.returnn.exes import RETURNN_EXE_NEW, RETURNN_CURRENT_ROOT + + +def get_global_att_config_builder_rf( + use_weight_feedback: bool = True, + use_att_ctx_in_state: bool = True, +): + variant_params = { + "dependencies": LibrispeechBPE10025_LABELS, + "dataset": { + "feature_type": "raw", + "corpus": LIBRISPEECH_CORPUS + }, + "config": { + "train_seq_ordering": "laplace:.1000" + }, + "network": {"length_scale": 1.0}, + "returnn_python_exe": RETURNN_EXE_NEW, + "returnn_root": RETURNN_CURRENT_ROOT + } + + config_builder = GlobalAttConfigBuilderRF( + variant_params=variant_params, + model_def=from_scratch_model_def, + get_model_func=_returnn_v2_get_model, + use_weight_feedback=use_weight_feedback, + use_att_ctx_in_state=use_att_ctx_in_state, + ) + + alias = ( + f"{'w' if use_weight_feedback else 'wo'}-weight-feedback/" + f"{'w' if use_att_ctx_in_state else 'wo'}-att-ctx-in-state" + ) + + return alias, config_builder diff --git a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/global_att/recog.py 
b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/global_att/recog.py index f599574a3..4fbb46542 100644 --- a/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/global_att/recog.py +++ b/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/pipelines/pipeline_ls_conf/global_att/recog.py @@ -21,7 +21,7 @@ def global_att_returnn_label_sync_beam_search( checkpoint_aliases: Tuple[str, ...] = ("last", "best", "best-4-avg"), run_analysis: bool = False, att_weight_seq_tags: Optional[List] = None, - pure_torch: bool = False + corpus_keys: Tuple[str, ...] = ("dev-other",), ): ilm_opts = {"type": ilm_type} if ilm_type == "mini_att": @@ -42,9 +42,10 @@ def global_att_returnn_label_sync_beam_search( run_analysis=run_analysis, analysis_opts={"att_weight_seq_tags": att_weight_seq_tags}, recog_opts={ - "recog_def": model_recog_pure_torch if pure_torch else model_recog, + "recog_def": model_recog, "forward_step_func": _returnn_v2_forward_step, "forward_callback": _returnn_v2_get_forward_callback, }, - search_alias=f'returnn_decoding{"_pure_torch" if pure_torch else ""}' + search_alias=f'returnn_decoding', + corpus_keys=corpus_keys, ).run() diff --git a/users/schmitt/hdf.py b/users/schmitt/hdf.py index cad04e016..4cbbba212 100644 --- a/users/schmitt/hdf.py +++ b/users/schmitt/hdf.py @@ -1,9 +1,12 @@ import h5py from typing import List +import numpy as np from sisyphus import Path, tk from i6_core.returnn import ReturnnDumpHDFJob +from returnn.datasets.hdf import SimpleHDFWriter +from returnn.tensor import Dim, Tensor def load_hdf_data(hdf_path: Path, num_dims: int = 1, segment_list: List = None): @@ -68,3 +71,69 @@ def build_hdf_from_alignment( ).out_hdf return hdf_file + + +def dump_hdf_numpy( + hdf_dataset: SimpleHDFWriter, + data: np.array, + seq_lens: np.array, + seq_tags: List[str], +): + """ + Dump data to an hdf file. 
+ + :param data: torch.Tensor of shape (batch, spatial) with sparse_dim=dimension + :param seq_lens: torch.Tensor of shape (batch,) + :param seq_tags: torch.Tensor of shape (batch,) + :param dimension: int, the sparse dimension of the data + """ + assert len(data.shape) == 2 + assert len(seq_lens.shape) == 1 + assert data.shape[0] == seq_lens.shape[0] + + seq_lens = {0: seq_lens} + batch_seq_sizes = np.expand_dims(seq_lens[0], 1) + + hdf_dataset.insert_batch( + data, + seq_len=seq_lens, + seq_tag=list(seq_tags), + extra={"seq_sizes": batch_seq_sizes} + ) + + +def dump_hdf_rf( + hdf_dataset: SimpleHDFWriter, + data: Tensor, + batch_dim: Dim, + seq_tags: Tensor, +): + """ + Dump data to an hdf file. + + :param data: torch.Tensor of shape (batch, spatial) with sparse_dim=dimension + :param seq_lens: torch.Tensor of shape (batch,) + :param seq_tags: torch.Tensor of shape (batch,) + :param dimension: int, the sparse dimension of the data + """ + assert len(data.batch_shape) <= 2 + + spatial_dims = data.remaining_dims(batch_dim) + data_raw = data.copy_transpose( + [batch_dim] + spatial_dims + ).raw_tensor + + if len(spatial_dims) == 1: + seq_lens = {0: spatial_dims[0].get_size_tensor().raw_tensor.numpy()} + batch_seq_sizes = np.expand_dims(seq_lens[0], 1) + else: + seq_lens = {} + batch_seq_sizes = np.zeros((batch_dim.get_dim_value(), 1)) + + hdf_dataset.insert_batch( + data_raw.to(device="cpu").numpy(), + seq_len=seq_lens, + seq_tag=list(seq_tags.raw_tensor), + extra={"seq_sizes": batch_seq_sizes} + ) + hdf_dataset.close() diff --git a/users/schmitt/returnn_frontend/model_interfaces/training.py b/users/schmitt/returnn_frontend/model_interfaces/training.py index d834db360..14b14c834 100644 --- a/users/schmitt/returnn_frontend/model_interfaces/training.py +++ b/users/schmitt/returnn_frontend/model_interfaces/training.py @@ -63,6 +63,10 @@ def __call__( data_spatial_dim: Dim, non_blank_targets: Tensor, non_blank_targets_spatial_dim: Dim, + seq_tags: Tensor, + 
interpolation_alignment: Optional[Tensor] = None, + interpolation_alignment_spatial_dim: Optional[Dim] = None, + interpolation_alignment_scores: Optional[Tensor] = None, ): raise NotImplementedError diff --git a/users/schmitt/visualization/visualization.py b/users/schmitt/visualization/visualization.py index 89d3abcad..a733c89f5 100644 --- a/users/schmitt/visualization/visualization.py +++ b/users/schmitt/visualization/visualization.py @@ -269,6 +269,7 @@ def set_ticks( vocab: Dict[int, str], ref_alignment_blank_idx: int, target_blank_idx: Optional[int] = None, + draw_vertical_lines: bool = False ): """ Set the ticks and labels for the x and y axis. @@ -281,7 +282,8 @@ def set_ticks( ref_labels = ref_alignment[ref_alignment != ref_alignment_blank_idx] ref_labels = [vocab[idx] for idx in ref_labels] # x axis - ax.set_xticks([tick - 1.0 for tick in ref_label_positions]) + xticks = [tick - 1.0 for tick in ref_label_positions] + ax.set_xticks(xticks) ax.set_xticklabels(ref_labels, rotation=90) # output labels of the model @@ -297,6 +299,10 @@ def set_ticks( for ytick in yticks: ax.axhline(y=ytick - .5, xmin=0, xmax=1, color="k", linewidth=.5) + if draw_vertical_lines: + for xtick in xticks: + ax.axvline(x=xtick, ymin=0, ymax=1, color="r", linewidth=.5, linestyle="--", alpha=0.5) + @staticmethod def _draw_segment_boundaries( ax: plt.Axes, @@ -330,7 +336,13 @@ def _draw_center_positions( ax.axvline(x=center_position + .5, ymin=ymin, ymax=ymax, color="lime") @staticmethod - def plot_ctc_alignment(ax: plt.Axes, ctc_alignment: np.ndarray, num_labels: int, ctc_blank_idx: int): + def plot_ctc_alignment( + ax: plt.Axes, + ctc_alignment: np.ndarray, + num_labels: int, + ctc_blank_idx: int, + plot_trailing_blanks: bool = False + ): label_idx = 0 # store alignment like: 000011112222223333, where the number is the label index (~ height in the plot) ctc_alignment_plot_data = [] @@ -339,7 +351,7 @@ def plot_ctc_alignment(ax: plt.Axes, ctc_alignment: np.ndarray, num_labels: int, if 
ctc_label != ctc_blank_idx: label_idx += 1 # stop if we reached the last label, the rest of the ctc alignment are blanks - if label_idx == num_labels: + if label_idx == num_labels and not plot_trailing_blanks: break ax.plot(ctc_alignment_plot_data, "o", color="black", alpha=.4) @@ -530,13 +542,20 @@ def __init__( json_vocab_path: Path, target_blank_idx: int, segment_list: List[str], + ref_alignment_hdf: Optional[Path] = None, + ref_alignment_blank_idx: Optional[int] = None, silence_idx: Optional[int] = None, ): + if ref_alignment_hdf is not None: + assert ref_alignment_blank_idx is not None + self.alignment_hdf = alignment_hdf self.json_vocab_path = json_vocab_path self.target_blank_idx = target_blank_idx self.segment_list = segment_list self.silence_idx = silence_idx + self.ref_alignment_hdf = ref_alignment_hdf if ref_alignment_hdf is not None else alignment_hdf + self.ref_alignment_blank_idx = ref_alignment_blank_idx if ref_alignment_blank_idx is not None else target_blank_idx self.out_plot_dir = self.output_path("plots", True) @@ -582,9 +601,38 @@ def _set_ticks( for i, position in enumerate(label_positions): ax.axvline(x=position - 1.0, ymin=ymin, ymax=ymax, color=color) + @staticmethod + def _set_ticks_alt( + ax: plt.Axes, + alignment: np.ndarray, + ref_alignment: np.ndarray, + vocab: Dict[int, str], + blank_idx: int, + ref_alignment_blank_idx: int, + ): + """ + Set the ticks and labels for the x and y axis. 
+ x-axis: reference alignment + y-axis: model output + """ + PlotAttentionWeightsJobV2.set_ticks( + ax, + ref_alignment=ref_alignment, + targets=alignment, + vocab=vocab, + ref_alignment_blank_idx=ref_alignment_blank_idx, + target_blank_idx=blank_idx, + draw_vertical_lines=True + ) + + # draw last horizontal line for trailing blanks + labels = alignment[alignment != blank_idx] + ax.axhline(y=len(labels) - .5, xmin=0, xmax=1, color="k", linewidth=.5) + def run(self): # load data from hdf alignment_dict = hdf.load_hdf_data(self.alignment_hdf, segment_list=self.segment_list) + ref_alignment_dict = hdf.load_hdf_data(self.ref_alignment_hdf, segment_list=self.segment_list) # load vocabulary as dictionary with open(self.json_vocab_path.get_path(), "r") as f: @@ -597,13 +645,38 @@ def run(self): # for each seq tag, plot the corresponding att weights for seq_tag in alignment_dict.keys(): alignment = alignment_dict[seq_tag] + ref_alignment = ref_alignment_dict[seq_tag] labels = alignment[alignment != self.target_blank_idx] - fig, ax = self._get_fig_ax(alignment) + dummy_matrix = np.zeros((len(labels) + 1, len(alignment))) + fig, ax = PlotAttentionWeightsJobV2._get_fig_ax(att_weights=dummy_matrix) + ax.matshow(dummy_matrix, aspect="auto", alpha=0.0) # set y ticks and labels - self._set_ticks(ax, alignment, labels, vocab, self.target_blank_idx) + self._set_ticks_alt( + ax, + alignment, + ref_alignment, + vocab, + self.target_blank_idx, + ref_alignment_blank_idx=self.ref_alignment_blank_idx + ) + PlotAttentionWeightsJobV2.plot_ctc_alignment( + ax, + alignment, + num_labels=labels.shape[0], + ctc_blank_idx=self.target_blank_idx, + plot_trailing_blanks=True + ) + # plt.gca().invert_yaxis() dirname = self.out_plot_dir.get_path() filename = os.path.join(dirname, "plot.%s" % seq_tag.replace("/", "_")) plt.savefig(filename + ".png") plt.savefig(filename + ".pdf") + + @classmethod + def hash(cls, kwargs): + if kwargs["ref_alignment_hdf"] is None: + kwargs.pop("ref_alignment_hdf") + 
kwargs.pop("ref_alignment_blank_idx") + return super().hash(kwargs) From 524a2ca4bfa556d4031971fa66ca61f2f1464941 Mon Sep 17 00:00:00 2001 From: Peter Vieting Date: Tue, 11 Jun 2024 15:11:03 +0200 Subject: [PATCH 169/227] ls960 pretrain: use phoneme info for mask boundaries --- .../wav2vec2/config_02_fairseq_phoneme.py | 40 +++++++++++++++++-- 1 file changed, 37 insertions(+), 3 deletions(-) diff --git a/users/vieting/experiments/librispeech/librispeech_960_pretraining/wav2vec2/config_02_fairseq_phoneme.py b/users/vieting/experiments/librispeech/librispeech_960_pretraining/wav2vec2/config_02_fairseq_phoneme.py index ff0dc5ae0..8b4cf6d01 100644 --- a/users/vieting/experiments/librispeech/librispeech_960_pretraining/wav2vec2/config_02_fairseq_phoneme.py +++ b/users/vieting/experiments/librispeech/librispeech_960_pretraining/wav2vec2/config_02_fairseq_phoneme.py @@ -63,7 +63,7 @@ def get_fairseq_root(commit="e4a2e4e93efbcbaaae52a17ae6600beb2083fb33", fairseq_ return fairseq_root -def run_fairseq_pretraining_informed(): +def run_fairseq_pretraining_negatives_other_target(): prefix_name = "experiments/librispeech/librispeech_960_pretraining/wav2vec2/" alignment = get_alignment_hdf() num_gpus = 8 @@ -82,7 +82,7 @@ def run_fairseq_pretraining_informed(): ) # run pre-training - exp_name = "monophone1" + exp_name = "monophone_negatives_other_target_v1" fairseq_args = get_fairseq_args(num_gpus=num_gpus) fairseq_args["task"]["alignment"] = alignment fairseq_args["model"]["negative_sampling_strategy"] = "other_target" @@ -95,5 +95,39 @@ def run_fairseq_pretraining_informed(): return job +def run_fairseq_pretraining_phoneme_boundary_masking(): + prefix_name = "experiments/librispeech/librispeech_960_pretraining/wav2vec2/" + alignment = get_alignment_hdf() + num_gpus = 8 + fairseq_python_exe = tk.Path( + "/home/pv653172/setups/librispeech/20230328_wav2vec2/dependencies/python_launcher.sh", + hash_overwrite="itc_python_launcher_py310_torch", + ) + fairseq_root = 
get_fairseq_root(fairseq_exe=fairseq_python_exe) + fairseq_training_args = dict( + save_interval=25, + max_epoch=600, + max_update=420000, + fairseq_root=fairseq_root, + fairseq_python_exe=fairseq_python_exe, + rqmt={"time": 120, "mem": 12, "cpu": 2, "gpu": num_gpus}, + ) + + # run pre-training + exp_name = "monophone_boundary_masking_v1" + fairseq_args = get_fairseq_args(num_gpus=num_gpus) + fairseq_args["task"]["alignment"] = alignment + fairseq_args["model"]["mask_strategy"] = "phoneme" + fairseq_args["model"]["mask_length"] = 1 + fairseq_root = get_fairseq_root(fairseq_exe=fairseq_python_exe, commit="b768be5b81987364d39a07d1caad2bfe1e956896") + fairseq_training_args["fairseq_root"] = fairseq_root + fairseq_config = FairseqHydraConfig(fairseq_args) + job = FairseqHydraTrainingJob(fairseq_config, **fairseq_training_args) + job.add_alias(os.path.join(prefix_name, exp_name, "pretraining")) + tk.register_output(f"{prefix_name}/{exp_name}/pretraining/scores.png", job.out_plot_se) + return job + + def py(): - run_fairseq_pretraining_informed() + run_fairseq_pretraining_negatives_other_target() + run_fairseq_pretraining_phoneme_boundary_masking() From 01b526b780a2a11c5b10d58dd51972e1ea786693 Mon Sep 17 00:00:00 2001 From: Albert Zeyer Date: Fri, 14 Jun 2024 15:22:52 +0200 Subject: [PATCH 170/227] BatchRenorm initial implementation (untested) --- users/zeyer/nn_rf/batchnorm.py | 165 +++++++++++++++++++++++++++++++++ 1 file changed, 165 insertions(+) create mode 100644 users/zeyer/nn_rf/batchnorm.py diff --git a/users/zeyer/nn_rf/batchnorm.py b/users/zeyer/nn_rf/batchnorm.py new file mode 100644 index 000000000..ae92bb6dd --- /dev/null +++ b/users/zeyer/nn_rf/batchnorm.py @@ -0,0 +1,165 @@ +""" +Batch norm variations, e.g. 
Batch Renormalization + +https://github.com/rwth-i6/returnn/issues/1539 +""" + +from __future__ import annotations +from typing import Optional, Union, Callable +from returnn.tensor import Tensor, Dim +import returnn.frontend as rf + + +class BatchRenorm(rf.Module): + """ + Batch Renormalization. https://arxiv.org/abs/1702.03275 + + We calculate statistics over all axes except the given in_dim. + I.e. all other axes are reduced for the statistics. + + To compensate the normalization, there are learnable parameters gamma and beta + (optional, used when option `affine` is True, which is the default). + + The usual behavior depends on whether this is used in training or evaluation, + although this often configurable in other frameworks. + The usual behavior, in training:: + + # Using statistics from current batch. + mean_cur_batch, variance_cur_batch = moments(source, reduce_dims) + y = (x - mean_cur_batch) / sqrt(variance_cur_batch + epsilon) + y = gamma * y + beta + + # Updating running statistics for later use. + mean = (1 - momentum) * mean + momentum * mean_cur_batch + variance = (1 - momentum) * variance + momentum * variance_cur_batch + + The usual behavior, not in training (i.e. in evaluation):: + + # Using collected statistics. Not using statistics from current batch. + y = (x - mean) / sqrt(variance + epsilon) + y = gamma * y + beta + + """ + + def __init__( + self, + in_dim: Dim, + *, + affine: bool = True, + momentum: float = 0.1, + eps: float = 1e-3, + use_mask: Optional[bool] = None, + unbiased: bool = False, + r_max: Union[float, Callable[[BatchRenorm], float]] = 1.0, + d_max: Union[float, Callable[[BatchRenorm], float]] = 0.0, + ): + """ + :param in_dim: the feature dimension of the input + :param affine: whether to use learnable parameters gamma and beta + :param momentum: momentum for the running mean and variance + :param eps: epsilon for the variance + :param use_mask: whether to use a mask for dynamic spatial dims. 
+ This must be specified if the input has dynamic spatial dims. + True would use the correct masking then. However, that is inconsistent to all other frameworks. + False would be consistent to all other frameworks. + :param unbiased: if True, uses unbiased variance calculation + via `Bessel correction `__ + :param r_max: clip how much we should use the running variance instead of the current batch variance + during training. + Value should be >= 1.0. + r_max=1.0 means always use the current batch variance, i.e. like standard batch norm. + r_max=inf means always use the running variance. + r_max can also be scheduled via a callable, e.g. using rf.get_run_ctx().step inside. + The original paper suggests to keep r_max=1.0 for the first 5k steps, + then linearly increase to reach r_max=3.0 at 40k steps. + :param d_max: clip how much we should use the running mean instead of the current batch mean during training. + Value should be >= 0.0. + d_max=0.0 means always use the current batch mean, i.e. like standard batch norm. + d_max=inf means always use the running mean. + d_max can also be scheduled via a callable, e.g. using rf.get_run_ctx().step inside. + The original paper suggests to keep d_max=0.0 for the first 5k steps, + then linearly increase to reach d_max=5.0 at 25k steps. 
+ """ + super().__init__() + assert isinstance(in_dim, Dim) + self.in_dim = in_dim + self.affine = affine + self.momentum = momentum + self.eps = eps + self.use_mask = use_mask + self.unbiased = unbiased + self.r_max = r_max + self.d_max = d_max + self.running_mean = rf.Parameter([in_dim], auxiliary=True) + self.running_mean.initial = 0.0 + self.running_variance = rf.Parameter([in_dim], auxiliary=True) + self.running_variance.initial = 1.0 + self.gamma = None # type: Optional[rf.Parameter] + self.beta = None # type: Optional[rf.Parameter] + if self.affine: + self.gamma = rf.Parameter([in_dim]) + self.gamma.initial = 1.0 + self.beta = rf.Parameter([in_dim]) + self.beta.initial = 0.0 + + def __call__(self, source: Tensor) -> Tensor: + assert self.in_dim in source.dims + + if any(d.need_masking() for d in source.dims if d != self.in_dim): + if self.use_mask is None: + raise ValueError( + f"{self}: use_mask must be specified if the input {source} has any dynamic spatial dims" + ) + use_mask = self.use_mask + else: + use_mask = False # not needed. False because this potentially enables an efficient fused op. + + train_flag = rf.get_run_ctx().train_flag + d_max = self.d_max(self) if callable(self.d_max) else self.d_max + r_max = self.r_max(self) if callable(self.r_max) else self.r_max + + mean_cur_batch, variance_cur_batch = rf.cond( + train_flag, + # Only conditionally calculate the moments when needed. + lambda: rf.moments( + source, + axis=[d for d in source.dims if d != self.in_dim], + use_mask=use_mask, + correction=1 if self.unbiased else 0, + ), + # Return some dummy values. They are not used. 
+ lambda: (self.running_mean, self.running_variance), + ) + + def _update_running_stats(): + self.running_mean.assign_add((mean_cur_batch - self.running_mean) * self.momentum) + self.running_variance.assign_add((variance_cur_batch - self.running_variance) * self.momentum) + + rf.cond(train_flag, _update_running_stats, lambda: None) + + def _train_mean_std_dev(): + inv_std_dev_ = rf.rsqrt(variance_cur_batch + self.eps) + if r_max > 1: + inv_std_dev_ *= rf.clip_by_value( + rf.rsqrt(self.running_variance + self.eps) + * rf.sqrt(rf.stop_gradient(variance_cur_batch) + self.eps), + 1 / r_max, + r_max, + ) + mean_ = mean_cur_batch + if d_max > 0: + limit = d_max * rf.reciprocal(rf.stop_gradient(inv_std_dev_)) + mean_ += rf.clip_by_value(self.running_mean - rf.stop_gradient(mean_cur_batch), -limit, limit) + return mean_, inv_std_dev_ + + mean, inv_std_dev = rf.cond( + train_flag, _train_mean_std_dev, lambda: (self.running_mean, rf.rsqrt(self.running_variance + self.eps)) + ) + + m = inv_std_dev + if self.gamma is not None: + m *= self.gamma + bn = (source - mean) * m + if self.beta is not None: + bn += self.beta + return bn From f23b1177d41c4f5ddc03a38f9bf94e8e2da0b346 Mon Sep 17 00:00:00 2001 From: Albert Zeyer Date: Fri, 14 Jun 2024 22:45:14 +0200 Subject: [PATCH 171/227] test_piecewise_linear --- users/zeyer/lr_schedules/piecewise_linear.py | 28 ++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/users/zeyer/lr_schedules/piecewise_linear.py b/users/zeyer/lr_schedules/piecewise_linear.py index 68672702c..f76b78c89 100644 --- a/users/zeyer/lr_schedules/piecewise_linear.py +++ b/users/zeyer/lr_schedules/piecewise_linear.py @@ -25,3 +25,31 @@ def dyn_lr_piecewise_linear(*, global_train_step: int, learning_rate: float, **_ last_step = step return learning_rate * lrs[-1] + + +def test_piecewise_linear(): + from numpy.testing import assert_almost_equal, assert_equal + + def _f(x, xs, ys): + assert isinstance(x, int) + assert len(xs) + 1 == len(ys) + last_step = 
0 + for i, step in enumerate(xs): + assert isinstance(step, int) + assert step > last_step + assert x >= last_step + if x < step: + factor = (x + 1 - last_step) / (step - last_step) + return ys[i + 1] * factor + ys[i] * (1 - factor) + last_step = step + + return ys[-1] + + assert_almost_equal(_f(0, [10, 20], [0, 1, 0.5]), 0.1) + assert_almost_equal(_f(5, [10, 20], [0, 1, 0.5]), 0.6) + assert_equal(_f(9, [10, 20], [0, 1, 0.5]), 1) + assert_almost_equal(_f(10, [10, 20], [0, 1, 0.5]), 0.95) + assert_almost_equal(_f(11, [10, 20], [0, 1, 0.5]), 0.90) + assert_almost_equal(_f(15, [10, 20], [0, 1, 0.5]), 0.70) + assert_almost_equal(_f(19, [10, 20], [0, 1, 0.5]), 0.5) + assert_equal(_f(20, [10, 20], [0, 1, 0.5]), 0.5) From 53cbaecbf7e6c740dcc199e3f0dcdf0ec2dd18df Mon Sep 17 00:00:00 2001 From: Albert Zeyer Date: Fri, 14 Jun 2024 22:47:02 +0200 Subject: [PATCH 172/227] test_piecewise_linear use dyn_lr_piecewise_linear --- users/zeyer/lr_schedules/piecewise_linear.py | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) diff --git a/users/zeyer/lr_schedules/piecewise_linear.py b/users/zeyer/lr_schedules/piecewise_linear.py index f76b78c89..2c9a440c9 100644 --- a/users/zeyer/lr_schedules/piecewise_linear.py +++ b/users/zeyer/lr_schedules/piecewise_linear.py @@ -28,22 +28,12 @@ def dyn_lr_piecewise_linear(*, global_train_step: int, learning_rate: float, **_ def test_piecewise_linear(): + from returnn.config import global_config_ctx, Config from numpy.testing import assert_almost_equal, assert_equal def _f(x, xs, ys): - assert isinstance(x, int) - assert len(xs) + 1 == len(ys) - last_step = 0 - for i, step in enumerate(xs): - assert isinstance(step, int) - assert step > last_step - assert x >= last_step - if x < step: - factor = (x + 1 - last_step) / (step - last_step) - return ys[i + 1] * factor + ys[i] * (1 - factor) - last_step = step - - return ys[-1] + with global_config_ctx(Config({"learning_rate_piecewise_steps": xs, "learning_rate_piecewise_values": 
ys})): + return dyn_lr_piecewise_linear(global_train_step=x, learning_rate=1.0) assert_almost_equal(_f(0, [10, 20], [0, 1, 0.5]), 0.1) assert_almost_equal(_f(5, [10, 20], [0, 1, 0.5]), 0.6) From 04894c96756b28c43c8e9d89511dc1cf2199103a Mon Sep 17 00:00:00 2001 From: Albert Zeyer Date: Fri, 14 Jun 2024 23:09:58 +0200 Subject: [PATCH 173/227] dyn_lr_piecewise_linear use RETURNN PiecewiseLinear --- users/zeyer/lr_schedules/piecewise_linear.py | 28 +++++++++----------- 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/users/zeyer/lr_schedules/piecewise_linear.py b/users/zeyer/lr_schedules/piecewise_linear.py index 2c9a440c9..d418939ac 100644 --- a/users/zeyer/lr_schedules/piecewise_linear.py +++ b/users/zeyer/lr_schedules/piecewise_linear.py @@ -8,23 +8,21 @@ def dyn_lr_piecewise_linear(*, global_train_step: int, learning_rate: float, **_ Piecewise linear """ from returnn.config import get_global_config + from returnn.util.math import PiecewiseLinear config = get_global_config() - - steps = config.int_list("learning_rate_piecewise_steps") - lrs = config.float_list("learning_rate_piecewise_values") - assert len(steps) + 1 == len(lrs) - - last_step = 0 - for i, step in enumerate(steps): - assert step > last_step - assert global_train_step >= last_step - if global_train_step < step: - factor = (global_train_step + 1 - last_step) / (step - last_step) - return learning_rate * (lrs[i + 1] * factor + lrs[i] * (1 - factor)) - last_step = step - - return learning_rate * lrs[-1] + f = config.typed_dict.get("_learning_rate_piecewise_cache") + if f is None: + steps = config.int_list("learning_rate_piecewise_steps") + lrs = config.float_list("learning_rate_piecewise_values") + assert len(steps) + 1 == len(lrs) + last_step = 0 + for i, step in enumerate(steps): + assert step > last_step + last_step = step + f = PiecewiseLinear(dict(zip([0] + list(steps), lrs))) + config.typed_dict["_learning_rate_piecewise_cache"] = f + return f(global_train_step + 1) * learning_rate 
def test_piecewise_linear(): From 37f0d3d6eff25f39435dd87d9caaa5f34741022e Mon Sep 17 00:00:00 2001 From: vieting <45091115+vieting@users.noreply.github.com> Date: Mon, 17 Jun 2024 11:25:17 +0200 Subject: [PATCH 174/227] DeleteLemmataFromLexiconJob (#225) --- users/berger/recipe/lexicon/modification.py | 30 +++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/users/berger/recipe/lexicon/modification.py b/users/berger/recipe/lexicon/modification.py index fae2d7c4e..f2ab83ec3 100644 --- a/users/berger/recipe/lexicon/modification.py +++ b/users/berger/recipe/lexicon/modification.py @@ -1,3 +1,4 @@ +from typing import List from sisyphus import Job, Task, tk from i6_core.lib.lexicon import Lexicon, Lemma @@ -101,6 +102,35 @@ def run(self): write_xml(self.out_lexicon.get_path(), out_lexicon.to_xml()) +class DeleteLemmataFromLexiconJob(Job): + """ + Remove lemmata from a bliss lexicon based on their orth + """ + + def __init__(self, bliss_lexicon: tk.Path, orths: List[str]): + self.bliss_lexicon = bliss_lexicon + self.orths = orths + + self.out_lexicon = self.output_path("lexicon.xml") + + def tasks(self): + yield Task("run", mini_task=True) + + def run(self): + in_lexicon = Lexicon() + in_lexicon.load(self.bliss_lexicon.get_path()) + + out_lexicon = Lexicon() + out_lexicon.phonemes = in_lexicon.phonemes + out_lexicon.lemmata = [] + + for lemma in in_lexicon.lemmata: + if not any(orth in lemma.orth for orth in self.orths): + out_lexicon.lemmata.append(lemma) + + write_xml(self.out_lexicon.get_path(), out_lexicon.to_xml()) + + class EnsureSilenceFirstJob(Job): """ Moves the silence phoneme (defined via lemma) to the beginning in the inventory for RASR CTC/Transducer compatibility From 8fc42f84ec61b6e2e5ac3f735e33aae2b6d4f3db Mon Sep 17 00:00:00 2001 From: Peter Vieting Date: Mon, 17 Jun 2024 14:00:42 +0200 Subject: [PATCH 175/227] ls960 pretrain: phoneme mask and other targets --- .../wav2vec2/config_02_fairseq_phoneme.py | 35 +++++++++++++++++++ 1 file 
changed, 35 insertions(+) diff --git a/users/vieting/experiments/librispeech/librispeech_960_pretraining/wav2vec2/config_02_fairseq_phoneme.py b/users/vieting/experiments/librispeech/librispeech_960_pretraining/wav2vec2/config_02_fairseq_phoneme.py index 8b4cf6d01..79bf27e47 100644 --- a/users/vieting/experiments/librispeech/librispeech_960_pretraining/wav2vec2/config_02_fairseq_phoneme.py +++ b/users/vieting/experiments/librispeech/librispeech_960_pretraining/wav2vec2/config_02_fairseq_phoneme.py @@ -128,6 +128,41 @@ def run_fairseq_pretraining_phoneme_boundary_masking(): return job +def run_fairseq_pretraining_phoneme_negatives_other_target_boundary_masking(): + prefix_name = "experiments/librispeech/librispeech_960_pretraining/wav2vec2/" + alignment = get_alignment_hdf() + num_gpus = 8 + fairseq_python_exe = tk.Path( + "/home/pv653172/setups/librispeech/20230328_wav2vec2/dependencies/python_launcher.sh", + hash_overwrite="itc_python_launcher_py310_torch", + ) + fairseq_root = get_fairseq_root(fairseq_exe=fairseq_python_exe) + fairseq_training_args = dict( + save_interval=25, + max_epoch=600, + max_update=420000, + fairseq_root=fairseq_root, + fairseq_python_exe=fairseq_python_exe, + rqmt={"time": 120, "mem": 12, "cpu": 2, "gpu": num_gpus}, + ) + + # run pre-training + exp_name = "monophone_negatives_other_target_boundary_masking_v1" + fairseq_args = get_fairseq_args(num_gpus=num_gpus) + fairseq_args["task"]["alignment"] = alignment + fairseq_args["model"]["negative_sampling_strategy"] = "other_target" + fairseq_args["model"]["mask_strategy"] = "phoneme" + fairseq_args["model"]["mask_length"] = 1 + fairseq_root = get_fairseq_root(fairseq_exe=fairseq_python_exe, commit="b768be5b81987364d39a07d1caad2bfe1e956896") + fairseq_training_args["fairseq_root"] = fairseq_root + fairseq_config = FairseqHydraConfig(fairseq_args) + job = FairseqHydraTrainingJob(fairseq_config, **fairseq_training_args) + job.add_alias(os.path.join(prefix_name, exp_name, "pretraining")) + 
tk.register_output(f"{prefix_name}/{exp_name}/pretraining/scores.png", job.out_plot_se) + return job + + def py(): run_fairseq_pretraining_negatives_other_target() run_fairseq_pretraining_phoneme_boundary_masking() + run_fairseq_pretraining_phoneme_negatives_other_target_boundary_masking() From 3b5959b15ebc837d6a14046ca2db7fff9cb3ae50 Mon Sep 17 00:00:00 2001 From: Peter Vieting Date: Mon, 17 Jun 2024 17:32:25 +0200 Subject: [PATCH 176/227] ls960 pretrain: update num epochs --- .../wav2vec2/config_01_fairseq_main.py | 4 ++-- .../wav2vec2/config_02_fairseq_phoneme.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/users/vieting/experiments/librispeech/librispeech_960_pretraining/wav2vec2/config_01_fairseq_main.py b/users/vieting/experiments/librispeech/librispeech_960_pretraining/wav2vec2/config_01_fairseq_main.py index d1c67fa69..4313042b7 100755 --- a/users/vieting/experiments/librispeech/librispeech_960_pretraining/wav2vec2/config_01_fairseq_main.py +++ b/users/vieting/experiments/librispeech/librispeech_960_pretraining/wav2vec2/config_01_fairseq_main.py @@ -146,8 +146,8 @@ def run_fairseq_pretraining(): job = FairseqHydraTrainingJob( fairseq_config, save_interval=25, - max_epoch=300, - max_update=400000, + max_epoch=600, + max_update=420000, fairseq_root=fairseq_root, fairseq_python_exe=fairseq_exe, rqmt={"time": 120, "mem": 12, "cpu": 2, "gpu": num_gpus}, diff --git a/users/vieting/experiments/librispeech/librispeech_960_pretraining/wav2vec2/config_02_fairseq_phoneme.py b/users/vieting/experiments/librispeech/librispeech_960_pretraining/wav2vec2/config_02_fairseq_phoneme.py index 79bf27e47..53aef623a 100644 --- a/users/vieting/experiments/librispeech/librispeech_960_pretraining/wav2vec2/config_02_fairseq_phoneme.py +++ b/users/vieting/experiments/librispeech/librispeech_960_pretraining/wav2vec2/config_02_fairseq_phoneme.py @@ -74,8 +74,8 @@ def run_fairseq_pretraining_negatives_other_target(): fairseq_root = 
get_fairseq_root(fairseq_exe=fairseq_python_exe) fairseq_training_args = dict( save_interval=25, - max_epoch=300, - max_update=400000, + max_epoch=600, + max_update=420000, fairseq_root=fairseq_root, fairseq_python_exe=fairseq_python_exe, rqmt={"time": 120, "mem": 12, "cpu": 2, "gpu": num_gpus}, From 8a2a9bb99f98b72a0ec48949a78154f608ac7e65 Mon Sep 17 00:00:00 2001 From: Mohammad Zeineldeen Date: Tue, 18 Jun 2024 00:50:50 +0200 Subject: [PATCH 177/227] better --- users/zeineldeen/experiments/canary_aed/nemo/run_eval.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/users/zeineldeen/experiments/canary_aed/nemo/run_eval.py b/users/zeineldeen/experiments/canary_aed/nemo/run_eval.py index 3d65ae83f..f6a62aaf3 100644 --- a/users/zeineldeen/experiments/canary_aed/nemo/run_eval.py +++ b/users/zeineldeen/experiments/canary_aed/nemo/run_eval.py @@ -85,9 +85,11 @@ def buffer_audio_and_transcribe( filepaths = write_audio(buffer, cache_prefix) if pnc is not None: - transcriptions = model.transcribe(filepaths, batch_size=batch_size, pnc=False, verbose=False) + transcriptions = model.transcribe( + filepaths, batch_size=batch_size, pnc=False, verbose=False, num_workers=4 + ) else: - transcriptions = model.transcribe(filepaths, batch_size=batch_size, verbose=False) + transcriptions = model.transcribe(filepaths, batch_size=batch_size, verbose=False, num_workers=4) # if transcriptions form a tuple (from RNNT), extract just "best" hypothesis if type(transcriptions) == tuple and len(transcriptions) == 2: transcriptions = transcriptions[0] From 471199541560cbfd843ebb6adfd2d666ca1a6cfd Mon Sep 17 00:00:00 2001 From: Mohammad Zeineldeen Date: Tue, 18 Jun 2024 13:02:52 +0200 Subject: [PATCH 178/227] first version of beam search --- .../canary_aed/nemo/run_eval_beam_search.py | 420 ++++++++++++++++++ 1 file changed, 420 insertions(+) create mode 100644 users/zeineldeen/experiments/canary_aed/nemo/run_eval_beam_search.py diff --git 
a/users/zeineldeen/experiments/canary_aed/nemo/run_eval_beam_search.py b/users/zeineldeen/experiments/canary_aed/nemo/run_eval_beam_search.py new file mode 100644 index 000000000..1116d62bd --- /dev/null +++ b/users/zeineldeen/experiments/canary_aed/nemo/run_eval_beam_search.py @@ -0,0 +1,420 @@ +""" +Adapted from here: https://github.com/huggingface/open_asr_leaderboard/blob/5c03c1f85a84ab7a991dcc1b3f14905ec6d632c9/nemo_asr/run_eval.py +""" +from __future__ import annotations + +import argparse + +import os +import sys +import shutil + +from typing import Any, Tuple, List + +import tree + +import torch + +import soundfile + +from tqdm import tqdm +from normalizer import data_utils + +from datasets import load_from_disk + +from nemo.collections.asr.models import ASRModel +from nemo.collections.asr.models.aed_multitask_models import MultiTaskTranscriptionConfig +from nemo.collections.asr.parts.mixins.transcription import GenericTranscriptionType +from nemo.collections.common.parts.transformer_utils import mask_padded_tokens + +sys.path.insert(0, "/u/zeineldeen/setups/ubuntu_22_setups/2024-06-07--canary-aed/recipe") + +from i6_experiments.users.zeyer.decoding.beam_search_torch.interface import ( + LabelScorerIntf, + StateObjIgnored, +) +from i6_experiments.users.zeyer.decoding.beam_search_torch.beam_search_v5 import ( + beam_search_v5, + BeamSearchOptsV5, +) + +DATA_CACHE_DIR = "/var/tmp/audio_cache" + + +def compute_wer(predictions, references): + from jiwer import compute_measures + + incorrect = 0 + total = 0 + for prediction, reference in zip(predictions, references): + measures = compute_measures(reference, prediction) + incorrect += measures["substitutions"] + measures["deletions"] + measures["insertions"] + total += measures["substitutions"] + measures["deletions"] + measures["hits"] + return incorrect / total + + +def dataset_iterator(dataset): + for i, item in enumerate(dataset): + yield { + **item["audio"], + "reference": item["norm_text"], + 
"audio_filename": f"file_{i}", + "sample_rate": 16_000, + "sample_id": i, + } + + +def write_audio(buffer, cache_prefix) -> list: + cache_dir = os.path.join(DATA_CACHE_DIR, cache_prefix) + + if os.path.exists(cache_dir): + shutil.rmtree(cache_dir, ignore_errors=True) + + os.makedirs(cache_dir) + + data_paths = [] + for idx, data in enumerate(buffer): + fn = os.path.basename(data["audio_filename"]) + fn = os.path.splitext(fn)[0] + path = os.path.join(cache_dir, f"{idx}_{fn}.wav") + data_paths.append(path) + + soundfile.write(path, data["array"], samplerate=data["sample_rate"]) + + return data_paths + + +def pack_results(results: list, buffer, transcriptions): + for sample, transcript in zip(buffer, transcriptions): + result = {"reference": sample["reference"], "pred_text": transcript} + results.append(result) + return results + + +def get_our_canary_label_scorer( + model: ASRModel, enc: torch.Tensor, enc_input_mask: torch.Tensor, pad_id: int, bos_prefix_seq: List[int] +) -> LabelScorerIntf: + """ + Creates a CanaryLabelScorer object that is used in the beam search implementation. + + :param model: nemo ASRModel object + :param enc: [B,T] + :param enc_input_mask: [B,T] + :param pad_id: + :param bos_prefix_seq: + """ + + trafo_decoder_module = model.transf_decoder # type: torch.nn.Module + log_softmax_module = model.log_softmax # type: torch.nn.Module + + class CanaryLabelScorer(LabelScorerIntf): + def get_initial_state(self, *, batch_size: int, device: torch.device) -> Any: + return { + "step": StateObjIgnored(0), + "prefix_input": torch.tile( + torch.tensor(bos_prefix_seq, device=device)[None, :], [batch_size, 1] + ), # [Batch,InputSeqLen] + "model_state": None, + } + + def score_and_update_state( + self, + *, + prev_state: Any, + prev_label: torch.Tensor, + ) -> Tuple[torch.Tensor, Any]: + """ + :param prev_state: state of the scorer (decoder). any nested structure. + all tensors are expected to have shape [Batch, Beam, ...]. 
+ :param prev_label: shape [Batch, Beam] -> index in [0...Label-1] + :return: (scores, state). + scores: shape [Batch, Beam, Label], log-prob-like scores. + Broadcasting is allowed for any of the dims (e.g. think of :class:`LengthRewardScorer`). + state: all tensors are expected to have shape [Batch, Beam, ...]. + """ + batch_size, beam_size = prev_label.shape + + # Convert all [batch,beam,...] tensors to [batch*beam,...]. + def _map(x): + if x is None: + return None + assert isinstance(x, torch.Tensor) and x.shape[:2] == (batch_size, beam_size) + return x.flatten(0, 1) + + prev_model_state = tree.map_structure(_map, prev_state["model_state"]) + + input = prev_state["prefix_input"] # [batch*beam,in_seq_len] or None + if input is None: + input = prev_label.flatten(0, 1)[:, None] # [batch*beam,1] + + # start_pos is used for the positional encoding added to the embeddings + dec_embed = trafo_decoder_module.embedding.forward( + input, start_pos=prev_state["step"].content + ) # [batch*beam,in_seq_len|1,D] + dec_input_mask = mask_padded_tokens(input, pad_id=pad_id).float() + + import pdb + + pdb.set_trace() + + # decoder_mems_list is a list of size num_layers that cache output activations of shape + # [batch*beam,history,D] + decoder_mems_list = trafo_decoder_module.decoder.forward( + decoder_states=dec_embed, + decoder_mask=dec_input_mask, + encoder_states=enc, + encoder_mask=enc_input_mask, + decoder_mems_list=prev_model_state, + return_mems=True, + return_mems_as_list=True, + ) + + # decoder_mems_list[-1][:, -1:] is the output of the last layer at the current decoding step position + log_probs = log_softmax_module.forward(hidden_states=decoder_mems_list[-1][:, -1:]) # [batch*beam,1,V] + + def _map(x): + assert isinstance(x, torch.Tensor) and x.shape[:1] == (batch_size * beam_size,) + return x.unflatten(0, (batch_size, beam_size)) + + log_probs = log_probs.squeeze(1) # [batch*beam,V] + log_probs = _map(log_probs) # [batch,beam,V] + + # if input is not None: + # for j 
in range(len(decoder_mems_list)): + # decoder_mems_list[j] = decoder_mems_list[j].repeat(beam_size, 1, 1) + + def _map(x): + assert isinstance(x, torch.Tensor) and x.shape[:1] == (batch_size * beam_size,) + return x.unflatten(0, (batch_size, beam_size)) + + decoder_mems_list = tree.map_structure(_map, decoder_mems_list) + + return log_probs, { + "step": StateObjIgnored(prev_state["step"].content + input.size(1)), + "prefix_input": None, + "model_state": decoder_mems_list, + } + + return CanaryLabelScorer() + + +# TODO: make this configurable +beam_search_v5_opts = BeamSearchOptsV5( + beam_size=4, + bos_label=3, + eos_label=2, + num_labels=4128, + length_normalization_exponent=1, + pruning_threshold=0.0, + adaptive_pruning=False, +) + +# debug with batch size 3 and beam size 4: +# running their beam search: +# +# (Pdb) decoder_input_ids +# tensor([[ 3, 4, 8, 4, 10], +# [ 3, 4, 8, 4, 10], +# [ 3, 4, 8, 4, 10]]) +# [batch*1,out_len|5] +# +# (Pdb) decoder_hidden_states.shape +# torch.Size([3, 5, 1024]) # [batch*1,out_len|5,D] +# +# (Pdb) decoder_mems_list[0].shape +# torch.Size([3, 5, 1024]) # [batch*1,out_len|5,D] +# +# (Pdb) log_probs.shape +# torch.Size([3, 1, 4128]) # [batch*1,1,V] +# +# step 2: +# +# (Pdb) decoder_input_ids.shape +# torch.Size([12, 1]) # [batch*beam,out_len|1] +# +# (Pdb) decoder_hidden_states.shape +# torch.Size([12, 1, 1024]) +# +# prev_state: +# (Pdb) decoder_mems_list[0].shape +# torch.Size([12, 5, 1024]) + + +def _transcribe_output_processing_our_beam_search( + outputs, trcfg: MultiTaskTranscriptionConfig +) -> GenericTranscriptionType: + # outputs are returned from `_transcribe_forward` function call + enc_states = outputs.pop("encoder_states") + enc_lens = outputs.pop("encoded_lengths") + enc_mask = outputs.pop("encoder_mask") + + canary_label_scorer = get_our_canary_label_scorer( + model=asr_model, enc=enc_states, enc_input_mask=enc_mask, pad_id=1, bos_prefix_seq=[3, 4, 8, 4, 10] + ) + + seq_targets, _, out_seq_len = beam_search_v5( + 
canary_label_scorer, + batch_size=enc_states.size(0), + max_seq_len=enc_lens, + device=enc_states.device, + opts=beam_search_v5_opts, + ) # [B,Beam,L] + + import pdb + + pdb.set_trace() + + best_hyp_int = seq_targets[:, 0, : out_seq_len[:, 0]] # [B,1,L] + # TODO: convert hyp to text using model.tokenizer + # TODO: filter out EOS? + return best_hyp_int + + +def buffer_audio_and_transcribe( + model: ASRModel, dataset, batch_size: int, pnc: bool, cache_prefix: str, verbose: bool = True +): + buffer = [] + results = [] + for sample in tqdm(dataset_iterator(dataset), desc="Evaluating: Sample id", unit="", disable=not verbose): + buffer.append(sample) + + if len(buffer) == batch_size: + filepaths = write_audio(buffer, cache_prefix) + + if pnc is not None: + transcriptions = model.transcribe( + filepaths, batch_size=batch_size, pnc=False, verbose=False, num_workers=4 + ) + else: + transcriptions = model.transcribe(filepaths, batch_size=batch_size, verbose=False, num_workers=4) + + # if transcriptions form a tuple (from RNNT), extract just "best" hypothesis + if type(transcriptions) == tuple and len(transcriptions) == 2: + transcriptions = transcriptions[0] + results = pack_results(results, buffer, transcriptions) + buffer.clear() + + if len(buffer) > 0: + filepaths = write_audio(buffer, cache_prefix) + if pnc is not None: + transcriptions = model.transcribe(filepaths, batch_size=batch_size, pnc=False, verbose=False) + else: + transcriptions = model.transcribe(filepaths, batch_size=batch_size, verbose=False) + # if transcriptions form a tuple (from RNNT), extract just "best" hypothesis + if type(transcriptions) == tuple and len(transcriptions) == 2: + transcriptions = transcriptions[0] + results = pack_results(results, buffer, transcriptions) + buffer.clear() + + # Delete temp cache dir + if os.path.exists(DATA_CACHE_DIR): + shutil.rmtree(DATA_CACHE_DIR) + + return results + + +def main(args): + if args.device >= 0: + device = torch.device(f"cuda:{args.device}") + else: + 
device = torch.device("cpu") + + global asr_model + asr_model = ASRModel.restore_from(args.model_path, map_location=device) + asr_model.freeze() + + # hook our beam search implementation + asr_model._transcribe_output_processing = _transcribe_output_processing_our_beam_search + + dataset = load_from_disk(args.dataset_path) + + if args.max_eval_samples is not None and args.max_eval_samples > 0: + print(f"Subsampling dataset to first {args.max_eval_samples} samples !") + dataset = dataset.take(args.max_eval_samples) + + dataset = data_utils.prepare_data(dataset) + + predictions = [] + references = [] + + # run streamed inference + cache_prefix = ( + f"{args.model_id.replace('/', '-')}-{args.dataset_path.replace('/', '')}-" + f"{args.dataset.replace('/', '-')}-{args.split}" + ) + results = buffer_audio_and_transcribe(asr_model, dataset, args.batch_size, args.pnc, cache_prefix, verbose=True) + for sample in results: + predictions.append(data_utils.normalizer(sample["pred_text"])) + references.append(sample["reference"]) + + # Write manifest results to args.manifest_path. 
This required modification in normalizer/eval_utils.py script + manifest_path = data_utils.write_manifest( + args.manifest_path, references, predictions, args.model_id, args.dataset_path, args.dataset, args.split + ) + print("Results saved at path:", os.path.abspath(manifest_path)) + + wer = compute_wer(references=references, predictions=predictions) + wer = round(100 * wer, 2) + + print("WER:", wer, "%") + + if args.wer_out_path: + with open(args.wer_out_path, "w") as f: + f.write(f"{wer}\n") + print(f"Wrote WER (%) to {args.wer_out_path}") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + + parser.add_argument("--model_id", type=str, required=True, help="Model ID.") + + parser.add_argument( + "--model_path", + type=str, + required=True, + help="Path to nemo model.", + ) + + parser.add_argument("--dataset_path", type=str, required=True, help="Dataset path.") + parser.add_argument("--dataset", type=str, required=True, help="Dataset name.") + parser.add_argument("--split", type=str, required=True, help="Dataset split.") + + parser.add_argument("--manifest_path", type=str, required=True, help="Path to save the search output.") + + parser.add_argument("--wer_out_path", type=str, default=None, help="Path to save the WER output.") + + parser.add_argument( + "--device", + type=int, + default=-1, + help="The device to run the pipeline on. -1 for CPU (default), 0 for the first GPU and so on.", + ) + parser.add_argument( + "--batch_size", + type=int, + default=32, + help="Number of samples to go through each streamed batch.", + ) + parser.add_argument( + "--max_eval_samples", + type=int, + default=None, + help="Number of samples to be evaluated. Put a lower number e.g. 
64 for testing this script.", + ) + parser.add_argument( + "--pnc", + type=bool, + default=None, + help="flag to indicate inferene in pnc mode for models that support punctuation and capitalization", + ) + parser.add_argument( + "--no-streaming", + dest="streaming", + action="store_false", + help="Choose whether you'd like to download the entire dataset or stream it during the evaluation.", + ) + args = parser.parse_args() + parser.set_defaults(streaming=True) + + main(args) From 681fffe01ea17d46f4277a9104b96346d0c32e89 Mon Sep 17 00:00:00 2001 From: Mohammad Zeineldeen Date: Tue, 18 Jun 2024 13:38:44 +0200 Subject: [PATCH 179/227] fix --- .../canary_aed/nemo/run_eval_beam_search.py | 53 ++++--------------- 1 file changed, 11 insertions(+), 42 deletions(-) diff --git a/users/zeineldeen/experiments/canary_aed/nemo/run_eval_beam_search.py b/users/zeineldeen/experiments/canary_aed/nemo/run_eval_beam_search.py index 1116d62bd..a43f9a30b 100644 --- a/users/zeineldeen/experiments/canary_aed/nemo/run_eval_beam_search.py +++ b/users/zeineldeen/experiments/canary_aed/nemo/run_eval_beam_search.py @@ -153,10 +153,6 @@ def _map(x): ) # [batch*beam,in_seq_len|1,D] dec_input_mask = mask_padded_tokens(input, pad_id=pad_id).float() - import pdb - - pdb.set_trace() - # decoder_mems_list is a list of size num_layers that cache output activations of shape # [batch*beam,history,D] decoder_mems_list = trafo_decoder_module.decoder.forward( @@ -179,10 +175,6 @@ def _map(x): log_probs = log_probs.squeeze(1) # [batch*beam,V] log_probs = _map(log_probs) # [batch,beam,V] - # if input is not None: - # for j in range(len(decoder_mems_list)): - # decoder_mems_list[j] = decoder_mems_list[j].repeat(beam_size, 1, 1) - def _map(x): assert isinstance(x, torch.Tensor) and x.shape[:1] == (batch_size * beam_size,) return x.unflatten(0, (batch_size, beam_size)) @@ -209,36 +201,6 @@ def _map(x): adaptive_pruning=False, ) -# debug with batch size 3 and beam size 4: -# running their beam search: -# -# 
(Pdb) decoder_input_ids -# tensor([[ 3, 4, 8, 4, 10], -# [ 3, 4, 8, 4, 10], -# [ 3, 4, 8, 4, 10]]) -# [batch*1,out_len|5] -# -# (Pdb) decoder_hidden_states.shape -# torch.Size([3, 5, 1024]) # [batch*1,out_len|5,D] -# -# (Pdb) decoder_mems_list[0].shape -# torch.Size([3, 5, 1024]) # [batch*1,out_len|5,D] -# -# (Pdb) log_probs.shape -# torch.Size([3, 1, 4128]) # [batch*1,1,V] -# -# step 2: -# -# (Pdb) decoder_input_ids.shape -# torch.Size([12, 1]) # [batch*beam,out_len|1] -# -# (Pdb) decoder_hidden_states.shape -# torch.Size([12, 1, 1024]) -# -# prev_state: -# (Pdb) decoder_mems_list[0].shape -# torch.Size([12, 5, 1024]) - def _transcribe_output_processing_our_beam_search( outputs, trcfg: MultiTaskTranscriptionConfig @@ -264,10 +226,13 @@ def _transcribe_output_processing_our_beam_search( pdb.set_trace() - best_hyp_int = seq_targets[:, 0, : out_seq_len[:, 0]] # [B,1,L] - # TODO: convert hyp to text using model.tokenizer - # TODO: filter out EOS? - return best_hyp_int + best_hyps = [] + for i in range(seq_targets.shape[0]): + # TODO: convert hyp to text using model.tokenizer + # TODO: filter out EOS? 
+ best_hyp_int = seq_targets[i, 0, : out_seq_len[i, 0]] # [B,1,L] + best_hyps.append(best_hyp_int) + return best_hyps def buffer_audio_and_transcribe( @@ -314,6 +279,10 @@ def buffer_audio_and_transcribe( def main(args): + import better_exchook + + better_exchook.install() + if args.device >= 0: device = torch.device(f"cuda:{args.device}") else: From 0a00b4209d9fa39658e2429ebe8ae30c6ad71b89 Mon Sep 17 00:00:00 2001 From: Mohammad Zeineldeen Date: Tue, 18 Jun 2024 11:46:36 +0000 Subject: [PATCH 180/227] fix enc shape --- .../experiments/canary_aed/nemo/run_eval_beam_search.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/users/zeineldeen/experiments/canary_aed/nemo/run_eval_beam_search.py b/users/zeineldeen/experiments/canary_aed/nemo/run_eval_beam_search.py index a43f9a30b..ef431883c 100644 --- a/users/zeineldeen/experiments/canary_aed/nemo/run_eval_beam_search.py +++ b/users/zeineldeen/experiments/canary_aed/nemo/run_eval_beam_search.py @@ -153,13 +153,17 @@ def _map(x): ) # [batch*beam,in_seq_len|1,D] dec_input_mask = mask_padded_tokens(input, pad_id=pad_id).float() + _, enc_len, enc_dim = enc.size() + enc_input_mask_ = enc_input_mask.repeat(1, beam_size).view(-1, enc_len) + enc_ = enc.repeat(1, beam_size, 1).view(-1, enc_len, enc_dim) + # decoder_mems_list is a list of size num_layers that cache output activations of shape # [batch*beam,history,D] decoder_mems_list = trafo_decoder_module.decoder.forward( decoder_states=dec_embed, decoder_mask=dec_input_mask, - encoder_states=enc, - encoder_mask=enc_input_mask, + encoder_states=enc_, + encoder_mask=enc_input_mask_, decoder_mems_list=prev_model_state, return_mems=True, return_mems_as_list=True, From 5463d1194ae9a596ac19542934886429a1269325 Mon Sep 17 00:00:00 2001 From: Mohammad Zeineldeen Date: Tue, 18 Jun 2024 12:17:20 +0000 Subject: [PATCH 181/227] use expand instead of repeat for efficiency --- .../experiments/canary_aed/nemo/run_eval_beam_search.py | 8 ++------ 1 file changed, 2 
insertions(+), 6 deletions(-) diff --git a/users/zeineldeen/experiments/canary_aed/nemo/run_eval_beam_search.py b/users/zeineldeen/experiments/canary_aed/nemo/run_eval_beam_search.py index ef431883c..767c52503 100644 --- a/users/zeineldeen/experiments/canary_aed/nemo/run_eval_beam_search.py +++ b/users/zeineldeen/experiments/canary_aed/nemo/run_eval_beam_search.py @@ -154,8 +154,8 @@ def _map(x): dec_input_mask = mask_padded_tokens(input, pad_id=pad_id).float() _, enc_len, enc_dim = enc.size() - enc_input_mask_ = enc_input_mask.repeat(1, beam_size).view(-1, enc_len) - enc_ = enc.repeat(1, beam_size, 1).view(-1, enc_len, enc_dim) + enc_input_mask_ = enc_input_mask.unsqueeze(1).expand(-1, beam_size, -1).contiguous().view(-1, enc_len) + enc_ = enc.unsqueeze(1).expand(-1, beam_size, -1, -1).contiguous().view(-1, enc_len, enc_dim) # decoder_mems_list is a list of size num_layers that cache output activations of shape # [batch*beam,history,D] @@ -226,10 +226,6 @@ def _transcribe_output_processing_our_beam_search( opts=beam_search_v5_opts, ) # [B,Beam,L] - import pdb - - pdb.set_trace() - best_hyps = [] for i in range(seq_targets.shape[0]): # TODO: convert hyp to text using model.tokenizer From 8a071b29ca42310b961d29cd285f9c84ec4a4132 Mon Sep 17 00:00:00 2001 From: Mohammad Zeineldeen Date: Tue, 18 Jun 2024 12:30:09 +0000 Subject: [PATCH 182/227] better --- .../canary_aed/nemo/run_eval_beam_search.py | 43 +++++++++++-------- 1 file changed, 26 insertions(+), 17 deletions(-) diff --git a/users/zeineldeen/experiments/canary_aed/nemo/run_eval_beam_search.py b/users/zeineldeen/experiments/canary_aed/nemo/run_eval_beam_search.py index 767c52503..86fae43cd 100644 --- a/users/zeineldeen/experiments/canary_aed/nemo/run_eval_beam_search.py +++ b/users/zeineldeen/experiments/canary_aed/nemo/run_eval_beam_search.py @@ -92,7 +92,7 @@ def pack_results(results: list, buffer, transcriptions): def get_our_canary_label_scorer( - model: ASRModel, enc: torch.Tensor, enc_input_mask: 
torch.Tensor, pad_id: int, bos_prefix_seq: List[int] + model: ASRModel, enc: torch.Tensor, enc_input_mask: torch.Tensor, pad_id: int, bos_prefix_seq: torch.Tensor ) -> LabelScorerIntf: """ Creates a CanaryLabelScorer object that is used in the beam search implementation. @@ -111,9 +111,10 @@ class CanaryLabelScorer(LabelScorerIntf): def get_initial_state(self, *, batch_size: int, device: torch.device) -> Any: return { "step": StateObjIgnored(0), - "prefix_input": torch.tile( - torch.tensor(bos_prefix_seq, device=device)[None, :], [batch_size, 1] - ), # [Batch,InputSeqLen] + # "prefix_input": torch.tile( + # torch.tensor(bos_prefix_seq, device=device)[None, :], [batch_size, 1] + # ), # [Batch,InputSeqLen] + "prefix_input": bos_prefix_seq, # [Batch,InputSeqLen] "model_state": None, } @@ -194,18 +195,6 @@ def _map(x): return CanaryLabelScorer() -# TODO: make this configurable -beam_search_v5_opts = BeamSearchOptsV5( - beam_size=4, - bos_label=3, - eos_label=2, - num_labels=4128, - length_normalization_exponent=1, - pruning_threshold=0.0, - adaptive_pruning=False, -) - - def _transcribe_output_processing_our_beam_search( outputs, trcfg: MultiTaskTranscriptionConfig ) -> GenericTranscriptionType: @@ -213,9 +202,14 @@ def _transcribe_output_processing_our_beam_search( enc_states = outputs.pop("encoder_states") enc_lens = outputs.pop("encoded_lengths") enc_mask = outputs.pop("encoder_mask") + dec_input_ids = outputs.pop("decoder_input_ids") canary_label_scorer = get_our_canary_label_scorer( - model=asr_model, enc=enc_states, enc_input_mask=enc_mask, pad_id=1, bos_prefix_seq=[3, 4, 8, 4, 10] + model=asr_model, + enc=enc_states, + enc_input_mask=enc_mask, + pad_id=asr_model.tokenizer.pad_id, + bos_prefix_seq=dec_input_ids, # [3, 4, 8, 4, 10], ) seq_targets, _, out_seq_len = beam_search_v5( @@ -292,6 +286,17 @@ def main(args): asr_model = ASRModel.restore_from(args.model_path, map_location=device) asr_model.freeze() + global beam_search_v5_opts + beam_search_v5_opts = 
BeamSearchOptsV5( + beam_size=args.beam_size, + bos_label=asr_model.tokenizer.bos_id, + eos_label=asr_model.tokenizer.eos_id, + num_labels=len(asr_model.tokenizer.vocab), + length_normalization_exponent=1, + pruning_threshold=args.pruning_threshold, + adaptive_pruning=args.adaptive_pruning, + ) + # hook our beam search implementation asr_model._transcribe_output_processing = _transcribe_output_processing_our_beam_search @@ -353,6 +358,10 @@ def main(args): parser.add_argument("--wer_out_path", type=str, default=None, help="Path to save the WER output.") + parser.add_argument("--beam_size", type=int, default=4) + parser.add_argument("--pruning_threshold", type=float, default=0.0) + parser.add_argument("--adaptive_pruning", type=bool, default=False) + parser.add_argument( "--device", type=int, From f8399c62abc8111141faf42787e60c549fbea96f Mon Sep 17 00:00:00 2001 From: Mohammad Zeineldeen Date: Tue, 18 Jun 2024 12:34:47 +0000 Subject: [PATCH 183/227] add hyp postprocessing --- .../experiments/canary_aed/nemo/run_eval_beam_search.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/users/zeineldeen/experiments/canary_aed/nemo/run_eval_beam_search.py b/users/zeineldeen/experiments/canary_aed/nemo/run_eval_beam_search.py index 86fae43cd..b5ad60ece 100644 --- a/users/zeineldeen/experiments/canary_aed/nemo/run_eval_beam_search.py +++ b/users/zeineldeen/experiments/canary_aed/nemo/run_eval_beam_search.py @@ -222,10 +222,10 @@ def _transcribe_output_processing_our_beam_search( best_hyps = [] for i in range(seq_targets.shape[0]): - # TODO: convert hyp to text using model.tokenizer - # TODO: filter out EOS? 
- best_hyp_int = seq_targets[i, 0, : out_seq_len[i, 0]] # [B,1,L] - best_hyps.append(best_hyp_int) + best_hyp_int = seq_targets[i, 0, : out_seq_len[i, 0]].tolist() + best_hyp_text = asr_model.tokenizer.ids_to_text(best_hyp_int) + best_hyps.append(asr_model.decoding.strip_special_tokens(best_hyp_text)) + print(best_hyps) return best_hyps From 748596a13bad41737c631b356ab0d8f46b1575dd Mon Sep 17 00:00:00 2001 From: Mohammad Zeineldeen Date: Tue, 18 Jun 2024 12:36:24 +0000 Subject: [PATCH 184/227] better --- .../experiments/canary_aed/nemo/run_eval_beam_search.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/users/zeineldeen/experiments/canary_aed/nemo/run_eval_beam_search.py b/users/zeineldeen/experiments/canary_aed/nemo/run_eval_beam_search.py index b5ad60ece..d50250a3b 100644 --- a/users/zeineldeen/experiments/canary_aed/nemo/run_eval_beam_search.py +++ b/users/zeineldeen/experiments/canary_aed/nemo/run_eval_beam_search.py @@ -209,7 +209,7 @@ def _transcribe_output_processing_our_beam_search( enc=enc_states, enc_input_mask=enc_mask, pad_id=asr_model.tokenizer.pad_id, - bos_prefix_seq=dec_input_ids, # [3, 4, 8, 4, 10], + bos_prefix_seq=dec_input_ids, # [3, 4, 8, 4, 10] ) seq_targets, _, out_seq_len = beam_search_v5( From 01b8c7f1011b0f872a0ac8693625e59b5cd95760 Mon Sep 17 00:00:00 2001 From: Mohammad Zeineldeen Date: Tue, 18 Jun 2024 12:49:18 +0000 Subject: [PATCH 185/227] add beam search --- .../canary_aed/configs/canary_1b_recog.py | 39 ++++++++++++++++--- 1 file changed, 34 insertions(+), 5 deletions(-) diff --git a/users/zeineldeen/experiments/canary_aed/configs/canary_1b_recog.py b/users/zeineldeen/experiments/canary_aed/configs/canary_1b_recog.py index a01b0da88..e316d1e42 100644 --- a/users/zeineldeen/experiments/canary_aed/configs/canary_1b_recog.py +++ b/users/zeineldeen/experiments/canary_aed/configs/canary_1b_recog.py @@ -49,10 +49,15 @@ def py(): dataset_paths = download_test_datasets() model_path = download_canary_1b_model() - 
search_script = tk.Path( + huggface_search_script = tk.Path( "/u/zeineldeen/setups/ubuntu_22_setups/2024-06-07--canary-aed/recipe/i6_experiments/users/zeineldeen/experiments/canary_aed/nemo/run_eval.py", hash_overwrite="run_eval_v1", ) + our_beam_search_script = tk.Path( + "/u/zeineldeen/setups/ubuntu_22_setups/2024-06-07--canary-aed/recipe/i6_experiments/users/zeineldeen/experiments/canary_aed/nemo/run_eval_beam_search.py", + hash_overwrite="run_eval_v2", + ) + # to run canary model, this env has installed nemo toolkit with: # pip3 install git+https://github.com/NVIDIA/NeMo.git@r2.0.0rc0#egg=nemo_toolkit[all] # related issue: https://github.com/huggingface/open_asr_leaderboard/issues/26 @@ -76,7 +81,7 @@ def py(): dataset_path=dataset_paths[test_set], dataset_name=test_set, split=split, - search_script=search_script, + search_script=huggface_search_script, search_args={"batch_size": 64, "pnc": False, "max_eval_samples": -1}, python_exe=python_exe, device="gpu", @@ -85,6 +90,30 @@ def py(): cpu_rqmt=2, ) search_job.rqmt["sbatch_args"] = ["-p", "gpu_24gb"] - search_job.add_alias(f"canary_1b/{test_set}_bs64_wo-pnc") - tk.register_output(f"canary_1b/{test_set}_bs64_wo-pnc/search_out", search_job.out_search_results) - tk.register_output(f"canary_1b/{test_set}_bs64_wo-pnc/wer", search_job.out_wer) + search_job.add_alias(f"canary_1b/huggingface/{test_set}_bs64_greedy") + tk.register_output(f"canary_1b/huggingface/{test_set}_bs64_greedy/search_out", search_job.out_search_results) + tk.register_output(f"canary_1b/huggingface/{test_set}_bs64_greedy/wer", search_job.out_wer) + + # Run with our beam search + for beam_size in [4]: + for test_set, split in TEST_DATASETS.items(): + search_job = SearchJob( + model_id=MODEL_ID, + model_path=model_path, + dataset_path=dataset_paths[test_set], + dataset_name=test_set, + split=split, + search_script=our_beam_search_script, + search_args={"batch_size": 64, "pnc": False, "max_eval_samples": -1, "beam_size": beam_size}, + 
python_exe=python_exe, + device="gpu", + time_rqmt=24, + mem_rqmt=8, + cpu_rqmt=2, + ) + search_job.rqmt["sbatch_args"] = ["-p", "gpu_24gb"] + search_job.add_alias(f"canary_1b/beam_search_v5/{test_set}_bs64_beam{beam_size}") + tk.register_output( + f"canary_1b/beam_search_v5/{test_set}_bs64_beam{beam_size}/search_out", search_job.out_search_results + ) + tk.register_output(f"canary_1b/beam_search_v5/{test_set}_bs64_beam{beam_size}/wer", search_job.out_wer) From f464e26fbd50eb86c09331c8fd695e9cb252f614 Mon Sep 17 00:00:00 2001 From: Mohammad Zeineldeen Date: Tue, 18 Jun 2024 12:50:37 +0000 Subject: [PATCH 186/227] remove print --- .../experiments/canary_aed/nemo/run_eval_beam_search.py | 1 - 1 file changed, 1 deletion(-) diff --git a/users/zeineldeen/experiments/canary_aed/nemo/run_eval_beam_search.py b/users/zeineldeen/experiments/canary_aed/nemo/run_eval_beam_search.py index d50250a3b..06621b306 100644 --- a/users/zeineldeen/experiments/canary_aed/nemo/run_eval_beam_search.py +++ b/users/zeineldeen/experiments/canary_aed/nemo/run_eval_beam_search.py @@ -225,7 +225,6 @@ def _transcribe_output_processing_our_beam_search( best_hyp_int = seq_targets[i, 0, : out_seq_len[i, 0]].tolist() best_hyp_text = asr_model.tokenizer.ids_to_text(best_hyp_int) best_hyps.append(asr_model.decoding.strip_special_tokens(best_hyp_text)) - print(best_hyps) return best_hyps From 63a45f886ab341895be5fd338017b34d2dc0073a Mon Sep 17 00:00:00 2001 From: Albert Zeyer Date: Sun, 16 Jun 2024 12:09:27 +0200 Subject: [PATCH 187/227] more --- .../exp2024_04_23_baselines/ctc.py | 23 ++++++++++++++----- 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py b/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py index 2963c7ec6..d3e020b2e 100644 --- a/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py +++ b/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py @@ -32,6 +32,10 @@ def py(): """Sisyphus entry point""" + from 
i6_experiments.users.zeyer.datasets.librispeech import get_librispeech_log_mel_stats + + feature_stats = get_librispeech_log_mel_stats(_log_mel_feature_dim) + """ Luca: @@ -53,9 +57,10 @@ def py(): - Luca uses larger batch 2_400_000 -> 6_400_000, grad accum 1 -> 2 - Luca uses wd 1e-06 - Luca uses older behavior_version 21 -> 16. + - Luca uses feature normalization (global based on Tedlium statistics...). """ - train_exp( + train_exp( # {"dev-clean": 6.44, "dev-other": 9.77, "test-clean": 6.89, "test-other": 9.98} f"v6-bhv21-24gb-bf16-bs40k-accgrad2-wd1e_6-lrlin1e_5_450k-bpe10k", config_24gb_v6, config_updates={ @@ -63,7 +68,7 @@ def py(): }, ) - train_exp( + train_exp( # {"dev-clean": 6.61, "dev-other": 9.68, "test-clean": 7.01, "test-other": 10.36} f"v6-bhv21-24gb-bf16-bs40k-accgrad2-wd1e_6-lrlin1e_5_600k-bpe10k", config_24gb_v6, config_updates={ @@ -72,6 +77,16 @@ def py(): }, ) + train_exp( + f"v6-bhv21-24gb-bf16-bs40k-accgrad2-wd1e_6-lrlin1e_5_600k-featGN-bpe10k", + config_24gb_v6, + model_config={"feature_stats": {"mean": feature_stats.mean, "std_dev": feature_stats.std_dev}}, + config_updates={ + **_get_cfg_lrlin_oclr_by_bs_nep(40_000, 2000), + "learning_rate_piecewise_steps": [600_000, 900_000, 982_000], + }, + ) + for acc, wd in [ # (5, 1e-5), # 9.90 (5, 1e-3), # 9.53 @@ -196,10 +211,6 @@ def py(): train_vocab_opts={"other_opts": {"enable_sampling": True, "alpha": 0.7}}, ) - from i6_experiments.users.zeyer.datasets.librispeech import get_librispeech_log_mel_stats - - feature_stats = get_librispeech_log_mel_stats(_log_mel_feature_dim) - # Test different feature normalization schemes. # Note: It seems the diff between dev-other and test-other is less here, probably du to the normalization. 
for name, model_opts in { From 68d90d619626a3cc38d7dc51a3aa3cbd3e514dad Mon Sep 17 00:00:00 2001 From: Albert Zeyer Date: Mon, 17 Jun 2024 12:29:15 +0200 Subject: [PATCH 188/227] BatchRenorm with build_from_dict --- users/zeyer/nn_rf/batchnorm.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/users/zeyer/nn_rf/batchnorm.py b/users/zeyer/nn_rf/batchnorm.py index ae92bb6dd..0167e3f04 100644 --- a/users/zeyer/nn_rf/batchnorm.py +++ b/users/zeyer/nn_rf/batchnorm.py @@ -5,7 +5,7 @@ """ from __future__ import annotations -from typing import Optional, Union, Callable +from typing import Optional, Union, Any, Callable from returnn.tensor import Tensor, Dim import returnn.frontend as rf @@ -50,8 +50,8 @@ def __init__( eps: float = 1e-3, use_mask: Optional[bool] = None, unbiased: bool = False, - r_max: Union[float, Callable[[BatchRenorm], float]] = 1.0, - d_max: Union[float, Callable[[BatchRenorm], float]] = 0.0, + r_max: Union[float, Callable[[BatchRenorm], Union[float, Tensor]], Any] = 1.0, + d_max: Union[float, Callable[[BatchRenorm], Union[float, Tensor]], Any] = 0.0, ): """ :param in_dim: the feature dimension of the input @@ -88,6 +88,10 @@ def __init__( self.eps = eps self.use_mask = use_mask self.unbiased = unbiased + if isinstance(r_max, dict): + r_max = rf.build_from_dict(r_max) + if isinstance(d_max, dict): + d_max = rf.build_from_dict(d_max) self.r_max = r_max self.d_max = d_max self.running_mean = rf.Parameter([in_dim], auxiliary=True) @@ -139,7 +143,7 @@ def _update_running_stats(): def _train_mean_std_dev(): inv_std_dev_ = rf.rsqrt(variance_cur_batch + self.eps) - if r_max > 1: + if isinstance(r_max, Tensor) or r_max > 1: inv_std_dev_ *= rf.clip_by_value( rf.rsqrt(self.running_variance + self.eps) * rf.sqrt(rf.stop_gradient(variance_cur_batch) + self.eps), @@ -147,7 +151,7 @@ def _train_mean_std_dev(): r_max, ) mean_ = mean_cur_batch - if d_max > 0: + if isinstance(d_max, Tensor) or d_max > 0: limit = d_max * 
rf.reciprocal(rf.stop_gradient(inv_std_dev_)) mean_ += rf.clip_by_value(self.running_mean - rf.stop_gradient(mean_cur_batch), -limit, limit) return mean_, inv_std_dev_ From 7d87cc735578c5a7533ee2d096e1132a3805afde Mon Sep 17 00:00:00 2001 From: Albert Zeyer Date: Mon, 17 Jun 2024 13:49:33 +0200 Subject: [PATCH 189/227] more --- .../exp2024_04_23_baselines/ctc.py | 38 ++++++++++++++++++- users/zeyer/nn_rf/batchnorm.py | 2 + 2 files changed, 38 insertions(+), 2 deletions(-) diff --git a/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py b/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py index d3e020b2e..ccecb2ed6 100644 --- a/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py +++ b/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py @@ -126,7 +126,7 @@ def py(): "spm1k", # 12.72 "spm_bpe1k", # 11.76 ]: - train_exp( # 8.23 + train_exp( f"v6-bhv20-11gb-f32-bs15k-accgrad1-mgpu4-pavg100-wd1e_2-lrlin1e_5_295k-speedpertV2-{vocab}", config_11gb_v6_f32_accgrad1_mgpu4_pavg100_wd1e_4, config_updates={ @@ -138,6 +138,7 @@ def py(): vocab=vocab, ) + # lrlin1e_5_393k train_exp( f"v6-bhv20-11gb-f32-bs15k-accgrad1-mgpu4-pavg100-wd1e_2-lrlin1e_5_393k-speedpertV2-bpe10k", config_11gb_v6_f32_accgrad1_mgpu4_pavg100_wd1e_4, @@ -237,6 +238,36 @@ def py(): train_vocab_opts={"other_opts": {"enable_sampling": True, "alpha": 0.7}}, ) + from i6_experiments.users.zeyer.nn_rf.batchnorm import BatchRenorm + + for vocab, alpha in [("bpe10k", 0.01)]: # [("bpe10k", 0.01), ("spm10k", 0.7)]: + train_exp( + f"v6-batchRenorm-bhv20-11gb-f32-bs15k-accgrad1-mgpu4-pavg100-wd1e_2-lrlin1e_5_295k-speedpertV2-{vocab}" + f"-{'spmSample' if vocab.startswith('spm') else 'bpeSample'}{str(alpha).replace('.', '')}", + config_11gb_v6_f32_accgrad1_mgpu4_pavg100_wd1e_4, + model_config={ + "conv_norm": rf.build_dict( + BatchRenorm, + r_max=rf.build_dict(rf.PiecewiseLinearStepwiseScheduler, points={5_000: 1.0, 40_000: 3.0}), + d_max=rf.build_dict(rf.PiecewiseLinearStepwiseScheduler, points={5_000: 0.0, 
25_000: 5.0}), + ) + }, + config_updates={ + **_get_cfg_lrlin_oclr_by_bs_nep(15_000, 500), + "optimizer.weight_decay": 1e-2, + "__train_audio_preprocess": speed_pert_librosa_config, + "speed_pert_discrete_values": [0.7, 0.8, 0.9, 1.0, 1.1], + }, + vocab=vocab, + train_vocab_opts={ + "other_opts": ( + {"enable_sampling": True, "alpha": alpha} + if vocab.startswith("spm") + else {"class": "SamplingBytePairEncoding", "breadth_prob": alpha} + ) + }, + ) + # noinspection PyShadowingNames def train_exp( @@ -342,6 +373,9 @@ def ctc_model_def(*, epoch: int, in_dim: Dim, target_dim: Dim) -> Model: # real input is raw audio, internally it does logmel in_dim = Dim(name="logmel", dimension=_log_mel_feature_dim, kind=Dim.Types.Feature) + conv_norm = config.typed_value("conv_norm", None) + conv_norm: Dict[str, Any] = {"class": "rf.BatchNorm", "use_mask": True} if not conv_norm else conv_norm + return Model( in_dim, num_enc_layers=num_enc_layers, @@ -349,7 +383,7 @@ def ctc_model_def(*, epoch: int, in_dim: Dim, target_dim: Dim) -> Model: enc_ff_dim=Dim(name="enc-ff", dimension=2048, kind=Dim.Types.Feature), enc_att_num_heads=8, enc_conformer_layer_opts=dict( - conv_norm_opts=dict(use_mask=True), + conv_norm=conv_norm, self_att_opts=dict( # Shawn et al 2018 style, old RETURNN way. with_bias=False, diff --git a/users/zeyer/nn_rf/batchnorm.py b/users/zeyer/nn_rf/batchnorm.py index 0167e3f04..40b5eba87 100644 --- a/users/zeyer/nn_rf/batchnorm.py +++ b/users/zeyer/nn_rf/batchnorm.py @@ -72,6 +72,7 @@ def __init__( r_max can also be scheduled via a callable, e.g. using rf.get_run_ctx().step inside. The original paper suggests to keep r_max=1.0 for the first 5k steps, then linearly increase to reach r_max=3.0 at 40k steps. + You can use ``rf.build_dict(rf.PiecewiseLinearStepwiseScheduler, points={5_000: 1.0, 40_000: 3.0})``. :param d_max: clip how much we should use the running mean instead of the current batch mean during training. Value should be >= 0.0. 
d_max=0.0 means always use the current batch mean, i.e. like standard batch norm. @@ -79,6 +80,7 @@ def __init__( d_max can also be scheduled via a callable, e.g. using rf.get_run_ctx().step inside. The original paper suggests to keep d_max=0.0 for the first 5k steps, then linearly increase to reach d_max=5.0 at 25k steps. + You can use ``rf.build_dict(rf.PiecewiseLinearStepwiseScheduler, points={5_000: 0.0, 25_000: 5.0})``. """ super().__init__() assert isinstance(in_dim, Dim) From 87f41f25f3a9bcdbd9066bee8194180efae4ee92 Mon Sep 17 00:00:00 2001 From: Albert Zeyer Date: Mon, 17 Jun 2024 14:47:29 +0200 Subject: [PATCH 190/227] small fix --- users/zeyer/experiments/exp2024_04_23_baselines/ctc.py | 1 + 1 file changed, 1 insertion(+) diff --git a/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py b/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py index ccecb2ed6..24e61fa84 100644 --- a/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py +++ b/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py @@ -248,6 +248,7 @@ def py(): model_config={ "conv_norm": rf.build_dict( BatchRenorm, + use_mask=True, r_max=rf.build_dict(rf.PiecewiseLinearStepwiseScheduler, points={5_000: 1.0, 40_000: 3.0}), d_max=rf.build_dict(rf.PiecewiseLinearStepwiseScheduler, points={5_000: 0.0, 25_000: 5.0}), ) From 00dd150ebef00ae7885b14cb8977b5f9c6cba4a6 Mon Sep 17 00:00:00 2001 From: Albert Zeyer Date: Mon, 17 Jun 2024 19:07:18 +0200 Subject: [PATCH 191/227] more --- .../exp2024_04_23_baselines/ctc.py | 59 ++++++++++--------- 1 file changed, 31 insertions(+), 28 deletions(-) diff --git a/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py b/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py index 24e61fa84..a24e29122 100644 --- a/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py +++ b/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py @@ -240,34 +240,37 @@ def py(): from i6_experiments.users.zeyer.nn_rf.batchnorm import BatchRenorm - for vocab, alpha in [("bpe10k", 
0.01)]: # [("bpe10k", 0.01), ("spm10k", 0.7)]: - train_exp( - f"v6-batchRenorm-bhv20-11gb-f32-bs15k-accgrad1-mgpu4-pavg100-wd1e_2-lrlin1e_5_295k-speedpertV2-{vocab}" - f"-{'spmSample' if vocab.startswith('spm') else 'bpeSample'}{str(alpha).replace('.', '')}", - config_11gb_v6_f32_accgrad1_mgpu4_pavg100_wd1e_4, - model_config={ - "conv_norm": rf.build_dict( - BatchRenorm, - use_mask=True, - r_max=rf.build_dict(rf.PiecewiseLinearStepwiseScheduler, points={5_000: 1.0, 40_000: 3.0}), - d_max=rf.build_dict(rf.PiecewiseLinearStepwiseScheduler, points={5_000: 0.0, 25_000: 5.0}), - ) - }, - config_updates={ - **_get_cfg_lrlin_oclr_by_bs_nep(15_000, 500), - "optimizer.weight_decay": 1e-2, - "__train_audio_preprocess": speed_pert_librosa_config, - "speed_pert_discrete_values": [0.7, 0.8, 0.9, 1.0, 1.1], - }, - vocab=vocab, - train_vocab_opts={ - "other_opts": ( - {"enable_sampling": True, "alpha": alpha} - if vocab.startswith("spm") - else {"class": "SamplingBytePairEncoding", "breadth_prob": alpha} - ) - }, - ) + for name, opts in { + "batchRenorm": rf.build_dict( + BatchRenorm, + use_mask=True, + r_max=rf.build_dict(rf.PiecewiseLinearStepwiseScheduler, points={5_000: 1.0, 40_000: 3.0}), + d_max=rf.build_dict(rf.PiecewiseLinearStepwiseScheduler, points={5_000: 0.0, 25_000: 5.0}), + ), + "groupNorm": {"class": rf.GroupNorm, "num_groups": 32}, + "layerNorm": {"class": rf.LayerNorm}, + }.items(): + for vocab, alpha in [("bpe10k", 0.01)]: # [("bpe10k", 0.01), ("spm10k", 0.7)]: + train_exp( + f"v6-{name}-bhv20-11gb-f32-bs15k-accgrad1-mgpu4-pavg100-wd1e_2-lrlin1e_5_295k-speedpertV2-{vocab}" + f"-{'spmSample' if vocab.startswith('spm') else 'bpeSample'}{str(alpha).replace('.', '')}", + config_11gb_v6_f32_accgrad1_mgpu4_pavg100_wd1e_4, + model_config={"conv_norm": opts}, + config_updates={ + **_get_cfg_lrlin_oclr_by_bs_nep(15_000, 500), + "optimizer.weight_decay": 1e-2, + "__train_audio_preprocess": speed_pert_librosa_config, + "speed_pert_discrete_values": [0.7, 0.8, 0.9, 1.0, 
1.1], + }, + vocab=vocab, + train_vocab_opts={ + "other_opts": ( + {"enable_sampling": True, "alpha": alpha} + if vocab.startswith("spm") + else {"class": "SamplingBytePairEncoding", "breadth_prob": alpha} + ) + }, + ) # noinspection PyShadowingNames From d87eb8d01101013174f10d59114751d7cfa07c46 Mon Sep 17 00:00:00 2001 From: Albert Zeyer Date: Mon, 17 Jun 2024 19:09:34 +0200 Subject: [PATCH 192/227] small fix --- users/zeyer/experiments/exp2024_04_23_baselines/ctc.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py b/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py index a24e29122..fc2ed32d1 100644 --- a/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py +++ b/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py @@ -247,8 +247,8 @@ def py(): r_max=rf.build_dict(rf.PiecewiseLinearStepwiseScheduler, points={5_000: 1.0, 40_000: 3.0}), d_max=rf.build_dict(rf.PiecewiseLinearStepwiseScheduler, points={5_000: 0.0, 25_000: 5.0}), ), - "groupNorm": {"class": rf.GroupNorm, "num_groups": 32}, - "layerNorm": {"class": rf.LayerNorm}, + "groupNorm": rf.build_dict(rf.GroupNorm, num_groups=32), + "layerNorm": rf.build_dict(rf.LayerNorm), }.items(): for vocab, alpha in [("bpe10k", 0.01)]: # [("bpe10k", 0.01), ("spm10k", 0.7)]: train_exp( From 9ec8ad098738d32fb65b66dc6bef6ae5a0185a78 Mon Sep 17 00:00:00 2001 From: Albert Zeyer Date: Tue, 18 Jun 2024 10:50:03 +0200 Subject: [PATCH 193/227] reorder code --- users/zeyer/collect_model_dataset_stats.py | 78 +++++++++++----------- 1 file changed, 39 insertions(+), 39 deletions(-) diff --git a/users/zeyer/collect_model_dataset_stats.py b/users/zeyer/collect_model_dataset_stats.py index c65e13c4a..f03f3875c 100644 --- a/users/zeyer/collect_model_dataset_stats.py +++ b/users/zeyer/collect_model_dataset_stats.py @@ -27,6 +27,45 @@ from returnn.tensor import Tensor, Dim, TensorDict +@dataclass +class StatisticsOutput: + """statistics""" + + mean: tk.Path + 
std_dev: tk.Path + min: tk.Path + max: tk.Path + info: tk.Path + + +def collect_log_mel_feature_statistics( + *, dataset: DatasetConfig, dim: int, backend: str = "torch", behavior_version: int = 21, **kwargs +) -> StatisticsOutput: + """ + Get feature stats + + :param dataset: + :param dim: log mel feature dim + :param backend: + :param behavior_version: + :param kwargs: all passed to rf.audio.log_mel_filterbank_from_raw. + Default sampling_rate is 16_000, which is also what we have for Librispeech usually. + Note on log_base: Default is 10.0. + Note that in some earlier setups, and also Mohammads original AED setup, + we used log_base=math.exp(2.3026), which is almost 10.0 but not exactly... + """ + return collect_statistics( + dataset=dataset, + forward_def=_log_mel_stats_returnn_forward, + config={ + "backend": backend, + "behavior_version": behavior_version, + "_audio_feature_dim": dim, + "_audio_feature_opts": kwargs, + }, + ) + + def collect_statistics( *, dataset: DatasetConfig, @@ -73,34 +112,6 @@ def collect_statistics( return StatisticsOutput(**{k: forward_job.out_files[v] for k, v in out_files.items()}) -def collect_log_mel_feature_statistics( - *, dataset: DatasetConfig, dim: int, backend: str = "torch", behavior_version: int = 21, **kwargs -): - """ - Get feature stats - - :param dataset: - :param dim: log mel feature dim - :param backend: - :param behavior_version: - :param kwargs: all passed to rf.audio.log_mel_filterbank_from_raw. - Default sampling_rate is 16_000, which is also what we have for Librispeech usually. - Note on log_base: Default is 10.0. - Note that in some earlier setups, and also Mohammads original AED setup, - we used log_base=math.exp(2.3026), which is almost 10.0 but not exactly... 
- """ - return collect_statistics( - dataset=dataset, - forward_def=_log_mel_stats_returnn_forward, - config={ - "backend": backend, - "behavior_version": behavior_version, - "_audio_feature_dim": dim, - "_audio_feature_opts": kwargs, - }, - ) - - def _log_mel_stats_returnn_forward(source: Tensor, /, in_spatial_dim: Dim, model: Any) -> Tuple[Tensor, Dim]: from returnn.config import get_global_config import returnn.frontend as rf @@ -120,17 +131,6 @@ def _log_mel_stats_returnn_forward(source: Tensor, /, in_spatial_dim: Dim, model return source, out_spatial_dim -@dataclass -class StatisticsOutput: - """statistics""" - - mean: tk.Path - std_dev: tk.Path - min: tk.Path - max: tk.Path - info: tk.Path - - _prior_mean_out_filename = "stats.mean.txt" _prior_std_dev_out_filename = "stats.std_dev.txt" _prior_min_out_filename = "stats.min.txt" From ccdc9e67699fb28be52bd723d06505ba7e71329f Mon Sep 17 00:00:00 2001 From: Albert Zeyer Date: Tue, 18 Jun 2024 11:02:37 +0200 Subject: [PATCH 194/227] comment --- users/zeyer/collect_model_dataset_stats.py | 1 + 1 file changed, 1 insertion(+) diff --git a/users/zeyer/collect_model_dataset_stats.py b/users/zeyer/collect_model_dataset_stats.py index f03f3875c..95cbb70cd 100644 --- a/users/zeyer/collect_model_dataset_stats.py +++ b/users/zeyer/collect_model_dataset_stats.py @@ -113,6 +113,7 @@ def collect_statistics( def _log_mel_stats_returnn_forward(source: Tensor, /, in_spatial_dim: Dim, model: Any) -> Tuple[Tensor, Dim]: + """ForwardDef API""" from returnn.config import get_global_config import returnn.frontend as rf from returnn.tensor import Dim From 17e2b6cc4843a4ebe694e3b7f1fc628db5a6790c Mon Sep 17 00:00:00 2001 From: Albert Zeyer Date: Tue, 18 Jun 2024 11:33:49 +0200 Subject: [PATCH 195/227] prior --- users/zeyer/collect_model_dataset_stats.py | 64 ++++++++++++++++++++++ 1 file changed, 64 insertions(+) diff --git a/users/zeyer/collect_model_dataset_stats.py b/users/zeyer/collect_model_dataset_stats.py index 
95cbb70cd..234dbc96f 100644 --- a/users/zeyer/collect_model_dataset_stats.py +++ b/users/zeyer/collect_model_dataset_stats.py @@ -66,6 +66,45 @@ def collect_log_mel_feature_statistics( ) +def compute_model_softmax_prior_statistics( + *, + model: ModelWithCheckpoint, + model_output_kind: str = "logits", + dataset: DatasetConfig, + backend: str = "torch", + behavior_version: int = 21, + **kwargs, +) -> StatisticsOutput: + """ + Calculate model softmax prior average. + + :param model: after construction, the model will be called as: + ``out, out_spatial_dim = model(input, in_spatial_dim=in_spatial_dim)`` + (This is the RETURNN ISeqDownsamplingEncoder interface.) + ``out.feature_dim`` is expected to be set. + Use ``model_output_kind`` to specify what kind of output you have in the model output ``out``. + (If you have a model with a different interface, just call :collect_statistics` directly + with your custom ``forward_def`` function.) + :param model_output_kind: "logits", "log_prob" or "prob" + :param dataset: + :param backend: + :param behavior_version: + :param kwargs: passed to :func:`collect_statistics` + """ + assert model_output_kind in {"logits", "log_prob", "prob"} + return collect_statistics( + model=model, + dataset=dataset, + forward_def=_model_softmax_prior_returnn_forward, + config={ + "backend": backend, + "behavior_version": behavior_version, + "_model_output_kind": model_output_kind, + }, + **kwargs, + ) + + def collect_statistics( *, dataset: DatasetConfig, @@ -132,6 +171,31 @@ def _log_mel_stats_returnn_forward(source: Tensor, /, in_spatial_dim: Dim, model return source, out_spatial_dim +def _model_softmax_prior_returnn_forward(source: Tensor, /, in_spatial_dim: Dim, model: Any) -> Tuple[Tensor, Dim]: + """ForwardDef API""" + from returnn.config import get_global_config + import returnn.frontend as rf + from returnn.tensor import Tensor, Dim + + out, out_spatial_dim = model(source, in_spatial_dim=in_spatial_dim) + assert isinstance(out, Tensor) and 
isinstance(out_spatial_dim, Dim) + assert out.feature_dim # we expect a feature dim + assert out_spatial_dim in out.dims + + config = get_global_config() + model_output_kind = config.typed_value("_model_output_kind", None) + if model_output_kind == "logits": + out = rf.softmax(out, axis=out.feature_dim) + elif model_output_kind == "log_prob": + out = rf.exp(out) + elif model_output_kind == "prob": + pass + else: + raise ValueError(f"invalid model_output_kind {model_output_kind!r}") + + return out, out_spatial_dim + + _prior_mean_out_filename = "stats.mean.txt" _prior_std_dev_out_filename = "stats.std_dev.txt" _prior_min_out_filename = "stats.min.txt" From 973c32839ef2c9ff97cd799ce34e4dcbafde5cc2 Mon Sep 17 00:00:00 2001 From: Albert Zeyer Date: Tue, 18 Jun 2024 11:34:02 +0200 Subject: [PATCH 196/227] cleanup --- .../zeyer/experiments/exp2024_04_23_baselines/ctc.py | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py b/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py index fc2ed32d1..8e13c8ef8 100644 --- a/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py +++ b/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py @@ -581,17 +581,6 @@ def model_recog( model_recog.batch_size_dependent = False # not totally correct, but we treat it as such... -def _gather_backrefs(s, *, backrefs: Tensor): - if isinstance(s, Tensor): - if backrefs.sparse_dim in s.dims: - return rf.gather(s, indices=backrefs) # really the default case - return s # e.g. 
scalar or so, independent from beam - if isinstance(s, Dim): - assert s.dimension or backrefs not in s.dyn_size_ext.dims # currently not supported, also not expected - return s - raise TypeError(f"_gather_backrefs: unexpected type ({type(s)})") - - class Model(rf.Module): """Model definition""" From 0476918a4fda8326cb418278f59b11eed6ea6ece Mon Sep 17 00:00:00 2001 From: Albert Zeyer Date: Tue, 18 Jun 2024 15:24:40 +0200 Subject: [PATCH 197/227] cache enc beam expansion --- .../canary_aed/nemo/run_eval_beam_search.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/users/zeineldeen/experiments/canary_aed/nemo/run_eval_beam_search.py b/users/zeineldeen/experiments/canary_aed/nemo/run_eval_beam_search.py index 06621b306..52e61f65d 100644 --- a/users/zeineldeen/experiments/canary_aed/nemo/run_eval_beam_search.py +++ b/users/zeineldeen/experiments/canary_aed/nemo/run_eval_beam_search.py @@ -1,6 +1,7 @@ """ Adapted from here: https://github.com/huggingface/open_asr_leaderboard/blob/5c03c1f85a84ab7a991dcc1b3f14905ec6d632c9/nemo_asr/run_eval.py """ + from __future__ import annotations import argparse @@ -98,7 +99,7 @@ def get_our_canary_label_scorer( Creates a CanaryLabelScorer object that is used in the beam search implementation. 
:param model: nemo ASRModel object - :param enc: [B,T] + :param enc: [B,T,D] :param enc_input_mask: [B,T] :param pad_id: :param bos_prefix_seq: @@ -107,6 +108,9 @@ def get_our_canary_label_scorer( trafo_decoder_module = model.transf_decoder # type: torch.nn.Module log_softmax_module = model.log_softmax # type: torch.nn.Module + enc = enc[:, None] # [B,Beam=1,T,D] + enc_input_mask = enc_input_mask[:, None] # [B,Beam=1,T] + class CanaryLabelScorer(LabelScorerIntf): def get_initial_state(self, *, batch_size: int, device: torch.device) -> Any: return { @@ -154,9 +158,12 @@ def _map(x): ) # [batch*beam,in_seq_len|1,D] dec_input_mask = mask_padded_tokens(input, pad_id=pad_id).float() - _, enc_len, enc_dim = enc.size() - enc_input_mask_ = enc_input_mask.unsqueeze(1).expand(-1, beam_size, -1).contiguous().view(-1, enc_len) - enc_ = enc.unsqueeze(1).expand(-1, beam_size, -1, -1).contiguous().view(-1, enc_len, enc_dim) + nonlocal enc, enc_input_mask + if enc.size(1) < beam_size: + enc = enc[:, :1].expand(-1, beam_size, -1, -1).continguous() # [batch,beam,T,D] + enc_input_mask = enc_input_mask[:, :1].expand(-1, beam_size, -1).continguous() # [batch,beam,T] + enc_ = enc[:, :beam_size].flatten(0, 1) + enc_input_mask_ = enc_input_mask[:, :beam_size].flatten(0, 1) # [batch*beam,T] # decoder_mems_list is a list of size num_layers that cache output activations of shape # [batch*beam,history,D] From cd16ba0b4017dd55224c865654909bb47885ab97 Mon Sep 17 00:00:00 2001 From: Mohammad Zeineldeen Date: Tue, 18 Jun 2024 13:40:17 +0000 Subject: [PATCH 198/227] fix bug --- .../experiments/canary_aed/nemo/run_eval_beam_search.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/users/zeineldeen/experiments/canary_aed/nemo/run_eval_beam_search.py b/users/zeineldeen/experiments/canary_aed/nemo/run_eval_beam_search.py index 52e61f65d..cc928f526 100644 --- a/users/zeineldeen/experiments/canary_aed/nemo/run_eval_beam_search.py +++ 
b/users/zeineldeen/experiments/canary_aed/nemo/run_eval_beam_search.py @@ -160,8 +160,8 @@ def _map(x): nonlocal enc, enc_input_mask if enc.size(1) < beam_size: - enc = enc[:, :1].expand(-1, beam_size, -1, -1).continguous() # [batch,beam,T,D] - enc_input_mask = enc_input_mask[:, :1].expand(-1, beam_size, -1).continguous() # [batch,beam,T] + enc = enc[:, :1].expand(-1, beam_size, -1, -1).contiguous() # [batch,beam,T,D] + enc_input_mask = enc_input_mask[:, :1].expand(-1, beam_size, -1).contiguous() # [batch,beam,T] enc_ = enc[:, :beam_size].flatten(0, 1) enc_input_mask_ = enc_input_mask[:, :beam_size].flatten(0, 1) # [batch*beam,T] From 94274d3211c0bc4b5801095cc167837864cbb6e7 Mon Sep 17 00:00:00 2001 From: Mohammad Zeineldeen Date: Wed, 19 Jun 2024 00:37:07 +0200 Subject: [PATCH 199/227] update --- .../experiments/canary_aed/configs/canary_1b_recog.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/users/zeineldeen/experiments/canary_aed/configs/canary_1b_recog.py b/users/zeineldeen/experiments/canary_aed/configs/canary_1b_recog.py index e316d1e42..28db5beb1 100644 --- a/users/zeineldeen/experiments/canary_aed/configs/canary_1b_recog.py +++ b/users/zeineldeen/experiments/canary_aed/configs/canary_1b_recog.py @@ -95,8 +95,10 @@ def py(): tk.register_output(f"canary_1b/huggingface/{test_set}_bs64_greedy/wer", search_job.out_wer) # Run with our beam search - for beam_size in [4]: + for beam_size in [1, 4, 8]: for test_set, split in TEST_DATASETS.items(): + if test_set == "gigaspeech": + continue # TODO: need to ask nick to set a reservaion tag to increase time limit search_job = SearchJob( model_id=MODEL_ID, model_path=model_path, @@ -107,11 +109,11 @@ def py(): search_args={"batch_size": 64, "pnc": False, "max_eval_samples": -1, "beam_size": beam_size}, python_exe=python_exe, device="gpu", - time_rqmt=24, + time_rqmt=0.5, mem_rqmt=8, cpu_rqmt=2, ) - search_job.rqmt["sbatch_args"] = ["-p", "gpu_24gb"] + search_job.rqmt["sbatch_args"] = ["-p", 
"gpu_test_24gb"] search_job.add_alias(f"canary_1b/beam_search_v5/{test_set}_bs64_beam{beam_size}") tk.register_output( f"canary_1b/beam_search_v5/{test_set}_bs64_beam{beam_size}/search_out", search_job.out_search_results From bd0dfe47f1aadd11de547e46ff4f9bdedcdb4221 Mon Sep 17 00:00:00 2001 From: Albert Zeyer Date: Tue, 18 Jun 2024 16:52:27 +0200 Subject: [PATCH 200/227] more --- users/zeyer/experiments/exp2024_04_23_baselines/ctc.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py b/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py index 8e13c8ef8..138d56d79 100644 --- a/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py +++ b/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py @@ -165,7 +165,8 @@ def py(): # The higher the alpha, the longer (the reverse as for SPM Unigram). # See archive/returnn-spm_bpe10-sample.config. # spm_bpe10k no sampling: 7.97 - ("spm_bpe10k", 0.001), + ("spm_bpe10k", 0.0001), + ("spm_bpe10k", 0.001), # 8.15 ("spm_bpe10k", 0.005), # 8.66 ("spm_bpe10k", 0.01), # 8.99 # ("spm_bpe10k", 0.3), # broken @@ -174,7 +175,7 @@ def py(): # See archive/returnn-bpe10-sample.config. # The higher the alpha, the longer the sequence, i.e. the more aggressive the sampling. 
# bpe10k no sampling: 8.23 - ("bpe10k", 0.005), + ("bpe10k", 0.005), # 7.32 ("bpe10k", 0.01), # 7.10 ("bpe10k", 0.02), ]: From 293e11cbf2d70a035285ad3de975d787e7c6f69e Mon Sep 17 00:00:00 2001 From: Albert Zeyer Date: Tue, 18 Jun 2024 20:33:16 +0200 Subject: [PATCH 201/227] more --- users/zeyer/experiments/exp2024_04_23_baselines/aed.py | 3 ++- users/zeyer/experiments/exp2024_04_23_baselines/ctc.py | 2 ++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/users/zeyer/experiments/exp2024_04_23_baselines/aed.py b/users/zeyer/experiments/exp2024_04_23_baselines/aed.py index 17e26e500..c9319cb50 100644 --- a/users/zeyer/experiments/exp2024_04_23_baselines/aed.py +++ b/users/zeyer/experiments/exp2024_04_23_baselines/aed.py @@ -105,7 +105,8 @@ def py(): # See archive/returnn-bpe10-sample.config. # The higher the alpha, the longer the sequence, i.e. the more aggressive the sampling. # bpe10k without sampling: 5.32 - ("bpe10k", 0.01), + ("bpe10k", 0.01), # 5.25 + ("bpe10k", 0.02), ]: train_exp( f"v6-bhv20-11gb-f32-bs15k-accgrad1-mgpu4-pavg100-wd1e_2-lrlin1e_5_295k-speedpertV2-{vocab}" diff --git a/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py b/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py index 138d56d79..ee1af23f1 100644 --- a/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py +++ b/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py @@ -154,6 +154,8 @@ def py(): # Testing different vocabs together with sampling. for vocab, alpha in [ + # spm20k no sampling: 7.44 + ("spm20k", 0.7), # See archive/returnn-spm10-sample.config for playing around with alpha and checking avg seq len. # The lower the alpha, the longer the seq len, i.e. the more aggressive the sampling. 
# spm10k no sampling: 8.12 From 818a37b9aaf81a2383c3adccc3db1ecf85499ffe Mon Sep 17 00:00:00 2001 From: Albert Zeyer Date: Tue, 18 Jun 2024 20:35:44 +0200 Subject: [PATCH 202/227] more --- users/zeyer/experiments/exp2024_04_23_baselines/ctc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py b/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py index ee1af23f1..28fae2cce 100644 --- a/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py +++ b/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py @@ -179,7 +179,7 @@ def py(): # bpe10k no sampling: 8.23 ("bpe10k", 0.005), # 7.32 ("bpe10k", 0.01), # 7.10 - ("bpe10k", 0.02), + ("bpe10k", 0.02), # 7.35 ]: train_exp( f"v6-bhv20-11gb-f32-bs15k-accgrad1-mgpu4-pavg100-wd1e_2-lrlin1e_5_295k-speedpertV2-{vocab}" From 0ed3c7d5e41e1acce7ece8bec31df7d0cb863276 Mon Sep 17 00:00:00 2001 From: Albert Zeyer Date: Wed, 19 Jun 2024 10:37:31 +0200 Subject: [PATCH 203/227] more --- .../exp2024_04_23_baselines/ctc.py | 36 +++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py b/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py index 28fae2cce..ef17629d9 100644 --- a/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py +++ b/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py @@ -117,6 +117,7 @@ def py(): }, ) + # Comparing vocabs. Note that max_seq_length_default_target=75 always here... for vocab in [ "spm20k", # 7.44 "bpe10k", # 8.23 @@ -138,6 +139,39 @@ def py(): vocab=vocab, ) + # Comparing vocabs with better settings: feature norm, sampling, no max seq len. + for vocab, alpha in [ + ("spm20k", 0.7), + ("bpe10k", 0.01), + ("spm10k", 0.7), + # ("spm_bpe10k", ...), # unclear what sampling scheme... + ("spm4k", 0.7), + ("spm1k", 0.7), + # ("spm_bpe1k", ...) 
+ ]: + train_exp( + f"v6-bhv20-11gb-f32-bs15k-accgrad1-mgpu4-pavg100-maxSeqLenNone" + f"-wd1e_2-lrlin1e_5_295k-featBN-speedpertV2-{vocab}" + f"-{'spmSample' if vocab.startswith('spm') else 'bpeSample'}{str(alpha).replace('.', '')}", + config_11gb_v6_f32_accgrad1_mgpu4_pavg100_wd1e_4, + model_config={"feature_batch_norm": True}, + config_updates={ + **_get_cfg_lrlin_oclr_by_bs_nep(15_000, 500), + "optimizer.weight_decay": 1e-2, + "__train_audio_preprocess": speed_pert_librosa_config, + "speed_pert_discrete_values": [0.7, 0.8, 0.9, 1.0, 1.1], + "max_seq_length_default_target": None, + }, + vocab=vocab, + train_vocab_opts={ + "other_opts": ( + {"enable_sampling": True, "alpha": alpha} + if vocab.startswith("spm") + else {"class": "SamplingBytePairEncoding", "breadth_prob": alpha} + ) + }, + ) + # lrlin1e_5_393k train_exp( f"v6-bhv20-11gb-f32-bs15k-accgrad1-mgpu4-pavg100-wd1e_2-lrlin1e_5_393k-speedpertV2-bpe10k", @@ -201,6 +235,7 @@ def py(): }, ) + # Checking EOS. train_exp( # 7.36 (vs without EOS 6.99), so EOS made it worse "v6-bhv20-11gb-f32-bs15k-accgrad1-mgpu4-pavg100-wd1e_2-lrlin1e_5_295k-speedpertV2-spm10k-eos-spmSample07", config_11gb_v6_f32_accgrad1_mgpu4_pavg100_wd1e_4, @@ -243,6 +278,7 @@ def py(): from i6_experiments.users.zeyer.nn_rf.batchnorm import BatchRenorm + # Replacing batch norm in the Conformer Convolution Module with other normalization schemes. 
for name, opts in { "batchRenorm": rf.build_dict( BatchRenorm, From a2b6d49e4babcc8aefebf333b3c7390308821219 Mon Sep 17 00:00:00 2001 From: Albert Zeyer Date: Wed, 19 Jun 2024 10:46:18 +0200 Subject: [PATCH 204/227] LS spm vocab alias --- users/zeyer/datasets/librispeech.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/users/zeyer/datasets/librispeech.py b/users/zeyer/datasets/librispeech.py index 630fc3fbe..4b7c94a1f 100644 --- a/users/zeyer/datasets/librispeech.py +++ b/users/zeyer/datasets/librispeech.py @@ -30,6 +30,8 @@ from i6_experiments.users.zeyer.collect_model_dataset_stats import StatisticsOutput +_alias_prefix = "datasets/LibriSpeech/" + librispeech_ogg_zip_dict = librispeech.get_ogg_zip_dict() # $ ls -la /u/zeyer/setups/librispeech/dataset/tars/ @@ -59,6 +61,7 @@ def _get_spm_vocab( *, dim: Union[int, str], model_type: SentencePieceType = SentencePieceType.UNIGRAM ) -> SentencePieceModel: + dim_str = str(dim) if isinstance(dim, str): # Not sure if power-of-two or just multiple-of-64, but 10240 has more 2s in it (2048*5) than 10048. 
dim = {"20k": 20_480, "10k": 10_240, "5k": 5_120, "4k": 4_096, "1k": 1_024}[dim] @@ -76,6 +79,7 @@ def _get_spm_vocab( "eos_id": 0, # default is 2 }, ) + _spm_train_job.add_alias(_alias_prefix + f"vocab/spm{dim_str}{model_type}-train") spm = SentencePieceModel( dim=dim, model_file=_spm_train_job.out_model, From e15023279bbd9ee5d15928017f2300ba51d8530a Mon Sep 17 00:00:00 2001 From: Albert Zeyer Date: Wed, 19 Jun 2024 10:49:19 +0200 Subject: [PATCH 205/227] make private --- users/zeyer/datasets/librispeech.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/users/zeyer/datasets/librispeech.py b/users/zeyer/datasets/librispeech.py index 4b7c94a1f..d5bf26cfd 100644 --- a/users/zeyer/datasets/librispeech.py +++ b/users/zeyer/datasets/librispeech.py @@ -32,7 +32,7 @@ _alias_prefix = "datasets/LibriSpeech/" -librispeech_ogg_zip_dict = librispeech.get_ogg_zip_dict() +_librispeech_ogg_zip_dict = librispeech.get_ogg_zip_dict() # $ ls -la /u/zeyer/setups/librispeech/dataset/tars/ # -rw-r--r-- 1 zeyer assi 360977013 Feb 26 2018 dev-clean.zip @@ -43,7 +43,7 @@ # -rw-r--r-- 1 zeyer assi 6625963133 Feb 26 2018 train-clean-100.zip # -rw-r--r-- 1 zeyer assi 23919296392 Feb 26 2018 train-clean-360.zip # -rw-r--r-- 1 zeyer assi 31839925140 Feb 26 2018 train-other-500.zip -librispeech_tars_zip_base_path = tk.Path( +_librispeech_tars_zip_base_path = tk.Path( "/u/zeyer/setups/librispeech/dataset/tars", hash_overwrite="Librispeech-tars-zip-base-path" ) @@ -181,7 +181,7 @@ def _get_dataset(key: str, *, subset=None, train_partition_epoch=None, training: parts = [part for part in _Parts if part.startswith(key)] assert parts, f"invalid key {key!r}" for part in parts: - files += [librispeech_ogg_zip_dict[part]] + files += [_librispeech_ogg_zip_dict[part]] d = { "class": "OggZipDataset", "path": files, @@ -328,7 +328,7 @@ def get_dataset(self, key: str, *, training: bool = False, subset: Optional[int] parts = [part for part in _Parts if part.startswith(key)] assert 
parts, f"invalid key {key!r}" for part in parts: - files += [librispeech_ogg_zip_dict[part]] + files += [_librispeech_ogg_zip_dict[part]] d = { "class": "OggZipDataset", "path": files, @@ -457,7 +457,7 @@ def get_main_dataset(self) -> Dict[str, Any]: def get_dataset(self, key: str, *, training: bool = False, subset: Optional[int] = None) -> Dict[str, Any]: d = { "class": "LibriSpeechCorpus", - "path": librispeech_tars_zip_base_path, + "path": _librispeech_tars_zip_base_path, "use_zip": True, "prefix": key, "use_cache_manager": True, From ac81ea89a68169106a1c4d3a0a8a014c3d0dee0b Mon Sep 17 00:00:00 2001 From: Albert Zeyer Date: Wed, 19 Jun 2024 10:51:29 +0200 Subject: [PATCH 206/227] move --- users/zeyer/datasets/librispeech.py | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/users/zeyer/datasets/librispeech.py b/users/zeyer/datasets/librispeech.py index d5bf26cfd..e4790b229 100644 --- a/users/zeyer/datasets/librispeech.py +++ b/users/zeyer/datasets/librispeech.py @@ -34,19 +34,6 @@ _librispeech_ogg_zip_dict = librispeech.get_ogg_zip_dict() -# $ ls -la /u/zeyer/setups/librispeech/dataset/tars/ -# -rw-r--r-- 1 zeyer assi 360977013 Feb 26 2018 dev-clean.zip -# -rw-r--r-- 1 zeyer assi 338709788 Feb 26 2018 dev-other.zip -# -rw-r--r-- 1 zeyer assi 1024 Feb 27 2018 .history.zeyer -# -rw-r--r-- 1 zeyer assi 369096021 Feb 26 2018 test-clean.zip -# -rw-r--r-- 1 zeyer assi 353841318 Feb 26 2018 test-other.zip -# -rw-r--r-- 1 zeyer assi 6625963133 Feb 26 2018 train-clean-100.zip -# -rw-r--r-- 1 zeyer assi 23919296392 Feb 26 2018 train-clean-360.zip -# -rw-r--r-- 1 zeyer assi 31839925140 Feb 26 2018 train-other-500.zip -_librispeech_tars_zip_base_path = tk.Path( - "/u/zeyer/setups/librispeech/dataset/tars", hash_overwrite="Librispeech-tars-zip-base-path" -) - # Get Bliss corpus. Same audio format as in ogg_zip, so already there anyway due to how we created the ogg_zip. # WARNING: Do not use these directly... 
It will keep another ogg copy of the audio... # However, these are used later in the scoring, so when changing them, make sure it's optional, @@ -372,6 +359,19 @@ class LibrispeechOldFlacTarZip(DatasetConfig): i.e. keeping the original flac files inside the zip files. """ + # $ ls -la /u/zeyer/setups/librispeech/dataset/tars/ + # -rw-r--r-- 1 zeyer assi 360977013 Feb 26 2018 dev-clean.zip + # -rw-r--r-- 1 zeyer assi 338709788 Feb 26 2018 dev-other.zip + # -rw-r--r-- 1 zeyer assi 1024 Feb 27 2018 .history.zeyer + # -rw-r--r-- 1 zeyer assi 369096021 Feb 26 2018 test-clean.zip + # -rw-r--r-- 1 zeyer assi 353841318 Feb 26 2018 test-other.zip + # -rw-r--r-- 1 zeyer assi 6625963133 Feb 26 2018 train-clean-100.zip + # -rw-r--r-- 1 zeyer assi 23919296392 Feb 26 2018 train-clean-360.zip + # -rw-r--r-- 1 zeyer assi 31839925140 Feb 26 2018 train-other-500.zip + _librispeech_tars_zip_base_path = tk.Path( + "/u/zeyer/setups/librispeech/dataset/tars", hash_overwrite="Librispeech-tars-zip-base-path" + ) + def __init__( self, *, @@ -457,7 +457,7 @@ def get_main_dataset(self) -> Dict[str, Any]: def get_dataset(self, key: str, *, training: bool = False, subset: Optional[int] = None) -> Dict[str, Any]: d = { "class": "LibriSpeechCorpus", - "path": _librispeech_tars_zip_base_path, + "path": self._librispeech_tars_zip_base_path, "use_zip": True, "prefix": key, "use_cache_manager": True, From 222a1c59635f74efe1636f9ffe1357237b705787 Mon Sep 17 00:00:00 2001 From: Albert Zeyer Date: Wed, 19 Jun 2024 11:11:07 +0200 Subject: [PATCH 207/227] lazy, aliases --- users/zeyer/datasets/librispeech.py | 49 ++++++++++++++++++++--------- 1 file changed, 34 insertions(+), 15 deletions(-) diff --git a/users/zeyer/datasets/librispeech.py b/users/zeyer/datasets/librispeech.py index e4790b229..2a068c7c0 100644 --- a/users/zeyer/datasets/librispeech.py +++ b/users/zeyer/datasets/librispeech.py @@ -32,16 +32,35 @@ _alias_prefix = "datasets/LibriSpeech/" -_librispeech_ogg_zip_dict = 
librispeech.get_ogg_zip_dict() -# Get Bliss corpus. Same audio format as in ogg_zip, so already there anyway due to how we created the ogg_zip. -# WARNING: Do not use these directly... It will keep another ogg copy of the audio... -# However, these are used later in the scoring, so when changing them, make sure it's optional, -# to not break hashes of old setups. -_bliss_corpus_dict = librispeech.get_bliss_corpus_dict(audio_format="ogg") -_corpus_text_dicts = {k: CorpusToTextDictJob(v, gzip=True).out_dictionary for k, v in _bliss_corpus_dict.items()} -_train_corpus_text_dict = _corpus_text_dicts["train-other-960"] -_train_corpus_text = TextDictToTextLinesJob(_train_corpus_text_dict, gzip=True).out_text_lines +@cache +def _get_librispeech_ogg_zip_dict() -> Dict[str, tk.Path]: + return librispeech.get_ogg_zip_dict() + + +@cache +def _get_bliss_corpus_dict() -> Dict[str, tk.Path]: + # Get Bliss corpus. Same audio format as in ogg_zip, so already there anyway due to how we created the ogg_zip. + # WARNING: Do not use these directly... It will keep another ogg copy of the audio... + # However, these are used later in the scoring, so when changing them, make sure it's optional, + # to not break hashes of old setups. 
+ return librispeech.get_bliss_corpus_dict(audio_format="ogg") + + +@cache +def _get_corpus_text_dict(key: str) -> tk.Path: + job = CorpusToTextDictJob(_get_bliss_corpus_dict()[key], gzip=True) + job.add_alias(_alias_prefix + f"{key.replace('-', '_')}_corpus_text_dict") + return job.out_dictionary + + +@cache +def _get_train_corpus_text() -> tk.Path: + key = "train-other-960" + train_corpus_text_dict = _get_corpus_text_dict(key) + job = TextDictToTextLinesJob(train_corpus_text_dict, gzip=True) + job.add_alias(_alias_prefix + f"{key.replace('-', '_')}_corpus_text_lines") + return job.out_text_lines @cache @@ -56,7 +75,7 @@ def _get_spm_vocab( # https://github.com/google/sentencepiece/blob/master/doc/options.md _spm_train_job = TrainSentencePieceJob( - training_text=_train_corpus_text, + training_text=_get_train_corpus_text(), vocab_size=dim, model_type=model_type, additional_options={ @@ -66,7 +85,7 @@ def _get_spm_vocab( "eos_id": 0, # default is 2 }, ) - _spm_train_job.add_alias(_alias_prefix + f"vocab/spm{dim_str}{model_type}-train") + _spm_train_job.add_alias(_alias_prefix + f"vocab/spm_{model_type.name.lower()}_{dim_str}_train") spm = SentencePieceModel( dim=dim, model_file=_spm_train_job.out_model, @@ -168,7 +187,7 @@ def _get_dataset(key: str, *, subset=None, train_partition_epoch=None, training: parts = [part for part in _Parts if part.startswith(key)] assert parts, f"invalid key {key!r}" for part in parts: - files += [_librispeech_ogg_zip_dict[part]] + files += [_get_librispeech_ogg_zip_dict()[part]] d = { "class": "OggZipDataset", "path": files, @@ -315,7 +334,7 @@ def get_dataset(self, key: str, *, training: bool = False, subset: Optional[int] parts = [part for part in _Parts if part.startswith(key)] assert parts, f"invalid key {key!r}" for part in parts: - files += [_librispeech_ogg_zip_dict[part]] + files += [_get_librispeech_ogg_zip_dict()[part]] d = { "class": "OggZipDataset", "path": files, @@ -652,7 +671,7 @@ def _score_recog_out_v1(dataset: 
DatasetConfig, recog_output: RecogOutput) -> Sc hyp_words = recog_output.output corpus_name = dataset.get_main_name() - bliss_corpus = _bliss_corpus_dict[corpus_name] + bliss_corpus = _get_bliss_corpus_dict()[corpus_name] search_ctm = SearchWordsToCTMJob(recog_words_file=hyp_words, bliss_corpus=bliss_corpus).out_ctm_file stm_file = CorpusToStmJob(bliss_corpus=bliss_corpus).out_stm_path @@ -674,7 +693,7 @@ def _score_recog_out_v2(dataset: DatasetConfig, recog_output: RecogOutput) -> Sc hyp_words = recog_output.output corpus_name = dataset.get_main_name() - corpus_text_dict = _corpus_text_dicts[corpus_name] + corpus_text_dict = _get_corpus_text_dict(corpus_name) # Arbitrary seg length time. The jobs SearchWordsDummyTimesToCTMJob and TextDictToStmJob # serialize two points after decimal, so long seqs (>1h or so) might be problematic, # and no reason not to just use a high value here to avoid this problem whenever we get to it. From 323b0f8e0e49f46dfdca2586ce7df59452fd5079 Mon Sep 17 00:00:00 2001 From: "luca.gaudino" Date: Wed, 19 Jun 2024 13:23:29 +0200 Subject: [PATCH 208/227] update and test rf vs torch mhsa --- users/gaudino/datasets/librispeech.py | 11 + .../rnnt/decoder/experimental_rnnt_decoder.py | 6 + .../rnnt/decoder/rnnt_beam_search.py | 1 + .../conformer_import_moh_att_train.py | 58 ++++- .../librispeech_960/_import_model_nick.py | 181 +++++++++----- .../_test_returnn_torch_mhsa.py | 140 +++++++++++ .../librispeech_960/conformer_ctc_train.py | 19 ++ .../librispeech_960/conformer_rnnt_train.py | 88 +++++-- .../rf/conformer_ctc/model_conformer_ctc.py | 3 +- .../rf/conformer_rnnt/model_conformer_rnnt.py | 232 ++++++++++++++++-- .../asr/rf/conformer_rnnt/model_recog_rnnt.py | 2 + 11 files changed, 626 insertions(+), 115 deletions(-) create mode 100644 users/gaudino/experiments/rf_conformer_rnnt_2024/librispeech_960/_test_returnn_torch_mhsa.py diff --git a/users/gaudino/datasets/librispeech.py b/users/gaudino/datasets/librispeech.py index be0d7d821..7ee057ea6 
100644 --- a/users/gaudino/datasets/librispeech.py +++ b/users/gaudino/datasets/librispeech.py @@ -53,6 +53,15 @@ # unknown_label="", unknown_label=None, ) +bpe5k = Bpe( + dim=5_048, + eos_idx=0, + bos_idx=0, + codes=generic_job_output("i6_core/text/label/subword_nmt/train/ReturnnTrainBpeJob.yH3Z10x9CgDt/output/bpe.codes"), + vocab=generic_job_output("i6_core/text/label/subword_nmt/train/ReturnnTrainBpeJob.yH3Z10x9CgDt/output/bpe.vocab"), + # unknown_label="", + unknown_label=None, +) _Parts = ["train-clean-100", "train-clean-360", "train-other-500", "dev-clean", "dev-other", "test-clean", "test-other"] @@ -310,6 +319,8 @@ def get_librispeech_task_raw(*, vocab: VocabConfig, **dataset_train_opts) -> Tas def get_librispeech_task_bpe10k_raw(**dataset_train_opts) -> Task: return get_librispeech_task_raw(vocab=bpe10k, **dataset_train_opts) +def get_librispeech_task_bpe5k_raw(**dataset_train_opts) -> Task: + return get_librispeech_task_raw(vocab=bpe5k, **dataset_train_opts) def _bpe_to_words(bpe: RecogOutput) -> RecogOutput: """BPE to words""" diff --git a/users/gaudino/experiments/ctc_rnnt_standalone_2024/librispeech_960/pytorch_networks/rnnt/decoder/experimental_rnnt_decoder.py b/users/gaudino/experiments/ctc_rnnt_standalone_2024/librispeech_960/pytorch_networks/rnnt/decoder/experimental_rnnt_decoder.py index 139fef54e..bccd05905 100644 --- a/users/gaudino/experiments/ctc_rnnt_standalone_2024/librispeech_960/pytorch_networks/rnnt/decoder/experimental_rnnt_decoder.py +++ b/users/gaudino/experiments/ctc_rnnt_standalone_2024/librispeech_960/pytorch_networks/rnnt/decoder/experimental_rnnt_decoder.py @@ -76,12 +76,18 @@ def forward(self, input: torch.Tensor, lengths: torch.Tensor) -> Tuple[torch.Ten """ squeezed_features = torch.squeeze(input) + breakpoint() with torch.no_grad(): audio_features, audio_features_len = self.feature_extraction(squeezed_features, lengths) mask = mask_tensor(audio_features, audio_features_len) + breakpoint() + encoder_out, out_mask = 
self.encoder(audio_features, mask) + + breakpoint() + encoder_out = self.mapping(encoder_out) encoder_out_lengths = torch.sum(out_mask, dim=1) # [B, T] -> [B] diff --git a/users/gaudino/experiments/ctc_rnnt_standalone_2024/librispeech_960/pytorch_networks/rnnt/decoder/rnnt_beam_search.py b/users/gaudino/experiments/ctc_rnnt_standalone_2024/librispeech_960/pytorch_networks/rnnt/decoder/rnnt_beam_search.py index d8a488928..996d818bf 100644 --- a/users/gaudino/experiments/ctc_rnnt_standalone_2024/librispeech_960/pytorch_networks/rnnt/decoder/rnnt_beam_search.py +++ b/users/gaudino/experiments/ctc_rnnt_standalone_2024/librispeech_960/pytorch_networks/rnnt/decoder/rnnt_beam_search.py @@ -77,6 +77,7 @@ def forward_semi_batched( Returns: List[Hypothesis]: top-``beam_width`` hypotheses found by beam search. """ + if input.dim() != 3: raise ValueError("input must be of shape (B, T, D)") diff --git a/users/gaudino/experiments/rf_conformer_att_2023/librispeech_960/conformer_import_moh_att_train.py b/users/gaudino/experiments/rf_conformer_att_2023/librispeech_960/conformer_import_moh_att_train.py index 475f6d6d7..a86dfd478 100644 --- a/users/gaudino/experiments/rf_conformer_att_2023/librispeech_960/conformer_import_moh_att_train.py +++ b/users/gaudino/experiments/rf_conformer_att_2023/librispeech_960/conformer_import_moh_att_train.py @@ -323,8 +323,35 @@ def sis_run_with_prefix(prefix_name: Optional[str] = None): # # OOM in ep 889 # post_config_updates={"PYTORCH_CUDA_ALLOC_CONF": "backend:cudaMallocAsync"}, # ) - model = train_exp( # 5.41 - "base-24gb-v6-lrlin1e_5_600k", + + + # model = train_exp( # 5.41 + # "base-24gb-v6-lrlin1e_5_600k", + # config_24gb_v6, + # config_updates={ + # "learning_rate": 1.0, + # "dynamic_learning_rate": dyn_lr_piecewise_linear, + # # total steps after 2000 epochs: 982.312 + # "learning_rate_piecewise_steps": [600_000, 900_000, 982_000], + # "learning_rate_piecewise_values": [1e-5, 1e-3, 1e-5, 1e-6], + # }, + # ) + # model = train_exp( # 5.42 + # 
"base-24gb-v6-lrlin1e_5_600k_noCTC", + # config_24gb_v6, + # config_updates={ + # "learning_rate": 1.0, + # "dynamic_learning_rate": dyn_lr_piecewise_linear, + # # total steps after 2000 epochs: 982.312 + # "learning_rate_piecewise_steps": [600_000, 900_000, 982_000], + # "learning_rate_piecewise_values": [1e-5, 1e-3, 1e-5, 1e-6], + # "aux_loss_layers": [], + # }, + # ) + + # train with bpe5k vocab + model = train_exp( + "base-24gb-v6-lrlin1e_5_600k-bpe5k", config_24gb_v6, config_updates={ "learning_rate": 1.0, @@ -332,10 +359,12 @@ def sis_run_with_prefix(prefix_name: Optional[str] = None): # total steps after 2000 epochs: 982.312 "learning_rate_piecewise_steps": [600_000, 900_000, 982_000], "learning_rate_piecewise_values": [1e-5, 1e-3, 1e-5, 1e-6], + "bpe_size": "BPE5k", }, + bpe_size="BPE5k", ) - model = train_exp( # 5.42 - "base-24gb-v6-lrlin1e_5_600k_noCTC", + model = train_exp( + "base-24gb-v6-lrlin1e_5_600k_noCTC-bpe5k", config_24gb_v6, config_updates={ "learning_rate": 1.0, @@ -344,8 +373,11 @@ def sis_run_with_prefix(prefix_name: Optional[str] = None): "learning_rate_piecewise_steps": [600_000, 900_000, 982_000], "learning_rate_piecewise_values": [1e-5, 1e-3, 1e-5, 1e-6], "aux_loss_layers": [], + "bpe_size": "BPE5k", }, + bpe_size="BPE5k", ) + # All beam search experiments using model_recog_pure_torch, beam_search_sep_ended_keep_v6. 
# for name, recog_config in { # "beam12-batch200-lenReward01": { @@ -1027,6 +1059,7 @@ def train_exp( fine_tune: Optional[Union[int, List[Tuple[int, Dict[str, Any]]]]] = None, time_rqmt: Optional[int] = None, model_avg: bool = False, + bpe_size: str = "BPE10k", ) -> ModelWithCheckpoints: """ Train experiment @@ -1034,13 +1067,13 @@ def train_exp( from i6_experiments.users.gaudino.experiments.rf_conformer_att_2023.train import ( train, ) - from i6_experiments.users.zeyer.recog import recog_training_exp + from i6_experiments.users.gaudino.recog_2 import recog_training_exp if _sis_prefix is None: _sis_setup_global_prefix() prefix = _sis_prefix + "/" + name - task = _get_ls_task() + task = _get_ls_task(bpe_size) config = config.copy() config = dict_update_deep(config, config_updates, config_deletes) if "__num_epochs" in config: @@ -1127,16 +1160,21 @@ def train_exp( _ls_task = None -def _get_ls_task(): +def _get_ls_task(bpe_size): global _ls_task if _ls_task: return _ls_task - from i6_experiments.users.zeyer.datasets.librispeech import ( - get_librispeech_task_bpe10k_raw, + from i6_experiments.users.gaudino.datasets.librispeech import ( + get_librispeech_task_bpe10k_raw, get_librispeech_task_bpe5k_raw ) - _ls_task = get_librispeech_task_bpe10k_raw(with_eos_postfix=True) + if bpe_size == "BPE10k": + _ls_task = get_librispeech_task_bpe10k_raw(with_eos_postfix=True) + + if bpe_size == "BPE5k": + _ls_task = get_librispeech_task_bpe5k_raw(with_eos_postfix=True) + return _ls_task diff --git a/users/gaudino/experiments/rf_conformer_rnnt_2024/librispeech_960/_import_model_nick.py b/users/gaudino/experiments/rf_conformer_rnnt_2024/librispeech_960/_import_model_nick.py index 0a4835e40..d944d6c95 100644 --- a/users/gaudino/experiments/rf_conformer_rnnt_2024/librispeech_960/_import_model_nick.py +++ b/users/gaudino/experiments/rf_conformer_rnnt_2024/librispeech_960/_import_model_nick.py @@ -18,7 +18,9 @@ MakeModel, ) -from 
i6_experiments.users.gaudino.models.asr.rf.conformer_rnnt.model_conformer_rnnt import MakeModel as MakeModelRNNT +from i6_experiments.users.gaudino.models.asr.rf.conformer_rnnt.model_conformer_rnnt import ( + MakeModelV2 as MakeModelRNNT, +) from i6_experiments.users.gaudino.models.asr.rf.nn_lm.lm_import_2023_11_09 import ( MakeModel as MakeModelLM, @@ -38,7 +40,9 @@ _nick_pure_torch_rnnt_ckpt_path = "/work/asr4/rossenbach/sisyphus_work_folders/tts_decoder_asr_work/i6_core/returnn/training/ReturnnTrainingJob.6lwn4XuFkhkI/output/models/epoch.250.pt" -from i6_experiments.users.gaudino.experiments.conformer_att_2023.tedlium2.model_ckpt_info import models +from i6_experiments.users.gaudino.experiments.conformer_att_2023.tedlium2.model_ckpt_info import ( + models, +) def convert_checkpoint( @@ -63,13 +67,12 @@ def convert_checkpoint( print(f"{k}: {v.shape if hasattr(v, 'shape') else v}") # print(reader.debug_string().decode("utf-8")) - print() - print("Creating model...") rf.select_backend_torch() - model = MakeModelRNNT(80, 1_057)() + # model = MakeModelRNNT(80, 1_057)() + model = MakeModelRNNT(80, 5048)() print("Created model:", model) print("Model parameters:") @@ -83,7 +86,8 @@ def convert_checkpoint( print("Create ParamMapping...") param_mapping = {} _add_params_predictor_joiner(param_mapping) - # _add_params_conformer(param_mapping, prefix="") + _add_params_conformer(param_mapping, prefix="") + # if not ctc_only: # _add_params_att_decoder(param_mapping) # _add_params_trafo_lm(param_mapping) @@ -91,14 +95,13 @@ def convert_checkpoint( # _add_params_conformer(param_mapping, prefix="sep_enc_ctc_") for name, param in model.named_parameters(): - if name in param_mapping: - assert isinstance(name, str) - assert isinstance(param, rf.Parameter) + assert isinstance(name, str) + assert isinstance(param, rf.Parameter) - value = map_param_func(ckpt, name, param, param_mapping) - assert isinstance(value, numpy.ndarray) - # noinspection PyProtectedMember - 
param._raw_backend.set_parameter_initial_value(param, value) + value = map_param_func(ckpt, name, param, param_mapping) + assert isinstance(value, numpy.ndarray) + # noinspection PyProtectedMember + param._raw_backend.set_parameter_initial_value(param, value) epoch = 1 if epoch is None: @@ -114,11 +117,9 @@ def convert_checkpoint( pt_model = rf_module_to_pt_module(model) - breakpoint() - if save_model: os.makedirs(out_dir, exist_ok=True) - filename = out_dir + "/" + ckpt_name + ".pt" + filename = out_dir + "/" + ckpt_name # + ".pt" print(f"*** saving PyTorch model checkpoint: {filename}") torch.save( {"model": pt_model.state_dict(), "epoch": epoch, "step": step}, filename @@ -139,98 +140,145 @@ def convert_checkpoint( os.symlink(os.path.basename(meta_filename), symlink_filename_2) # assert os.path.exists(self.out_checkpoint.get_path()) + +_transpose_list = [ + "encoder_out_linear.weight", + "encoder.input_projection.weight", + "joiner.linear.weight", + "predictor.linear.weight", +] + +for layer_idx in range(12): + _transpose_list.append(f"encoder.layers.{layer_idx}.ffn1.linear_ff.weight") + _transpose_list.append(f"encoder.layers.{layer_idx}.ffn1.linear_out.weight") + _transpose_list.append(f"encoder.layers.{layer_idx}.ffn2.linear_ff.weight") + _transpose_list.append(f"encoder.layers.{layer_idx}.ffn2.linear_out.weight") + + _transpose_list.append( + f"encoder.layers.{layer_idx}.conv_block.positionwise_conv1.weight" + ) + _transpose_list.append( + f"encoder.layers.{layer_idx}.self_att.qkv.weight" + ) + + def _add_params_conformer(param_mapping: Dict[str, str], prefix: str): # rf -> pt # frontend - for layer_idx in [0, 1, 2]: - orig_name = "conv0" if layer_idx == 0 else f"subsample_conv{layer_idx - 1}" + for layer_idx in [0, 1, 2, 3]: + orig_name = f"conformer.frontend.conv{layer_idx + 1}" param_mapping.update( { prefix - + f"encoder.input_layer.conv_layers.{layer_idx}.filter": f"{orig_name}/W", + + f"enc_input_layer.conv_layers.{layer_idx}.filter": 
f"{orig_name}.weight", prefix - + f"encoder.input_layer.conv_layers.{layer_idx}.bias": f"{orig_name}/bias", + + f"enc_input_layer.conv_layers.{layer_idx}.bias": f"{orig_name}.bias", } ) param_mapping.update( { - prefix + "encoder.input_projection.weight": "source_linear/W", + prefix + + "encoder.input_projection.weight": "conformer.frontend.linear.weight", + prefix + "encoder.input_projection.bias": "conformer.frontend.linear.bias", # prefix + "ctc.weight": "ctc/W", # prefix + "ctc.bias": "ctc/b", - prefix + "enc_aux_logits_12.weight": "ctc/W", - prefix + "enc_aux_logits_12.bias": "ctc/b", + # prefix + "enc_aux_logits_12.weight": "ctc/W", + # prefix + "enc_aux_logits_12.bias": "ctc/b", } ) # conformer for layer_idx in range(12): + orig_name_prefix = f"conformer.module_list.{layer_idx}." # FF for sub in [1, 2]: param_mapping[ prefix + f"encoder.layers.{layer_idx}.ffn{sub}.linear_ff.weight" - ] = f"conformer_block_{layer_idx + 1:02d}_ffmod_{sub}_ff1/W" + ] = (orig_name_prefix + f"ff{sub}.linear_ff.weight") param_mapping[ prefix + f"encoder.layers.{layer_idx}.ffn{sub}.linear_ff.bias" - ] = f"conformer_block_{layer_idx + 1:02d}_ffmod_{sub}_ff1/b" + ] = (orig_name_prefix + f"ff{sub}.linear_ff.bias") param_mapping[ prefix + f"encoder.layers.{layer_idx}.ffn{sub}.linear_out.weight" - ] = f"conformer_block_{layer_idx + 1:02d}_ffmod_{sub}_ff2/W" + ] = (orig_name_prefix + f"ff{sub}.linear_out.weight") param_mapping[ prefix + f"encoder.layers.{layer_idx}.ffn{sub}.linear_out.bias" - ] = f"conformer_block_{layer_idx + 1:02d}_ffmod_{sub}_ff2/b" + ] = (orig_name_prefix + f"ff{sub}.linear_out.bias") param_mapping[ prefix + f"encoder.layers.{layer_idx}.ffn{sub}_layer_norm.scale" - ] = f"conformer_block_{layer_idx + 1:02d}_ffmod_{sub}_ln/scale" + ] = (orig_name_prefix + f"ff{sub}.layer_norm.weight") param_mapping[ prefix + f"encoder.layers.{layer_idx}.ffn{sub}_layer_norm.bias" - ] = f"conformer_block_{layer_idx + 1:02d}_ffmod_{sub}_ln/bias" + ] = (orig_name_prefix + 
f"ff{sub}.layer_norm.bias") # conv param_mapping[ prefix + f"encoder.layers.{layer_idx}.conv_block.positionwise_conv1.weight" - ] = f"conformer_block_{layer_idx + 1:02d}_conv_mod_pointwise_conv1/W" + ] = (orig_name_prefix + "conv.pointwise_conv1.weight") param_mapping[ prefix + f"encoder.layers.{layer_idx}.conv_block.positionwise_conv1.bias" - ] = f"conformer_block_{layer_idx + 1:02d}_conv_mod_pointwise_conv1/b" + ] = (orig_name_prefix + "conv.pointwise_conv1.bias") param_mapping[ prefix + f"encoder.layers.{layer_idx}.conv_block.depthwise_conv.filter" - ] = f"conformer_block_{layer_idx + 1:02d}_conv_mod_depthwise_conv2/W" + ] = (orig_name_prefix + "conv.depthwise_conv.weight") param_mapping[ prefix + f"encoder.layers.{layer_idx}.conv_block.depthwise_conv.bias" - ] = f"conformer_block_{layer_idx + 1:02d}_conv_mod_depthwise_conv2/bias" + ] = (orig_name_prefix + "conv.depthwise_conv.bias") param_mapping[ prefix + f"encoder.layers.{layer_idx}.conv_block.positionwise_conv2.weight" - ] = f"conformer_block_{layer_idx + 1:02d}_conv_mod_pointwise_conv2/W" + ] = (orig_name_prefix + "conv.pointwise_conv2.weight") param_mapping[ prefix + f"encoder.layers.{layer_idx}.conv_block.positionwise_conv2.bias" - ] = f"conformer_block_{layer_idx + 1:02d}_conv_mod_pointwise_conv2/b" - param_mapping[ - prefix + f"encoder.layers.{layer_idx}.conv_layer_norm.scale" - ] = f"conformer_block_{layer_idx + 1:02d}_conv_mod_ln/scale" - param_mapping[ - prefix + f"encoder.layers.{layer_idx}.conv_layer_norm.bias" - ] = f"conformer_block_{layer_idx + 1:02d}_conv_mod_ln/bias" + ] = (orig_name_prefix + "conv.pointwise_conv2.bias") + param_mapping[prefix + f"encoder.layers.{layer_idx}.conv_block.norm.gamma"] = ( + orig_name_prefix + "conv.norm.weight" + ) + param_mapping[prefix + f"encoder.layers.{layer_idx}.conv_block.norm.beta"] = ( + orig_name_prefix + "conv.norm.bias" + ) + param_mapping[prefix + f"encoder.layers.{layer_idx}.conv_layer_norm.scale"] = ( + orig_name_prefix + "conv.layer_norm.weight" + 
) + param_mapping[prefix + f"encoder.layers.{layer_idx}.conv_layer_norm.bias"] = ( + orig_name_prefix + "conv.layer_norm.bias" + ) # self-att - param_mapping[ - prefix + f"encoder.layers.{layer_idx}.self_att.qkv.weight" - ] = f"conformer_block_{layer_idx + 1:02d}_self_att/QKV" - param_mapping[ - prefix + f"encoder.layers.{layer_idx}.self_att.proj.weight" - ] = f"conformer_block_{layer_idx + 1:02d}_self_att_linear/W" + param_mapping[prefix + f"encoder.layers.{layer_idx}.self_att.qkv.weight"] = ( + orig_name_prefix + "mhsa.mhsa.in_proj_weight" + ) + param_mapping[prefix + f"encoder.layers.{layer_idx}.self_att.qkv.bias"] = ( + orig_name_prefix + "mhsa.mhsa.in_proj_bias" + ) + param_mapping[prefix + f"encoder.layers.{layer_idx}.self_att.proj.weight"] = ( + orig_name_prefix + "mhsa.mhsa.out_proj.weight" + ) + param_mapping[prefix + f"encoder.layers.{layer_idx}.self_att.proj.bias"] = ( + orig_name_prefix + "mhsa.mhsa.out_proj.bias" + ) param_mapping[ prefix + f"encoder.layers.{layer_idx}.self_att_layer_norm.scale" - ] = f"conformer_block_{layer_idx + 1:02d}_self_att_ln/scale" + ] = (orig_name_prefix + "mhsa.layernorm.weight") param_mapping[ prefix + f"encoder.layers.{layer_idx}.self_att_layer_norm.bias" - ] = f"conformer_block_{layer_idx + 1:02d}_self_att_ln/bias" - param_mapping[ - prefix + f"encoder.layers.{layer_idx}.self_att.learned_pos_emb.pos_emb" - ] = f"conformer_block_{layer_idx + 1:02d}_self_att_ln_rel_pos_enc/encoding_matrix" + ] = (orig_name_prefix + "mhsa.layernorm.bias") + # param_mapping[ + # prefix + f"encoder.layers.{layer_idx}.self_att.learned_pos_emb.pos_emb" + # ] = f"conformer_block_{layer_idx + 1:02d}_self_att_ln_rel_pos_enc/encoding_matrix" + # final layer norm - param_mapping[ - prefix + f"encoder.layers.{layer_idx}.final_layer_norm.scale" - ] = f"conformer_block_{layer_idx + 1:02d}_ln/scale" - param_mapping[ - prefix + f"encoder.layers.{layer_idx}.final_layer_norm.bias" - ] = f"conformer_block_{layer_idx + 1:02d}_ln/bias" + param_mapping[prefix + 
f"encoder.layers.{layer_idx}.final_layer_norm.scale"] = ( + orig_name_prefix + "final_layer_norm.weight" + ) + param_mapping[prefix + f"encoder.layers.{layer_idx}.final_layer_norm.bias"] = ( + orig_name_prefix + "final_layer_norm.bias" + ) + + # output layer + param_mapping.update( + { + prefix + "encoder_out_linear.weight": "encoder_out_linear.weight", + prefix + "encoder_out_linear.bias": "encoder_out_linear.bias", + } + ) + def _add_params_predictor_joiner(param_mapping: Dict[str, str]): # add params of trafo lm @@ -238,7 +286,6 @@ def _add_params_predictor_joiner(param_mapping: Dict[str, str]): param_mapping.update( { f"predictor.layers.{layer_idx}.ff_weight": f"predictor.lstm_layers.{layer_idx}.weight_ih_l0", - f"predictor.layers.{layer_idx}.rec_weight": f"predictor.lstm_layers.{layer_idx}.weight_hh_l0", f"predictor.layers.{layer_idx}.bias": f"predictor.lstm_layers.{layer_idx}.bias_ih_l0", } @@ -274,12 +321,14 @@ def map_param_func( assert isinstance(var, rf.Parameter) if name in param_mapping: - breakpoint() var_name = param_mapping[name] - assert name in ckpt["model"].keys(), f"missing {var_name}" - value = ckpt["model"][name].numpy() + assert var_name in ckpt["model"].keys(), f"missing {var_name}" + value = ckpt["model"][var_name].numpy() assert isinstance(value, numpy.ndarray) + if name in _transpose_list: + value = value.T + assert ( value.shape == var.batch_shape ), f"new param {name} {var.batch_shape} vs ckpt param {var_name} {value.shape}" @@ -328,6 +377,10 @@ def map_param_func( raise NotImplementedError(f"cannot map {name!r} {var}") - if __name__ == "__main__": - convert_checkpoint(ckpt_path=_nick_pure_torch_rnnt_ckpt_path, print_params=True, out_dir="", save_model=False) \ No newline at end of file + convert_checkpoint( + ckpt_path=_nick_pure_torch_rnnt_ckpt_path, + print_params=True, + 
out_dir="/work/asr3/zeineldeen/hiwis/luca.gaudino/setups-data/2023-08-10--rf-librispeech/work/i6_experiments/users/gaudino/returnn/convert_ckpt_rf/librispeech/rnnt_nick_240614", + save_model=True, + ) diff --git a/users/gaudino/experiments/rf_conformer_rnnt_2024/librispeech_960/_test_returnn_torch_mhsa.py b/users/gaudino/experiments/rf_conformer_rnnt_2024/librispeech_960/_test_returnn_torch_mhsa.py new file mode 100644 index 000000000..11b545e0d --- /dev/null +++ b/users/gaudino/experiments/rf_conformer_rnnt_2024/librispeech_960/_test_returnn_torch_mhsa.py @@ -0,0 +1,140 @@ +# test whether the torch and returnn implementation of the mhsa layer are equivalent + +import torch + +import returnn.frontend as rf + +from returnn.tensor import Dim + +import itertools + +torch.backends.mha.set_fastpath_enabled(False) + +rf.select_backend_torch() +rf.init_forward_step_run_ctx() + +spatial_dim = Dim(3, name="spatial_dim") +out_dim = Dim(512, name="out_dim") + +random_opts = { + "distribution": "normal", + "dtype": "float32", +} + +rf.set_random_seed(1) + +rf_input = rf.random(dims=[Dim(1), spatial_dim, out_dim], **random_opts) + +qkv_weight = rf.random(dims=[out_dim, 3*out_dim], **random_opts) +# qkv_weight = rf.full(dims=[out_dim, 3*out_dim], fill_value=0.0, dtype="float32") +# qkv_weight.raw_tensor[:,1024:] = 0.0 +# qkv_weight.raw_tensor[:,0:1023] = 0.0 +# +# eye = torch.eye(512) +# zeros = torch.zeros(512, 512) +# +# qkv_raw = torch.cat([eye, eye, zeros], dim=1) +# +# qkv_weight.raw_tensor = qkv_raw + +qkv_bias = rf.random(dims=[3*out_dim], **random_opts) +# qkv_bias = rf.full(dims=[3*out_dim], fill_value=0.0, dtype="float32") +# qkv_bias.raw_tensor[1024:] = 0.0 + +proj_weight = rf.random(dims=[out_dim, out_dim], **random_opts) +proj_bias = rf.random(dims=[out_dim], **random_opts) + +torch_input = rf_input.raw_tensor + +rf_mhsa = rf.SelfAttention( + in_dim=out_dim, + proj_dim=out_dim, + key_dim_total=out_dim, + value_dim_total=out_dim, + num_heads=1, + att_dropout=0.1, +) 
+ +rf_mhsa.qkv.weight._raw_backend.set_parameter_initial_value(rf_mhsa.qkv.weight, qkv_weight) +rf_mhsa.qkv.bias._raw_backend.set_parameter_initial_value(rf_mhsa.qkv.bias, qkv_bias) + +rf_mhsa.proj.weight._raw_backend.set_parameter_initial_value(rf_mhsa.proj.weight, proj_weight.raw_tensor) +rf_mhsa.proj.bias._raw_backend.set_parameter_initial_value(rf_mhsa.proj.bias, proj_bias) + +torch_mhsa = torch.nn.MultiheadAttention( + 512, 1, dropout=0.1, batch_first=True, + ) + +state_dict = torch_mhsa.state_dict() +state_dict['in_proj_weight'] = qkv_weight.raw_tensor.T +state_dict['in_proj_bias'] = qkv_bias.raw_tensor +state_dict['out_proj.weight'] = proj_weight.raw_tensor +state_dict['out_proj.bias'] = proj_bias.raw_tensor + +torch_mhsa.load_state_dict(state_dict) +torch_mhsa.eval() + +rf_output = rf_mhsa(rf_input, axis=spatial_dim) +torch_output, _ = torch_mhsa(torch_input, torch_input, torch_input, key_padding_mask=None, need_weights=False) + +print("RF output") +print(rf_output.raw_tensor) +print(rf_output.raw_tensor.shape) +print("---------------------------") +print("Torch output") +print(torch_output) +print(torch_output.shape) +print("Same: ", torch.allclose(rf_output.raw_tensor, torch_output, atol=1e-6)) + +## extended checks + +if False: + q_raw, k_raw, v_raw = torch.split(qkv_weight.raw_tensor, 512, dim=1) + + weights = [q_raw, k_raw, v_raw] + + any = False + + for perm in itertools.permutations(weights): + for q_inv, k_inv, v_inv in itertools.product([0,1], [0,1], [0,1]): + q_raw, k_raw, v_raw = perm + + if q_inv == 1: + q_raw = q_raw.T + if k_inv == 1: + k_raw = k_raw.T + if v_inv == 1: + v_raw = v_raw.T + + qkv_weight_adj_raw = torch.cat([q_raw.T, k_raw.T, v_raw.T], dim=1) + + state_dict = torch_mhsa.state_dict() + state_dict['in_proj_weight'] = qkv_weight_adj_raw.T + state_dict['in_proj_bias'] = qkv_bias.raw_tensor + state_dict['out_proj.weight'] = proj_weight.raw_tensor + state_dict['out_proj.bias'] = proj_bias.raw_tensor + + 
torch_mhsa.load_state_dict(state_dict) + torch_mhsa.eval() + + rf_output = rf_mhsa(rf_input, axis=spatial_dim) + torch_output, _ = torch_mhsa(torch_input, torch_input, torch_input, key_padding_mask=None, need_weights=False) + + print("RF output") + print(rf_output.raw_tensor) + print(rf_output.raw_tensor.shape) + print("---------------------------") + print("Torch output") + print(torch_output) + print(torch_output.shape) + + if torch.allclose(rf_output.raw_tensor, torch_output, atol=1e-6): + print("Match with perm: ", perm) + print("Match with inversions: ", q_inv, k_inv, v_inv) + any = True + break + + if any: + break + + print("Done") diff --git a/users/gaudino/experiments/rf_conformer_rnnt_2024/librispeech_960/conformer_ctc_train.py b/users/gaudino/experiments/rf_conformer_rnnt_2024/librispeech_960/conformer_ctc_train.py index 29ff6ef1f..3787a9f76 100644 --- a/users/gaudino/experiments/rf_conformer_rnnt_2024/librispeech_960/conformer_ctc_train.py +++ b/users/gaudino/experiments/rf_conformer_rnnt_2024/librispeech_960/conformer_ctc_train.py @@ -222,6 +222,25 @@ def sis_run_with_prefix(prefix_name: Optional[str] = None): ) + train_exp( # dev-other + "base-24gb-lrlin1e_5_600k_ctc_only_aux4_8_40subsample", + config_24gb_v6, + config_updates={ + "learning_rate": 1.0, + "dynamic_learning_rate": dyn_lr_piecewise_linear, + # total steps after 2000 epochs: 982.312 + "learning_rate_piecewise_steps": [600_000, 900_000, 982_000], + "learning_rate_piecewise_values": [1e-5, 1e-3, 1e-5, 1e-6], + "mel_normalization_ted2": False, + "conv_2nd_stride": 2, + }, + search_config = { + "mel_normalization_ted2": False, + }, + with_eos_postfix=False, + + ) + _torch_ckpt_path = "/u/luca.gaudino/setups/2023-08-10--rf-librispeech/work/i6_core/returnn/training/ReturnnTrainingJob.AWwVft0oGy8e/output/models/epoch.1981.pt" diff --git a/users/gaudino/experiments/rf_conformer_rnnt_2024/librispeech_960/conformer_rnnt_train.py 
b/users/gaudino/experiments/rf_conformer_rnnt_2024/librispeech_960/conformer_rnnt_train.py index 3ce450f69..604e6afc0 100644 --- a/users/gaudino/experiments/rf_conformer_rnnt_2024/librispeech_960/conformer_rnnt_train.py +++ b/users/gaudino/experiments/rf_conformer_rnnt_2024/librispeech_960/conformer_rnnt_train.py @@ -37,7 +37,7 @@ ModelWithCheckpoint, ) -from i6_experiments.users.gaudino.models.asr.rf.conformer_rnnt.model_conformer_rnnt import from_scratch_model_def, from_scratch_training +from i6_experiments.users.gaudino.models.asr.rf.conformer_rnnt.model_conformer_rnnt import from_scratch_model_def, from_scratch_training, from_scratch_model_def_v2 from i6_experiments.users.gaudino.models.asr.rf.conformer_rnnt.model_recog_rnnt import model_recog @@ -86,24 +86,47 @@ def sis_run_with_prefix(prefix_name: Optional[str] = None): # train_exp("base-11gb", config_11gb, gpu_mem=11) # train_exp("base-11gb-v1", my_config_11gb, num_epochs=400, gpu_mem=11) - train_exp( - "from-scratch-24gb_aux4_8", - config_24gb_v6, - config_updates={ - "batch_size": 8_000 * _batch_size_factor, - "learning_rate": 1.0, - "dynamic_learning_rate": dyn_lr_piecewise_linear, - # total steps after 2000 epochs: 982.312 - "learning_rate_piecewise_steps": [600_000, 900_000, 982_000], - "learning_rate_piecewise_values": [1e-5, 1e-3, 1e-5, 1e-6], - "mel_normalization_ted2": False, - }, - config_deletes=["torch_amp"], - search_config={ + # train_exp( + # "from-scratch-24gb_aux4_8", + # config_24gb_v6, + # config_updates={ + # "batch_size": 8_000 * _batch_size_factor, + # "learning_rate": 1.0, + # "dynamic_learning_rate": dyn_lr_piecewise_linear, + # # total steps after 2000 epochs: 982.312 + # "learning_rate_piecewise_steps": [600_000, 900_000, 982_000], + # "learning_rate_piecewise_values": [1e-5, 1e-3, 1e-5, 1e-6], + # "mel_normalization_ted2": False, + # }, + # config_deletes=["torch_amp"], + # search_config={ + # "mel_normalization_ted2": False, + # }, + # num_epochs=400, + # gpu_mem=24, + # ) + + ## 
recog rnnt BPE5k nick + + # imported checkpoint + _torch_ckpt_path = "/work/asr3/zeineldeen/hiwis/luca.gaudino/setups-data/2023-08-10--rf-librispeech/work/i6_experiments/users/gaudino/returnn/convert_ckpt_rf/librispeech/rnnt_nick_240614/epoch.250.pt" + + new_ckpt_path = tk.Path( + _torch_ckpt_path, + hash_overwrite= "rnnt_nick" + "_torch_rf_ckpt", + ) + new_ckpt = PtCheckpoint(new_ckpt_path) + + _recog( + "model_recogs/rnnt_nick_1/rnnt_beam_search/recog_results", + ModelWithCheckpoint( + definition=from_scratch_model_def_v2, checkpoint=new_ckpt + ), + model_recog, + dev_sets=["dev-other"], + recog_config={ "mel_normalization_ted2": False, }, - num_epochs=400, - gpu_mem=24, ) @@ -153,7 +176,7 @@ def _recog( if recog_def is None: recog_def = model_recog - task = _get_ted2_task() + task = _get_ls_task(bpe_size="BPE5k") res = recog_model( task, @@ -194,7 +217,7 @@ def train_exp( _sis_setup_global_prefix() prefix = _sis_prefix + "/" + name - task = _get_ls_task() + task = _get_ls_task(bpe_size="BPE10k") config = config.copy() config = dict_update_deep(config, config_updates, config_deletes) if "__num_epochs" in config: @@ -282,20 +305,37 @@ def train_exp( _ls_task = None _ted2_task = None - -def _get_ls_task(): +def _get_ls_task(bpe_size): global _ls_task if _ls_task: return _ls_task - from i6_experiments.users.zeyer.datasets.librispeech import ( - get_librispeech_task_bpe10k_raw, + from i6_experiments.users.gaudino.datasets.librispeech import ( + get_librispeech_task_bpe10k_raw, get_librispeech_task_bpe5k_raw ) - _ls_task = get_librispeech_task_bpe10k_raw(with_eos_postfix=True) + if bpe_size == "BPE10k": + _ls_task = get_librispeech_task_bpe10k_raw(with_eos_postfix=True) + + if bpe_size == "BPE5k": + _ls_task = get_librispeech_task_bpe5k_raw(with_eos_postfix=True) + return _ls_task +# def _get_ls_task(): +# global _ls_task +# if _ls_task: +# return _ls_task +# +# from i6_experiments.users.zeyer.datasets.librispeech import ( +# get_librispeech_task_bpe10k_raw, +# ) +# 
+# _ls_task = get_librispeech_task_bpe10k_raw(with_eos_postfix=True) +# return _ls_task + + def _get_ted2_task(): global _ted2_task if _ted2_task: diff --git a/users/gaudino/models/asr/rf/conformer_ctc/model_conformer_ctc.py b/users/gaudino/models/asr/rf/conformer_ctc/model_conformer_ctc.py index f32fc88dd..217d6df49 100644 --- a/users/gaudino/models/asr/rf/conformer_ctc/model_conformer_ctc.py +++ b/users/gaudino/models/asr/rf/conformer_ctc/model_conformer_ctc.py @@ -151,6 +151,7 @@ def __init__( self.mel_normalization = config.typed_value("mel_normalization_ted2", True) self.use_specaugment = config.typed_value("use_specaugment", True) + self.conv_2nd_stride = config.typed_value("conv_2nd_stride", 3) self.in_dim = in_dim self.encoder = ConformerEncoder( @@ -166,7 +167,7 @@ def __init__( ], filter_sizes=[(3, 3), (3, 3), (3, 3)], pool_sizes=[(1, 2)], - strides=[(1, 1), (3, 1), (2, 1)], + strides=[(1, 1), (self.conv_2nd_stride, 1), (2, 1)], ), encoder_layer_opts=enc_conformer_layer_opts, num_layers=num_enc_layers, diff --git a/users/gaudino/models/asr/rf/conformer_rnnt/model_conformer_rnnt.py b/users/gaudino/models/asr/rf/conformer_rnnt/model_conformer_rnnt.py index 8198baeac..b9fc28ec7 100644 --- a/users/gaudino/models/asr/rf/conformer_rnnt/model_conformer_rnnt.py +++ b/users/gaudino/models/asr/rf/conformer_rnnt/model_conformer_rnnt.py @@ -21,7 +21,7 @@ from returnn.tensor import Tensor, Dim, single_step_dim import returnn.frontend as rf from returnn.frontend.tensor_array import TensorArray -from returnn.frontend.encoder.conformer import ConformerEncoder, ConformerConvSubsample +from returnn.frontend.encoder.conformer import ConformerEncoder, ConformerConvSubsampleV2 from i6_experiments.users.gaudino.model_interfaces.supports_label_scorer_torch import ( RFModelWithMakeLabelScorer, @@ -125,6 +125,117 @@ def make_model( ) +class MakeModelV2: + """for import""" + + def __init__( + self, + in_dim: int, + target_dim: int, + *, + eos_label: int = 0, + num_enc_layers: int 
= 12, + ): + self.in_dim = in_dim + self.target_dim = target_dim + self.eos_label = eos_label + self.num_enc_layers = num_enc_layers + + def __call__(self) -> Model: + from returnn.datasets.util.vocabulary import Vocabulary + + in_dim = Dim(name="in", dimension=self.in_dim, kind=Dim.Types.Feature) + target_dim = Dim( + name="target", dimension=self.target_dim, kind=Dim.Types.Feature + ) + target_dim.vocab = Vocabulary.create_vocab_from_labels( + [str(i) for i in range(target_dim.dimension)], eos_label=self.eos_label + ) + + return self.make_model(in_dim, target_dim, num_enc_layers=self.num_enc_layers) + + @classmethod + def make_model( + cls, + in_dim: Dim, + target_dim: Dim, + *, + num_enc_layers: int = 12, + pos_emb_dropout: float = 0.0, + language_model: Optional[Dict[str, Any]] = None, + **extra, + ) -> Model: + """make""" + lm = None + if language_model: + assert isinstance(language_model, dict) + language_model = language_model.copy() + cls_name = language_model.pop("class") + assert cls_name == "TransformerDecoder" + language_model.pop("vocab_dim", None) # will just overwrite + + from i6_experiments.users.gaudino.experiments.rf_conformer_att_2023.librispeech_960.trafo_lm.trafo_lm import ( + trafo_lm, + ) + + lm = trafo_lm.MakeModel(vocab_dim=target_dim, **language_model)() + lm = (lm, functools.partial(trafo_lm.make_label_scorer_torch, model=lm)) + + return Model( + in_dim, + num_enc_layers=num_enc_layers, + enc_model_dim=Dim(name="enc", dimension=512, kind=Dim.Types.Feature), + enc_ff_dim=Dim(name="enc-ff", dimension=2048, kind=Dim.Types.Feature), + enc_att_num_heads=8, + enc_conformer_layer_opts=dict( + conv_norm_opts=dict( + use_mask=True, track_running_stats=False + ), # Changed: track_running_stats=False + self_att=rf.SelfAttention, + self_att_opts=dict( + with_bias=True, # Changed: with_bias=True + # with_linear_pos=False, + # with_pos_bias=False, + # learnable_pos_emb=False, # Changed: learnable_pos_emb=False + # separate_pos_emb_per_head=False, + # 
pos_emb_dropout=pos_emb_dropout, + ), + ff_activation=rf.silu, # Changed: rf.silu + conv_kernel_size=31, # Changed: conv_kernel_size=31 + ), + enc_input_layer=ConformerConvSubsampleV2( + in_dim, + out_dims=[ + Dim(32, name="conv1"), + Dim(64, name="conv2"), + Dim(64, name="conv3"), + Dim(32, name="conv4"), # Changed: Dim(64, name="conv4") + ], + filter_sizes=[(3, 3), (3, 3), (3, 3), (3, 3)], # Changed + activation_times=[False, True, False, True], # Changed + pool_sizes=[(1, 1), (3, 1), (1, 1), (2, 1)], # Changed + strides=[(1, 1), (1, 1), (1, 1), (1, 1)], # Changed + padding="same", # Changed: padding="valid" + pool_padding="valid", # Changed + swap_merge_dim_order=True, # Changed + # Note: uses relu activation by default + ), + enc_use_input_proj_bias=True, # Changed: enc_use_input_proj_bias=True + target_dim=target_dim, + blank_idx=target_dim.dimension, + bos_idx=_get_bos_idx(target_dim), + eos_idx=_get_eos_idx(target_dim), + language_model=lm, + use_i6_models_feat_ext = True, + # feat_ext_opts=dict( + # f_min=60, + # f_max=7600, + # n_fft=400, + # ), + **extra, + ) + + class Predictor(rf.Module): r"""Recurrent neural network transducer (RNN-T) prediction network. 
@@ -378,6 +489,8 @@ def __init__( enc_ff_dim: Dim = Dim(name="enc-ff", dimension=2048), enc_att_num_heads: int = 4, enc_conformer_layer_opts: Optional[Dict[str, Any]] = None, + enc_input_layer: Optional[ConformerConvSubsampleV2] = None, + enc_use_input_proj_bias: bool = False, # enc_key_total_dim: Dim = Dim(name="enc_key_total_dim", dimension=1024), # att_num_heads: Dim = Dim(name="att_num_heads", dimension=1), # att_dropout: float = 0.1, @@ -386,6 +499,8 @@ def __init__( l2: float = 0.0001, language_model: Optional[RFModelWithMakeLabelScorer] = None, joiner_dim: int = 640, + use_i6_models_feat_ext: bool = False, + feat_ext_opts: Optional[Dict[str, Any]] = None, ): super(Model, self).__init__() @@ -394,13 +509,30 @@ def __init__( config = get_global_config(return_empty_if_none=True) self.mel_normalization = config.typed_value("mel_normalization_ted2", True) + self.use_i6_models_feat_ext = use_i6_models_feat_ext + if self.use_i6_models_feat_ext: + from i6_models.primitives.feature_extraction import ( + LogMelFeatureExtractionV1, + LogMelFeatureExtractionV1Config, + ) - self.in_dim = in_dim - self.encoder = ConformerEncoder( - in_dim, - enc_model_dim, - ff_dim=enc_ff_dim, - input_layer=ConformerConvSubsample( + mel_config = LogMelFeatureExtractionV1Config( + sample_rate=16000, + win_size=0.025, + hop_size=0.01, + f_min=60, + f_max=7600, + min_amp=1e-10, + num_filters=80, + center=False, + **(feat_ext_opts or {}), + ) + self.feature_extraction = LogMelFeatureExtractionV1(cfg=mel_config) + + self.feat_ext_opts = feat_ext_opts + + if enc_input_layer is None: + self.enc_input_layer = ConformerConvSubsampleV2( in_dim, out_dims=[ Dim(32, name="conv1"), @@ -410,7 +542,16 @@ def __init__( filter_sizes=[(3, 3), (3, 3), (3, 3)], pool_sizes=[(1, 2)], strides=[(1, 1), (3, 1), (2, 1)], - ), + ) + else: + self.enc_input_layer = enc_input_layer + + self.in_dim = in_dim + self.encoder = ConformerEncoder( + in_dim, + enc_model_dim, + ff_dim=enc_ff_dim, + 
input_layer=self.enc_input_layer, encoder_layer_opts=enc_conformer_layer_opts, num_layers=num_enc_layers, num_heads=enc_att_num_heads, @@ -418,6 +559,17 @@ def __init__( att_dropout=enc_att_dropout, ) + self.enc_use_input_proj_bias = enc_use_input_proj_bias + + if self.enc_use_input_proj_bias: + self.encoder.input_projection = rf.Linear( + self.encoder.input_layer.out_dim + if self.encoder.input_layer + else self.encoder.in_dim, + self.encoder.out_dim, + with_bias=True, + ) + self.target_dim = target_dim self.target_dim_w_blank = target_dim + 1 self.blank_idx = blank_idx @@ -501,14 +653,36 @@ def encode( collected_outputs: Optional[Dict[str, Tensor]] = None, ) -> Tuple[Dict[str, Tensor], Dim]: """encode, and extend the encoder output for things we need in the decoder""" - # log mel filterbank features - source, in_spatial_dim = rf.audio.log_mel_filterbank_from_raw( - source, - in_spatial_dim=in_spatial_dim, - out_dim=self.in_dim, - sampling_rate=16_000, - log_base=math.exp(2.3026), # almost 10.0 but not exactly... - ) + + if self.use_i6_models_feat_ext: + squeezed_features = torch.squeeze(source.raw_tensor) + raw_audio_len = in_spatial_dim.dyn_size_ext.raw_tensor + audio_features, audio_features_len_raw = self.feature_extraction( + squeezed_features, raw_audio_len + ) + audio_features_len = rf.Tensor( + name="audio-features-len", + dims=[source.dims[0]], + raw_tensor=audio_features_len_raw, + dtype="int32", + ) + in_spatial_dim = Dim(None, name="in-spatial-dim", dyn_size_ext=audio_features_len) + source = rf.Tensor( + name="audio-features", + dims=[source.dims[0], in_spatial_dim, self.in_dim], + raw_tensor=audio_features, + dtype="float32", + ) + else: + # log mel filterbank features + source, in_spatial_dim = rf.audio.log_mel_filterbank_from_raw_v2( + source, + in_spatial_dim=in_spatial_dim, + out_dim=self.in_dim, + sampling_rate=16_000, + log_base=math.exp(2.3026), # almost 10.0 but not exactly... 
+ **(self.feat_ext_opts or {}), + ) if self.mel_normalization: ted2_global_mean = rf.Tensor( @@ -701,6 +875,32 @@ def from_scratch_model_def(*, epoch: int, in_dim: Dim, target_dim: Dim) -> Model from_scratch_model_def.batch_size_factor = 160 +def from_scratch_model_def_v2(*, epoch: int, in_dim: Dim, target_dim: Dim) -> Model: + """Function is run within RETURNN.""" + from returnn.config import get_global_config + + in_dim, epoch # noqa + config = get_global_config() # noqa + enc_aux_logits = config.typed_value("aux_loss_layers") + pos_emb_dropout = config.float("pos_emb_dropout", 0.0) + # real input is raw audio, internally it does logmel + in_dim = Dim(name="logmel", dimension=_log_mel_feature_dim, kind=Dim.Types.Feature) + lm_opts = config.typed_value("external_language_model") + return MakeModelV2.make_model( + in_dim, + target_dim, + enc_aux_logits=enc_aux_logits or (), + pos_emb_dropout=pos_emb_dropout, + language_model=lm_opts, + ) + + +from_scratch_model_def_v2: ModelDef[Model] +from_scratch_model_def_v2.behavior_version = 16 +from_scratch_model_def_v2.backend = "torch" +from_scratch_model_def_v2.batch_size_factor = 160 + + def from_scratch_training( *, model: Model, diff --git a/users/gaudino/models/asr/rf/conformer_rnnt/model_recog_rnnt.py b/users/gaudino/models/asr/rf/conformer_rnnt/model_recog_rnnt.py index 7c512166f..f8ec639c1 100644 --- a/users/gaudino/models/asr/rf/conformer_rnnt/model_recog_rnnt.py +++ b/users/gaudino/models/asr/rf/conformer_rnnt/model_recog_rnnt.py @@ -101,6 +101,8 @@ def model_recog( not model.language_model ) # not implemented here. 
use the pure PyTorch search instead + breakpoint() + batch_dims = data.remaining_dims((data_spatial_dim, data.feature_dim)) enc_args, enc_spatial_dim = model.encode(data, in_spatial_dim=data_spatial_dim) beam_size = 12 From 82e79b11286c3d429972419d150817c805efd571 Mon Sep 17 00:00:00 2001 From: Albert Zeyer Date: Wed, 19 Jun 2024 11:21:14 +0200 Subject: [PATCH 209/227] fix warning --- users/zeyer/datasets/librispeech.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/users/zeyer/datasets/librispeech.py b/users/zeyer/datasets/librispeech.py index 2a068c7c0..011bf8a38 100644 --- a/users/zeyer/datasets/librispeech.py +++ b/users/zeyer/datasets/librispeech.py @@ -335,7 +335,7 @@ def get_dataset(self, key: str, *, training: bool = False, subset: Optional[int] assert parts, f"invalid key {key!r}" for part in parts: files += [_get_librispeech_ogg_zip_dict()[part]] - d = { + d: Dict[str, Any] = { "class": "OggZipDataset", "path": files, "use_cache_manager": True, From 8fc236e4991f39c35694c8c3919c408b12e8a246 Mon Sep 17 00:00:00 2001 From: Mohammad Zeineldeen Date: Wed, 19 Jun 2024 14:39:17 +0000 Subject: [PATCH 210/227] fix bug --- .../canary_aed/configs/canary_1b_recog.py | 22 +++++++++---------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/users/zeineldeen/experiments/canary_aed/configs/canary_1b_recog.py b/users/zeineldeen/experiments/canary_aed/configs/canary_1b_recog.py index 28db5beb1..41f025593 100644 --- a/users/zeineldeen/experiments/canary_aed/configs/canary_1b_recog.py +++ b/users/zeineldeen/experiments/canary_aed/configs/canary_1b_recog.py @@ -95,10 +95,11 @@ def py(): tk.register_output(f"canary_1b/huggingface/{test_set}_bs64_greedy/wer", search_job.out_wer) # Run with our beam search - for beam_size in [1, 4, 8]: + for beam_size in [1, 4, 8, 12]: for test_set, split in TEST_DATASETS.items(): if test_set == "gigaspeech": - continue # TODO: need to ask nick to set a reservaion tag to increase time limit + continue + bs_ = 64 
if beam_size <= 4 else 32 search_job = SearchJob( model_id=MODEL_ID, model_path=model_path, @@ -106,16 +107,15 @@ def py(): dataset_name=test_set, split=split, search_script=our_beam_search_script, - search_args={"batch_size": 64, "pnc": False, "max_eval_samples": -1, "beam_size": beam_size}, + search_args={"batch_size": bs_, "pnc": False, "max_eval_samples": -1, "beam_size": beam_size}, python_exe=python_exe, device="gpu", - time_rqmt=0.5, + time_rqmt=24, mem_rqmt=8, - cpu_rqmt=2, + cpu_rqmt=4, ) - search_job.rqmt["sbatch_args"] = ["-p", "gpu_test_24gb"] - search_job.add_alias(f"canary_1b/beam_search_v5/{test_set}_bs64_beam{beam_size}") - tk.register_output( - f"canary_1b/beam_search_v5/{test_set}_bs64_beam{beam_size}/search_out", search_job.out_search_results - ) - tk.register_output(f"canary_1b/beam_search_v5/{test_set}_bs64_beam{beam_size}/wer", search_job.out_wer) + search_job.rqmt["sbatch_args"] = ["-p", "gpu_test_24gb", "-w", "cn-290", "--reservation", "hlt_6"] + name = f"{test_set}_bs{bs_}_beam{beam_size}" + search_job.add_alias(f"canary_1b/beam_search_v5/{name}") + tk.register_output(f"canary_1b/beam_search_v5/{name}/search_out", search_job.out_search_results) + tk.register_output(f"canary_1b/beam_search_v5/{name}/wer", search_job.out_wer) From 3532a025f90c8a31791e3ef3b381823cb7ee3f40 Mon Sep 17 00:00:00 2001 From: Albert Zeyer Date: Wed, 19 Jun 2024 22:05:11 +0200 Subject: [PATCH 211/227] vocab outputs --- users/zeyer/datasets/librispeech.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/users/zeyer/datasets/librispeech.py b/users/zeyer/datasets/librispeech.py index 011bf8a38..d260fa165 100644 --- a/users/zeyer/datasets/librispeech.py +++ b/users/zeyer/datasets/librispeech.py @@ -51,6 +51,7 @@ def _get_bliss_corpus_dict() -> Dict[str, tk.Path]: def _get_corpus_text_dict(key: str) -> tk.Path: job = CorpusToTextDictJob(_get_bliss_corpus_dict()[key], gzip=True) job.add_alias(_alias_prefix + f"{key.replace('-', '_')}_corpus_text_dict") 
+ tk.register_output(_alias_prefix + f"{key.replace('-', '_')}_corpus_text_dict.py.gz", job.out_dictionary) return job.out_dictionary @@ -60,6 +61,7 @@ def _get_train_corpus_text() -> tk.Path: train_corpus_text_dict = _get_corpus_text_dict(key) job = TextDictToTextLinesJob(train_corpus_text_dict, gzip=True) job.add_alias(_alias_prefix + f"{key.replace('-', '_')}_corpus_text_lines") + tk.register_output(_alias_prefix + f"{key.replace('-', '_')}_corpus_text_lines.txt.gz", job.out_text_lines) return job.out_text_lines @@ -85,7 +87,8 @@ def _get_spm_vocab( "eos_id": 0, # default is 2 }, ) - _spm_train_job.add_alias(_alias_prefix + f"vocab/spm_{model_type.name.lower()}_{dim_str}_train") + _spm_train_job.add_alias(_alias_prefix + f"vocab/spm_{model_type.value}_{dim_str}_train") + tk.register_output(_alias_prefix + f"vocab/spm_{model_type.value}_{dim_str}_train.model", _spm_train_job.out_model) spm = SentencePieceModel( dim=dim, model_file=_spm_train_job.out_model, From 4dda58ba9165b5ce6000001d849e4fcd0df9cf61 Mon Sep 17 00:00:00 2001 From: Albert Zeyer Date: Thu, 20 Jun 2024 10:02:48 +0200 Subject: [PATCH 212/227] more --- users/zeyer/experiments/exp2024_04_23_baselines/aed.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/users/zeyer/experiments/exp2024_04_23_baselines/aed.py b/users/zeyer/experiments/exp2024_04_23_baselines/aed.py index c9319cb50..4d83029e8 100644 --- a/users/zeyer/experiments/exp2024_04_23_baselines/aed.py +++ b/users/zeyer/experiments/exp2024_04_23_baselines/aed.py @@ -68,8 +68,9 @@ def py(): }, ) + # Comparing vocabs. for vocab in [ - "spm20k", + "spm20k", # 5.14 (but test-other is 6.18!) 
"bpe10k", # 5.32 "spm10k", # 5.16 "spm_bpe10k", # 5.21 From 44fb0a54c419db31e5769af152e834080313fd8a Mon Sep 17 00:00:00 2001 From: Albert Zeyer Date: Thu, 20 Jun 2024 10:03:52 +0200 Subject: [PATCH 213/227] more, AED featBN, sampling --- .../exp2024_04_23_baselines/aed.py | 59 +++++++++++++++++++ 1 file changed, 59 insertions(+) diff --git a/users/zeyer/experiments/exp2024_04_23_baselines/aed.py b/users/zeyer/experiments/exp2024_04_23_baselines/aed.py index 4d83029e8..9bafab989 100644 --- a/users/zeyer/experiments/exp2024_04_23_baselines/aed.py +++ b/users/zeyer/experiments/exp2024_04_23_baselines/aed.py @@ -14,6 +14,7 @@ import copy import functools from typing import TYPE_CHECKING, Optional, Union, Tuple, Sequence +import numpy import tree from returnn.tensor import Tensor, Dim, single_step_dim @@ -89,6 +90,40 @@ def py(): vocab=vocab, ) + # Comparing vocabs with better settings: feature norm, sampling, no max seq len. + for vocab, alpha in [ + # ("spm20k", 0.7), + ("bpe10k", 0.01), + ("spm10k", 0.7), + # ("spm_bpe10k", ...), # unclear what sampling scheme... + # ("spm4k", 0.7), + # ("spm1k", 0.7), + # ("spm_bpe1k", ...) + ]: + train_exp( # 5.16 + f"v6-bhv20-11gb-f32-bs15k-accgrad1-mgpu4-pavg100-maxSeqLenNone" + f"-wd1e_2-lrlin1e_5_295k-featBN-speedpertV2-{vocab}" + f"-{'spmSample' if vocab.startswith('spm') else 'bpeSample'}{str(alpha).replace('.', '')}", + config_11gb_v6_f32_accgrad1_mgpu4_pavg100_wd1e_4, + model_config={"feature_batch_norm": True}, + config_updates={ + **_get_cfg_lrlin_oclr_by_bs_nep(15_000, 500), + "optimizer.weight_decay": 1e-2, + "__train_audio_preprocess": speed_pert_librosa_config, + "speed_pert_discrete_values": [0.7, 0.8, 0.9, 1.0, 1.1], + "max_seq_length_default_target": None, + }, + vocab=vocab, + train_vocab_opts={ + "other_opts": ( + {"enable_sampling": True, "alpha": alpha} + if vocab.startswith("spm") + else {"class": "SamplingBytePairEncoding", "breadth_prob": alpha} + ) + }, + ) + + # Sampling. 
for vocab, alpha in [ # Testing sampling in SPM. # The lower the alpha, the more aggressive the sampling. @@ -567,6 +602,24 @@ def __init__( for i in enc_aux_logits: setattr(self, f"enc_aux_logits_{i}", rf.Linear(self.encoder.out_dim, wb_target_dim)) + self.feature_batch_norm = None + if config.bool("feature_batch_norm", False): + self.feature_batch_norm = rf.BatchNorm(self.in_dim, affine=False, use_mask=True) + self.feature_norm = config.bool("feature_norm", False) + self.feature_stats = None + feature_stats = config.typed_value("feature_stats") + if feature_stats: + assert isinstance(feature_stats, dict) + self.feature_stats = rf.ParameterList( + { + k: rf.Parameter( + rf.convert_to_tensor(numpy.loadtxt(v), dims=[self.in_dim], dtype=rf.get_default_float_dtype()), + auxiliary=True, + ) + for k, v in feature_stats.items() + } + ) + self._specaugment_opts = { "steps": config.typed_value("specaugment_steps") or (0, 1000, 2000), "max_consecutive_spatial_dims": config.typed_value("specaugment_max_consecutive_spatial_dims") or 20, @@ -596,6 +649,12 @@ def encode( out_dim=self.in_dim, sampling_rate=16_000, ) + if self.feature_batch_norm: + source = self.feature_batch_norm(source) + if self.feature_norm: + source = rf.normalize(source, axis=in_spatial_dim) + if self.feature_stats: + source = (source - self.feature_stats.mean) / self.feature_stats.std_dev if self._mixup: source = self._mixup(source, spatial_dim=in_spatial_dim) # SpecAugment From ebfc2e7f2cc90bc78fc0b07cd215e96abea2ee93 Mon Sep 17 00:00:00 2001 From: Albert Zeyer Date: Thu, 20 Jun 2024 10:16:57 +0200 Subject: [PATCH 214/227] extract SPM vocab --- users/zeyer/datasets/librispeech.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/users/zeyer/datasets/librispeech.py b/users/zeyer/datasets/librispeech.py index d260fa165..fdce05400 100644 --- a/users/zeyer/datasets/librispeech.py +++ b/users/zeyer/datasets/librispeech.py @@ -12,6 +12,7 @@ from i6_core.corpus.convert import CorpusToTextDictJob from 
i6_core.text.convert import TextDictToTextLinesJob from i6_core.text.label.sentencepiece.train import TrainSentencePieceJob, SentencePieceType +from i6_core.text.label.sentencepiece.vocab import ExtractSentencePieceVocabJob from returnn.util.basic import NotSpecified from returnn_common.datasets_old_2022_10.interface import DatasetConfig, VocabConfig from i6_experiments.common.datasets import librispeech @@ -89,6 +90,10 @@ def _get_spm_vocab( ) _spm_train_job.add_alias(_alias_prefix + f"vocab/spm_{model_type.value}_{dim_str}_train") tk.register_output(_alias_prefix + f"vocab/spm_{model_type.value}_{dim_str}_train.model", _spm_train_job.out_model) + tk.register_output( + _alias_prefix + f"vocab/spm_{model_type.value}_{dim_str}_train.vocab", + ExtractSentencePieceVocabJob(_spm_train_job.out_model).out_vocab, + ) spm = SentencePieceModel( dim=dim, model_file=_spm_train_job.out_model, From 8b7dae5c680f4fecf0fe7b5553d08e594ad9825c Mon Sep 17 00:00:00 2001 From: Mohammad Zeineldeen Date: Thu, 20 Jun 2024 11:33:33 +0000 Subject: [PATCH 215/227] add rtfs --- .../experiments/canary_aed/nemo/run_eval.py | 18 ++++++++++++++++-- .../canary_aed/nemo/run_eval_beam_search.py | 19 +++++++++++++++++-- 2 files changed, 33 insertions(+), 4 deletions(-) diff --git a/users/zeineldeen/experiments/canary_aed/nemo/run_eval.py b/users/zeineldeen/experiments/canary_aed/nemo/run_eval.py index f6a62aaf3..eab4d1d32 100644 --- a/users/zeineldeen/experiments/canary_aed/nemo/run_eval.py +++ b/users/zeineldeen/experiments/canary_aed/nemo/run_eval.py @@ -78,6 +78,12 @@ def buffer_audio_and_transcribe( ): buffer = [] results = [] + + model.total_audio_length_in_sec = 0.0 + model.total_recog_time_in_sec = 0.0 + model.total_enc_recog_time_in_sec = 0.0 + model.total_dec_recog_time_in_sec = 0.0 + for sample in tqdm(dataset_iterator(dataset), desc="Evaluating: Sample id", unit="", disable=not verbose): buffer.append(sample) @@ -86,10 +92,10 @@ def buffer_audio_and_transcribe( if pnc is not None: 
transcriptions = model.transcribe( - filepaths, batch_size=batch_size, pnc=False, verbose=False, num_workers=4 + filepaths, batch_size=batch_size, pnc=False, verbose=False, num_workers=2 ) else: - transcriptions = model.transcribe(filepaths, batch_size=batch_size, verbose=False, num_workers=4) + transcriptions = model.transcribe(filepaths, batch_size=batch_size, verbose=False, num_workers=2) # if transcriptions form a tuple (from RNNT), extract just "best" hypothesis if type(transcriptions) == tuple and len(transcriptions) == 2: transcriptions = transcriptions[0] @@ -108,6 +114,14 @@ def buffer_audio_and_transcribe( results = pack_results(results, buffer, transcriptions) buffer.clear() + print(f"Total audio duration: {model.total_audio_length_in_sec:.3f} sec") + print(f"Total recog time: {model.total_recog_time_in_sec:.3f} sec") + print(f"Total enc recog time: {model.total_enc_recog_time_in_sec:.3f} sec") + print(f"Total dec recog time: {model.total_dec_recog_time_in_sec:.3f} sec") + print(f"Overall RTF: {model.total_recog_time_in_sec / model.total_audio_length_in_sec:.3f}") + print(f"Enc RTF: {model.total_enc_recog_time_in_sec / model.total_audio_length_in_sec:.3f}") + print(f"Dec RTF: {model.total_dec_recog_time_in_sec / model.total_audio_length_in_sec:.3f}") + # Delete temp cache dir if os.path.exists(DATA_CACHE_DIR): shutil.rmtree(DATA_CACHE_DIR) diff --git a/users/zeineldeen/experiments/canary_aed/nemo/run_eval_beam_search.py b/users/zeineldeen/experiments/canary_aed/nemo/run_eval_beam_search.py index cc928f526..20996fb0a 100644 --- a/users/zeineldeen/experiments/canary_aed/nemo/run_eval_beam_search.py +++ b/users/zeineldeen/experiments/canary_aed/nemo/run_eval_beam_search.py @@ -225,6 +225,7 @@ def _transcribe_output_processing_our_beam_search( max_seq_len=enc_lens, device=enc_states.device, opts=beam_search_v5_opts, + # debug_out=sys.stdout, ) # [B,Beam,L] best_hyps = [] @@ -240,6 +241,12 @@ def buffer_audio_and_transcribe( ): buffer = [] results = [] + + 
model.total_audio_length_in_sec = 0.0 + model.total_recog_time_in_sec = 0.0 + model.total_enc_recog_time_in_sec = 0.0 + model.total_dec_recog_time_in_sec = 0.0 + for sample in tqdm(dataset_iterator(dataset), desc="Evaluating: Sample id", unit="", disable=not verbose): buffer.append(sample) @@ -248,10 +255,10 @@ def buffer_audio_and_transcribe( if pnc is not None: transcriptions = model.transcribe( - filepaths, batch_size=batch_size, pnc=False, verbose=False, num_workers=4 + filepaths, batch_size=batch_size, pnc=False, verbose=False, num_workers=2 ) else: - transcriptions = model.transcribe(filepaths, batch_size=batch_size, verbose=False, num_workers=4) + transcriptions = model.transcribe(filepaths, batch_size=batch_size, verbose=False, num_workers=2) # if transcriptions form a tuple (from RNNT), extract just "best" hypothesis if type(transcriptions) == tuple and len(transcriptions) == 2: @@ -271,6 +278,14 @@ def buffer_audio_and_transcribe( results = pack_results(results, buffer, transcriptions) buffer.clear() + print(f"Total audio duration: {model.total_audio_length_in_sec:.3f} sec") + print(f"Total recog time: {model.total_recog_time_in_sec:.3f} sec") + print(f"Total enc recog time: {model.total_enc_recog_time_in_sec:.3f} sec") + print(f"Total dec recog time: {model.total_dec_recog_time_in_sec:.3f} sec") + print(f"Overall RTF: {model.total_recog_time_in_sec / model.total_audio_length_in_sec:.3f}") + print(f"Enc RTF: {model.total_enc_recog_time_in_sec / model.total_audio_length_in_sec:.3f}") + print(f"Dec RTF: {model.total_dec_recog_time_in_sec / model.total_audio_length_in_sec:.3f}") + # Delete temp cache dir if os.path.exists(DATA_CACHE_DIR): shutil.rmtree(DATA_CACHE_DIR) From a79f06a710ec49e155e93a8799c3725a7b9e3cfd Mon Sep 17 00:00:00 2001 From: Mohammad Zeineldeen Date: Thu, 20 Jun 2024 11:38:44 +0000 Subject: [PATCH 216/227] add cache suffix --- .../canary_aed/configs/canary_1b_recog.py | 105 +++++++++--------- .../experiments/canary_aed/nemo/run_eval.py | 4 
+ .../canary_aed/nemo/run_eval_beam_search.py | 6 + .../experiments/canary_aed/nemo/search.py | 5 + 4 files changed, 69 insertions(+), 51 deletions(-) diff --git a/users/zeineldeen/experiments/canary_aed/configs/canary_1b_recog.py b/users/zeineldeen/experiments/canary_aed/configs/canary_1b_recog.py index 41f025593..0f4f0d64d 100644 --- a/users/zeineldeen/experiments/canary_aed/configs/canary_1b_recog.py +++ b/users/zeineldeen/experiments/canary_aed/configs/canary_1b_recog.py @@ -49,15 +49,6 @@ def py(): dataset_paths = download_test_datasets() model_path = download_canary_1b_model() - huggface_search_script = tk.Path( - "/u/zeineldeen/setups/ubuntu_22_setups/2024-06-07--canary-aed/recipe/i6_experiments/users/zeineldeen/experiments/canary_aed/nemo/run_eval.py", - hash_overwrite="run_eval_v1", - ) - our_beam_search_script = tk.Path( - "/u/zeineldeen/setups/ubuntu_22_setups/2024-06-07--canary-aed/recipe/i6_experiments/users/zeineldeen/experiments/canary_aed/nemo/run_eval_beam_search.py", - hash_overwrite="run_eval_v2", - ) - # to run canary model, this env has installed nemo toolkit with: # pip3 install git+https://github.com/NVIDIA/NeMo.git@r2.0.0rc0#egg=nemo_toolkit[all] # related issue: https://github.com/huggingface/open_asr_leaderboard/issues/26 @@ -74,48 +65,60 @@ def py(): # earnings22 | 12.23 | 12.25 # gigaspeech | 10.14 | 10.19 - for test_set, split in TEST_DATASETS.items(): - search_job = SearchJob( - model_id=MODEL_ID, - model_path=model_path, - dataset_path=dataset_paths[test_set], - dataset_name=test_set, - split=split, - search_script=huggface_search_script, - search_args={"batch_size": 64, "pnc": False, "max_eval_samples": -1}, - python_exe=python_exe, - device="gpu", - time_rqmt=24, - mem_rqmt=8, - cpu_rqmt=2, + for run in range(1): + huggface_search_script = tk.Path( + "/u/zeineldeen/setups/ubuntu_22_setups/2024-06-07--canary-aed/recipe/i6_experiments/users/zeineldeen/experiments/canary_aed/nemo/run_eval.py", + 
hash_overwrite=f"run_eval_v1_rtf_run{run}", ) - search_job.rqmt["sbatch_args"] = ["-p", "gpu_24gb"] - search_job.add_alias(f"canary_1b/huggingface/{test_set}_bs64_greedy") - tk.register_output(f"canary_1b/huggingface/{test_set}_bs64_greedy/search_out", search_job.out_search_results) - tk.register_output(f"canary_1b/huggingface/{test_set}_bs64_greedy/wer", search_job.out_wer) + for test_set, split in TEST_DATASETS.items(): + for bs in [64]: # [16, 32, 64]: + name = f"{test_set}_bs{bs}_greedy_run{run}" + search_job = SearchJob( + model_id=MODEL_ID, + model_path=model_path, + dataset_path=dataset_paths[test_set], + dataset_name=test_set, + cache_dir_name_suffix=name, + split=split, + search_script=huggface_search_script, + search_args={"batch_size": bs, "pnc": False, "max_eval_samples": -1}, + python_exe=python_exe, + device="gpu", + time_rqmt=24, + mem_rqmt=8, + cpu_rqmt=4, + ) + search_job.rqmt["sbatch_args"] = ["-p", "gpu_test_24gb", "-w", "cn-290", "--reservation", "hlt_6"] + search_job.add_alias(f"canary_1b/huggingface/{name}") + tk.register_output(f"canary_1b/huggingface/{name}/search_out", search_job.out_search_results) + tk.register_output(f"canary_1b/huggingface/{name}/wer", search_job.out_wer) # Run with our beam search - for beam_size in [1, 4, 8, 12]: - for test_set, split in TEST_DATASETS.items(): - if test_set == "gigaspeech": - continue - bs_ = 64 if beam_size <= 4 else 32 - search_job = SearchJob( - model_id=MODEL_ID, - model_path=model_path, - dataset_path=dataset_paths[test_set], - dataset_name=test_set, - split=split, - search_script=our_beam_search_script, - search_args={"batch_size": bs_, "pnc": False, "max_eval_samples": -1, "beam_size": beam_size}, - python_exe=python_exe, - device="gpu", - time_rqmt=24, - mem_rqmt=8, - cpu_rqmt=4, - ) - search_job.rqmt["sbatch_args"] = ["-p", "gpu_test_24gb", "-w", "cn-290", "--reservation", "hlt_6"] - name = f"{test_set}_bs{bs_}_beam{beam_size}" - search_job.add_alias(f"canary_1b/beam_search_v5/{name}") - 
tk.register_output(f"canary_1b/beam_search_v5/{name}/search_out", search_job.out_search_results) - tk.register_output(f"canary_1b/beam_search_v5/{name}/wer", search_job.out_wer) + for run in range(1): + our_beam_search_script = tk.Path( + "/u/zeineldeen/setups/ubuntu_22_setups/2024-06-07--canary-aed/recipe/i6_experiments/users/zeineldeen/experiments/canary_aed/nemo/run_eval_beam_search.py", + hash_overwrite=f"run_eval_v2_rtf_run{run}", + ) + for beam_size in [1, 4]: + for bs_ in [16, 32]: + for test_set, split in TEST_DATASETS.items(): + name = f"{test_set}_bs{bs_}_beam{beam_size}_run{run}" + search_job = SearchJob( + model_id=MODEL_ID, + model_path=model_path, + dataset_path=dataset_paths[test_set], + dataset_name=test_set, + cache_dir_name_suffix=name, + split=split, + search_script=our_beam_search_script, + search_args={"batch_size": bs_, "pnc": False, "max_eval_samples": -1, "beam_size": beam_size}, + python_exe=python_exe, + device="gpu", + time_rqmt=24, + mem_rqmt=8, + cpu_rqmt=4, + ) + search_job.rqmt["sbatch_args"] = ["-p", "gpu_test_24gb", "-w", "cn-290", "--reservation", "hlt_6"] + search_job.add_alias(f"canary_1b/beam_search_v5/{name}") + tk.register_output(f"canary_1b/beam_search_v5/{name}/search_out", search_job.out_search_results) + tk.register_output(f"canary_1b/beam_search_v5/{name}/wer", search_job.out_wer) diff --git a/users/zeineldeen/experiments/canary_aed/nemo/run_eval.py b/users/zeineldeen/experiments/canary_aed/nemo/run_eval.py index eab4d1d32..ce379c02c 100644 --- a/users/zeineldeen/experiments/canary_aed/nemo/run_eval.py +++ b/users/zeineldeen/experiments/canary_aed/nemo/run_eval.py @@ -155,6 +155,8 @@ def main(args): f"{args.model_id.replace('/', '-')}-{args.dataset_path.replace('/', '')}-" f"{args.dataset.replace('/', '-')}-{args.split}" ) + if args.cache_dir_name_suffix: + cache_prefix += f"_{args.cache_dir_name_suffix}" results = buffer_audio_and_transcribe(asr_model, dataset, args.batch_size, args.pnc, cache_prefix, verbose=True) for 
sample in results: predictions.append(data_utils.normalizer(sample["pred_text"])) @@ -193,6 +195,8 @@ def main(args): parser.add_argument("--dataset", type=str, required=True, help="Dataset name.") parser.add_argument("--split", type=str, required=True, help="Dataset split.") + parser.add_argument("--cache_dir_name_suffix", type=str, default=None, help="Cache dir name suffix.") + parser.add_argument("--manifest_path", type=str, required=True, help="Path to save the search output.") parser.add_argument("--wer_out_path", type=str, default=None, help="Path to save the WER output.") diff --git a/users/zeineldeen/experiments/canary_aed/nemo/run_eval_beam_search.py b/users/zeineldeen/experiments/canary_aed/nemo/run_eval_beam_search.py index 20996fb0a..0aff98d42 100644 --- a/users/zeineldeen/experiments/canary_aed/nemo/run_eval_beam_search.py +++ b/users/zeineldeen/experiments/canary_aed/nemo/run_eval_beam_search.py @@ -337,6 +337,8 @@ def main(args): f"{args.model_id.replace('/', '-')}-{args.dataset_path.replace('/', '')}-" f"{args.dataset.replace('/', '-')}-{args.split}" ) + if args.cache_dir_name_suffix: + cache_prefix += f"_{args.cache_dir_name_suffix}" results = buffer_audio_and_transcribe(asr_model, dataset, args.batch_size, args.pnc, cache_prefix, verbose=True) for sample in results: predictions.append(data_utils.normalizer(sample["pred_text"])) @@ -375,6 +377,10 @@ def main(args): parser.add_argument("--dataset", type=str, required=True, help="Dataset name.") parser.add_argument("--split", type=str, required=True, help="Dataset split.") + parser.add_argument("--cache_dir_name_suffix", type=str, default=None, help="Cache dir name suffix.") + + parser.add_argument("--cache_dir_name_suffix", type=str, required=True, help="Cache dir name suffix.") + parser.add_argument("--manifest_path", type=str, required=True, help="Path to save the search output.") parser.add_argument("--wer_out_path", type=str, default=None, help="Path to save the WER output.") diff --git 
a/users/zeineldeen/experiments/canary_aed/nemo/search.py b/users/zeineldeen/experiments/canary_aed/nemo/search.py index 75c9cd98f..09d57454f 100644 --- a/users/zeineldeen/experiments/canary_aed/nemo/search.py +++ b/users/zeineldeen/experiments/canary_aed/nemo/search.py @@ -14,6 +14,7 @@ def __init__( model_path: tk.Path, dataset_path: tk.Path, dataset_name: str, + cache_dir_name_suffix: str, split: str, search_script: tk.Path, search_args: Optional[Dict[str, Any]] = None, @@ -27,6 +28,7 @@ def __init__( self.model_path = model_path self.dataset_path = dataset_path self.dataset_name = dataset_name + self.cache_dir_name_suffix = cache_dir_name_suffix self.split = split self.search_script = search_script self.search_args = search_args if search_args is not None else {} @@ -58,6 +60,8 @@ def get_cmd(self): self.dataset_path.get_path(), "--dataset", self.dataset_name, + "--cache_dir_name_suffix", + self.cache_dir_name_suffix, "--split", self.split, "--manifest_path", @@ -90,6 +94,7 @@ def hash(cls, kwargs): "model_path": kwargs["model_path"], "dataset_path": kwargs["dataset_path"], "dataset_name": kwargs["dataset_name"], + "cache_dir_name_suffix": kwargs["cache_dir_name_suffix"], "split": kwargs["split"], "search_script": kwargs["search_script"], "search_args": kwargs["search_args"], From 801e5060c501da72d88bc6d79327963fcdb79cee Mon Sep 17 00:00:00 2001 From: Mohammad Zeineldeen Date: Thu, 20 Jun 2024 11:42:27 +0000 Subject: [PATCH 217/227] update --- .../experiments/canary_aed/configs/canary_1b_recog.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/users/zeineldeen/experiments/canary_aed/configs/canary_1b_recog.py b/users/zeineldeen/experiments/canary_aed/configs/canary_1b_recog.py index 0f4f0d64d..6a2325a50 100644 --- a/users/zeineldeen/experiments/canary_aed/configs/canary_1b_recog.py +++ b/users/zeineldeen/experiments/canary_aed/configs/canary_1b_recog.py @@ -71,7 +71,7 @@ def py(): hash_overwrite=f"run_eval_v1_rtf_run{run}", ) for test_set, 
split in TEST_DATASETS.items(): - for bs in [64]: # [16, 32, 64]: + for bs in [16, 32, 64]: name = f"{test_set}_bs{bs}_greedy_run{run}" search_job = SearchJob( model_id=MODEL_ID, From 37a681d2ae9a44671acaa48acd94e0f9dda26846 Mon Sep 17 00:00:00 2001 From: Mohammad Zeineldeen Date: Thu, 20 Jun 2024 11:45:45 +0000 Subject: [PATCH 218/227] fix --- .../experiments/canary_aed/nemo/run_eval_beam_search.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/users/zeineldeen/experiments/canary_aed/nemo/run_eval_beam_search.py b/users/zeineldeen/experiments/canary_aed/nemo/run_eval_beam_search.py index 0aff98d42..0246be0c6 100644 --- a/users/zeineldeen/experiments/canary_aed/nemo/run_eval_beam_search.py +++ b/users/zeineldeen/experiments/canary_aed/nemo/run_eval_beam_search.py @@ -379,8 +379,6 @@ def main(args): parser.add_argument("--cache_dir_name_suffix", type=str, default=None, help="Cache dir name suffix.") - parser.add_argument("--cache_dir_name_suffix", type=str, required=True, help="Cache dir name suffix.") - parser.add_argument("--manifest_path", type=str, required=True, help="Path to save the search output.") parser.add_argument("--wer_out_path", type=str, default=None, help="Path to save the WER output.") From e38138014abcebc746f8129ee0b4e303edace91f Mon Sep 17 00:00:00 2001 From: Mohammad Zeineldeen Date: Thu, 20 Jun 2024 11:46:15 +0000 Subject: [PATCH 219/227] add debug out --- .../experiments/canary_aed/nemo/run_eval_beam_search.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/users/zeineldeen/experiments/canary_aed/nemo/run_eval_beam_search.py b/users/zeineldeen/experiments/canary_aed/nemo/run_eval_beam_search.py index 0246be0c6..acd49d0ad 100644 --- a/users/zeineldeen/experiments/canary_aed/nemo/run_eval_beam_search.py +++ b/users/zeineldeen/experiments/canary_aed/nemo/run_eval_beam_search.py @@ -225,7 +225,7 @@ def _transcribe_output_processing_our_beam_search( max_seq_len=enc_lens, device=enc_states.device, opts=beam_search_v5_opts, - # 
debug_out=sys.stdout, + debug_out=sys.stdout, ) # [B,Beam,L] best_hyps = [] From 352299ea0b0743fadfb91fae2ea98d72a2ce3a22 Mon Sep 17 00:00:00 2001 From: Mohammad Zeineldeen Date: Thu, 20 Jun 2024 14:13:45 +0000 Subject: [PATCH 220/227] add batch size logging --- .../canary_aed/configs/canary_1b_recog.py | 69 +++++++++++++------ .../experiments/canary_aed/nemo/run_eval.py | 1 + .../canary_aed/nemo/run_eval_beam_search.py | 1 + 3 files changed, 49 insertions(+), 22 deletions(-) diff --git a/users/zeineldeen/experiments/canary_aed/configs/canary_1b_recog.py b/users/zeineldeen/experiments/canary_aed/configs/canary_1b_recog.py index 6a2325a50..8c658458f 100644 --- a/users/zeineldeen/experiments/canary_aed/configs/canary_1b_recog.py +++ b/users/zeineldeen/experiments/canary_aed/configs/canary_1b_recog.py @@ -100,25 +100,50 @@ def py(): hash_overwrite=f"run_eval_v2_rtf_run{run}", ) for beam_size in [1, 4]: - for bs_ in [16, 32]: - for test_set, split in TEST_DATASETS.items(): - name = f"{test_set}_bs{bs_}_beam{beam_size}_run{run}" - search_job = SearchJob( - model_id=MODEL_ID, - model_path=model_path, - dataset_path=dataset_paths[test_set], - dataset_name=test_set, - cache_dir_name_suffix=name, - split=split, - search_script=our_beam_search_script, - search_args={"batch_size": bs_, "pnc": False, "max_eval_samples": -1, "beam_size": beam_size}, - python_exe=python_exe, - device="gpu", - time_rqmt=24, - mem_rqmt=8, - cpu_rqmt=4, - ) - search_job.rqmt["sbatch_args"] = ["-p", "gpu_test_24gb", "-w", "cn-290", "--reservation", "hlt_6"] - search_job.add_alias(f"canary_1b/beam_search_v5/{name}") - tk.register_output(f"canary_1b/beam_search_v5/{name}/search_out", search_job.out_search_results) - tk.register_output(f"canary_1b/beam_search_v5/{name}/wer", search_job.out_wer) + for bs in [16, 32]: + for thre_pruning in [0.0]: + for adaptive_prune in [False]: + if beam_size == 1 and (thre_pruning != 0.0 or adaptive_prune is True): + continue + for test_set, split in 
TEST_DATASETS.items(): + name = f"{test_set}_bs{bs}_beam{beam_size}_run{run}" + search_args = { + "batch_size": bs, + "pnc": False, + "max_eval_samples": -1, + "beam_size": beam_size, + } + if adaptive_prune: + search_args["adaptive_pruning"] = True + name += "adaptivePrune" + if thre_pruning: + search_args["threshold_pruning"] = thre_pruning + name += f"threPruning{thre_pruning}" + search_job = SearchJob( + model_id=MODEL_ID, + model_path=model_path, + dataset_path=dataset_paths[test_set], + dataset_name=test_set, + cache_dir_name_suffix=name, + split=split, + search_script=our_beam_search_script, + search_args=search_args, + python_exe=python_exe, + device="gpu", + time_rqmt=24, + mem_rqmt=8, + cpu_rqmt=4, + ) + search_job.rqmt["sbatch_args"] = [ + "-p", + "gpu_test_24gb", + "-w", + "cn-290", + "--reservation", + "hlt_6", + ] + search_job.add_alias(f"canary_1b/beam_search_v5/{name}") + tk.register_output( + f"canary_1b/beam_search_v5/{name}/search_out", search_job.out_search_results + ) + tk.register_output(f"canary_1b/beam_search_v5/{name}/wer", search_job.out_wer) diff --git a/users/zeineldeen/experiments/canary_aed/nemo/run_eval.py b/users/zeineldeen/experiments/canary_aed/nemo/run_eval.py index ce379c02c..4ad20b795 100644 --- a/users/zeineldeen/experiments/canary_aed/nemo/run_eval.py +++ b/users/zeineldeen/experiments/canary_aed/nemo/run_eval.py @@ -157,6 +157,7 @@ def main(args): ) if args.cache_dir_name_suffix: cache_prefix += f"_{args.cache_dir_name_suffix}" + print(f"Using batch size: {args.batch_size}") results = buffer_audio_and_transcribe(asr_model, dataset, args.batch_size, args.pnc, cache_prefix, verbose=True) for sample in results: predictions.append(data_utils.normalizer(sample["pred_text"])) diff --git a/users/zeineldeen/experiments/canary_aed/nemo/run_eval_beam_search.py b/users/zeineldeen/experiments/canary_aed/nemo/run_eval_beam_search.py index acd49d0ad..46c30e22a 100644 --- 
a/users/zeineldeen/experiments/canary_aed/nemo/run_eval_beam_search.py +++ b/users/zeineldeen/experiments/canary_aed/nemo/run_eval_beam_search.py @@ -339,6 +339,7 @@ def main(args): ) if args.cache_dir_name_suffix: cache_prefix += f"_{args.cache_dir_name_suffix}" + print(f"Using batch size: {args.batch_size}") results = buffer_audio_and_transcribe(asr_model, dataset, args.batch_size, args.pnc, cache_prefix, verbose=True) for sample in results: predictions.append(data_utils.normalizer(sample["pred_text"])) From c42d139015ee4246dfc476789b2104a8ee9fa101 Mon Sep 17 00:00:00 2001 From: "luca.gaudino" Date: Thu, 20 Jun 2024 16:34:40 +0200 Subject: [PATCH 221/227] import i6_models conformer in rf, batch 1 --- .../librispeech_960/_import_model_nick.py | 71 +++-- .../_test_returnn_torch_mhsa.py | 72 +++-- .../librispeech_960/conformer_ctc_train.py | 24 +- .../conformer_diff_i6_models_vs_rf.md | 85 ++++++ .../rf_robin_rnnt_2024/__init__.py | 0 .../output/recog_config.config | 136 +++++++++ .../ReturnnForwardJobV2/work/rnn.sh | 2 + .../output/train_config.config | 263 ++++++++++++++++++ .../ReturnnTrainingJob/work/rnn.sh | 2 + .../luca_example_transducer/__init__.py | 0 .../transducer_model_luca.py | 234 ++++++++++++++++ .../rf/conformer_rnnt/model_conformer_rnnt.py | 7 +- 12 files changed, 845 insertions(+), 51 deletions(-) create mode 100644 users/gaudino/experiments/rf_conformer_rnnt_2024/librispeech_960/conformer_diff_i6_models_vs_rf.md create mode 100644 users/gaudino/experiments/rf_robin_rnnt_2024/__init__.py create mode 100644 users/gaudino/experiments/rf_robin_rnnt_2024/luca_example_transducer/ReturnnForwardJobV2/output/recog_config.config create mode 100755 users/gaudino/experiments/rf_robin_rnnt_2024/luca_example_transducer/ReturnnForwardJobV2/work/rnn.sh create mode 100644 users/gaudino/experiments/rf_robin_rnnt_2024/luca_example_transducer/ReturnnTrainingJob/output/train_config.config create mode 100755 
users/gaudino/experiments/rf_robin_rnnt_2024/luca_example_transducer/ReturnnTrainingJob/work/rnn.sh create mode 100644 users/gaudino/experiments/rf_robin_rnnt_2024/luca_example_transducer/__init__.py create mode 100644 users/gaudino/experiments/rf_robin_rnnt_2024/luca_example_transducer/transducer_model_luca.py diff --git a/users/gaudino/experiments/rf_conformer_rnnt_2024/librispeech_960/_import_model_nick.py b/users/gaudino/experiments/rf_conformer_rnnt_2024/librispeech_960/_import_model_nick.py index d944d6c95..fb589c306 100644 --- a/users/gaudino/experiments/rf_conformer_rnnt_2024/librispeech_960/_import_model_nick.py +++ b/users/gaudino/experiments/rf_conformer_rnnt_2024/librispeech_960/_import_model_nick.py @@ -98,7 +98,7 @@ def convert_checkpoint( assert isinstance(name, str) assert isinstance(param, rf.Parameter) - value = map_param_func(ckpt, name, param, param_mapping) + value = map_param_func(ckpt, name, param, param_mapping, model) assert isinstance(value, numpy.ndarray) # noinspection PyProtectedMember param._raw_backend.set_parameter_initial_value(param, value) @@ -119,7 +119,7 @@ def convert_checkpoint( if save_model: os.makedirs(out_dir, exist_ok=True) - filename = out_dir + "/" + ckpt_name # + ".pt" + filename = out_dir + "/" + ckpt_name # + ".pt" print(f"*** saving PyTorch model checkpoint: {filename}") torch.save( {"model": pt_model.state_dict(), "epoch": epoch, "step": step}, filename @@ -158,10 +158,11 @@ def convert_checkpoint( f"encoder.layers.{layer_idx}.conv_block.positionwise_conv1.weight" ) _transpose_list.append( - f"encoder.layers.{layer_idx}.self_att.qkv.weight" + f"encoder.layers.{layer_idx}.conv_block.positionwise_conv2.weight" ) + def _add_params_conformer(param_mapping: Dict[str, str], prefix: str): # rf -> pt # frontend @@ -228,10 +229,10 @@ def _add_params_conformer(param_mapping: Dict[str, str], prefix: str): param_mapping[ prefix + f"encoder.layers.{layer_idx}.conv_block.positionwise_conv2.bias" ] = (orig_name_prefix + 
"conv.pointwise_conv2.bias") - param_mapping[prefix + f"encoder.layers.{layer_idx}.conv_block.norm.gamma"] = ( + param_mapping[prefix + f"encoder.layers.{layer_idx}.conv_block.norm.scale"] = ( orig_name_prefix + "conv.norm.weight" ) - param_mapping[prefix + f"encoder.layers.{layer_idx}.conv_block.norm.beta"] = ( + param_mapping[prefix + f"encoder.layers.{layer_idx}.conv_block.norm.bias"] = ( orig_name_prefix + "conv.norm.bias" ) param_mapping[prefix + f"encoder.layers.{layer_idx}.conv_layer_norm.scale"] = ( @@ -241,15 +242,15 @@ def _add_params_conformer(param_mapping: Dict[str, str], prefix: str): orig_name_prefix + "conv.layer_norm.bias" ) # self-att - param_mapping[prefix + f"encoder.layers.{layer_idx}.self_att.qkv.weight"] = ( - orig_name_prefix + "mhsa.mhsa.in_proj_weight" - ) - param_mapping[prefix + f"encoder.layers.{layer_idx}.self_att.qkv.bias"] = ( - orig_name_prefix + "mhsa.mhsa.in_proj_bias" - ) - param_mapping[prefix + f"encoder.layers.{layer_idx}.self_att.proj.weight"] = ( - orig_name_prefix + "mhsa.mhsa.out_proj.weight" - ) + # param_mapping[prefix + f"encoder.layers.{layer_idx}.self_att.qkv.weight"] = ( + # orig_name_prefix + "mhsa.mhsa.in_proj_weight" + # ) + # param_mapping[prefix + f"encoder.layers.{layer_idx}.self_att.qkv.bias"] = ( + # orig_name_prefix + "mhsa.mhsa.in_proj_bias" + # ) + # param_mapping[prefix + f"encoder.layers.{layer_idx}.self_att.proj.weight"] = ( + # orig_name_prefix + "mhsa.mhsa.out_proj.weight" + # ) param_mapping[prefix + f"encoder.layers.{layer_idx}.self_att.proj.bias"] = ( orig_name_prefix + "mhsa.mhsa.out_proj.bias" ) @@ -308,7 +309,7 @@ def _add_params_predictor_joiner(param_mapping: Dict[str, str]): def map_param_func( - ckpt, name: str, var: rf.Parameter, param_mapping: Dict[str, str] + ckpt, name: str, var: rf.Parameter, param_mapping: Dict[str, str], model: rf.Module ) -> numpy.ndarray: """map params, TF to RF""" from i6_experiments.users.zeyer.returnn.convert.params import ( @@ -337,13 +338,49 @@ def 
map_param_func( ), f"new param {name} {var.dtype} vs ckpt param {var_name} {value.dtype}" return value + layer_idx = int(name.split(".")[2]) + num_heads = model.encoder.layers[layer_idx].self_att.num_heads.dimension + self_att_dim = model.encoder.layers[layer_idx].self_att.out_dim.dimension + + if name.endswith(".self_att.qkv.weight"): + value = ckpt["model"][ + f"conformer.module_list.{layer_idx}.mhsa.mhsa.in_proj_weight" + ] + # from rf to torch + # value = ( + # value.reshape(self_att_dim, num_heads, 3, self_att_dim // num_heads) + # .permute(2, 1, 3, 0) + # .reshape(-1, self_att_dim) + # ) + value = value.reshape(3, num_heads, self_att_dim // num_heads, self_att_dim).permute(3, 1, 0, 2).reshape(self_att_dim, -1) + assert value.shape == var.batch_shape, name + f" {value.shape} vs {var.batch_shape}" + return value.numpy() + + if name.endswith(".self_att.qkv.bias"): + value = ckpt["model"][ + f"conformer.module_list.{layer_idx}.mhsa.mhsa.in_proj_bias" + ] + # value = value.reshape(num_heads, 3, self_att_dim // num_heads).permute(1, 0, 2).reshape(-1) + value = value.reshape(3, num_heads, self_att_dim // num_heads).permute(1, 0, 2).reshape(-1) + assert value.shape == var.batch_shape, name + f" {value.shape} vs {var.batch_shape}" + return value.numpy() + + if name.endswith(".self_att.proj.weight"): + value = ckpt["model"][ + f"conformer.module_list.{layer_idx}.mhsa.mhsa.out_proj.weight" + ] + # value = value.reshape(num_heads, self_att_dim // num_heads, self_att_dim).permute(2, 0, 1).reshape(-1, self_att_dim) + value = value.reshape(self_att_dim, num_heads, self_att_dim // num_heads).permute(1, 2, 0).reshape(self_att_dim, -1) + assert value.shape == var.batch_shape, name + f" {value.shape} vs {var.batch_shape}" + return value.numpy() + # if name == "s.ff_weight": # value = reader.get_tensor("output/rec/s/rec/lstm_cell/kernel") # value = convert_params_np.convert_tf_lstm_to_native_lstm_ff(value) # assert value.shape == var.batch_shape, name # assert value.dtype.name 
== var.dtype, name # return value - # + #0 # if name == "s.rec_weight": # value = reader.get_tensor("output/rec/s/rec/lstm_cell/kernel") # value = convert_params_np.convert_tf_lstm_to_native_lstm_rec(value) @@ -381,6 +418,6 @@ def map_param_func( convert_checkpoint( ckpt_path=_nick_pure_torch_rnnt_ckpt_path, print_params=True, - out_dir="/work/asr3/zeineldeen/hiwis/luca.gaudino/setups-data/2023-08-10--rf-librispeech/work/i6_experiments/users/gaudino/returnn/convert_ckpt_rf/librispeech/rnnt_nick_240614", + out_dir="/work/asr3/zeineldeen/hiwis/luca.gaudino/setups-data/2023-08-10--rf-librispeech/work/i6_experiments/users/gaudino/returnn/convert_ckpt_rf/librispeech/rnnt_nick_240619", save_model=True, ) diff --git a/users/gaudino/experiments/rf_conformer_rnnt_2024/librispeech_960/_test_returnn_torch_mhsa.py b/users/gaudino/experiments/rf_conformer_rnnt_2024/librispeech_960/_test_returnn_torch_mhsa.py index 11b545e0d..62c92a83e 100644 --- a/users/gaudino/experiments/rf_conformer_rnnt_2024/librispeech_960/_test_returnn_torch_mhsa.py +++ b/users/gaudino/experiments/rf_conformer_rnnt_2024/librispeech_960/_test_returnn_torch_mhsa.py @@ -25,7 +25,7 @@ rf_input = rf.random(dims=[Dim(1), spatial_dim, out_dim], **random_opts) -qkv_weight = rf.random(dims=[out_dim, 3*out_dim], **random_opts) +qkv_weight = rf.random(dims=[out_dim, 3 * out_dim], **random_opts) # qkv_weight = rf.full(dims=[out_dim, 3*out_dim], fill_value=0.0, dtype="float32") # qkv_weight.raw_tensor[:,1024:] = 0.0 # qkv_weight.raw_tensor[:,0:1023] = 0.0 @@ -37,7 +37,7 @@ # # qkv_weight.raw_tensor = qkv_raw -qkv_bias = rf.random(dims=[3*out_dim], **random_opts) +qkv_bias = rf.random(dims=[3 * out_dim], **random_opts) # qkv_bias = rf.full(dims=[3*out_dim], fill_value=0.0, dtype="float32") # qkv_bias.raw_tensor[1024:] = 0.0 @@ -51,31 +51,59 @@ proj_dim=out_dim, key_dim_total=out_dim, value_dim_total=out_dim, - num_heads=1, + num_heads=8, att_dropout=0.1, ) 
-rf_mhsa.qkv.weight._raw_backend.set_parameter_initial_value(rf_mhsa.qkv.weight, qkv_weight) +rf_mhsa.qkv.weight._raw_backend.set_parameter_initial_value( + rf_mhsa.qkv.weight, qkv_weight +) rf_mhsa.qkv.bias._raw_backend.set_parameter_initial_value(rf_mhsa.qkv.bias, qkv_bias) -rf_mhsa.proj.weight._raw_backend.set_parameter_initial_value(rf_mhsa.proj.weight, proj_weight.raw_tensor) +rf_mhsa.proj.weight._raw_backend.set_parameter_initial_value( + rf_mhsa.proj.weight, proj_weight.raw_tensor +) rf_mhsa.proj.bias._raw_backend.set_parameter_initial_value(rf_mhsa.proj.bias, proj_bias) torch_mhsa = torch.nn.MultiheadAttention( - 512, 1, dropout=0.1, batch_first=True, - ) + 512, + 8, + dropout=0.1, + batch_first=True, +) state_dict = torch_mhsa.state_dict() -state_dict['in_proj_weight'] = qkv_weight.raw_tensor.T -state_dict['in_proj_bias'] = qkv_bias.raw_tensor -state_dict['out_proj.weight'] = proj_weight.raw_tensor -state_dict['out_proj.bias'] = proj_bias.raw_tensor + +num_heads = 8 + +state_dict["in_proj_weight"] = ( + qkv_weight.raw_tensor.reshape( + out_dim.dimension, num_heads, 3, out_dim.dimension // num_heads + ) + .permute(2, 1, 3, 0) + .reshape(-1, out_dim.dimension) +) +state_dict["in_proj_bias"] = ( + qkv_bias.raw_tensor.reshape(num_heads, 3, out_dim.dimension // num_heads) + .permute(1, 0, 2) + .reshape(-1) +) +state_dict["out_proj.weight"] = ( + proj_weight.raw_tensor.reshape( + num_heads, out_dim.dimension // num_heads, out_dim.dimension + ) + .permute(2, 0, 1) + .reshape(-1, out_dim.dimension) +) +state_dict["out_proj.bias"] = proj_bias.raw_tensor torch_mhsa.load_state_dict(state_dict) torch_mhsa.eval() rf_output = rf_mhsa(rf_input, axis=spatial_dim) -torch_output, _ = torch_mhsa(torch_input, torch_input, torch_input, key_padding_mask=None, need_weights=False) +torch_output, _ = torch_mhsa( + torch_input, torch_input, torch_input, key_padding_mask=None, need_weights=False +) print("RF output") print(rf_output.raw_tensor) @@ -84,7 +112,7 @@ print("Torch 
output") print(torch_output) print(torch_output.shape) -print("Same: ", torch.allclose(rf_output.raw_tensor, torch_output, atol=1e-6)) +print("Same: ", torch.allclose(rf_output.raw_tensor, torch_output, atol=1e-3)) ## extended checks @@ -96,7 +124,7 @@ any = False for perm in itertools.permutations(weights): - for q_inv, k_inv, v_inv in itertools.product([0,1], [0,1], [0,1]): + for q_inv, k_inv, v_inv in itertools.product([0, 1], [0, 1], [0, 1]): q_raw, k_raw, v_raw = perm if q_inv == 1: @@ -109,16 +137,22 @@ qkv_weight_adj_raw = torch.cat([q_raw.T, k_raw.T, v_raw.T], dim=1) state_dict = torch_mhsa.state_dict() - state_dict['in_proj_weight'] = qkv_weight_adj_raw.T - state_dict['in_proj_bias'] = qkv_bias.raw_tensor - state_dict['out_proj.weight'] = proj_weight.raw_tensor - state_dict['out_proj.bias'] = proj_bias.raw_tensor + state_dict["in_proj_weight"] = qkv_weight_adj_raw.T + state_dict["in_proj_bias"] = qkv_bias.raw_tensor + state_dict["out_proj.weight"] = proj_weight.raw_tensor + state_dict["out_proj.bias"] = proj_bias.raw_tensor torch_mhsa.load_state_dict(state_dict) torch_mhsa.eval() rf_output = rf_mhsa(rf_input, axis=spatial_dim) - torch_output, _ = torch_mhsa(torch_input, torch_input, torch_input, key_padding_mask=None, need_weights=False) + torch_output, _ = torch_mhsa( + torch_input, + torch_input, + torch_input, + key_padding_mask=None, + need_weights=False, + ) print("RF output") print(rf_output.raw_tensor) diff --git a/users/gaudino/experiments/rf_conformer_rnnt_2024/librispeech_960/conformer_ctc_train.py b/users/gaudino/experiments/rf_conformer_rnnt_2024/librispeech_960/conformer_ctc_train.py index 3787a9f76..880000ee1 100644 --- a/users/gaudino/experiments/rf_conformer_rnnt_2024/librispeech_960/conformer_ctc_train.py +++ b/users/gaudino/experiments/rf_conformer_rnnt_2024/librispeech_960/conformer_ctc_train.py @@ -123,18 +123,18 @@ def sis_run_with_prefix(prefix_name: Optional[str] = None): # }, # ) - train_exp( # dev-other 9.01 - 
"base-24gb-lrlin1e_5_600k_ctc_only", - config_24gb_v6, - config_updates={ - "learning_rate": 1.0, - "dynamic_learning_rate": dyn_lr_piecewise_linear, - # total steps after 2000 epochs: 982.312 - "learning_rate_piecewise_steps": [600_000, 900_000, 982_000], - "learning_rate_piecewise_values": [1e-5, 1e-3, 1e-5, 1e-6], - "aux_loss_layers":[], - }, - ) + # train_exp( # dev-other 9.01 + # "base-24gb-lrlin1e_5_600k_ctc_only", + # config_24gb_v6, + # config_updates={ + # "learning_rate": 1.0, + # "dynamic_learning_rate": dyn_lr_piecewise_linear, + # # total steps after 2000 epochs: 982.312 + # "learning_rate_piecewise_steps": [600_000, 900_000, 982_000], + # "learning_rate_piecewise_values": [1e-5, 1e-3, 1e-5, 1e-6], + # "aux_loss_layers":[], + # }, + # ) train_exp( # dev-other 6.93 "base-24gb-lrlin1e_5_600k_ctc_only_aux4_8", diff --git a/users/gaudino/experiments/rf_conformer_rnnt_2024/librispeech_960/conformer_diff_i6_models_vs_rf.md b/users/gaudino/experiments/rf_conformer_rnnt_2024/librispeech_960/conformer_diff_i6_models_vs_rf.md new file mode 100644 index 000000000..464916a8d --- /dev/null +++ b/users/gaudino/experiments/rf_conformer_rnnt_2024/librispeech_960/conformer_diff_i6_models_vs_rf.md @@ -0,0 +1,85 @@ +# Importing i6_models conformer into returnn frontend setup + +## Structural differences +| Difference | * | i6_models | rf | +|--------------------------------------------------------|---|-----------|--| +| input_projection uses bias | | yes | no | +| self attention qkv uses bias | | yes | no | +| convolutional kernel size | | 31 | 32 | +| positional encoding in mhsa | * | no | yes | +| batch norm | * | no | yes | +| ff_activation | | silu | rf.relu(x) ** 2.0 | +| vgg frontend (several differences) | | | | +| feature extraction (not sure of the exact difference ) | | | | + +VGG Frontend configuration for rf: +``` +ConformerConvSubsampleV2( + in_dim, + out_dims=[ + Dim(32, name="conv1"), + Dim(64, name="conv2"), + Dim(64, name="conv3"), + Dim(32, 
name="conv4"), # Changed: Dim(64, name="conv4") + ], + filter_sizes=[(3, 3), (3, 3), (3, 3), (3, 3)], # Changed + activation_times=[False, True, False, True], # Changed + pool_sizes=[(1, 1), (3, 1), (1, 1), (2, 1)], # Changed + strides=[(1, 1), (1, 1), (1, 1), (1, 1)], # Changed + padding="same", # Changed: padding="valid" + pool_padding="valid", # Changed + swap_merge_dim_order=True, # Changed + # Note: uses relu activation by default + ), +``` + +## Adjust weights + +Transpose: + +``` +_transpose_list = [ + "encoder_out_linear.weight", + "encoder.input_projection.weight", + "joiner.linear.weight", + "predictor.linear.weight", +] + +for layer_idx in range(12): + _transpose_list.append(f"encoder.layers.{layer_idx}.ffn1.linear_ff.weight") + _transpose_list.append(f"encoder.layers.{layer_idx}.ffn1.linear_out.weight") + _transpose_list.append(f"encoder.layers.{layer_idx}.ffn2.linear_ff.weight") + _transpose_list.append(f"encoder.layers.{layer_idx}.ffn2.linear_out.weight") + + _transpose_list.append( + f"encoder.layers.{layer_idx}.conv_block.positionwise_conv1.weight" + ) + _transpose_list.append( + f"encoder.layers.{layer_idx}.conv_block.positionwise_conv2.weight" + ) + +``` + +Adjust self att weights: +``` + if name.endswith(".self_att.qkv.weight"): + value = ckpt["model"][ + f"conformer.module_list.{layer_idx}.mhsa.mhsa.in_proj_weight" + ] + value = value.reshape(3, num_heads, self_att_dim // num_heads, self_att_dim).permute(3, 1, 0, 2).reshape(self_att_dim, -1) + return value.numpy() + + if name.endswith(".self_att.qkv.bias"): + value = ckpt["model"][ + f"conformer.module_list.{layer_idx}.mhsa.mhsa.in_proj_bias" + ] + value = value.reshape(3, num_heads, self_att_dim // num_heads).permute(1, 0, 2).reshape(-1) + return value.numpy() + + if name.endswith(".self_att.proj.weight"): + value = ckpt["model"][ + f"conformer.module_list.{layer_idx}.mhsa.mhsa.out_proj.weight" + ] + value = value.reshape(self_att_dim, num_heads, self_att_dim // num_heads).permute(1, 2, 
0).reshape(self_att_dim, -1) + return value.numpy() +``` \ No newline at end of file diff --git a/users/gaudino/experiments/rf_robin_rnnt_2024/__init__.py b/users/gaudino/experiments/rf_robin_rnnt_2024/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/users/gaudino/experiments/rf_robin_rnnt_2024/luca_example_transducer/ReturnnForwardJobV2/output/recog_config.config b/users/gaudino/experiments/rf_robin_rnnt_2024/luca_example_transducer/ReturnnForwardJobV2/output/recog_config.config new file mode 100644 index 000000000..d4c45598a --- /dev/null +++ b/users/gaudino/experiments/rf_robin_rnnt_2024/luca_example_transducer/ReturnnForwardJobV2/output/recog_config.config @@ -0,0 +1,136 @@ +#!rnn.py + + +accum_grad_multiple_step = 4 +backend = "torch" +batch_size = 2400000 +batching = "random" +beam_search_opts = {"beam_size": 12} +debug_print_layer_output_template = True +default_input = "data" +device = "gpu" +forward_data = { + "class": "MetaDataset", + "data_map": { + "data": ("zip_dataset", "data"), + "targets": ("zip_dataset", "classes"), + }, + "datasets": { + "zip_dataset": { + "audio": { + "features": "raw", + "peak_normalization": True, + "pre_process": None, + "preemphasis": None, + }, + "class": "OggZipDataset", + "epoch_wise_filter": None, + "fixed_random_subset": None, + "partition_epoch": 1, + "path": [ + "/u/zeineldeen/setups/librispeech/2022-11-28--conformer-att/work/i6_core/returnn/oggzip/BlissToOggZipJob.NSdIHfk1iw2M/output/out.ogg.zip" + ], + "segment_file": None, + "seq_ordering": "sorted_reverse", + "targets": { + "bpe_file": "/u/zeineldeen/setups/librispeech/2022-11-28--conformer-att/work/i6_core/text/label/subword_nmt/train/ReturnnTrainBpeJob.vTq56NZ8STWt/output/bpe.codes", + "class": "BytePairEncoding", + "seq_postfix": [0], + "unknown_label": None, + "vocab_file": "/u/zeineldeen/setups/librispeech/2022-11-28--conformer-att/work/i6_core/text/label/subword_nmt/train/ReturnnTrainBpeJob.vTq56NZ8STWt/output/bpe.vocab", + }, + 
"use_cache_manager": True, + } + }, + "seq_order_control_dataset": "zip_dataset", +} + +log = ["./returnn.log"] +log_batch_size = True +log_verbosity = 5 +max_seqs = 200 +non_blank_vocab = { + "bos_label": 0, + "bpe_file": "/u/zeineldeen/setups/librispeech/2022-11-28--conformer-att/work/i6_core/text/label/subword_nmt/train/ReturnnTrainBpeJob.vTq56NZ8STWt/output/bpe.codes", + "eos_label": 0, + "unknown_label": None, + "vocab_file": "/u/zeineldeen/setups/librispeech/2022-11-28--conformer-att/work/i6_core/text/label/subword_nmt/train/ReturnnTrainBpeJob.vTq56NZ8STWt/output/bpe.vocab", +} +optimizer = {"class": "adamw", "epsilon": 1e-08, "weight_decay": 1e-06} +search_output_layer = "decision" +target = "targets" +task = "forward" +tf_log_dir = "returnn-tf-log" +torch_dataloader_opts = {"num_workers": 1} +torch_log_memory_usage = True +config = {} + +locals().update(**config) + +import os +import sys + +sys.path.insert(0, "/u/schmitt/experiments/segmental_models_2022_23_rf/recipe") +sys.path.insert(1, "/u/schmitt/src/sisyphus") +from returnn.tensor import Dim, batch_dim, single_step_dim +from returnn.tensor.marked_dim import ImplicitDynSizeDim, ImplicitSparseDim + +time_dim = Dim(description="time", dimension=None, kind=Dim.Types.Spatial) +audio_dim = Dim(description="audio", dimension=1, kind=Dim.Types.Feature) +out_spatial_dim = Dim(description="out_spatial", dimension=None, kind=Dim.Types.Spatial) +vocab_dim = Dim(description="vocab", dimension=10026, kind=Dim.Types.Spatial) + +extern_data = { + "data": {"dim_tags": [batch_dim, time_dim, audio_dim]}, + "targets": {"dim_tags": [batch_dim, out_spatial_dim], "sparse_dim": vocab_dim}, +} + +from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.network_builder_rf.segmental.luca_example_transducer.transducer_model_luca import ( + from_scratch_model_def as _model_def, +) +from 
i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.network_builder_rf.segmental.luca_example_transducer.transducer_model_luca import ( + _returnn_v2_get_model as get_model, +) +from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.network_builder_rf.segmental.recog import ( + model_recog as _recog_def, +) +from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.network_builder_rf.recog import ( + _returnn_v2_forward_step as forward_step, +) +from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.network_builder_rf.recog import ( + _returnn_v2_get_forward_callback as forward_callback, +) + +# https://github.com/rwth-i6/returnn/issues/957 +# https://stackoverflow.com/a/16248113/133374 +import resource +import sys + +try: + resource.setrlimit(resource.RLIMIT_STACK, (2**29, -1)) +except Exception as exc: + print(f"resource.setrlimit {type(exc).__name__}: {exc}") +sys.setrecursionlimit(10**6) +_cf_cache = {} + + +def cf(filename): + "Cache manager" + from subprocess import check_output, CalledProcessError + + if filename in _cf_cache: + return _cf_cache[filename] + if int(os.environ.get("RETURNN_DEBUG", "0")): + print("use local file: %s" % filename) + return filename # for debugging + try: + cached_fn = check_output(["cf", filename]).strip().decode("utf8") + except CalledProcessError: + print("Cache manager: Error occurred, using local file") + return filename + assert os.path.exists(cached_fn) + _cf_cache[filename] = cached_fn + return cached_fn + + +# -*- mode: python; tab-width: 4 -*- diff --git a/users/gaudino/experiments/rf_robin_rnnt_2024/luca_example_transducer/ReturnnForwardJobV2/work/rnn.sh b/users/gaudino/experiments/rf_robin_rnnt_2024/luca_example_transducer/ReturnnForwardJobV2/work/rnn.sh new file mode 100755 index 
000000000..6edd841f1 --- /dev/null +++ b/users/gaudino/experiments/rf_robin_rnnt_2024/luca_example_transducer/ReturnnForwardJobV2/work/rnn.sh @@ -0,0 +1,2 @@ +#!/usr/bin/env bash +/usr/bin/python3 /u/schmitt/src/returnn_new/rnn.py /u/schmitt/experiments/segmental_models_2022_23_rf_pycharm/recipe/i6_experiments/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/luca_example_transducer/ReturnnForwardJobV2/output/recog_config.config \ No newline at end of file diff --git a/users/gaudino/experiments/rf_robin_rnnt_2024/luca_example_transducer/ReturnnTrainingJob/output/train_config.config b/users/gaudino/experiments/rf_robin_rnnt_2024/luca_example_transducer/ReturnnTrainingJob/output/train_config.config new file mode 100644 index 000000000..0b9458b25 --- /dev/null +++ b/users/gaudino/experiments/rf_robin_rnnt_2024/luca_example_transducer/ReturnnTrainingJob/output/train_config.config @@ -0,0 +1,263 @@ +#!rnn.py + + +def dyn_lr_piecewise_linear( + *, global_train_step: int, learning_rate: float, **_kwargs +) -> float: + """ + Piecewise linear + """ + from returnn.config import get_global_config + + config = get_global_config() + + steps = config.int_list("learning_rate_piecewise_steps") + lrs = config.float_list("learning_rate_piecewise_values") + assert len(steps) + 1 == len(lrs) + + last_step = 0 + for i, step in enumerate(steps): + assert step > last_step + assert global_train_step >= last_step + if global_train_step < step: + factor = (global_train_step + 1 - last_step) / (step - last_step) + return learning_rate * (lrs[i + 1] * factor + lrs[i] * (1 - factor)) + last_step = step + + return learning_rate * lrs[-1] + + +accum_grad_multiple_step = 4 +aux_loss_layers = None +backend = "torch" +batch_size = 2400000 +batching = "laplace:.1000" +cleanup_old_models = {"keep": [500], "keep_best_n": 4, "keep_last_n": 1} +debug_print_layer_output_template = True +default_input = "data" +dev = { + "class": 
"MetaDataset", + "data_map": {"data": ("zip_dataset", "data"), "targets": ("align", "data")}, + "datasets": { + "align": { + "class": "HDFDataset", + "files": [ + "/u/schmitt/experiments/segmental_models_2022_23_rf/work/i6_core/returnn/forward/ReturnnForwardJob.xERLI3g7bpFq/output/alignments.hdf" + ], + "partition_epoch": 1, + "seq_list_filter_file": "/u/zeineldeen/setups/librispeech/2022-11-28--conformer-att/work/i6_core/text/processing/PipelineJob.gTty7UHs0uBu/output/out", + "use_cache_manager": True, + }, + "zip_dataset": { + "audio": { + "features": "raw", + "peak_normalization": True, + "pre_process": None, + "preemphasis": None, + }, + "class": "OggZipDataset", + "epoch_wise_filter": None, + "fixed_random_subset": None, + "partition_epoch": 1, + "path": [ + "/u/zeineldeen/setups/librispeech/2022-11-28--conformer-att/work/i6_core/returnn/oggzip/BlissToOggZipJob.RvwLniNrgMit/output/out.ogg.zip", + "/u/zeineldeen/setups/librispeech/2022-11-28--conformer-att/work/i6_core/returnn/oggzip/BlissToOggZipJob.NSdIHfk1iw2M/output/out.ogg.zip", + ], + "segment_file": "/u/zeineldeen/setups/librispeech/2022-11-28--conformer-att/work/i6_core/text/processing/PipelineJob.gTty7UHs0uBu/output/out", + "seq_ordering": "sorted_reverse", + "targets": { + "bpe_file": "/u/zeineldeen/setups/librispeech/2022-11-28--conformer-att/work/i6_core/text/label/subword_nmt/train/ReturnnTrainBpeJob.vTq56NZ8STWt/output/bpe.codes", + "class": "BytePairEncoding", + "seq_postfix": [0], + "unknown_label": None, + "vocab_file": "/u/zeineldeen/setups/librispeech/2022-11-28--conformer-att/work/i6_core/text/label/subword_nmt/train/ReturnnTrainBpeJob.vTq56NZ8STWt/output/bpe.vocab", + }, + "use_cache_manager": True, + }, + }, + "seq_order_control_dataset": "zip_dataset", +} +device = "gpu" +dynamic_learning_rate = dyn_lr_piecewise_linear +eval_datasets = { + "devtrain": { + "class": "MetaDataset", + "data_map": {"data": ("zip_dataset", "data"), "targets": ("align", "data")}, + "datasets": { + "align": { + 
"class": "HDFDataset", + "files": [ + "/u/schmitt/experiments/segmental_models_2022_23_rf/work/i6_core/returnn/forward/ReturnnForwardJob.4k03LS27KUmL/output/alignments.hdf" + ], + "partition_epoch": 1, + "seq_list_filter_file": None, + "use_cache_manager": True, + }, + "zip_dataset": { + "audio": { + "features": "raw", + "peak_normalization": True, + "pre_process": None, + "preemphasis": None, + }, + "class": "OggZipDataset", + "epoch_wise_filter": None, + "fixed_random_subset": 3000, + "partition_epoch": 1, + "path": [ + "/u/zeineldeen/setups/librispeech/2022-11-28--conformer-att/work/i6_core/returnn/oggzip/BlissToOggZipJob.Cbboscd6En6A/output/out.ogg.zip" + ], + "segment_file": None, + "seq_ordering": "sorted_reverse", + "targets": { + "bpe_file": "/u/zeineldeen/setups/librispeech/2022-11-28--conformer-att/work/i6_core/text/label/subword_nmt/train/ReturnnTrainBpeJob.vTq56NZ8STWt/output/bpe.codes", + "class": "BytePairEncoding", + "seq_postfix": [0], + "unknown_label": None, + "vocab_file": "/u/zeineldeen/setups/librispeech/2022-11-28--conformer-att/work/i6_core/text/label/subword_nmt/train/ReturnnTrainBpeJob.vTq56NZ8STWt/output/bpe.vocab", + }, + "use_cache_manager": True, + }, + }, + "seq_order_control_dataset": "zip_dataset", + } +} +grad_scaler = None +gradient_clip_global_norm = 5.0 +learning_rate = 1.0 +learning_rate_file = "learning_rates" +learning_rate_piecewise_steps = [295000, 590000, 652000] +learning_rate_piecewise_values = [1e-05, 0.001, 1e-05, 1e-06] +log = ["./returnn.log"] +log_batch_size = True +log_verbosity = 5 +max_seqs = 200 +model = "/u/schmitt/experiments/segmental_models_2022_23_rf/work/i6_core/returnn/training/ReturnnTrainingJob.9zPcngFbJCDE/output/models/epoch" +num_epochs = 500 +optimizer = { + "class": "adamw", + "epsilon": 1e-16, + "weight_decay": 1e-06, + "weight_decay_modules_blacklist": [ + "rf.Embedding", + "rf.LearnedRelativePositionalEncoding", + ], +} +pos_emb_dropout = 0.1 +rf_att_dropout_broadcast = False +save_interval = 1 
+target = "targets" +task = "train" +torch_dataloader_opts = {"num_workers": 1} +torch_distributed = {} +torch_log_memory_usage = True +train = { + "class": "MetaDataset", + "data_map": {"data": ("zip_dataset", "data"), "targets": ("align", "data")}, + "datasets": { + "align": { + "class": "HDFDataset", + "files": [ + "/u/schmitt/experiments/segmental_models_2022_23_rf/work/i6_core/returnn/forward/ReturnnForwardJob.4k03LS27KUmL/output/alignments.hdf" + ], + "partition_epoch": 20, + "seq_list_filter_file": None, + "use_cache_manager": True, + }, + "zip_dataset": { + "audio": { + "features": "raw", + "peak_normalization": True, + "pre_process": None, + "preemphasis": None, + }, + "class": "OggZipDataset", + "epoch_wise_filter": {(1, 5): {"max_mean_len": 1000}}, + "fixed_random_subset": None, + "partition_epoch": 20, + "path": [ + "/u/zeineldeen/setups/librispeech/2022-11-28--conformer-att/work/i6_core/returnn/oggzip/BlissToOggZipJob.Cbboscd6En6A/output/out.ogg.zip" + ], + "segment_file": None, + "seq_ordering": "laplace:.1000", + "targets": { + "bpe_file": "/u/zeineldeen/setups/librispeech/2022-11-28--conformer-att/work/i6_core/text/label/subword_nmt/train/ReturnnTrainBpeJob.vTq56NZ8STWt/output/bpe.codes", + "class": "BytePairEncoding", + "seq_postfix": [0], + "unknown_label": None, + "vocab_file": "/u/zeineldeen/setups/librispeech/2022-11-28--conformer-att/work/i6_core/text/label/subword_nmt/train/ReturnnTrainBpeJob.vTq56NZ8STWt/output/bpe.vocab", + }, + "use_cache_manager": True, + }, + }, + "seq_order_control_dataset": "zip_dataset", +} +use_horovod = True +config = {} + +locals().update(**config) + +import os +import sys + +sys.path.insert(0, "/u/schmitt/experiments/segmental_models_2022_23_rf/recipe") +sys.path.insert(1, "/u/schmitt/src/sisyphus") +from returnn.tensor import Dim, batch_dim, single_step_dim +from returnn.tensor.marked_dim import ImplicitDynSizeDim, ImplicitSparseDim + +time_dim = Dim(description="time", dimension=None, kind=Dim.Types.Spatial) 
+audio_dim = Dim(description="audio", dimension=1, kind=Dim.Types.Feature) +out_spatial_dim = Dim(description="out_spatial", dimension=None, kind=Dim.Types.Spatial) +vocab_dim = Dim(description="vocab", dimension=10026, kind=Dim.Types.Spatial) + +extern_data = { + "data": {"dim_tags": [batch_dim, time_dim, audio_dim]}, + "targets": {"dim_tags": [batch_dim, out_spatial_dim], "sparse_dim": vocab_dim}, +} + +from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.network_builder_rf.segmental.luca_example_transducer.transducer_model_luca import ( + from_scratch_model_def as _model_def, +) +from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.network_builder_rf.segmental.luca_example_transducer.transducer_model_luca import ( + _returnn_v2_get_model as get_model, +) +from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.network_builder_rf.segmental.train import ( + viterbi_training as _train_def, +) +from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.network_builder_rf.segmental.train import ( + _returnn_v2_train_step as train_step, +) + +# https://github.com/rwth-i6/returnn/issues/957 +# https://stackoverflow.com/a/16248113/133374 +import resource +import sys + +try: + resource.setrlimit(resource.RLIMIT_STACK, (2**29, -1)) +except Exception as exc: + print(f"resource.setrlimit {type(exc).__name__}: {exc}") +sys.setrecursionlimit(10**6) +_cf_cache = {} + + +def cf(filename): + "Cache manager" + from subprocess import check_output, CalledProcessError + + if filename in _cf_cache: + return _cf_cache[filename] + if int(os.environ.get("RETURNN_DEBUG", "0")): + print("use local file: %s" % filename) + return filename # for debugging + try: + cached_fn = check_output(["cf", filename]).strip().decode("utf8") + except CalledProcessError: 
+ print("Cache manager: Error occurred, using local file") + return filename + assert os.path.exists(cached_fn) + _cf_cache[filename] = cached_fn + return cached_fn + + +# -*- mode: python; tab-width: 4 -*- diff --git a/users/gaudino/experiments/rf_robin_rnnt_2024/luca_example_transducer/ReturnnTrainingJob/work/rnn.sh b/users/gaudino/experiments/rf_robin_rnnt_2024/luca_example_transducer/ReturnnTrainingJob/work/rnn.sh new file mode 100755 index 000000000..b3bc931a7 --- /dev/null +++ b/users/gaudino/experiments/rf_robin_rnnt_2024/luca_example_transducer/ReturnnTrainingJob/work/rnn.sh @@ -0,0 +1,2 @@ +#!/usr/bin/env bash +/usr/bin/python3 -mtorch.distributed.run --standalone --nnodes=1 --nproc-per-node=4 /u/schmitt/src/returnn_new/rnn.py /u/schmitt/experiments/segmental_models_2022_23_rf_pycharm/recipe/i6_experiments/users/schmitt/experiments/config/pipelines/global_vs_segmental_2022_23_rf/dependencies/returnn/network_builder_rf/segmental/luca_example_transducer/ReturnnTrainingJob/output/train_config.config \ No newline at end of file diff --git a/users/gaudino/experiments/rf_robin_rnnt_2024/luca_example_transducer/__init__.py b/users/gaudino/experiments/rf_robin_rnnt_2024/luca_example_transducer/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/users/gaudino/experiments/rf_robin_rnnt_2024/luca_example_transducer/transducer_model_luca.py b/users/gaudino/experiments/rf_robin_rnnt_2024/luca_example_transducer/transducer_model_luca.py new file mode 100644 index 000000000..07655dab5 --- /dev/null +++ b/users/gaudino/experiments/rf_robin_rnnt_2024/luca_example_transducer/transducer_model_luca.py @@ -0,0 +1,234 @@ +from typing import Optional, Dict, Any, Sequence, Tuple, List +import functools + +from returnn.tensor import Dim +import returnn.frontend as rf + +from i6_experiments.users.schmitt.returnn_frontend.model_interfaces.model import ModelDef +from 
i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.network_builder_rf.base import _batch_size_factor, _log_mel_feature_dim +from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.network_builder_rf.segmental.model_new.label_model.model import ( + SegmentalAttEfficientLabelDecoder +) +from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.network_builder_rf.encoder.global_ import GlobalConformerEncoder +from i6_experiments.users.schmitt.returnn_frontend.model_interfaces.supports_label_scorer_torch import RFModelWithMakeLabelScorer + + +class SegmentalAttentionModel(rf.Module): + def __init__( + self, + *, + center_window_size: int, + align_target_dim: Dim, + target_dim: Dim, + blank_idx: int, + enc_key_total_dim: Dim = Dim(name="enc_key_total_dim", dimension=1024), + att_dropout: float = 0.1, + l2: float = 0.0001, + language_model: Optional[RFModelWithMakeLabelScorer] = None, + enc_in_dim: Dim, + enc_out_dim: Dim = Dim(name="enc", dimension=512), + enc_num_layers: int = 12, + enc_aux_logits: Sequence[int] = (), # layers + enc_ff_dim: Dim = Dim(name="enc-ff", dimension=2048), + enc_num_heads: int = 4, + encoder_layer_opts: Optional[Dict[str, Any]] = None, + dec_att_num_heads: Dim = Dim(name="att_num_heads", dimension=1), + enc_dropout: float = 0.1, + use_att_ctx_in_state: bool = True, + blank_decoder_version: int = 1, + use_joint_model: bool = False, + use_weight_feedback: bool = True, + label_decoder_state: str = "nb-lstm", + ): + super(SegmentalAttentionModel, self).__init__() + + self.encoder = GlobalConformerEncoder( + enc_in_dim, + enc_out_dim, + num_layers=enc_num_layers, + target_dim=target_dim, + wb_target_dim=align_target_dim, + aux_logits=enc_aux_logits, + ff_dim=enc_ff_dim, + num_heads=enc_num_heads, + encoder_layer_opts=encoder_layer_opts, + enc_key_total_dim=enc_key_total_dim, + 
dec_att_num_heads=dec_att_num_heads, + dropout=enc_dropout, + att_dropout=att_dropout, + l2=l2, + ) + + assert label_decoder_state in {"nb-lstm", "joint-lstm", "nb-2linear-ctx1"} + + self.label_decoder = SegmentalAttEfficientLabelDecoder( + enc_out_dim=self.encoder.out_dim, + target_dim=target_dim, + att_num_heads=dec_att_num_heads, + att_dropout=att_dropout, + blank_idx=blank_idx, + enc_key_total_dim=enc_key_total_dim, + l2=l2, + center_window_size=center_window_size, + use_weight_feedback=use_weight_feedback, + use_att_ctx_in_state=use_att_ctx_in_state, + decoder_state=label_decoder_state, + ) + + if language_model: + self.language_model, self.language_model_make_label_scorer = language_model + else: + self.language_model = None + self.language_model_make_label_scorer = None + + self.blank_idx = self.label_decoder.blank_idx + self.center_window_size = center_window_size + self.target_dim = self.label_decoder.target_dim + self.align_target_dim = align_target_dim + self.use_joint_model = use_joint_model + self.blank_decoder_version = blank_decoder_version + self.label_decoder_state = label_decoder_state + + +class MakeModel: + """for import""" + + def __init__(self, in_dim: int, align_target_dim: int, target_dim: int, *, center_window_size: int, eos_label: int = 0, num_enc_layers: int = 12): + self.in_dim = in_dim + self.align_target_dim = align_target_dim + self.target_dim = target_dim + self.center_window_size = center_window_size + self.eos_label = eos_label + self.num_enc_layers = num_enc_layers + + def __call__(self) -> SegmentalAttentionModel: + from returnn.datasets.util.vocabulary import Vocabulary + + in_dim = Dim(name="in", dimension=self.in_dim, kind=Dim.Types.Feature) + align_target_dim = Dim(name="align_target", dimension=self.align_target_dim, kind=Dim.Types.Feature) + target_dim = Dim(name="non_blank_target", dimension=self.target_dim, kind=Dim.Types.Feature) + target_dim.vocab = Vocabulary.create_vocab_from_labels( + [str(i) for i in 
range(target_dim.dimension)], eos_label=self.eos_label + ) + + return self.make_model(in_dim, align_target_dim, target_dim, center_window_size=self.center_window_size) + + @classmethod + def make_model( + cls, + in_dim: Dim, + align_target_dim: Dim, + target_dim: Dim, + *, + num_enc_layers: int = 12, + pos_emb_dropout: float = 0.0, + language_model: Optional[Dict[str, Any]] = None, + enc_out_dim: int, + enc_key_total_dim: int, + enc_ff_dim: int, + **extra, + ) -> SegmentalAttentionModel: + """make""" + lm = None + if language_model: + assert isinstance(language_model, dict) + language_model = language_model.copy() + cls_name = language_model.pop("class") + assert cls_name == "TransformerDecoder" + language_model.pop("vocab_dim", None) # will just overwrite + + from i6_experiments.users.schmitt.experiments.config.pipelines.global_vs_segmental_2022_23_rf.dependencies.returnn.network_builder_rf.lm.trafo import model as trafo_lm + + lm = trafo_lm.MakeModel(vocab_dim=target_dim, **language_model)() + lm = (lm, functools.partial(trafo_lm.make_time_sync_label_scorer_torch, model=lm, align_target_dim=align_target_dim)) + + return SegmentalAttentionModel( + enc_in_dim=in_dim, + enc_num_layers=num_enc_layers, + enc_out_dim=Dim(name="enc", dimension=enc_out_dim, kind=Dim.Types.Feature), + enc_ff_dim=Dim(name="enc-ff", dimension=enc_ff_dim, kind=Dim.Types.Feature), + enc_key_total_dim=Dim(name="enc_key_total_dim", dimension=enc_key_total_dim), + enc_num_heads=8, + encoder_layer_opts=dict( + conv_norm_opts=dict(use_mask=True), + self_att_opts=dict( + # Shawn et al 2018 style, old RETURNN way. 
+ with_bias=False, + with_linear_pos=False, + with_pos_bias=False, + learnable_pos_emb=True, + separate_pos_emb_per_head=False, + pos_emb_dropout=pos_emb_dropout, + ), + ff_activation=lambda x: rf.relu(x) ** 2.0, + ), + target_dim=target_dim, + align_target_dim=align_target_dim, + blank_idx=0, + language_model=lm, + center_window_size=1, + use_att_ctx_in_state=False, + blank_decoder_version=3, + use_joint_model=True, + use_weight_feedback=False, + label_decoder_state="nb-lstm", + **extra, + ) + + +def from_scratch_model_def( + *, epoch: int, in_dim: Dim, align_target_dim: Dim, target_dim: Dim) -> SegmentalAttentionModel: + """Function is run within RETURNN.""" + from returnn.config import get_global_config + + in_dim, epoch # noqa + config = get_global_config() # noqa + enc_aux_logits = config.typed_value("aux_loss_layers") + pos_emb_dropout = config.float("pos_emb_dropout", 0.0) + # real input is raw audio, internally it does logmel + in_dim = Dim(name="logmel", dimension=_log_mel_feature_dim, kind=Dim.Types.Feature) + lm_opts = config.typed_value("external_lm") + + enc_out_dim = config.int("enc_out_dim", 512) + enc_key_total_dim = config.int("enc_key_total_dim", 1024) + enc_ff_dim = config.int("enc_ff_dim", 2048) + + return MakeModel.make_model( + in_dim, + align_target_dim, + target_dim, + enc_aux_logits=enc_aux_logits or (), + pos_emb_dropout=pos_emb_dropout, + language_model=lm_opts, + enc_out_dim=enc_out_dim, + enc_key_total_dim=enc_key_total_dim, + enc_ff_dim=enc_ff_dim, + ) + + +from_scratch_model_def: ModelDef[SegmentalAttentionModel] +from_scratch_model_def.behavior_version = 16 +from_scratch_model_def.backend = "torch" +from_scratch_model_def.batch_size_factor = _batch_size_factor + + +def _returnn_v2_get_model(*, epoch: int, **_kwargs_unused): + from returnn.tensor import Tensor + from returnn.config import get_global_config + from returnn.datasets.util.vocabulary import BytePairEncoding + + config = get_global_config() + default_input_key = 
config.typed_value("default_input") + default_target_key = config.typed_value("target") + extern_data_dict = config.typed_value("extern_data") + data = Tensor(name=default_input_key, **extern_data_dict[default_input_key]) + targets = Tensor(name=default_target_key, **extern_data_dict[default_target_key]) + + non_blank_vocab = config.typed_value("non_blank_vocab") + if non_blank_vocab is not None: + targets.sparse_dim.vocab = BytePairEncoding(**non_blank_vocab) + + model_def = config.typed_value("_model_def") + model = model_def( + epoch=epoch, in_dim=data.feature_dim, align_target_dim=targets.sparse_dim, target_dim=targets.sparse_dim) + return model diff --git a/users/gaudino/models/asr/rf/conformer_rnnt/model_conformer_rnnt.py b/users/gaudino/models/asr/rf/conformer_rnnt/model_conformer_rnnt.py index b9fc28ec7..a93eb834e 100644 --- a/users/gaudino/models/asr/rf/conformer_rnnt/model_conformer_rnnt.py +++ b/users/gaudino/models/asr/rf/conformer_rnnt/model_conformer_rnnt.py @@ -188,9 +188,10 @@ def make_model( enc_ff_dim=Dim(name="enc-ff", dimension=2048, kind=Dim.Types.Feature), enc_att_num_heads=8, enc_conformer_layer_opts=dict( - conv_norm_opts=dict( - use_mask=True, track_running_stats=False - ), # Changed: track_running_stats=False + conv_norm = rf.LayerNorm, + # conv_norm_opts=dict( + # in_dim= + # ), # Changed below self_att=rf.SelfAttention, self_att_opts=dict( with_bias=True, # Changed: with_bias=True From 05c2c3502fb58cddc098092952ab03bdc98700b2 Mon Sep 17 00:00:00 2001 From: Albert Zeyer Date: Thu, 20 Jun 2024 21:25:46 +0200 Subject: [PATCH 222/227] SamplingBytePairEncoding for SentencePiece --- users/zeyer/datasets/utils/spm.py | 11 +++++++++++ .../exp2024_04_23_baselines/ctc.py | 19 ++++++++++--------- 2 files changed, 21 insertions(+), 9 deletions(-) diff --git a/users/zeyer/datasets/utils/spm.py b/users/zeyer/datasets/utils/spm.py index 92116a16a..3ffc5b823 100644 --- a/users/zeyer/datasets/utils/spm.py +++ b/users/zeyer/datasets/utils/spm.py @@ -52,6 
+52,17 @@ def get_opts(self) -> Dict[str, Any]: } if self.other_opts: d.update(self.other_opts) + if d["class"] == "SamplingBytePairEncoding": + # Need to fix this a bit. model_file not used here. But we need vocab_file instead. + model_file = d.pop("model_file") + if not d.get("vocab_file"): + from i6_core.text.label.sentencepiece.vocab import ExtractSentencePieceVocabJob + + d["vocab_file"] = ExtractSentencePieceVocabJob(model_file).out_vocab + d.setdefault("word_prefix_symbol", "▁") + d.setdefault("unknown_label", self.unknown_label) + d.setdefault("bos_label", self.bos_idx) + d.setdefault("eos_label", self.eos_idx) return d def get_eos_idx(self) -> Optional[int]: diff --git a/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py b/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py index ef17629d9..bcbc97fbd 100644 --- a/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py +++ b/users/zeyer/experiments/exp2024_04_23_baselines/ctc.py @@ -140,19 +140,20 @@ def py(): ) # Comparing vocabs with better settings: feature norm, sampling, no max seq len. - for vocab, alpha in [ - ("spm20k", 0.7), - ("bpe10k", 0.01), - ("spm10k", 0.7), - # ("spm_bpe10k", ...), # unclear what sampling scheme... - ("spm4k", 0.7), - ("spm1k", 0.7), + for vocab, sample, alpha in [ + ("spm20k", "spm", 0.7), + ("bpe10k", "bpe", 0.01), + ("spm10k", "spm", 0.7), + ("spm10k", "bpe", 0.01), + ("spm_bpe10k", "bpe", 0.01), + ("spm4k", "spm", 0.7), + ("spm1k", "spm", 0.7), # ("spm_bpe1k", ...) 
]: train_exp( f"v6-bhv20-11gb-f32-bs15k-accgrad1-mgpu4-pavg100-maxSeqLenNone" f"-wd1e_2-lrlin1e_5_295k-featBN-speedpertV2-{vocab}" - f"-{'spmSample' if vocab.startswith('spm') else 'bpeSample'}{str(alpha).replace('.', '')}", + f"-{sample}Sample{str(alpha).replace('.', '')}", config_11gb_v6_f32_accgrad1_mgpu4_pavg100_wd1e_4, model_config={"feature_batch_norm": True}, config_updates={ @@ -166,7 +167,7 @@ def py(): train_vocab_opts={ "other_opts": ( {"enable_sampling": True, "alpha": alpha} - if vocab.startswith("spm") + if sample == "spm" else {"class": "SamplingBytePairEncoding", "breadth_prob": alpha} ) }, From ed093d93fbcc75a8ca706503eebdeb0f4de371e5 Mon Sep 17 00:00:00 2001 From: Nick Rossenbach Date: Thu, 20 Jun 2024 22:26:34 +0200 Subject: [PATCH 223/227] add gradient clipping to example baseline --- .../ctc_rnnt_standalone_2024/experiments/ctc_phon/baseline.py | 1 + 1 file changed, 1 insertion(+) diff --git a/example_setups/librispeech/ctc_rnnt_standalone_2024/experiments/ctc_phon/baseline.py b/example_setups/librispeech/ctc_rnnt_standalone_2024/experiments/ctc_phon/baseline.py index 6bdd99499..1769f682e 100644 --- a/example_setups/librispeech/ctc_rnnt_standalone_2024/experiments/ctc_phon/baseline.py +++ b/example_setups/librispeech/ctc_rnnt_standalone_2024/experiments/ctc_phon/baseline.py @@ -195,6 +195,7 @@ def tune_and_evaluate_helper( "max_seq_length": {"audio_features": 35 * 16000}, "accum_grad_multiple_step": 1, "torch_amp_options": {"dtype": "bfloat16"}, + "gradient_clip": 1.0, } network_module = "ctc.conformer_1023.i6modelsV1_VGG4LayerActFrontendV1_v6" From 1190b57d834c8573bedc87657224e0b306c3bb7a Mon Sep 17 00:00:00 2001 From: Nick Rossenbach Date: Thu, 20 Jun 2024 22:27:37 +0200 Subject: [PATCH 224/227] 2-precision WER and quantization helper --- .../ctc_rnnt_standalone_2024/pipeline.py | 97 +++++++++++++++++-- 1 file changed, 89 insertions(+), 8 deletions(-) diff --git a/users/rossenbach/experiments/librispeech/ctc_rnnt_standalone_2024/pipeline.py 
b/users/rossenbach/experiments/librispeech/ctc_rnnt_standalone_2024/pipeline.py index 01dc3ac5e..f5fdb0f86 100644 --- a/users/rossenbach/experiments/librispeech/ctc_rnnt_standalone_2024/pipeline.py +++ b/users/rossenbach/experiments/librispeech/ctc_rnnt_standalone_2024/pipeline.py @@ -31,6 +31,12 @@ class ASRModel: prior_file: Optional[tk.Path] prefix_name: Optional[str] +@dataclass +class NeuralLM: + checkpoint: tk.Path + net_args: Dict[str, Any] + network_module: str + prefix_name: Optional[str] def search_single( prefix_name: str, @@ -79,7 +85,7 @@ def search_single( stm_file = CorpusToStmJob(bliss_corpus=recognition_bliss_corpus).out_stm_path - sclite_job = ScliteJob(ref=stm_file, hyp=search_ctm, sctk_binary_path=SCTK_BINARY_PATH) + sclite_job = ScliteJob(ref=stm_file, hyp=search_ctm, sctk_binary_path=SCTK_BINARY_PATH, precision_ndigit=2) tk.register_output(prefix_name + "/sclite/wer", sclite_job.out_wer) tk.register_output(prefix_name + "/sclite/report", sclite_job.out_report_dir) @@ -204,6 +210,16 @@ def training(training_name, datasets, train_args, num_epochs, returnn_exe, retur return train_job +@dataclass +class QuantArgs: + quant_config_dict: Dict[str, Any] + num_samples: int + seed: int + datasets: TrainingDatasets + network_module: str + filter_args: Optional[Dict[str, Any]] = None + + def prepare_asr_model( training_name, train_job, @@ -214,6 +230,7 @@ def prepare_asr_model( get_best_averaged_checkpoint: Optional[Tuple[int, str]] = None, get_last_averaged_checkpoint: Optional[int] = None, prior_config: Optional[Dict[str, Any]] = None, + quant_args: Optional[QuantArgs] = None ): """ :param training_name: @@ -291,12 +308,76 @@ def prepare_asr_model( if prior_config is not None: raise ValueError("prior_config can only be set if with_prior is True") - asr_model = ASRModel( - checkpoint=checkpoint, - network_module=train_args["network_module"], - net_args=train_args["net_args"], - prior_file=prior_file, - prefix_name=training_name, - ) + if quant_args: + 
from .config import get_static_quant_config + quant_config = get_static_quant_config( + training_datasets=quant_args.datasets, + network_module=quant_args.network_module, + net_args=train_args["net_args"], + quant_args=quant_args.quant_config_dict, + config={}, + num_samples=quant_args.num_samples, + dataset_seed=quant_args.seed, + debug=False, + dataset_filter_args=quant_args.filter_args + ) + quant_chkpt = quantize_static( + prefix_name=training_name, + returnn_config=quant_config, + checkpoint=checkpoint, + returnn_exe=RETURNN_EXE, + returnn_root=MINI_RETURNN_ROOT, + ) + asr_model = ASRModel( + checkpoint=quant_chkpt, + net_args=train_args["net_args"] | quant_args.quant_config_dict, + network_module=quant_args.network_module, + prior_file=prior_file, + prefix_name=training_name + ) + else: + asr_model = ASRModel( + checkpoint=checkpoint, + network_module=train_args["network_module"], + net_args=train_args["net_args"], + prior_file=prior_file, + prefix_name=training_name, + ) return asr_model + + +@tk.block() +def quantize_static( + prefix_name: str, + returnn_config: ReturnnConfig, + checkpoint: tk.Path, + returnn_exe: tk.Path, + returnn_root: tk.Path, + mem_rqmt: int = 16, +): + """ + Run search for a specific test dataset + + :param prefix_name: prefix folder path for alias and output files + :param returnn_config: the RETURNN config to be used for forwarding + :param Checkpoint checkpoint: path to RETURNN PyTorch model checkpoint + :param returnn_exe: The python executable to run the job with (when using container just "python3") + :param returnn_root: Path to a checked out RETURNN repository + :param mem_rqmt: override the default memory requirement + """ + quantize_job = ReturnnForwardJobV2( + model_checkpoint=checkpoint, + returnn_config=returnn_config, + log_verbosity=5, + mem_rqmt=mem_rqmt, + time_rqmt=2, + device="gpu", + cpu_rqmt=8, + returnn_python_exe=returnn_exe, + returnn_root=returnn_root, + output_files=['model.pt', "seq_tags.txt"], + ) + 
quantize_job.set_keep_value(5) + quantize_job.add_alias(prefix_name + "/calibration") + return quantize_job.out_files['model.pt'] From 8799f2b93ea0f62e1c858c74cbde6513fb05619a Mon Sep 17 00:00:00 2001 From: Nick Rossenbach Date: Thu, 20 Jun 2024 22:29:16 +0200 Subject: [PATCH 225/227] HDF alignment labels example data pipeline --- .../data/alignment.py | 105 ++++++++++++++++++ .../ctc_rnnt_standalone_2024/data/common.py | 54 ++++++--- 2 files changed, 146 insertions(+), 13 deletions(-) create mode 100644 users/rossenbach/experiments/librispeech/ctc_rnnt_standalone_2024/data/alignment.py diff --git a/users/rossenbach/experiments/librispeech/ctc_rnnt_standalone_2024/data/alignment.py b/users/rossenbach/experiments/librispeech/ctc_rnnt_standalone_2024/data/alignment.py new file mode 100644 index 000000000..7f9e41c87 --- /dev/null +++ b/users/rossenbach/experiments/librispeech/ctc_rnnt_standalone_2024/data/alignment.py @@ -0,0 +1,105 @@ +from dataclasses import dataclass +from sisyphus import tk +from typing import List, Optional + +from i6_core.returnn.hdf import RasrAlignmentDumpHDFJob + +from i6_experiments.common.datasets.librispeech import get_ogg_zip_dict + +from .common import build_training_datasets, TrainingDatasets, DatasetSettings, TrainingHDFLabelFiles +from ..default_tools import MINI_RETURNN_ROOT, RETURNN_EXE + +@dataclass +class HDFAlignmentData: + alignment_caches: [tk.Path] + allophone_file: tk.Path + state_tying_file: tk.Path + + +def build_hdf_from_alignment( + prefix: str, + alignment_data: HDFAlignmentData, + returnn_root: tk.Path, +) -> List[tk.Path]: + """ + + :param prefix: + :param alignment_data: + :param returnn_root: + :return: list of HDF files (one for each cache) + """ + hdf_file_job = RasrAlignmentDumpHDFJob( + alignment_caches=alignment_data.alignment_caches, + allophone_file=alignment_data.allophone_file, + state_tying_file=alignment_data.state_tying_file, + returnn_root=returnn_root, + # TODO: unclear if it should be sparse + ) + 
hdf_file_job.add_alias(prefix + "/dump_hdf_job") + return hdf_file_job.out_hdf_files + + +def build_rasr_alignment_target_training_datasets( + prefix: str, + librispeech_key: str, + settings: DatasetSettings, + train_alignment_caches: List[tk.Path], + dev_clean_alignment_caches: List[tk.Path], + dev_other_alignment_caches: List[tk.Path], + allophone_file: tk.Path, + state_tying_file: tk.Path, +) -> TrainingDatasets: + """ + :param prefix: + :param librispeech_key: which librispeech corpus to use + :param settings: configuration object for the dataset pipeline + """ + ogg_zip_dict = get_ogg_zip_dict(prefix, returnn_root=MINI_RETURNN_ROOT, returnn_python_exe=RETURNN_EXE) + train_ogg = ogg_zip_dict[librispeech_key] + dev_clean_ogg = ogg_zip_dict["dev-clean"] + dev_other_ogg = ogg_zip_dict["dev-other"] + + train_alignment_data = HDFAlignmentData( + alignment_caches=train_alignment_caches, + allophone_file=allophone_file, + state_tying_file=state_tying_file + ) + + dev_clean_alignment_data = HDFAlignmentData( + alignment_caches=dev_clean_alignment_caches, + allophone_file=allophone_file, + state_tying_file=state_tying_file + ) + + dev_other_alignment_data = HDFAlignmentData( + alignment_caches=dev_other_alignment_caches, + allophone_file=allophone_file, + state_tying_file=state_tying_file + ) + + train_hdf_label_files = TrainingHDFLabelFiles( + train=build_hdf_from_alignment( + prefix=prefix + "/train", + alignment_data=train_alignment_data, + returnn_root=MINI_RETURNN_ROOT + ), + dev_clean=build_hdf_from_alignment( + prefix=prefix + "/dev_clean", + alignment_data=dev_clean_alignment_data, + returnn_root=MINI_RETURNN_ROOT + ), + dev_other=build_hdf_from_alignment( + prefix=prefix + "/dev_other", + alignment_data=dev_other_alignment_data, + returnn_root=MINI_RETURNN_ROOT + ), + ) + + return build_training_datasets( + train_ogg=train_ogg, + dev_clean_ogg=dev_clean_ogg, + dev_other_ogg=dev_other_ogg, + settings=settings, + label_datastream=None, + 
training_hdf_label_files=train_hdf_label_files, + ) diff --git a/users/rossenbach/experiments/librispeech/ctc_rnnt_standalone_2024/data/common.py b/users/rossenbach/experiments/librispeech/ctc_rnnt_standalone_2024/data/common.py index 81cd46ca3..0631df216 100644 --- a/users/rossenbach/experiments/librispeech/ctc_rnnt_standalone_2024/data/common.py +++ b/users/rossenbach/experiments/librispeech/ctc_rnnt_standalone_2024/data/common.py @@ -12,7 +12,7 @@ from i6_experiments.common.setups.returnn.datastreams.audio import AudioRawDatastream, ReturnnAudioRawOptions from i6_experiments.common.setups.returnn.datastreams.base import Datastream from i6_experiments.common.setups.returnn.datastreams.vocabulary import LabelDatastream -from i6_experiments.common.setups.returnn.datasets import Dataset, OggZipDataset, MetaDataset +from i6_experiments.common.setups.returnn.datasets import Dataset, OggZipDataset, MetaDataset, HDFDataset from .cv_segments import get_mixed_cv_segments @@ -96,13 +96,19 @@ def get_zip(alias_name: str, bliss_dataset: tk.Path) -> tk.Path: # --------------------------- Dataset functions ----------------------------------- +@dataclass() +class TrainingHDFLabelFiles: + train: List[tk.Path] + dev_clean: List[tk.Path] + dev_other: List[tk.Path] def build_training_datasets( train_ogg: Union[tk.Path, List[tk.Path]], dev_clean_ogg: tk.Path, dev_other_ogg: tk.Path, - label_datastream: LabelDatastream, + label_datastream: Optional[LabelDatastream], settings: DatasetSettings, + training_hdf_label_files: Optional[TrainingHDFLabelFiles] = None, ) -> TrainingDatasets: """ generic dataset construction helper to be used by the phon/bpe specific variants @@ -110,7 +116,9 @@ def build_training_datasets( :param train_ogg: path to the train zip, potentially containing altered transcriptions :param dev_clean_ogg: path to the ls dev-clean zip, potentially containing altered transcriptions :param dev_other_ogg: path to the ls dev-other zip, potentially containing altered 
transcriptions - :param label_datastream: label datastream (e.g. phoneme or bpe related) + :param label_datastream: label datastream for the ogg zip (e.g. phoneme or bpe related) + :param training_hdf_label_files: e.g. as alternative to label_datastream, use labels from HDF + (can be time-synchronous but do not have to) :param settings: settings object for the RETURNN data pipeline """ audio_datastream = get_audio_raw_datastream(settings.preemphasis, settings.peak_normalization) @@ -120,52 +128,72 @@ def build_training_datasets( "labels": label_datastream, } - data_map = {"raw_audio": ("zip_dataset", "data"), "labels": ("zip_dataset", "classes")} + if training_hdf_label_files is not None: + data_map = {"raw_audio": ("zip_dataset", "data"), "labels": ("hdf_dataset", "data")} + else: + data_map = {"raw_audio": ("zip_dataset", "data"), "labels": ("zip_dataset", "classes")} training_audio_opts = audio_datastream.as_returnn_audio_opts() - def make_meta(dataset: OggZipDataset): + def make_meta_oggzip(dataset: OggZipDataset, *args, **kwargs): return MetaDataset( data_map=data_map, datasets={"zip_dataset": dataset}, seq_order_control_dataset="zip_dataset" ) + def make_meta_oggzip_hdf(dataset: OggZipDataset, hdf_files: List): + return MetaDataset( + data_map=data_map, + datasets={ + "zip_dataset": dataset, + "hdf_dataset": HDFDataset( + files=hdf_files + ) + }, + seq_order_control_dataset="zip_dataset" + ) + + if training_hdf_label_files is None: + make_meta = make_meta_oggzip + else: + make_meta = make_meta_oggzip_hdf + train_zip_dataset = OggZipDataset( files=train_ogg, audio_options=training_audio_opts, - target_options=label_datastream.as_returnn_targets_opts(), + target_options=label_datastream.as_returnn_targets_opts() if label_datastream is not None else None, partition_epoch=settings.train_partition_epoch, seq_ordering=settings.train_seq_ordering, additional_options=settings.train_additional_options, ) - train_dataset = make_meta(train_zip_dataset) + train_dataset = 
make_meta(train_zip_dataset, training_hdf_label_files.train) cv_zip_dataset = OggZipDataset( files=[dev_clean_ogg, dev_other_ogg], audio_options=audio_datastream.as_returnn_audio_opts(), - target_options=label_datastream.as_returnn_targets_opts(), + target_options=label_datastream.as_returnn_targets_opts() if label_datastream is not None else None, segment_file=get_mixed_cv_segments(), seq_ordering="sorted_reverse", ) - cv_dataset = make_meta(cv_zip_dataset) + cv_dataset = make_meta(cv_zip_dataset, training_hdf_label_files.dev_clean + training_hdf_label_files.dev_other) devtrain_zip_dataset = OggZipDataset( files=train_ogg, audio_options=audio_datastream.as_returnn_audio_opts(), - target_options=label_datastream.as_returnn_targets_opts(), + target_options=label_datastream.as_returnn_targets_opts() if label_datastream is not None else None, seq_ordering="sorted_reverse", random_subset=3000, ) - devtrain_dataset = make_meta(devtrain_zip_dataset) + devtrain_dataset = make_meta(devtrain_zip_dataset, training_hdf_label_files.train) prior_zip_dataset = OggZipDataset( files=train_ogg, audio_options=training_audio_opts, - target_options=label_datastream.as_returnn_targets_opts(), + target_options=label_datastream.as_returnn_targets_opts() if label_datastream is not None else None, partition_epoch=1, seq_ordering="sorted_reverse", additional_options=None, ) - prior_dataset = make_meta(prior_zip_dataset) + prior_dataset = make_meta(prior_zip_dataset, training_hdf_label_files.train) return TrainingDatasets( train=train_dataset, From 0291789284b531075e2a5d783d3dc402e0c4a9a5 Mon Sep 17 00:00:00 2001 From: Peter Vieting Date: Fri, 21 Jun 2024 11:46:46 +0200 Subject: [PATCH 226/227] ls960 pretrain: fix python launcher for itc/i6 --- .../wav2vec2/config_01_fairseq_main.py | 9 +++++---- .../wav2vec2/config_02_fairseq_phoneme.py | 9 +++++---- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git 
a/users/vieting/experiments/librispeech/librispeech_960_pretraining/wav2vec2/config_01_fairseq_main.py b/users/vieting/experiments/librispeech/librispeech_960_pretraining/wav2vec2/config_01_fairseq_main.py index 4313042b7..2d5db0ade 100755 --- a/users/vieting/experiments/librispeech/librispeech_960_pretraining/wav2vec2/config_01_fairseq_main.py +++ b/users/vieting/experiments/librispeech/librispeech_960_pretraining/wav2vec2/config_01_fairseq_main.py @@ -139,10 +139,11 @@ def run_fairseq_pretraining(): fairseq_args = get_fairseq_args(num_gpus=num_gpus) fairseq_config = FairseqHydraConfig(fairseq_args) fairseq_root = get_fairseq_root() - fairseq_exe = tk.Path( - "/home/pv653172/setups/librispeech/20230328_wav2vec2/dependencies/python_launcher.sh", - hash_overwrite="python_launcher", - ) + itc_python_launcher = "/home/pv653172/setups/librispeech/20230328_wav2vec2/dependencies/python_launcher.sh" + if os.path.exists(itc_python_launcher): + fairseq_exe = tk.Path(itc_python_launcher, hash_overwrite="python_launcher") + else: + fairseq_exe = tk.Path("/usr/bin/python3", hash_overwrite="python_launcher") job = FairseqHydraTrainingJob( fairseq_config, save_interval=25, diff --git a/users/vieting/experiments/librispeech/librispeech_960_pretraining/wav2vec2/config_02_fairseq_phoneme.py b/users/vieting/experiments/librispeech/librispeech_960_pretraining/wav2vec2/config_02_fairseq_phoneme.py index 53aef623a..b347b7979 100644 --- a/users/vieting/experiments/librispeech/librispeech_960_pretraining/wav2vec2/config_02_fairseq_phoneme.py +++ b/users/vieting/experiments/librispeech/librispeech_960_pretraining/wav2vec2/config_02_fairseq_phoneme.py @@ -67,10 +67,11 @@ def run_fairseq_pretraining_negatives_other_target(): prefix_name = "experiments/librispeech/librispeech_960_pretraining/wav2vec2/" alignment = get_alignment_hdf() num_gpus = 8 - fairseq_python_exe = tk.Path( - "/home/pv653172/setups/librispeech/20230328_wav2vec2/dependencies/python_launcher.sh", - 
hash_overwrite="itc_python_launcher_py310_torch", - ) + itc_python_launcher = "/home/pv653172/setups/librispeech/20230328_wav2vec2/dependencies/python_launcher.sh" + if os.path.exists(itc_python_launcher): + fairseq_python_exe = tk.Path(itc_python_launcher, hash_overwrite="itc_python_launcher_py310_torch") + else: + fairseq_python_exe = tk.Path("/usr/bin/python3", hash_overwrite="itc_python_launcher_py310_torch") fairseq_root = get_fairseq_root(fairseq_exe=fairseq_python_exe) fairseq_training_args = dict( save_interval=25, From aa2b3fbca67bd9083e8aa26b5075c8f67a54e60b Mon Sep 17 00:00:00 2001 From: marvin84 Date: Fri, 21 Jun 2024 12:43:57 +0200 Subject: [PATCH 227/227] latest users/raissi --- .../decoder/BASE_factored_hybrid_search.py | 104 +++++++++++++++++- .../common/helpers/train/network_params.py | 8 ++ .../legacy/SWB_TF_factored_hybrid_system.py | 56 ++++++++-- users/raissi/utils/default_tools.py | 7 ++ 4 files changed, 165 insertions(+), 10 deletions(-) diff --git a/users/raissi/setups/common/decoder/BASE_factored_hybrid_search.py b/users/raissi/setups/common/decoder/BASE_factored_hybrid_search.py index 61bfbeb1f..187d6927f 100644 --- a/users/raissi/setups/common/decoder/BASE_factored_hybrid_search.py +++ b/users/raissi/setups/common/decoder/BASE_factored_hybrid_search.py @@ -1362,6 +1362,99 @@ def push_delayed_tuple( ) + def recognize_optimize_transtition_values( + self, + *, + label_info: LabelInfo, + num_encoder_output: int, + search_parameters: SearchParameters, + tdp_sil: Optional[List[Tuple[TDP, TDP, TDP, TDP]]] = None, + tdp_speech: Optional[List[Tuple[TDP, TDP, TDP, TDP]]] = None, + altas_value=14.0, + altas_beam=14.0, + keep_value=10, + gpu: Optional[bool] = None, + cpu_rqmt: Optional[int] = None, + mem_rqmt: Optional[int] = None, + crp_update: Optional[Callable[[rasr.RasrConfig], Any]] = None, + pre_path: str = "transition-values", + cpu_slow: bool = True, + ) -> SearchParameters: + + recog_args = dataclasses.replace(search_parameters, 
altas=altas_value, beam=altas_beam) + + tdp_sil = tdp_sil if tdp_sil is not None else [recog_args.tdp_silence] + tdp_speech = tdp_speech if tdp_speech is not None else [recog_args.tdp_speech] + jobs = { + (tdp_sl, tdp_sp): self.recognize_count_lm( + add_sis_alias_and_output=False, + calculate_stats=False, + cpu_rqmt=cpu_rqmt, + crp_update=crp_update, + gpu=gpu, + is_min_duration=False, + keep_value=keep_value, + label_info=label_info, + mem_rqmt=mem_rqmt, + name_override=f"{self.name}-tdpSil{tdp_sl}-tdpSp{tdp_sp}-", + num_encoder_output=num_encoder_output, + opt_lm_am=False, + rerun_after_opt_lm=False, + search_parameters=dataclasses.replace( + recog_args, tdp_silence=tdp_sl, tdp_speech=tdp_sp + ), + remove_or_set_concurrency=False, + ) + for (tdp_sl, tdp_sp) in itertools.product( + tdp_sil, tdp_speech + ) + } + jobs_num_e = {k: v.scorer.out_num_errors for k, v in jobs.items()} + + for (tdp_sl, tdp_sp), recog_jobs in jobs.items(): + if cpu_slow: + recog_jobs.search.update_rqmt("run", {"cpu_slow": True}) + + pre_name = ( + f"{pre_path}/{self.name}/" + f"tdpSil{format_tdp(tdp_sl)}tdpSp{format_tdp(tdp_sp)}" + ) + + recog_jobs.lat2ctm.set_keep_value(keep_value) + recog_jobs.search.set_keep_value(keep_value) + + recog_jobs.search.add_alias(pre_name) + tk.register_output(f"{pre_name}.wer", recog_jobs.scorer.out_report_dir) + + best_overall_wer = ComputeArgminJob({k: v.scorer.out_wer for k, v in jobs.items()}) + best_overall_n = ComputeArgminJob(jobs_num_e) + tk.register_output( + f"decoding/tdp-best/{self.name}/args", + best_overall_n.out_argmin, + ) + tk.register_output( + f"decoding/tdp-best/{self.name}/wer", + best_overall_wer.out_min, + ) + + def push_delayed_tuple( + argmin: DelayedBase, + ) -> Tuple[DelayedBase, DelayedBase, DelayedBase, DelayedBase]: + return tuple(argmin[i] for i in range(4)) + + # cannot destructure, need to use indices + best_tdp_sil = best_overall_n.out_argmin[0] + best_tdp_sp = best_overall_n.out_argmin[1] + + base_cfg = dataclasses.replace( 
+ search_parameters, + tdp_silence=push_delayed_tuple(best_tdp_sil), + tdp_speech=push_delayed_tuple(best_tdp_sp), + ) + + return base_cfg + + class BASEFactoredHybridAligner(BASEFactoredHybridDecoder): @@ -1401,21 +1494,28 @@ def __init__( set_batch_major_for_feature_scorer=set_batch_major_for_feature_scorer, ) - def correct_transition_applicator(self, crp): + def correct_transition_applicator(self, crp, correct_fsa_strcuture=False): # correct for the FSA bug crp.acoustic_model_config.tdp.applicator_type = "corrected" # The exit penalty is on the lemma level and should not be applied for alignment for tdp_type in ["*", "silence", "nonword-0", "nonword-1"]: crp.acoustic_model_config.tdp[tdp_type]["exit"] = 0.0 + if correct_fsa_strcuture: + crp.acoustic_model_config["*"]["fix-allophone-context-at-word-boundaries"] = True + crp.acoustic_model_config["*"]["transducer-builder-filter-out-invalid-allophones"] = True + crp.acoustic_model_config["*"]["allow-for-silence-repetitions"] = False return crp + + def get_alignment_job( self, label_info: LabelInfo, alignment_parameters: AlignmentParameters, num_encoder_output: int, pre_path: Optional[str] = "alignments", + correct_fsa_structure: bool = False, is_min_duration: bool = False, use_estimated_tdps: bool = False, crp_update: Optional[Callable[[rasr.RasrConfig], Any]] = None, @@ -1562,7 +1662,7 @@ def get_alignment_job( warnings.warn("you planned to use exit penalty for alignment, we set this to zero") - align_crp = self.correct_transition_applicator(align_crp) + align_crp = self.correct_transition_applicator(align_crp, correct_fsa_strcuture=correct_fsa_structure) alignment = mm.AlignmentJob( crp=align_crp, diff --git a/users/raissi/setups/common/helpers/train/network_params.py b/users/raissi/setups/common/helpers/train/network_params.py index 9e8a055c2..5127a6e67 100644 --- a/users/raissi/setups/common/helpers/train/network_params.py +++ b/users/raissi/setups/common/helpers/train/network_params.py @@ -83,6 +83,14 @@ def 
get_sa_name(self): #Conformer from-scratch +frameshift40_conformer_fullsum_from_scratch_mlp = GeneralNetworkParams( + l2=5e-6, + use_multi_task=False, + add_mlps=True, + specaug_args=asdict(default_sa_args), + frame_rate_reduction_ratio_factor=4, + auxilary_loss_layers=[], +) frameshift40_conformer_fullsum_from_scratch_multi_nomlp = GeneralNetworkParams( l2=5e-6, use_multi_task=True, diff --git a/users/raissi/setups/swb/legacy/SWB_TF_factored_hybrid_system.py b/users/raissi/setups/swb/legacy/SWB_TF_factored_hybrid_system.py index ca1cdd8a2..5c662a4f3 100644 --- a/users/raissi/setups/swb/legacy/SWB_TF_factored_hybrid_system.py +++ b/users/raissi/setups/swb/legacy/SWB_TF_factored_hybrid_system.py @@ -3,6 +3,7 @@ import copy import dataclasses import itertools +import numpy as np import sys from IPython import embed @@ -197,9 +198,9 @@ def __init__( } # 1/9 for 3-state, same amount of silence } - self.transcript_prior_xml = {"monostate": ("/").join( - [self.dependencies_path, "haotian/monostate/monostate.we.transcript.prior.xml"] - ),} + self.transcript_prior_xml = { + "monostate": ("/").join([self.dependencies_path, "haotian/monostate/monostate.we.transcript.prior.xml"]), + } # -------------------- External helpers -------------------- @@ -207,9 +208,7 @@ def set_gammatone_features(self): feature_name = self.feature_info.feature_type.get() for corpus_key in ["train", "hub500", "hub501"]: self.feature_bundles[corpus_key] = {feature_name: feature_bundles[corpus_key]} - self.feature_flows[corpus_key] = { - feature_name: features.basic_cache_flow(feature_bundles[corpus_key]) - } + self.feature_flows[corpus_key] = {feature_name: features.basic_cache_flow(feature_bundles[corpus_key])} mapping = {"train": "train", "hub500": "dev", "hub501": "eval"} cache_pattern = feature_bundles[corpus_key].get_path().split(".bundle")[0] caches = [tk.Path(f"{cache_pattern}.{i}") for i in range(1, concurrent[mapping[corpus_key]] + 1)] @@ -254,7 +253,6 @@ def 
prepare_data_with_separate_cv_legacy(self, cv_key="train.cvtrain", bw_key="b [self.cross_validation_info["pre_path"], self.cross_validation_info["merged_corpus_segment"]] ) - def get_recognizer_and_args( self, key: str, @@ -344,6 +342,48 @@ def get_recognizer_and_args( return recognizer, recog_args + def get_best_recog_scales_and_transition_values( + self, + key: str, + num_encoder_output: int, + recog_args: SWBSearchParameters, + lm_scale: float, + ) -> SWBSearchParameters: + + assert self.experiments[key]["decode_job"]["runner"] is not None, "Please set the recognizer" + recognizer = self.experiments[key]["decode_job"]["runner"] + + tune_args = recog_args.with_lm_scale(lm_scale) + best_config_scales = recognizer.recognize_optimize_scales_v2( + label_info=self.label_info, + search_parameters=tune_args, + num_encoder_output=num_encoder_output, + altas_value=2.0, + altas_beam=16.0, + tdp_sil=[(11.0, 0.0, "infinity", 20.0)], + tdp_speech=[(8.0, 0.0, "infinity", 0.0)], + tdp_nonword=[(8.0, 0.0, "infinity", 0.0)], + prior_scales=[[v] for v in np.arange(0.1, 0.8, 0.1).round(1)], + tdp_scales=[0.1, 0.2], + ) + + nnsp_tdp = [(l, 0.0, "infinity", e) for l in [8.0, 11.0, 13.0] for e in [10.0, 15.0, 20.0]] + sp_tdp = [(l, 0.0, "infinity", e) for l in [5.0, 8.0, 11.0] for e in [0.0, 5.0]] + best_config= recognizer.recognize_optimize_transtition_values( + label_info=self.label_info, + search_parameters=best_config_scales, + num_encoder_output=512, + altas_value=2.0, + altas_beam=16.0, + tdp_sil=nnsp_tdp, + tdp_speech=sp_tdp, + ) + + return best_config + + + + def get_aligner_and_args( self, key: str, @@ -405,7 +445,7 @@ def get_aligner_and_args( if feature_path is None: feature_path = self.feature_flows[crp_corpus] - #consider if you need to create separate alignment params + # consider if you need to create separate alignment params align_args = self.get_parameters_for_aligner(context_type=context_type, prior_info=p_info) align_args = dataclasses.replace(align_args, 
non_word_phonemes="[LAUGHTER],[NOISE],[VOCALIZEDNOISE]") diff --git a/users/raissi/utils/default_tools.py b/users/raissi/utils/default_tools.py index baef3b003..91f1cda0e 100644 --- a/users/raissi/utils/default_tools.py +++ b/users/raissi/utils/default_tools.py @@ -45,6 +45,7 @@ def get_rasr_binary_path(rasr_path): hash_overwrite="CONFORMER_DEFAULT_RASR_BINARY_PATH_TF2", ) +U16_RASR_GENERIC_SEQ2SEQ = tk.Path("/work/tools/users/raissi/rasr/generic-seq2seq-dev/arch/linux-x86_64-standard", hash_overwrite="u16") U16_RASR_BINARY_PATHS = {"TF1": u16_rasr_path_tf2, "TF2": u16_rasr_path_tf2_barcelona, "TED_COMMON": u16_rasr_path_ted_common} u16_returnn_launcher_tf2 = tk.Path( "/u/raissi/bin/apptainer-launchers/u16/returnn_tf2.3_apptainer_u16_launcher.sh", @@ -138,6 +139,12 @@ def __post_init__(self) -> None: rasr_binary_path=U16_RASR_BINARY_PATHS["TED_COMMON"], ) +u16_tools_factored = ToolPaths( + returnn_root=RETURNN_ROOT_BW_FACTORED, + returnn_python_exe=U16_RETURNN_LAUNCHERS["TF2"], + rasr_binary_path=U16_RASR_GENERIC_SEQ2SEQ +) + u22_tools_tf = ToolPaths( returnn_root=RETURNN_ROOT_TORCH,