From b39d52fa97fc70ca9b41960443ec1a733bc1ef89 Mon Sep 17 00:00:00 2001 From: Yifan Peng Date: Mon, 21 Oct 2024 18:30:32 -0500 Subject: [PATCH 01/15] add owsm-ctc recipe --- egs2/owsm_ctc_v3.1/s2t1/README.md | 14 +++ egs2/owsm_ctc_v3.1/s2t1/cmd.sh | 110 ++++++++++++++++++ egs2/owsm_ctc_v3.1/s2t1/conf/decode_s2t.yaml | 7 ++ egs2/owsm_ctc_v3.1/s2t1/conf/fbank.conf | 2 + egs2/owsm_ctc_v3.1/s2t1/conf/pbs.conf | 11 ++ egs2/owsm_ctc_v3.1/s2t1/conf/pitch.conf | 1 + egs2/owsm_ctc_v3.1/s2t1/conf/queue.conf | 12 ++ egs2/owsm_ctc_v3.1/s2t1/conf/slurm.conf | 14 +++ ..._multitask-ctc_ebf27_conv2d8_size1024.yaml | 109 +++++++++++++++++ egs2/owsm_ctc_v3.1/s2t1/db.sh | 1 + .../s2t1/local/convert_owsm_data.py | 61 ++++++++++ egs2/owsm_ctc_v3.1/s2t1/local/path.sh | 0 egs2/owsm_ctc_v3.1/s2t1/path.sh | 1 + egs2/owsm_ctc_v3.1/s2t1/pyscripts | 1 + egs2/owsm_ctc_v3.1/s2t1/run.sh | 36 ++++++ egs2/owsm_ctc_v3.1/s2t1/s2t.sh | 1 + egs2/owsm_ctc_v3.1/s2t1/scripts | 1 + egs2/owsm_ctc_v3.1/s2t1/utils | 1 + 18 files changed, 383 insertions(+) create mode 100644 egs2/owsm_ctc_v3.1/s2t1/README.md create mode 100644 egs2/owsm_ctc_v3.1/s2t1/cmd.sh create mode 100644 egs2/owsm_ctc_v3.1/s2t1/conf/decode_s2t.yaml create mode 100644 egs2/owsm_ctc_v3.1/s2t1/conf/fbank.conf create mode 100644 egs2/owsm_ctc_v3.1/s2t1/conf/pbs.conf create mode 100644 egs2/owsm_ctc_v3.1/s2t1/conf/pitch.conf create mode 100644 egs2/owsm_ctc_v3.1/s2t1/conf/queue.conf create mode 100644 egs2/owsm_ctc_v3.1/s2t1/conf/slurm.conf create mode 100644 egs2/owsm_ctc_v3.1/s2t1/conf/train_s2t_multitask-ctc_ebf27_conv2d8_size1024.yaml create mode 120000 egs2/owsm_ctc_v3.1/s2t1/db.sh create mode 100644 egs2/owsm_ctc_v3.1/s2t1/local/convert_owsm_data.py create mode 100644 egs2/owsm_ctc_v3.1/s2t1/local/path.sh create mode 120000 egs2/owsm_ctc_v3.1/s2t1/path.sh create mode 120000 egs2/owsm_ctc_v3.1/s2t1/pyscripts create mode 100755 egs2/owsm_ctc_v3.1/s2t1/run.sh create mode 120000 egs2/owsm_ctc_v3.1/s2t1/s2t.sh create mode 120000 
egs2/owsm_ctc_v3.1/s2t1/scripts create mode 120000 egs2/owsm_ctc_v3.1/s2t1/utils diff --git a/egs2/owsm_ctc_v3.1/s2t1/README.md b/egs2/owsm_ctc_v3.1/s2t1/README.md new file mode 100644 index 00000000000..7c6d7f980a4 --- /dev/null +++ b/egs2/owsm_ctc_v3.1/s2t1/README.md @@ -0,0 +1,14 @@ +# OWSM-CTC v3.1 + +[OWSM-CTC](https://aclanthology.org/2024.acl-long.549/) is an encoder-only speech foundation model based on hierarchical multi-task self-conditioned CTC. +This version is trained on 180k hours of public audio data for multilingual speech recognition, any-to-any speech translation, and language identification, which follows the design of the project, [Open Whisper-style Speech Model (OWSM)](https://arxiv.org/abs/2401.16658). + +## Data Preparation + +The training data follows the same format as the encoder-decoder OWSM v3.1, except that timestamps are removed from the `text` file. Please first follow the `egs2/owsm_v3.1/s2t1` recipe to prepare OWSM data, and then convert `text` into the new format by running `python local/convert_owsm_data.py` (the path to the BPE tokenizer needs to be modified to your path). + +## Pre-trained Model + +The pre-trained model is available at: https://huggingface.co/pyf98/owsm_ctc_v3.1_1B + +The model page also contains example usage. diff --git a/egs2/owsm_ctc_v3.1/s2t1/cmd.sh b/egs2/owsm_ctc_v3.1/s2t1/cmd.sh new file mode 100644 index 00000000000..2aae6919fef --- /dev/null +++ b/egs2/owsm_ctc_v3.1/s2t1/cmd.sh @@ -0,0 +1,110 @@ +# ====== About run.pl, queue.pl, slurm.pl, and ssh.pl ====== +# Usage: <cmd>.pl [options] JOB=1:<nj> <log> <command...> +# e.g. +# run.pl --mem 4G JOB=1:10 echo.JOB.log echo JOB +# +# Options: +# --time